In [1]:
import pandas as pd
import seaborn as sns
import  matplotlib.pyplot as plt
import numpy as np

# LOADING DATA  

In [2]:
# Load the five YouTube comment CSV files, one DataFrame per file, in order.
csv_paths = (
    "youtube-spam-collection-v1/Youtube01-Psy.csv",
    "youtube-spam-collection-v1/Youtube02-KatyPerry.csv",
    "youtube-spam-collection-v1/Youtube03-LMFAO.csv",
    "youtube-spam-collection-v1/Youtube04-Eminem.csv",
    "youtube-spam-collection-v1/Youtube05-Shakira.csv",
)
data1, data2, data3, data4, data5 = map(pd.read_csv, csv_paths)

# Data Overview

In [3]:
# Stack the five frames into one and renumber the rows 0..n-1 in the same call.
data = pd.concat([data1, data2, data3, data4, data5], ignore_index=True)
data.tail(5)

Unnamed: 0,COMMENT_ID,AUTHOR,DATE,CONTENT,CLASS
1951,_2viQ_Qnc6-bMSjqyL1NKj57ROicCSJV5SwTrw-RFFA,Katie Mettam,2013-07-13T13:27:39.441000,I love this song because we sing it at Camp al...,0
1952,_2viQ_Qnc6-pY-1yR6K2FhmC5i48-WuNx5CumlHLDAI,Sabina Pearson-Smith,2013-07-13T13:14:30.021000,I love this song for two reasons: 1.it is abou...,0
1953,_2viQ_Qnc6_k_n_Bse9zVhJP8tJReZpo8uM2uZfnzDs,jeffrey jules,2013-07-13T12:09:31.188000,wow,0
1954,_2viQ_Qnc6_yBt8UGMWyg3vh0PulTqcqyQtdE7d4Fl0,Aishlin Maciel,2013-07-13T11:17:52.308000,Shakira u are so wiredo,0
1955,_2viQ_Qnc685RPw1aSa1tfrIuHXRvAQ2rPT9R06KTqA,Latin Bosch,2013-07-12T22:33:27.916000,Shakira is the best dancer,0


In [4]:
# Summary statistics of the combined frame (only CLASS appears in the output).
data.describe()

Unnamed: 0,CLASS
count,1956.0
mean,0.513804
std,0.499937
min,0.0
25%,0.0
50%,1.0
75%,1.0
max,1.0


# Inference

This output is the result of calling the `describe()` method on the DataFrame named `data`. By default `describe()` summarises only numeric columns, and 'CLASS' is the only numeric column, so it is the only one shown. Here's what you can infer from this output:

1. **Count**: There are 1956 entries in the 'CLASS' column.
2. **Mean**: The mean value of the 'CLASS' column is approximately 0.514, indicating that about 51.4% of the entries are labeled as spam (assuming 1 represents spam and 0 represents non-spam).
3. **Standard Deviation (std)**: The standard deviation is approximately 0.500. For a binary 0/1 column the standard deviation is about $\sqrt{p(1-p)}$, where $p$ is the fraction of 1s; this reaches its maximum of 0.5 when the two classes are equally frequent, so a value this close to 0.5 indicates that the spam and non-spam classes are nearly balanced.
4. **Min**: The minimum value in the 'CLASS' column is 0, indicating the lowest label observed.
5. **25th Percentile (25%)**: At least 25% of the entries are less than or equal to this value, which is 0 in this case.
6. **50th Percentile (Median or 50%)**: The median is 1. Because the column only takes the values 0 and 1, this means at least half of the entries are labeled 1, which is consistent with the mean of about 0.514.
7. **75th Percentile (75%)**: At least 75% of the entries are less than or equal to this value, which is 1 in this case.
8. **Max**: The maximum value in the 'CLASS' column is 1, indicating the highest label observed.

Overall, this summary provides insights into the distribution and characteristics of the 'CLASS' column, which likely represents whether a particular entry in the dataset is classified as spam (1) or not spam (0).

In [5]:
# (rows, columns) of the combined frame.
data.shape

(1956, 5)

In [6]:
# Column labels of the combined frame.
data.columns

Index(['COMMENT_ID', 'AUTHOR', 'DATE', 'CONTENT', 'CLASS'], dtype='object')

# Data Preprocessing

In [7]:
# Count the missing values in each column.
data.isnull().sum()

COMMENT_ID      0
AUTHOR          0
DATE          245
CONTENT         0
CLASS           0
dtype: int64

In [8]:
# Forward-fill missing DATE values: each gap takes the value of the row above it.
data = data.assign(DATE=data['DATE'].ffill())

In [9]:
# Re-count missing values per column after the DATE fill.
data.isnull().sum()

COMMENT_ID    0
AUTHOR        0
DATE          0
CONTENT       0
CLASS         0
dtype: int64

In [10]:
# Inspect the dtype of each column.
data.dtypes

COMMENT_ID    object
AUTHOR        object
DATE          object
CONTENT       object
CLASS          int64
dtype: object

In [11]:
# Select 'CONTENT' and 'CLASS' columns.
# .copy() makes the result an independent frame, so adding columns to it in
# later cells does not raise pandas' SettingWithCopyWarning.
data = data[['CONTENT', 'CLASS']].copy()

# Verify the selection
print(data.head())


                                             CONTENT  CLASS
0  Huh, anyway check out this you[tube] channel: ...      1
1  Hey guys check out my new channel and our firs...      1
2             just for test I have to say murdev.com      1
3   me shaking my sexy ass on my channel enjoy ^_^ ﻿      1
4            watch?v=vtaRGgvGtWQ   Check this out .﻿      1


# Feature Selection

In [16]:
# Install the better-profanity library
# pip install better_profanity

from better_profanity import profanity
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest, chi2

# TF-IDF vectorization of the comment text with default TfidfVectorizer settings.
# NOTE(review): the vectorizer is fit on every row, before the train/test split
# in the model-training cell, so held-out comments influence the vocabulary and
# IDF weights — confirm this is acceptable for the reported accuracy.
tfidf_vectorizer = TfidfVectorizer()
X_tfidf = tfidf_vectorizer.fit_transform(data['CONTENT'])

# Profanity Detection
def contains_profanity(text):
    """Report whether ``text`` contains profanity.

    Thin wrapper: forwards ``text`` to ``profanity.contains_profanity`` from
    ``better_profanity`` and returns that call's result unchanged.
    """
    return profanity.contains_profanity(text)

# Add a new column indicating if text contains profanity.
# assign() returns a new frame instead of writing into `data` in place, which
# avoids SettingWithCopyWarning when `data` is a column subset of another frame.
data = data.assign(PROFANITY=data['CONTENT'].apply(contains_profanity))

# Feature selection using SelectKBest and profanity detection:
# keep the 1000 TF-IDF columns with the highest chi-squared score against PROFANITY.
# NOTE(review): the selection target is the PROFANITY flag, not CLASS, while the
# model trained in the next section predicts CLASS — confirm this is intended.
k_best_profanity = SelectKBest(score_func=chi2, k=1000)
X_selected_profanity = k_best_profanity.fit_transform(X_tfidf, data['PROFANITY'])

# Map the selected column positions back to their vocabulary terms.
selected_feature_names = tfidf_vectorizer.get_feature_names_out()
selected_indices_profanity = k_best_profanity.get_support(indices=True)
# Index the term array with all selected positions at once, then convert to a list.
selected_features_profanity = selected_feature_names[selected_indices_profanity].tolist()
print(selected_features_profanity)




# MODEL TRAIN

In [17]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X_selected_profanity,
    data['CLASS'],
    test_size=0.2,
    random_state=42,
)

# Fit a logistic regression with default settings (fit() returns the estimator).
logistic_regression = LogisticRegression().fit(X_train, y_train)

# Predict labels for the held-out rows.
y_pred = logistic_regression.predict(X_test)
print(y_pred)

# Fraction of held-out rows whose predicted label matches the true label.
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

[0 0 0 1 1 0 1 1 0 1 1 0 1 0 0 1 0 0 1 0 1 0 1 1 1 0 0 0 0 1 0 1 1 0 1 0 1
 1 0 0 0 1 0 1 1 0 1 1 0 1 0 0 1 1 0 0 1 0 1 1 1 1 1 0 0 0 0 1 0 1 0 1 1 0
 1 0 1 0 0 0 0 0 1 0 1 1 1 1 1 1 0 1 1 0 0 0 1 0 0 1 1 1 1 0 0 0 1 1 1 1 0
 1 0 0 0 0 0 1 0 1 1 1 0 0 1 0 1 0 1 1 0 1 1 0 1 0 0 1 0 1 1 1 1 0 1 0 1 0
 1 1 0 1 0 0 1 1 0 1 1 0 1 1 0 1 1 1 0 1 0 1 0 0 0 1 0 1 0 1 1 1 0 1 0 0 1
 1 1 1 0 1 1 1 0 0 1 1 1 0 1 0 0 0 0 1 0 0 0 0 1 0 1 1 0 1 0 1 1 0 1 0 1 1
 0 1 0 0 0 1 1 0 0 0 0 1 0 0 1 1 0 0 1 1 1 0 0 1 1 0 1 0 0 0 1 0 1 0 0 1 1
 1 0 1 1 0 1 1 0 0 0 1 1 0 1 0 1 0 1 0 0 0 0 0 1 1 0 0 1 0 0 1 1 0 0 0 1 0
 1 1 1 0 0 1 0 0 0 0 1 1 0 0 0 1 1 1 1 0 0 1 0 1 0 1 0 0 1 1 1 1 0 1 0 0 0
 0 0 0 0 0 0 0 1 0 0 0 1 1 0 0 1 0 0 0 1 1 0 1 0 0 1 1 1 0 1 1 1 0 0 0 0 1
 0 1 1 1 1 0 0 0 1 0 1 0 0 0 1 0 0 0 0 0 0 0]
Accuracy: 0.8979591836734694


In [None]:
# Pull the comment text and the labels out of the frame as NumPy arrays.
X, y = (np.array(data[column]) for column in ('CONTENT', 'CLASS'))

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import BernoulliNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, r2_score, classification_report, confusion_matrix, ConfusionMatrixDisplay, PrecisionRecallDisplay

In [None]:
# Bag-of-words token counts for every comment.
# The text is read from data['CONTENT'] rather than from X so that this cell is
# idempotent: X is overwritten here, and a re-run must not feed the sparse
# count matrix back into the vectorizer.
CV = CountVectorizer()
X = CV.fit_transform(data['CONTENT'])

In [None]:
# Show the repr of the vectorized feature matrix.
X

<1956x4454 sparse matrix of type '<class 'numpy.int64'>'
	with 25765 stored elements in Compressed Sparse Row format>

In [None]:
# Hold out 20% of the count features and labels for evaluation.
# NOTE(review): random_state=13 here versus 42 for the logistic-regression split,
# so the two models are presumably scored on different held-out rows.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=13)


In [None]:
# Shapes of the training features and training labels.
X_train.shape, y_train.shape

((1564, 4454), (1564,))

In [None]:
# Bernoulli Naive Bayes classifier with default settings.
BNB = BernoulliNB()

In [None]:
# Fit the classifier on the training split.
BNB.fit(X_train, y_train)

In [None]:
# Predict labels for the held-out split.
y_pred = BNB.predict(X_test)

In [None]:
# Display the full array of predicted labels.
y_pred

array([0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0,
       0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1,
       0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1,
       1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1,
       1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0,
       1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0,
       0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0,

In [None]:
# Accuracy of the Naive Bayes predictions on the held-out split.
accuracy_score(y_test,y_pred)

0.8852040816326531

In [None]:
# Try the fitted classifier on one hand-written comment.
test = "check this www.kkl.com hi bad ass i am your bitch .com"
# Encode with the already-fitted vocabulary (transform, not fit_transform), then densify.
test_data = CV.transform([test]).toarray()
# Predicted label for the single sample.
BNB.predict(test_data)[0]

1