# Spam comments detection

### Import modules

In [1]:
import pandas as pd
import pickle

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.naive_bayes import MultinomialNB

from sklearn.metrics import confusion_matrix, classification_report

### Import dataset files (CSV copies from Google Drive, read from the working directory)

In [2]:
# Read the five CSV files (paths relative to the working directory), in order,
# binding one DataFrame per file.
Psy, Katy, LMFAO, Eminem, Shakira = (
    pd.read_csv(csv_path)
    for csv_path in (
        "Youtube01-Psy.csv",
        "Youtube02-KatyPerry.csv",
        "Youtube03-LMFAO.csv",
        "Youtube04-Eminem.csv",
        "Youtube05-Shakira.csv",
    )
)

In [3]:
# Stack the five frames row-wise and discard the three metadata columns.
# NOTE: the index is not reset here, so row labels repeat across the source frames.
data = pd.concat([Psy, Katy, LMFAO, Eminem, Shakira]).drop(
    columns=["COMMENT_ID", "DATE", "AUTHOR"]
)
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1956 entries, 0 to 369
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   CONTENT  1956 non-null   object
 1   CLASS    1956 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 45.8+ KB


In [18]:
# Preview the first five rows of the combined frame.
data.head(n=5)

Unnamed: 0,CONTENT,CLASS
0,"Huh, anyway check out this you[tube] channel: ...",1
1,Hey guys check out my new channel and our firs...,1
2,just for test I have to say murdev.com,1
3,me shaking my sexy ass on my channel enjoy ^_^ ﻿,1
4,watch?v=vtaRGgvGtWQ Check this out .﻿,1


In [19]:
# Preview the last five rows of the combined frame.
data.tail(n=5)

Unnamed: 0,CONTENT,CLASS
365,I love this song because we sing it at Camp al...,0
366,I love this song for two reasons: 1.it is abou...,0
367,wow,0
368,Shakira u are so wiredo,0
369,Shakira is the best dancer,0


### Splitting dataset into train/test sets

In [8]:
# Fix the shuffle seed so the split -- and every metric computed from it in the
# cells below -- is identical on each Restart & Run All. Without a seed,
# train_test_split draws a different random partition on every execution.
RANDOM_STATE = 42

# Default test_size is used (no explicit size is passed), so the train/test
# proportions are unchanged from the unseeded call.
X_train, X_test, y_train, y_test = train_test_split(
    data["CONTENT"], data["CLASS"], random_state=RANDOM_STATE
)
print(X_train.shape)
print(y_train.shape)

(1467,)
(1467,)


### Tokenizing the training comments and fitting the TF-IDF vectorizer on them

In [9]:
# Vectorizer settings kept in one place: IDF re-weighting on, text lower-cased.
vectorizer_options = {"use_idf": True, "lowercase": True}
tfidf_vect = TfidfVectorizer(**vectorizer_options)

# Learn the vocabulary/IDF weights from the training comments and vectorize them
# in one pass.
X_train_tfidf = tfidf_vect.fit_transform(X_train)
X_train_tfidf.shape

(1467, 3611)

### Training the multinomial Naive Bayes model

In [10]:
# Multinomial Naive Bayes with its default settings, fitted on the TF-IDF
# features of the training comments.
model = MultinomialNB()
model.fit(X=X_train_tfidf, y=y_train)

### Storing the model and the vectorizer in pickle files

In [11]:
# Serialize the fitted classifier and write the bytes to disk.
with open('spam_detection_model.pkl', 'wb') as pkl_out:
    pkl_out.write(pickle.dumps(model))

In [12]:
# Serialize the fitted TF-IDF vectorizer as well; it is needed to turn new text
# into the same feature space the classifier was trained on.
with open('vectorizer.pkl', 'wb') as pkl_out:
    pkl_out.write(pickle.dumps(tfidf_vect))

### Checking the accuracy of the model

In [14]:
# Only transform the test comments: the vocabulary and IDF weights were already
# fitted on the training set and must not be re-learned here.
X_test_tfidf = tfidf_vect.transform(raw_documents=X_test)
predictions = model.predict(X=X_test_tfidf)

In [15]:
# Mean accuracy of the classifier on the held-out test set.
model.score(X=X_test_tfidf, y=y_test)

0.8957055214723927

In [16]:
# Rows are the true classes, columns the predicted classes.
confusion_matrix(y_true=y_test, y_pred=predictions)

array([[198,  32],
       [ 19, 240]])

In [17]:
# Per-class precision / recall / F1 on the test set; printed because the report
# is a pre-formatted multi-line string.
report_text = classification_report(y_true=y_test, y_pred=predictions)
print(report_text)

              precision    recall  f1-score   support

           0       0.91      0.86      0.89       230
           1       0.88      0.93      0.90       259

    accuracy                           0.90       489
   macro avg       0.90      0.89      0.89       489
weighted avg       0.90      0.90      0.90       489

