[Spam Detection for YouTube Comments using Python and scikit-learn | Machine Learning by Jackson Yuan](https://www.youtube.com/watch?v=FmmlHbHqgGY)

In [1]:
import numpy as np, pandas as pd, matplotlib.pyplot as plt, seaborn as sns, plotly

In [2]:
import pandas as pd
import zipfile
import pickle

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.naive_bayes import MultinomialNB

from sklearn.metrics import confusion_matrix, classification_report

In [3]:
z = zipfile.ZipFile(r"./youtube+spam+collection.zip")

In [4]:
z = zipfile.ZipFile("youtube+spam+collection.zip")

In [5]:
Psy = pd.read_csv(z.open("Youtube01-Psy.csv"))
Katy = pd.read_csv(z.open("Youtube02-KatyPerry.csv"))
LMFAO = pd.read_csv(z.open("Youtube03-LMFAO.csv"))
Eminem = pd.read_csv(z.open("Youtube04-Eminem.csv"))
Shakira = pd.read_csv(z.open("Youtube05-Shakira.csv"))

In [6]:
data = pd.concat([Psy, Katy, LMFAO, Eminem, Shakira], axis = 0)

In [7]:
data.head()

Unnamed: 0,COMMENT_ID,AUTHOR,DATE,CONTENT,CLASS
0,LZQPQhLyRh80UYxNuaDWhIGQYNQ96IuCg-AYWqNPjpU,Julius NM,2013-11-07T06:20:48,"Huh, anyway check out this you[tube] channel: ...",1
1,LZQPQhLyRh_C2cTtd9MvFRJedxydaVW-2sNg5Diuo4A,adam riyati,2013-11-07T12:37:15,Hey guys check out my new channel and our firs...,1
2,LZQPQhLyRh9MSZYnf8djyk0gEF9BHDPYrrK-qCczIY8,Evgeny Murashkin,2013-11-08T17:34:21,just for test I have to say murdev.com,1
3,z13jhp0bxqncu512g22wvzkasxmvvzjaz04,ElNino Melendez,2013-11-09T08:28:43,me shaking my sexy ass on my channel enjoy ^_^ ﻿,1
4,z13fwbwp1oujthgqj04chlngpvzmtt3r3dw,GsMega,2013-11-10T16:05:38,watch?v=vtaRGgvGtWQ Check this out .﻿,1


In [8]:
data.drop(["COMMENT_ID", "AUTHOR", "DATE"], axis = 1, inplace = True)

In [9]:
data.head()

Unnamed: 0,CONTENT,CLASS
0,"Huh, anyway check out this you[tube] channel: ...",1
1,Hey guys check out my new channel and our firs...,1
2,just for test I have to say murdev.com,1
3,me shaking my sexy ass on my channel enjoy ^_^ ﻿,1
4,watch?v=vtaRGgvGtWQ Check this out .﻿,1


In [10]:
X_train, X_test, y_train, y_test = train_test_split(data["CONTENT"], data["CLASS"]) # by default de will have 66.66% training and 33.33% test

In [11]:
X_train.shape

(1467,)

In [12]:
X_test.shape

(489,)

In [13]:
489/1467 * 100

33.33333333333333

In [14]:
tfidf_vect = TfidfVectorizer(use_idf=True, lowercase=True)

In [15]:
X_train_tfidf = tfidf_vect.fit_transform(X_train)
X_train_tfidf.shape

(1467, 3615)

In [16]:
model = MultinomialNB() # we are using the default hyperparameter values

In [17]:
model.fit(X_train_tfidf, y_train)

In [18]:
X_test_tfidf = tfidf_vect.transform(X_test)
X_test_tfidf.shape

(489, 3615)

In [19]:
predictions = model.predict(X_test_tfidf)
predictions.shape, y_test.shape

((489,), (489,))

In [20]:
predictions[:10]

array([0, 1, 0, 0, 1, 0, 1, 1, 0, 0])

In [21]:
confusion_matrix(y_test, predictions)

array([[204,  32],
       [  8, 245]])

In [22]:
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

           0       0.96      0.86      0.91       236
           1       0.88      0.97      0.92       253

    accuracy                           0.92       489
   macro avg       0.92      0.92      0.92       489
weighted avg       0.92      0.92      0.92       489



In [23]:
model.score(X_test_tfidf, y_test)

0.918200408997955

In [24]:
(213+239)/489 # True positve + True negative / total data points

0.9243353783231084

In [25]:
with open("model.pkl", "wb") as model_file:
    pickle.dump(model, model_file)

In [26]:
with open("tfidf-vect.pkl", "wb") as tfidf_vect_file:
    pickle.dump(tfidf_vect, tfidf_vect_file)

In [27]:
!ls

'01 Spam Detection for YouTube Comments.ipynb'	 tfidf-vect.pkl
 dataset					 youtube+spam+collection.zip
 model.pkl


#### Loading model to perform prediction

In [28]:
with open("model.pkl", "rb") as model_file:
    model = pickle.load(model_file)

In [29]:
Spam Detection for YouTube Commentswith open("tfidf-vect.pkl", "rb") as tfidf_vect_file:
    tfidf_vect = pickle.load(tfidf_vect_file)

In [30]:
comment = np.array(["Who else just randomly listening old songs"])

In [31]:
comment

array(['Who else just randomly listening old songs'], dtype='<U42')

In [32]:
vect_comment = tfidf_vect.transform(comment)

In [33]:
vect_comment.shape

(1, 3615)

In [34]:
model.predict(vect_comment)

array([0])

In [35]:
data.sample(10)

Unnamed: 0,CONTENT,CLASS
325,adam b beats check out my page,1
93,waka waka:-):-):-)﻿,0
124,Imagine this in the news crazy woman found act...,0
338,CHECK OUT THESE LYRICS /watch?v=yUTTX04oyqQ,1
191,The Guy in the yellow suit kinda looks like Ja...,0
16,Take a look at this video on YouTube:﻿,1
69,It's been back for quite a while now.,0
310,everyone come and check out the new GTA 5 Game...,1
356,Check out this video on YouTube:﻿,1
102,Amazing song﻿,0


#### Do cross validation

In [36]:
from sklearn.model_selection import cross_validate

In [37]:
estimator = MultinomialNB()

In [38]:
scoring = ['precision', 'recall', 'f1']

In [39]:
scores_cross_validate = cross_validate(estimator=estimator, X=tfidf_vect.fit_transform(X_train), y=y_train, scoring=scoring, cv=10, verbose=1000)

[CV] START .....................................................................
[CV] END  f1: (test=0.935) precision: (test=0.911) recall: (test=0.960) total time=   0.0s
[Parallel(n_jobs=1)]: Done   1 tasks      | elapsed:    0.0s
[CV] START .....................................................................
[CV] END  f1: (test=0.954) precision: (test=0.947) recall: (test=0.960) total time=   0.0s
[Parallel(n_jobs=1)]: Done   2 tasks      | elapsed:    0.0s
[CV] START .....................................................................
[CV] END  f1: (test=0.934) precision: (test=0.922) recall: (test=0.947) total time=   0.0s
[Parallel(n_jobs=1)]: Done   3 tasks      | elapsed:    0.0s
[CV] START .....................................................................
[CV] END  f1: (test=0.922) precision: (test=0.899) recall: (test=0.947) total time=   0.0s
[Parallel(n_jobs=1)]: Done   4 tasks      | elapsed:    0.0s
[CV] START .........................................................

In [40]:
scores_cross_validate

{'fit_time': array([0.00223851, 0.00138044, 0.00136232, 0.00143003, 0.00135803,
        0.00133491, 0.00138783, 0.00133514, 0.00143862, 0.00138211]),
 'score_time': array([0.00395775, 0.00390077, 0.00377774, 0.00383687, 0.00381804,
        0.00373507, 0.00367785, 0.00374889, 0.00378895, 0.00383019]),
 'test_precision': array([0.91139241, 0.94736842, 0.92207792, 0.89873418, 0.8625    ,
        0.94736842, 0.8961039 , 0.90410959, 0.90123457, 0.88607595]),
 'test_recall': array([0.96      , 0.96      , 0.94666667, 0.94666667, 0.92      ,
        0.94736842, 0.90789474, 0.88      , 0.97333333, 0.93333333]),
 'test_f1': array([0.93506494, 0.95364238, 0.93421053, 0.92207792, 0.89032258,
        0.94736842, 0.90196078, 0.89189189, 0.93589744, 0.90909091])}

In [41]:
from sklearn.model_selection import cross_val_score

In [42]:
scores_cross_val_score = cross_val_score(estimator=estimator, X=tfidf_vect.fit_transform(X_train), y = y_train, scoring="f1", cv = 10, verbose=300)

[CV] START .....................................................................
[CV] END ................................ score: (test=0.935) total time=   0.0s
[Parallel(n_jobs=1)]: Done   1 tasks      | elapsed:    0.0s
[CV] START .....................................................................
[CV] END ................................ score: (test=0.954) total time=   0.0s
[Parallel(n_jobs=1)]: Done   2 tasks      | elapsed:    0.0s
[CV] START .....................................................................
[CV] END ................................ score: (test=0.934) total time=   0.0s
[Parallel(n_jobs=1)]: Done   3 tasks      | elapsed:    0.0s
[CV] START .....................................................................
[CV] END ................................ score: (test=0.922) total time=   0.0s
[Parallel(n_jobs=1)]: Done   4 tasks      | elapsed:    0.0s
[CV] START .....................................................................
[CV] END ..................

In [43]:
scores_cross_val_score

array([0.93506494, 0.95364238, 0.93421053, 0.92207792, 0.89032258,
       0.94736842, 0.90196078, 0.89189189, 0.93589744, 0.90909091])

In [44]:
scores_cross_validate["test_f1"]

array([0.93506494, 0.95364238, 0.93421053, 0.92207792, 0.89032258,
       0.94736842, 0.90196078, 0.89189189, 0.93589744, 0.90909091])

In [45]:
scores_cross_validate_cv5 = cross_validate(estimator=estimator, X=tfidf_vect.fit_transform(X_train), y=y_train, scoring=scoring, cv=5, verbose=1000)

[CV] START .....................................................................
[CV] END  f1: (test=0.945) precision: (test=0.929) recall: (test=0.960) total time=   0.0s
[Parallel(n_jobs=1)]: Done   1 tasks      | elapsed:    0.0s
[CV] START .....................................................................
[CV] END  f1: (test=0.926) precision: (test=0.905) recall: (test=0.947) total time=   0.0s
[Parallel(n_jobs=1)]: Done   2 tasks      | elapsed:    0.0s
[CV] START .....................................................................
[CV] END  f1: (test=0.921) precision: (test=0.909) recall: (test=0.933) total time=   0.0s
[Parallel(n_jobs=1)]: Done   3 tasks      | elapsed:    0.0s
[CV] START .....................................................................
[CV] END  f1: (test=0.903) precision: (test=0.906) recall: (test=0.900) total time=   0.0s
[Parallel(n_jobs=1)]: Done   4 tasks      | elapsed:    0.0s
[CV] START .........................................................

In [46]:
scores_cross_val_score_cv5 = cross_val_score(estimator=estimator, X=tfidf_vect.fit_transform(X_train), y = y_train, scoring="f1", cv = 5, verbose=300)

[CV] START .....................................................................
[CV] END ................................ score: (test=0.945) total time=   0.0s
[Parallel(n_jobs=1)]: Done   1 tasks      | elapsed:    0.0s
[CV] START .....................................................................
[CV] END ................................ score: (test=0.926) total time=   0.0s
[Parallel(n_jobs=1)]: Done   2 tasks      | elapsed:    0.0s
[CV] START .....................................................................
[CV] END ................................ score: (test=0.921) total time=   0.0s
[Parallel(n_jobs=1)]: Done   3 tasks      | elapsed:    0.0s
[CV] START .....................................................................
[CV] END ................................ score: (test=0.903) total time=   0.0s
[Parallel(n_jobs=1)]: Done   4 tasks      | elapsed:    0.0s
[CV] START .....................................................................
[CV] END ..................

In [47]:
scores_cross_val_score_cv5

array([0.94462541, 0.92556634, 0.92105263, 0.90301003, 0.9201278 ])

In [48]:
scores_cross_validate_cv5["test"]

KeyError: 'test'