<a href="https://colab.research.google.com/github/siddharthapdutta/Machine-Learning-Projects/blob/master/YouTube_Comment_Spam_Classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# YouTube Comment Spam Classification
Made on [Google Colab](https://colab.research.google.com/).

## Obtaining the Dataset
The data is the YouTube Spam Collection from the [UCI Machine Learning Repository](https://archive.ics.uci.edu/ml/datasets/YouTube+Spam+Collection).

In [None]:
!wget https://archive.ics.uci.edu/ml/machine-learning-databases/00380/YouTube-Spam-Collection-v1.zip
!unzip YouTube-Spam-Collection-v1.zip

--2021-08-15 10:09:50--  https://archive.ics.uci.edu/ml/machine-learning-databases/00380/YouTube-Spam-Collection-v1.zip
Resolving archive.ics.uci.edu (archive.ics.uci.edu)... 128.195.10.252
Connecting to archive.ics.uci.edu (archive.ics.uci.edu)|128.195.10.252|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 163567 (160K) [application/x-httpd-php]
Saving to: ‘YouTube-Spam-Collection-v1.zip’


2021-08-15 10:09:50 (5.40 MB/s) - ‘YouTube-Spam-Collection-v1.zip’ saved [163567/163567]

Archive:  YouTube-Spam-Collection-v1.zip
  inflating: Youtube01-Psy.csv       
   creating: __MACOSX/
  inflating: __MACOSX/._Youtube01-Psy.csv  
  inflating: Youtube02-KatyPerry.csv  
  inflating: __MACOSX/._Youtube02-KatyPerry.csv  
  inflating: Youtube03-LMFAO.csv     
  inflating: __MACOSX/._Youtube03-LMFAO.csv  
  inflating: Youtube04-Eminem.csv    
  inflating: __MACOSX/._Youtube04-Eminem.csv  
  inflating: Youtube05-Shakira.csv   
  inflating: __MACOSX/._Youtube05-Shakira.csv  

## Importing Required Libraries

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import GridSearchCV
from pickle import dump, load
from os import environ
from googleapiclient.discovery import build  # YouTube API
from urllib.parse import urlparse, parse_qs

## Dataset Preprocessing

In [None]:
# Loading Datasets
# One CSV per video; all five share the same columns and are stacked row-wise.
dataset_files = [
    'Youtube01-Psy.csv',
    'Youtube02-KatyPerry.csv',
    'Youtube03-LMFAO.csv',
    'Youtube04-Eminem.csv',
    'Youtube05-Shakira.csv',
]
# ignore_index=True gives the combined frame one unique 0..n-1 index instead of
# repeating each file's own 0..k labels, so label-based lookups stay unambiguous.
df = pd.concat([pd.read_csv(path) for path in dataset_files], ignore_index=True)
df.tail()

Unnamed: 0,COMMENT_ID,AUTHOR,DATE,CONTENT,CLASS
365,_2viQ_Qnc6-bMSjqyL1NKj57ROicCSJV5SwTrw-RFFA,Katie Mettam,2013-07-13T13:27:39.441000,I love this song because we sing it at Camp al...,0
366,_2viQ_Qnc6-pY-1yR6K2FhmC5i48-WuNx5CumlHLDAI,Sabina Pearson-Smith,2013-07-13T13:14:30.021000,I love this song for two reasons: 1.it is abou...,0
367,_2viQ_Qnc6_k_n_Bse9zVhJP8tJReZpo8uM2uZfnzDs,jeffrey jules,2013-07-13T12:09:31.188000,wow,0
368,_2viQ_Qnc6_yBt8UGMWyg3vh0PulTqcqyQtdE7d4Fl0,Aishlin Maciel,2013-07-13T11:17:52.308000,Shakira u are so wiredo,0
369,_2viQ_Qnc685RPw1aSa1tfrIuHXRvAQ2rPT9R06KTqA,Latin Bosch,2013-07-12T22:33:27.916000,Shakira is the best dancer,0


In [None]:
# Filtering Dataset
# Keep just the comment text and its label; the remaining columns are not used.
df = df.loc[:, ['CONTENT', 'CLASS']]
class_column = df['CLASS']
print('Spam Comments:', (class_column == 1).sum())  # label 1 = spam
print('Non-Spam Comments:', (class_column == 0).sum())  # label 0 = not spam

Spam Comments: 1005
Non-Spam Comments: 951


In [None]:
# Performing Train-Test Split
x = df['CONTENT']
y = df['CLASS']
# random_state pins the shuffle so every run produces the same split (and the
# same reported metrics); stratify=y keeps the spam/non-spam ratio identical in
# the training and testing parts.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.25, random_state=42, stratify=y)
print(f'Training Data - x_train: {x_train.shape}  y_train: {y_train.shape}')
print(f'Testing Data  - x_test:  {x_test.shape}   y_test: {y_test.shape}')

Training Data - x_train: (1467,)  y_train: (1467,)
Testing Data  - x_test:  (489,)   y_test: (489,)


## Model Training

In [None]:
# Setting up a pipeline
# Raw text -> token counts -> (un-normalised) TF-IDF weights -> random forest.
# make_pipeline names each step after its lower-cased class name; those names
# are what the '<step>__<param>' keys of a parameter grid refer to.
pipeline = make_pipeline(CountVectorizer(),
                         TfidfTransformer(norm=None),
                         # Seeded so repeated runs grow the same forest and
                         # report the same scores.
                         RandomForestClassifier(random_state=42))
pipeline.steps

[('countvectorizer',
  CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                  dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                  lowercase=True, max_df=1.0, max_features=None, min_df=1,
                  ngram_range=(1, 1), preprocessor=None, stop_words=None,
                  strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                  tokenizer=None, vocabulary=None)),
 ('tfidftransformer',
  TfidfTransformer(norm=None, smooth_idf=True, sublinear_tf=False, use_idf=True)),
 ('randomforestclassifier',
  RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                         criterion='gini', max_depth=None, max_features='auto',
                         max_leaf_nodes=None, max_samples=None,
                         min_impurity_decrease=0.0, min_impurity_split=None,
                         min_samples_leaf=1, min_samples_split=2,
                         min_weight_fraction_leaf=0.0, 

In [None]:
# Fit every pipeline step on the training split only
pipeline.fit(X=x_train, y=y_train)

Pipeline(memory=None,
         steps=[('countvectorizer',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, voc...
                 RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                        class_weight=None, criterion='gini',
                                        max_depth=None, max_features='auto',
                                        max_leaf_nodes=None, max_samples=None,
                                

## Model Evaluation

In [None]:
# Evaluate the fitted pipeline against the held-out testing data
accuracy = pipeline.score(X=x_test, y=y_test)
print("Accuracy: %0.3f" % accuracy)

Accuracy: 0.965


In [None]:
# Cross validation evaluation: 5 folds over the complete dataset
scores = cross_val_score(estimator=pipeline, X=x, y=y, cv=5)
# Report the mean fold accuracy together with two standard deviations
mean_accuracy = scores.mean()
accuracy_spread = scores.std() * 2
print("Accuracy %0.2f (+/- %0.2f)" % (mean_accuracy, accuracy_spread))

Accuracy 0.94 (+/- 0.03)


In [None]:
# Confusion matrix of the test-split predictions against the true labels
y_pred = pipeline.predict(x_test)
confusion_matrix(y_true=y_test, y_pred=y_pred)

array([[234,   3],
       [ 14, 238]])

## Model Tuning

In [None]:
# Best Parameters Search using Grid Search
# Keys follow the '<pipeline step>__<parameter>' convention; the grid below
# spans 3 * 2 * 2 * 2 * 3 = 72 candidate settings.
parameters = dict(
    countvectorizer__max_features=[None, 1000, 2000],   # Vocabulary size cap
    countvectorizer__ngram_range=[(1, 1), (1, 2)],      # Unigrams only, or unigrams + bigrams
    countvectorizer__stop_words=['english', None],      # Drop English stop words or keep them
    tfidftransformer__use_idf=[True, False],            # Apply IDF re-weighting or not
    randomforestclassifier__n_estimators=[20, 50, 100]  # Number of trees in the forest
)
# n_jobs=-1 uses every available core; verbose=1 prints progress while fitting
grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=1)
grid_search.fit(x, y)  # Search over the complete dataset

Fitting 5 folds for each of 72 candidates, totalling 360 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  46 tasks      | elapsed:   10.7s
[Parallel(n_jobs=-1)]: Done 196 tasks      | elapsed:   49.5s
[Parallel(n_jobs=-1)]: Done 360 out of 360 | elapsed:  1.4min finished


GridSearchCV(cv=None, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('countvectorizer',
                                        CountVectorizer(analyzer='word',
                                                        binary=False,
                                                        decode_error='strict',
                                                        dtype=<class 'numpy.int64'>,
                                                        encoding='utf-8',
                                                        input='content',
                                                        lowercase=True,
                                                        max_df=1.0,
                                                        max_features=None,
                                                        min_df=1,
                                                        ngram_range=(1, 1),
                                              

In [None]:
# Use Data for Grid Search
# The cell that builds `grid_search` already runs this exact fit, so repeating
# it unconditionally only re-does the whole search. Fit here only when the
# search object has not been fitted yet (best_estimator_ exists after fitting).
if not hasattr(grid_search, "best_estimator_"):
  grid_search.fit(x, y)
grid_search

Fitting 5 folds for each of 72 candidates, totalling 360 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  46 tasks      | elapsed:    9.3s
[Parallel(n_jobs=-1)]: Done 196 tasks      | elapsed:   47.1s
[Parallel(n_jobs=-1)]: Done 360 out of 360 | elapsed:  1.4min finished


GridSearchCV(cv=None, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('countvectorizer',
                                        CountVectorizer(analyzer='word',
                                                        binary=False,
                                                        decode_error='strict',
                                                        dtype=<class 'numpy.int64'>,
                                                        encoding='utf-8',
                                                        input='content',
                                                        lowercase=True,
                                                        max_df=1.0,
                                                        max_features=None,
                                                        min_df=1,
                                                        ngram_range=(1, 1),
                                              

In [None]:
# Evaluation of Best Parameters
# get_params() lists every setting of the winning pipeline; only the ones that
# were part of the search grid are printed, in alphabetical order.
best_parameters = grid_search.best_estimator_.get_params()
print("Best Accuracy: %0.3f" % grid_search.best_score_)
print("Best Parameter Set:")
for searched_name in sorted(parameters):
  print("\t%s:\t%r" % (searched_name, best_parameters[searched_name]))

Best Accuracy: 0.944
Best Parameter Set:
	countvectorizer__max_features:	None
	countvectorizer__ngram_range:	(1, 2)
	countvectorizer__stop_words:	None
	randomforestclassifier__n_estimators:	100
	tfidftransformer__use_idf:	False


In [None]:
# Save Best Model
# The context manager flushes and closes the file as soon as the dump finishes,
# so the pickle on disk is complete before anything tries to read it back.
with open("YouTube_Spam_Classifier.pkl", "wb") as model_file:
  dump(grid_search.best_estimator_, model_file)

## Using the ML Model

Uses the [YouTube Data API](https://developers.google.com/youtube/v3) to obtain top-level comments from a YouTube video. 

[YouTube API Credentials](https://www.geeksforgeeks.org/youtube-data-api-set-1/) are required.

Set the `API_KEY` environment variable to your own key using the code block below.

In [None]:
%env API_KEY=<YOUR_API_KEY_HERE>
DEVELOPER_KEY = environ.get('API_KEY')
try:
  assert DEVELOPER_KEY
except AssertionError:
  raise Exception("API_KEY is required.")
YOUTUBE_API_SERVICE_NAME = "youtube"
YOUTUBE_API_VERSION = "v3"

In [None]:
def get_video_comments(video_id):
  ''' Returns the text of every top-level comment of a YouTube video.

  Args:
    video_id: id of the video whose comment threads are listed.

  Returns:
    A list of strings, one `textDisplay` value per top-level comment, in the
    order the API returns them. Replies to comments are not included.

  Any error raised by the API client while building the service or executing
  a request propagates to the caller.
  '''
  youtube = build(YOUTUBE_API_SERVICE_NAME,
                  YOUTUBE_API_VERSION,
                  developerKey=DEVELOPER_KEY)

  # maxResults=100 is the largest page the endpoint serves; asking for it keeps
  # the number of requests (and the API quota spent) as low as possible.
  request_args = {'part': 'snippet', 'videoId': video_id, 'maxResults': 100}

  comments = []
  while True:
    response = youtube.commentThreads().list(**request_args).execute()

    # A page without an 'items' key is treated as an empty page.
    for item in response.get('items', []):
      comments.append(
          item['snippet']['topLevelComment']['snippet']['textDisplay'])

    # The last page carries no token; otherwise request the following page.
    next_page_token = response.get('nextPageToken')
    if not next_page_token:
      break
    request_args['pageToken'] = next_page_token

  return comments

In [None]:
# Retrieve comments from YouTube URL
youtube_link = urlparse(input("Enter YouTube URL: "))
youtube_id = parse_qs(youtube_link.query)['v'][0]
comments = get_video_comments(youtube_id)
print(f'{len(comments)} top-level comments found.')

Enter YouTube URL: https://www.youtube.com/watch?v=Vhh_GeBPOhs
1710 top-level comments found.


In [None]:
# Use trained model to classify comments
# NOTE: unpickling can execute arbitrary code - only load a pickle file that
# comes from a trusted source.
with open('YouTube_Spam_Classifier.pkl', 'rb') as model_file:
  model = load(model_file)  # file is closed as soon as the model is in memory
pred = model.predict(comments)

In [None]:
# Parse predictions
# Split the comments by predicted label (1 = spam, anything else = non-spam)
spam_comments = [text for text, label in zip(comments, pred) if label == 1]
non_spam_comments = [text for text, label in zip(comments, pred) if not label == 1]

# Same layout as before: a count and the matching comments per category
results = {
    'spam': {
        'count': len(spam_comments),
        'comments': spam_comments
    },
    'non-spam': {
        'count': len(non_spam_comments),
        'comments': non_spam_comments
    }
}

print('Spam comments found:', results['spam']['count'])

Spam comments found: 58
