In [None]:
import json
import os
import re
import time
from urllib.parse import parse_qs, urlparse

import numpy as np
import pandas as pd
import requests
from googleapiclient.discovery import build

## Taking the input URL and extracting the video ID

In [None]:
# Test URL with emojis: https://www.youtube.com/watch?v=las-iT6Vp6g
# Test URL with links: https://www.youtube.com/watch?v=Ou3v8-ngN6Y
# Test URL with punctuation: https://www.youtube.com/watch?v=tktKJWdLv10
# Test URL with the most comments: https://www.youtube.com/watch?v=gdZLi9oWNZg
def extract_video_id(url):
    """Return the YouTube video ID contained in ``url``.

    Handles ``watch?v=<id>`` URLs regardless of the position of ``v`` in the
    query string or of extra parameters such as ``&t=42s``, short
    ``youtu.be/<id>`` links, and ``/embed/<id>``, ``/shorts/<id>`` and
    ``/live/<id>`` paths.

    Args:
        url: Full video URL as a string.

    Returns:
        The video ID string.

    Raises:
        ValueError: If no video ID can be found in ``url``.
    """
    parsed = urlparse(url.strip())
    host = (parsed.hostname or '').lower()
    path_parts = [part for part in parsed.path.split('/') if part]
    if host == 'youtu.be':
        candidate = path_parts[0] if path_parts else ''
    elif len(path_parts) >= 2 and path_parts[0] in ('embed', 'shorts', 'live'):
        candidate = path_parts[1]
    else:
        # parse_qs maps each parameter to a list of values; take the first `v`.
        candidate = parse_qs(parsed.query).get('v', [''])[0]
    if not candidate:
        raise ValueError('Could not find a video ID in URL: ' + url)
    return candidate


url_input = "https://www.youtube.com/watch?v=qCUeCiuvxI8"
video_id = extract_video_id(url_input)

## Total number of comments

#### *Quota impact: A call to this method has a quota cost of 1 unit.*
#### *1 call returns 20 comments* 
#### *Daily quota limit = 10,000 units (daily quotas reset at midnight Pacific Time (PT) = 12:45 PM NPT)* 
#### *Total number of comments that can be retrieved in a day ~ 10,000 * 20 = 200,000* 
#### *Approximately < 195,000 comments can be retrieved in a day*


In [None]:
# Read the API key from the environment so the credential is never saved in
# the notebook; falls back to an empty string when the variable is not set.
api_key = os.environ.get('YOUTUBE_API_KEY', '')
url = 'https://www.googleapis.com/youtube/v3/videos'
# Let requests build and URL-encode the query string instead of concatenating it.
response = requests.get(
    url,
    params={
        'id': video_id,
        'key': api_key,
        'part': 'snippet,contentDetails,statistics,status',
    },
    timeout=30,
)
# Surface HTTP errors (bad key, exhausted quota, ...) here rather than as a
# confusing KeyError further down.
response.raise_for_status()
response_info = response.json()

items = response_info.get('items', [])
if not items:
    raise ValueError('No video found for id: ' + video_id)

# `commentCount` may be missing from the statistics (e.g. when comments are
# disabled for the video); treat that as zero comments.
total = int(items[0]['statistics'].get('commentCount', 0))
print(total)
if total >= 195000:
    print("Cannot retrive more than 195,000 comments")

30


## Storing all the comments in a list


In [None]:

def getAllTopLevelCommentReplies(topCommentId, replies, token):
    """Collect the display text of every reply to one top-level comment.

    Pages through ``youtube.comments().list`` using the module-level ``youtube``
    client, which must be defined before this function is called. Pagination is
    done with a loop rather than one recursive call per page, so very long reply
    threads cannot exhaust Python's recursion limit.

    Args:
        topCommentId: ID of the top-level comment whose replies are fetched.
        replies: List that reply texts are appended to (mutated in place).
        token: Page token to start from; ``None`` starts at the first page.

    Returns:
        The same ``replies`` list, extended with every reply's ``textDisplay``.
    """
    while True:
        replies_response = youtube.comments().list(part='snippet',
                                                   maxResults=100,
                                                   parentId=topCommentId,
                                                   pageToken=token).execute()

        for item in replies_response.get('items', []):
            replies.append(item['snippet']['textDisplay'])

        # No nextPageToken means this was the last page.
        if "nextPageToken" not in replies_response:
            return replies
        token = replies_response['nextPageToken']
      
def get_comments(youtube, video_id, comments=None, token=''):
    """Collect the text of every comment and reply on a video.

    Walks all pages of ``youtube.commentThreads().list`` for ``video_id``. For
    each thread the top-level comment text is stored first, followed by the
    texts of its replies (fetched through ``getAllTopLevelCommentReplies``) when
    the thread reports any.

    Args:
        youtube: API client exposing ``commentThreads().list(...).execute()``.
        video_id: ID of the video whose comments are collected.
        comments: Optional list to append to. A new list is created when
            omitted, so separate calls never share results.
        token: Page token to start from; the default ``''`` is passed through
            unchanged for the first request.

    Returns:
        The list of comment texts (``comments`` itself when one was supplied).
    """
    if comments is None:
        comments = []

    # Iterate over result pages instead of recursing once per page, so videos
    # with many pages of comments cannot hit the recursion limit.
    while True:
        video_response = youtube.commentThreads().list(part='snippet',
                                                       videoId=video_id,
                                                       pageToken=token).execute()
        for item in video_response.get('items', []):
            thread = item['snippet']
            top_comment = thread['topLevelComment']
            # Always keep the top-level comment, including when it has replies.
            comments.append(top_comment['snippet']['textDisplay'])
            if thread['totalReplyCount'] > 0:
                comments.extend(
                    getAllTopLevelCommentReplies(top_comment['id'], [], None))

        token = video_response.get('nextPageToken')
        if not token:
            return comments

# Build the YouTube Data API v3 client with the key configured earlier.
youtube = build('youtube', 'v3',developerKey=api_key)
# Start from an explicit fresh list so that re-running this cell never appends
# to results left over from an earlier run.
comments = get_comments(youtube, video_id, comments=[])
print(len(comments))
  

27


## Converting the list to dataframe


In [None]:
# One row per collected comment text, held in a single 'Comments' column.
df = pd.DataFrame({'Comments': comments})
print(df)

                                             Comments
0   UltraSam Yt: &quot; Jalwa hai humara yeha.. &q...
1                    Chalo koi to janta h mutahar ko!
2   Hn ab ye uncle ka ded gemwire ko koi puchta nh...
3   I respect everyone who were involved in this s...
4   Automatic channel pe video ban ke upload ho ja...
5                 Ultra sam yt ne sponsor kiya hoga🙂😂
6   IGN gives $20 coz it&#39;s not an Indian compa...
7   یا اللّه مجھے سپورٹ کرنے والوں کی زندگی میں ڈھ...
8                                                   🤨
9                          HYDRAxMortaL<br>SouLDynamø
10  <a href="https://www.youtube.com/watch?v=qCUeC...
11              Gametube after seeing this video :- 🌝
12  Next Article - Extra Ordinary Exposes Sports K...
13  best news coverage Bhai .. this was like real ...
14         Ign se compare krege is roadside site ko😂😂
15                        Literally informative vedio
16                    @ραιη • OG mujhe b dedo bhai⚠️😡
17                          

# Data Cleaning


## Removing all emojis (and any other non-ASCII characters) from the dataframe


In [None]:
def _ascii_only(column):
    """Drop every character of a string column that cannot be encoded as ASCII."""
    encoded = column.str.encode('ascii', 'ignore')
    return encoded.str.decode('ascii')


# Cast everything to str first, then strip non-ASCII characters column by column.
df = df.astype(str).apply(_ascii_only)
df

Unnamed: 0,Comments
0,UltraSam Yt: &quot; Jalwa hai humara yeha.. &q...
1,Chalo koi to janta h mutahar ko!
2,Hn ab ye uncle ka ded gemwire ko koi puchta nh...
3,I respect everyone who were involved in this s...
4,Automatic channel pe video ban ke upload ho ja...
5,Ultra sam yt ne sponsor kiya hoga
6,IGN gives $20 coz it&#39;s not an Indian compa...
7,<br> ......
8,
9,HYDRAxMortaL<br>SouLDynam


## Removing all the URLs from the dataframe

In [None]:
# Matches a whole HTML anchor (`<a href="...">label</a>`); an anchor that is
# never closed is matched through to the end of the comment. Raw string so the
# regex contains no invalid escape sequences.
ANCHOR_PATTERN = r'(?is)<a\s+href=.*?(?:</a>|$)'

# Cut out only the link itself (http or https) and keep the text around it.
# The replacement is a space so the words on either side do not get fused.
df['Comments'] = df['Comments'].astype(str).str.replace(ANCHOR_PATTERN, ' ', regex=True)
df

Unnamed: 0,Comments
0,UltraSam Yt: &quot; Jalwa hai humara yeha.. &q...
1,Chalo koi to janta h mutahar ko!
2,Hn ab ye uncle ka ded gemwire ko koi puchta nh...
3,I respect everyone who were involved in this s...
4,Automatic channel pe video ban ke upload ho ja...
5,Ultra sam yt ne sponsor kiya hoga
6,IGN gives $20 coz it&#39;s not an Indian compa...
7,<br> ......
8,
9,HYDRAxMortaL<br>SouLDynam


## Removing all special characters

In [None]:
def process_content(content):
    """Keep only the runs of ASCII letters in ``content``, joined by single spaces."""
    letter_runs = (match.group(0) for match in re.finditer("[A-Za-z]+", content))
    return " ".join(letter_runs)

# Replace every comment with its letters-only form (digits, punctuation and
# markup characters are dropped by process_content).
df['Comments'] = df['Comments'].apply(process_content)
# Bare expression: displays the frame as the cell output.
df

Unnamed: 0,Comments
0,UltraSam Yt quot Jalwa hai humara yeha quot
1,Chalo koi to janta h mutahar ko
2,Hn ab ye uncle ka ded gemwire ko koi puchta nh...
3,I respect everyone who were involved in this s...
4,Automatic channel pe video ban ke upload ho ja...
5,Ultra sam yt ne sponsor kiya hoga
6,IGN gives coz it s not an Indian company Just ...
7,br
8,
9,HYDRAxMortaL br SouLDynam


## Converting to lower case

In [None]:
# Lower-case every comment.
# NOTE(review): presumably to match the lower-cased training text produced
# further down in the notebook; confirm.
df['Comments'] = df['Comments'].str.lower()
# Bare expression: displays the frame as the cell output.
df

Unnamed: 0,Comments
0,ultrasam yt quot jalwa hai humara yeha quot
1,chalo koi to janta h mutahar ko
2,hn ab ye uncle ka ded gemwire ko koi puchta nh...
3,i respect everyone who were involved in this s...
4,automatic channel pe video ban ke upload ho ja...
5,ultra sam yt ne sponsor kiya hoga
6,ign gives coz it s not an indian company just ...
7,br
8,
9,hydraxmortal br souldynam


## Removing empty rows

In [None]:
# Mark comments that ended up empty after cleaning as missing. The result is
# assigned back rather than using an in-place replace on a column selection,
# which pandas does not guarantee to write through to the frame.
df['Comments'] = df['Comments'].replace('', np.nan)
# dropna() returns a new frame, so rebind `df`; otherwise the empty rows would
# only disappear from the displayed output and stay in `df`.
df = df.dropna()
df

Unnamed: 0,Comments
0,ultrasam yt quot jalwa hai humara yeha quot
1,chalo koi to janta h mutahar ko
2,hn ab ye uncle ka ded gemwire ko koi puchta nh...
3,i respect everyone who were involved in this s...
4,automatic channel pe video ban ke upload ho ja...
5,ultra sam yt ne sponsor kiya hoga
6,ign gives coz it s not an indian company just ...
7,br
9,hydraxmortal br souldynam
11,gametube after seeing this video


# Training model for spam detection

In [None]:
# CSV files that make up the training set.
# NOTE(review): the file names suggest one file per video; confirm.
data_files = [
    '/content/drive/MyDrive/Colab--Notebooks/Youtube01-Psy.csv',
    '/content/drive/MyDrive/Colab--Notebooks/Youtube02-KatyPerry.csv',
    '/content/drive/MyDrive/Colab--Notebooks/Youtube03-LMFAO.csv',
    '/content/drive/MyDrive/Colab--Notebooks/Youtube04-Eminem.csv',
    '/content/drive/MyDrive/Colab--Notebooks/Youtube05-Shakira.csv',
]
# Read each file and stack the frames; every frame keeps its own row labels.
train_data = pd.concat([pd.read_csv(csv_path) for csv_path in data_files])

In [None]:
# Summarise the combined training frame: columns, non-null counts and dtypes.
train_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1956 entries, 0 to 369
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   COMMENT_ID  1956 non-null   object
 1   AUTHOR      1956 non-null   object
 2   DATE        1711 non-null   object
 3   CONTENT     1956 non-null   object
 4   CLASS       1956 non-null   int64 
dtypes: int64(1), object(4)
memory usage: 91.7+ KB


In [None]:
def drop_fectures(features,data):
    """Remove the columns named in ``features`` from ``data`` in place.

    Nothing is returned; the caller's DataFrame itself is modified.
    """
    data.drop(columns=features, inplace=True)
# Drop the metadata columns; the visible cells only use CONTENT and CLASS from
# here on. The drop happens in place on train_data.
drop_fectures(['COMMENT_ID','AUTHOR','DATE'],train_data)
def process_content(content):
    """Lower-case ``content`` and keep only its runs of ASCII letters, space-joined.

    This redefines the earlier ``process_content`` in this notebook; the only
    difference is the lower-casing step.
    """
    lowered = content.lower()
    return " ".join(match.group(0) for match in re.finditer("[A-Za-z]+", lowered))
# Add a cleaned, lower-cased, letters-only copy of each comment as a new column.
train_data['processed_content'] = train_data['CONTENT'].apply(process_content)

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing; random_state pins the shuffle.
# NOTE(review): the same import and split are executed again two cells below,
# so this cell looks redundant.
x_train, x_test, y_train, y_test = train_test_split(train_data['processed_content'],train_data['CLASS'],test_size=0.2,random_state=57)

In [None]:
# Drop the raw text now that processed_content exists (in place on train_data).
# NOTE(review): not idempotent. Running this cell a second time fails because
# the CONTENT column is already gone.
drop_fectures(['CONTENT'],train_data)

In [None]:
from sklearn.model_selection import train_test_split
# 80/20 train/test split of the cleaned text against the CLASS label, with a
# fixed random_state.
x_train, x_test, y_train, y_test = train_test_split(train_data['processed_content'],train_data['CLASS'],test_size=0.2,random_state=57)
# CountVectorizer handles text preprocessing, tokenizing and English stop-word
# filtering. It builds a dictionary of features and transforms documents into
# feature vectors.

from sklearn.feature_extraction.text import CountVectorizer
count_vect = CountVectorizer(stop_words='english')
# Fit the vocabulary on the training split only and vectorise it in one step.
x_train_counts = count_vect.fit_transform(x_train)

In [None]:
from sklearn.feature_extraction.text import TfidfTransformer
# Re-weight the raw term counts with TF-IDF, fitted on the training counts only.
# (The variable name `tranformer` is misspelled but is reused in a later cell,
# so it is left unchanged.)
tranformer = TfidfTransformer()
x_train_tfidf = tranformer.fit_transform(x_train_counts)

In [None]:
# Vectorise the test split with the already-fitted vectoriser (transform only,
# no refit on test data).
x_test_counts = count_vect.transform(x_test)

In [None]:
# Apply the TF-IDF weighting fitted on the training counts to the test counts.
x_test_tfidf = tranformer.transform(x_test_counts)

In [None]:
from sklearn.linear_model import LogisticRegression
# First candidate classifier: logistic regression with library defaults,
# trained on the TF-IDF features of the training split.
model = LogisticRegression()
model.fit(x_train_tfidf,y_train)

LogisticRegression()

In [None]:
# Predict labels for the held-out test split with the model fitted above.
predictions = model.predict(x_test_tfidf)

In [None]:
from sklearn.metrics import confusion_matrix,classification_report,accuracy_score
# Compare true test labels with the predictions. classification_report and
# accuracy_score are imported here but not used in the visible cells.
confusion_matrix(y_test,predictions)

array([[176,   5],
       [ 24, 187]])

In [None]:
from sklearn.ensemble import RandomForestClassifier
# Second candidate classifier: a random forest on the same TF-IDF features.
# The forest is randomised, so pin random_state (the same seed as the
# train/test split) to make the fitted model and its confusion matrix
# reproducible across runs. This rebinds `model`, replacing the earlier one.
model = RandomForestClassifier(random_state=57)
model.fit(x_train_tfidf,y_train)

RandomForestClassifier()

In [None]:
# Score the held-out split with the random forest and compare against the
# true labels.
predictions = model.predict(x_test_tfidf)
confusion_matrix(y_test,predictions)

array([[176,   5],
       [ 20, 191]])

In [None]:
from sklearn.model_selection import GridSearchCV
# Hyper-parameter grid for the random forest.
# 'auto' is intentionally not listed under max_features: for classifiers it was
# an alias of 'sqrt' (so it only duplicated work), and it is no longer accepted
# by current scikit-learn releases.
parameters = {
                     'max_depth' : [1,3,4],
                     'n_estimators': [10,30,50],
                     'max_features': ['sqrt', 'log2'],
                     'min_samples_split': [10,20,30],
                     'min_samples_leaf': [1, 3, 10],
                     'bootstrap': [True, False],
                     }
# Seed the estimator so the search compares parameter settings rather than
# random-initialisation noise, and so the selected model is reproducible.
model = GridSearchCV(RandomForestClassifier(random_state=57),parameters)
model.fit(x_train_tfidf,y_train)

GridSearchCV(estimator=RandomForestClassifier(),
             param_grid={'bootstrap': [True, False], 'max_depth': [1, 3, 4],
                         'max_features': ['sqrt', 'auto', 'log2'],
                         'min_samples_leaf': [1, 3, 10],
                         'min_samples_split': [10, 20, 30],
                         'n_estimators': [10, 30, 50]})