## Mounting and redirecting

In [1]:
#Drive mounting
from google.colab import drive
drive.mount('/content/drive/')

Mounted at /content/drive/


In [2]:
#redirecting to the desired path
import os
os.chdir("/content/drive/My Drive/Colab Notebooks")

In [3]:
%cd dataset/

/content/drive/My Drive/Colab Notebooks/dataset


In [4]:
!ls

Youtube01-Psy.csv	 Youtube03-LMFAO.csv
Youtube02-KatyPerry.csv  Youtube04-Eminem.csv


In [5]:
#importing and concatinating all .csv files
import pandas as pd
import numpy as np
import glob
path = r'/content/drive/My Drive/Colab Notebooks/dataset'
all_files = glob.glob(path + "/*.csv")
df_files = (pd.read_csv(f) for f in all_files)
df   = pd.concat(df_files, ignore_index=True)

In [6]:
# visualizing the data
df

Unnamed: 0,COMMENT_ID,AUTHOR,DATE,CONTENT,CLASS
0,LZQPQhLyRh80UYxNuaDWhIGQYNQ96IuCg-AYWqNPjpU,Julius NM,2013-11-07T06:20:48,"Huh, anyway check out this you[tube] channel: ...",1
1,LZQPQhLyRh_C2cTtd9MvFRJedxydaVW-2sNg5Diuo4A,adam riyati,2013-11-07T12:37:15,Hey guys check out my new channel and our firs...,1
2,LZQPQhLyRh9MSZYnf8djyk0gEF9BHDPYrrK-qCczIY8,Evgeny Murashkin,2013-11-08T17:34:21,just for test I have to say murdev.com,1
3,z13jhp0bxqncu512g22wvzkasxmvvzjaz04,ElNino Melendez,2013-11-09T08:28:43,me shaking my sexy ass on my channel enjoy ^_^ ﻿,1
4,z13fwbwp1oujthgqj04chlngpvzmtt3r3dw,GsMega,2013-11-10T16:05:38,watch?v=vtaRGgvGtWQ Check this out .﻿,1
...,...,...,...,...,...
1581,LneaDw26bFu3RCmyrWyP9S6wh1h9dBv3X95g1HzKAb4,Dany PK,,SUBSCRIBE TO MY CHANNEL X PLEASE!. SPARE,1
1582,LneaDw26bFsD65dtIvAEObWYIYnFTqQDKBek_Ypz3J8,SmexyFriedChicken,,Check out my videos guy! :) Hope you guys had ...,1
1583,LneaDw26bFuvs-8oWkLpAFa6g3QHpWD8k7sbbMP3Bg8,The Guy That's Done Everything,,3 yrs ago I had a health scare but thankfully ...,1
1584,z12hfp2wmyuqztkw504cgblyxtbsxjuzeow0k,Jesse Pinkman,2015-05-06T11:42:44.601000,Rihanna looks so beautiful with red hair ;)﻿,0


In [7]:
# specifying the features
df.drop(columns=["COMMENT_ID","AUTHOR","DATE"],inplace=True)

### Data-Preprocessing

In [8]:
#visualizing 4th row comment
df["CONTENT"][4]

'watch?v=vtaRGgvGtWQ   Check this out .\ufeff'

In [9]:
import html
df["CONTENT"]=df["CONTENT"].apply(html.unescape)
df["CONTENT"]=df["CONTENT"].str.replace("\ufeff","")

In [10]:
df["CONTENT"][4]

'watch?v=vtaRGgvGtWQ   Check this out .'

In [11]:
#trying to resolve spam link issues
df["CONTENT"]=df["CONTENT"].str.replace("(<a.+>)","htmllink")

In [12]:
df[df["CONTENT"].str.contains("<.+>")]["CONTENT"]

381                      <script>document.write('htmllink
702     Hey guys, I'm a human.<br /><br /><br />But I ...
708                                          Awsome<br />
728                             Super awesome video<br />
730     This Will Always Be My Favorite Song<br />But ...
                              ...                        
1406                    Hello. İ am from Azerbaijan<br />
1409                EMINEM<3 <br />the best rapper ever<3
1499    If you are a person that loves real music you ...
1546               Love your songs<br />Supper cool<br />
1566     Really good song .<br />you know love song song.
Name: CONTENT, Length: 65, dtype: object

In [13]:
df["CONTENT"]=df["CONTENT"].str.replace("<.+>","")

In [14]:
df["CONTENT"]=df["CONTENT"].str.replace("\'","")

In [15]:
df["CONTENT"]=df["CONTENT"].str.lower()

In [16]:
df[df["CONTENT"].str.contains("\.com|watch\?")]

Unnamed: 0,CONTENT,CLASS
2,just for test i have to say murdev.com,1
4,watch?v=vtarggvgtwq check this out .,1
12,https://twitter.com/gbphotographygb,1
14,please like :d https://premium.easypromosapp.c...,1
17,http://www.ebay.com/itm/171183229277?sspagenam...,1
...,...,...
1448,everyone come and check out the new gta 5 game...,1
1476,check out these lyrics /watch?v=yuttx04oyqq,1
1521,hello to everyone! please check out my video: ...,1
1522,/watch?v=aimbwbfqbzg watch and subscrible,1


In [17]:
df["CONTENT"][17]

'http://www.ebay.com/itm/171183229277?sspagename=strk:meselx:it&_trksid=p3984.m1555.l2649 '

In [18]:
#cleaning spam comments
df["CONTENT"]=df["CONTENT"].str.replace(r"\S*\.com\S*|\S*watch\?\S*","htmllink")

In [19]:
df["CONTENT"]=df["CONTENT"].str.replace("\W"," ")

In [20]:
#visualizing 14th row comment after data cleaning
df["CONTENT"][14]

'please like  d htmllink'

In [21]:
#checking comment no. 17 if spammed link is removed or not
df["CONTENT"][17]

'htmllink '

In [22]:
df

Unnamed: 0,CONTENT,CLASS
0,huh anyway check out this you tube channel ...,1
1,hey guys check out my new channel and our firs...,1
2,just for test i have to say htmllink,1
3,me shaking my sexy ass on my channel enjoy _,1
4,htmllink check this out,1
...,...,...
1581,subscribe to my channel x please spare,1
1582,check out my videos guy hope you guys had ...,1
1583,3 yrs ago i had a health scare but thankfully ...,1
1584,rihanna looks so beautiful with red hair,0


## Model Creation

In [None]:
#normalization is used to change the values of numeric columns in the dataset to use a common scale, without distorting differences in the ranges of values or losing information.
df["CLASS"].value_counts(normalize=True)

1    0.52396
0    0.47604
Name: CLASS, dtype: float64

In [None]:
vocab=[]
for comment in df["CONTENT"]:
    for word in comment.split():
        vocab.append(word)

In [None]:
#no. of different words in the dataset
vocabulary=list(set(vocab))
len(vocabulary)

3363

In [None]:
# Create a column for each of the unique word in our vocabulary inorder to get the count of words 
for word in vocabulary:
    df[word]=0

In [None]:
df.head()

Unnamed: 0,CONTENT,CLASS,uplifting,economic,told,need,later,more,refurbished,700,bus,blows,jenny,performing,sleep,micheal,robox,loads,twitch,radio,titles,miss,ermail,2012,your,allways,6,fell,3873,olielle,beibs,mice,fictional,dated,dress,crowd,wages,watching,jbs,his,...,quickest,shes,weight,use,superbbb,starlitnightsky,reunion,koean,yours,shell,sense,delightful,apps,greatest,actually,dede,bottom,total,1k,substantial,that,spread,018,characterized,parody,oreo,exactly,type,everyday,freind,u,woozy,there,school,sign,deep,night,size,bad,blow
0,huh anyway check out this you tube channel ...,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,hey guys check out my new channel and our firs...,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,just for test i have to say htmllink,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,me shaking my sexy ass on my channel enjoy _,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,htmllink check this out,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [None]:
# looping through data frame and counting words 
for index,value in enumerate(df["CONTENT"]):
  for l in value.split():
    df[l][index]+=1

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


In [None]:
df.sample(10)

Unnamed: 0,CONTENT,CLASS,uplifting,economic,told,need,later,more,refurbished,700,bus,blows,jenny,performing,sleep,micheal,robox,loads,twitch,radio,titles,miss,ermail,2012,your,allways,6,fell,3873,olielle,beibs,mice,fictional,dated,dress,crowd,wages,watching,jbs,his,...,quickest,shes,weight,use,superbbb,starlitnightsky,reunion,koean,yours,shell,sense,delightful,apps,greatest,actually,dede,bottom,total,1k,substantial,that,spread,018,characterized,parody,oreo,exactly,type,everyday,freind,u,woozy,there,school,sign,deep,night,size,bad,blow
815,i like so much this music good,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
695,this song means so much to me thank you soooo...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1263,i hate rap and i like this song,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
136,dance dance psy htmllink,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
638,htmllink,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
713,2011 the last year of decent music,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
996,check out this video on youtube,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1460,i hope everyone is in good spirits im a hard w...,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
393,htmllink,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
339,htmllink bing rewards earn free money and no...,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0


In [None]:
#Total number of words in each class
df.groupby("CLASS").sum().sum(axis=1)

CLASS
0     7417
1    15560
dtype: int64

In [None]:
# Assign variables to all values required in calculation
p_ham=0.47604
p_spam=0.52396
n_spam=df[df["CLASS"]==1].drop(columns=["CONTENT","CLASS"]).sum().sum()
n_ham=df[df["CLASS"]==0].drop(columns=["CONTENT","CLASS"]).sum().sum()
n_vocabulary=len(vocabulary)

In [None]:
# Slicing dataframe for each class
df_sspam=df[df["CLASS"]==1]
df_hham=df[df["CLASS"]==0]

In [None]:
parameters_spam = {unique_word:0 for unique_word in vocabulary}
parameters_ham = {unique_word:0 for unique_word in vocabulary}
for word in vocabulary:
    n_word_given_spam = df_sspam[word].sum()   # spam_messages already defined in a cell above
    p_word_given_spam = (n_word_given_spam + 1) / (n_spam + 1*n_vocabulary)
    parameters_spam[word] = p_word_given_spam
    n_word_given_ham = df_hham[word].sum()   # ham_messages already defined in a cell above
    p_word_given_ham = (n_word_given_ham + 1) / (n_ham + 1*n_vocabulary)
    parameters_ham[word] = p_word_given_ham

## Model Testing

In [None]:
# Creating the model classifier
def classifier(string):
    message=html.unescape(string)
    message=string.replace("\ufeff","")
    message=string.replace("(<a.+>)","htmllink")
    message=string.replace("\'|<.+>","")
    message=string.replace("\S*\.com\S*|\S*watch\?\S*","htmllink")
    message=string.replace("\W"," ").lower()
    p_string_s=1
    p_string_h=1
    for word in message.split():
        if word in parameters_spam:
            p_string_s*=parameters_spam[word]
            p_string_h*=parameters_ham[word]
    if (p_string_s*p_spam)>(p_string_h*p_ham):
        return(1)
    elif (p_string_s*p_spam)<(p_string_h*p_ham):
        return(0)
    else:
        return(-1)

In [None]:
# Reading the dataframe for testing model
df_artist=pd.read_csv("Youtube02-KatyPerry.csv")

In [None]:
df_artist.sample(4)

Unnamed: 0,COMMENT_ID,AUTHOR,DATE,CONTENT,CLASS
161,z13gu5abbqrhtz3sd235fbfysyemhzwb304,MR magic man,2014-09-25T10:42:47,Please look at my channel﻿,1
324,z13tf12gvt2owtxpl04ceprwxmqnxx5a2gs,Kwon Kee,2014-11-09T19:35:44,"Hey yall its the real Kevin Hart, shout out to...",1
168,z13qxj4avoquihrc322byvrxrxasvviac04,Romix &amp; Muffy,2014-09-27T19:10:39,NOKIA spotted﻿,0
309,z120thzonkjsz3euu233jn0alpmcupqtk,xsilvermistx,2014-11-07T13:17:57,I've figured out why I dislike this song: it's...,0


In [None]:
df_artist["Pred_Class"]=df_artist["CONTENT"].apply(classifier)

In [None]:
# Checking model accuracy
correct_predictions=0
total_rows=0
for row in df_artist.iterrows():
    row=row[1]
    total_rows+=1
    if row["CLASS"]==row["Pred_Class"]:
        correct_predictions+=1
accuracy=correct_predictions/total_rows
print("accuracy=",accuracy)

accuracy= 0.9314285714285714


## Conclusion

In [None]:
# Checking result1
classifier("This song gives me goosebumps!!")

0

In [None]:
# Checking result2
classifier("Please subscribe to my channel as I'm approaching 1M subscribers")

1

In [None]:
# Checking result3
classifier("If you want to be a mastercoder, consider buying my course for 50% off at www.buymycourse.com")

1

In [None]:
# Checking result4
classifier("she is sings so nice AF")

0

In [None]:
# Checking result5
classifier("click on this ID and set a chance to 1 lakh INR")

1