In [54]:
import re
import string
from html.parser import HTMLParser

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import BernoulliNB

In [55]:
# Read the comments CSV into a DataFrame and preview its first rows.
raw_csv_path = 'dataset/Youtube01-Psy.csv'
data = pd.read_csv(raw_csv_path)
data.head()

Unnamed: 0,COMMENT_ID,AUTHOR,DATE,CONTENT,CLASS
0,LZQPQhLyRh80UYxNuaDWhIGQYNQ96IuCg-AYWqNPjpU,Julius NM,2013-11-07T06:20:48,"Huh, anyway check out this you[tube] channel: ...",1
1,LZQPQhLyRh_C2cTtd9MvFRJedxydaVW-2sNg5Diuo4A,adam riyati,2013-11-07T12:37:15,Hey guys check out my new channel and our firs...,1
2,LZQPQhLyRh9MSZYnf8djyk0gEF9BHDPYrrK-qCczIY8,Evgeny Murashkin,2013-11-08T17:34:21,just for test I have to say murdev.com,1
3,z13jhp0bxqncu512g22wvzkasxmvvzjaz04,ElNino Melendez,2013-11-09T08:28:43,me shaking my sexy ass on my channel enjoy ^_^ ﻿,1
4,z13fwbwp1oujthgqj04chlngpvzmtt3r3dw,GsMega,2013-11-10T16:05:38,watch?v=vtaRGgvGtWQ Check this out .﻿,1


In [56]:
# Class balance check: number of rows carrying each CLASS label.
class_labels = data['CLASS']
class_labels.value_counts()

CLASS
1    175
0    175
Name: count, dtype: int64

In [57]:
# Keep only the comment text and its label; every other column is dropped.
data = data.loc[:, ["CONTENT", "CLASS"]]

In [58]:
# Shuffle the rows. frac=1 draws every row exactly once, and the fixed seed
# makes the resulting order reproducible.
SHUFFLE_SEED = 42
data = data.sample(frac=1, random_state=SHUFFLE_SEED)
data.head()

Unnamed: 0,CONTENT,CLASS
157,Follow me on Twitter @mscalifornia95﻿,1
341,Free my apps get 1m crdits ! Just click on the...,1
315,PLEASE SUBSCRIBE ME!!!!!!!!!!!!!!!!!!!!!!!!!!!...,1
234,What Can i say....This Song He Just Change The...,0
155,What free gift cards? Go here http://www.swag...,1


In [59]:
# Convert every comment to lowercase.
lowercased_content = data['CONTENT'].str.lower()
data['CONTENT'] = lowercased_content
data.head()

Unnamed: 0,CONTENT,CLASS
157,follow me on twitter @mscalifornia95﻿,1
341,free my apps get 1m crdits ! just click on the...,1
315,please subscribe me!!!!!!!!!!!!!!!!!!!!!!!!!!!...,1
234,what can i say....this song he just change the...,0
155,what free gift cards? go here http://www.swag...,1


In [60]:
# Remove punctuation marks.
# The translation table is loop-invariant, so it is built once and handed to the
# vectorised `.str.translate` instead of being recreated inside a per-row lambda.
# `string.punctuation` is ASCII-only: non-ASCII marks, such as the invisible
# trailing character (appears to be U+FEFF) at the end of some comments, stay.
# NOTE(review): this step deletes ":", ";", "(", ")", "-", "'", "<", ">" and "/"
# before the emoticon, HTML-tag and contraction cells run, so those later cells
# have nothing left to match; consider moving it after them.
punctuation_table = str.maketrans('', '', string.punctuation)
data["CONTENT"] = data["CONTENT"].str.translate(punctuation_table)
data.head()

Unnamed: 0,CONTENT,CLASS
157,follow me on twitter mscalifornia95﻿,1
341,free my apps get 1m crdits just click on the ...,1
315,please subscribe me﻿,1
234,what can i saythis song he just change the wor...,0
155,what free gift cards go here httpwwwswagbucks...,1


In [61]:
# Remove stopwords.
stopwords = ["a", "an", "and", "are", "as", "at", "be", "but", "by", "for", "if", "in", "into", "is", "it", "no", "not", "of",
             "on", "or", "such", "that", "the", "their", "then", "there", "these","they", "this", "to", "was", "will", "with"]
# Membership is tested once per word of every comment; a frozenset makes each
# test a hash lookup instead of a scan of the list. The list itself is kept
# under its existing name.
stopword_set = frozenset(stopwords)
data["CONTENT"] = data["CONTENT"].apply(
    lambda text: " ".join(word for word in text.split() if word not in stopword_set)
)
data.head()

Unnamed: 0,CONTENT,CLASS
157,follow me twitter mscalifornia95﻿,1
341,free my apps get 1m crdits just click link dow...,1
315,please subscribe me﻿,1
234,what can i saythis song he just change world c...,0
155,what free gift cards go here httpwwwswagbucksc...,1


In [62]:
# Drop rare words: keep only the words that occur more than once in the whole
# corpus (words seen exactly once are removed).
# Corpus-wide counts come from split -> explode -> value_counts, i.e. one flat
# Series of tokens, rather than a documents x vocabulary frame that is mostly NaN.
# Comments with no tokens explode to NaN, which value_counts() ignores.
word_count = data["CONTENT"].str.split().explode().value_counts()
# A plain set gives constant-time membership tests in the per-word filter below.
frequent_words = set(word_count[word_count > 1].index)
data["CONTENT"] = data["CONTENT"].apply(
    lambda text: " ".join(word for word in text.split() if word in frequent_words)
)
data.head()

Unnamed: 0,CONTENT,CLASS
157,follow me twitter,1
341,free my apps get just click link download done...,1
315,please subscribe me﻿,1
234,what can i song he just world so good psy your...,0
155,what free gift cards go here,1


In [63]:
# Remove emoticons (text smileys).
# NOTE(review): the text has already been lowercased and stripped of
# string.punctuation, so none of these character sequences can still be present;
# the recorded output of this cell is identical to the previous cell's.
emoticons = [":)", ":(", ":D", ";)", ":-)", ":-(", ":-D", ";-)", ";-("]


def _strip_emoticons(text):
    """Return `text` with every entry of `emoticons` deleted, in list order."""
    for emoticon in emoticons:
        text = text.replace(emoticon, "")
    return text


data["CONTENT"] = data["CONTENT"].apply(_strip_emoticons)
data.head()

Unnamed: 0,CONTENT,CLASS
157,follow me twitter,1
341,free my apps get just click link download done...,1
315,please subscribe me﻿,1
234,what can i song he just world so good psy your...,0
155,what free gift cards go here,1


In [64]:
# Remove URLs: any run of non-whitespace characters starting at "http" or "www".
# The pattern is compiled once and applied through the vectorised
# `.str.replace`, which also passes missing values through untouched.
# Requires `re` from the imports cell.
url_pattern = re.compile(r"http\S+|www\S+")
data["CONTENT"] = data["CONTENT"].str.replace(url_pattern, "", regex=True)
data.head()

Unnamed: 0,CONTENT,CLASS
157,follow me twitter,1
341,free my apps get just click link download done...,1
315,please subscribe me﻿,1
234,what can i song he just world so good psy your...,0
155,what free gift cards go here,1


In [65]:
# Remove HTML tags.
# Implemented with the standard library's HTMLParser (imported in the imports
# cell) so the cell runs on a fresh kernel without any extra package.
# NOTE(review): "<", ">", "&" and ";" were already removed by the punctuation
# step, so on this pipeline's data the text passes through unchanged.
class _HTMLTextExtractor(HTMLParser):
    """Collect the text nodes of an HTML fragment and discard the markup.

    Character references are decoded (convert_charrefs=True). Text inside
    <script> and <style> elements is skipped; comments are never collected.
    """

    _SKIPPED_TAGS = ("script", "style")

    def __init__(self):
        super().__init__(convert_charrefs=True)
        self._chunks = []
        self._skip_depth = 0

    def handle_starttag(self, tag, attrs):
        if tag in self._SKIPPED_TAGS:
            self._skip_depth += 1

    def handle_endtag(self, tag):
        if tag in self._SKIPPED_TAGS and self._skip_depth:
            self._skip_depth -= 1

    def handle_data(self, chunk):
        if not self._skip_depth:
            self._chunks.append(chunk)

    def get_text(self):
        """Return the collected text nodes joined without a separator."""
        return "".join(self._chunks)


def strip_html(text):
    """Return `text` with HTML tags removed and character references decoded."""
    extractor = _HTMLTextExtractor()
    extractor.feed(text)
    # close() flushes any text still buffered by the parser.
    extractor.close()
    return extractor.get_text()


data["CONTENT"] = data["CONTENT"].apply(strip_html)
data.head()

Unnamed: 0,CONTENT,CLASS
157,follow me twitter,1
341,free my apps get just click link download done...,1
315,please subscribe me﻿,1
234,what can i song he just world so good psy your...,0
155,what free gift cards go here,1


In [66]:
# Remove numbers from the text: every run of digits is deleted.
# The pattern is compiled once and applied through the vectorised
# `.str.replace`, which also passes missing values through untouched.
# Requires `re` from the imports cell.
digit_pattern = re.compile(r"\d+")
data["CONTENT"] = data["CONTENT"].str.replace(digit_pattern, "", regex=True)
data.head()

Unnamed: 0,CONTENT,CLASS
157,follow me twitter,1
341,free my apps get just click link download done...,1
315,please subscribe me﻿,1
234,what can i song he just world so good psy your...,0
155,what free gift cards go here,1


In [67]:
# Expand contractions.
# Lookup table: lowercase contraction (written with the ASCII apostrophe) ->
# expanded form. The expansions of the "i'..." entries use a capital "I".
contractions = {
    "ain't": "is not",
    "aren't": "are not",
    "can't": "cannot",
    "could've": "could have",
    "couldn't": "could not",
    "didn't": "did not",
    "doesn't": "does not",
    "don't": "do not",
    "hadn't": "had not",
    "hasn't": "has not",
    "haven't": "have not",
    "he'd": "he would",
    "he'll": "he will",
    "he's": "he is",
    "i'd": "I would",
    "i'll": "I will",
    "i'm": "I am",
    "i've": "I have",
    "isn't": "is not",
    "it's": "it is",
    "let's": "let us",
    "might've": "might have",
    "must've": "must have",
    "shan't": "shall not",
    "she'd": "she would",
    "she'll": "she will",
    "she's": "she is",
    "should've": "should have",
    "shouldn't": "should not",
    "that's": "that is",
    "there's": "there is",
    "they'd": "they would",
    "they'll": "they will",
    "they're": "they are",
    "they've": "they have",
    "wasn't": "was not",
    "we'd": "we would",
    "we'll": "we will",
    "we're": "we are",
    "we've": "we have",
    "weren't": "were not",
    "what'll": "what will",
    "what're": "what are",
    "what's": "what is",
    "what've": "what have",
    "where's": "where is",
    "who'd": "who would",
    "who'll": "who will",
    "who're": "who are",
    "who's": "who is",
    "who've": "who have",
    "won't": "will not",
    "would've": "would have",
    "wouldn't": "would not",
    "you'd": "you would",
    "you'll": "you will",
    "you're": "you are",
    "you've": "you have"
}


def expand_contractions(text):
    """Replace each whitespace-separated token found in `contractions`.

    The lookup is exact and case-sensitive; tokens without an entry are kept
    as they are. Tokens are re-joined with single spaces, so leading, trailing
    and repeated whitespace in `text` is collapsed.
    """
    return " ".join(contractions.get(token, token) for token in text.split())

# Expand contractions in every comment.
# NOTE(review): every key of `contractions` contains an apostrophe, and
# apostrophes were already removed by the punctuation step, so no token can
# match at this point; the recorded output is unchanged from the previous cell.
expanded_content = data["CONTENT"].map(expand_contractions)
data["CONTENT"] = expanded_content
data.head()

Unnamed: 0,CONTENT,CLASS
157,follow me twitter,1
341,free my apps get just click link download done...,1
315,please subscribe me﻿,1
234,what can i song he just world so good psy your...,0
155,what free gift cards go here,1


In [68]:
# Replace the numeric labels with readable names: 0 -> 'NOT SPAM', 1 -> 'SPAM'.
# NOTE: Series.map turns values missing from the mapping into NaN, so running
# this cell a second time (when the labels are already strings) blanks the column.
label_names = {0:'NOT SPAM', 1:'SPAM'}
data['CLASS'] = data['CLASS'].map(label_names)
data.head()

Unnamed: 0,CONTENT,CLASS
157,follow me twitter,SPAM
341,free my apps get just click link download done...,SPAM
315,please subscribe me﻿,SPAM
234,what can i song he just world so good psy your...,NOT SPAM
155,what free gift cards go here,SPAM


In [69]:
# Prepare the model inputs: cleaned comment texts as features, class names as targets.
X = np.array(data["CONTENT"])
y = np.array(data["CLASS"])

# Split the data BEFORE vectorising, so the held-out comments cannot influence
# the vocabulary. The seed and the row count are unchanged, so the rows that
# land in each split are the same as when the split is done on the count matrix.
X_train_text, X_test_text, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Convert the text to numeric token counts. The vocabulary is learned from the
# training comments only; the test comments are merely transformed with it.
cv = CountVectorizer()
X_train = cv.fit_transform(X_train_text)
X_test = cv.transform(X_test_text)

# Fit the classifier and report accuracy on the held-out split.
# The value can differ from a score obtained with a vocabulary fitted on all rows.
model = BernoulliNB()
model.fit(X_train, y_train)
print(model.score(X_test, y_test))

0.8428571428571429


In [70]:
# Example to test the result on a new comment.
sample = "Check this out: https://thecleverprogrammer.com/"
# Use a dedicated name so the `data` DataFrame is not overwritten by the
# feature matrix; earlier cells stay re-runnable.
# NOTE(review): the sample is vectorised as-is; the cleaning steps applied to
# the training text (punctuation, stopword, rare-word, URL removal) are not
# applied here.
sample_features = cv.transform([sample]).toarray()
print(model.predict(sample_features))

['SPAM']


In [71]:
# Example to test the result on a new comment.
sample = "Lack of information!"
# Use a dedicated name so the `data` DataFrame is not overwritten by the
# feature matrix; earlier cells stay re-runnable.
# NOTE(review): the sample is vectorised as-is; the cleaning steps applied to
# the training text (punctuation, stopword, rare-word removal) are not applied here.
sample_features = cv.transform([sample]).toarray()
print(model.predict(sample_features))

['NOT SPAM']
