In [37]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

In [38]:
df = pd.read_csv('/content/spam.csv', encoding='latin-1')

In [39]:
df.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [40]:
df.columns

Index(['v1', 'v2', 'Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], dtype='object')

In [41]:
df.shape

(5572, 5)

In [42]:
null = df.isnull().sum()

In [43]:
null[null > 0]

Unnamed: 0,0
Unnamed: 2,5522
Unnamed: 3,5560
Unnamed: 4,5566


In [44]:
df.duplicated().sum()

np.int64(403)

In [45]:
df.drop(['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], axis=1, inplace=True)

In [46]:
df.isnull().sum()

Unnamed: 0,0
v1,0
v2,0


In [47]:
df.duplicated().sum()

np.int64(403)

In [48]:
df.drop_duplicates(inplace=True)

In [49]:
df.shape

(5169, 2)

In [50]:
df.head()

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [51]:
import re
import nltk
from nltk.corpus import stopwords
from bs4 import BeautifulSoup
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [52]:
df['v2'] = df['v2'].apply(lambda x: BeautifulSoup(x, 'lxml').get_text())
df['v2'] = df['v2'].apply(lambda x: re.sub(r'\W', ' ', x))
df['v2'] = df['v2'].apply(lambda x: re.sub(r'\s+[a-zA-Z]\s+', ' ', x))
df['v2'] = df['v2'].apply(lambda x: re.sub(r'\^[a-zA-Z]\s+', ' ', x))
df['v2'] = df['v2'].apply(lambda x: re.sub(r'\s+', ' ', x, flags=re.I))
df['v2'] = df['v2'].str.lower()

In [53]:
df.head()

Unnamed: 0,v1,v2
0,ham,go until jurong point crazy available only in ...
1,ham,ok lar joking wif oni
2,spam,free entry in 2 wkly comp to win fa cup final ...
3,ham,u dun say so early hor c already then say
4,ham,nah don think he goes to usf he lives around h...


In [54]:
from nltk.stem import WordNetLemmatizer

In [55]:
lemmatizer = WordNetLemmatizer()

In [56]:
def lemmatize_words(text):
    return ' '.join([lemmatizer.lemmatize(word) for word in text.split()])

In [57]:
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [58]:
df['v2']=df['v2'].apply(lambda x:lemmatize_words(x))

In [59]:
df.head()

Unnamed: 0,v1,v2
0,ham,go until jurong point crazy available only in ...
1,ham,ok lar joking wif oni
2,spam,free entry in 2 wkly comp to win fa cup final ...
3,ham,u dun say so early hor c already then say
4,ham,nah don think he go to usf he life around here...


In [61]:
df['v2'][2]

'free entry in 2 wkly comp to win fa cup final tkts 21st may 2005 text fa to 87121 to receive entry question std txt rate c apply 08452810075over18 s'

In [63]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test=train_test_split(df['v2'],df['v1'],test_size=0.2,)

In [64]:
from sklearn.feature_extraction.text import CountVectorizer
boe = CountVectorizer()
x_train_boe=boe.fit_transform(x_train).toarray()
x_test_boe=boe.transform(x_test).toarray()

In [65]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfid=TfidfVectorizer()
x_train_tfid=tfid.fit_transform(x_train).toarray()
x_test_tfid=tfid.transform(x_test).toarray()

In [66]:
x_train_boe

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [67]:
from sklearn.naive_bayes import GaussianNB
nb_model_bow = GaussianNB().fit(x_train_boe,y_train)
nb_model_tfid = GaussianNB().fit(x_train_tfid,y_train)

In [68]:
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report

In [69]:
y_pred_bow=nb_model_bow.predict(x_test_boe)
y_pred_tfid=nb_model_tfid.predict(x_test_tfid)

In [70]:
confusion_matrix(y_test,y_pred_bow)

array([[816,  84],
       [ 11, 123]])

In [71]:
print("BOW accuracy:",accuracy_score(y_test,y_pred_bow))

BOW accuracy: 0.9081237911025145


In [72]:
print(classification_report(y_test,y_pred_bow))

              precision    recall  f1-score   support

         ham       0.99      0.91      0.94       900
        spam       0.59      0.92      0.72       134

    accuracy                           0.91      1034
   macro avg       0.79      0.91      0.83      1034
weighted avg       0.94      0.91      0.92      1034



In [73]:
print ("TFIDF accuracy:",accuracy_score(y_test,y_pred_tfid))

TFIDF accuracy: 0.902321083172147


In [74]:
print(classification_report(y_test,y_pred_tfid))

              precision    recall  f1-score   support

         ham       0.98      0.91      0.94       900
        spam       0.58      0.88      0.70       134

    accuracy                           0.90      1034
   macro avg       0.78      0.89      0.82      1034
weighted avg       0.93      0.90      0.91      1034



In [75]:
new_review = "the story was boring and the characters where poorly developed."

In [76]:
new_bow = boe.transform([new_review]).toarray()
pred_bow = nb_model_bow.predict(new_bow)
print(pred_bow[0])

ham


In [77]:
new_tfid = tfid.transform([new_review]).toarray()
pred_tfid = nb_model_tfid.predict(new_tfid)
print(pred_tfid[0])

ham
