In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing

In [None]:
from nltk.corpus import stopwords
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
from sklearn.pipeline import Pipeline

from sklearn.naive_bayes import BernoulliNB , MultinomialNB , GaussianNB

from sklearn.metrics import accuracy_score

In [None]:
filepath = '/content/spam.csv'
data_import = pd.read_csv(filepath , encoding = 'ISO-8859-1')
data_import.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [None]:
df =  data_import.drop(['Unnamed: 2' , 'Unnamed: 3' , 'Unnamed: 4'] , axis = 1)
df.head()

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [None]:
### Removing stopwords from the feature column.

sw = stopwords.words('english')

def stopword(text) :
    txt = [word.lower() for word in text.split() if word.lower() not in sw]
    return txt

df['v2'] = df['v2'].apply(stopword)

df.head()

Unnamed: 0,v1,v2
0,ham,"[go, jurong, point,, crazy.., available, bugis..."
1,ham,"[ok, lar..., joking, wif, u, oni...]"
2,spam,"[free, entry, 2, wkly, comp, win, fa, cup, fin..."
3,ham,"[u, dun, say, early, hor..., u, c, already, sa..."
4,ham,"[nah, think, goes, usf,, lives, around, though]"


In [None]:
from nltk.stem.snowball import SnowballStemmer

ss = SnowballStemmer("english")

def stemming(text) :
    text = [ss.stem(word) for word in text if word.split()]
    return "".join(text)

df['v2'] = df['v2'].apply(stemming)

In [None]:
df.head()

Unnamed: 0,v1,v2
0,ham,"gojurongpoint,crazy..availbugingreatworldlaebu..."
1,ham,oklar...jokewifuoni...
2,spam,freeentri2wklicompwinfacupfinaltkts21stmay2005...
3,ham,udunsayearlihor...ucalreadisay...
4,ham,"nahthinkgoeusf,livearoundthough"


In [None]:
### TF-IDF { Term Frequency , Inverse Document Frequency }

from sklearn.feature_extraction.text import TfidfVectorizer

tfid_vect = TfidfVectorizer()

# Extract the tfid representation matrix of the test data.
tfid_matrix = tfid_vect.fit_transform(df['v2'])

print(f"Type :{type(tfid_matrix)} , Matrix at 0 : {tfid_matrix[0]} , Shape : {tfid_matrix.shape}")

Type :<class 'scipy.sparse._csr.csr_matrix'> , Matrix at 0 :   (0, 1827)	0.5056391989470028
  (0, 1030)	0.5056391989470028
  (0, 2166)	0.48268727087494234
  (0, 3635)	0.5056391989470028 , Shape : (5572, 12124)


In [None]:
# Collect sparse matrix into dense

array = tfid_matrix.todense()

In [None]:
df1 = pd.DataFrame(array)
df1[df1[10]  != 0].head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,12114,12115,12116,12117,12118,12119,12120,12121,12122,12123
12,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5285,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
df1['v1'] = df['v1']

In [None]:
df1.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,12115,12116,12117,12118,12119,12120,12121,12122,12123,v1
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,ham
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,ham
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,spam
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,ham
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,ham


In [None]:
from sklearn.model_selection import train_test_split

features = df1.drop('v1' , axis = 1)
label = df1['v1']

x_train , x_test , y_train , y_test = train_test_split(features , label , test_size = 0.3)
print(f"X train shape : {x_train.shape}\nY train shape : {y_train.shape}\nX test shape : {x_test.shape}\nY test shape : {y_test.shape}")

X train shape : (3900, 12124)
Y train shape : (3900,)
X test shape : (1672, 12124)
Y test shape : (1672,)


In [None]:
ber_pipe = Pipeline(steps = [
   ( 'ber_model' , BernoulliNB())
])

multi_pipe = Pipeline(steps = [
    ('multi_model' , MultinomialNB())
])

guass_pipe = Pipeline(steps = [
    ('guass_model' , GaussianNB())
])

In [None]:
def model_evaluation(model) :
    model.fit(x_train , y_train)
    y_pred_model = model.predict(x_test)

    acc_score = accuracy_score(y_test , y_pred_model)

    print(f"Accuracy Score of {model[0]} : {acc_score}")

model_evaluation(ber_pipe)
model_evaluation(multi_pipe)
model_evaluation(guass_pipe)

Accuracy Score of BernoulliNB() : 0.881578947368421
Accuracy Score of MultinomialNB() : 0.9013157894736842
Accuracy Score of GaussianNB() : 0.46351674641148327
