In [68]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem.snowball import SnowballStemmer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

In [51]:
nltk.download('stopwords')

dataset = pd.read_csv("spam.csv", encoding = "ISO-8859-1")
df = pd.DataFrame(dataset)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\rk\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [52]:
# df1 = df[df.columns[0:2]]
df=df[df.columns[0:2]]
df

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will Ì_ b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [55]:
##Pre-Processing
#######Data Cleaning
##1. Convert text to lowercase
df['v2']=df['v2'].str.lower()
df['v2']

0       go until jurong point, crazy.. available only ...
1                           ok lar... joking wif u oni...
2       free entry in 2 a wkly comp to win fa cup fina...
3       u dun say so early hor... u c already then say...
4       nah i don't think he goes to usf, he lives aro...
                              ...                        
5568                will ì_ b going to esplanade fr home?
5569    pity, * was in mood for that. so...any other s...
5570    the guy did some bitching but i acted like i'd...
5571                           rofl. its true to its name
v2                                                    NaN
Name: v2, Length: 5573, dtype: object

In [56]:
##2. Removing digits
df['v2']=df['v2'].str.replace('\d+' ,'')
df['v2']

0       go until jurong point, crazy.. available only ...
1                           ok lar... joking wif u oni...
2       free entry in  a wkly comp to win fa cup final...
3       u dun say so early hor... u c already then say...
4       nah i don't think he goes to usf, he lives aro...
                              ...                        
5568                will ì_ b going to esplanade fr home?
5569    pity, * was in mood for that. so...any other s...
5570    the guy did some bitching but i acted like i'd...
5571                           rofl. its true to its name
v2                                                    NaN
Name: v2, Length: 5573, dtype: object

In [57]:
##3. Remove Punctuation
df['v2']=df['v2'].str.replace('[^\w\s]' ,'')
df['v2']=df['v2'].str.replace('\_+' ,'')
df['v2']

0       go until jurong point crazy available only in ...
1                                 ok lar joking wif u oni
2       free entry in  a wkly comp to win fa cup final...
3             u dun say so early hor u c already then say
4       nah i dont think he goes to usf he lives aroun...
                              ...                        
5568                  will ì b going to esplanade fr home
5569    pity  was in mood for that soany other suggest...
5570    the guy did some bitching but i acted like id ...
5571                            rofl its true to its name
v2                                                    NaN
Name: v2, Length: 5573, dtype: object

In [58]:
##4. Tokenization
df['v2']=df.apply(lambda row: nltk.word_tokenize(str(row['v2'])), axis=1)
df['v2']

0       [go, until, jurong, point, crazy, available, o...
1                          [ok, lar, joking, wif, u, oni]
2       [free, entry, in, a, wkly, comp, to, win, fa, ...
3       [u, dun, say, so, early, hor, u, c, already, t...
4       [nah, i, dont, think, he, goes, to, usf, he, l...
                              ...                        
5568         [will, ì, b, going, to, esplanade, fr, home]
5569    [pity, was, in, mood, for, that, soany, other,...
5570    [the, guy, did, some, bitching, but, i, acted,...
5571                     [rofl, its, true, to, its, name]
v2                                                  [nan]
Name: v2, Length: 5573, dtype: object

In [59]:
##5. Removal of Stop words
stop= stopwords.words('english')
df['v2']=df['v2'].apply(lambda x: [item for item in x if item not in stop])
df['v2']

0       [go, jurong, point, crazy, available, bugis, n...
1                          [ok, lar, joking, wif, u, oni]
2       [free, entry, wkly, comp, win, fa, cup, final,...
3           [u, dun, say, early, hor, u, c, already, say]
4       [nah, dont, think, goes, usf, lives, around, t...
                              ...                        
5568                   [ì, b, going, esplanade, fr, home]
5569                     [pity, mood, soany, suggestions]
5570    [guy, bitching, acted, like, id, interested, b...
5571                                   [rofl, true, name]
v2                                                  [nan]
Name: v2, Length: 5573, dtype: object

In [62]:
##7. Stemming
snowball=SnowballStemmer("english")
df['v2']=df['v2'].apply(lambda x: ' '.join([snowball.stem(y) for y in x]))
df['v2']

0       go jurong point crazi avail bugi n great world...
1                                   ok lar joke wif u oni
2       free entri wkli comp win fa cup final tkts st ...
3                     u dun say earli hor u c alreadi say
4               nah dont think goe usf live around though
                              ...                        
5568                              ì b go esplanad fr home
5569                              piti mood soani suggest
5570    guy bitch act like id interest buy someth els ...
5571                                       rofl true name
v2                                                    nan
Name: v2, Length: 5573, dtype: object

In [64]:
# Creating the Bag of Words
cv = CountVectorizer(max_features=2500)
X = cv.fit_transform(df['v2']).toarray()
X

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [65]:
y=pd.get_dummies(df['v1'])
y=y.iloc[:,1].values
y

array([0, 0, 1, ..., 0, 0, 0], dtype=uint8)

In [66]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 0)

In [71]:
model = MultinomialNB().fit(X_train, y_train)
y_pred=model.predict(X_test)

In [72]:
Accuracy_score1 = accuracy_score(y_test, y_pred)
print("Accuracy_Score:", Accuracy_score1)

Accuracy_Score: 0.9829596412556054
