In [35]:
import pandas as pd
import numpy as np
import re
from sklearn.feature_extraction.text import CountVectorizer
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
import nltk
# Fetch only the NLTK resources this notebook uses.
# A bare nltk.download() (no resource name) opens NLTK's interactive
# downloader, which stalls a non-interactive "Restart & Run All", so it is
# deliberately not called here.
for resource in ('stopwords', 'wordnet', 'omw-1.4'):
    nltk.download(resource, quiet=True)

import warnings
# NOTE(review): this silences every warning, including pandas/numpy
# deprecation notices — consider narrowing the filter.
warnings.filterwarnings("ignore")

# Upper bound on vocabulary size, passed to CountVectorizer(max_features=...).
max_feat = 1000

In [36]:
# Read the raw dataset from spam.csv, decoding the file as ISO-8859-1,
# then preview the first rows.
df = pd.read_csv(
    r"spam.csv",
    encoding="ISO-8859-1",
)
df.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [37]:
# Display the column labels of the loaded frame.
df.columns

Index(['v1', 'v2', 'Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], dtype='object')

In [38]:
# Count the missing values in each column.
df.isna().sum()

v1               0
v2               0
Unnamed: 2    5522
Unnamed: 3    5560
Unnamed: 4    5566
dtype: int64

In [39]:
# Keep only the label column (v1) and the message text column (v2).
# .copy() makes the result an independent frame, so the column assignments
# in later cells write to it directly instead of to a selection of the
# original frame (the chained-assignment / SettingWithCopyWarning case).
col = ['v1','v2']
df = df[col].copy()
df.head()

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


# Preprocessing the data

In [40]:
# Replace every integer or decimal number with the placeholder token
# 'numbers' (the digits are substituted, not removed).
# regex=True is passed explicitly: newer pandas releases treat the pattern
# as a literal string by default, which would silently turn this into a no-op.
df['v2'] = df['v2'].str.replace(r'\d+(\.\d+)?', 'numbers', regex=True)
df.head()

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in numbers a wkly comp to win FA Cu...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [41]:
# List the distinct values present in the label column.
df['v1'].unique()

array(['ham', 'spam'], dtype=object)

In [42]:
# Normalise the message text with an ordered series of regex substitutions.
# regex=True is explicit on every call: newer pandas releases default to a
# literal match, under which none of these patterns would ever fire.

# Replace newline characters with a space.
df['v2'] = df['v2'].str.replace(r'\n', " ", regex=True)
# Replace currency signs with the token 'Money'.
df['v2'] = df['v2'].str.replace(r'£|\$', 'Money', regex=True)
# Collapse runs of whitespace into a single space.
df['v2'] = df['v2'].str.replace(r'\s+', ' ', regex=True)
# Replace each run of non-alphanumeric characters with a single space.
df['v2'] = df['v2'].str.replace(r"[^a-zA-Z0-9]+", " ", regex=True)

In [43]:
# Inspect the message text after the substitutions above.
df['v2']

0       Go until jurong point crazy Available only in ...
1                                Ok lar Joking wif u oni 
2       Free entry in numbers a wkly comp to win FA Cu...
3            U dun say so early hor U c already then say 
4       Nah I don t think he goes to usf he lives arou...
                              ...                        
5567    This is the numbersnd time we have tried numbe...
5568                   Will b going to esplanade fr home 
5569    Pity was in mood for that So any other suggest...
5570    The guy did some bitching but I acted like i d...
5571                            Rofl Its true to its name
Name: v2, Length: 5572, dtype: object

In [44]:
# tokenizing the documents: split every message on single spaces,
# giving one list of tokens per row (in row order)
l = [message.split(' ') for message in df['v2']]

In [45]:
# converting l into a 1-D numpy object array (one token list per element).
# The token lists have different lengths, and recent NumPy releases refuse
# to build an array from ragged nested lists unless the result is an
# explicit object array, so the array is allocated first and filled
# element by element.
token_lists = l
l = np.empty(len(token_lists), dtype=object)
for idx, tokens in enumerate(token_lists):
    l[idx] = tokens

In [46]:
# Porter stemmer instance used by the stemming step below.
stemmer = PorterStemmer()

In [47]:
# performing stemming & stop word removal on the tokenized words
# The stop-word set is built once, before the loop; previously it was
# rebuilt from the corpus for every single token.
# NOTE(review): the membership test is case-sensitive and the tokens are not
# lowercased before it, so capitalised stop words may pass the filter
# (assuming NLTK's English list is lowercase) — confirm whether intended.
stop_words = set(stopwords.words('english'))
for i  in range(len(l)):
  words = [stemmer.stem(word) for word in l[i] if word not in stop_words]
  l[i] = words

In [48]:
# performing lemmatizing: lemmatize every token, then join each document's
# tokens back into one space-separated string
lemmatizer = WordNetLemmatizer()
lst = [
    ' '.join(lemmatizer.lemmatize(token) for token in tokens)
    for tokens in l
]

In [49]:
# Rebind lst to a new list built from its current elements (shallow copy).
lst = list(lst)

# Performing Bag Of Words

In [50]:
# Build bag-of-words count vectors from the cleaned documents, keeping at
# most max_feat vocabulary terms, then densify the result.
cv = CountVectorizer(max_features=max_feat)
bow_sparse = cv.fit_transform(lst)
X_BOW = bow_sparse.toarray()

In [51]:
# Preview the first five rows of the bag-of-words matrix.
X_BOW[:5]

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

# Performing Classification

In [52]:
# Encode the text labels as integers: ham -> 1, spam -> 0.
label_to_int = {'ham': 1, 'spam': 0}
y = df['v1'].map(label_to_int)
y

0       1
1       1
2       0
3       1
4       1
       ..
5567    0
5568    1
5569    1
5570    1
5571    1
Name: v1, Length: 5572, dtype: int64

In [53]:
# Hold out 20% of the rows for evaluation, with a fixed seed for the split.
X_train, X_test, y_train, y_test = train_test_split(
    X_BOW,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit a multinomial Naive Bayes classifier on the training portion.
classifier = MultinomialNB()
classifier.fit(X_train, y_train)

# Predict the held-out rows.
y_pred = classifier.predict(X_test)

# Report the accuracy on the held-out rows.
test_accuracy = accuracy_score(y_test, y_pred)
print(test_accuracy)

0.9838565022421525


In [54]:
# Confusion matrix of the true test labels against the predictions.
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
cm

array([[142,   8],
       [ 10, 955]], dtype=int64)

In [55]:
# Precision for the spam class (label 0): of everything predicted as spam,
# the share that really is spam. scikit-learn's confusion matrix has true
# labels on rows and predictions on columns, so the denominator is the sum
# of column 0 (not row 0, which would give recall).
precision = cm[0][0] / np.sum(cm[:, 0])
precision

0.9342105263157895

In [56]:
# Recall for the spam class (label 0): of all true spam messages, the share
# that was predicted as spam. scikit-learn's confusion matrix has true
# labels on rows and predictions on columns, so the denominator is the sum
# of row 0 (not column 0, which would give precision).
recall = cm[0][0] / np.sum(cm[0])
recall

0.9466666666666667