# Loading the Data

In [11]:
import pandas as pd
import numpy as np
import nltk 
from nltk.corpus import stopwords
import string

In [13]:
df = pd.read_csv('set_spam.csv')
df

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


# Data Preprocessing

In [28]:
#Duplicates 
df.drop_duplicates(inplace = True)
df

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [17]:
df.isnull().sum()

Category    0
Message     0
dtype: int64

# Count Vectorizer

In [18]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to C:\Users\Sanjana
[nltk_data]     Akella\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [25]:
stop_words = set(stopwords.words('english'))
def text_process(Message):
    
    #punctuation
    nopunc = [char for char in Message if char not in string.punctuation]
    nopunc = ''.join(nopunc)

    #stop words, the, is, am, are, was, were, a, an, and, or, but, to, of, in, on, for etc
    clean_words = [word for word in nopunc.split() if word.lower() not in stop_words]

    #clean words 
    return clean_words

In [26]:
df['Message'].head().apply(text_process)

0    [Go, jurong, point, crazy, Available, bugis, n...
1                       [Ok, lar, Joking, wif, u, oni]
2    [Free, entry, 2, wkly, comp, win, FA, Cup, fin...
3        [U, dun, say, early, hor, U, c, already, say]
4    [Nah, dont, think, goes, usf, lives, around, t...
Name: Message, dtype: object

In [27]:
from sklearn.feature_extraction.text import CountVectorizer
message_bow = CountVectorizer(analyzer = text_process).fit_transform(df['Message'])

In [30]:
# train test split
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(message_bow, df['Category'], test_size = 0.2)

# Naive Bayes Classification

In [39]:
from sklearn.naive_bayes import MultinomialNB
classifier = MultinomialNB().fit(x_train, y_train)
classifier.predict(x_test)

array(['spam', 'ham', 'ham', ..., 'ham', 'ham', 'ham'],
      shape=(1032,), dtype='<U4')

In [40]:
#actual values
y_test.values

array(['spam', 'ham', 'ham', ..., 'ham', 'ham', 'ham'],
      shape=(1032,), dtype=object)

In [41]:
from sklearn.metrics import confusion_matrix, accuracy_score, roc_curve, classification_report
pred = classifier.predict(x_test)
classification_report(y_test, pred)

'              precision    recall  f1-score   support\n\n         ham       0.99      0.96      0.98       903\n        spam       0.78      0.91      0.84       129\n\n    accuracy                           0.96      1032\n   macro avg       0.88      0.94      0.91      1032\nweighted avg       0.96      0.96      0.96      1032\n'

In [42]:
confusion_matrix(y_test, pred)

array([[870,  33],
       [ 11, 118]])

In [43]:
accuracy_score(y_test, pred)

0.9573643410852714