In [13]:
import pandas as pd
import nltk
import re

In [2]:
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

In [6]:
# Load the tab-separated SMS Spam Collection. Because names= is supplied,
# pandas treats the first line of the file as data rather than as a header.
# NOTE(review): messages may contain literal double quotes; pandas' default
# quote handling could merge rows — confirm the row count against the raw file.
data = pd.read_csv(
    'smsspamcollection/SMSSpamCollection',
    sep='\t',
    names=['label', 'message'],
)

In [11]:
# Summarise column dtypes and non-null counts as a quick check for missing values.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   label    5572 non-null   object
 1   message  5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


In [12]:
# One WordNet lemmatizer instance, reused for every token in the cleaning loop.
lemma = WordNetLemmatizer()

In [25]:
# Preview the first rows to confirm the label/message columns parsed as expected.
data.head()

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [19]:
# Show the full text of the message at index label 0 (head() truncates it).
data.loc[0, 'message']

'Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...'

In [23]:
# Build the cleaned corpus: one lower-cased, stop-word-free, lemmatised string
# per SMS message, in the same order as data['message'].

# Fetch the stop-word list once and keep it as a set: the lookup below runs
# for every token, and set membership avoids a linear scan of the list.
english_stopwords = set(stopwords.words('english'))

corpus = []
for sentence in data['message']:
    # Replace every non-letter with a space. The range must be 'A-Z', not
    # 'A-z': the latter also spans the ASCII characters between 'Z' and 'a'
    # ([ \ ] ^ _ `), which would then survive the cleaning.
    cleaned_sent = re.sub('[^a-zA-Z]', ' ', sentence)
    tokens = cleaned_sent.lower().split()

    # Drop stop words, lemmatise what is left, and re-join into one string.
    kept_tokens = [
        lemma.lemmatize(token)
        for token in tokens
        if token not in english_stopwords
    ]
    corpus.append(' '.join(kept_tokens))

In [82]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words vectoriser whose vocabulary is capped at 4500 terms.
cv = CountVectorizer(
    max_features=4500,
)

In [83]:
# Learn the vocabulary from the cleaned corpus and count term occurrences,
# then expand the sparse result into a dense array (one row per message).
X = cv.fit_transform(corpus)
X = X.toarray()

In [84]:
# One-hot encode the labels and drop the first dummy column, leaving a
# DataFrame of indicator column(s) rather than a 1-D Series.
# NOTE(review): with the ham/spam labels seen in the preview this should leave
# a single 'spam' column — confirm no other label values exist.
y = pd.get_dummies(data['label'], drop_first= True)

In [85]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the messages for evaluation; the fixed random_state keeps
# the split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [86]:
from sklearn.naive_bayes import MultinomialNB

# y_train is a single-column DataFrame, so it is flattened before fitting:
#   .values  -> NumPy array of shape (n, 1)
#   .ravel() -> flattened to shape (n,)
# y_train.iloc[:, 0].values is an equivalent alternative.
target_1d = y_train.values.ravel()
spam_detection_model = MultinomialNB().fit(X_train, target_1d)

In [87]:
# Predict a label for each held-out test message.
y_pred = spam_detection_model.predict(X_test)

In [88]:
from sklearn.metrics import confusion_matrix

# Cross-tabulate true vs. predicted labels on the test split.
confusion_mat = confusion_matrix(y_true=y_test, y_pred=y_pred)

In [90]:
# Rows are true classes, columns are predicted classes (scikit-learn convention).
print(confusion_mat)

[[951  15]
 [  6 143]]


In [92]:
from sklearn.metrics import accuracy_score

# Fraction of test messages classified correctly, printed as a percentage.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(accuracy*100)

98.11659192825111
