In [1]:
import pandas as pd
import re
import nltk
import pickle
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer



[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [2]:
#Dataset link -  https://archive.ics.uci.edu/ml/machine-learning-databases/00228/smsspamcollection.zip
sms = pd.read_csv('SMSSpamCollection', sep='\t', names=["label", "message"])

#Data cleaning and preprocessing
wordnet=WordNetLemmatizer()
corpus = []
for i in range(len(sms)):
    review = re.sub('[^a-zA-Z]', ' ', sms['message'][i])
    review = review.lower()
    review = review.split()
    review = [wordnet.lemmatize(word) for word in review if not word in stopwords.words('english')]
    review = ' '.join(review)
    corpus.append(review)

In [3]:
X=sms['message']

y=pd.get_dummies(sms['label'])
y=y.iloc[:,1].values

In [4]:
# 80-20 Train Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 0)


# Creating the Tf-idf
cv=TfidfVectorizer()
X = cv.fit(corpus)
x_train = cv.transform(X_train).toarray()
x_test  = cv.transform(X_test).toarray()

In [5]:
# Training model using Naive bayes classifier

model = MultinomialNB().fit(x_train, y_train)
y_pred=model.predict(x_test)

In [6]:
# Building confusion matrix for spam classifier

confusion_m=confusion_matrix(y_test, y_pred)
acc=accuracy_score( y_test, y_pred)

print('Accuracy of test dataset = ',acc)
print('\n')

print (classification_report(y_test, y_pred) )

Accuracy of test dataset =  0.9641255605381166


              precision    recall  f1-score   support

           0       0.96      1.00      0.98       955
           1       1.00      0.75      0.86       160

    accuracy                           0.96      1115
   macro avg       0.98      0.88      0.92      1115
weighted avg       0.97      0.96      0.96      1115



In [7]:
from sklearn.pipeline import Pipeline

pipe = Pipeline([('vectorizer', cv), ('multinomialNB', model)])
pipe.fit(X_train, y_train)

Pipeline(steps=[('vectorizer', TfidfVectorizer()),
                ('multinomialNB', MultinomialNB())])

In [8]:
with open('spamclassifier.pkl','wb') as f:
    pickle.dump(pipe, f)

In [9]:
text = "Free entry in 2 a wkly comp to win FA Cup final."
y = pipe.predict([text])

In [10]:
print('ham' if y[0]==0 else 'spam')

spam
