## Preprocessing and Model Training

In [57]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB


# Load the tab-separated SMS dataset. Column names are supplied explicitly,
# so the first line of the file is read as data rather than as a header.
column_names = ["label", "message"]
messages = pd.read_csv(
    'SMSSpamCollection',
    sep='\t',
    header=None,
    names=column_names,
)

#Data cleaning and preprocessing



ps = PorterStemmer()

# Build the stop-word lookup once. The original re-read the stop-word list
# for every single word and scanned it linearly; a set gives constant-time
# membership tests and is loaded only one time.
english_stopwords = set(stopwords.words('english'))


def clean_message(text):
    """Normalise one raw SMS into a space-joined string of stemmed tokens.

    Steps: replace every non-letter character with a space, lowercase,
    split on whitespace, drop English stop words, and Porter-stem the
    remaining words.

    Parameters
    ----------
    text : str
        Raw message text.

    Returns
    -------
    str
        The cleaned message; an empty string if no tokens survive.
    """
    words = re.sub('[^a-zA-Z]', ' ', text).lower().split()
    return ' '.join(
        ps.stem(word) for word in words if word not in english_stopwords
    )


# Iterate over the column values directly instead of label-indexing by i,
# which keeps the row order and does not depend on the index labels.
corpus = [clean_message(text) for text in messages['message']]
    
    
    
# Bag-of-words features: the vectorizer is capped at 2500 vocabulary terms
# and the resulting sparse count matrix is expanded to a dense array.
# NOTE(review): the vocabulary is fit on the whole corpus before the
# train/test split below, so test messages influence feature selection;
# consider fitting on the training portion only.
cv = CountVectorizer(max_features=2500)
bow_counts = cv.fit_transform(corpus)
X = bow_counts.toarray()




# Encode the target explicitly: 1 for 'spam', 0 for everything else ('ham').
# The previous get_dummies(...).iloc[:, 1] picked the spam column by
# position and inherited a pandas-version-dependent dtype (bool in recent
# releases), which changes the string keys produced by
# classification_report(output_dict=True) that later cells look up as
# '1' and '0'. Casting to int keeps those keys stable.
y = (messages['label'] == 'spam').astype(int).values


# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# shuffle, and therefore the split, repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=0,
)

# Fit a multinomial Naive Bayes classifier on the training counts, then
# predict labels for the held-out rows.
spam_detect_model = MultinomialNB()
spam_detect_model.fit(X_train, y_train)

y_pred = spam_detect_model.predict(X_test)



## Model Evaluation

In [58]:
#confusion Matrix

from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

# Rows correspond to the true classes and columns to the predicted classes.
conf_matrx = confusion_matrix(y_true=y_test, y_pred=y_pred)
conf_matrx


array([[946,   9],
       [  7, 153]], dtype=int64)

In [59]:
# Per-class recall taken from the dictionary form of the classification report.
# NOTE(review): the keys '1' / '0' are the stringified class labels, so this
# assumes y holds integer 0/1 labels (1 = spam) - confirm.
class_report = classification_report(y_test, y_pred, output_dict=True)
spam_recall = class_report['1']['recall']
not_spam_recall = class_report['0']['recall']
print(f"spam: {spam_recall}")
print(f"not spam: {not_spam_recall}")

spam: 0.95625
not spam: 0.9905759162303664


In [60]:
# Per-class F1 score taken from the dictionary form of the classification report.
# NOTE(review): the keys '1' / '0' are the stringified class labels, so this
# assumes y holds integer 0/1 labels (1 = spam) - confirm.
class_report = classification_report(y_test, y_pred, output_dict=True)
spam_f1 = class_report['1']['f1-score']
not_spam_f1 = class_report['0']['f1-score']
print(f"spam: {spam_f1}")
print(f"not spam: {not_spam_f1}")

spam: 0.9503105590062112
not spam: 0.9916142557651991


In [61]:
# Overall accuracy: the fraction of held-out messages whose predicted label
# matches the true label.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
accuracy

0.9856502242152466

## Testing with user input

In [62]:
# Read one message from the user and classify it with the trained model.
a=input('write a msg: ')

# Apply the same cleaning that was used to build the training corpus
# (letters only, lowercase, stop-word removal, Porter stemming). Without
# this the vectorizer sees unstemmed words that do not match the stemmed
# vocabulary it was fit on, and the prediction is made on different
# features than the model was trained with.
english_stopwords = set(stopwords.words('english'))
cleaned_msg = ' '.join(
    ps.stem(word)
    for word in re.sub('[^a-zA-Z]', ' ', a).lower().split()
    if word not in english_stopwords
)

vector=cv.transform([cleaned_msg])  # transform expects an iterable of documents
my_pred=spam_detect_model.predict(vector)

# predict returns an array with one entry per input document; test the
# single element rather than the array itself.
if my_pred[0]==1:
    print('msg is a spam')
else:
    print('msg is not a spam')

write a msg: URGENT!! You paytm password has expired. To prevent it from being suspended you have to activate it. To agree, click here htm/fjf/sj.com
msg is a spam


In [None]:
# Sample spam text for trying the interactive prompt. Stored as a string so
# the cell is valid Python and does not raise a SyntaxError on "Run All".
SAMPLE_SPAM_MESSAGE = "URGENT!! You paytm password has expired. To prevent it from being suspended you have to activate it. To agree, click here htm/fjf/sj.com"

In [63]:
# saving the model

import joblib

# Persist both artifacts: the fitted vectorizer must be saved with the
# classifier, because new text has to be transformed by the same vectorizer
# before it can be passed to the model.
model_filename = 'SpamClassifier_model.pkl'
joblib.dump(spam_detect_model, model_filename)

vector_filename = 'SpamClassifier_vector.pkl'
joblib.dump(cv, vector_filename)

['SpamClassifier_vector.pkl']