# SMS Spam Predictive ML Model

In [1]:
## Import All libraries

import nltk
import pandas as pd
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.corpus import stopwords
import re

In [2]:
## Read SMS Messages
df=pd.read_csv('SMSSpamCollection', sep='\t', names=['Target','SMS_Messages'])

# Re-order Feature and target variable
df=df[['SMS_Messages', 'Target']]
df.head()

Unnamed: 0,SMS_Messages,Target
0,"Go until jurong point, crazy.. Available only ...",ham
1,Ok lar... Joking wif u oni...,ham
2,Free entry in 2 a wkly comp to win FA Cup fina...,spam
3,U dun say so early hor... U c already then say...,ham
4,"Nah I don't think he goes to usf, he lives aro...",ham


In [3]:
## Initialize WordNetLemmatizer
ps=PorterStemmer()

## Sentence Tokenization
#sentences=nltk.sent_tokenize(df['SMS_Messages'])

corpus=[]

for i in range(len(df['SMS_Messages'])):
    ## Remove . , etc
    review=re.sub('[^a-zA-Z]',' ',df['SMS_Messages'][i])
    # Make all sentences in lower case
    review=review.lower()
    ## Word Tokenization
    words=nltk.word_tokenize(review)
    # Apply lemmatization
    words=[ps.stem(word) for word in words if word not in set(stopwords.words('english'))]
    # Join words into sentences
    review=' '.join(words)
    corpus.append(review)

## Creating Bag of Words
from sklearn.feature_extraction.text import CountVectorizer
bow=CountVectorizer(max_features=3000)
X=bow.fit_transform(corpus).toarray()

## Encode target variable

In [4]:
df["Target"]=df["Target"].map({'ham':0,'spam':1})

In [5]:
y=df['Target'].values

## Train test Split

In [6]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.20,random_state=0)

## Train ML Model

In [7]:
from sklearn.naive_bayes import MultinomialNB
model=MultinomialNB()
model.fit(X_train,y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [8]:
y_pred=model.predict(X_test)

### Evaluate ML Model Performance

In [9]:
# Confusion Matrix
from sklearn.metrics import classification_report,confusion_matrix, accuracy_score

acc=accuracy_score(y_test,y_pred)
print('Accuracy (%): ', acc)

print(confusion_matrix(y_test,y_pred))
print(classification_report(y_test,y_pred))

Accuracy (%):  0.9865470852017937
[[947   8]
 [  7 153]]
              precision    recall  f1-score   support

           0       0.99      0.99      0.99       955
           1       0.95      0.96      0.95       160

    accuracy                           0.99      1115
   macro avg       0.97      0.97      0.97      1115
weighted avg       0.99      0.99      0.99      1115

