In [2]:
import nltk


In [3]:
#nltk.download_shell()
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/reefayatbinshahjahan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [4]:
#reading in the messages

messages = [line.rstrip() for line in open('SMSSpamCollection')]

In [5]:
print(len(messages))

5574


In [29]:
messages

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [7]:
import pandas as pd


  from pandas.core import (


In [8]:
## putting the messages into a pandas dataframe
messages = pd.read_csv('SMSSpamCollection', sep='\t', names = ['label', 'message'])

In [9]:
messages.head()

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [10]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
import string

In [11]:
puncs = string.punctuation
redun = stopwords.words('english')
mess = "I, Love, chocolate cake, this!"

#Testing logic for tokenizing 

mess_token = word_tokenize(mess)
mess_token_clean = []

for index, word in enumerate(mess_token):
    if (word not in puncs) and (word.lower() not in redun):
        mess_token_clean.append(word)
        

print(mess_token_clean)
print(puncs)



['Love', 'chocolate', 'cake']
!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~


In [12]:
##Preprocessing

def process_text(message):
    words = word_tokenize(message)
    words_clean = []

    for index, word in enumerate(words):
        if (word not in puncs) and (word.lower() not in redun):
            words_clean.append(word)

    return words_clean

In [13]:
test = messages['message'].apply(process_text)

test.head()

0    [Go, jurong, point, crazy, .., Available, bugi...
1             [Ok, lar, ..., Joking, wif, u, oni, ...]
2    [Free, entry, 2, wkly, comp, win, FA, Cup, fin...
3    [U, dun, say, early, hor, ..., U, c, already, ...
4    [Nah, n't, think, goes, usf, lives, around, th...
Name: message, dtype: object

In [14]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split

X = messages.drop('label', axis=1)
y = messages['label']

In [15]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state=42, shuffle= True, stratify=y)

In [16]:
messages_train = pd.concat([X_train, y_train], axis=1)
messages_test = pd.concat([X_test, y_test], axis=1)

In [17]:
bow_transformer_train = CountVectorizer(analyzer=process_text)
messages_bow_train = bow_transformer_train.fit_transform(messages_train['message'])

# Transform the test data using the same vectorizer
messages_bow_test = bow_transformer_train.transform(messages_test['message'])

In [18]:
from sklearn.feature_extraction.text import TfidfTransformer
# Initialize and fit the TfidfTransformer on the training data
tfidf_transformer = TfidfTransformer()
messages_tfidf_train = tfidf_transformer.fit_transform(messages_bow_train)

# Transform the test data using the same transformer
messages_tfidf_test = tfidf_transformer.transform(messages_bow_test)

In [19]:
# Train the Naive Bayes model
from sklearn.naive_bayes import MultinomialNB
spam_detect_model = MultinomialNB()
spam_detect_model.fit(messages_tfidf_train, messages_train['label'])

# Make predictions
all_predictions = spam_detect_model.predict(messages_tfidf_test)

In [20]:
from sklearn.metrics import classification_report
print (classification_report(messages_test['label'], all_predictions))

              precision    recall  f1-score   support

         ham       0.96      1.00      0.98       966
        spam       1.00      0.72      0.84       149

    accuracy                           0.96      1115
   macro avg       0.98      0.86      0.91      1115
weighted avg       0.96      0.96      0.96      1115



In [21]:
##Automating the process
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline

pipeline = Pipeline([
    ('bow', CountVectorizer(analyzer=process_text)),  # strings to token integer counts
    ('tfidf', TfidfTransformer()),  # integer counts to weighted TF-IDF scores
    ('classifier', MultinomialNB()),  # train on TF-IDF vectors w/ Naive Bayes classifier
])

# Fit the pipeline on training data
pipeline.fit(X_train['message'], y_train)

# Make predictions on test data
predictions = pipeline.predict(X_test['message'])

# Print classification report
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

         ham       0.96      1.00      0.98       966
        spam       1.00      0.72      0.84       149

    accuracy                           0.96      1115
   macro avg       0.98      0.86      0.91      1115
weighted avg       0.96      0.96      0.96      1115



In [22]:
##dumping the model, the CountVectorizer instance, and the TFIDF transformer instance
import pickle

# Save the vectorizer and transformer
with open('vectorizer.pkl', 'wb') as f:
    pickle.dump(bow_transformer_train, f)

with open('tfidf_transformer.pkl', 'wb') as f:
    pickle.dump(tfidf_transformer, f)

# Save the model
with open('model.pkl', 'wb') as f:
    pickle.dump(spam_detect_model, f)


In [23]:
##test

message = ['I! Love, going to The Movies!!']

message_bow = bow_transformer_train.transform(message)
message_tfidf_example = tfidf_transformer.transform(message_bow)

spam_detect_model.predict(message_tfidf_example)

array(['ham'], dtype='<U4')