In [1]:
import pandas as pd

In [2]:
# Load the SMS corpus. Despite the .csv extension the fields are tab-separated,
# and the column names are supplied here rather than read from the file.
messages = pd.read_csv(
    'SMSSpamCollection.csv',
    sep='\t',
    names=['label', 'message'],
)

In [3]:
# Preview the first rows to confirm the label/message columns parsed as intended.
messages.head()

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [4]:
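# One-time setup: uncomment and run once to fetch the NLTK data
# (stopwords, wordnet) used by the text-cleaning step in this notebook.
# from nltk import download
# download(['stopwords','wordnet'])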

In [5]:
from string import punctuation
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [6]:
_ENGLISH_STOPWORDS = None  # filled on first use by _english_stopwords()


def _english_stopwords():
    """Return NLTK's English stop-word list as a frozenset, loading it once."""
    global _ENGLISH_STOPWORDS
    if _ENGLISH_STOPWORDS is None:
        _ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))
    return _ENGLISH_STOPWORDS


def clean_sms(message):
    """Tokenise one SMS into lemmatised, stop-word-free tokens.

    Steps, in order:
      1. drop every character found in ``string.punctuation``;
      2. split the remaining text on whitespace;
      3. discard tokens that appear in NLTK's English stop-word list;
      4. lemmatise the surviving tokens with ``WordNetLemmatizer``.

    Parameters
    ----------
    message : str
        Raw text of a single message.

    Returns
    -------
    list of str
        The cleaned tokens, in their original order. Empty when nothing
        survives the filtering.
    """
    # The stop-word lookup used to sit inside the comprehension's filter, so
    # the whole list was rebuilt and scanned linearly for every single token.
    # Fetch it once as a set instead; membership results are unchanged.
    stop_words = _english_stopwords()

    punc_free = ''.join(char for char in message if char not in punctuation)

    # NOTE(review): the comparison is case-sensitive and tokens are not
    # lower-cased, so capitalised stop words (e.g. at sentence start) are
    # presumably kept -- confirm whether that is intended.
    lem = WordNetLemmatizer()
    return [
        lem.lemmatize(word)
        for word in punc_free.split()
        if word not in stop_words
    ]

In [7]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

In [8]:
from sklearn.naive_bayes import MultinomialNB

In [9]:
from sklearn.pipeline import Pipeline

In [10]:
# End-to-end spam classifier: raw text in, predicted label out.
nlp = Pipeline(
    steps=[
        # Token counts, using clean_sms as the analyzer for each message.
        ('Bag of Words', CountVectorizer(analyzer=clean_sms)),
        # Re-weight the raw counts by TF-IDF.
        ('TF-IDF Value', TfidfTransformer()),
        # Multinomial Naive Bayes on the weighted features.
        ('Naive Bayes', MultinomialNB()),
    ]
)

In [11]:
from sklearn.model_selection import train_test_split

In [12]:
# Hold out 30% of the messages for evaluation. The seed is fixed so that a
# Restart & Run All reproduces the same split and therefore the same metrics;
# without it every run shuffles differently.
message_train, message_test, label_train, label_test = train_test_split(
    messages['message'],
    messages['label'],
    test_size=0.3,
    random_state=42,
)

In [13]:
# Fit every stage of the pipeline on the training split only.
nlp.fit(message_train,label_train)

Pipeline(steps=[('Bag of Words',
                 CountVectorizer(analyzer=<function clean_sms at 0x000002439856BB80>)),
                ('TF-IDF Value', TfidfTransformer()),
                ('Naive Bayes', MultinomialNB())])

In [14]:
# Predict labels for the held-out messages with the fitted pipeline.
predictions = nlp.predict(message_test)

In [15]:
from sklearn.metrics import classification_report

In [16]:
# Summarise per-class precision, recall and F1 on the test split.
print(classification_report(label_test,predictions))

              precision    recall  f1-score   support

         ham       0.96      1.00      0.98      1450
        spam       1.00      0.73      0.85       222

    accuracy                           0.96      1672
   macro avg       0.98      0.87      0.91      1672
weighted avg       0.97      0.96      0.96      1672

