In [30]:
# import libraries

import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [18]:
# read the SMS corpus from disk, report its dimensions, preview the first rows

data = pd.read_csv('sms_dataset.csv')
print(data.shape)
data.head(n=5)

(5574, 2)


Unnamed: 0,label,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [15]:
# utility for text preprocessing

# Filled on first use by _english_stop_words(); kept at module level so the
# stop-word list is built once rather than once per token.
_ENGLISH_STOP_WORDS = None


def _english_stop_words():
    """Return the NLTK English stop words as a cached frozenset."""
    global _ENGLISH_STOP_WORDS
    if _ENGLISH_STOP_WORDS is None:
        _ENGLISH_STOP_WORDS = frozenset(stopwords.words('english'))
    return _ENGLISH_STOP_WORDS


def preprocess_text(text):
    """Normalise one message into a space-separated string of stemmed tokens.

    Steps, in order: lower-case the text, tokenize it with ``word_tokenize``,
    drop every token found in the NLTK English stop-word list, stem the
    remaining tokens with ``PorterStemmer``, and join them with single spaces.

    Parameters
    ----------
    text : str
        Raw message text. Must provide ``.lower()``; a non-string value
        (for example a NaN from a missing cell) raises ``AttributeError``.

    Returns
    -------
    str
        The processed document; an empty string when no token survives.
    """
    # set membership is O(1); the original rebuilt the list for each token
    stop_words = _english_stop_words()

    # change the words in lower case, then tokenize
    tokens = word_tokenize(text.lower())

    # remove stop words and stem the survivors in a single pass
    stemmer = PorterStemmer()
    words = [stemmer.stem(token) for token in tokens if token not in stop_words]

    # join the words to form document
    return ' '.join(words)

In [19]:
# apply preprocessing on data

# Keep an untouched copy of the messages the first time this cell runs and
# always derive `text` from it, so re-running the cell does not preprocess
# text that has already been stemmed.
if 'raw_text' not in data.columns:
    data['raw_text'] = data['text']

data['text'] = data['raw_text'].apply(preprocess_text)

In [36]:
# build the bag-of-words document-term matrix from the preprocessed messages
# NOTE(review): the vocabulary is learned from every row of data['text'],
# including rows that the later train_test_split call holds out for testing.

vectorizer = CountVectorizer()
X = vectorizer.fit_transform(raw_documents=data['text'])

In [37]:
# create X and y

# Convert to a dense array only while X still offers .toarray(), so running
# this cell a second time does not fail on an already-converted array.
if hasattr(X, 'toarray'):
    X = X.toarray()

# encode the target: ham -> 0, spam -> 1
y = data['label'].map({'ham': 0, 'spam': 1})

# .map yields NaN for any label missing from the mapping; fail here with a
# clear message instead of carrying NaN targets into model fitting.
if y.isna().any():
    unexpected = sorted(data.loc[y.isna(), 'label'].astype(str).unique())
    raise ValueError(f"unexpected values in data['label']: {unexpected}")

In [38]:
# hold out 20% of the rows for testing (fixed seed), then print each split's shape

X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.8, random_state=100
)
print(*(part.shape for part in (X_train, X_test, y_train, y_test)), sep='\n')

(4459, 7407)
(1115, 7407)
(4459,)
(1115,)


In [39]:
# fit a multinomial naive Bayes classifier on the training split

model = MultinomialNB()
model.fit(X=X_train, y=y_train)

In [46]:
# predicted labels for the rows the model was trained on

y_train_pred = model.predict(X=X_train)

In [50]:
# fraction of training rows whose predicted label matches the true label

accuracy_score(y_true=y_train, y_pred=y_train_pred)

0.9932720340883606

In [51]:
# training confusion matrix: rows are true labels, columns are predicted labels
# (label order 0 = ham, 1 = spam)

confusion_matrix(y_true=y_train, y_pred=y_train_pred)

array([[3836,   14],
       [  16,  593]], dtype=int64)

In [52]:
# predicted labels for the held-out test rows

y_test_pred = model.predict(X=X_test)

In [53]:
# fraction of test rows whose predicted label matches the true label

accuracy_score(y_true=y_test, y_pred=y_test_pred)

0.9874439461883409

In [55]:
# test confusion matrix: rows are true labels, columns are predicted labels
# (label order 0 = ham, 1 = spam)

confusion_matrix(y_true=y_test, y_pred=y_test_pred)

array([[969,   8],
       [  6, 132]], dtype=int64)