In [1]:
import pandas as pd
import re 
import nltk 
from nltk.corpus import stopwords 
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score

In [3]:
df = pd.read_csv('dataset/SMSSpamCollection', sep='\t', names=['label', 'message'])
df.head()

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [4]:
df.shape

(5572, 2)

In [5]:
ps = PorterStemmer()

In [7]:
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('corpus')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Acer\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Acer\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.
[nltk_data] Error loading corpus: Package 'corpus' not found in index


False

In [8]:
corpus = []
for i in range(len(df)):
    review = re.sub('[^a-zA-Z]', ' ', df['message'][i])
    review = review.lower()
    review = nltk.word_tokenize(review)
    review = [ps.stem(word) for word in review if word not in set(stopwords.words('english'))]
    review = ' '.join(review)
    corpus.append(review)

In [9]:
cv = CountVectorizer(max_features=5000)

In [10]:
X = cv.fit_transform(corpus).toarray()
X.shape

(5572, 5000)

In [11]:
y = pd.get_dummies(df['label'])
y = y.iloc[:, 1]
y.head()

0    0
1    0
2    1
3    0
4    0
Name: spam, dtype: uint8

In [12]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [13]:
spam_detect_model = MultinomialNB().fit(X_train, y_train)

In [14]:
y_pred = spam_detect_model.predict(X_test)

In [15]:
a = {'Actual': y_test, 'Predicted': y_pred}
pd.DataFrame(a).tail()

Unnamed: 0,Actual,Predicted
2906,0,0
1270,0,0
3944,0,0
2124,1,1
253,0,0


In [16]:
confusion_matrix(y_test, y_pred)

array([[945,  10],
       [  8, 152]], dtype=int64)

In [17]:
accuracy_score(y_test, y_pred)

0.9838565022421525