# Implementing a spam classifier | Natural Language Processing

In [1]:
# Read the tab-separated SMS spam collection into a DataFrame.
# The two column names ("label", "message") are supplied explicitly.

import pandas as pd

messages = pd.read_csv(
    "smsspamcollection/SMSSpamCollection",
    sep="\t",
    names=["label", "message"],
)
messages.head()


Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [11]:
#Data cleaning and preprocessing
# NOTE: stopwords.words() and WordNetLemmatizer need the NLTK 'stopwords' and
# 'wordnet' corpora to be available locally.
import nltk
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer


wordnet = WordNetLemmatizer()
# Build the stop-word set once, outside the loop: constructing it inside the
# comprehension would re-read and re-hash the whole list for every word.
english_stopwords = set(stopwords.words('english'))
corpus = []

# Iterate over the message values directly so the loop does not rely on the
# DataFrame having a default 0..n-1 index.
for message in messages['message']:
    # Replace every character that is not an ASCII letter with a space,
    # then lower-case the text and split it into word tokens.
    review = re.sub('[^a-zA-Z]', ' ', message)
    review = review.lower()
    review = review.split()

    # Drop stop words and lemmatize the remaining tokens.
    review = [wordnet.lemmatize(word) for word in review if word not in english_stopwords]
    review = ' '.join(review)
    corpus.append(review)

In [30]:
# Turn the cleaned corpus into TF-IDF features with scikit-learn

from sklearn.feature_extraction.text import TfidfVectorizer

cv = TfidfVectorizer()
# fit_transform learns the vocabulary/IDF weights and returns a sparse matrix;
# it is converted to a dense array afterwards.
tfidf_sparse = cv.fit_transform(corpus)
X = tfidf_sparse.toarray()
X

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [31]:
# Encode the labels as a binary target: 1 for 'spam', 0 otherwise.
# Comparing against the label value explicitly avoids depending on the
# positional column order of get_dummies, and the cast keeps an integer
# target regardless of the pandas version (newer releases produce boolean
# dummy columns by default).
y = (messages['label'] == 'spam').astype('uint8').values

In [32]:
# Split features and labels into train and test sets:
# 20% of the rows are held out, and the fixed random_state makes the split repeatable.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

In [33]:
# Fit a multinomial Naive Bayes classifier on the training split
from sklearn.naive_bayes import MultinomialNB

naive_bayes = MultinomialNB()
# fit() returns the fitted estimator itself, so both names refer to the same object.
spam_detecting_model = naive_bayes.fit(X_train, y_train)

In [34]:
# Evaluate the model: predict labels for the held-out test split.
# y_pred is compared against y_test by the metric cells below.
y_pred = spam_detecting_model.predict(X_test)

In [36]:
# Confusion matrix on the test split: rows are the true classes,
# columns are the predicted classes.
from sklearn.metrics import confusion_matrix

confusion = confusion_matrix(y_true=y_test, y_pred=y_pred)
confusion

array([[955,   0],
       [ 31, 129]])

In [37]:
# Fraction of test messages whose predicted label matches the true label
from sklearn.metrics import accuracy_score

score = accuracy_score(y_true=y_test, y_pred=y_pred)
score

0.9721973094170404

In [40]:
#save model for future reuse.
# The classifier only accepts TF-IDF vectors produced by the fitted vectorizer
# `cv`, so the vectorizer is persisted as well; without it, new raw messages
# could not be mapped into the feature space the model was trained on.
import joblib
joblib.dump(cv, 'tfidf_vectorizer.ml')
# Kept as the last expression so the cell still displays the model file name.
joblib.dump(spam_detecting_model, 'spamdetect.ml')

['spamdetect.ml']

In [42]:
# Reload the persisted classifier and check that it predicts on the test split.
# joblib.load unpickles the file, so only load files from a trusted source.
model = joblib.load(filename='spamdetect.ml')
model.predict(X_test)


array([0, 0, 0, ..., 0, 1, 0], dtype=uint8)