## Buiding Classification Model on Text Data - Spam Email Detection

In [76]:
import pandas as pd 
import numpy as np 
from sklearn.feature_extraction.text import CountVectorizer

In [77]:
df = pd.read_csv("Spam_ham_email.csv")
df.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


#### Basic pre-processing

In [78]:
df['Target_Category'] = np.where(df['Category'] =="ham",0,1)
df.head()

Unnamed: 0,Category,Message,Target_Category
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0


In [79]:
df['Message'] = df['Message'].astype(str).str.lower()
df.head(3)

Unnamed: 0,Category,Message,Target_Category
0,ham,"go until jurong point, crazy.. available only ...",0
1,ham,ok lar... joking wif u oni...,0
2,spam,free entry in 2 a wkly comp to win fa cup fina...,1


In [80]:
X = df[['Message']]
y = df[['Target_Category']]

#### Splitting the dataset into train and test sets

In [81]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

In [82]:
print(X_train.shape)
print(X_test.shape)

(4179, 1)
(1394, 1)


In [83]:
corpus_train = [i for i in X_train.Message]
vectorizer = CountVectorizer()
X_train = vectorizer.fit_transform(corpus_train)

In [84]:
corpus_test = [i for i in X_test.Message]
X_test = vectorizer.transform(corpus_test)

In [85]:
X_train

<4179x7506 sparse matrix of type '<class 'numpy.int64'>'
	with 55860 stored elements in Compressed Sparse Row format>

#### Model building

In [86]:
from sklearn.naive_bayes import MultinomialNB
model = MultinomialNB()


In [87]:
X_train = X_train.toarray()
X_train

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

#### Train model

In [88]:
model.fit(X_train, y_train)

  y = column_or_1d(y, warn=True)


In [89]:
prediction = model.predict(X_test.toarray())
prediction

array([0, 0, 0, ..., 0, 0, 0])

In [90]:
accuracy = model.score(X_test, y_test)
print("Accuracy:", accuracy)

Accuracy: 0.9856527977044476


In [91]:
from sklearn.metrics import confusion_matrix, accuracy_score
confusion_matrix(y_test, prediction)

array([[1202,    6],
       [  14,  172]], dtype=int64)

In [92]:
accuracy_score(y_test, prediction)

0.9856527977044476

#### Testing with new data

In [93]:
new_message1 = "WINNER!! As a valued network customer you have been selected to receivea Â£900 prize reward!Claim it now! Valid 12 hours only."
new_message2 = "It's okay!, you can refund me next week. I will wait"

In [94]:
test1 = vectorizer.transform([new_message1,])
test2 = vectorizer.transform([new_message2,])

In [95]:
# 1 for spam
#0 is not spam
print("prediction Category is :", model.predict(test1.toarray()))
print("prediction Category is :", model.predict(test2.toarray()))

prediction Category is : [1]
prediction Category is : [0]
