In [1]:
#importing Required libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import confusion_matrix
from sklearn import metrics
from sklearn.metrics import roc_curve, auc
%matplotlib inline

In [2]:
#reading the dataset 
text = open("/SMSSpamCollection")
sms = text.read()
list_of_sms = sms.split("\n")
print("Number of SMS present : {}".format(len(list_of_sms)))
label = []
message = []
for index in range(len(list_of_sms)-2):
  one_sms = list_of_sms[index]
  temp = one_sms.split("\t")
  label.append(temp[0])
  message.append(temp[1])

Number of SMS present : 5575


In [3]:
#converting the dataset into pandas Dataframe
sms_data = pd.DataFrame(label)
sms_data["Mesaage"] = message
sms_data.columns = ['Label','Message']
sms_data.head()

Unnamed: 0,Label,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [4]:
print(sms_data.shape)
#prints out descriptive statistics of data
sms_data.describe(include="all")

(5573, 2)


Unnamed: 0,Label,Message
count,5573,5573
unique,2,5170
top,ham,"Sorry, I'll call later"
freq,4826,30


In [5]:
sms_data['Label'].value_counts()

ham     4826
spam     747
Name: Label, dtype: int64

In [6]:
#view of 5 last rows of data
sms_data.tail()

Unnamed: 0,Label,Message
5568,spam,REMINDER FROM O2: To get 2.50 pounds free call...
5569,spam,This is the 2nd time we have tried 2 contact u...
5570,ham,Will ü b going to esplanade fr home?
5571,ham,"Pity, * was in mood for that. So...any other s..."
5572,ham,The guy did some bitching but I acted like i'd...


In [7]:
#Inspecting the data types of columns present in our data
sms_data.dtypes

Label      object
Message    object
dtype: object

In [8]:
def cleanpunc(sentence): #function to clean the word of any punctuation or special characters
    cleaned = re.sub(r'[?|!|\'|"|#]',r'',sentence)
    #cleaned = re.sub(r'...',r' ',cleaned)
    cleaned = re.sub(r'[.|,|)|(|\|/|*]',r' ',cleaned)
    return  cleaned

In [9]:
#Cleaning Messages
msgs = []
for msg in sms_data['Message'].values:
  cleaned_msg = cleanpunc(msg)
  msgs.append(cleaned_msg)
msgs[0]

'Go until jurong point  crazy   Available only in bugis n great world la e buffet    Cine there got amore wat   '

In [10]:
#creating a new column with name CleanedMessage
sms_data['CleanedMessage'] = msgs

In [11]:
sms_data.head()

Unnamed: 0,Label,Message,CleanedMessage
0,ham,"Go until jurong point, crazy.. Available only ...",Go until jurong point crazy Available only ...
1,ham,Ok lar... Joking wif u oni...,Ok lar Joking wif u oni
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...,U dun say so early hor U c already then say
4,ham,"Nah I don't think he goes to usf, he lives aro...",Nah I dont think he goes to usf he lives arou...


In [12]:
sms_data.drop('Message' , axis=1 , inplace=True)

In [13]:
sms_data.head()

Unnamed: 0,Label,CleanedMessage
0,ham,Go until jurong point crazy Available only ...
1,ham,Ok lar Joking wif u oni
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor U c already then say
4,ham,Nah I dont think he goes to usf he lives arou...


In [14]:
label = sms_data['Label']
sms_data.drop('Label',axis=1,inplace=True)

In [15]:
#splitting the dataset into train-test sets
from sklearn.model_selection import train_test_split
X_train , X_test , Y_train , Y_test = train_test_split(sms_data , label , test_size = 0.2 ,random_state = 17 ,stratify = label )
print(X_train.shape)
print(X_test.shape)
print(Y_train.shape)
print(Y_test.shape)

(4458, 1)
(1115, 1)
(4458,)
(1115,)


In [16]:
X_train.head()

Unnamed: 0,CleanedMessage
3416,He remains a bro amongst bros
1076,Oi Ami parchi na re Kicchu kaaj korte iccha ...
4431,Yar lor How u noe U used dat route too
1143,Have you had a good day Mine was really busy a...
1121,Cancel cheyyamoand get some money back


# Bag of words Document Term Matrix (DTM)

In [17]:
#converting messages(collection of strings) to numbers
bow = CountVectorizer()
bow.fit(X_train['CleanedMessage'].values)
bow_dtm_train = bow.transform(X_train['CleanedMessage'].values)
bow_dtm_test = bow.transform(X_test['CleanedMessage'].values)

In [18]:
print(bow_dtm_train.shape)
print(bow_dtm_test.shape)

(4458, 7892)
(1115, 7892)


# Machine Learning : Implementing K-NN

In [19]:
from sklearn.neighbors import KNeighborsClassifier
optimal_k = 5
knn_bow = KNeighborsClassifier(optimal_k)
knn_bow.fit(bow_dtm_train,Y_train)
prediction_bow = knn_bow.predict(bow_dtm_test)

In [23]:
from sklearn.metrics import accuracy_score
accuracy_bow = accuracy_score(prediction_bow,Y_test)*100
print("The accuracy of K-NN model is {}% ".format(accuracy_bow))

The accuracy of K-NN model is 91.56950672645739% 


# Machine Learning : Implementing Logistic Regression

In [24]:
lr = LogisticRegression()
lr.fit(bow_dtm_train , Y_train)
prediction_lr = lr.predict(bow_dtm_test)

In [26]:
accuracy_bow = accuracy_score(prediction_lr,Y_test)*100
print("The accuracy of Logistic Regression model is {}% ".format(accuracy_bow))

The accuracy of Logistic Regression model is 98.02690582959642% 


____
_______

# Logistic Regression outperforms K-NN with accuracy = 98.02%