In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file in it.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import pandas as pd 
import chardet
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Detect the encoding of the input CSV from its first 100000 bytes. Passing the right
# encoding to read_csv() avoids errors caused by an encoding mismatch.
with open('../input/sms-spam-collection-dataset/spam.csv', 'rb') as csv_bytes:
    sample = csv_bytes.read(100000)
encode_style = chardet.detect(sample)
print(encode_style)

In [None]:
# Load the SMS spam CSV into a DataFrame; the encoding is hard-coded to Windows-1252.
data = pd.read_csv('../input/sms-spam-collection-dataset/spam.csv',encoding='Windows-1252')

In [None]:
# Check the column dtypes and non-null counts.
# The last three columns (Unnamed: 2, Unnamed: 3, Unnamed: 4) are mostly null. To confirm this, the next cell runs another check using isnull().
data.info()

In [None]:
# Count the missing values per column. There are no missing values in the first two columns,
# but most values are missing in the last three columns, so those three columns will be dropped.
data.isnull().sum()


In [None]:
# Drop the last three (mostly empty) columns.
# Re-assigning the result instead of using inplace=True, together with errors='ignore',
# lets this cell be re-run without raising a KeyError for columns that are already gone.
data = data.drop(columns=['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], errors='ignore')

In [None]:
# To increase readability, rename the first two columns (v1 and v2).
# rename() maps by column name, so the cell can be re-run safely even after
# further columns have been added to the frame.
data = data.rename(columns={'v1': 'Label', 'v2': 'Message'})

In [None]:
# Preview the first three rows of the data.
data.head(3)

In [None]:
# Summarise the columns with describe() to get an overview of their contents.
data.describe()

In [None]:
# Create a new column holding the length of each message, to compare the
# characteristics of the two message labels. The vectorised .str accessor
# replaces a Python-level apply(len) call per row.
data['Message_length'] = data['Message'].str.len()

In [None]:
# Compare the distribution of message lengths for each label.
data.groupby('Label')['Message_length'].describe()

The mean length of spam messages is larger than the mean length of ham messages; spam messages are usually longer than non-spam messages. To verify this, we will check the distribution plot.

In [None]:
# Draw one message-length histogram per label and give every panel the same axis labels.
dist_message = data['Message_length'].hist(bins=100, by=data['Label'], figsize=(10, 6))
for ax in dist_message:
    ax.set_xlabel("Message Length")
    ax.set_ylabel("Frequency")

Using this basic EDA we can see that spam messages tend to be longer. In the distribution for the ham label, we can also see that there is one message whose length is much greater than that of the other messages in the ham group. It is difficult to read the actual length of this long message from the plot.
From the output of our previous data.groupby('Label')['Message_length'].describe(), we
can see that the maximum length is 910.
Now, we can also check which message in the ham group this is.

In [None]:
# Show the longest message in the ham group. The row is located from the data itself
# rather than by hard-coding the maximum length (910) read off the describe() output.
ham_lengths = data.loc[data['Label'] == 'ham', 'Message_length']
data.loc[ham_lengths.idxmax(), 'Message']

# Data pre-processing

We need to clean the messages before processing them further.

In [None]:
import string
from nltk.corpus import stopwords

To convert the plain text strings into lists of tokens (these tokens will be needed in the next step), we will use the following function:

In [None]:
 

def text_clean(message):
    """Turn a raw message into a list of tokens.

    Punctuation characters are removed first, then the text is split on
    whitespace and English stop words are dropped (compared case-insensitively).

    Parameters
    ----------
    message : str
        The raw message text.

    Returns
    -------
    list of str
        The remaining words, in their original order and original case.
    """
    # Build the stop-word set once per call: calling stopwords.words() again for
    # every single word is needlessly slow, and a set gives constant-time lookups.
    stop_words = set(stopwords.words('english'))

    # First, remove all punctuation.
    punc_filtered = "".join(ch for ch in message if ch not in string.punctuation)

    # Second, remove all stop words. split() without an argument splits on any run
    # of whitespace, so repeated spaces never produce empty-string tokens.
    return [word for word in punc_filtered.split() if word.lower() not in stop_words]

As an example, we can see how text_clean() works on the messages:


In [None]:
# This cell is only a demonstration, so tokenise the first few messages rather than
# running text_clean() over the whole column and displaying every row.
data['Message'].head().apply(text_clean)

In [None]:
# The original dataframe is left unchanged, because the result of apply() was not assigned back:
data.head()

Before performing vectorization, we will divide the dataset into a training set and a test set to avoid data leakage. Once these partitions are made, we will convert each of these sets (training and test) into vectors.


In [None]:
from sklearn.model_selection import train_test_split

Here, we are using only the 'Message' column to perform the classification, and we are taking 70% of the dataset as training data and the remaining 30% as the test set.

In [None]:
# Hold out 30% of the messages as the test set.
# A fixed random_state makes the split (and every vocabulary index and score derived
# from it) reproducible across runs; stratifying on the label keeps the ham/spam
# proportion the same in the training and test partitions.
x_Train, x_Test, y_Train, y_Test = train_test_split(
    data['Message'],
    data['Label'],
    test_size=0.3,
    random_state=42,
    stratify=data['Label'],
)

# Vectorization

To convert the messages into vectors that scikit-learn's models can work with, we will use the
bag-of-words model:

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

We will use the text_clean() function defined above as the analyzer:

In [None]:
# Learn the bag-of-words vocabulary from the training messages only,
# using text_clean() to tokenise each message.
train_bow = CountVectorizer(analyzer=text_clean).fit(x_Train)

In [None]:
# total number of vocab words
print(len(train_bow.vocabulary_))

In [None]:
# to see the entire vocabulary
#train_bow.vocabulary_   # execute this command to see the entire vocabulary and the index position of each word

In [None]:
# Take one message as a sample to see how the bag-of-words model encodes it.
# The sample message is: "U dun say so early hor... U c already then say..."
Samp_message = data.loc[3, 'Message']
bow_samp = train_bow.transform([Samp_message])
print(bow_samp)



From this output, we can see that there are seven unique words (after removing common stop words) in the sample message (which is actually the 4th message of the dataframe).
Two of them appear twice, the rest only once. We can check and confirm which words appear twice.

In [None]:
# Print the words that occur twice in the sample message.
# The column indices are read from the sparse row itself, because hard-coded
# positions stop being valid whenever the fitted vocabulary changes.
# get_feature_names() was removed in scikit-learn 1.2, so prefer
# get_feature_names_out() and fall back only on older versions.
if hasattr(train_bow, 'get_feature_names_out'):
    feature_names = train_bow.get_feature_names_out()
else:
    feature_names = train_bow.get_feature_names()
for column, count in zip(bow_samp.indices, bow_samp.data):
    if count == 2:
        print(feature_names[column])

As expected, 'U' and 'say' are the two words that appear twice.

Now we transform all the messages of the training set:


In [None]:
# Convert every training message into its bag-of-words count vector.
train_matrix = train_bow.transform(x_Train)

As a result, we get a sparse matrix. To get the shape of this matrix and to check the number of non-zero entries in it:


In [None]:
# Report the dimensions of the sparse matrix and how many non-zero entries it stores.
print('Shape of Sparse Matrix: ', train_matrix.shape)
print('Amount of Non-Zero occurences: ', train_matrix.nnz)

To assign a weight to each word of the vocabulary, we will use TF-IDF. Words with a higher frequency are assigned a lower weight, while rare words with a lower frequency are assigned a higher weight.

In [None]:
from sklearn.feature_extraction.text import TfidfTransformer

In [None]:
# Learn the IDF weights from the training count matrix.
tf_idf_train = TfidfTransformer().fit(train_matrix)

In [None]:
# Apply the fitted TF-IDF weighting to the training count matrix.
messages_tf_idf_train = tf_idf_train.transform(train_matrix)

To confirm that a word with a higher frequency is given a lower weight than a word with a lower frequency, we will consider two words from the vocabulary: 'want' and 'come' (the more frequent one).

In [None]:
# Print the learned IDF weight of 'want' first, then of 'come'.
for word in ('want', 'come'):
    print(tf_idf_train.idf_[train_bow.vocabulary_[word]])

As the word 'come' is more frequent than 'want' in the training messages, it receives a lower weight.

Now, we will convert the test set. 


In [None]:
# Encode the test messages with the vocabulary that was learned from the training set.
test_matrix = train_bow.transform(x_Test)

In [None]:
tf_idf_test = TfidfTransformer().fit(test_matrix)

In [None]:
messages_tf_idf_test = tf_idf_test.transform(test_matrix)

Here we will use two scikit-learn models, Naive Bayes and KNN, and compare their accuracy.

# Naive Bayes

In [None]:
from sklearn.naive_bayes import MultinomialNB

We will create an instance of Naive Bayes and fit it using our training data.

In [None]:
# Multinomial Naive Bayes classifier with its default settings.
nb = MultinomialNB()


In [None]:
# Train the Naive Bayes model on the TF-IDF features of the training messages.
nb.fit(messages_tf_idf_train,y_Train)

In [None]:
# Predict a label for every message in the test set.
y_pred = nb.predict(messages_tf_idf_test)

To check the performance of the model, we will use the classification report and the accuracy score from scikit-learn's metrics module.

In [None]:
from sklearn.metrics import classification_report,accuracy_score

In [None]:
# Print the classification report, followed by the accuracy, of the predictions in y_pred.
print("Classification report is: ", classification_report(y_Test, y_pred), sep="\n")
print("Accuracy Score is: ", accuracy_score(y_Test, y_pred), sep="\n")

# KNN:

We could use GridSearchCV to identify the best value of K for the KNN model. Instead of using GridSearchCV, we can estimate the best value of K in the following way:

We assume that the best value of K lies between 1 and 39. For each K in this range, we create an instance of KNN and measure the prediction error made by that instance.

In [None]:
from sklearn.neighbors import KNeighborsClassifier

In [None]:
import numpy as np

# Record the misclassification rate on the test set for every k from 1 to 39.
# NOTE(review): k is being selected on the test set, so the scores later reported
# for the chosen k are optimistic — consider tuning on a validation split instead.
error_rate = []
for k in range(1, 40):
    knn = KNeighborsClassifier(n_neighbors=k).fit(messages_tf_idf_train, y_Train)
    y_pred_elbow = knn.predict(messages_tf_idf_test)
    error_rate.append(np.mean(y_Test != y_pred_elbow))

In [None]:
# Plot the recorded error rate against the number of neighbours.
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(range(1, 40), error_rate, linestyle='--', marker='o', markersize=8, markerfacecolor='red')
ax.set_title('Error Rate vs. K Value')
ax.set_xlabel('K')
ax.set_ylabel('Error Rate')

From the above plot, we can see that the error rate increases after K=5. The minimum error that we can get in this task is for K=3 or K=5.
The error rate for K=4 is higher than for K=3 or K=5.
We will check the performance for each of these K values.

In [None]:
# Fit a 3-nearest-neighbours classifier and predict the test-set labels with it.
knn_3 = KNeighborsClassifier(n_neighbors=3).fit(messages_tf_idf_train, y_Train)
y_pred = knn_3.predict(messages_tf_idf_test)

In [None]:
# Print the classification report, followed by the accuracy, of the predictions in y_pred.
print("Classification report is: ", classification_report(y_Test, y_pred), sep="\n")
print("Accuracy Score is: ", accuracy_score(y_Test, y_pred), sep="\n")

In [None]:
#For K= 5

In [None]:
# Fit a 5-nearest-neighbours classifier and predict the test-set labels with it.
knn_5 = KNeighborsClassifier(n_neighbors=5).fit(messages_tf_idf_train, y_Train)
y_pred = knn_5.predict(messages_tf_idf_test)

In [None]:
# Print the classification report, followed by the accuracy, of the predictions in y_pred.
print("Classification report is: ", classification_report(y_Test, y_pred), sep="\n")
print("Accuracy Score is: ", accuracy_score(y_Test, y_pred), sep="\n")

For K=4,

In [None]:
# Fit a 4-nearest-neighbours classifier and predict the test-set labels with it.
knn_4 = KNeighborsClassifier(n_neighbors=4).fit(messages_tf_idf_train, y_Train)
y_pred = knn_4.predict(messages_tf_idf_test)

In [None]:
# Print the classification report, followed by the accuracy, of the predictions in y_pred.
print("Classification report is: ", classification_report(y_Test, y_pred), sep="\n")
print("Accuracy Score is: ", accuracy_score(y_Test, y_pred), sep="\n")


The accuracy score for K=4 is lower than for K=3 or K=5 (as is already visible in the error-rate vs. K plot). Between K=3 and K=5, K=3 is the better choice in terms of accuracy, precision and recall.

Thank you!