In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# visualization
import seaborn as sns
import matplotlib.pyplot as plt

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for folder, _subdirs, files in os.walk('/kaggle/input'):
    # Build the full path of every file found directly inside this folder
    full_paths = (os.path.join(folder, name) for name in files)
    for path in full_paths:
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the SMS spam dataset into a DataFrame.
# The file is decoded as latin-1 instead of pandas' default UTF-8 — presumably because
# it contains bytes that are not valid UTF-8; verify against the dataset.
sms = pd.read_csv('/kaggle/input/sms-spam-collection-dataset/spam.csv',encoding='latin-1')

In [None]:
# Preview the first five rows of the raw data
sms.head()

In [None]:
# List the column names of the raw frame
sms.columns

In [None]:
# Discard the three 'Unnamed' columns; they are not used anywhere below
unwanted_columns = ['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4']
sms = sms.drop(columns=unwanted_columns)

In [None]:
# Give the two remaining columns descriptive names:
# the first column (v1) becomes 'label', the second (v2) becomes 'message'
new_names = ['label', 'message']
sms.columns = new_names

# Preview the first five rows with the new names
sms.head()

In [None]:
# Count the missing (null) values in each column
sms.isnull().sum()

There are no null values present in the data

In [None]:
# Find blank messages, i.e. messages that are empty or contain only whitespace.
# The test is applied to each message string in the 'message' column (not to the
# DataFrame object), and it does not depend on how many columns the frame has.
# Non-string entries such as NaN compare unequal to '' and are therefore not counted.
blank_mask = sms['message'].str.strip().eq('')

# Index labels of the rows whose message is blank
empty = sms.index[blank_mask].tolist()

print(f"There are {len(empty)} empty string as message")

In [None]:
# Class balance: how many ham and how many spam messages are there?
label_counts = sms['label'].value_counts()
print(label_counts)
print('\n')
sns.countplot(data=sms, x='label')

**Let's check the length of each message and see whether length can help us differentiate between ham and spam.**

In [None]:
# Add a 'length' column holding the number of characters in each message
message_lengths = sms['message'].map(len)
sms['length'] = message_lengths

# Preview the first five rows with the new column
sms.head()

In [None]:
# Overlaid histograms of message length for ham and spam

# Logarithmic x-axis with matching log-spaced bin edges (powers of 1.16)
plt.xscale('log')
bins = np.power(1.16, np.arange(1, 50))

is_ham = sms['label'] == 'ham'
is_spam = sms['label'] == 'spam'

sms.loc[is_ham, 'length'].plot(kind='hist', bins=bins, alpha=0.8)
sms.loc[is_spam, 'length'].plot(kind='hist', bins=bins, alpha=0.8)

plt.legend(['ham', 'spam'])

**Clearly, spam messages tend to be longer than ham messages.**

# Data Cleaning

**Punctuation**

In [None]:
# Let's remove the punctuation
import string

# Function to remove the punctuation
# Translation table that deletes every character in string.punctuation
# (built once and reused by every call)
_PUNCT_TABLE = str.maketrans('', '', string.punctuation)


def remove_punct(text):
    """Return ``text`` with every character in ``string.punctuation`` removed.

    All other characters (letters, digits, whitespace, non-ASCII symbols) are
    kept in their original order.

    Parameters
    ----------
    text : str
        The message to clean.

    Returns
    -------
    str
        The message without ASCII punctuation.
    """
    # str.translate drops the unwanted characters in a single pass instead of
    # rebuilding the string one character at a time.
    return text.translate(_PUNCT_TABLE)


# Try the function on the first message to check that it works
single_message = sms.loc[0, 'message']
cleaned_message = remove_punct(single_message)

print('message before removing the punctuation:')
print('\n')
print(single_message)
print('\n')
print('message after removing the punctuation:')
print()
print(cleaned_message)

In [None]:
# Strip punctuation from every message in the dataset
sms['message'] = sms['message'].map(remove_punct)

**Stemming**

In [None]:
# To perform stemming we will use Porterstemmer from nltk
from nltk.stem.porter import PorterStemmer
p_stemmer = PorterStemmer()

# Function to do the stemming
def stemming(text):
    """Split ``text`` into words and return the list of Porter-stemmed words.

    Parameters
    ----------
    text : str
        The message to stem.

    Returns
    -------
    list of str
        One stemmed token per whitespace-separated word in ``text``.
    """
    # split() with no argument splits on any run of whitespace, so repeated
    # spaces, tabs and newlines never produce empty-string tokens
    # (split(' ') would).
    return [p_stemmer.stem(word) for word in text.split()]
    
# Try the function on one message (row 4)
single_message = sms.loc[4, 'message']
stemmed_message = stemming(single_message)

print('message before stemming:')
print('\n')
print(single_message)
print('\n')
print('message after stemming:')
print()
print(stemmed_message)

Notice how 'goes' became 'goe' and 'lives' became 'live'. We will keep each message as a list of words for some time and convert it back to text before creating the model.

In [None]:
# Apply the stemming function to every message.
# After this cell the 'message' column holds lists of stemmed words instead of strings,
# so the cell is not idempotent: running it a second time fails because stemming()
# calls .split() on its argument.
sms['message'] = sms['message'].apply(stemming)

**Stopwords**

In [None]:
# Stopwords can be is,are,the etc.
# We will use stopword method from nltk corpus library
from nltk.corpus import stopwords

# function to remove the stopword
# Lazily filled cache so the stopword corpus is loaded once instead of once per word
_STOPWORD_CACHE = {}


def _english_stopwords():
    """Return the NLTK English stopwords as a frozenset, loading them on first use."""
    if 'english' not in _STOPWORD_CACHE:
        _STOPWORD_CACHE['english'] = frozenset(stopwords.words('english'))
    return _STOPWORD_CACHE['english']


def remove_stopwords(text):
    """Drop English stopwords from a list of words and join the rest into one string.

    Parameters
    ----------
    text : iterable of str
        The words of one message.

    Returns
    -------
    str
        The remaining words separated by single spaces.
    """
    # NOTE(review): matching is exact and case-sensitive. In this notebook the words
    # arrive already stemmed, so stopwords whose stem differs from the listed form
    # (e.g. 'this' -> 'thi') are presumably not removed — confirm, and consider
    # removing stopwords before stemming.
    stop_words = _english_stopwords()
    kept = [word for word in text if word not in stop_words]

    # converting the list back to text
    return ' '.join(kept)

# Try the function on one message (row 4)
single_message = sms.loc[4, 'message']
filtered_message = remove_stopwords(single_message)

print('message before removing the stopwords:')
print('\n')
print(single_message)
print('\n')
print('message after removing the stopwords:')
print()
print(filtered_message)

**'he','here','to' are removed from the message.**

In [None]:
# Remove stopwords from every message; each message becomes a plain string again
sms['message'] = sms['message'].map(remove_stopwords)

**Let's create a model. We will use Naive Bayes, LinearSVC and LogisticRegression to create and train our models and then compare their scores.**

In [None]:
# Libraries needed
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC

from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.pipeline import Pipeline

from sklearn.metrics import accuracy_score,classification_report,confusion_matrix

**TfidfVectorizer builds a sparse matrix of word counts for each message and then converts those counts into TF-IDF features.**

In [None]:
# Split the data into features (X) and target (y)

X = sms['message']
y = sms['label']

# Encode the target explicitly: spam -> 1, ham -> 0.
# Comparing against 'spam' pins which class is the positive one and always yields an
# integer array, independent of category ordering and of the dtype that
# pd.get_dummies returns (bool on recent pandas versions).
encoded_y = (y == 'spam').astype(int).to_numpy()

**If y = 1 the message is spam; if y = 0 it is ham.**

In [None]:
# Hold out 25% of the data as the test set.
# random_state makes the split (and every score below) reproducible across runs;
# stratify keeps the ham/spam ratio the same in the train and test sets.
X_train,X_test,y_train,y_test = train_test_split(
    X, encoded_y, test_size=0.25, random_state=42, stratify=encoded_y
)

**Logistic Regression**

In [None]:
# TF-IDF features feeding a logistic-regression classifier
logistic_steps = [
    ('vector', TfidfVectorizer()),
    ('classifier', LogisticRegression()),
]
logistic = Pipeline(steps=logistic_steps)

logistic.fit(X_train, y_train)

In [None]:
# Predict labels for the held-out test messages with the logistic pipeline
predict_log = logistic.predict(X_test)

Evaluation of Logistic Regression model

In [None]:
# Accuracy, per-class report and confusion matrix for the logistic model
acc_log = accuracy_score(y_test, predict_log)
print(f"Accuracy of log model is {np.round(acc_log*100,2)}%")
print('\n')
print(classification_report(y_test, predict_log))
print('\n')

class_names = ['ham', 'spam']
cm_log = confusion_matrix(y_test, predict_log)

ax = plt.subplot()
sns.heatmap(cm_log, annot=True, cmap='flare', fmt='g', ax=ax)

ax.xaxis.set_ticklabels(class_names)
ax.yaxis.set_ticklabels(class_names)

**Naive_Bayes**

In [None]:
# TF-IDF features feeding a multinomial Naive Bayes classifier
naive_steps = [
    ('vector', TfidfVectorizer()),
    ('classifier', MultinomialNB()),
]
naive = Pipeline(steps=naive_steps)

naive.fit(X_train, y_train)

In [None]:
# Predict labels for the held-out test messages with the Naive Bayes pipeline
predict_naive = naive.predict(X_test)

Evaluation of Naive_Bayes model

In [None]:
# Accuracy, per-class report and confusion matrix for the Naive Bayes model.
# The accuracy must be computed from predict_naive (the Naive Bayes predictions),
# not from the logistic model's predictions.
acc_naive = accuracy_score(y_test,predict_naive)
print(f"Accuracy of naive model is {np.round(acc_naive*100,2)}%")
print('\n')
print(classification_report(y_test,predict_naive))
print('\n')

ax=plt.subplot()
sns.heatmap(confusion_matrix(y_test,predict_naive),annot = True,cmap='flare',fmt='g',ax=ax)

# Rows are true classes, columns are predicted classes; 0 = ham, 1 = spam
ax.xaxis.set_ticklabels(['ham', 'spam'])
ax.yaxis.set_ticklabels(['ham', 'spam'])

**Linear SVC**

In [None]:
# TF-IDF features feeding a linear support-vector classifier
svc_steps = [
    ('vector', TfidfVectorizer()),
    ('classifier', LinearSVC()),
]
svc = Pipeline(steps=svc_steps)

svc.fit(X_train, y_train)

In [None]:
# Predict labels for the held-out test messages with the LinearSVC pipeline
predict_svc = svc.predict(X_test)

Evaluation of LinearSVC model

In [None]:
# Accuracy, per-class report and confusion matrix for the LinearSVC model
acc_svc = accuracy_score(y_test, predict_svc)
print(f"Accuracy of svc model is {np.round(acc_svc*100,2)}%")
print('\n')
print(classification_report(y_test, predict_svc))
print('\n')

class_names = ['ham', 'spam']
cm_svc = confusion_matrix(y_test, predict_svc)

ax = plt.subplot()
sns.heatmap(cm_svc, annot=True, cmap='flare', fmt='g', ax=ax)

ax.xaxis.set_ticklabels(class_names)
ax.yaxis.set_ticklabels(class_names)

**Let's compare the accuracy of all three models**

In [None]:
# Collect the test-set accuracy of each model, keyed by a short model name
accuracy = dict(naive=acc_naive, svc=acc_svc, logistic=acc_log)

model_names = accuracy.keys()
model_scores = accuracy.values()

sns.set_style('darkgrid')
plt.plot(
    model_names,
    model_scores,
    color='blue',
    marker='*',
    markeredgecolor='red',
    markeredgewidth=4,
)
# Zoom the y-axis into the 0.9-1.0 range so small differences are visible
plt.ylim(0.9, 1.0)

**As we can see in the above graph, the Linear SVC model performed better than the other models.**

**Let's check the cross-validation score for Linear SVC.**

In [None]:
from sklearn.model_selection import cross_val_score

# Cross-validate the LinearSVC pipeline defined above on the full dataset,
# using 15 folds; one score is returned per fold
score = cross_val_score(estimator=svc, X=X, y=encoded_y, cv=15)

In [None]:
# Let's check the score for each of the 15 folds
print(score)

In [None]:
# Average of the per-fold scores, shown as a percentage rounded to two decimals
print(f"The average score is {np.round(score.mean()*100,2)}")

# Thank You