# 1. Data import, cleaning and defining functions:

Importing the libraries (**NLTK for NLP**):

In [None]:
import pandas as pd
import seaborn as sns
sns.set_style('whitegrid')
import numpy as np
import matplotlib.pyplot as plt
import nltk
%matplotlib inline

Importing the SMS UCI dataset (spam vs ham labels):

In [None]:
messages = pd.read_csv("../input/sms-spam-collection-dataset/spam.csv",encoding='latin-1')
messages.head()

Drop extra columns and rename the other two:

In [None]:
# Discard the three unnamed spill-over columns, then give the remaining two
# columns descriptive names.
messages = messages.drop(columns=["Unnamed: 2", "Unnamed: 3", "Unnamed: 4"])
messages.columns = ["label", "text"]
messages.head()

In [None]:
messages.describe()

In [None]:
messages.groupby("label").describe()

Feature engineer a length of message column:

In [None]:
# Character count of every message, stored as an extra column.
messages['length'] = messages['text'].map(len)
messages.head()

In [None]:
messages.info()

In [None]:
messages.describe()

## Performance Functions defined:

Classification report and accuracy score:

In [None]:
def print_validation_report(y_true, y_pred):
    """Print the classification report and the accuracy for a set of predictions.

    ``y_true`` (reference labels) and ``y_pred`` (predicted labels) are passed
    straight through to ``classification_report`` and ``accuracy_score``.
    Nothing is returned; the results are only printed.
    """
    print("Classification Report")
    print(classification_report(y_true, y_pred))
    accuracy = accuracy_score(y_true, y_pred)
    print(f"Accuracy : {accuracy!s}")

Confusion matrix:

In [None]:
def plot_confusion_matrix(y_true, y_pred):
    """Draw the confusion matrix of ``y_true`` against ``y_pred`` as a heatmap.

    Each cell is annotated with its integer count. The y-axis is captioned
    'true label' and the x-axis 'predicted label', so callers must pass the
    reference labels first and the predictions second.
    """
    heatmap_options = dict(annot=True, fmt='d', linewidths=.5,
                           cmap="Blues", cbar=False)
    sns.heatmap(confusion_matrix(y_true, y_pred), **heatmap_options)
    plt.xlabel('predicted label')
    plt.ylabel('true label')

# 2. Exploratory Data Analysis (EDA)

Visualize the percentage of Spam vs. Ham on piechart:

In [None]:
# Share of each label, drawn as a pie chart with percentage annotations.
(
    messages["label"]
    .value_counts()
    .plot.pie(figsize=(8, 8), autopct='%1.1f%%', shadow=True)
)
plt.ylabel("Spam vs Ham")
plt.legend(["Ham", "Spam"])
plt.show()

A lot of messages are actually not spam. About 86% of our dataset consists of normal messages.

A very basic model would be one that predicts everything as ham. It would have a decent accuracy, but would it be a good model? No. We therefore need an evaluation metric that takes the class imbalance into account. Goal: we don't mind if we miss the odd spam message, but we surely don't want to mark a ham message as spam, i.e. precision is very important. Hence we will use the **fbeta** score as our evaluation metric, with beta < 1 so that it leans towards precision.

In [None]:
# One histogram of message length per label value, 50 bins each.
messages.hist(column='length',by='label',bins=50, figsize=(20,6))

Looks like spam texts are usually longer, and the message-length distribution of the whole dataset is bimodal.

## Wordclouds: SPAM vs HAM

Split into 2 datasets based on label:

In [None]:
# Independent copies of the rows belonging to each label.
ham = messages.loc[messages['label'].eq('ham')].copy()
spam = messages.loc[messages['label'].eq('spam')].copy()

ham.head()

**WordCloud Function:**

In [None]:
import wordcloud

def show_wordcloud(data, title):
    """Display a word cloud built from all messages in ``data['text']``.

    The messages are converted to strings and joined with single spaces, the
    wordcloud package's built-in stop words are excluded, and the resulting
    image is shown in a new figure headed by ``title``.
    """
    corpus = ' '.join(data['text'].astype(str))

    cloud = wordcloud.WordCloud(
        stopwords=set(wordcloud.STOPWORDS),
        background_color='white',
        colormap='viridis',
        width=800,
        height=600,
    )
    image = cloud.generate(corpus)

    plt.figure(figsize=(10, 7), frameon=True)
    plt.imshow(image, interpolation='bilinear')
    plt.axis('off')
    plt.title(title, fontsize=20)
    plt.show()

In [None]:
show_wordcloud(ham, "Ham top words")

In [None]:
show_wordcloud(spam, "Spam top words")

# 3. Pre-processing Data

In [None]:
messages.head()

Function to remove punctuation and stopwords:

In [None]:
import string
from nltk.corpus import stopwords
stopwords.words('english')[0:10] # Show some stop words

In [None]:
# Build the lookup structures once, when the cell runs. Asking NLTK for the
# stop-word list separately for every token, and then scanning that list
# linearly, dominates the run time once this function is used as a
# vectorizer analyzer over thousands of messages.
_ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))
# Translation table that deletes every character in string.punctuation.
_PUNCT_TABLE = str.maketrans('', '', string.punctuation)


def remove_punct_stop(mess):
    """
    Takes in a string of text, then performs the following:
    1. Remove all punctuation (the characters in ``string.punctuation``)
    2. Remove all English stopwords (compared case-insensitively)
    3. Returns a list of the remaining words, in their original case
    """
    # Strip the punctuation in a single pass over the message.
    nopunc = mess.translate(_PUNCT_TABLE)

    # Split on whitespace and keep only the words that are not stopwords.
    return [word for word in nopunc.split() if word.lower() not in _ENGLISH_STOPWORDS]

Check if the function works well:

In [None]:
messages['text'].apply(remove_punct_stop)

### Train-test Split:

(NOTE) For now, the length column is ignored as a feature, but it seems a good idea to add it later

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the messages for testing. The split is stratified on the
# label so that both subsets keep the dataset's ham/spam ratio (spam is the
# minority class), and seeded so that re-running the notebook gives the same
# split and therefore comparable scores.
msg_train, msg_test, label_train, label_test = train_test_split(
    messages['text'],
    messages['label'],
    test_size=0.3,
    stratify=messages['label'],
    random_state=42,
)

print(len(msg_train), len(msg_test), len(msg_train) + len(msg_test))

## Creating a Data Pipeline

For MultinomialNB (can be changed for any Classification algorithm)

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB

# Three-stage text classifier; swap the last step to try another estimator.
pipeline_NB = Pipeline(steps=[
    # raw message -> token counts, tokenised by the cleaning function remove_punct_stop
    ('bow', CountVectorizer(analyzer=remove_punct_stop)),
    # token counts -> weighted TF-IDF scores
    ('tfidf', TfidfTransformer()),
    # Multinomial Naive Bayes trained on the TF-IDF vectors
    ('classifier', MultinomialNB()),
])

(NOTE): Can SCALING the data after TFIDF increase the accuracy score of the model?

# 4. Model Training

## 4.1 Multinomial Naive-Bayes (NB)

In [None]:
pipeline_NB.fit(msg_train,label_train)

In [None]:
predictions = pipeline_NB.predict(msg_test)

In [None]:
from sklearn.metrics import classification_report, confusion_matrix,accuracy_score, fbeta_score

In [None]:
# scikit-learn metrics expect (y_true, y_pred). Scoring the 'spam' class with
# beta < 1 weights spam precision above recall, i.e. it penalises ham
# messages that get flagged as spam.
print(fbeta_score(label_test, predictions, beta=0.5, pos_label='spam'))

In [None]:
# True labels go first so the 'true label' / 'predicted label' axes are correct.
plot_confusion_matrix(label_test, predictions)

In [None]:
# True labels go first; otherwise per-class precision and recall are swapped.
print_validation_report(label_test, predictions)

## 4.2 KNN Classifier

Pipeline with GridSearchCV | Optimize for best param: n_neighbors

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV

pipeline_KNN = Pipeline(steps=[
    ('bow', CountVectorizer(analyzer=remove_punct_stop)),
    ('tfidf', TfidfTransformer()),
    ('clf_KNN', KNeighborsClassifier()),
])

# Candidate neighbourhood sizes for the KNN step.
parameters_KNN = {'clf_KNN__n_neighbors': (8, 15, 20)}

# 5-fold cross-validated search over the candidates, refitting the best pipeline.
grid_KNN = GridSearchCV(
    estimator=pipeline_KNN,
    param_grid=parameters_KNN,
    cv=5,
    refit=True,
    verbose=3,
)

grid_KNN.fit(msg_train, label_train)

Optimal K value is 15. It can also be found by plotting the error rate / accuracy score against K for K between 1 and 39; however, this will take LOTS of TIME! (script attached below)

In [None]:
grid_KNN.best_params_

In [None]:
grid_KNN.best_score_

**Best Score=Cross-validation score**

In [None]:
predictions = grid_KNN.predict(msg_test)

In [None]:
# scikit-learn metrics expect (y_true, y_pred). Scoring the 'spam' class with
# beta < 1 weights spam precision above recall, i.e. it penalises ham
# messages that get flagged as spam.
print(fbeta_score(label_test, predictions, beta=0.5, pos_label='spam'))

In [None]:
# True labels go first so the 'true label' / 'predicted label' axes are correct.
plot_confusion_matrix(label_test, predictions)

In [None]:
# True labels go first; otherwise per-class precision and recall are swapped.
print_validation_report(label_test, predictions)

Explore error rate and accuracy vs. K value. This will take a lot of time, as K varies over 39 values instead of the 3 tried in GridSearchCV, but it can be done!

error_rate = []
scores = []

# The vectorising steps do not depend on K, so fit them once on the training
# messages and reuse the resulting matrices instead of re-tokenising every
# message on each iteration.
bow_tfidf = Pipeline([('bow', CountVectorizer(analyzer=remove_punct_stop)),
                      ('tfidf', TfidfTransformer())])
train_features = bow_tfidf.fit_transform(msg_train)
test_features = bow_tfidf.transform(msg_test)

# NOTE: K is scored on the test split here, so use this curve for exploration
# only, not as an unbiased estimate of the final model's performance.
for k in range(1, 40):  # K = 1 .. 39 (the upper bound of range() is exclusive)
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(train_features, label_train)
    pred_k = knn.predict(test_features)
    scores.append(accuracy_score(label_test, pred_k))
    # Error rate = fraction of test messages whose prediction differs from the label.
    error_rate.append(np.mean(pred_k != label_test))

To plot the error rate/ accuracy use:

# The Matplotlib style name 'seaborn-whitegrid' was deprecated in Matplotlib
# 3.6 and later removed, so apply the white-grid style through seaborn itself,
# which this notebook already uses for styling.
sns.set_style('whitegrid')
plt.figure(figsize=(10, 6))
# One point per K value tried (1 .. 39), matching the length of error_rate.
plt.plot(range(1, 40), error_rate, color='blue', linestyle='dashed', marker='o',
         markerfacecolor='red', markersize=10)
plt.title('Error Rate vs. K Value')
plt.xlabel('K')
plt.ylabel('Error Rate')

## 4.3 SVM Classifier

Pipeline with GridSearchCV | Optimize for best param: gamma set to auto, optimize for C and usage of TFIDF (yes/no)

In [None]:
from sklearn.svm import SVC

pipeline_SVC = Pipeline(steps=[
    ('bow', CountVectorizer(analyzer=remove_punct_stop)),
    ('tfidf', TfidfTransformer()),
    ('clf_SVC', SVC(gamma='auto', C=1000)),
])

# Search grid: the TF-IDF step either absent (None) or present, combined
# with three candidate values of C.
parameters_SVC = {
    'tfidf': [None, TfidfTransformer()],
    'clf_SVC__C': [500, 1000, 1500],
}

grid_SVC = GridSearchCV(
    estimator=pipeline_SVC,
    param_grid=parameters_SVC,
    cv=5,
    refit=True,
    verbose=1,
)

grid_SVC.fit(msg_train, label_train)

In [None]:
grid_SVC.best_params_

In [None]:
grid_SVC.best_estimator_

In [None]:
grid_SVC.best_score_

**Best Score=Cross-validation score**

In [None]:
predictions = grid_SVC.predict(msg_test)

In [None]:
# scikit-learn metrics expect (y_true, y_pred). Scoring the 'spam' class with
# beta < 1 weights spam precision above recall, i.e. it penalises ham
# messages that get flagged as spam.
print(fbeta_score(label_test, predictions, beta=0.5, pos_label='spam'))

In [None]:
# True labels go first so the 'true label' / 'predicted label' axes are correct.
plot_confusion_matrix(label_test, predictions)

In [None]:
# True labels go first; otherwise per-class precision and recall are swapped.
print_validation_report(label_test, predictions)