# **1. Data import, cleaning and defining functions:**

In [None]:

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)


#Importing the libraries (NLTK for NLP):
import seaborn as sns
sns.set_style('whitegrid')
import matplotlib.pyplot as plt
import nltk
from nltk.tokenize import word_tokenize
%matplotlib inline

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))
        
        
        
# Import Matplot and Seaborn
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
import string
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from nltk.stem import SnowballStemmer
from nltk.corpus import stopwords
%matplotlib inline


**Importing the SMS dataset and displaying its structure:**

In [None]:
# Read the raw SMS CSV, decoding it as latin-1, then display the resulting frame.
messages = pd.read_csv("../input/sms-spam-collection-dataset/spam.csv",encoding='latin-1')
messages

**Drop extra columns and rename the other two:**

In [None]:
# Discard the three unnamed columns, then give the two remaining ones descriptive names.
messages = messages.drop(columns=["Unnamed: 2", "Unnamed: 3", "Unnamed: 4"])
messages.columns = ["label", "text"]
messages

In [None]:
# Per-label summary of the text column: number of messages, number of distinct
# messages, and the most frequent message together with how often it occurs.
messages.groupby("label").describe()

**Feature engineering:** add a message-length column:

In [None]:
# Number of characters in each message, stored as a new 'length' column.
messages['length'] = messages['text'].map(len)
messages

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric
# columns -- at this point only the message length:
messages.describe()

# **Performance Functions defined:**

Classification report and accuracy score:

In [None]:
def print_validation_report(y_true, y_pred):
    """Print a classification report followed by the accuracy score.

    Parameters
    ----------
    y_true : ground-truth labels (passed first to both metric functions).
    y_pred : predicted labels (passed second to both metric functions).

    Returns
    -------
    None. The report and the accuracy are written to standard output.
    """
    print("Classification Report")
    print(classification_report(y_true, y_pred))
    print("Accuracy : " + str(accuracy_score(y_true, y_pred)))
    
def plot_confusion_matrix(y_true, y_pred):
    """Draw the confusion matrix of `y_true` against `y_pred` as an annotated heatmap.

    Parameters
    ----------
    y_true : ground-truth labels.
    y_pred : predicted labels.

    The heatmap is drawn on the current matplotlib axes with integer cell
    annotations; the y axis is labelled 'true label' and the x axis
    'predicted label'. Nothing is returned.
    """
    heatmap_options = dict(annot=True, fmt='d', linewidths=.5, cmap="Blues", cbar=False)
    sns.heatmap(confusion_matrix(y_true, y_pred), **heatmap_options)
    plt.xlabel('predicted label')
    plt.ylabel('true label')

# **2. Exploratory Data Analysis (EDA)**

Visualize the percentage of spam vs. ham messages in a pie chart:

In [None]:
# Share of each label, drawn as a pie chart with percentage annotations.
messages["label"].value_counts().plot.pie(figsize=(8, 8), autopct='%1.1f%%', shadow=True)
plt.ylabel("Spam vs Ham")
plt.legend(["Ham", "Spam"])
plt.show()

In [None]:
# Histogram of message length (50 bins), drawn as a separate panel for each label.
messages.hist(column='length',by='label',bins=50, figsize=(20,6))

It looks like spam texts are usually longer, and the message-length distribution of the whole dataset is bimodal.

**Wordclouds: SPAM vs HAM**

Split into 2 datasets based on label:

In [None]:
# Independent copies of the rows carrying each label, used for the word clouds.
ham = messages.loc[messages['label'].eq('ham')].copy()
spam = messages.loc[messages['label'].eq('spam')].copy()

In [None]:
#WordCloud Function:
import wordcloud

def show_wordcloud(data, title):
    """Render a word cloud built from the 'text' column of `data`.

    Parameters
    ----------
    data : object indexable by 'text' whose values can be cast to str
        (a DataFrame in this notebook).
    title : str
        Title drawn above the figure.

    All texts are joined with single spaces, the word cloud package's own
    STOPWORDS are excluded, and the result is shown in a new 10x7 figure
    with the axes hidden. Nothing is returned.
    """
    corpus = ' '.join(data['text'].astype(str))
    cloud = wordcloud.WordCloud(
        stopwords=set(wordcloud.STOPWORDS),
        background_color='white',
        colormap='viridis',
        width=800,
        height=600,
    )
    rendered = cloud.generate(corpus)

    plt.figure(figsize=(10, 7), frameon=True)
    plt.imshow(rendered, interpolation='bilinear')
    plt.axis('off')
    plt.title(title, fontsize=20)
    plt.show()

In [None]:
    show_wordcloud(ham, "Ham top words")

In [None]:
show_wordcloud(spam, "Spam top words")

# 3. Pre-processing Data

Function to remove punctuation and stopwords:

In [None]:
stopwords.words('english')[0:10] # Show some stop words

In [None]:
# Stop-word sets already loaded, keyed by language name.
_STOPWORDS_BY_LANGUAGE = {}


def _stopword_set(language='english'):
    """Return the NLTK stop-word list for `language` as a cached frozenset."""
    if language not in _STOPWORDS_BY_LANGUAGE:
        _STOPWORDS_BY_LANGUAGE[language] = frozenset(stopwords.words(language))
    return _STOPWORDS_BY_LANGUAGE[language]


def remove_punct_stop(mess):
    """
    Takes in a string of text, then performs the following:
    1. Remove all punctuation (every character in string.punctuation)
    2. Remove all English stopwords (compared case-insensitively)
    3. Returns a list of the remaining words, in their original case

    The stop-word list is loaded once and kept as a set: the previous version
    re-evaluated stopwords.words('english') for every single word and scanned
    the resulting list linearly, which dominated the run time when this
    function is used as a CountVectorizer analyzer.
    """
    # Delete all punctuation characters in a single pass.
    nopunc = mess.translate(str.maketrans('', '', string.punctuation))

    # Hoisted out of the comprehension so the lookup set is fetched once per call.
    stop_words = _stopword_set('english')

    return [word for word in nopunc.split() if word.lower() not in stop_words]

In [None]:
messages['text'].apply(remove_punct_stop) #check if the function works well

# **Train-test Split:**

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the messages for testing.
# random_state pins the shuffle so the split is identical after a kernel restart;
# stratify keeps the ham/spam proportion the same in the train and test subsets.
msg_train, msg_test, label_train, label_test = train_test_split(
    messages['text'],
    messages['label'],
    test_size=0.3,
    random_state=42,
    stratify=messages['label'],
)

print('Train:', len(msg_train))
print('Test: ', len(msg_test))
print('Total:', len(msg_train) + len(msg_test))

Creating a Data Pipeline for Naive Bayes

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB

# Three stages: count tokens produced by the custom analyzer (remove_punct_stop),
# re-weight the counts to TF-IDF scores, then fit a multinomial Naive Bayes classifier.
pipeline_NB = Pipeline(steps=[
    ('bow', CountVectorizer(analyzer=remove_punct_stop)),
    ('tfidf', TfidfTransformer()),
    ('classifier', MultinomialNB()),
])

#  **4. Model Training**

# 4.1 Multinomial Naive-Bayes (NB)

In [None]:
pipeline_NB.fit(msg_train,label_train)

In [None]:
predictions = pipeline_NB.predict(msg_test)

In [None]:
from sklearn.metrics import classification_report, confusion_matrix,accuracy_score, fbeta_score
# scikit-learn metrics take (y_true, y_pred) in that order. Passing them swapped
# exchanges precision and recall, which changes F-beta whenever beta != 1.
print('Fbeta-measure, calculated using precision and recall:',fbeta_score(label_test, predictions, beta=0.5, pos_label='ham'))


In [None]:
# Ground truth first, predictions second, matching print_validation_report(y_true, y_pred);
# the swapped order exchanged the precision and recall columns of the report.
print_validation_report(label_test, predictions)

# 4.2 K-nearest neighbors(KNN) algorithm

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV

# Token counts -> TF-IDF -> k-nearest-neighbours classifier.
pipeline_KNN = Pipeline(steps=[
    ('bow', CountVectorizer(analyzer=remove_punct_stop)),
    ('tfidf', TfidfTransformer()),
    ('clf_KNN', KNeighborsClassifier()),
])

# Candidate neighbourhood sizes tried by the grid search.
parameters_KNN = {'clf_KNN__n_neighbors': (8, 15, 20)}

# 5-fold cross-validated search; refit=True retrains the best setting on the full training set.
grid_KNN = GridSearchCV(
    estimator=pipeline_KNN,
    param_grid=parameters_KNN,
    cv=5,
    refit=True,
    verbose=3,
)

grid_KNN.fit(msg_train, label_train)

In [None]:
# Best n_neighbors among the candidates tried by the grid search (15 in the author's run;
# the value depends on the train/test split). A wider range, e.g. K between 1 and 40,
# can be explored by plotting error rate / accuracy score against K.
grid_KNN.best_params_

In [None]:
# Best Score=Cross-validation score
grid_KNN.best_score_ 

In [None]:
predictions = grid_KNN.predict(msg_test)
# scikit-learn metrics take (y_true, y_pred) in that order. Passing them swapped
# exchanges precision and recall, which changes F-beta whenever beta != 1.
print('Fbeta-measure, calculated using precision and recall:',fbeta_score(label_test, predictions, beta=0.5, pos_label='ham'))

In [None]:
# Ground truth first, predictions second, matching print_validation_report(y_true, y_pred);
# the swapped order exchanged the precision and recall columns of the report.
print_validation_report(label_test, predictions)

# **4.3 Support vector machine(SVM) algorithm**

In [None]:
from sklearn.svm import SVC

# Token counts -> TF-IDF -> support vector classifier.
pipeline_SVC = Pipeline(steps=[
    ('bow', CountVectorizer(analyzer=remove_punct_stop)),
    ('tfidf', TfidfTransformer()),
    ('clf_SVC', SVC(gamma='auto', C=1000)),
])

# Search over the SVC regularisation parameter C, and over using a TfidfTransformer
# versus None for the 'tfidf' step.
parameters_SVC = {
    'tfidf': [None, TfidfTransformer()],
    'clf_SVC__C': [500, 1000, 1500],
}

grid_SVC = GridSearchCV(
    estimator=pipeline_SVC,
    param_grid=parameters_SVC,
    cv=5,
    refit=True,
    verbose=1,
)

grid_SVC.fit(msg_train, label_train)

In [None]:
grid_SVC.best_params_

In [None]:
# Best Score=Cross-validation score
grid_SVC.best_score_

In [None]:
predictions = grid_SVC.predict(msg_test)
# scikit-learn metrics take (y_true, y_pred) in that order. Passing them swapped
# exchanges precision and recall, which changes F-beta whenever beta != 1.
print('Fbeta-measure, calculated using precision and recall:',fbeta_score(label_test, predictions, beta=0.5, pos_label='ham'))

In [None]:
# Ground truth first, predictions second, matching print_validation_report(y_true, y_pred);
# the swapped order exchanged the precision and recall columns of the report.
print_validation_report(label_test, predictions)