### Importing the required modules/packages

In [253]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import re
import nltk
import string
import scipy as sp
import datetime
import pytz
import graphviz

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import precision_recall_fscore_support as score
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import GradientBoostingClassifier
from sklearn import metrics
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn import tree
from sklearn.svm.libsvm import cross_validation
from sklearn.model_selection import cross_validate


from textblob import TextBlob, Word

from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer 
from nltk import word_tokenize

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

from random import randint

## Elastic Search for Metrics
from datetime import datetime
from elasticsearch import Elasticsearch


# Naive Bayes
from sklearn.naive_bayes import MultinomialNB         

# Logistic Regression
from sklearn.linear_model import LogisticRegression

# SVC
from sklearn.svm import SVC

# KNN Neighbors
from sklearn.neighbors import KNeighborsClassifier

# Decision tree 
from sklearn.tree import DecisionTreeClassifier

# Random forest 
from sklearn.ensemble import RandomForestClassifier


# Gradient Booster Classifier
from sklearn.ensemble import GradientBoostingClassifier

### Loading file and looking into the dimensions of data

In [158]:
raw_data = pd.read_csv("SMSSpamCollection.tsv",sep='\t',names=['label','text'])
pd.set_option('display.max_colwidth',100)
raw_data.head()

Unnamed: 0,label,text
0,ham,"Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there g..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive ...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives around here though"


In [159]:
print(raw_data.shape)
pd.crosstab(raw_data['label'],columns = 'label',normalize=True)

(5572, 2)


col_0,label
label,Unnamed: 1_level_1
ham,0.865937
spam,0.134063


In [160]:
# Create Test Train Fit

# Define X and y.
X = raw_data.text
y = raw_data.label

# Split the new DataFrame into training and testing sets.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=99, test_size= 0.3)

# Calculate Null Accuracy

In [161]:
# Calculate null accuracy.
y_test_binary = np.where(y_test=='ham', 1, 0) # five stars become 1, one stars become 0
print('Percent Ham:', y_test_binary.mean())
print('Percent Spam:', 1 - y_test_binary.mean())

Percent Ham: 0.8624401913875598
Percent Spam: 0.13755980861244022


In [162]:
class LemmaTokenizer(object):
    def __init__(self):
        self.wnl = WordNetLemmatizer()
    def __call__(self, articles):
        return [self.wnl.lemmatize(t) for t in word_tokenize(articles)]

# Function to cleanup the data through pre-processing 

In [163]:
import string

from nltk.corpus import stopwords as sw
from nltk.corpus import wordnet as wn
from nltk import wordpunct_tokenize
from nltk import WordNetLemmatizer
from nltk import sent_tokenize
from nltk import pos_tag

from sklearn.base import BaseEstimator, TransformerMixin


class NLTKPreprocessor(BaseEstimator, TransformerMixin):

    def __init__(self, stopwords=None, punct=None,
                 lower=True, strip=True):
        self.lower      = lower
        self.strip      = strip
        self.stopwords  = stopwords or set(sw.words('english'))
        self.punct      = punct or set(string.punctuation)
        self.lemmatizer = WordNetLemmatizer()

#      def fit(self, X, y=None):
#          return self

    def inverse_transform(self, X):
        return [" ".join(doc) for doc in X]

    def transform(self, X):
        return [
            list(self.tokenize(doc)) for doc in X
        ]

    def tokenize(self, document):
        # Break the document into sentences
        for sent in sent_tokenize(document):
            # Break the sentence into part of speech tagged tokens
            for token, tag in pos_tag(wordpunct_tokenize(sent)):
                # Apply preprocessing to the token
                token = token.lower() if self.lower else token
                token = token.strip() if self.strip else token
                token = token.strip('_') if self.strip else token
                token = token.strip('*') if self.strip else token

                # If stopword, ignore token and continue
                if token in self.stopwords:
                    continue

                # If punctuation, ignore token and continue
                if all(char in self.punct for char in token):
                    continue

                # Lemmatize the token and yield
                lemma = self.lemmatize(token, tag)
                yield lemma

    def lemmatize(self, token, tag):
        tag = {
            'N': wn.NOUN,
            'V': wn.VERB,
            'R': wn.ADV,
            'J': wn.ADJ
        }.get(tag[0], wn.NOUN)

        return self.lemmatizer.lemmatize(token, tag)

# Calculate Metrics and Generate 

In [255]:


def calculate_metrics_push_to_es(run_id_insert, algorithm_name_insert, test_parameters_insert, score, scores_cv,test_scores_csv_means_std, y_test,y_pred):
    
    macro_score = precision_recall_fscore_support(y_test, y_pred, average='macro')
    micro_score = precision_recall_fscore_support(y_test, y_pred, average='micro')
    weighted_score = precision_recall_fscore_support(y_test, y_pred, average='weighted')
  

    macro_score_insert = {'macro_precision': macro_score[0] * 100, 'macro_recall': macro_score[1]  * 100, 'macro_fscore':macro_score[2]  * 100}
    micro_score_insert = {'micro_precision': micro_score[0] * 100, 'micro_recall': micro_score[1] * 100, 'micro_fscore':micro_score[2] * 100}
    weighted_score_insert = {'weighted_precision': weighted_score[0] * 100, 'weighted_recall': weighted_score[1] * 100, 'weighted_fscore':weighted_score[2] * 100}
    score_insert = {'score': score  * 100}
    scores_cv_insert = {'score_cv': str(scores_cv)}

    
    ## Print Accuracy of the current Test
    print(algorithm_name_insert , ' pipeline test accuracy: %.3f' % score)
    
    

    
    ## Push the data to ElasticSearch

    ES_Metric_Insert(run_id_insert, algorithm_name_insert, test_parameters_insert, score_insert, scores_cv_insert,test_scores_csv_means_std, macro_score_insert,micro_score_insert,weighted_score_insert)
    
    return()

# Pushing Data into Elastic Search

In [257]:
def ES_Metric_Insert(run_id_insert,algorithm_name, test_parameters, score, scores_cv, test_scores_csv_means_std, macro_scores, micro_scores, weighted_scores):
    es = Elasticsearch()
    
    final_dict = {}
    
    print(algorithm_name)
    
    my_current_time = datetime.now(tz=pytz.utc)
    timestamp_insert = {'timestamp': my_current_time}
    author_insert = {'author': 'Rahul'}
    final_dict.update(run_id_insert)
    final_dict.update(timestamp_insert)
    final_dict.update(author_insert)
    final_dict.update(algorithm_name)
    final_dict.update(test_parameters)
    final_dict.update(score)
    final_dict.update(scores_cv)
    final_dict.update(test_scores_csv_means_std)
    final_dict.update(macro_scores)
    final_dict.update(micro_scores)
    final_dict.update(weighted_scores)
        
    res = es.index(index="ml-performance-metrics", doc_type='text', body=final_dict)
    es.indices.refresh(index="ml-performance-metrics")


    return()

# Use Naive Bayes to predict the ham vs spam label.

# NB With CountVectorizer()

In [168]:
pipe_cv_nb_features = []    

pipe_cv_nb_features.append(('vect', CountVectorizer( tokenizer=LemmaTokenizer(), 
                                             lowercase=False, 
                                             min_df=1,
                                             max_features=100000, 
                                             ngram_range=(1, 4), 
                                             stop_words='english', 
                                             decode_error='replace')))
pipe_cv_nb_features.append(('nb', MultinomialNB()))

pipe_cv_nb = Pipeline(pipe_cv_nb_features)

pipe_cv_nb.fit(X_train, y_train)

y_pred = pipe_cv_nb.predict(X_test)

score_cv_nb_cv = pipe_cv_nb.score(X_test, y_test) * 100

algorithm_name_insert = {'Algorithm_Name':'Naive Bayes classifier'}
test_parameters_insert = {'vectorizer' : 'CountVectorizer', 'lowercase':'false','min_df': 1, 'max_features': 100000,'ngram_range': '1-4'}

calculate_metrics_push_to_es(algorithm_name_insert, test_parameters_insert, score_cv_nb_cv, y_test,y_pred)

{'Algorithm_Name': 'Naive Bayes classifier'}  pipeline test accuracy: 98.206


()

# NB With TfdifVectorizer()

In [169]:
pipe_tfdif_nb_features = []


#pipe_tfdif_nb_features.append(('preprocessor', NLTKPreprocessor()))
pipe_tfdif_nb_features.append(('vect', TfidfVectorizer(stop_words='english', 
                                             lowercase=False, 
                                             min_df=1,
                                             max_features=100000, 
                                             ngram_range=(1, 1))))

pipe_tfdif_nb_features.append(('nb', MultinomialNB()))

pipe_tfdif_nb = Pipeline(pipe_tfdif_nb_features)

pipe_tfdif_nb.fit(X_train, y_train)

print(pipe_tfdif_nb)

y_pred = pipe_tfdif_nb.predict(X_test)

score_cv_nb_tfidf = pipe_tfdif_nb.score(X_test, y_test)

algorithm_name_insert = {'Algorithm_Name':'Naive Bayes classifier'}

test_parameters_insert = {'vectorizer': 'Tfidf', 'lowercase':'false','min_df': 1, 'max_features': 100000,'ngram_range': '1-1'}


calculate_metrics_push_to_es(algorithm_name_insert, test_parameters_insert, score_cv_nb_tfidf, y_test,y_pred)

Pipeline(memory=None,
     steps=[('vect', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=False, max_df=1.0, max_features=100000, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,...True,
        vocabulary=None)), ('nb', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))])
{'Algorithm_Name': 'Naive Bayes classifier'}  pipeline test accuracy: 0.970


()

# Full ML Pipeline

In [258]:
pipe_spam_ham = []
pipe_spam_ham_features = []

run_id = randint(100000, 999999)


#pipe_tfdif_nb_features.append(('preprocessor', NLTKPreprocessor()))
pipe_spam_ham_features.append(('vect', TfidfVectorizer(stop_words='english', 
                                             lowercase=False, 
                                             min_df=1,
                                             max_features=100000, 
                                             ngram_range=(1, 1))))

## Initializing the classifier to Naieve Bayes
pipe_spam_ham_features.append(('clf', MultinomialNB()))

## Putting together the various classification algorithms to use
clfs = []
clfs.append(MultinomialNB())
clfs.append(SVC())
clfs.append(KNeighborsClassifier(n_neighbors=3))
clfs.append(DecisionTreeClassifier())
clfs.append(RandomForestClassifier())
clfs.append(GradientBoostingClassifier())
clfs.append(LogisticRegression())


## Setting up the pipeline
pipe_spam_ham = Pipeline(pipe_spam_ham_features)



## Trying out the various possible classifiers:

for classifier in clfs:
    
    ## Adding the classifier to be used
    pipe_spam_ham.set_params(clf = classifier)
    
    ## Adding individual parameters for each individual classifier
    
    
    
    
       
    pipe_spam_ham.fit(X_train, y_train)

    ## Find predictions for the pipeline
    y_pred = pipe_spam_ham.predict(X_test)
    
    ## Find score of predictions
    score_pipe_spam_ham = pipe_spam_ham.score(X_test, y_test)

    ## Cross Validate the scores
    scores_pipe_spam_ham_cv = cross_validate(pipe_spam_ham, X_train, y_train, cv=2)
    
    
    ## Setting up for reporting to Screen and ElasticSearch
    
    ## Add Run Id for each run. This helps with fishing out the correct dataset in cloud
    run_id_insert = {'run_id' : run_id}
    
    ## Save Classifier name as a string
    
    classifier_string = str(classifier)
    classifer_name_only = classifier_string.split("(")[0]
    
    algorithm_name_insert = {'Algorithm_Name' : classifer_name_only}
    
    ## Add Classifier Parameters to output
    test_parameters_insert = {'test_parameters' : str(pipe_spam_ham)}
    
    
    ## Breaking test cv scores and calculating mean and standard Deviation of each.
    test_scores_csv_means_std = {}
    
    for key, values in scores_pipe_spam_ham_cv.items():
            key_mean = key + '_mean'
            test_scores_csv_means_std[key_mean] = values.mean()
            key_std = key + '_std'
            test_scores_csv_means_std[key_std] = values.std()
    
    ## Send all the collected data to the metric collection and ES insert system.
    calculate_metrics_push_to_es(run_id_insert, algorithm_name_insert, test_parameters_insert, score_pipe_spam_ham, scores_pipe_spam_ham_cv, test_scores_csv_means_std, y_test,y_pred)


{'Algorithm_Name': 'MultinomialNB'}  pipeline test accuracy: 0.970
{'Algorithm_Name': 'MultinomialNB'}


  'precision', 'predicted', average, warn_for)


{'Algorithm_Name': 'SVC'}  pipeline test accuracy: 0.862
{'Algorithm_Name': 'SVC'}
{'Algorithm_Name': 'KNeighborsClassifier'}  pipeline test accuracy: 0.918
{'Algorithm_Name': 'KNeighborsClassifier'}
{'Algorithm_Name': 'DecisionTreeClassifier'}  pipeline test accuracy: 0.970
{'Algorithm_Name': 'DecisionTreeClassifier'}
{'Algorithm_Name': 'RandomForestClassifier'}  pipeline test accuracy: 0.964
{'Algorithm_Name': 'RandomForestClassifier'}
{'Algorithm_Name': 'GradientBoostingClassifier'}  pipeline test accuracy: 0.952
{'Algorithm_Name': 'GradientBoostingClassifier'}
{'Algorithm_Name': 'LogisticRegression'}  pipeline test accuracy: 0.950
{'Algorithm_Name': 'LogisticRegression'}
