### Importing the required modules/packages

In [132]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import re
import nltk
import string
import scipy as sp
import datetime
import pytz
import graphviz

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import precision_recall_fscore_support as score
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import GradientBoostingClassifier
from sklearn import metrics
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn import tree
from sklearn.svm.libsvm import cross_validation
from sklearn.model_selection import cross_validate
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import HashingVectorizer
from textblob import TextBlob, Word

from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer 
from nltk import word_tokenize

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

from random import randint

## Elastic Search for Metrics
from datetime import datetime
from elasticsearch import Elasticsearch


# Naive Bayes
from sklearn.naive_bayes import MultinomialNB         

# Logistic Regression
from sklearn.linear_model import LogisticRegression

# SVC
from sklearn.svm import SVC

# KNN Neighbors
from sklearn.neighbors import KNeighborsClassifier

# Decision tree 
from sklearn.tree import DecisionTreeClassifier

# Random forest 
from sklearn.ensemble import RandomForestClassifier


# Gradient Booster Classifier
from sklearn.ensemble import GradientBoostingClassifier

### Loading file and looking into the dimensions of data

In [3]:
# Read the tab-separated SMS corpus; column names are supplied explicitly,
# so the first row of the file is treated as data.
raw_data = pd.read_csv(
    "SMSSpamCollection.tsv",
    sep='\t',
    names=['label', 'text'],
)
# Show up to 100 characters per cell so the message text stays readable.
pd.options.display.max_colwidth = 100
raw_data.head()

Unnamed: 0,label,text
0,ham,"Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there g..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive ...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives around here though"


In [4]:
# Dataset dimensions as (rows, columns).
print(raw_data.shape)
# Relative frequency of each label over the whole dataset.
pd.crosstab(index=raw_data.label, columns='label', normalize=True)

(5572, 2)


col_0,label
label,Unnamed: 1_level_1
ham,0.865937
spam,0.134063


In [5]:
# Build the train/test split.

# Features are the raw message text; targets are the labels.
X, y = raw_data['text'], raw_data['label']

# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=99,
)

# Calculate Null Accuracy

In [6]:
# Null accuracy: the share of each class in the held-out labels.
# Encode the test labels as 1 for 'ham' and 0 for everything else (spam).
y_test_binary = np.where(y_test == 'ham', 1, 0)
ham_share = y_test_binary.mean()
print('Percent Ham:', ham_share)
print('Percent Spam:', 1 - ham_share)

Percent Ham: 0.8624401913875598
Percent Spam: 0.13755980861244022


In [7]:
class LemmaTokenizer:
    """Callable that tokenizes a text with NLTK and lemmatizes every token.

    Instances hold a single ``WordNetLemmatizer`` in ``self.wnl`` and can be
    called with one text to obtain its list of lemmatized tokens.
    """

    def __init__(self):
        self.wnl = WordNetLemmatizer()

    def __call__(self, articles):
        """Return the lemmatized tokens of ``articles`` as a list."""
        tokens = word_tokenize(articles)
        return list(map(self.wnl.lemmatize, tokens))

# Function to clean up the data through pre-processing

# Calculate Metrics and Push Them to Elasticsearch

In [114]:
def calculate_metrics_push_to_es(run_id_insert, algorithm_name_insert, test_parameters_insert, gs_best_parameters_pipe_spam_ham, score, test_scores_csv_means_std, y_test, y_pred):
    """Compute averaged precision/recall/F-score and hand everything to ES_Metric_Insert.

    Precision, recall and F-score are computed from ``y_test`` and ``y_pred``
    with 'macro', 'micro' and 'weighted' averaging, scaled by 100 and stored
    under ``<average>_precision`` / ``<average>_recall`` / ``<average>_fscore``.

    Args:
        run_id_insert, algorithm_name_insert, test_parameters_insert,
        gs_best_parameters_pipe_spam_ham, test_scores_csv_means_std:
            forwarded unchanged to ``ES_Metric_Insert``.
        score: numeric test score; printed with three decimals and stored
            under 'score' after being multiplied by 100.
        y_test: true labels.
        y_pred: predicted labels.

    Returns:
        An empty tuple.
    """
    scaled_by_average = {}
    for averaging in ('macro', 'micro', 'weighted'):
        prf = precision_recall_fscore_support(y_test, y_pred, average=averaging)
        scaled_by_average[averaging] = {
            averaging + '_precision': prf[0] * 100,
            averaging + '_recall': prf[1] * 100,
            averaging + '_fscore': prf[2] * 100,
        }

    score_insert = {'score': score * 100}

    ## Print Accuracy of the current Test
    print(algorithm_name_insert, ' pipeline test accuracy: %.3f' % score)

    ## Push the data to ElasticSearch
    ES_Metric_Insert(
        run_id_insert,
        algorithm_name_insert,
        test_parameters_insert,
        gs_best_parameters_pipe_spam_ham,
        score_insert,
        test_scores_csv_means_std,
        scaled_by_average['macro'],
        scaled_by_average['micro'],
        scaled_by_average['weighted'],
    )

    return ()

# Pushing Data into Elasticsearch

In [115]:
def ES_Metric_Insert(run_id_insert, algorithm_name, test_parameters, gs_best_parameters_pipe_spam_ham, score, test_scores_csv_means_std, macro_scores, micro_scores, weighted_scores):
    """Merge the given metric dicts into one document and index it in Elasticsearch.

    Every argument is a dict. They are merged with ``dict.update`` in the
    order listed below, together with a UTC 'timestamp' and a fixed 'author'
    entry, so a later dict overrides an earlier one on a key collision. The
    merged document is indexed into "ml-performance-metrics" and the index is
    then refreshed.

    Returns:
        An empty tuple.
    """
    es = Elasticsearch()
    index_name = "ml-performance-metrics"

    # Merge order matters: it fixes which value wins when keys collide.
    fragments = (
        run_id_insert,
        {'timestamp': datetime.now(tz=pytz.utc)},
        {'author': 'Rahul'},
        algorithm_name,
        test_parameters,
        gs_best_parameters_pipe_spam_ham,
        score,
        test_scores_csv_means_std,
        macro_scores,
        micro_scores,
        weighted_scores,
    )
    document = {}
    for fragment in fragments:
        document.update(fragment)

    es.index(index=index_name, doc_type='text', body=document)
    es.indices.refresh(index=index_name)

    return ()

In [126]:
def ML_Pipeline_Processing_And_Metrics(run_id,X_train, y_train, X_test, y_test, grid_search_parameters, gs_clf_pipe_spam_ham, cv_value, classifier_name):
    """Fit a grid search, score it on the test split and report the results.

    Args:
        run_id: identifier stored under 'run_id' so that all documents of one
            run can be retrieved together.
        X_train, y_train: data used to fit ``gs_clf_pipe_spam_ham``.
        X_test, y_test: data used for prediction and scoring.
        grid_search_parameters: dict whose keys name the searched parameters;
            the best value of each key is reported.
        gs_clf_pipe_spam_ham: unfitted search object exposing ``fit``,
            ``predict``, ``score``, ``best_params_``, ``best_index_``,
            ``cv_results_`` and ``estimator`` (e.g. ``GridSearchCV``). It must
            have been created with ``return_train_score=True``, because the
            train-score columns of ``cv_results_`` are read.
        cv_value: unused; kept so existing callers keep working.
        classifier_name: classifier whose ``str()`` representation, cut at the
            first "(", is reported as the algorithm name.

    Returns:
        An empty tuple.
    """
    gs_clf_pipe_spam_ham.fit(X_train, y_train)

    ## Find predictions for the pipeline
    y_pred = gs_clf_pipe_spam_ham.predict(X_test)

    ## Find score of predictions
    score_pipe_spam_ham = gs_clf_pipe_spam_ham.score(X_test, y_test)

    ## Best Grid Search Parameters selected for this case
    best_params = gs_clf_pipe_spam_ham.best_params_
    gs_best_parameters_pipe_spam_ham = {
        param_name: best_params[param_name]
        for param_name in sorted(grid_search_parameters)
    }

    ## Setting up for reporting to Screen and ElasticSearch

    ## Add Run Id for each run. This helps with fishing out the correct dataset in cloud
    run_id_insert = {'run_id': run_id}

    ## Save Classifier name as a string (text before the first "(" of its repr)
    classifer_name_only = str(classifier_name).split("(")[0]
    algorithm_name_insert = {'Algorithm_Name': classifer_name_only}

    ## Add Classifier Parameters to output.
    ## Use the estimator wrapped by the search object rather than a
    ## module-level global, so the function only depends on its arguments.
    test_parameters_insert = {'test_parameters': str(gs_clf_pipe_spam_ham.estimator)}

    ## Cross-validation statistics of the *best* candidate, so they describe
    ## the same parameter combination as gs_best_parameters_pipe_spam_ham.
    cv_results = gs_clf_pipe_spam_ham.cv_results_
    best_index = gs_clf_pipe_spam_ham.best_index_
    test_scores_csv_means_std = {
        'mean_fit_time': cv_results['mean_fit_time'][best_index],
        'std_fit_time': cv_results['std_fit_time'][best_index],
        'mean_test_score': cv_results['mean_test_score'][best_index] * 100,
        'std_test_score': cv_results['std_test_score'][best_index],
        'mean_train_score': cv_results['mean_train_score'][best_index] * 100,
        'std_train_score': cv_results['std_train_score'][best_index],
    }

    ## Send all the collected data to the metric collection and ES insert system.
    calculate_metrics_push_to_es(
        run_id_insert,
        algorithm_name_insert,
        test_parameters_insert,
        gs_best_parameters_pipe_spam_ham,
        score_pipe_spam_ham,
        test_scores_csv_means_std,
        y_test,
        y_pred,
    )

    return ()

# Full ML Pipeline

In [None]:
# Random six-digit identifier attached to every result produced by this run.
run_id = randint(100000, 999999)

## Cross_Val value
# Number of folds actually passed to GridSearchCV below.
cv_value = 2

# 10-fold splitter.
# NOTE(review): defined but never passed to GridSearchCV, which uses cv_value.
cv = KFold(n_splits=10)

# Pipeline steps in execution order: token counts -> tf-idf weighting -> classifier.
# A HashingVectorizer step ('hash') was tried as an alternative and is disabled.
# The 'clf' step starts as Naive Bayes and is swapped for each classifier below.
pipe_spam_ham_features = [
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
]

## Putting together the various classification algorithms to use
clfs = [
    MultinomialNB(),
    SVC(),
    KNeighborsClassifier(n_neighbors=5),
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    GradientBoostingClassifier(),
    LogisticRegression(),
]

## Setting up the pipeline
pipe_spam_ham = Pipeline(pipe_spam_ham_features)

grid_search_parameters = {
    'vect__stop_words': ('english', None),
    'vect__ngram_range': [(1, 1), (1, 2), (1, 3), (1, 4)],
    # max_df must be a float to mean a proportion of documents; the integer 1
    # would mean "ignore every term that occurs in more than one document".
    'vect__max_df': (0.9, 1.0),
    'vect__lowercase': (True, False),
    'vect__binary': (True, False),
    'tfidf__use_idf': (True, False),
    # NOTE(review): 'max' is not among the norms documented for
    # TfidfTransformer ('l1', 'l2', None); confirm the installed
    # scikit-learn version accepts it.
    'tfidf__norm': ('l1', 'l2', 'max'),
    'tfidf__smooth_idf': (True, False),
    'tfidf__sublinear_tf': (True, False),
}
# TODO: candidate parameters not yet part of the grid:
#   'vect__analyzer': ('word', 'char', 'char_wb'),
#   'vect__preprocessor': ('callable', 'None'),
#   'vect__tokenizer': (LemmaTokenizer()),

print(pipe_spam_ham_features)

## Trying out the various possible classifiers:
for classifier in clfs:

    ## Adding the classifier to be used
    pipe_spam_ham.set_params(clf=classifier)

    gs_clf_pipe_spam_ham = GridSearchCV(
        pipe_spam_ham,
        grid_search_parameters,
        n_jobs=-1,
        cv=cv_value,
        return_train_score=True,
    )

    ML_Pipeline_Processing_And_Metrics(
        run_id,
        X_train,
        y_train,
        X_test,
        y_test,
        grid_search_parameters,
        gs_clf_pipe_spam_ham,
        cv_value,
        classifier,
    )
       


[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)), ('tfidf', TfidfTransformer(norm='l2', smooth_idf=True, sublinear_tf=False, use_idf=True)), ('clf', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))]
