### Importing the required modules/packages

In [56]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import re
import nltk
import string
import scipy as sp
import datetime
import pytz


from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import precision_recall_fscore_support as score
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import GradientBoostingClassifier
from sklearn import metrics
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler

from textblob import TextBlob, Word

from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer 
from nltk import word_tokenize

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

## Elastic Search for Metrics
from datetime import datetime
from elasticsearch import Elasticsearch


# Naive Bayes
from sklearn.naive_bayes import MultinomialNB         

# Logistic Regression
from sklearn.linear_model import LogisticRegression



### Loading file and looking into the dimensions of data

In [57]:
# Let text cells show up to 100 characters before pandas truncates them.
pd.set_option('display.max_colwidth', 100)

# Read the tab-separated SMS file. Column names are supplied explicitly,
# so the first line of the file is read as data rather than as a header.
raw_data = pd.read_csv(
    "SMSSpamCollection.tsv",
    sep='\t',
    names=['label', 'text'],
)
raw_data.head()

Unnamed: 0,label,text
0,ham,"Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there g..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive ...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives around here though"


In [58]:
# Dimensions of the loaded frame as (rows, columns).
print(raw_data.shape)

# Share of each label as a fraction of all rows (normalize=True divides by the grand total).
pd.crosstab(index=raw_data['label'], columns='label', normalize=True)

(5572, 2)


col_0,label
label,Unnamed: 1_level_1
ham,0.865937
spam,0.134063


In [59]:
# Build the train/test split.

# Features are the raw message text; the target is the ham/spam label.
X = raw_data['text']
y = raw_data['label']

# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=99,
)

# Calculate Null Accuracy

In [60]:
# Null accuracy: the class shares in the test set, i.e. the accuracy a model
# would get by always predicting a single class.
y_test_binary = np.where(y_test == 'ham', 1, 0)  # 'ham' becomes 1, every other label becomes 0
print('Percent Ham:', y_test_binary.mean())
print('Percent Spam:', 1 - y_test_binary.mean())

Percent Ham: 0.8624401913875598
Percent Spam: 0.13755980861244022


# Calculate Metrics and Push Them to Elasticsearch

In [62]:
@timeit
def calculate_metrics_push_to_es(algorithm_name_insert, test_parameters_insert, score, y_test,y_pred):
    """Compute precision/recall/F-score summaries for one test run and hand them to ES_Metric_Insert.

    The `timeit` decorator is not defined in the visible part of the notebook;
    it must already exist in the kernel when this cell runs.

    Parameters
    ----------
    algorithm_name_insert : dict
        Must contain the key 'Algorithm_Name'; its value becomes the first
        entry of the keyword list. The whole dict is forwarded unchanged.
    test_parameters_insert : dict
        Parameters of the test run. Every key and every value is appended to
        the keyword list, and the dict is forwarded under the key 'test_params'.
    score : float
        Accuracy of the run; printed with three decimals and forwarded
        multiplied by 100.
    y_test, y_pred
        True and predicted labels, passed straight to
        `precision_recall_fscore_support`.

    Returns
    -------
    tuple
        Always the empty tuple.
    """

    def _percent_scores(average):
        # One call per averaging mode; only precision, recall and F-score are used.
        precision, recall, fscore = precision_recall_fscore_support(y_test, y_pred, average=average)[:3]
        return {
            average + '_precision': precision * 100,
            average + '_recall': recall * 100,
            average + '_fscore': fscore * 100,
        }

    macro_score_insert = _percent_scores('macro')
    micro_score_insert = _percent_scores('micro')
    weighted_score_insert = _percent_scores('weighted')
    score_insert = {'score': score  * 100}

    ## Print Accuracy of the current Test
    print(algorithm_name_insert , ' pipeline test accuracy: %.3f' % score)

    ## Flatten the algorithm name and every parameter key/value into one list
    ## of keywords for easy tag search in Kibana
    flattened_params = [item for pair in test_parameters_insert.items() for item in pair]
    test_keywords = [algorithm_name_insert['Algorithm_Name']] + flattened_params

    ## Push the data to ElasticSearch
    ES_Metric_Insert(
        algorithm_name_insert,
        {'test_params': test_parameters_insert},
        {'test_keywords': test_keywords},
        score_insert,
        macro_score_insert,
        micro_score_insert,
        weighted_score_insert,
    )

    return ()

# Pushing Data into Elastic Search

In [63]:
@timeit
def ES_Metric_Insert(algorithm_name, test_parameters, test_keywordDict, score, macro_scores, micro_scores, weighted_scores):
    """Merge the given dicts into one document and index it into 'ml-performance-metrics'.

    The `timeit` decorator is not defined in the visible part of the notebook;
    it must already exist in the kernel when this cell runs.

    Parameters
    ----------
    algorithm_name, test_parameters, test_keywordDict, score,
    macro_scores, micro_scores, weighted_scores : dict
        Fragments merged into the document in this order. On duplicate keys,
        a later fragment overwrites an earlier one.

    Returns
    -------
    tuple
        Always the empty tuple.
    """
    # A new client with default connection settings is created on every call.
    es = Elasticsearch()

    # `datetime` here is the class from `from datetime import datetime`,
    # which rebinds the name bound earlier by `import datetime`.
    final_dict = {
        'timestamp': datetime.now(tz=pytz.utc),
        'author': 'Rahul',
    }
    fragments = (
        algorithm_name,
        test_parameters,
        test_keywordDict,
        score,
        macro_scores,
        micro_scores,
        weighted_scores,
    )
    for fragment in fragments:
        final_dict.update(fragment)

    # NOTE(review): `doc_type` is deprecated in Elasticsearch 7 and removed in 8 —
    # confirm the installed client/server versions still accept it.
    es.index(index="ml-performance-metrics", doc_type='text', body=final_dict)
    # Refresh the index after writing the document.
    es.indices.refresh(index="ml-performance-metrics")

    return ()

# Remove before finals

In [131]:
# Scratch TPOT experiment on the scikit-learn digits dataset (not the SMS data).
from tpot import TPOTClassifier
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split

digits = load_digits()

# Digits-specific names keep this cell from overwriting the SMS
# X_train / X_test / y_train / y_test split created earlier in the notebook.
# The split is seeded so the score printed below can be reproduced.
digits_X_train, digits_X_test, digits_y_train, digits_y_test = train_test_split(
    digits.data,
    digits.target,
    train_size=0.75,
    test_size=0.25,
    random_state=42,
)

pipeline_optimizer = TPOTClassifier(generations=5, population_size=20, cv=5,
                                    random_state=42, verbosity=2)
pipeline_optimizer.fit(digits_X_train, digits_y_train)
print(pipeline_optimizer.score(digits_X_test, digits_y_test))

# Write the best pipeline found to a standalone Python script.
pipeline_optimizer.export('tpot_exported_pipeline.py')



Optimization Progress:  33%|███▎      | 40/120 [03:54<04:58,  3.73s/pipeline]

Generation 1 - Current best internal CV score: 0.9606623246308652


Optimization Progress:  50%|█████     | 60/120 [05:12<05:50,  5.84s/pipeline]

Generation 2 - Current best internal CV score: 0.9718126276386357


Optimization Progress:  67%|██████▋   | 80/120 [07:34<08:38, 12.97s/pipeline]

Generation 3 - Current best internal CV score: 0.9718126276386357


Optimization Progress:  83%|████████▎ | 100/120 [14:44<04:50, 14.54s/pipeline]

Generation 4 - Current best internal CV score: 0.9814381404959083


                                                                              

Generation 5 - Current best internal CV score: 0.9822125486811804

Best pipeline: KNeighborsClassifier(input_matrix, n_neighbors=5, p=2, weights=distance)
0.9911111111111112




True