### Importing the required modules/packages

In [110]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import re
import nltk
import string
import scipy as sp
import datetime
import pytz
import graphviz
import copy


from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import precision_recall_fscore_support as score
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import GradientBoostingClassifier
from sklearn import metrics
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn import tree
from sklearn.svm.libsvm import cross_validation
from sklearn.model_selection import cross_validate
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn import linear_model, decomposition
from sklearn.decomposition import PCA
from sklearn.decomposition import TruncatedSVD
from sklearn.random_projection import sparse_random_matrix

from textblob import TextBlob, Word

from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer 
from nltk import word_tokenize

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

from random import randint

## Elastic Search for Metrics
from datetime import datetime
from elasticsearch import Elasticsearch


# Naive Bayes
from sklearn.naive_bayes import MultinomialNB         

# Logistic Regression
from sklearn.linear_model import LogisticRegression

# SVC
from sklearn.svm import SVC

# KNN Neighbors
from sklearn.neighbors import KNeighborsClassifier

# Decision tree 
from sklearn.tree import DecisionTreeClassifier

# Random forest 
from sklearn.ensemble import RandomForestClassifier


# Gradient Booster Classifier
from sklearn.ensemble import GradientBoostingClassifier

### Loading file and looking into the dimensions of data

In [2]:
# Read the tab-separated SMS corpus. The file has no header row, so the two
# columns are named explicitly: the class label and the message text.
raw_data = pd.read_csv(
    "SMSSpamCollection.tsv",
    sep='\t',
    names=['label', 'text'],
)

# Show up to 100 characters per cell so the messages stay readable.
pd.set_option('display.max_colwidth', 100)

raw_data.head()

Unnamed: 0,label,text
0,ham,"Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there g..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive ...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives around here though"


In [3]:
# Dataset dimensions as (rows, columns).
print(raw_data.shape)

# Share of each label in the full dataset (normalize=True yields proportions).
pd.crosstab(index=raw_data['label'], columns='label', normalize=True)

(5572, 2)


col_0,label
label,Unnamed: 1_level_1
ham,0.865937
spam,0.134063


In [50]:
# NOTE: this cell needs X_train / X_test, which are created by the
# train/test split cell that appears further down in this notebook.

# Learn the vocabulary on the training messages only, then reuse that same
# vocabulary to encode the test messages.
vect = CountVectorizer()
X_train_dtm = vect.fit_transform(X_train)
X_test_dtm = vect.transform(X_test)

print('X_train Shape', X_train_dtm.shape)

# Last 50 features.
# Newer scikit-learn releases expose get_feature_names_out() and have dropped
# get_feature_names(); use whichever the installed version provides.
if hasattr(vect, 'get_feature_names_out'):
    feature_names = vect.get_feature_names_out().tolist()
else:
    feature_names = vect.get_feature_names()
print(feature_names[-50:])

X_train Shape (3900, 7234)
['yet', 'yetty', 'yetunde', 'yi', 'yijue', 'ym', 'ymca', 'yo', 'yoga', 'yogasana', 'yor', 'yorge', 'you', 'youdoing', 'youi', 'young', 'younger', 'youphone', 'your', 'youre', 'yourinclusive', 'yourjob', 'yours', 'yourself', 'youuuuu', 'youwanna', 'yowifes', 'yr', 'yrs', 'ystrday', 'yummmm', 'yummy', 'yun', 'yunny', 'yuo', 'yuou', 'yup', 'zaher', 'zealand', 'zebra', 'zed', 'zeros', 'zhong', 'zindgi', 'zoe', 'zogtorius', 'zoom', 'zouk', 'zyada', 'èn']


In [51]:
## CountVectorizer produced 7234 features (vocabulary terms) from the 3900 training messages.

In [86]:
## The fitted term -> column-index mapping can be inspected with:
# vect.vocabulary_

# Printing the sparse test matrix lists its non-zero entries as
# (document row, vocabulary column) followed by the term count.
print(X_test_dtm)

  (0, 1179)	1
  (0, 3379)	1
  (0, 3605)	1
  (0, 4374)	1
  (0, 4481)	1
  (0, 5611)	1
  (0, 6317)	1
  (0, 6477)	1
  (0, 7196)	2
  (1, 4598)	1
  (1, 6359)	1
  (2, 1172)	1
  (2, 2031)	1
  (2, 5225)	1
  (2, 6382)	1
  (2, 6985)	1
  (2, 6988)	1
  (2, 7017)	1
  (2, 7196)	1
  (3, 1486)	1
  (3, 3749)	1
  (3, 3872)	1
  (3, 5913)	1
  (4, 677)	1
  (4, 1021)	1
  :	:
  (1669, 3476)	1
  (1669, 3487)	1
  (1669, 4588)	1
  (1669, 6254)	1
  (1669, 6302)	1
  (1669, 6364)	1
  (1670, 868)	1
  (1670, 989)	1
  (1670, 1508)	1
  (1670, 4109)	1
  (1670, 6564)	1
  (1671, 875)	1
  (1671, 1205)	1
  (1671, 1295)	1
  (1671, 1516)	1
  (1671, 2734)	1
  (1671, 2916)	1
  (1671, 3151)	1
  (1671, 3352)	1
  (1671, 3493)	1
  (1671, 4313)	1
  (1671, 4364)	1
  (1671, 5939)	1
  (1671, 7046)	1
  (1671, 7196)	1


In [4]:
# Create Test Train Fit

# Define X and y.
# Features are the raw message texts; the target is the ham/spam label.
X = raw_data['text']
y = raw_data['label']

# Hold out 30% of the rows for testing; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=99,
)

# Calculate Null Accuracy

In [5]:
# Class balance of the test set, used as the null-accuracy baseline.
# Encode the test labels as 1 for 'ham' and 0 for anything else (spam).
y_test_binary = np.where(y_test == 'ham', 1, 0)

# The mean of the 0/1 encoding is the fraction of ham messages.
ham_fraction = y_test_binary.mean()
print('Percent Ham:', ham_fraction)
print('Percent Spam:', 1 - ham_fraction)

Percent Ham: 0.8624401913875598
Percent Spam: 0.13755980861244022


In [6]:
class LemmaTokenizer(object):
    """Callable tokenizer that splits a document into words and lemmatizes them.

    An instance is passed as one of the ``vect__tokenizer`` grid-search
    options, so it is called with a single document string.
    """

    def __init__(self):
        # One WordNet lemmatizer is created per tokenizer and reused on every call.
        self.wnl = WordNetLemmatizer()

    def __call__(self, articles):
        """Return the list of lemmatized tokens of the string ``articles``."""
        lemmas = []
        for token in word_tokenize(articles):
            lemmas.append(self.wnl.lemmatize(token))
        return lemmas

# Function to cleanup the data through pre-processing 

# Calculate Metrics and Push Them to Elasticsearch

In [7]:
def calculate_metrics_push_to_es(run_id_insert, algorithm_name_insert, test_parameters_insert, gs_best_parameters_pipe_spam_ham, score,test_scores_csv_means_std, y_test,y_pred):
    """Compute precision/recall/F-score summaries and forward everything to Elasticsearch.

    Parameters
    ----------
    run_id_insert, algorithm_name_insert, test_parameters_insert,
    gs_best_parameters_pipe_spam_ham, test_scores_csv_means_std : dict
        Metadata fragments that are passed through unchanged to
        ``ES_Metric_Insert``.
    score : float
        Test accuracy of the fitted search; printed with three decimals and
        stored under the key ``'score'``.
    y_test, y_pred
        True and predicted labels used for the precision/recall/F-score
        computation.

    Returns
    -------
    tuple
        Always the empty tuple.
    """
    # precision_recall_fscore_support returns (precision, recall, fscore, support);
    # the support entry is not used here.
    macro_p, macro_r, macro_f, _ = precision_recall_fscore_support(y_test, y_pred, average='macro')
    micro_p, micro_r, micro_f, _ = precision_recall_fscore_support(y_test, y_pred, average='micro')
    weighted_p, weighted_r, weighted_f, _ = precision_recall_fscore_support(y_test, y_pred, average='weighted')

    # Every metric is scaled by 100 before being stored.
    macro_score_insert = {
        'macro_precision': macro_p * 100,
        'macro_recall': macro_r * 100,
        'macro_fscore': macro_f * 100,
    }
    micro_score_insert = {
        'micro_precision': micro_p * 100,
        'micro_recall': micro_r * 100,
        'micro_fscore': micro_f * 100,
    }
    weighted_score_insert = {
        'weighted_precision': weighted_p * 100,
        'weighted_recall': weighted_r * 100,
        'weighted_fscore': weighted_f * 100,
    }
    score_insert = {'score': score}

    # Echo the accuracy to the notebook output before it is stored.
    print(score_insert)
    print(algorithm_name_insert , ' pipeline test accuracy: %.3f' % score)

    # Hand all fragments over for indexing.
    ES_Metric_Insert(
        run_id_insert,
        algorithm_name_insert,
        test_parameters_insert,
        gs_best_parameters_pipe_spam_ham,
        score_insert,
        test_scores_csv_means_std,
        macro_score_insert,
        micro_score_insert,
        weighted_score_insert,
    )

    return ()

# Pushing Data into Elastic Search

In [8]:
def ES_Metric_Insert(run_id_insert,algorithm_name, test_parameters, gs_best_parameters_pipe_spam_ham, score, test_scores_csv_means_std, macro_scores, micro_scores, weighted_scores):
    """Merge all metric fragments into one document and index it in Elasticsearch.

    Every argument is a dict. The dicts are merged in the order listed below,
    so a key that occurs in a later fragment overwrites the same key from an
    earlier one. A UTC timestamp and a fixed author entry are added right
    after the run id.

    The document is indexed into ``ml-performance-metrics`` with a default
    ``Elasticsearch()`` client, and the index is refreshed afterwards.

    Returns
    -------
    tuple
        Always the empty tuple.
    """
    es = Elasticsearch()

    # Fragments in merge order.
    fragments = (
        run_id_insert,
        {'timestamp': datetime.now(tz=pytz.utc)},
        {'author': 'Rahul'},
        algorithm_name,
        test_parameters,
        gs_best_parameters_pipe_spam_ham,
        score,
        test_scores_csv_means_std,
        macro_scores,
        micro_scores,
        weighted_scores,
    )

    final_dict = {}
    for fragment in fragments:
        final_dict.update(fragment)

    es.index(index="ml-performance-metrics", doc_type='text', body=final_dict)
    es.indices.refresh(index="ml-performance-metrics")

    return ()

# Processing the ML Pipeline and Calculate Metrics (using another function)

In [114]:
def ML_Pipeline_Processing_And_Metrics(run_id,X_train, y_train, X_test, y_test, grid_search_parameters, gs_clf_pipe_spam_ham, cv_value, classifier_name):
    """Fit a grid search, score it on the test split and report the results.

    Parameters
    ----------
    run_id
        Identifier stored with the metrics under the key ``'run_id'``.
    X_train, y_train
        Training data passed to ``gs_clf_pipe_spam_ham.fit``.
    X_test, y_test
        Held-out data used for ``predict`` and ``score``.
    grid_search_parameters : dict
        The parameter grid that was searched; its keys select which entries
        of ``best_params_`` are reported.
    gs_clf_pipe_spam_ham
        An unfitted grid-search object (``GridSearchCV`` in this notebook).
    cv_value
        Accepted for interface compatibility; not used inside this function.
    classifier_name
        Value whose ``str()`` starts with the classifier class name followed
        by ``'('``; only the part before the first ``'('`` is reported.

    Returns
    -------
    tuple
        Always the empty tuple.
    """
    gs_clf_pipe_spam_ham.fit(X_train, y_train)

    ## Predictions and accuracy (in percent) on the held-out data
    y_pred = gs_clf_pipe_spam_ham.predict(X_test)
    score_pipe_spam_ham = gs_clf_pipe_spam_ham.score(X_test, y_test) * 100

    ## Best grid-search parameters selected for this case
    best_params = gs_clf_pipe_spam_ham.best_params_
    gs_best_parameters_pipe_spam_ham = {}
    for param_name in sorted(grid_search_parameters.keys()):
        best_value = best_params[param_name]
        if param_name == 'vect__tokenizer':
            # The tokenizer is an object (or None). Record a plain string so
            # the document stays serializable, but report the value that
            # actually won instead of always claiming 'LemmaTokenizer'.
            best_value = 'None' if best_value is None else type(best_value).__name__
        gs_best_parameters_pipe_spam_ham[param_name] = best_value

    ## Setting up for reporting to screen and Elasticsearch

    ## The run id lets all documents of one run be retrieved together
    run_id_insert = {'run_id' : run_id}

    ## Keep only the class name, i.e. the text before the first '('
    classifer_name_only = str(classifier_name).split("(")[0]
    algorithm_name_insert = {'Algorithm_Name' : classifer_name_only}

    ## Describe the pipeline that was searched. Read it from the search object
    ## that was passed in rather than from a notebook-level global.
    test_parameters_insert = {'test_parameters' : str(gs_clf_pipe_spam_ham.estimator)}

    ## Cross-validation statistics of the BEST parameter combination.
    ## cv_results_ holds one entry per candidate; best_index_ selects the
    ## winner (index 0 is merely the first candidate that was tried).
    cv_results = gs_clf_pipe_spam_ham.cv_results_
    best_index = gs_clf_pipe_spam_ham.best_index_

    reported_keys = (
        'mean_fit_time',
        'std_fit_time',
        'mean_test_score',
        'std_test_score',
        'mean_train_score',
        'std_train_score',
    )
    # The mean scores are reported in percent, like the test accuracy above.
    percent_keys = ('mean_test_score', 'mean_train_score')

    test_scores_csv_means_std = {}
    for key in reported_keys:
        if key not in cv_results:
            # Train scores only exist when the search was created with
            # return_train_score=True.
            continue
        value = cv_results[key][best_index]
        test_scores_csv_means_std[key] = value * 100 if key in percent_keys else value

    ## Send all the collected data to the metric collection and ES insert system.
    calculate_metrics_push_to_es(run_id_insert, algorithm_name_insert, test_parameters_insert, gs_best_parameters_pipe_spam_ham, score_pipe_spam_ham, test_scores_csv_means_std, y_test,y_pred)

    return ()

# Remove Vectorizers and ML Algorithms 

In [93]:
def remove_vectorizer_ml_algo(vector_ml_keyword):
    """Drop one pipeline step and its grid entries from the shared notebook state.

    Operates on the module-level ``grid_search_parameters`` dict and
    ``pipe_spam_ham_features`` list, both of which are modified in place.

    Parameters
    ----------
    vector_ml_keyword : str
        Step name such as ``'vect'`` or ``'knn'``. A grid key is deleted when
        the keyword occurs anywhere in the lower-cased key. A pipeline step
        is dropped when the keyword equals one of the elements of its
        ``(name, estimator)`` tuple.

    Returns
    -------
    tuple
        Always the empty tuple.
    """
    ## Remove from gridsearch (iterate over a snapshot of the keys, because
    ## entries are deleted while looping)
    for key in list(grid_search_parameters):
        if vector_ml_keyword in key.lower():
            del grid_search_parameters[key]

    ## Remove from spam ham pipeline.
    ## Build the filtered list first and assign it through a slice. Calling
    ## list.remove() while iterating the same list skips the element that
    ## follows each removal. The slice assignment keeps the same list object,
    ## so other references to it see the change.
    pipe_spam_ham_features[:] = [
        step for step in pipe_spam_ham_features
        if vector_ml_keyword not in step
    ]

    return ()



# Add count vectorizer

In [94]:
## Add Count Vectorizer and associated Features for Testing
def add_count_vectorizer(pipe_spam_ham_features, grid_search_parameters):
    """Register a CountVectorizer step named 'vect' together with its search grid.

    Both arguments are modified in place: three ``vect__*`` entries are
    written into ``grid_search_parameters`` and a ``('vect', CountVectorizer())``
    tuple is appended to ``pipe_spam_ham_features``.

    Returns the empty tuple.
    """
    # Per the original notes, further options (stop_words, ngram_range,
    # max_df, min_df) were also available for testing; after initial tests
    # the three parameters below looked best, so only these are searched.
    grid_search_parameters.update({
        'vect__binary': (False, True),
        'vect__lowercase': (True, False),
        'vect__tokenizer': (LemmaTokenizer(), None),
    })

    vectorizer_step = ('vect', CountVectorizer())
    pipe_spam_ham_features.append(vectorizer_step)

    return ()

# Add Tf-Idf Vectorizer

In [95]:
## Add Tf-Idf Vectorizer and associated Features for Testing
def add_tfidf_vectorizer(pipe_spam_ham_features, grid_search_parameters):
    """Register a TfidfVectorizer step named 'tfidf' together with its search grid.

    Both arguments are modified in place: two ``tfidf__*`` entries are
    written into ``grid_search_parameters`` and a ``('tfidf', TfidfVectorizer())``
    tuple is appended to ``pipe_spam_ham_features``.

    Returns the empty tuple.
    """
    # Per the original notes, use_idf and sublinear_tf (and a wider norm
    # choice) were also available for testing; the two parameters below
    # looked best after initial tests.
    tfidf_grid = (
        ('tfidf__norm', ('l2', 'l1')),
        ('tfidf__smooth_idf', (True, False)),
    )
    for param_name, options in tfidf_grid:
        grid_search_parameters[param_name] = options

    pipe_spam_ham_features.append(('tfidf', TfidfVectorizer()))

    return ()

# TruncatedSVD

In [96]:
## Add TruncatedSVD and the associated grid-search parameters for testing
def add_TruncatedSVD(pipe_spam_ham_features, grid_search_parameters):
    """Register a TruncatedSVD step named 'truncatedsvd' together with its search grid.

    Both arguments are modified in place: the candidate component counts are
    stored under ``'truncatedsvd__n_components'`` and the step tuple is
    appended to ``pipe_spam_ham_features``.

    Returns the empty tuple.
    """
    component_counts = (500, 400, 200)
    grid_search_parameters.update({'truncatedsvd__n_components': component_counts})

    svd_step = ('truncatedsvd', TruncatedSVD())
    pipe_spam_ham_features.append(svd_step)

    return ()

# Add Naive Bayes

In [97]:
## Add Naive Bayes Algorithm
def add_multinomialNB(pipe_spam_ham_features, grid_search_parameters):
    """Register a MultinomialNB step named 'nb' together with its search grid.

    Both arguments are modified in place: ``nb__alpha`` and ``nb__fit_prior``
    are written into ``grid_search_parameters`` and a ``('nb', MultinomialNB())``
    tuple is appended to ``pipe_spam_ham_features``.

    Returns the empty tuple.
    """
    # The original notes list alpha=(0, 1) as another range that was
    # available; the values below looked best after initial tests.
    nb_grid = {
        'nb__alpha': (1, 0.9),
        'nb__fit_prior': (True, False),
    }
    for param_name, options in nb_grid.items():
        grid_search_parameters[param_name] = options

    pipe_spam_ham_features.append(('nb', MultinomialNB()))

    return ()

# Add KNN 

In [98]:
## Add K-Nearest Neighbors Algorithm
def add_knn(pipe_spam_ham_features, grid_search_parameters):
    """Register a KNeighborsClassifier step named 'knn' together with its search grid.

    Both arguments are modified in place: the neighbour counts 1..10 and the
    two weighting schemes are written into ``grid_search_parameters`` and a
    ``('knn', KNeighborsClassifier())`` tuple is appended to
    ``pipe_spam_ham_features``.

    Returns the empty tuple.
    """
    grid_search_parameters.update({
        'knn__n_neighbors': tuple(range(1, 11)),  # 1 through 10 inclusive
        'knn__weights': ('uniform', 'distance'),
    })

    knn_step = ('knn', KNeighborsClassifier())
    pipe_spam_ham_features.append(knn_step)

    return ()

# RandomForestClassifier

In [99]:
## Add Random Forest Algorithm
def add_randomforest(pipe_spam_ham_features, grid_search_parameters):
    """Register a RandomForestClassifier step named 'rf' together with its search grid.

    Both arguments are modified in place: tree counts 1..20 and four depth
    limits are written into ``grid_search_parameters`` and a
    ``('rf', RandomForestClassifier())`` tuple is appended to
    ``pipe_spam_ham_features``.

    Returns the empty tuple.
    """
    tree_counts = tuple(range(1, 21))  # 1 through 20 inclusive
    depth_limits = (10, 100, 1000, None)

    grid_search_parameters['rf__n_estimators'] = tree_counts
    grid_search_parameters['rf__max_depth'] = depth_limits

    pipe_spam_ham_features.append(('rf', RandomForestClassifier()))

    return ()

# LogisticRegression

In [100]:
## Add Logistic Regression Algorithm
def add_logistic_regression(pipe_spam_ham_features,grid_search_parameters):
    """Register a LogisticRegression step named 'lr' together with its search grid.

    Both arguments are modified in place: the penalties ``'l1'`` and ``'l2'``
    are stored under ``'lr__penalty'`` and a ``('lr', LogisticRegression(...))``
    tuple is appended to ``pipe_spam_ham_features``.

    Returns the empty tuple.
    """
    grid_search_parameters['lr__penalty'] = ('l1','l2')

    # Pin the solver: the grid searches both 'l1' and 'l2', and liblinear
    # supports both penalties. It is also the solver shown in this notebook's
    # recorded pipeline output, so results there are unaffected. Relying on
    # the library default would break the 'l1' candidates on scikit-learn
    # releases whose default solver does not support that penalty.
    pipe_spam_ham_features.append(('lr', LogisticRegression(solver='liblinear')))

    return ()

# SVC

In [101]:
## Add SVC Algorithm
def add_svc_regression(pipe_spam_ham_features, grid_search_parameters):
    """Register an SVC step named 'svc' together with its search grid.

    Both arguments are modified in place: three values of the regularisation
    parameter are stored under ``'svc__C'`` and a ``('svc', SVC())`` tuple is
    appended to ``pipe_spam_ham_features``.

    Returns the empty tuple.
    """
    # NOTE(review): only C is searched; every other SVC setting is left at
    # the library default.
    c_values = (1.0, 0.9, 0.8)
    grid_search_parameters.update({'svc__C': c_values})

    svc_step = ('svc', SVC())
    pipe_spam_ham_features.append(svc_step)

    return ()

# GradientBoostingClassifier

In [102]:
## Add GradientBoostingClassifier Algorithm
def add_gradient_boosting_classifer(pipe_spam_ham_features, grid_search_parameters):
    """Register a GradientBoostingClassifier step named 'gbc' together with its search grid.

    Both arguments are modified in place: four ensemble sizes are stored
    under ``'gbc__n_estimators'`` and a ``('gbc', GradientBoostingClassifier())``
    tuple is appended to ``pipe_spam_ham_features``.

    Returns the empty tuple.
    """
    ensemble_sizes = (100, 200, 300, 1000)
    grid_search_parameters['gbc__n_estimators'] = ensemble_sizes

    gbc_step = ('gbc', GradientBoostingClassifier())
    pipe_spam_ham_features.append(gbc_step)

    return ()

# DecisionTreeClassifier

In [103]:
## Add DecisionTreeClassifier Algorithm
def add_decisiontree_classifer(pipe_spam_ham_features, grid_search_parameters):
    """Register a DecisionTreeClassifier step named 'dtc' together with its search grid.

    Both arguments are modified in place: four depth limits (``None`` among
    them) are stored under ``'dtc__max_depth'`` and a
    ``('dtc', DecisionTreeClassifier())`` tuple is appended to
    ``pipe_spam_ham_features``.

    Returns the empty tuple.
    """
    depth_limits = (10, 100, 1000, None)
    grid_search_parameters.update({'dtc__max_depth': depth_limits})

    pipe_spam_ham_features.append(('dtc', DecisionTreeClassifier()))

    return ()

# Full ML Pipeline

In [116]:
## Shared state for the experiment driver.
## ML_Pipeline_Processing_And_Metrics reads `pipe_spam_ham`, and
## remove_vectorizer_ml_algo reads `pipe_spam_ham_features` and
## `grid_search_parameters`, as module-level names, so these names must stay.
pipe_spam_ham = []
pipe_spam_ham_features = []
grid_search_parameters = {}

## Random six-digit id attached to every metrics document of this run.
run_id = randint(100000, 999999)

## Number of folds handed to GridSearchCV below (cv=cv_value).
cv_value = 2

## 10-fold splitter. It is created here but is not the `cv` argument that
## GridSearchCV receives below.
cv = KFold(n_splits=10)

## Pipeline step name -> suffix of the matching add_<suffix>() helper.
list_ml_algo = {
    'knn': 'knn',
    'rf': 'randomforest',
    'lr': 'logistic_regression',
    'nb': 'multinomialNB',
    'svc': 'svc_regression',
    'gbc': 'gradient_boosting_classifer',
    'dtc': 'decisiontree_classifer',
}

## add_TruncatedSVD is not used in this run; the original note says its
## scores were not much better than with the Count Vectorizer alone.

## Kick off the pipeline execution:
## pass 1 uses the Count Vectorizer, pass 2 the Tf-Idf Vectorizer.
vectorizer_adders = (add_count_vectorizer, add_tfidf_vectorizer)

for count, add_vectorizer in enumerate(vectorizer_adders, start=1):
    add_vectorizer(pipe_spam_ham_features, grid_search_parameters)

    for key, values in list_ml_algo.items():
        ## Look up add_<suffix> by name; it appends the classifier step and
        ## its grid entries to the shared containers.
        ml_algo_name = 'add_' + values
        globals()[ml_algo_name](pipe_spam_ham_features, grid_search_parameters)

        ## Setting up the pipeline from the current steps
        pipe_spam_ham = Pipeline(pipe_spam_ham_features)

        ## Text form of the estimator in the last step (the classifier)
        classifier = str(pipe_spam_ham_features[-1][1])

        print(pipe_spam_ham)
        print(grid_search_parameters)

        ## Adding the GridSearch CV
        gs_clf_pipe_spam_ham = GridSearchCV(
            pipe_spam_ham,
            grid_search_parameters,
            cv=cv_value,
            n_jobs=1,
            return_train_score=True,
        )

        ML_Pipeline_Processing_And_Metrics(run_id, X_train, y_train, X_test, y_test, grid_search_parameters, gs_clf_pipe_spam_ham, cv_value, classifier)

        ## Take the classifier out again so that the next one starts from
        ## the vectorizer-only pipeline.
        remove_vectorizer_ml_algo(key)

    ## Clear whichever vectorizer this pass used before the next pass.
    for vectorizer_step in ('vect', 'tfidf'):
        remove_vectorizer_ml_algo(vectorizer_step)

## End of Program ..        


Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip...owski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform'))])
{'vect__binary': (False, True), 'vect__lowercase': (True, False), 'vect__tokenizer': (<__main__.LemmaTokenizer object at 0x7f8038e426d8>, None), 'knn__n_neighbors': (1, 2, 3, 4, 5, 6, 7, 8, 9, 10), 'knn__weights': ('uniform', 'distance')}


2018-07-26 15:48:04,897 : INFO : POST http://localhost:9200/ml-performance-metrics/text [status:201 request:0.010s]
2018-07-26 15:48:04,908 : INFO : POST http://localhost:9200/ml-performance-metrics/_refresh [status:200 request:0.009s]


{'score': 94.79665071770334}
{'Algorithm_Name': 'KNeighborsClassifier'}  pipeline test accuracy: 94.797
Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip...n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False))])
{'vect__binary': (False, True), 'vect__lowercase': (True, False), 'vect__tokenizer': (<__main__.LemmaTokenizer object at 0x7f8038e426d8>, None), 'rf__n_estimators': (1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20), 'rf__max_depth': (10, 100, 1000, None)}


2018-07-26 16:25:41,894 : INFO : POST http://localhost:9200/ml-performance-metrics/text [status:201 request:0.006s]
2018-07-26 16:25:41,903 : INFO : POST http://localhost:9200/ml-performance-metrics/_refresh [status:200 request:0.008s]


{'score': 96.71052631578947}
{'Algorithm_Name': 'RandomForestClassifier'}  pipeline test accuracy: 96.711
Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip...ty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))])
{'vect__binary': (False, True), 'vect__lowercase': (True, False), 'vect__tokenizer': (<__main__.LemmaTokenizer object at 0x7f8038e426d8>, None), 'lr__penalty': ('l1', 'l2')}


2018-07-26 16:26:41,074 : INFO : POST http://localhost:9200/ml-performance-metrics/text [status:201 request:0.010s]
2018-07-26 16:26:41,092 : INFO : POST http://localhost:9200/ml-performance-metrics/_refresh [status:200 request:0.016s]


{'score': 98.14593301435407}
{'Algorithm_Name': 'LogisticRegression'}  pipeline test accuracy: 98.146
Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)), ('nb', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))])
{'vect__binary': (False, True), 'vect__lowercase': (True, False), 'vect__tokenizer': (<__main__.LemmaTokenizer object at 0x7f8038e426d8>, None), 'nb__alpha': (1, 0.9), 'nb__fit_prior': (True, False)}


2018-07-26 16:28:30,247 : INFO : POST http://localhost:9200/ml-performance-metrics/text [status:201 request:0.006s]
2018-07-26 16:28:30,258 : INFO : POST http://localhost:9200/ml-performance-metrics/_refresh [status:200 request:0.010s]


{'score': 98.74401913875597}
{'Algorithm_Name': 'MultinomialNB'}  pipeline test accuracy: 98.744
Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip...,
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False))])
{'vect__binary': (False, True), 'vect__lowercase': (True, False), 'vect__tokenizer': (<__main__.LemmaTokenizer object at 0x7f8038e426d8>, None), 'svc__C': (1.0, 0.9, 0.8)}


  'precision', 'predicted', average, warn_for)
2018-07-26 16:30:32,130 : INFO : POST http://localhost:9200/ml-performance-metrics/text [status:201 request:0.008s]
2018-07-26 16:30:32,140 : INFO : POST http://localhost:9200/ml-performance-metrics/_refresh [status:200 request:0.008s]


{'score': 86.24401913875597}
{'Algorithm_Name': 'SVC'}  pipeline test accuracy: 86.244
Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip...      presort='auto', random_state=None, subsample=1.0, verbose=0,
              warm_start=False))])
{'vect__binary': (False, True), 'vect__lowercase': (True, False), 'vect__tokenizer': (<__main__.LemmaTokenizer object at 0x7f8038e426d8>, None), 'gbc__n_estimators': (100, 200, 300, 1000)}


2018-07-26 16:35:58,153 : INFO : POST http://localhost:9200/ml-performance-metrics/text [status:201 request:0.008s]
2018-07-26 16:35:58,171 : INFO : POST http://localhost:9200/ml-performance-metrics/_refresh [status:200 request:0.015s]


{'score': 97.54784688995215}
{'Algorithm_Name': 'GradientBoostingClassifier'}  pipeline test accuracy: 97.548
Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip...      min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best'))])
{'vect__binary': (False, True), 'vect__lowercase': (True, False), 'vect__tokenizer': (<__main__.LemmaTokenizer object at 0x7f8038e426d8>, None), 'dtc__max_depth': (10, 100, 1000, None)}


2018-07-26 16:37:52,016 : INFO : POST http://localhost:9200/ml-performance-metrics/text [status:201 request:0.009s]
2018-07-26 16:37:52,034 : INFO : POST http://localhost:9200/ml-performance-metrics/_refresh [status:200 request:0.016s]


{'score': 96.17224880382776}
{'Algorithm_Name': 'DecisionTreeClassifier'}  pipeline test accuracy: 96.172
Pipeline(memory=None,
     steps=[('tfidf', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
 ...owski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform'))])
{'tfidf__norm': ('l2', 'l1'), 'tfidf__smooth_idf': (True, False), 'knn__n_neighbors': (1, 2, 3, 4, 5, 6, 7, 8, 9, 10), 'knn__weights': ('uniform', 'distance')}


2018-07-26 16:38:54,976 : INFO : POST http://localhost:9200/ml-performance-metrics/text [status:201 request:0.007s]
2018-07-26 16:38:54,986 : INFO : POST http://localhost:9200/ml-performance-metrics/_refresh [status:200 request:0.009s]


{'score': 96.71052631578947}
{'Algorithm_Name': 'KNeighborsClassifier'}  pipeline test accuracy: 96.711
Pipeline(memory=None,
     steps=[('tfidf', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
 ...n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False))])
{'tfidf__norm': ('l2', 'l1'), 'tfidf__smooth_idf': (True, False), 'rf__n_estimators': (1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20), 'rf__max_depth': (10, 100, 1000, None)}


2018-07-26 16:41:21,269 : INFO : POST http://localhost:9200/ml-performance-metrics/text [status:201 request:0.007s]
2018-07-26 16:41:21,280 : INFO : POST http://localhost:9200/ml-performance-metrics/_refresh [status:200 request:0.010s]


{'score': 97.06937799043062}
{'Algorithm_Name': 'RandomForestClassifier'}  pipeline test accuracy: 97.069
Pipeline(memory=None,
     steps=[('tfidf', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
 ...ty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))])
{'tfidf__norm': ('l2', 'l1'), 'tfidf__smooth_idf': (True, False), 'lr__penalty': ('l1', 'l2')}


2018-07-26 16:41:24,234 : INFO : POST http://localhost:9200/ml-performance-metrics/text [status:201 request:0.006s]
2018-07-26 16:41:24,244 : INFO : POST http://localhost:9200/ml-performance-metrics/_refresh [status:200 request:0.009s]


{'score': 96.77033492822966}
{'Algorithm_Name': 'LogisticRegression'}  pipeline test accuracy: 96.770
Pipeline(memory=None,
     steps=[('tfidf', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
 ...True,
        vocabulary=None)), ('nb', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))])
{'tfidf__norm': ('l2', 'l1'), 'tfidf__smooth_idf': (True, False), 'nb__alpha': (1, 0.9), 'nb__fit_prior': (True, False)}


2018-07-26 16:41:30,275 : INFO : POST http://localhost:9200/ml-performance-metrics/text [status:201 request:0.012s]
2018-07-26 16:41:30,294 : INFO : POST http://localhost:9200/ml-performance-metrics/_refresh [status:200 request:0.017s]


{'score': 98.02631578947368}
{'Algorithm_Name': 'MultinomialNB'}  pipeline test accuracy: 98.026
Pipeline(memory=None,
     steps=[('tfidf', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
 ...,
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False))])
{'tfidf__norm': ('l2', 'l1'), 'tfidf__smooth_idf': (True, False), 'svc__C': (1.0, 0.9, 0.8)}


  'precision', 'predicted', average, warn_for)
2018-07-26 16:41:49,416 : INFO : POST http://localhost:9200/ml-performance-metrics/text [status:201 request:0.006s]
2018-07-26 16:41:49,427 : INFO : POST http://localhost:9200/ml-performance-metrics/_refresh [status:200 request:0.009s]


{'score': 86.24401913875597}
{'Algorithm_Name': 'SVC'}  pipeline test accuracy: 86.244
Pipeline(memory=None,
     steps=[('tfidf', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
 ...      presort='auto', random_state=None, subsample=1.0, verbose=0,
              warm_start=False))])
{'tfidf__norm': ('l2', 'l1'), 'tfidf__smooth_idf': (True, False), 'gbc__n_estimators': (100, 200, 300, 1000)}


2018-07-26 16:44:44,623 : INFO : POST http://localhost:9200/ml-performance-metrics/text [status:201 request:0.034s]
2018-07-26 16:44:44,644 : INFO : POST http://localhost:9200/ml-performance-metrics/_refresh [status:200 request:0.018s]


{'score': 97.42822966507177}
{'Algorithm_Name': 'GradientBoostingClassifier'}  pipeline test accuracy: 97.428
Pipeline(memory=None,
     steps=[('tfidf', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
 ...      min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best'))])
{'tfidf__norm': ('l2', 'l1'), 'tfidf__smooth_idf': (True, False), 'dtc__max_depth': (10, 100, 1000, None)}


2018-07-26 16:44:53,659 : INFO : POST http://localhost:9200/ml-performance-metrics/text [status:201 request:0.007s]
2018-07-26 16:44:53,670 : INFO : POST http://localhost:9200/ml-performance-metrics/_refresh [status:200 request:0.010s]


{'score': 95.27511961722487}
{'Algorithm_Name': 'DecisionTreeClassifier'}  pipeline test accuracy: 95.275


In [44]:
# imports needed and logging
import gzip
import gensim 
import logging
 
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
 
# build vocabulary and train model
model = gensim.models.Word2Vec(
    raw_data,
    size=150,
    window=10,
    min_count=2,
    workers=10)
model.train(raw_data, total_examples=len(raw_data), epochs=10)


2018-07-25 22:14:04,572 : INFO : collecting all words and their counts
2018-07-25 22:14:04,574 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2018-07-25 22:14:04,575 : INFO : collected 6 word types from a corpus of 9 raw words and 2 sentences
2018-07-25 22:14:04,578 : INFO : Loading a fresh vocabulary
2018-07-25 22:14:04,579 : INFO : min_count=2 retains 3 unique words (50% of original 6, drops 3)
2018-07-25 22:14:04,580 : INFO : min_count=2 leaves 6 word corpus (66% of original 9, drops 3)
2018-07-25 22:14:04,581 : INFO : deleting the raw counts dictionary of 6 items
2018-07-25 22:14:04,581 : INFO : sample=0.001 downsamples 3 most-common words
2018-07-25 22:14:04,582 : INFO : downsampling leaves estimated 0 word corpus (5.8% of prior 6)
2018-07-25 22:14:04,583 : INFO : estimated required memory for 3 words and 150 dimensions: 5100 bytes
2018-07-25 22:14:04,584 : INFO : resetting layer weights
2018-07-25 22:14:04,585 : INFO : training model with 10 workers on

2018-07-25 22:14:04,775 : INFO : worker thread finished; awaiting finish of 7 more threads
2018-07-25 22:14:04,776 : INFO : worker thread finished; awaiting finish of 6 more threads
2018-07-25 22:14:04,778 : INFO : worker thread finished; awaiting finish of 5 more threads
2018-07-25 22:14:04,780 : INFO : worker thread finished; awaiting finish of 4 more threads
2018-07-25 22:14:04,781 : INFO : worker thread finished; awaiting finish of 3 more threads
2018-07-25 22:14:04,783 : INFO : worker thread finished; awaiting finish of 2 more threads
2018-07-25 22:14:04,785 : INFO : worker thread finished; awaiting finish of 1 more threads
2018-07-25 22:14:04,786 : INFO : worker thread finished; awaiting finish of 0 more threads
2018-07-25 22:14:04,788 : INFO : EPOCH - 2 : training on 9 raw words (0 effective words) took 0.0s, 0 effective words/s
2018-07-25 22:14:04,805 : INFO : worker thread finished; awaiting finish of 9 more threads
2018-07-25 22:14:04,807 : INFO : worker thread finished; awai

2018-07-25 22:14:05,004 : INFO : worker thread finished; awaiting finish of 4 more threads
2018-07-25 22:14:05,006 : INFO : worker thread finished; awaiting finish of 3 more threads
2018-07-25 22:14:05,008 : INFO : worker thread finished; awaiting finish of 2 more threads
2018-07-25 22:14:05,009 : INFO : worker thread finished; awaiting finish of 1 more threads
2018-07-25 22:14:05,011 : INFO : worker thread finished; awaiting finish of 0 more threads
2018-07-25 22:14:05,013 : INFO : EPOCH - 9 : training on 9 raw words (1 effective words) took 0.0s, 50 effective words/s
2018-07-25 22:14:05,030 : INFO : worker thread finished; awaiting finish of 9 more threads
2018-07-25 22:14:05,033 : INFO : worker thread finished; awaiting finish of 8 more threads
2018-07-25 22:14:05,035 : INFO : worker thread finished; awaiting finish of 7 more threads
2018-07-25 22:14:05,036 : INFO : worker thread finished; awaiting finish of 6 more threads
2018-07-25 22:14:05,038 : INFO : worker thread finished; awa

(3, 90)