In [1]:
#Problem Setup/Definition:
import numpy as np
np.random.seed(42)
import random
random.seed(42)
import nltk
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer 
from nltk import pos_tag
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfTransformer
from string import punctuation
from string import digits
from nltk.corpus import wordnet
from sklearn.feature_extraction import text, stop_words
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.metrics import auc
import matplotlib.pyplot as plt
from sklearn.pipeline import Pipeline
from sklearn.decomposition import NMF
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
from sklearn.metrics import auc
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import roc_curve
from sklearn.metrics import auc
from sklearn.svm import LinearSVC
import math 
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import ShuffleSplit
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from sklearn.model_selection import GridSearchCV
from tempfile import mkdtemp
from shutil import rmtree
from sklearn.externals.joblib import Memory
from sklearn.naive_bayes import GaussianNB
from sklearn.decomposition import TruncatedSVD

import pandas as pd
%matplotlib inline



In [2]:
def my_custom_preprocessor(doc_string):
    # do all data preprocessing here
    
    # Lower case
    doc_string=doc_string.lower()
    
    # Remove Numbers
    remove_digits = str.maketrans('', '', digits)
    doc_string.translate(remove_digits)
    
    # Convert to tokenized form....
    tokens = nltk.tokenize.word_tokenize(doc_string)
    # Iterate through list of tokens (words) and remove all numbers
    tokens = [word for word in tokens if word.isalpha()]
    # Iterate through list of tokens (words) and stem (shorten) each word
    port_stemmer = PorterStemmer()
    tokens = [port_stemmer.stem(words) for words in tokens ]
    # Iterate through list of tokens (words) and remove all stopwords
    tokens_no_stop = []
    stop_words = text.ENGLISH_STOP_WORDS
    for words in tokens:
        if not words in stop_words:
            tokens_no_stop.append(words)
    
    ###############################
    #### Lemmatize with pos_tag ###
    ###############################
    
    lemmatizer = WordNetLemmatizer()
    
    # Convert between two different tagging schemes
    def change_tags(penntag):
        morphy_tag = {'NN':'n', 'JJ':'a',
                      'VB':'v', 'RB':'r'}
        try:
            return morphy_tag[penntag[:2]]
        except:
            return 'n'
        
    tokens_no_stop = [lemmatizer.lemmatize(word.lower(), pos=change_tags(tag)) for word, tag in pos_tag(tokens_no_stop)]
    
    # Rejoin List of tokens and return that single document-string
    return ' '.join(tokens_no_stop)

###########################
#### RoC Curve Function ###
###########################

def plot_roc(fpr, tpr):
    fig, ax = plt.subplots()

    roc_auc = auc(fpr,tpr)

    ax.plot(fpr, tpr, lw=2, label= 'area under curve = %0.4f' % roc_auc)

    ax.grid(color='0.7', linestyle='--', linewidth=1)

    ax.set_xlim([-0.1, 1.1])
    ax.set_ylim([0.0, 1.05])
    ax.set_xlabel('False Positive Rate',fontsize=15)
    ax.set_ylabel('True Positive Rate',fontsize=15)

    ax.legend(loc="lower right")

    for label in ax.get_xticklabels()+ax.get_yticklabels():
        label.set_fontsize(15)

def fit_predict_and_plot_roc(pipe, train_data, train_label, test_data, test_label):
    pipe.fit(train_data, train_label)

    if hasattr(pipe, 'decision_function'):
        prob_score = pipe.decision_function(test_data)
        fpr, tpr, _ = roc_curve(test_label, prob_score)
    else:
        prob_score = pipe.predict_proba(test_data)
        fpr, tpr, _ = roc_curve(test_label, prob_score[:,1])

    plot_roc(fpr, tpr)
    
#####################################################
#### Define Custom stop words for CountVectorizer ###
#####################################################

stop_words_skt = text.ENGLISH_STOP_WORDS
stop_words_en = stopwords.words('english')
combined_stopwords = set.union(set(stop_words_en),set(punctuation),set(stop_words_skt))

# Run stop_words through the same pre-processor as the document-matrix
# This will apply stemmed/lemmatized stop_woirds to stemmed/lemmatized tokenized document lists
def process_stop_words(stop_word_set):
    doc_string = ' '.join(stop_word_set)
    return my_custom_preprocessor(doc_string).split()

################################
#### Estimator Helper Class  ###
################################

class EstimatorSelectionHelper:

    def __init__(self, models, params):
        if not set(models.keys()).issubset(set(params.keys())):
            missing_params = list(set(models.keys()) - set(params.keys()))
            raise ValueError("Some estimators are missing parameters: %s" % missing_params)
        self.models = models
        self.params = params
        self.keys = models.keys()
        self.grid_searches = {}

    def fit(self, X, y, cv=3, n_jobs=3, verbose=1, scoring=None, refit=False):
        for key in self.keys:
            print("Running GridSearchCV for %s." % key)
            model = self.models[key]
            params = self.params[key]
            gs = GridSearchCV(model, params, cv=cv, n_jobs=n_jobs,
                              verbose=verbose, scoring=scoring, refit=refit,
                              return_train_score=True)
            gs.fit(X,y)
            self.grid_searches[key] = gs    

    def score_summary(self, sort_by='mean_score'):
        def row(key, scores, params):
            d = {
                 'estimator': key,
                 'min_score': min(scores),
                 'max_score': max(scores),
                 'mean_score': np.mean(scores),
                 'std_score': np.std(scores),
            }
            return pd.Series({**params,**d})

        rows = []
        for k in self.grid_searches:
            print(k)
            params = self.grid_searches[k].cv_results_['params']
            scores = []
            for i in range(self.grid_searches[k].cv):
                key = "split{}_test_score".format(i)
                r = self.grid_searches[k].cv_results_[key]        
                scores.append(r.reshape(len(params),1))

            all_scores = np.hstack(scores)
            for p, s in zip(params,all_scores):
                rows.append((row(k, s, p)))

        df = pd.concat(rows, axis=1).T.sort_values([sort_by], ascending=False)

        columns = ['estimator', 'min_score', 'mean_score', 'max_score', 'std_score']
        columns = columns + [c for c in df.columns if c not in columns]

        return df[columns]

##################################
#### Import Dataset Train/Test ###
##################################

# Only take a specific selection (8) of the 20 available categories
categories = ['comp.graphics', 'comp.os.ms-windows.misc',
'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware',
'rec.autos', 'rec.motorcycles',
'rec.sport.baseball', 'rec.sport.hockey']

# Load a full data sets consisting of those 8 categories, one with headers + footers, one without
# Dont need "test" set with cross validation; splits up entire set for you k-fold times
dataset = fetch_20newsgroups(subset = 'all', categories = categories, shuffle = True, random_state = None)

## Load training & test data sets WITHOUT headers & footers
dataset_no_hf = fetch_20newsgroups(subset = 'all', categories = categories, shuffle = True, random_state = None, remove=['headers', 'footers'])

# Clean the data sets before analysis: 
cleaned_dataset = []
for documents in range(len(dataset.data)):
    cleaned_dataset.append(my_custom_preprocessor(dataset.data[documents]))
    
cleaned_dataset_no_hf = []
for documents in range(len(dataset_no_hf.data)):
    cleaned_dataset_no_hf.append(my_custom_preprocessor(dataset_no_hf.data[documents]))

print("\n\n" + '-'*40 + "\n\n")

#############################################
#### Define Class data set arrys (0 or 1) ###
#############################################
# Categorize the 8 news categories into two (binary) Classes 
# 0 = computer technology
# 1 = recreational activity
data_class = [] 
data_class_no_hf = []

# Categories are mapped 0-7, (0-3) = Comp, (4-7) = Recreation
print(dataset.target_names)
print(dataset.target)

for category in dataset.target:
    if category < 4:
        data_class.append(0)
    else:
        data_class.append(1)
        
        
for category in dataset_no_hf.target:
    if category < 4:
        data_class_no_hf.append(0)
    else:
        data_class_no_hf.append(1)
        
# Sanity Checks, values should all be either 1 or 0
print("First 10 articles Classification (H&F): \n" + str(data_class[0:10]))
print("First 10 articles Classification (NO H&F): \n" + str(data_class_no_hf[0:10]))
# This will be used for the classification categories only!!!!
# Each data point refers to the classification of a single article in dataset



----------------------------------------


['comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey']
[2 1 2 ... 2 3 4]
First 10 articles Classification (H&F): 
[0, 0, 0, 0, 1, 1, 0, 1, 0, 1]
First 10 articles Classification (NO H&F): 
[1, 1, 0, 1, 1, 1, 0, 0, 0, 1]


In [3]:
#####################################
## Create Pipelines for Comparison ##
#####################################
#enable Cachine
cachedir = mkdtemp()
memory = Memory(cachedir=cachedir, verbose=0)

### Initial Pipeline ###
# These tuples() will be altered via the 'param_grid' List[]
pipeline_hf = Pipeline([
    ('vectorizer', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('dim_reducer', TruncatedSVD()),
    ('classifier', LinearSVC(max_iter=5000)),
],
memory=memory
)

pipeline_no_hf = Pipeline([
    ('vectorizer', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('dim_reducer', TruncatedSVD()),
    ('classifier', LinearSVC(max_iter=5000)),
],
memory=memory
)

######################
## Cross Validation ##
######################
# An List[] of Dictionary{key:value} parameters that will be iterated over
# Each Dictionary{} in the List[] references different types of 'vectorizer', 'tfidf', etc.
# Alter the range of hyperparameters within each Dictionary{} with <estimator>__<parameter>.
# E.g. Try both 3 & 5 min_df values for CountVectorizer().... '<vectorizer>__<min_df>'': [3,5] 

# Options to Iterate Over:

MIN_DIF = [3,5]
# Lemm + Stemm or Defualt
TOKEN_PATTERN = [r'(?u)\b\w\w+\b',r'(?u)\b[^\W\d_][^\W\d_][^\W\d_]+\b']
REDUCER_OPTIONS = [TruncatedSVD(), NMF()]
REDUCER_N_COMPONENTS = [50]
# LinearSVC() values
OPTIMAL_LINEAR_C_VALUE = [100]
# LogisticRegression() Values
LOG_REG_PENALTIES = ['l1', 'l2']
OPTIMAL_LOG_REG_C_VALUE = [100]

param_grid = [
                { # Linear Classifier 
                    'vectorizer__min_df': MIN_DIF,
                    'vectorizer__token_pattern': TOKEN_PATTERN,
                    'dim_reducer': REDUCER_OPTIONS,
                    'dim_reducer__n_components': REDUCER_N_COMPONENTS,
                    'classifier': [LinearSVC()],
                    'classifier__C':OPTIMAL_LINEAR_C_VALUE
                },
    
                { # Logisitc Regresion
                    'vectorizer__min_df': MIN_DIF,
                    'vectorizer__token_pattern': TOKEN_PATTERN,
                    'dim_reducer': REDUCER_OPTIONS,
                    'dim_reducer__n_components': REDUCER_N_COMPONENTS,
                    'classifier': [LogisticRegression(solver='liblinear', max_iter=5000)],
                    'classifier__penalty': LOG_REG_PENALTIES,
                    'classifier__C':OPTIMAL_LOG_REG_C_VALUE                        
                },
    
                { # Naive Bayes Gaussian
                    'vectorizer__min_df': MIN_DIF,
                    'vectorizer__token_pattern': TOKEN_PATTERN,
                    'dim_reducer': REDUCER_OPTIONS,
                    'dim_reducer__n_components': REDUCER_N_COMPONENTS,
                    'classifier': [GaussianNB()],                    
                }    
            ]

You provided "cachedir='/tmp/tmpsl8epos8'", use "location='/tmp/tmpsl8epos8'" instead.
  


In [4]:
## Cross Validate/iterate over pipeline; data has header/footer included
grid = GridSearchCV(pipeline_hf, cv=5, n_jobs=1, param_grid=param_grid, scoring='accuracy')
grid.fit(cleaned_dataset, dataset.target)

If this happens often in your code, it can cause performance problems 
(results will be correct in all cases). 
The reason for this is probably some large input arguments for a wrapped
 function (e.g. large strings).
THIS IS A JOBLIB ISSUE. If you can, kindly provide the joblib's team with an
 example so that they can fix the problem.
  **fit_params_steps[name])
If this happens often in your code, it can cause performance problems 
(results will be correct in all cases). 
The reason for this is probably some large input arguments for a wrapped
 function (e.g. large strings).
THIS IS A JOBLIB ISSUE. If you can, kindly provide the joblib's team with an
 example so that they can fix the problem.
  **fit_params_steps[name])
If this happens often in your code, it can cause performance problems 
(results will be correct in all cases). 
The reason for this is probably some large input arguments for a wrapped
 function (e.g. large strings).
THIS IS A JOBLIB ISSUE. If you can, kindly provide th

If this happens often in your code, it can cause performance problems 
(results will be correct in all cases). 
The reason for this is probably some large input arguments for a wrapped
 function (e.g. large strings).
THIS IS A JOBLIB ISSUE. If you can, kindly provide the joblib's team with an
 example so that they can fix the problem.
  **fit_params_steps[name])
If this happens often in your code, it can cause performance problems 
(results will be correct in all cases). 
The reason for this is probably some large input arguments for a wrapped
 function (e.g. large strings).
THIS IS A JOBLIB ISSUE. If you can, kindly provide the joblib's team with an
 example so that they can fix the problem.
  **fit_params_steps[name])
If this happens often in your code, it can cause performance problems 
(results will be correct in all cases). 
The reason for this is probably some large input arguments for a wrapped
 function (e.g. large strings).
THIS IS A JOBLIB ISSUE. If you can, kindly provide th

If this happens often in your code, it can cause performance problems 
(results will be correct in all cases). 
The reason for this is probably some large input arguments for a wrapped
 function (e.g. large strings).
THIS IS A JOBLIB ISSUE. If you can, kindly provide the joblib's team with an
 example so that they can fix the problem.
  **fit_params_steps[name])


GridSearchCV(cv=5, error_score=nan,
             estimator=Pipeline(memory=Memory(location=/tmp/tmpsl8epos8/joblib),
                                steps=[('vectorizer',
                                        CountVectorizer(analyzer='word',
                                                        binary=False,
                                                        decode_error='strict',
                                                        dtype=<class 'numpy.int64'>,
                                                        encoding='utf-8',
                                                        input='content',
                                                        lowercase=True,
                                                        max_df=1.0,
                                                        max_features=None,
                                                        min_df=1,
                                                        ngram_range=(1, 1),
                  

In [5]:
## Cross Validate/iterate over pipeline; data has header/footer removed
grid_no_hf = GridSearchCV(pipeline_no_hf, cv=5, n_jobs=1, param_grid=param_grid, scoring='accuracy')
grid_no_hf.fit(cleaned_dataset_no_hf, dataset_no_hf.target)

rmtree(cachedir)

If this happens often in your code, it can cause performance problems 
(results will be correct in all cases). 
The reason for this is probably some large input arguments for a wrapped
 function (e.g. large strings).
THIS IS A JOBLIB ISSUE. If you can, kindly provide the joblib's team with an
 example so that they can fix the problem.
  **fit_params_steps[name])
If this happens often in your code, it can cause performance problems 
(results will be correct in all cases). 
The reason for this is probably some large input arguments for a wrapped
 function (e.g. large strings).
THIS IS A JOBLIB ISSUE. If you can, kindly provide the joblib's team with an
 example so that they can fix the problem.
  **fit_params_steps[name])
If this happens often in your code, it can cause performance problems 
(results will be correct in all cases). 
The reason for this is probably some large input arguments for a wrapped
 function (e.g. large strings).
THIS IS A JOBLIB ISSUE. If you can, kindly provide th

If this happens often in your code, it can cause performance problems 
(results will be correct in all cases). 
The reason for this is probably some large input arguments for a wrapped
 function (e.g. large strings).
THIS IS A JOBLIB ISSUE. If you can, kindly provide the joblib's team with an
 example so that they can fix the problem.
  **fit_params_steps[name])
If this happens often in your code, it can cause performance problems 
(results will be correct in all cases). 
The reason for this is probably some large input arguments for a wrapped
 function (e.g. large strings).
THIS IS A JOBLIB ISSUE. If you can, kindly provide the joblib's team with an
 example so that they can fix the problem.
  **fit_params_steps[name])
If this happens often in your code, it can cause performance problems 
(results will be correct in all cases). 
The reason for this is probably some large input arguments for a wrapped
 function (e.g. large strings).
THIS IS A JOBLIB ISSUE. If you can, kindly provide th

If this happens often in your code, it can cause performance problems 
(results will be correct in all cases). 
The reason for this is probably some large input arguments for a wrapped
 function (e.g. large strings).
THIS IS A JOBLIB ISSUE. If you can, kindly provide the joblib's team with an
 example so that they can fix the problem.
  **fit_params_steps[name])


In [6]:
#######################################################
## Grid Search Results : DATA with HEADERS + FOOTERS ##
#######################################################
# Add column to Table: Used data with Header and Footers INCLUDED
table_hf = pd.DataFrame(grid.cv_results_)
table_hf.insert (len(table_hf.columns)-1, 'Has Header + Footer', 'True')

# Print and order by best 'accuracy'
table_hf.sort_values(by=['rank_test_score'])

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_classifier,param_classifier__C,param_dim_reducer,param_dim_reducer__n_components,param_vectorizer__min_df,param_vectorizer__token_pattern,...,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,Has Header + Footer,rank_test_score
17,0.945943,0.017245,0.178805,0.005491,"LogisticRegression(C=100, class_weight=None, d...",100.0,"TruncatedSVD(algorithm='randomized', n_compone...",50,3,(?u)\b[^\W\d_][^\W\d_][^\W\d_]+\b,...,"{'classifier': LogisticRegression(C=100, class...",0.87064,0.880786,0.869289,0.865482,0.861675,0.869575,0.00642,True,1
9,1.586256,0.061375,0.177286,0.005182,"LogisticRegression(C=100, class_weight=None, d...",100.0,"TruncatedSVD(algorithm='randomized', n_compone...",50,3,(?u)\b[^\W\d_][^\W\d_][^\W\d_]+\b,...,"{'classifier': LogisticRegression(C=100, class...",0.872543,0.880152,0.86802,0.865482,0.861675,0.869575,0.00636,True,2
16,0.956435,0.013438,0.158251,0.00382,"LogisticRegression(C=100, class_weight=None, d...",100.0,"TruncatedSVD(algorithm='randomized', n_compone...",50,3,(?u)\b\w\w+\b,...,"{'classifier': LogisticRegression(C=100, class...",0.869372,0.884591,0.86231,0.864848,0.86231,0.868686,0.00836,True,3
8,1.58011,0.096306,0.153366,0.003977,"LogisticRegression(C=100, class_weight=None, d...",100.0,"TruncatedSVD(algorithm='randomized', n_compone...",50,3,(?u)\b\w\w+\b,...,"{'classifier': LogisticRegression(C=100, class...",0.87064,0.884591,0.862944,0.862944,0.86231,0.868686,0.008525,True,4
1,7.784404,0.043562,0.173576,0.004308,"LinearSVC(C=1.0, class_weight=None, dual=True,...",100.0,"TruncatedSVD(algorithm='randomized', n_compone...",50,3,(?u)\b[^\W\d_][^\W\d_][^\W\d_]+\b,...,"{'classifier': LinearSVC(C=1.0, class_weight=N...",0.873811,0.877616,0.869924,0.865482,0.85533,0.868433,0.007692,True,5
10,1.605547,0.070769,0.154724,0.003794,"LogisticRegression(C=100, class_weight=None, d...",100.0,"TruncatedSVD(algorithm='randomized', n_compone...",50,5,(?u)\b\w\w+\b,...,"{'classifier': LogisticRegression(C=100, class...",0.871275,0.871275,0.861041,0.865482,0.871827,0.86818,0.004259,True,6
0,7.988868,0.061038,0.158344,0.005003,"LinearSVC(C=1.0, class_weight=None, dual=True,...",100.0,"TruncatedSVD(algorithm='randomized', n_compone...",50,3,(?u)\b\w\w+\b,...,"{'classifier': LinearSVC(C=1.0, class_weight=N...",0.869372,0.885859,0.864848,0.862944,0.857868,0.868178,0.009579,True,7
18,0.899801,0.013383,0.156915,0.00356,"LogisticRegression(C=100, class_weight=None, d...",100.0,"TruncatedSVD(algorithm='randomized', n_compone...",50,5,(?u)\b\w\w+\b,...,"{'classifier': LogisticRegression(C=100, class...",0.869372,0.871909,0.863579,0.862944,0.872462,0.868053,0.004054,True,8
2,7.151419,0.114914,0.155776,0.004122,"LinearSVC(C=1.0, class_weight=None, dual=True,...",100.0,"TruncatedSVD(algorithm='randomized', n_compone...",50,5,(?u)\b\w\w+\b,...,"{'classifier': LinearSVC(C=1.0, class_weight=N...",0.870006,0.871909,0.861675,0.865482,0.869924,0.867799,0.003719,True,9
11,1.48645,0.044939,0.17562,0.003588,"LogisticRegression(C=100, class_weight=None, d...",100.0,"TruncatedSVD(algorithm='randomized', n_compone...",50,5,(?u)\b[^\W\d_][^\W\d_][^\W\d_]+\b,...,"{'classifier': LogisticRegression(C=100, class...",0.866836,0.880152,0.865482,0.862944,0.855964,0.866276,0.007886,True,10


In [7]:
#######################################################
## Grid Search Results : DATA with HEADERS + FOOTERS ##
#######################################################
# Add column to Table: Used data with Header and Footers REMOVED
table_no_hf = pd.DataFrame(grid_no_hf.cv_results_)
table_no_hf.insert (len(table_no_hf.columns)-1, 'Has Header + Footer', 'Fasle')

# Print and order by best 'accuracy'
table_no_hf.sort_values(by=['rank_test_score'])


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_classifier,param_classifier__C,param_dim_reducer,param_dim_reducer__n_components,param_vectorizer__min_df,param_vectorizer__token_pattern,...,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,Has Header + Footer,rank_test_score
16,0.894754,0.015393,0.128359,0.006513,"LogisticRegression(C=100, class_weight=None, d...",100.0,"TruncatedSVD(algorithm='randomized', n_compone...",50,3,(?u)\b\w\w+\b,...,"{'classifier': LogisticRegression(C=100, class...",0.807229,0.82435,0.816624,0.82297,0.822335,0.818702,0.006313,Fasle,1
8,1.2047,0.038466,0.127841,0.005909,"LogisticRegression(C=100, class_weight=None, d...",100.0,"TruncatedSVD(algorithm='randomized', n_compone...",50,3,(?u)\b\w\w+\b,...,"{'classifier': LogisticRegression(C=100, class...",0.804692,0.82435,0.815355,0.820431,0.823604,0.817687,0.007228,Fasle,2
0,8.041098,0.13605,0.124596,0.00498,"LinearSVC(C=1.0, class_weight=None, dual=True,...",100.0,"TruncatedSVD(algorithm='randomized', n_compone...",50,3,(?u)\b\w\w+\b,...,"{'classifier': LinearSVC(C=1.0, class_weight=N...",0.804692,0.819911,0.812183,0.819797,0.824239,0.816164,0.006927,Fasle,3
11,1.165379,0.057432,0.14191,0.005705,"LogisticRegression(C=100, class_weight=None, d...",100.0,"TruncatedSVD(algorithm='randomized', n_compone...",50,5,(?u)\b[^\W\d_][^\W\d_][^\W\d_]+\b,...,"{'classifier': LogisticRegression(C=100, class...",0.809131,0.814204,0.812817,0.82297,0.819797,0.815784,0.004966,Fasle,4
19,0.879986,0.016832,0.14189,0.005257,"LogisticRegression(C=100, class_weight=None, d...",100.0,"TruncatedSVD(algorithm='randomized', n_compone...",50,5,(?u)\b[^\W\d_][^\W\d_][^\W\d_]+\b,...,"{'classifier': LogisticRegression(C=100, class...",0.808497,0.812302,0.811548,0.824239,0.817259,0.814769,0.005508,Fasle,5
18,0.876696,0.00565,0.126755,0.005011,"LogisticRegression(C=100, class_weight=None, d...",100.0,"TruncatedSVD(algorithm='randomized', n_compone...",50,5,(?u)\b\w\w+\b,...,"{'classifier': LogisticRegression(C=100, class...",0.805961,0.811668,0.812817,0.82297,0.817259,0.814135,0.0057,Fasle,6
3,7.364936,0.084261,0.140669,0.004977,"LinearSVC(C=1.0, class_weight=None, dual=True,...",100.0,"TruncatedSVD(algorithm='randomized', n_compone...",50,5,(?u)\b[^\W\d_][^\W\d_][^\W\d_]+\b,...,"{'classifier': LinearSVC(C=1.0, class_weight=N...",0.807229,0.810399,0.810914,0.821701,0.817893,0.813627,0.005331,Fasle,7
2,7.345575,0.097333,0.123465,0.004625,"LinearSVC(C=1.0, class_weight=None, dual=True,...",100.0,"TruncatedSVD(algorithm='randomized', n_compone...",50,5,(?u)\b\w\w+\b,...,"{'classifier': LinearSVC(C=1.0, class_weight=N...",0.80279,0.812936,0.810914,0.820431,0.814086,0.812232,0.005693,Fasle,8
10,1.174914,0.034177,0.126713,0.004896,"LogisticRegression(C=100, class_weight=None, d...",100.0,"TruncatedSVD(algorithm='randomized', n_compone...",50,5,(?u)\b\w\w+\b,...,"{'classifier': LogisticRegression(C=100, class...",0.80279,0.811668,0.810914,0.821701,0.812817,0.811978,0.006015,Fasle,9
9,1.228541,0.057624,0.144126,0.005073,"LogisticRegression(C=100, class_weight=None, d...",100.0,"TruncatedSVD(algorithm='randomized', n_compone...",50,3,(?u)\b[^\W\d_][^\W\d_][^\W\d_]+\b,...,"{'classifier': LogisticRegression(C=100, class...",0.807863,0.814838,0.805203,0.820431,0.810914,0.81185,0.005358,Fasle,10


In [12]:
#########################################################################
## Combine Findings of both Tables (W/wo) HEADERS + FOOTERS IN DATASET ##
#########################################################################

# Combine Both Tables
combined_table = pd.concat([table_no_hf, table_hf])
combined_table.sort_values(by=['mean_test_score'], ascending=False).head(10)

## Note: Still some weirdness when combining tables. Overlapping indexes for ranking
## Using data without headers/foots results in incredibly low scores

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_classifier,param_classifier__C,param_dim_reducer,param_dim_reducer__n_components,param_vectorizer__min_df,param_vectorizer__token_pattern,...,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,Has Header + Footer,rank_test_score
17,0.945943,0.017245,0.178805,0.005491,"LogisticRegression(C=100, class_weight=None, d...",100,"TruncatedSVD(algorithm='randomized', n_compone...",50,3,(?u)\b[^\W\d_][^\W\d_][^\W\d_]+\b,...,"{'classifier': LogisticRegression(C=100, class...",0.87064,0.880786,0.869289,0.865482,0.861675,0.869575,0.00642,True,1
9,1.586256,0.061375,0.177286,0.005182,"LogisticRegression(C=100, class_weight=None, d...",100,"TruncatedSVD(algorithm='randomized', n_compone...",50,3,(?u)\b[^\W\d_][^\W\d_][^\W\d_]+\b,...,"{'classifier': LogisticRegression(C=100, class...",0.872543,0.880152,0.86802,0.865482,0.861675,0.869575,0.00636,True,2
16,0.956435,0.013438,0.158251,0.00382,"LogisticRegression(C=100, class_weight=None, d...",100,"TruncatedSVD(algorithm='randomized', n_compone...",50,3,(?u)\b\w\w+\b,...,"{'classifier': LogisticRegression(C=100, class...",0.869372,0.884591,0.86231,0.864848,0.86231,0.868686,0.00836,True,3
8,1.58011,0.096306,0.153366,0.003977,"LogisticRegression(C=100, class_weight=None, d...",100,"TruncatedSVD(algorithm='randomized', n_compone...",50,3,(?u)\b\w\w+\b,...,"{'classifier': LogisticRegression(C=100, class...",0.87064,0.884591,0.862944,0.862944,0.86231,0.868686,0.008525,True,4
1,7.784404,0.043562,0.173576,0.004308,"LinearSVC(C=1.0, class_weight=None, dual=True,...",100,"TruncatedSVD(algorithm='randomized', n_compone...",50,3,(?u)\b[^\W\d_][^\W\d_][^\W\d_]+\b,...,"{'classifier': LinearSVC(C=1.0, class_weight=N...",0.873811,0.877616,0.869924,0.865482,0.85533,0.868433,0.007692,True,5
10,1.605547,0.070769,0.154724,0.003794,"LogisticRegression(C=100, class_weight=None, d...",100,"TruncatedSVD(algorithm='randomized', n_compone...",50,5,(?u)\b\w\w+\b,...,"{'classifier': LogisticRegression(C=100, class...",0.871275,0.871275,0.861041,0.865482,0.871827,0.86818,0.004259,True,6
0,7.988868,0.061038,0.158344,0.005003,"LinearSVC(C=1.0, class_weight=None, dual=True,...",100,"TruncatedSVD(algorithm='randomized', n_compone...",50,3,(?u)\b\w\w+\b,...,"{'classifier': LinearSVC(C=1.0, class_weight=N...",0.869372,0.885859,0.864848,0.862944,0.857868,0.868178,0.009579,True,7
18,0.899801,0.013383,0.156915,0.00356,"LogisticRegression(C=100, class_weight=None, d...",100,"TruncatedSVD(algorithm='randomized', n_compone...",50,5,(?u)\b\w\w+\b,...,"{'classifier': LogisticRegression(C=100, class...",0.869372,0.871909,0.863579,0.862944,0.872462,0.868053,0.004054,True,8
2,7.151419,0.114914,0.155776,0.004122,"LinearSVC(C=1.0, class_weight=None, dual=True,...",100,"TruncatedSVD(algorithm='randomized', n_compone...",50,5,(?u)\b\w\w+\b,...,"{'classifier': LinearSVC(C=1.0, class_weight=N...",0.870006,0.871909,0.861675,0.865482,0.869924,0.867799,0.003719,True,9
11,1.48645,0.044939,0.17562,0.003588,"LogisticRegression(C=100, class_weight=None, d...",100,"TruncatedSVD(algorithm='randomized', n_compone...",50,5,(?u)\b[^\W\d_][^\W\d_][^\W\d_]+\b,...,"{'classifier': LogisticRegression(C=100, class...",0.866836,0.880152,0.865482,0.862944,0.855964,0.866276,0.007886,True,10
