In [14]:
#Problem Setup/Definition:
import numpy as np
np.random.seed(42)
import random
random.seed(42)
import nltk
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer 
from nltk import pos_tag
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfTransformer
from string import punctuation
from string import digits
from nltk.corpus import wordnet
from sklearn.feature_extraction import text, stop_words
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.metrics import auc
import matplotlib.pyplot as plt
from sklearn.pipeline import Pipeline
from sklearn.decomposition import NMF
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
from sklearn.metrics import auc
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import roc_curve
from sklearn.metrics import auc
from sklearn.svm import LinearSVC
import math 
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import ShuffleSplit
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from sklearn.model_selection import GridSearchCV
from tempfile import mkdtemp
from shutil import rmtree
from sklearn.externals.joblib import Memory
from sklearn.naive_bayes import GaussianNB
from sklearn.decomposition import TruncatedSVD

import pandas as pd
%matplotlib inline

In [26]:
def my_custom_preprocessor(doc_string):
    # do all data preprocessing here
    
    # Lower case
    doc_string=doc_string.lower()
    
    # Remove Numbers
    remove_digits = str.maketrans('', '', digits)
    doc_string.translate(remove_digits)
    
    # Convert to tokenized form....
    tokens = nltk.tokenize.word_tokenize(doc_string)
    # Iterate through list of tokens (words) and remove all numbers
    tokens = [word for word in tokens if word.isalpha()]
    # Iterate through list of tokens (words) and stem (shorten) each word
    port_stemmer = PorterStemmer()
    tokens = [port_stemmer.stem(words) for words in tokens ]
    
    ###############################
    #### Lemmatize with pos_tag ###
    ###############################
    
    lemmatizer = WordNetLemmatizer()
    
    # Convert between two different tagging schemes
    def change_tags(penntag):
        morphy_tag = {'NN':'n', 'JJ':'a',
                      'VB':'v', 'RB':'r'}
        try:
            return morphy_tag[penntag[:2]]
        except:
            return 'n'
        
    tokens = [lemmatizer.lemmatize(word.lower(), pos=change_tags(tag)) for word, tag in pos_tag(tokens)]
    
    # Rejoin List of tokens and return that single document-string
    return ' '.join(tokens)

###########################
#### RoC Curve Function ###
###########################

def plot_roc(fpr, tpr):
    fig, ax = plt.subplots()

    roc_auc = auc(fpr,tpr)

    ax.plot(fpr, tpr, lw=2, label= 'area under curve = %0.4f' % roc_auc)

    ax.grid(color='0.7', linestyle='--', linewidth=1)

    ax.set_xlim([-0.1, 1.1])
    ax.set_ylim([0.0, 1.05])
    ax.set_xlabel('False Positive Rate',fontsize=15)
    ax.set_ylabel('True Positive Rate',fontsize=15)

    ax.legend(loc="lower right")

    for label in ax.get_xticklabels()+ax.get_yticklabels():
        label.set_fontsize(15)

def fit_predict_and_plot_roc(pipe, train_data, train_label, test_data, test_label):
    pipe.fit(train_data, train_label)

    if hasattr(pipe, 'decision_function'):
        prob_score = pipe.decision_function(test_data)
        fpr, tpr, _ = roc_curve(test_label, prob_score)
    else:
        prob_score = pipe.predict_proba(test_data)
        fpr, tpr, _ = roc_curve(test_label, prob_score[:,1])

    plot_roc(fpr, tpr)
    
#####################################################
#### Define Custom stop words for CountVectorizer ###
#####################################################

stop_words_skt = text.ENGLISH_STOP_WORDS
stop_words_en = stopwords.words('english')
combined_stopwords = set.union(set(stop_words_en),set(punctuation),set(stop_words_skt))

# Run stop_words through the same pre-processor as the document-matrix
# This will apply stemmed/lemmatized stop_woirds to stemmed/lemmatized tokenized document lists
def process_stop_words(stop_word_set):
    doc_string = ' '.join(stop_word_set)
    return my_custom_preprocessor(doc_string).split()

################################
#### Estimator Helper Class  ###
################################

class EstimatorSelectionHelper:

    def __init__(self, models, params):
        if not set(models.keys()).issubset(set(params.keys())):
            missing_params = list(set(models.keys()) - set(params.keys()))
            raise ValueError("Some estimators are missing parameters: %s" % missing_params)
        self.models = models
        self.params = params
        self.keys = models.keys()
        self.grid_searches = {}

    def fit(self, X, y, cv=3, n_jobs=3, verbose=1, scoring=None, refit=False):
        for key in self.keys:
            print("Running GridSearchCV for %s." % key)
            model = self.models[key]
            params = self.params[key]
            gs = GridSearchCV(model, params, cv=cv, n_jobs=n_jobs,
                              verbose=verbose, scoring=scoring, refit=refit,
                              return_train_score=True)
            gs.fit(X,y)
            self.grid_searches[key] = gs    

    def score_summary(self, sort_by='mean_score'):
        def row(key, scores, params):
            d = {
                 'estimator': key,
                 'min_score': min(scores),
                 'max_score': max(scores),
                 'mean_score': np.mean(scores),
                 'std_score': np.std(scores),
            }
            return pd.Series({**params,**d})

        rows = []
        for k in self.grid_searches:
            print(k)
            params = self.grid_searches[k].cv_results_['params']
            scores = []
            for i in range(self.grid_searches[k].cv):
                key = "split{}_test_score".format(i)
                r = self.grid_searches[k].cv_results_[key]        
                scores.append(r.reshape(len(params),1))

            all_scores = np.hstack(scores)
            for p, s in zip(params,all_scores):
                rows.append((row(k, s, p)))

        df = pd.concat(rows, axis=1).T.sort_values([sort_by], ascending=False)

        columns = ['estimator', 'min_score', 'mean_score', 'max_score', 'std_score']
        columns = columns + [c for c in df.columns if c not in columns]

        return df[columns]

##################################
#### Import Dataset Train/Test ###
##################################

# Only take a specific selection (8) of the 20 available categories
categories = ['comp.graphics', 'comp.os.ms-windows.misc',
'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware',
'rec.autos', 'rec.motorcycles',
'rec.sport.baseball', 'rec.sport.hockey']

# Load a training & test data sets consisting of those 8 categories
train_dataset = fetch_20newsgroups(subset = 'train', categories = categories, shuffle = True, random_state = None)
test_dataset = fetch_20newsgroups(subset = 'test', categories = categories, shuffle = True, random_state = None)

## Load training & test data sets WITHOUT headers & footers
train_dataset_no_hf = fetch_20newsgroups(subset = 'train', categories = categories, shuffle = True, random_state = None, remove=['headers', 'footers'])
test_dataset_no_hf = fetch_20newsgroups(subset = 'train', categories = categories, shuffle = True, random_state = None, remove=['headers', 'footers'])

print("\n\n" + '-'*40 + "\n\n")

#############################################
#### Define Class data set arrys (0 or 1) ###
#############################################
# Categorize the 8 news categories into two (binary) Classes 
# 0 = computer technology
# 1 = recreational activity
training_data_class = [] 
test_data_class = []

# Categories are mapped 0-7, (0-3) = Comp, (4-7) = Recreation
print(train_dataset.target_names)
print(train_dataset.target)

for category in train_dataset.target:
    if category < 4:
        training_data_class.append(0)
    else:
        training_data_class.append(1)
        
# Reshape test dataset
for category in test_dataset.target:
    if category < 4:
        test_data_class.append(0)
    else:
        test_data_class.append(1)
        
# Sanity Checks, values should all be either 1 or 0
print("First 10 articles Classification (Train): \n" + str(training_data_class[0:10]))
print("First 10 articles Classification (Train): \n" + str(test_data_class[0:10]))
# This will be used for the classification categories only!!!!
# Each data point refers to the classification of a single article in dataset



----------------------------------------


['comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey']
[7 2 6 ... 0 0 5]
First 10 articles Classification (Train): 
[1, 0, 1, 0, 0, 1, 0, 0, 1, 1]
First 10 articles Classification (Train): 
[0, 1, 0, 1, 0, 0, 0, 1, 0, 0]


In [25]:
#####################################
## Create Pipelines for Comparison ##
#####################################
#enable Cachine
cachedir = mkdtemp()
memory = Memory(cachedir=cachedir, verbose=0)

### Initial Pipeline ###
# These tuples() will be altered via the 'param_grid' List[]
pipeline = Pipeline([
    ('vectorizer', CountVectorizer(stop_words='english')),
    ('tfidf', TfidfTransformer()),
    ('dim_reducer', TruncatedSVD()),
    ('classifier', LinearSVC(max_iter=5000)),
],
memory=memory
)

######################
## Cross Validation ##
######################
# An List[] of Dictionary{key:value} parameters that will be iterated over
# Each Dictionary{} in the List[] references different types of 'vectorizer', 'tfidf', etc.
# Alter the range of hyperparameters within each Dictionary{} with <estimator>__<parameter>.
# E.g. Try both 3 & 5 min_df values for CountVectorizer().... '<vectorizer>__<min_df>'': [3,5] 

# Options to Iterate Over:

MIN_DIF = [3,5]
# Lemm + Stemm or Defualt
TOKEN_PATTERN = [r'(?u)\b\w\w+\b',r'(?u)\b[^\W\d_][^\W\d_][^\W\d_]+\b']
REDUCER_OPTIONS = [TruncatedSVD(), NMF()]
REDUCER_N_COMPONENTS = [50]
# LinearSVC() values
OPTIMAL_LINEAR_C_VALUE = [100]
# LogisticRegression() Values
LOG_REG_PENALTIES = ['l1', 'l2']
OPTIMAL_LOG_REG_C_VALUE = [100]

param_grid = [
                { # Linear Classifier 
                    'vectorizer__min_df': MIN_DIF,
                    'vectorizer__token_pattern': TOKEN_PATTERN,
                    'dim_reducer': REDUCER_OPTIONS,
                    'dim_reducer__n_components': REDUCER_N_COMPONENTS,
                    'classifier': [LinearSVC(max_iter=5000)],
                    'classifier__C':OPTIMAL_LINEAR_C_VALUE
                }
    
#                { # Logisitc Regresion
#                    'vectorizer__min_df': MIN_DIF,
#                    'vectorizer__preprocessor': PRE_PROCESSOR,
#                    'dim_reducer': REDUCER_OPTIONS,
##                    'dim_reducer__n_components': REDUCER_N_COMPONENTS,
#                    'classifier': [LogisticRegression(max_iter=5000)],
#                    'classifier__penalty': LOG_REG_PENALTIES
#                    'classifier__C':OPTIMAL_LOG_REG_C_VALUE                        
#                },
#                { # Naive Bayes Gaussian
#                    'vectorizer__min_df': MIN_DIF,
#                    'vectorizer__preprocessor': PRE_PROCESSOR,
#                    'dim_reducer': REDUCER_OPTIONS,
#                    'dim_reducer__n_components': REDUCER_N_COMPONENTS,
#                    'classifier': [GaussianNB()],                    
#                }    
            ]
 
grid = GridSearchCV(pipeline, cv=5, n_jobs=1, param_grid=param_grid, scoring='accuracy')
#### Need to consider Headers/Footers here, Happens on loading train_dataset
grid.fit(train_dataset.data, train_dataset.target)

rmtree(cachedir)


You provided "cachedir='/tmp/tmpa0hvxmbi'", use "location='/tmp/tmpa0hvxmbi'" instead.
  
If this happens often in your code, it can cause performance problems 
(results will be correct in all cases). 
The reason for this is probably some large input arguments for a wrapped
 function (e.g. large strings).
THIS IS A JOBLIB ISSUE. If you can, kindly provide the joblib's team with an
 example so that they can fix the problem.
  **fit_params_steps[name])
If this happens often in your code, it can cause performance problems 
(results will be correct in all cases). 
The reason for this is probably some large input arguments for a wrapped
 function (e.g. large strings).
THIS IS A JOBLIB ISSUE. If you can, kindly provide the joblib's team with an
 example so that they can fix the problem.
  **fit_params_steps[name])
If this happens often in your code, it can cause performance problems 
(results will be correct in all cases). 
The reason for this is probably some large input arguments for a wra

If this happens often in your code, it can cause performance problems 
(results will be correct in all cases). 
The reason for this is probably some large input arguments for a wrapped
 function (e.g. large strings).
THIS IS A JOBLIB ISSUE. If you can, kindly provide the joblib's team with an
 example so that they can fix the problem.
  **fit_params_steps[name])
If this happens often in your code, it can cause performance problems 
(results will be correct in all cases). 
The reason for this is probably some large input arguments for a wrapped
 function (e.g. large strings).
THIS IS A JOBLIB ISSUE. If you can, kindly provide the joblib's team with an
 example so that they can fix the problem.
  **fit_params_steps[name])
If this happens often in your code, it can cause performance problems 
(results will be correct in all cases). 
The reason for this is probably some large input arguments for a wrapped
 function (e.g. large strings).
THIS IS A JOBLIB ISSUE. If you can, kindly provide th

KeyboardInterrupt: 

In [24]:
## Print out Values
table = pd.DataFrame(grid.cv_results_)

## Do No Header/Footer Version
grid_no_hf = GridSearchCV(pipeline, cv=5, n_jobs=1, param_grid=param_grid, scoring='accuracy')
grid_no_hf.fit(train_dataset_no_hf.data, train_dataset.target)

table_no_hf = pd.DataFrame(grid.cv_results_)


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_vectorizer__min_df,param_vectorizer__token_pattern,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,6.148703,0.10388,0.195462,0.00765,3,(?u)\b\w\w+\b,"{'vectorizer__min_df': 3, 'vectorizer__token_p...",0.337909,0.319958,0.339323,0.339323,0.321353,0.331573,0.00894,3
1,4.833085,0.077054,0.184695,0.006082,3,(?u)\b[^\W\d_][^\W\d_][^\W\d_]+\b,"{'vectorizer__min_df': 3, 'vectorizer__token_p...",0.345301,0.338965,0.354123,0.343552,0.346723,0.345733,0.004942,1
2,4.817333,0.130925,0.194469,0.006737,5,(?u)\b\w\w+\b,"{'vectorizer__min_df': 5, 'vectorizer__token_p...",0.338965,0.315734,0.338266,0.334038,0.321353,0.329671,0.00941,4
3,3.856862,0.031514,0.185816,0.005542,5,(?u)\b[^\W\d_][^\W\d_][^\W\d_]+\b,"{'vectorizer__min_df': 5, 'vectorizer__token_p...",0.350581,0.337909,0.35518,0.342495,0.339323,0.345098,0.006686,2


In [None]:
# Combine Both Tables and Print out results in order of most accurate
pd.concat([table, table_no_hf]).sort_values(by=['rank_test_score'])