In [31]:
import sys
import numpy as np
np.random.seed(42)
import random
random.seed(42)

import matplotlib.pyplot as plt
import pandas as pd

from sklearn.feature_extraction import text
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.decomposition import TruncatedSVD
from sklearn.decomposition import NMF
from sklearn.utils.extmath import randomized_svd
from sklearn.metrics import confusion_matrix, accuracy_score, recall_score, precision_score, f1_score, roc_curve, auc
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

import nltk
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer

import string
from string import punctuation

import itertools

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/ryanli/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/ryanli/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /Users/ryanli/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/ryanli/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


# Data Preprocessing (Q2, Q3)

In [32]:
from sklearn.datasets import fetch_20newsgroups
categories=['comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware',
'rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey']

train_dataset = fetch_20newsgroups(subset = 'train',
                                   categories = categories,
                                   shuffle = True,
                                   random_state = 42)
test_dataset = fetch_20newsgroups(subset = 'test',
                                 categories = categories,
                                 shuffle = True,
                                 random_state = 42)

removed_train_dataset = fetch_20newsgroups(subset = 'train', 
                                           categories = categories, 
                                           remove = ('headers', 'footers'),
                                           shuffle = True, 
                                           random_state = 42)
removed_test_dataset = fetch_20newsgroups(subset = 'test',
                                 categories = categories,
                                 remove = ('headers', 'footers'),
                                 shuffle = True,
                                 random_state = 42)

# Our final targets are in 2 categories: "Computer Technology" and "Recreational Activity"
# Convert 8 imported categories into 2 categories
y_train = [int(i/4) for i in train_dataset.target] 
y_test = [int(i/4) for i in test_dataset.target] 

In [33]:
# stop words
stop_words_skt = text.ENGLISH_STOP_WORDS
stop_words_en = stopwords.words('english')
combined_stopwords = set.union(set(stop_words_en),set(punctuation),set(stop_words_skt))

In [36]:
# used to cache results
from tempfile import mkdtemp
from shutil import rmtree
import joblib
sys.modules['sklearn.externals.joblib'] = joblib
from sklearn.externals.joblib import Memory
cachedir = mkdtemp()
memory = Memory(cachedir=cachedir, verbose=10)

pipeline = Pipeline([
    ('vect', CountVectorizer(min_df=3, stop_words=combined_stopwords)),
    ('tfidf', TfidfTransformer()),
    ('reduce_dim', TruncatedSVD(random_state=0)),
    ('clf', GaussianNB()),
],
memory=memory
)

MIN_DF_OPTIONS = [3, 5]
N_FEATURES_OPTIONS = [50]
#C_OPTIONS = [0.1, 1, 10]

param_grid = [
    {
        'vect__min_df': MIN_DF_OPTIONS, #2 choices
        'reduce_dim': [TruncatedSVD(), NMF()], # 2 choices
        'reduce_dim__n_components': N_FEATURES_OPTIONS,
        'clf': [LinearSVC(C=10), 
                LogisticRegression(penalty='l1',C=10),
                LogisticRegression(penalty='l2',C=100),
                GaussianNB()], # 4 choices
    },
]

grid1 = GridSearchCV(pipeline, cv=5, n_jobs=-1, param_grid=param_grid, scoring='accuracy')
grid1.fit(train_dataset.data, y_train)
grid2 = GridSearchCV(pipeline, cv=5, n_jobs=-1, param_grid=param_grid, scoring='accuracy')
grid2.fit(removed_train_dataset.data, y_train)
rmtree(cachedir)

You provided "cachedir='/var/folders/xr/t1yct1wj0k76xzvjf87vtxs80000gn/T/tmp_0topn8a'", use "location='/var/folders/xr/t1yct1wj0k76xzvjf87vtxs80000gn/T/tmp_0topn8a'" instead.
  
        nan        nan 0.97590834 0.97358455 0.9659769  0.96513213
 0.90596029 0.91377645 0.93850571 0.94336092]


________________________________________________________________________________
[Memory] Calling sklearn.pipeline._fit_transform_one...
_fit_transform_one(CountVectorizer(min_df=3,
                stop_words={'!', '"', '#', '$', '%', '&', "'", '(', ')', '*',
                            '+', ',', '-', '.', '/', ':', ';', '<', '=', '>',
                            '?', '@', '[', '\\', ']', '^', '_', '`', 'a',
                            'about', ...}), 
[ 'From: sac@asdi.saic.com (Steve A. Conroy x6172)\n'
  'Subject: Re: Darrrrrrrrryl\n'
  'Organization: SAIC\n'
  'Lines: 33\n'
  '\n'
  'In article <mssC5KCru.5Ip@netcom.com>, mss@netcom.com (Mark Singer) '
  'writes:\n'
  '|> \n'
  '|> \n'
  '|> The media is beating the incident at Dodger Stadium on Wednesday to\n'
  "|> death, but I haven't seen anything in rsb yet.\n"
  '|> \n'
  '|> Gerald Perry of the Cardinals pinch hit in the eighth inning with two\n'
  '|> on and his club down by a run.  He stroked a line drive into the\n'
  '|>

If this happens often in your code, it can cause performance problems 
(results will be correct in all cases). 
The reason for this is probably some large input arguments for a wrapped
 function (e.g. large strings).
THIS IS A JOBLIB ISSUE. If you can, kindly provide the joblib's team with an
 example so that they can fix the problem.
  **fit_params_steps[name])


________________________________________________fit_transform_one - 2.1s, 0.0min
________________________________________________________________________________
[Memory] Calling sklearn.pipeline._fit_transform_one...
_fit_transform_one(TfidfTransformer(), <4732x20272 sparse matrix of type '<class 'numpy.int64'>'
	with 407470 stored elements in Compressed Sparse Row format>, 
[ 1,
  1,
  1,
  0,
  0,
  0,
  0,
  1,
  1,
  0,
  0,
  1,
  1,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  1,
  1,
  0,
  0,
  0,
  0,
  0,
  1,
  1,
  1,
  0,
  1,
  0,
  1,
  0,
  1,
  1,
  1,
  0,
  1,
  1,
  0,
  0,
  1,
  0,
  0,
  1,
  0,
  0,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  0,
  0,
  1,
  0,
  0,
  0,
  1,
  0,
  0,
  0,
  1,
  0,
  1,
  1,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  1,
  0,
  0,
  1,
  0,
  0,
  0,
  0,
  0,
  1,
  0,
  0,
  0,
  1,
  1,
  0,
  1,
  0,
  1,
  1,
  1,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  1,
  0,
  0,
  0,
  0,
  1,
  0,
  1,
  0,
  1,
  1,
  0,
 

        nan        nan 0.97210486 0.97168135 0.95921425 0.95879008
 0.86052517 0.85481313 0.94505314 0.94695433]


________________________________________________________________________________
[Memory] Calling sklearn.pipeline._fit_transform_one...
_fit_transform_one(CountVectorizer(min_df=3,
                stop_words={'!', '"', '#', '$', '%', '&', "'", '(', ')', '*',
                            '+', ',', '-', '.', '/', ':', ';', '<', '=', '>',
                            '?', '@', '[', '\\', ']', '^', '_', '`', 'a',
                            'about', ...}), 
[ 'In article <mssC5KCru.5Ip@netcom.com>, mss@netcom.com (Mark Singer) '
  'writes:\n'
  '|> \n'
  '|> \n'
  '|> The media is beating the incident at Dodger Stadium on Wednesday to\n'
  "|> death, but I haven't seen anything in rsb yet.\n"
  '|> \n'
  '|> Gerald Perry of the Cardinals pinch hit in the eighth inning with two\n'
  '|> on and his club down by a run.  He stroked a line drive into the\n'
  '|> right field corner.  The ball cleared the three-foot high fence and\n'
  '|> went into the crowd.  Darryl, racing over from right cent

In [37]:
# Results when Headers and Footers are NOT removed
pd.DataFrame(grid1.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_clf,param_reduce_dim,param_reduce_dim__n_components,param_vect__min_df,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,5.293598,0.171156,0.364678,0.041838,LinearSVC(C=10),TruncatedSVD(n_components=50),50,3,"{'clf': LinearSVC(C=10), 'reduce_dim': Truncat...",0.978881,0.975713,0.975687,0.972516,0.973573,0.975274,0.002185,2
1,4.794568,0.24779,0.371252,0.05037,LinearSVC(C=10),TruncatedSVD(n_components=50),50,5,"{'clf': LinearSVC(C=10), 'reduce_dim': Truncat...",0.972545,0.973601,0.973573,0.970402,0.977801,0.973584,0.002408,4
2,24.260134,5.196884,0.404097,0.013733,LinearSVC(C=10),NMF(),50,3,"{'clf': LinearSVC(C=10), 'reduce_dim': NMF(), ...",0.966209,0.956705,0.96723,0.96723,0.969345,0.965344,0.004438,6
3,19.330702,3.927405,0.39381,0.024676,LinearSVC(C=10),NMF(),50,5,"{'clf': LinearSVC(C=10), 'reduce_dim': NMF(), ...",0.964097,0.960929,0.961945,0.960888,0.96723,0.963018,0.002407,8
4,0.577965,0.043071,0.0,0.0,"LogisticRegression(C=10, penalty='l1')",TruncatedSVD(n_components=50),50,3,"{'clf': LogisticRegression(C=10, penalty='l1')...",,,,,,,,13
5,0.588469,0.030896,0.0,0.0,"LogisticRegression(C=10, penalty='l1')",TruncatedSVD(n_components=50),50,5,"{'clf': LogisticRegression(C=10, penalty='l1')...",,,,,,,,14
6,0.622024,0.030183,0.0,0.0,"LogisticRegression(C=10, penalty='l1')",NMF(),50,3,"{'clf': LogisticRegression(C=10, penalty='l1')...",,,,,,,,15
7,6.757843,7.800136,0.0,0.0,"LogisticRegression(C=10, penalty='l1')",NMF(),50,5,"{'clf': LogisticRegression(C=10, penalty='l1')...",,,,,,,,16
8,0.626187,0.039548,0.318269,0.019913,LogisticRegression(C=100),TruncatedSVD(n_components=50),50,3,"{'clf': LogisticRegression(C=100), 'reduce_dim...",0.978881,0.974657,0.976744,0.973573,0.975687,0.975908,0.001822,1
9,0.6523,0.081594,0.314146,0.024342,LogisticRegression(C=100),TruncatedSVD(n_components=50),50,5,"{'clf': LogisticRegression(C=100), 'reduce_dim...",0.973601,0.971489,0.97463,0.971459,0.976744,0.973585,0.001999,3


In [38]:
# Results when Headers and Footers are removed
pd.DataFrame(grid2.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_clf,param_reduce_dim,param_reduce_dim__n_components,param_vect__min_df,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,4.341135,0.343946,0.310825,0.032707,LinearSVC(C=10),TruncatedSVD(n_components=50),50,3,"{'clf': LinearSVC(C=10), 'reduce_dim': Truncat...",0.966209,0.977825,0.973573,0.968288,0.97463,0.972105,0.004256,2
1,4.360378,0.287692,0.291636,0.032915,LinearSVC(C=10),TruncatedSVD(n_components=50),50,5,"{'clf': LinearSVC(C=10), 'reduce_dim': Truncat...",0.964097,0.980993,0.970402,0.965116,0.97463,0.971048,0.006255,4
2,21.655785,3.108991,0.31656,0.018701,LinearSVC(C=10),NMF(),50,3,"{'clf': LinearSVC(C=10), 'reduce_dim': NMF(), ...",0.947202,0.968321,0.95666,0.952431,0.961945,0.957312,0.007337,8
3,12.531716,3.102796,0.290919,0.027788,LinearSVC(C=10),NMF(),50,5,"{'clf': LinearSVC(C=10), 'reduce_dim': NMF(), ...",0.954593,0.967265,0.960888,0.946089,0.965116,0.95879,0.007684,6
4,0.516619,0.046804,0.0,0.0,"LogisticRegression(C=10, penalty='l1')",TruncatedSVD(n_components=50),50,3,"{'clf': LogisticRegression(C=10, penalty='l1')...",,,,,,,,13
5,0.486146,0.020771,0.0,0.0,"LogisticRegression(C=10, penalty='l1')",TruncatedSVD(n_components=50),50,5,"{'clf': LogisticRegression(C=10, penalty='l1')...",,,,,,,,14
6,0.526037,0.025804,0.0,0.0,"LogisticRegression(C=10, penalty='l1')",NMF(),50,3,"{'clf': LogisticRegression(C=10, penalty='l1')...",,,,,,,,15
7,3.658533,6.242231,0.0,0.0,"LogisticRegression(C=10, penalty='l1')",NMF(),50,5,"{'clf': LogisticRegression(C=10, penalty='l1')...",,,,,,,,16
8,0.572096,0.020714,0.251907,0.019098,LogisticRegression(C=100),TruncatedSVD(n_components=50),50,3,"{'clf': LogisticRegression(C=100), 'reduce_dim...",0.966209,0.977825,0.973573,0.970402,0.972516,0.972105,0.003813,1
9,0.55077,0.03056,0.249504,0.017271,LogisticRegression(C=100),TruncatedSVD(n_components=50),50,5,"{'clf': LogisticRegression(C=100), 'reduce_dim...",0.966209,0.980993,0.970402,0.966173,0.97463,0.971681,0.005609,3
