# Import python libraries

In [1]:
# imports
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.pipeline import Pipeline

from sklearn.metrics import confusion_matrix, plot_confusion_matrix

from sklearn.model_selection import GridSearchCV , train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier , ExtraTreesClassifier  
from sklearn.preprocessing import OneHotEncoder


# Import CountVectorizer and TFIDFVectorizer from feature_extraction.text.
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

### Import Train and Test Data

In [2]:
X_train= pd.read_csv("./datasets/X_train.csv" , squeeze = True)
y_train= pd.read_csv("./datasets/y_train.csv", squeeze = True)

X_test= pd.read_csv("./datasets/X_test.csv" , squeeze = True)
y_test= pd.read_csv("./datasets/y_test.csv", squeeze = True)

### Model 5: Random Forest : Pipeline - GridSearch

In [3]:
# Let's set a pipeline up with two stages:
# 1. CountVectorizer (transformer)
# 2. Random Forest Classifier (estimator)
pipe = Pipeline([
    ('cvec', CountVectorizer()),
    ('rfc', RandomForestClassifier())
])

In [4]:
# Search over the following values of hyperparameters:
# Maximum number of features fit: 2000, 3000, 4000, 5000
# Minimum number of documents needed to include token: 2, 3
# Maximum number of documents needed to include token: 90%, 95%
# Check (individual tokens) and also check (individual tokens and 2-grams).
pipe_params = {
    'cvec__max_features' : [200, 300, 400, 500],
    'cvec__stop_words' : [None,['Sherlock' , 'Poirot'], 'english'],
    'cvec__ngram_range' : [(1,1), (1,2)],
    'rfc__criterion':['gini', 'entropy'],
    'rfc__max_features': ['auto', 'sqrt', 'log2'],
    'rfc__ccp_alpha' : [0, 0.001, 0.01, 0.1, 1, 10]
}

In [5]:
# Instantiate GridSearchCV.
gs_rfc = GridSearchCV(pipe,
                      pipe_params,
                      cv = 5,
                      verbose=1,
                      n_jobs=8)

In [6]:
# Fit GridSearch to training data.
gs_rfc.fit(X_train, y_train)

Fitting 5 folds for each of 864 candidates, totalling 4320 fits


[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    2.2s
[Parallel(n_jobs=8)]: Done 184 tasks      | elapsed:    6.4s
[Parallel(n_jobs=8)]: Done 434 tasks      | elapsed:   14.0s
[Parallel(n_jobs=8)]: Done 784 tasks      | elapsed:   27.2s
[Parallel(n_jobs=8)]: Done 1234 tasks      | elapsed:   43.1s
[Parallel(n_jobs=8)]: Done 1784 tasks      | elapsed:  1.0min
[Parallel(n_jobs=8)]: Done 2434 tasks      | elapsed:  1.4min
[Parallel(n_jobs=8)]: Done 3184 tasks      | elapsed:  1.9min
[Parallel(n_jobs=8)]: Done 4034 tasks      | elapsed:  2.4min
[Parallel(n_jobs=8)]: Done 4320 out of 4320 | elapsed:  2.6min finished


GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('cvec', CountVectorizer()),
                                       ('rfc', RandomForestClassifier())]),
             n_jobs=8,
             param_grid={'cvec__max_features': [200, 300, 400, 500],
                         'cvec__ngram_range': [(1, 1), (1, 2)],
                         'cvec__stop_words': [None, ['Sherlock', 'Poirot'],
                                              'english'],
                         'rfc__ccp_alpha': [0, 0.001, 0.01, 0.1, 1, 10],
                         'rfc__criterion': ['gini', 'entropy'],
                         'rfc__max_features': ['auto', 'sqrt', 'log2']},
             verbose=1)

In [7]:
gs_rfc_df = pd.DataFrame(gs_rfc.cv_results_)

In [8]:
gs_rfc.best_estimator_

Pipeline(steps=[('cvec',
                 CountVectorizer(max_features=500, stop_words='english')),
                ('rfc',
                 RandomForestClassifier(ccp_alpha=0, criterion='entropy',
                                        max_features='log2'))])

In [9]:
gs_rfc.best_params_

{'cvec__max_features': 500,
 'cvec__ngram_range': (1, 1),
 'cvec__stop_words': 'english',
 'rfc__ccp_alpha': 0,
 'rfc__criterion': 'entropy',
 'rfc__max_features': 'log2'}

### Run the model on Test data

In [10]:
# Evaluate the best fit model on the test data.
best_rfc = gs_rfc.best_estimator_
print(f"Training Score from best Random Forest: {gs_rfc.score(X_train , y_train)}")
print(f"Test Score from best Random Forest: {gs_rfc.score(X_test, y_test)}") 

Training Score from best Random Forest: 0.9939024390243902
Test Score from best Random Forest: 0.9045801526717557
