# Import python libraries

In [1]:
# imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix, plot_confusion_matrix
from nltk import FreqDist, pos_tag
from nltk.tokenize import word_tokenize
from collections import Counter
from sklearn.utils import resample
from sklearn.feature_extraction.text import TfidfTransformer

# Import CountVectorizer and TFIDFVectorizer from feature_extraction.text.
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

### Import Train Test Data

In [2]:
X_train= pd.read_csv("./datasets/X_train.csv" , squeeze = True)
y_train= pd.read_csv("./datasets/y_train.csv", squeeze = True)

X_test= pd.read_csv("./datasets/X_test.csv" , squeeze = True)
y_test= pd.read_csv("./datasets/y_test.csv", squeeze = True)


### Model 2: KNN : Pipeline - GridSearch

In [3]:
# Let's set a pipeline up with two stages:
# 1. CountVectorizer (transformer)
# 2. KNN (estimator)
pipe = Pipeline([
    ('cvec', CountVectorizer()),
     ('sc', StandardScaler()),
    ('knn', KNeighborsClassifier())
])

In [4]:
# Search over the following values of hyperparameters:
# Maximum number of features fit: 2000, 3000, 4000, 5000
# Minimum number of documents needed to include token: 2, 3
# Maximum number of documents needed to include token: 90%, 95%
# Check (individual tokens) and also check (individual tokens and 2-grams).
pipe_params = {
    'cvec__max_features' : [200,300,400],
    'cvec__stop_words' : [None,['Sherlock' , 'Poirot'], 'english'],
    'cvec__ngram_range' : [(1,1), (1,2)],
    'sc__copy':[True , False],
    'sc__with_mean':[True , False],
    'sc__with_std': [True, False],
    'knn__n_neighbors':[1,5,10,15],
    'knn__weights':['uniform', 'distance'],
    'knn__p':[1,2]    
}

In [5]:
# Instantiate GridSearchCV.
gs_knn = GridSearchCV(pipe,
                      pipe_params,
                      cv = 10,
                      verbose=1,
                      n_jobs=8)

In [6]:
# Fit GridSearch to training data.
gs_knn.fit(X_train, y_train)

Fitting 10 folds for each of 2304 candidates, totalling 23040 fits


[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    1.9s
[Parallel(n_jobs=8)]: Done 560 tasks      | elapsed:    4.9s
[Parallel(n_jobs=8)]: Done 1560 tasks      | elapsed:   11.3s
[Parallel(n_jobs=8)]: Done 2960 tasks      | elapsed:   20.8s
[Parallel(n_jobs=8)]: Done 4760 tasks      | elapsed:   42.4s
[Parallel(n_jobs=8)]: Done 6960 tasks      | elapsed:  1.3min
[Parallel(n_jobs=8)]: Done 9560 tasks      | elapsed:  1.6min
[Parallel(n_jobs=8)]: Done 12560 tasks      | elapsed:  2.2min
[Parallel(n_jobs=8)]: Done 15960 tasks      | elapsed:  2.9min
[Parallel(n_jobs=8)]: Done 19760 tasks      | elapsed:  3.4min
[Parallel(n_jobs=8)]: Done 23040 out of 23040 | elapsed:  4.3min finished


GridSearchCV(cv=10,
             estimator=Pipeline(steps=[('cvec', CountVectorizer()),
                                       ('sc', StandardScaler()),
                                       ('knn', KNeighborsClassifier())]),
             n_jobs=8,
             param_grid={'cvec__max_features': [200, 300, 400],
                         'cvec__ngram_range': [(1, 1), (1, 2)],
                         'cvec__stop_words': [None, ['Sherlock', 'Poirot'],
                                              'english'],
                         'knn__n_neighbors': [1, 5, 10, 15], 'knn__p': [1, 2],
                         'knn__weights': ['uniform', 'distance'],
                         'sc__copy': [True, False],
                         'sc__with_mean': [True, False],
                         'sc__with_std': [True, False]},
             verbose=1)

In [7]:
gs_knn_df = pd.DataFrame(gs_knn.cv_results_)

In [8]:
gs_knn_df

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_cvec__max_features,param_cvec__ngram_range,param_cvec__stop_words,param_knn__n_neighbors,param_knn__p,param_knn__weights,...,split3_test_score,split4_test_score,split5_test_score,split6_test_score,split7_test_score,split8_test_score,split9_test_score,mean_test_score,std_test_score,rank_test_score
0,0.089538,0.020547,0.000000,0.000000,200,"(1, 1)",,1,1,uniform,...,,,,,,,,,,2304
1,0.065045,0.006497,0.000000,0.000000,200,"(1, 1)",,1,1,uniform,...,,,,,,,,,,1240
2,0.056893,0.003915,0.015792,0.002390,200,"(1, 1)",,1,1,uniform,...,0.924242,0.878788,0.924242,0.923077,0.907692,0.846154,0.938462,0.906993,0.030933,443
3,0.047336,0.003712,0.013412,0.002457,200,"(1, 1)",,1,1,uniform,...,0.954545,0.863636,0.909091,0.938462,0.907692,0.953846,1.000000,0.931515,0.042475,133
4,0.041343,0.004467,0.000000,0.000000,200,"(1, 1)",,1,1,uniform,...,,,,,,,,,,2236
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2299,0.089686,0.017114,0.008200,0.002075,400,"(1, 2)",english,15,2,distance,...,0.954545,0.969697,0.969697,1.000000,0.907692,0.953846,0.938462,0.952727,0.030884,7
2300,0.087354,0.015799,0.000000,0.000000,400,"(1, 2)",english,15,2,distance,...,,,,,,,,,,1236
2301,0.090953,0.018379,0.000000,0.000000,400,"(1, 2)",english,15,2,distance,...,,,,,,,,,,1228
2302,0.092349,0.015872,0.008647,0.001859,400,"(1, 2)",english,15,2,distance,...,0.909091,0.939394,0.954545,0.938462,0.876923,0.953846,0.938462,0.929860,0.025783,159


In [9]:
gs_knn.best_estimator_

Pipeline(steps=[('cvec',
                 CountVectorizer(max_features=300, stop_words='english')),
                ('sc', StandardScaler(with_mean=False, with_std=False)),
                ('knn',
                 KNeighborsClassifier(n_neighbors=15, weights='distance'))])

In [10]:
gs_knn.best_params_

{'cvec__max_features': 300,
 'cvec__ngram_range': (1, 1),
 'cvec__stop_words': 'english',
 'knn__n_neighbors': 15,
 'knn__p': 2,
 'knn__weights': 'distance',
 'sc__copy': True,
 'sc__with_mean': False,
 'sc__with_std': False}

### Run the model on Test data

In [11]:
# Evaluate the best fit model on the test data.
best_knn = gs_knn.best_estimator_
print(f"Training Score from best KNN: {best_knn.score(X_train , y_train)}")
print(f"Test Score from best KNN: {best_knn.score(X_test, y_test)}")                                     
                                    

Training Score from best KNN: 0.9801829268292683
Test Score from best KNN: 0.8778625954198473


#### Training Score from best KNN: 0.9801829268292683
#### Test Score from best KNN: 0.8778625954198473