# Classification Grid Search

In [1]:
N_JOBS = 3

In [2]:
import pandas as pd
import numpy as np
import time

- SVM
- KNN
- Decision Tree
- Naive Bayes

In [3]:
import pandas as pd 
import numpy as np 
from sklearn.metrics import classification_report, confusion_matrix 
from sklearn.svm import SVC 
from sklearn.model_selection import GridSearchCV 
import os
from sklearn.model_selection import train_test_split
import json
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB

In [4]:
def SVMGridSearch(dataSetName, X_train, y_train, X_test, y_test):  
    # defining parameter range 
    param_grid = {'C': [0.1, 1, 10, 100, 1000],  
                  'gamma': [1, 0.1, 0.01, 0.001, 0.0001], 
                  'kernel': ['rbf', 'linear']}  

    grid = GridSearchCV(SVC(), param_grid, refit = True, verbose = 1, n_jobs = N_JOBS) 
    # fitting the model for grid search 
    grid.fit(X_train, y_train)
    data = {}
    data['dataSet'] = dataSetName
    data['Algorithm'] = "SVC"
    data['best_estimator'] = str(grid.best_estimator_)
    data['training_score'] = grid.score(X_train, y_train)
    data['testing_score'] = grid.score(X_test, y_test)
    data['best_score'] = grid.best_score_
    return data 

In [5]:
def KNNGridSearch(dataSetName, X_train, y_train, X_test, y_test):  
    # defining parameter range 
    param_grid = {'n_neighbors': [3, 5, 7, 11, 19],  
                  'weights': ['uniform', 'distance'], 
                  'metric': ['euclidean', 'manhattan']}  

    grid = GridSearchCV(KNeighborsClassifier(), param_grid, refit = True, verbose = 1, n_jobs = N_JOBS) 
    # fitting the model for grid search 
    grid.fit(X_train, y_train)
    data = {}
    data['dataSet'] = dataSetName
    data['Algorithm'] = "KNN"
    data['best_estimator'] = str(grid.best_estimator_)
    data['training_score'] = grid.score(X_train, y_train)
    data['testing_score'] = grid.score(X_test, y_test)
    data['best_score'] = grid.best_score_
    return data

In [6]:
def TreeGridSearch(dataSetName, X_train, y_train, X_test, y_test):  
    # defining parameter range 
    param_grid = {'criterion': ['entropy', 'gini'], 'max_depth': [2, 5, 10, 12], 'min_samples_leaf': [2, 5, 10, 12]}

    grid = GridSearchCV(DecisionTreeClassifier(), param_grid, refit = True, verbose = 1, n_jobs = N_JOBS) 
    # fitting the model for grid search 
    grid.fit(X_train, y_train)
    data = {}
    data['dataSet'] = dataSetName
    data['Algorithm'] = "DecisionTree"
    data['best_estimator'] = str(grid.best_estimator_)
    data['training_score'] = grid.score(X_train, y_train)
    data['testing_score'] = grid.score(X_test, y_test)
    data['best_score'] = grid.best_score_
    return data

In [7]:
def BayesSearch(dataSetName, X_train, y_train, X_test, y_test):  
    grid = GaussianNB()
    grid.fit(X_train, y_train)
    data = {}
    data['dataSet'] = dataSetName
    data['Algorithm'] = "Naive Bayes"
    data['best_estimator'] = "GaussianNB()"
    data['training_score'] = grid.score(X_train, y_train)
    data['testing_score'] = grid.score(X_test, y_test)
    data['best_score'] = data['training_score']
    return data

In [8]:
os.listdir()

['.ipynb_checkpoints',
 'accuracy-result.ipynb',
 'cleaned-data',
 'details',
 'done-cleaning-data',
 'done-results-data',
 'pipeline',
 'results',
 'results-metadata',
 'to_sahil',
 'to_sahil.zip',
 'transfusion.csv',
 'Untitled.ipynb']

In [9]:
os.listdir('cleaned-data')

['data_banknote_authentication.csv']

In [None]:
folder = 'cleaned-data'
files = os.listdir('cleaned-data')

for file in files:
    df = pd.read_csv(folder+"//"+file)
    dataset_name = file.split(".")
    dataset_name = dataset_name[:-1]
    dataset_name = ''.join(dataset_name)
    dataset_name += '-scored'
    X  = df.iloc[:,:-1].values
    y  = df.iloc[:,-1:].values
    y = y.ravel()
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)
    print("starting..", dataset_name)
    
    start = time.time()
    data = BayesSearch(dataset_name ,X_train, y_train, X_test, y_test)
    timeTaken = time.time() - start
    data['TimeTaken'] = timeTaken
    data['njobs'] = N_JOBS
    json_file_name = "results/" + dataset_name + "-bayes.json"
    print(data)
    with open(json_file_name, "w") as f:
        json.dump(data, f)
        
    start = time.time()
    data = TreeGridSearch(dataset_name ,X_train, y_train, X_test, y_test)
    timeTaken = time.time() - start
    data['TimeTaken'] = timeTaken
    data['njobs'] = N_JOBS
    json_file_name = "results/" + dataset_name + "-tree.json"
    print(data)
    with open(json_file_name, "w") as f:
        json.dump(data, f)
    
    start = time.time()
    data = KNNGridSearch(dataset_name ,X_train, y_train, X_test, y_test)
    timeTaken = time.time() - start
    data['TimeTaken'] = timeTaken
    data['njobs'] = N_JOBS
    json_file_name = "results/" + dataset_name + "-knn.json"
    print(data)
    with open(json_file_name, "w") as f:
        json.dump(data, f)
        
    start = time.time()
    data = SVMGridSearch(dataset_name ,X_train, y_train, X_test, y_test)
    timeTaken = time.time() - start
    data['TimeTaken'] = timeTaken
    data['njobs'] = N_JOBS
    json_file_name = "results/" + dataset_name + "-svm.json"
    print(data)
    with open(json_file_name, "w") as f:
        json.dump(data, f)

starting.. data_banknote_authentication-scored
{'dataSet': 'data_banknote_authentication-scored', 'Algorithm': 'Naive Bayes', 'best_estimator': 'GaussianNB()', 'training_score': 0.8464528668610302, 'testing_score': 0.8279883381924198, 'best_score': 0.8464528668610302, 'TimeTaken': 0.002995014190673828, 'njobs': 3}
Fitting 5 folds for each of 32 candidates, totalling 160 fits


[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done 140 tasks      | elapsed:    1.5s
[Parallel(n_jobs=3)]: Done 160 out of 160 | elapsed:    1.5s finished
[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done 100 out of 100 | elapsed:    0.1s finished


{'dataSet': 'data_banknote_authentication-scored', 'Algorithm': 'DecisionTree', 'best_estimator': 'DecisionTreeClassifier(max_depth=10, min_samples_leaf=2)', 'training_score': 0.9931972789115646, 'testing_score': 0.9766763848396501, 'best_score': 0.9825147999052806, 'TimeTaken': 1.5910186767578125, 'njobs': 3}
Fitting 5 folds for each of 20 candidates, totalling 100 fits
{'dataSet': 'data_banknote_authentication-scored', 'Algorithm': 'KNN', 'best_estimator': "KNeighborsClassifier(metric='euclidean', n_neighbors=3)", 'training_score': 0.9990281827016521, 'testing_score': 1.0, 'best_score': 1.0, 'TimeTaken': 0.17552947998046875, 'njobs': 3}
Fitting 5 folds for each of 50 candidates, totalling 250 fits


[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
