In [1]:
# Import statements
import pandas as pd
import numpy as np
from scipy import stats
import os
import matplotlib.pyplot as plt
from scipy.stats import uniform, truncnorm, randint
from IPython.core.display import display, HTML
from IPython.display import Image
from IPython.core.display import HTML 

from sklearn.model_selection import RandomizedSearchCV
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import *
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegressionCV, RidgeClassifierCV
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.neighbors import KNeighborsClassifier
from collections import Counter
from imblearn.over_sampling import SMOTE

# Modules defined in the My_Functions.py file
from My_Functions import *
# Show NumPy values with three digits of precision
np.set_printoptions(precision=3)

# Raise the pandas display limits so frames are rendered in full rather than truncated
# (set_option accepts several option/value pairs in a single call)
pd.set_option(
    'display.max_rows', 800,
    'display.max_columns', 500,
    'display.width', 1000,
    'display.max_colwidth', 100,
)

# Stretch the notebook container across the full browser width
display(HTML(data="<style>.container { width:100% !important; }</style>"))

# Switch the working directory to the project folder (path is relative to the current directory)
os.chdir("../Data Mining Course Project")

Using TensorFlow backend.


In [2]:
# Load the training predictors and the training labels from CSV files in the working directory
train_data_pca = pd.read_csv(filepath_or_buffer='kmeans_pca.csv')
train_labels = pd.read_csv(filepath_or_buffer='train_lab.csv')

In [3]:
# Preview the first rows of the freshly loaded predictor table
train_data_pca.head()

Unnamed: 0.1,Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,0.1
0,0,-0.46491,-0.316926,-1.363506,1.230901,-0.098547,-0.542456,0.490049,0.165996,0.151046,-0.091806,-0.332245,-0.504923,-0.757475,0.039029,-0.084057,0.131032,-0.097981,0.053069,-0.08515,0.818491,0.275366,-0.059183,0.218779,0.14722,-0.676325,-0.446347,-0.449449,0.320993,-0.333953,0.061953,-0.158515,-0.312028,0.171036,0.310926,-0.001286,-0.189245,1.195166,-0.44471,-0.063681,0.660103,0.652597,0.018062,0.152451,-0.244167,-0.029685,-0.111967,0.028517,-0.063287,0.184262,-0.081252,1
1,1,1.56016,-0.432727,-0.179785,-1.472583,0.615717,-0.296843,-0.665676,-0.284648,-0.623543,-0.289691,0.422913,-0.120372,-0.594092,-0.175062,-0.557302,0.305455,0.327487,-0.526919,0.435269,-0.329349,-0.031249,0.149355,-0.619289,-0.220203,0.581583,0.23713,0.075135,0.221422,-0.04415,-0.124973,-0.368901,-0.747158,-0.230927,-0.145338,-0.233412,0.052361,0.025482,0.117755,-0.55136,-0.130696,0.150802,0.10961,-0.115578,-0.136342,-0.252147,0.042794,0.017075,0.121018,-0.049644,-0.026691,0
2,2,1.955894,-0.462216,-0.126313,0.112227,1.863922,0.339726,0.815896,0.641782,1.169954,-0.280485,0.01286,-0.24726,0.08075,0.528933,0.506454,0.428117,-0.544043,-0.72599,-0.037273,-0.323508,0.007881,-0.046946,-0.772998,-0.014105,-0.762227,0.74219,0.394372,0.774807,0.385143,0.497819,-0.064672,0.102319,0.648539,0.255333,-0.074587,1.111646,-0.099733,-0.685151,-0.486844,0.695453,0.085704,-0.349214,-0.156687,-0.028938,0.008576,0.177825,0.102276,0.495152,0.575929,-0.115436,0
3,3,1.838911,0.716422,-0.841167,-0.285876,0.342644,1.07211,-0.741913,0.548292,-0.115703,-0.208982,-0.392698,0.103723,0.082739,0.187654,0.008773,0.134124,-0.551801,-0.115026,-0.313719,-0.547184,0.181539,-0.36624,0.181245,-0.155202,-0.042247,-0.584245,0.378847,0.072509,0.125673,-0.223241,-0.36066,1.10591,-0.00559,0.403564,-0.189161,-0.313299,-0.222334,-0.371923,0.21377,0.27229,-0.022659,0.113314,0.006712,0.064857,-0.041309,-0.053862,-0.007181,-0.040829,0.012355,0.016757,0
4,4,0.656492,-0.259554,-1.284894,-0.018745,-0.700243,-0.568247,0.41434,-0.381182,0.971891,0.082864,-0.459408,-0.141622,-0.425934,0.32673,0.506451,-0.753611,0.076733,-0.889669,-0.425743,0.414551,0.661222,-0.123608,0.582808,0.945176,0.214869,0.330753,-0.170024,0.27679,-0.318499,0.188457,-0.129848,-0.369151,0.325235,0.196965,-0.053203,-0.011197,-0.107848,0.075202,0.388868,-0.5801,0.649799,0.179885,0.050285,-0.137145,-0.24583,0.195104,-0.149184,-0.260455,0.162444,0.02492,0


In [4]:
# Use the saved row ids ("Unnamed: 0") as the index. The guard keeps this cell re-runnable:
# after the first run the column has already been moved into the index.
if "Unnamed: 0" in train_data_pca.columns:
    train_data_pca.set_index("Unnamed: 0", inplace=True)

# Clear the index label. `del train_data_pca.index.name` raises AttributeError on
# pandas >= 1.0 (the attribute is a property without a deleter); assigning None is
# the supported way to remove the name.
train_data_pca.index.name = None

In [7]:
# Give the trailing "0.1" column a descriptive name, then preview the result
train_data_pca.rename(mapper={"0.1": "cluster"}, axis="columns", inplace=True)
train_data_pca.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,cluster
0,-0.46491,-0.316926,-1.363506,1.230901,-0.098547,-0.542456,0.490049,0.165996,0.151046,-0.091806,-0.332245,-0.504923,-0.757475,0.039029,-0.084057,0.131032,-0.097981,0.053069,-0.08515,0.818491,0.275366,-0.059183,0.218779,0.14722,-0.676325,-0.446347,-0.449449,0.320993,-0.333953,0.061953,-0.158515,-0.312028,0.171036,0.310926,-0.001286,-0.189245,1.195166,-0.44471,-0.063681,0.660103,0.652597,0.018062,0.152451,-0.244167,-0.029685,-0.111967,0.028517,-0.063287,0.184262,-0.081252,1
1,1.56016,-0.432727,-0.179785,-1.472583,0.615717,-0.296843,-0.665676,-0.284648,-0.623543,-0.289691,0.422913,-0.120372,-0.594092,-0.175062,-0.557302,0.305455,0.327487,-0.526919,0.435269,-0.329349,-0.031249,0.149355,-0.619289,-0.220203,0.581583,0.23713,0.075135,0.221422,-0.04415,-0.124973,-0.368901,-0.747158,-0.230927,-0.145338,-0.233412,0.052361,0.025482,0.117755,-0.55136,-0.130696,0.150802,0.10961,-0.115578,-0.136342,-0.252147,0.042794,0.017075,0.121018,-0.049644,-0.026691,0
2,1.955894,-0.462216,-0.126313,0.112227,1.863922,0.339726,0.815896,0.641782,1.169954,-0.280485,0.01286,-0.24726,0.08075,0.528933,0.506454,0.428117,-0.544043,-0.72599,-0.037273,-0.323508,0.007881,-0.046946,-0.772998,-0.014105,-0.762227,0.74219,0.394372,0.774807,0.385143,0.497819,-0.064672,0.102319,0.648539,0.255333,-0.074587,1.111646,-0.099733,-0.685151,-0.486844,0.695453,0.085704,-0.349214,-0.156687,-0.028938,0.008576,0.177825,0.102276,0.495152,0.575929,-0.115436,0
3,1.838911,0.716422,-0.841167,-0.285876,0.342644,1.07211,-0.741913,0.548292,-0.115703,-0.208982,-0.392698,0.103723,0.082739,0.187654,0.008773,0.134124,-0.551801,-0.115026,-0.313719,-0.547184,0.181539,-0.36624,0.181245,-0.155202,-0.042247,-0.584245,0.378847,0.072509,0.125673,-0.223241,-0.36066,1.10591,-0.00559,0.403564,-0.189161,-0.313299,-0.222334,-0.371923,0.21377,0.27229,-0.022659,0.113314,0.006712,0.064857,-0.041309,-0.053862,-0.007181,-0.040829,0.012355,0.016757,0
4,0.656492,-0.259554,-1.284894,-0.018745,-0.700243,-0.568247,0.41434,-0.381182,0.971891,0.082864,-0.459408,-0.141622,-0.425934,0.32673,0.506451,-0.753611,0.076733,-0.889669,-0.425743,0.414551,0.661222,-0.123608,0.582808,0.945176,0.214869,0.330753,-0.170024,0.27679,-0.318499,0.188457,-0.129848,-0.369151,0.325235,0.196965,-0.053203,-0.011197,-0.107848,0.075202,0.388868,-0.5801,0.649799,0.179885,0.050285,-0.137145,-0.24583,0.195104,-0.149184,-0.260455,0.162444,0.02492,0


In [10]:
# Sanity check: (number of rows, number of columns) of the predictor table
train_data_pca.shape

(2973, 51)

In [16]:
# NOTE: this binds a second name to the same DataFrame (no copy is made),
# so every column of train_data_pca is used as a predictor.
train_pred = train_data_pca

In [14]:
# Create indicators/labels: pull the "0" column out of the label frame as a 1-D NumPy array.
# The isinstance guard makes the cell safe to re-run: after the first run `train_labels`
# is already an ndarray, and indexing it with "0" again would raise.
if isinstance(train_labels, pd.DataFrame):
    train_labels = train_labels["0"].to_numpy()
train_labels.shape

(2973,)

In [17]:
# Preprocessing: fill missing values with the column median, then rescale with MinMaxScaler
pipeline = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', MinMaxScaler()),
    ]
)

# Fit the preprocessing on the predictors and replace them with the transformed values
train_pred = pipeline.fit_transform(train_pred)

In [19]:
# Balance the dataset with SMOTE.
# This cell has to run: the cells further down that use X_res / y_res fail with a
# NameError on a fresh kernel if the resampling below is commented out.
# NOTE(review): the resampling happens before cross-validation, so synthetic samples can
# end up in validation folds and make the scores on X_res / y_res optimistic.
print('Original dataset shape %s' % Counter(train_labels))

sm = SMOTE(random_state=42)

X_res, y_res = sm.fit_resample(train_pred, train_labels)

print('Resampled dataset shape %s' % Counter(y_res))

In [20]:
#Function to pass different model types and perform k-fold cross validation and hyperparameter tuning

def cv_hyptun_model(train, train_labels, model, name, params, model_results_balanced=None,
                    n_iter=100, cv=10, random_state=None):
    """
    Tune a classifier with a cross-validated randomized search and score the refitted best model.

    Pass the following input arguments to the function:
    train -        predictors from the training dataset
    train_labels - labels (y) from the training dataset
    model -        unfitted estimator to tune, ex. RandomForestClassifier()
    name -         name of the model, ex. 'RF' (stored in the 'model' column of the results)
    params -       a dictionary of the parameters you want to tune and, for each of them, the
                   distribution (or list of values) from which the randomized search samples
                   ex. params = {
                                # random integers from 4 to 199 (the upper bound 200 is exclusive)
                                'n_estimators': randint(4,200),
                                # truncated normal; a and b are expressed in units of `scale`
                                # relative to `loc`, so the values fall between 0.25 and 0.35
                                'max_features': truncnorm(a=0, b=1, loc=0.25, scale=0.1),
                                # uniform distribution from 0.01 to 0.209 (loc=0.01, scale=0.199)
                                'min_samples_split': uniform(0.01, 0.199)
                                }
    model_results_balanced - optional dataframe of results from earlier calls; the new row is
                   appended to it. With None (the default) or an empty dataframe the returned
                   dataframe holds only the row for this model.
    n_iter -       number of parameter settings sampled by the search (default 100)
    cv -           cross-validation setting forwarded to RandomizedSearchCV (default 10 folds)
    random_state - seed forwarded to RandomizedSearchCV; set it to make the sampled
                   candidates reproducible (default None)

    Returns a tuple (model_results_balanced, hyptun_best_model):
    model_results_balanced - dataframe with the columns 'model', 'F1_score', 'Accuracy',
                             'Accuracy_balanced' and 'ROC_AUC'
    hyptun_best_model -      dictionary with the parameters of the best estimator
                             (the output of get_params(), not the fitted estimator itself)

    Please note:
    * The metrics are computed by predicting on the same `train` data the best model was
      fitted on, so they are in-sample scores, not cross-validated ones.
    * Balanced accuracy is reported even though the input may not be an imbalanced dataset.
    * ROC_AUC is NaN when the tuned estimator does not provide predict_proba.
    """

    # Creating the RandomizedSearchCV object which will be used to perform the CV and hyperparameter tuning
    cv_hyptun = RandomizedSearchCV(estimator=model, param_distributions=params, n_iter=n_iter,
                                   cv=cv, n_jobs=-1, random_state=random_state)

    # Fitting the model with the predictors and labels
    model_fit = cv_hyptun.fit(train, train_labels)

    # Obtaining the parameters of the best model found by the hyperparameter tuning
    hyptun_best_model = model_fit.best_estimator_.get_params()

    # Obtaining the predictions for each observation of the train dataset
    # (the fitted search object delegates to the best model)
    predictions_labels = model_fit.predict(train)

    # Calculating the various model performance metrics
    F1_score = f1_score(train_labels, predictions_labels, average="macro")
    Accuracy = accuracy_score(train_labels, predictions_labels)
    Accuracy_balanced = balanced_accuracy_score(train_labels, predictions_labels)

    if hasattr(model_fit, "predict_proba"):
        predictions_prob = np.asarray(model_fit.predict_proba(train))
        # For a binary target roc_auc_score expects the positive-class scores as a 1-D array
        if predictions_prob.ndim == 2 and predictions_prob.shape[1] == 2:
            predictions_prob = predictions_prob[:, 1]
        ROC_AUC = roc_auc_score(train_labels, predictions_prob, multi_class="ovo")
    else:
        # No class probabilities available (e.g. margin-based classifiers)
        ROC_AUC = np.nan

    # Storing the model performance metrics as a one-row dataframe
    new_row = pd.DataFrame({'model': name, 'F1_score': F1_score,
                            'Accuracy': Accuracy, 'Accuracy_balanced': Accuracy_balanced,
                            'ROC_AUC': ROC_AUC},
                           index=[0])

    # DataFrame.append was removed in pandas 2.0, so earlier results are extended with pd.concat
    if model_results_balanced is None or len(model_results_balanced) == 0:
        model_results_balanced = new_row
    else:
        model_results_balanced = pd.concat([model_results_balanced, new_row], ignore_index=True)

    return model_results_balanced, hyptun_best_model

In [21]:
# Random Forest
# Both hyperparameters are pinned to a single value (75 trees, min_samples_split = 19).
# Given as one-element lists, the randomized search recognises that the search space holds
# exactly one candidate and evaluates it once (with a warning), instead of refitting the
# identical configuration for every requested iteration.
model_params_RF = {'n_estimators': [75], 'min_samples_split': [19]}
RF_model_metrics, RF_best_model = cv_hyptun_model(train_pred, train_labels, RandomForestClassifier(random_state=42),'RF', params = model_params_RF)
RF_model_metrics

Unnamed: 0,model,F1_score,Accuracy,Accuracy_balanced,ROC_AUC
0,RF,0.771932,0.864447,0.688547,0.9943


In [26]:
# Balanced K Nearest Neighbors: tuned on X_res / y_res, which an earlier cell must have created
# Candidate neighbourhood sizes are drawn from the integers 2 to 29
model_params_KNN = dict(n_neighbors=randint(2, 30))
KNN_model_metrics, KNN_best_model = cv_hyptun_model(
    train=X_res,
    train_labels=y_res,
    model=KNeighborsClassifier(),
    name='KNN',
    params=model_params_KNN,
)
KNN_model_metrics

Unnamed: 0,model,F1_score,Accuracy,Accuracy_balanced,ROC_AUC
0,KNN,0.917012,0.922083,0.922083,0.999453


In [30]:
# LDA
# n_components may not exceed min(n_classes - 1, n_features); recent scikit-learn versions
# raise an error for larger values, so the sampled range is bounded from the data itself.
# NOTE: n_components only affects LDA's transform(), not predict(), so this search
# cannot change the reported classification metrics.
max_lda_components = min(len(np.unique(y_res)) - 1, X_res.shape[1])
model_params_LDA = {'n_components': randint(1, max_lda_components + 1)}
LDA_model_metrics, LDA_best_model = cv_hyptun_model(X_res, y_res, LinearDiscriminantAnalysis(),'LDA', params = model_params_LDA)
LDA_model_metrics



Unnamed: 0,model,F1_score,Accuracy,Accuracy_balanced,ROC_AUC
0,LDA,0.547952,0.547595,0.547595,0.791798
