<img src="http://imgur.com/1ZcRyrc.png" style="float: left; margin: 20px; height: 55px">

# Project 4: Kaggle Competition - Modeling & Prediction

- Clustering location of traps
- Predicting without number of mosquitos (missing in test)

## File imports

In [41]:
#standard Imports
import pandas as pd
#so that pandas do not truncate the rows
pd.set_option('max_columns', 500) 
pd.set_option('max_rows', 500) 
pd.set_option('display.max_colwidth', 300)
#Set datafrome display format
pd.options.display.float_format = "{:,.3f}".format


import numpy as np

#graph imports
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
#define the style of sns/plt
#sns.set_style("whitegrid")
from matplotlib.lines import Line2D

#==== Date Time ==== 
from datetime import datetime
from datetime import timedelta


#'==== Preprocessing == 
from sklearn.model_selection import train_test_split, cross_val_score, KFold, GridSearchCV
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline

#===Imbalance Data
from imblearn.over_sampling import SMOTE

#== Unsupervised learing: Clustering
from sklearn.cluster import KMeans, AgglomerativeClustering, DBSCAN


#Regression
from sklearn.linear_model import LinearRegression, LogisticRegression

#==Regularization
from sklearn.linear_model import Ridge, RidgeCV
from sklearn.linear_model import Lasso, LassoCV
from sklearn.linear_model import ElasticNet, ElasticNetCV

#==KNN
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor

#=== Naive Bayes
from sklearn.naive_bayes import MultinomialNB, BernoulliNB

#=== SVC
from sklearn.svm import SVC

#==== DecisionTrees
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor

#==== Emsemble
from sklearn.ensemble import BaggingClassifier, BaggingRegressor
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier

from sklearn.ensemble import AdaBoostClassifier, AdaBoostRegressor
from sklearn.ensemble import GradientBoostingClassifier, GradientBoostingRegressor


from sklearn.ensemble import VotingClassifier


#==== Classification matrices
from sklearn import metrics

from sklearn.metrics import plot_confusion_matrix, confusion_matrix, ConfusionMatrixDisplay
from sklearn.metrics import plot_roc_curve, auc, roc_auc_score, roc_curve, RocCurveDisplay
from sklearn.metrics import classification_report, accuracy_score, matthews_corrcoef
from sklearn.metrics import silhouette_score


In [2]:
train = pd.read_csv('https://raw.githubusercontent.com/rachellimce/DSI_Projects/master/project_4/datasets/train_weather.csv')
test = pd.read_csv('https://raw.githubusercontent.com/rachellimce/DSI_Projects/master/project_4/datasets/test_weather.csv')
colinear_feat = pd.read_csv('https://raw.githubusercontent.com/rachellimce/DSI_Projects/master/project_4/datasets/colinearfeatures.csv')

There are data which exists in test data but not in train data. 
There are 13 more traps which also do not exist in the train data. 

## Data Preparation

### Hot encoding categorial features
- Traps
- Species

We combine the 2 datasets - train and test to do one-hot encoding, drop more irrelevant roles and split the dataset back to train/test again

In [3]:
train.columns

Index(['date', 'species', 'trap', 'latitude', 'longitude', 'nummosquitos',
       'wnvpresent', 'year', 'month', 'weekofyear', 'yearmonth', 'weekday',
       'closest_station', 'station', 'tmax', 'tmin', 'tavg', 'depart',
       'dewpoint', 'wetbulb', 'preciptotal', 'resultspeed', 'resultdir',
       'avgspeed', 'daytime', 'rhumidity', 'rolltemp7', 'rollprec7', 'rollrh7',
       'rolltemp10', 'rollprec10', 'rollrh10', 'rolltemp14', 'rollprec14',
       'rollrh14'],
      dtype='object')

In [4]:
test.columns

Index(['id', 'date', 'species', 'trap', 'latitude', 'longitude', 'year',
       'month', 'weekofyear', 'yearmonth', 'weekday', 'closest_station',
       'station', 'tmax', 'tmin', 'tavg', 'depart', 'dewpoint', 'wetbulb',
       'preciptotal', 'resultspeed', 'resultdir', 'avgspeed', 'daytime',
       'rhumidity', 'rolltemp7', 'rollprec7', 'rollrh7', 'rolltemp10',
       'rollprec10', 'rollrh10', 'rolltemp14', 'rollprec14', 'rollrh14'],
      dtype='object')

In [5]:
train['is_train'] = 1
test['is_train'] = 0

train['id'] = 0 #dummy value
test['wnvpresent'] = 9 #dummy value
test['nummosquitos'] = 0 #dummy value

In [6]:
#rearrange the column order in both train and test before merge them for one-hot encoding
columnsTitles = train.columns
train = train.reindex(columns=columnsTitles)
test = test.reindex(columns=columnsTitles)

In [7]:
#one-hot encoding for train, test, merge them
combined = pd.concat([train, test],axis=0)

In [8]:
combined.shape

(126799, 37)

In [9]:
#perform one-hot encoding
object_columns = ['species','trap']

combined = pd.get_dummies(data=combined, columns = object_columns, drop_first = False)
combined.shape

(126799, 192)

In [10]:
#split the combined set into train set and test set
df_train = combined[combined['is_train']==1]
df_test = combined[combined['is_train']==0]


#drop columns created previously for merging for one-hot encoding
df_train = df_train.drop(['id','is_train'],axis=1)
df_test = df_test.drop(['wnvpresent', 'nummosquitos','is_train'],axis=1)

In [11]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10506 entries, 0 to 10505
Columns: 190 entries, date to trap_T903
dtypes: float64(16), int64(15), object(2), uint8(157)
memory usage: 4.3+ MB


In [12]:
df_test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 116293 entries, 0 to 116292
Columns: 189 entries, date to trap_T903
dtypes: float64(16), int64(14), object(2), uint8(157)
memory usage: 46.7+ MB


### Feature Selection


In [87]:
#we will drop the derived parameters such as 
# date, 'yearmonth', 'weekday' (represented by year, month)
# longtitude, latitude (represent by traps), 
#'closest_station', 'station', (more for reference in getting the weather values)
# 'tmax', 'tmin' (represented by tavg)
# 'numofmosquitos' (not present in test, so we removing it first)

df_trainselected = df_train.drop(columns=['date', 'latitude', 'longitude', 'nummosquitos', 
'wnvpresent', 'yearmonth', 'weekday','closest_station', 'station', 'tmax', 'tmin'])

df_trainselected.columns

Index(['year', 'month', 'weekofyear', 'tavg', 'depart', 'dewpoint', 'wetbulb',
       'preciptotal', 'resultspeed', 'resultdir',
       ...
       'trap_T231', 'trap_T232', 'trap_T233', 'trap_T234', 'trap_T235',
       'trap_T236', 'trap_T237', 'trap_T238', 'trap_T900', 'trap_T903'],
      dtype='object', length=179)

In [88]:
#scaling the train data
df_trainselected_skewed = df_trainselected.apply(lambda x: np.log1p(x) if np.issubdtype(x.dtype, np.number) else x)

  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)


In [89]:
df_trainselected_skewed.head()

Unnamed: 0,year,month,weekofyear,tavg,depart,dewpoint,wetbulb,preciptotal,resultspeed,resultdir,avgspeed,daytime,rhumidity,rolltemp7,rollprec7,rollrh7,rolltemp10,rollprec10,rollrh10,rolltemp14,rollprec14,rollrh14,species_CULEX ERRATICUS,species_CULEX PIPIENS,species_CULEX PIPIENS/RESTUANS,species_CULEX RESTUANS,species_CULEX SALINARIUS,species_CULEX TARSALIS,species_CULEX TERRITANS,species_UNSPECIFIED CULEX,trap_T001,trap_T002,trap_T002A,trap_T002B,trap_T003,trap_T004,trap_T005,trap_T006,trap_T007,trap_T008,trap_T009,trap_T011,trap_T012,trap_T013,trap_T014,trap_T015,trap_T016,trap_T017,trap_T018,trap_T019,trap_T025,trap_T027,trap_T028,trap_T030,trap_T031,trap_T033,trap_T034,trap_T035,trap_T036,trap_T037,trap_T039,trap_T040,trap_T043,trap_T044,trap_T045,trap_T046,trap_T047,trap_T048,trap_T049,trap_T050,trap_T051,trap_T054,trap_T054C,trap_T060,trap_T061,trap_T062,trap_T063,trap_T065,trap_T065A,trap_T066,trap_T067,trap_T069,trap_T070,trap_T071,trap_T072,trap_T073,trap_T074,trap_T075,trap_T076,trap_T077,trap_T078,trap_T079,trap_T080,trap_T081,trap_T082,trap_T083,trap_T084,trap_T085,trap_T086,trap_T088,trap_T089,trap_T090,trap_T090A,trap_T090B,trap_T090C,trap_T091,trap_T092,trap_T094,trap_T094B,trap_T095,trap_T096,trap_T097,trap_T099,trap_T100,trap_T102,trap_T103,trap_T107,trap_T114,trap_T115,trap_T128,trap_T128A,trap_T129,trap_T135,trap_T138,trap_T141,trap_T142,trap_T143,trap_T144,trap_T145,trap_T146,trap_T147,trap_T148,trap_T149,trap_T150,trap_T151,trap_T152,trap_T153,trap_T154,trap_T155,trap_T156,trap_T157,trap_T158,trap_T159,trap_T160,trap_T161,trap_T162,trap_T200,trap_T200A,trap_T200B,trap_T206,trap_T209,trap_T212,trap_T215,trap_T218,trap_T218A,trap_T218B,trap_T218C,trap_T219,trap_T220,trap_T221,trap_T222,trap_T223,trap_T224,trap_T225,trap_T226,trap_T227,trap_T228,trap_T229,trap_T230,trap_T231,trap_T232,trap_T233,trap_T234,trap_T235,trap_T236,trap_T237,trap_T238,trap_T900,trap_T903
0,7.605,1.792,3.135,4.317,2.398,4.078,4.19,0.0,1.917,2.944,2.015,6.799,4.067,4.186,0.184,4.17,4.197,0.14,4.104,4.238,0.103,4.004,0.0,0.0,0.693,0.0,0.0,0.0,0.0,0.0,0.0,0.693,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,7.605,1.792,3.135,4.317,2.398,4.078,4.19,0.0,1.917,2.944,2.015,6.799,4.067,4.186,0.184,4.17,4.197,0.14,4.104,4.238,0.103,4.004,0.0,0.0,0.0,0.693,0.0,0.0,0.0,0.0,0.0,0.693,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,7.605,1.792,3.135,4.317,2.398,4.078,4.19,0.0,1.917,2.944,2.015,6.799,4.067,4.186,0.184,4.17,4.197,0.14,4.104,4.238,0.103,4.004,0.0,0.0,0.0,0.693,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.693,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,7.605,1.792,3.135,4.317,2.398,4.078,4.19,0.0,1.917,2.944,2.015,6.799,4.067,4.186,0.184,4.17,4.197,0.14,4.104,4.238,0.103,4.004,0.0,0.0,0.693,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.693,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,7.605,1.792,3.135,4.317,2.398,4.078,4.19,0.0,1.917,2.944,2.015,6.799,4.067,4.186,0.184,4.17,4.197,0.14,4.104,4.238,0.103,4.004,0.0,0.0,0.0,0.693,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.693,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Train Test Split

In [90]:
X = df_trainselected_skewed
y = df_train['wnvpresent']

In [91]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size = 0.3, random_state = 42, stratify = y)

In [92]:
X_train.shape

(7354, 179)

In [93]:
X_val.shape

(3152, 179)

### Scaling of Data

In [94]:
ss = StandardScaler()
Xs_train = ss.fit_transform(X_train)
Xs_val = ss.transform(X_val)

ValueError: Input contains infinity or a value too large for dtype('float64').

### SMOTE on inbalance data

In [None]:
sm = SMOTE()
Xsm_train, ysm_train = sm.fit_sample(Xs_train, y_train)

## Modelling

Models
- LogisticRegression
- NaiveBayes
- SVM
- RandomForest (only tuned models)
- ExtraTrees (only tuned models)
- AdaBoost (only tuned models)
- xgBoost? 

Parameters
- basic models: we pass in Xsm_train, ysm_train, Xs_val, y_val
- tuned models: we pass in X_train, y_train, X_val, y_val through a pipeline (scale, SMOTE, model) + GridSearch


In [None]:
def plot_coefficients(model, feature_names, top_features=20):
    """
    Function to plot the top coefficient for the linear model
    
    Takes in arguements:
    - model - the linear model which has the coef_ arguement
    - feature_names - features that goes into the model
    - top_features - number of features that has the most influence of the outcome
    """
    
    coef = model.coef_.ravel()

    top_positive_coefficients = np.argsort(coef)[-top_features:]
    
    top_negative_coefficients = np.argsort(coef)[:top_features]
    
    top_coefficients = np.hstack([top_negative_coefficients, top_positive_coefficients])

    # create plot
    plt.figure(figsize=(15, 7))
    colors = ['green' if c < 0 else 'maroon' for c in coef[top_coefficients]]
    plt.bar(np.arange(2 * top_features), coef[top_coefficients], color=colors)
    feature_names = np.array(feature_names)
    plt.xticks(np.arange(0, 2 * top_features), feature_names[top_coefficients], rotation=45, ha='right')
    plt.show()

In [None]:
def classifiermodel(lst_xyparam, model, model_name, gs=True, model_param=None,  df_modelresults=None, df_modelparams=None, 
                    showgraph=True, labels=None):
    """"
    Function to fit and display scores of the model, the confusion matrix and AUC ROC
    
    Take in Arguements:
     - lst_xyparam -  list of X_train, y_train, X_test, y_test - the train test split data
     - model - the type of model to fit data
     - model_name - the model name used to display the model
     - gs - if the model parsed in is a grid search, default true
     - model_param - the parameters of the model 
     - df_modelresults - dataframe that stores the previous models
     - df_modelparams  - dataframe that stores the previous models parameters
     - df_modelparams  - dataframe that stores the previous models parameters
     - showgraph - plot the confusion matrix and AUC ROC if true
     - labels - labels that model is classifying
     
     
     Returns the dataframes with the model results, the corresponding parameters and the best model
    """
    #unpack the train test split parameters
    X_train, y_train, X_test, y_test = lst_xyparam
    model.fit(X_train, y_train)
    trgscore = model.score(X_train, y_train)
    
    if gs:
        best_model = model.best_estimator_
        best_score = model.best_score_
        best_params =  model.best_params_
    else:
        best_model = model
        best_score = trgscore
        best_params = model.get_params()
        

    #do prediction and have y_pred
    y_pred = best_model.predict(X_test)
    tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
    
    #getting the probabilities of test data
    proba = best_model.predict_proba(X_test)
    proba = proba[:, 1]
    
    # Generate False positive rate and True positive rate
    fpr, tpr, threhold = roc_curve(y_test, proba)
    aucscore = roc_auc_score(y_test, proba)
    
    
    # Calculate Scores and put them in results dataframe
    index_labels = ['Test score', 'Train score', 'Sensitivity', 'Specificity', 'Precision', 'MC Score', 'AUC ROC']
    result = pd.DataFrame(index = index_labels, columns=[model_name])
    result.loc['Test score'][model_name] = round(best_model.score(X_test, y_test), 3)
    result.loc['Train score'][model_name] = round(best_score, 3)
    result.loc['Sensitivity'][model_name] = round(tp/(tp+fn), 3)
    result.loc['Specificity'][model_name] = round(tn/(tn+fp), 3)
    result.loc['Precision'][model_name] = round(tp/(tp+fp), 3)
    result.loc['MC Score'][model_name] = round(matthews_corrcoef(y_test, y_pred), 3)
    result.loc['AUC ROC'][model_name] = round(aucscore, 3)
    display(result)
    
    
    # add results and params to modelresults and modelparams
    df_modelresults = pd.concat([df_modelresults, result], axis=1)
    result_series = pd.Series({'Parameters' : best_params}, name=model_name)
    df_modelparams = df_modelparams.append(result_series)
    display(df_modelparams.loc[[model_name]])
    
    # Create the confusion matrix dataframe
    total = (tp + tn + fp + fn)
    confusion = pd.DataFrame(index= ['Actual. Neg','Actual. Pos', 'Total'])
    confusion['Predicted. Neg'] = str(round(tn/total*100, 1))+'%', str(round(fn/total*100, 1))+'%', (tn + fn)
    confusion['Predicted. Pos'] = str(round(fp/total*100, 1))+'%', str(round(tp/total*100, 1))+'%', (fp + tp)
    confusion['Total'] = (tn + fp), (fn + tp), total
    display(confusion)
    
    
    if showgraph:
        
        #try:
            #plot the confusion and AUC ROC graph
            fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(15,5))
            ax = ax.ravel()
                 
            #plot confusion matrix
            ax[0].set_title('Confusion Matrix')
            plot_confusion_matrix(best_model, X_test, y_test, display_labels=labels, cmap='Blues', values_format='d', ax=ax[0])
            
            if labels:
                ax[0].set_xticklabels(labels)
                ax[0].set_yticklabels(labels)
            ax[0].set(ylabel="True Label", xlabel="Predicted Label")

            #plot roc_curve
            plot_roc_curve(best_model, X_test, y_test, ax=ax[1])
            ax[1].set_title('ROC Curve')
            ax[1].set(ylabel="True Positive Rate", xlabel="False Positive Rate")
            ax[1].plot([0, 1], [0,1], lw = 4, linestyle = '--')
        
            plt.show()
        #except:
            #cant plot
        #    print('cant plot graph')
        #    pass
       
    return df_modelresults, df_modelparams, best_model

### Basic  models (No Hyperparameters tuning)


In [None]:
Xy_param = [Xsm_train, ysm_train, Xs_val, y_val]

In [None]:
# instantiating dataframes that will be used to store results and parameters of models tested
index_labels = ['Test score', 'Train score', 'Sensitivity', 'Specificity', 'Precision', 'MC Score', 'AUC ROC']
all_results = pd.DataFrame(index=index_labels)
all_params = pd.DataFrame(columns = ['Parameters'])

#### LogisticRegression

In [None]:
lr = LogisticRegression()
all_results, all_params, lr_best_model = classifiermodel(Xy_param, model=lr, model_name='LogReg_default', 
                                                         gs=False, model_param=None,  
                                                         df_modelresults=all_results, df_modelparams=all_params, 
                                                         showgraph=True)

#### NaiveBayes

In [None]:
nb = BernoulliNB()
all_results, all_params, nb_best_model = classifiermodel(Xy_param, model=nb, model_name='NaiveBayes_default', 
                                                         gs=False, model_param=None,  
                                                         df_modelresults=all_results, df_modelparams=all_params, 
                                                         showgraph=True)

#### SVM


In [None]:
svm = SVC(probability=True)
all_results, all_params, svc_best_model = classifiermodel(Xy_param, model=svm, model_name='SVM_default', 
                                                         gs=False, model_param=None,  
                                                         df_modelresults=all_results, df_modelparams=all_params, 
                                                         showgraph=True)

In [None]:
all_results

In [None]:
all_params

### RandomForest

### ExtraTrees 

### AdaBoost 

### xgBoost

In [None]:
#import xgboost as xgb
#model=xgb.XGBClassifier(random_state=1,learning_rate=0.01)
#model.fit(x_train, y_train)
#model.score(x_test,y_test)
