### Imports

In [1]:
# pandas and numpy for data manipulation
import numpy as np
import pandas as pd

# matplotlib and seaborn for visuilization
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# Display all the columns of the dataframe
pd.pandas.set_option('display.max_columns',None)

# No warnings
import warnings
warnings.filterwarnings('ignore')

In [2]:
# read in data into dataframe
file_train=r'D:\Machine Learning\Python Projects\Project-2\project\train.csv'

car_train=pd.read_csv(file_train)
print('shape of train data after feature engineering: {}'.format(car_train.shape))

shape of train data after feature engineering: (5822, 103)


#### Splitting into train,validation and test

In [3]:
from sklearn.model_selection import train_test_split

train,test=train_test_split(car_train,test_size=0.15,random_state=123)

print('shape of train data: {}'.format(train.shape))
print('shape of test data: {}'.format(test.shape))

train1, validation=train_test_split(train,test_size=0.2,random_state=456)
print('shape of train1 data: {}'.format(train1.shape))
print('shape of validation data: {}'.format(validation.shape))

shape of train data: (4948, 103)
shape of test data: (874, 103)
shape of train1 data: (3958, 103)
shape of validation data: (990, 103)


In [4]:
# Separting independent and dependent variables
x_train=train1.drop('V86',axis=1)
y_train=train1['V86']

print('shape of x_train data: {}'.format(x_train.shape))
print('shape of y_train data: {}'.format(y_train.shape))

x_validation=validation.drop('V86',axis=1)
y_validation=validation['V86']

print('shape of x_validation data: {}'.format(x_validation.shape))
print('shape of y_validation data: {}'.format(y_validation.shape))

x_test=test.drop('V86',axis=1)
y_test=test['V86']

print('shape of x_test data: {}'.format(x_test.shape))
print('shape of y_test data: {}'.format(y_test.shape))

shape of x_train data: (3958, 102)
shape of y_train data: (3958,)
shape of x_validation data: (990, 102)
shape of y_validation data: (990,)
shape of x_test data: (874, 102)
shape of y_test data: (874,)


### Model Building

#### 1: Logistic Regression

In [5]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
logr=LogisticRegression()

#### 1.1: Hyper parameter tuning using GridSearchCV

In [6]:
params={'penalty':['l1','l2','elasticnet'],
       'C':[2,4,6,8,10],
        'fit_intercept':[True,False],
       'class_weight':['balanced','None']}

In [7]:
logr_grid=GridSearchCV(logr,param_grid=params,scoring='roc_auc',cv=10,verbose=1,n_jobs=-1)
logr_grid.fit(x_validation,y_validation)

Fitting 10 folds for each of 60 candidates, totalling 600 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    3.4s
[Parallel(n_jobs=-1)]: Done 600 out of 600 | elapsed:    7.3s finished


GridSearchCV(cv=10, estimator=LogisticRegression(), n_jobs=-1,
             param_grid={'C': [2, 4, 6, 8, 10],
                         'class_weight': ['balanced', 'None'],
                         'fit_intercept': [True, False],
                         'penalty': ['l1', 'l2', 'elasticnet']},
             scoring='roc_auc', verbose=1)

In [8]:
def report(results,n_top=3):
    for i in range(1,n_top+1):
        candidates = np.flatnonzero(results['rank_test_score']==i)
        for candidate in candidates:
            print("Model with rank: {0}".format(i))
            print("Mean Validation Score: {0:.8f} (std:{1:.3f})".format(
                results['mean_test_score'][candidate],
                results['std_test_score'][candidate]))
            print("Parameters: {0}".format(results['params'][candidate]))
            print("")

In [9]:
report(logr_grid.cv_results_,3)

Model with rank: 1
Mean Validation Score: 0.71414932 (std:0.107)
Parameters: {'C': 2, 'class_weight': 'None', 'fit_intercept': True, 'penalty': 'l2'}

Model with rank: 2
Mean Validation Score: 0.71165180 (std:0.104)
Parameters: {'C': 2, 'class_weight': 'None', 'fit_intercept': False, 'penalty': 'l2'}

Model with rank: 3
Mean Validation Score: 0.70798978 (std:0.104)
Parameters: {'C': 4, 'class_weight': 'None', 'fit_intercept': True, 'penalty': 'l2'}



In [10]:
logr_grid.best_estimator_

LogisticRegression(C=2, class_weight='None')

#### 1.2: Fitting on the train data with best parameters and predicing on test data

In [11]:
from sklearn.metrics import roc_auc_score,fbeta_score

In [12]:
logr_model=LogisticRegression(C=2,penalty='l2',class_weight=None,fit_intercept=True)
logr_model.fit(x_train,y_train)

# At Ramdon cutoff ---> 0.5
predict_test=logr_model.predict(x_test)
print('roc_auc_score: {}'.format(roc_auc_score(y_test,predict_test)))
print('FB2: {}'.format(fbeta_score(y_test,predict_test,beta=2)))

roc_auc_score: 0.5072475824061558
FB2: 0.02092050209205021


In [13]:
train_score=logr_model.predict_proba(x_train)[:,1]
real=y_train
print(logr_model.classes_)

[0. 1.]


In [14]:
cutoffs=np.linspace(0,1,200)

In [15]:
beta=2
FB_all=[]
for cutoff in cutoffs:
    predicted=(train_score>cutoff).astype(int)
    
    TP=((predicted==1) & (real==1)).sum()
    TN=((predicted==0) & (real==0)).sum()
    FP=((predicted==1) & (real==0)).sum()
    FN=((predicted==0) & (real==1)).sum()
    
    P=TP+FN
    N=TN+FP
    
    precision=TP/(TP+FP)
    recall=TP/P
    FB=(1+beta**2)*precision*recall/((beta**2)*precision*recall)
    FB_all.append(FB)

In [16]:
mycutoff=cutoffs[FB_all==max(FB_all)][0]
mycutoff

0.010050251256281407

In [17]:
test_score=logr_model.predict_proba(x_test)[:,1]
test_classes=(test_score>mycutoff).astype(int)

print('roc_auc_score: {}'.format(roc_auc_score(y_test,test_classes)))
print('FB2: {}'.format(fbeta_score(y_test,test_classes,beta=2)))

roc_auc_score: 0.5760736196319018
FB2: 0.2991886409736308


### 2: Random Forest 

In [18]:
# Fitting the Random Forest on Training set
from time import time
from operator import itemgetter
from scipy.stats import randint as sp_randint
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
clf=RandomForestClassifier()

#### 2.1: Hyper parameter tuning using RandomizedSearchCV

In [19]:
params_dist={'n_estimators':[100,150,200,300,500,700,1000],
            'criterion':['gini','entropy'],
            'max_depth':[3,5,8,None],
            'min_samples_split':sp_randint(3,11),
            'min_samples_leaf':sp_randint(3,11),
            'max_features':sp_randint(3,11),
            'bootstrap':[True,False],
            'class_weight':['balanced']}

In [20]:
# run Randomized Search
n_iter_search=200

random_search=RandomizedSearchCV(clf,param_distributions=params_dist,n_iter=n_iter_search,scoring='roc_auc',
                                n_jobs=-1,cv=10,verbose=1,random_state=789)
random_search.fit(x_validation,y_validation)

Fitting 10 folds for each of 200 candidates, totalling 2000 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    2.4s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:   28.1s
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 784 tasks      | elapsed:  2.4min
[Parallel(n_jobs=-1)]: Done 1234 tasks      | elapsed:  4.0min
[Parallel(n_jobs=-1)]: Done 1784 tasks      | elapsed:  5.7min
[Parallel(n_jobs=-1)]: Done 2000 out of 2000 | elapsed:  6.3min finished


RandomizedSearchCV(cv=10, estimator=RandomForestClassifier(), n_iter=200,
                   n_jobs=-1,
                   param_distributions={'bootstrap': [True, False],
                                        'class_weight': ['balanced'],
                                        'criterion': ['gini', 'entropy'],
                                        'max_depth': [3, 5, 8, None],
                                        'max_features': <scipy.stats._distn_infrastructure.rv_frozen object at 0x0000019DCD505C70>,
                                        'min_samples_leaf': <scipy.stats._distn_infrastructure.rv_frozen object at 0x0000019DCD505A30>,
                                        'min_samples_split': <scipy.stats._distn_infrastructure.rv_frozen object at 0x0000019DCD505520>,
                                        'n_estimators': [100, 150, 200, 300,
                                                         500, 700, 1000]},
                   random_state=789, scoring='roc_auc', v

In [21]:
report(random_search.cv_results_,3)

Model with rank: 1
Mean Validation Score: 0.71402578 (std:0.085)
Parameters: {'bootstrap': False, 'class_weight': 'balanced', 'criterion': 'gini', 'max_depth': 3, 'max_features': 3, 'min_samples_leaf': 10, 'min_samples_split': 6, 'n_estimators': 300}

Model with rank: 2
Mean Validation Score: 0.71144056 (std:0.094)
Parameters: {'bootstrap': False, 'class_weight': 'balanced', 'criterion': 'gini', 'max_depth': 3, 'max_features': 8, 'min_samples_leaf': 4, 'min_samples_split': 3, 'n_estimators': 300}

Model with rank: 3
Mean Validation Score: 0.71057653 (std:0.096)
Parameters: {'bootstrap': False, 'class_weight': 'balanced', 'criterion': 'entropy', 'max_depth': 3, 'max_features': 3, 'min_samples_leaf': 8, 'min_samples_split': 4, 'n_estimators': 1000}



In [22]:
random_search.best_estimator_

RandomForestClassifier(bootstrap=False, class_weight='balanced', max_depth=3,
                       max_features=3, min_samples_leaf=10, min_samples_split=6,
                       n_estimators=300)

{'bootstrap': True, 'class_weight': 'balanced', 'criterion': 'entropy', 'max_depth': None, 'max_features': 8, 'min_samples_leaf': 5, 'min_samples_split': 9, 'n_estimators': 200}

#### 2.2: Fitting on the train data with best parameters and predicing on test data

In [23]:
# Fit model on train data and predict
rf=RandomForestClassifier(bootstrap=False, class_weight='balanced', max_depth=3,
                       max_features=3, min_samples_leaf=10, min_samples_split=6,
                       n_estimators=300)
rf.fit(x_train,y_train)


# At Ramdon cutoff ---> 0.5
predict_test=rf.predict(x_test)
print('roc_auc_score: {}'.format(roc_auc_score(y_test,predict_test)))
print('FB2: {}'.format(fbeta_score(y_test,predict_test,beta=2)))

roc_auc_score: 0.6456899240927524
FB2: 0.33849129593810445


In [24]:
train_score_rf=rf.predict_proba(x_train)[:,1]
real=y_train
print(rf.classes_)

[0. 1.]


In [25]:
cutoffs=np.linspace(0,1,200)

beta=2
FB_all=[]
for cutoff in cutoffs:
    predicted=(train_score_rf>cutoff).astype(int)
    
    TP=((predicted==1) & (real==1)).sum()
    TN=((predicted==0) & (real==0)).sum()
    FP=((predicted==1) & (real==0)).sum()
    FN=((predicted==0) & (real==1)).sum()
    
    P=TP+FN
    N=TN+FP
    
    precision=TP/(TP+FP)
    recall=TP/P
    FB=(1+beta**2)*precision*recall/((beta**2)*precision*recall)
    FB_all.append(FB)

In [26]:
mycutoff_rf=cutoffs[FB_all==max(FB_all)][0]
mycutoff_rf

0.2964824120603015

In [27]:
test_score_rf=rf.predict_proba(x_test)[:,1]
test_classes_rf=(test_score_rf>mycutoff_rf).astype(int)

print('roc_auc_score: {}'.format(roc_auc_score(y_test,test_classes_rf)))
print('FB2: {}'.format(fbeta_score(y_test,test_classes_rf,beta=2)))

roc_auc_score: 0.5012269938650307
FB2: 0.26624548736462095


In [28]:
importances = rf.feature_importances_
std = np.std([tree.feature_importances_ for tree in rf.estimators_],
             axis=0)
indices = np.argsort(importances)[::-1]

# Print the feature ranking
print("Feature ranking:")

for f in range(x_train.shape[1]):
    print("%d. feature %s (%f)" % (f + 1, list(x_train.columns)[f], importances[indices[f]]))

Feature ranking:
1. feature V2 (0.070586)
2. feature V3 (0.050008)
3. feature V4 (0.049203)
4. feature V6 (0.045950)
5. feature V7 (0.045173)
6. feature V8 (0.035429)
7. feature V9 (0.034680)
8. feature V10 (0.032473)
9. feature V11 (0.032380)
10. feature V12 (0.030558)
11. feature V13 (0.026861)
12. feature V14 (0.026407)
13. feature V15 (0.025254)
14. feature V16 (0.024165)
15. feature V17 (0.023786)
16. feature V18 (0.022499)
17. feature V19 (0.022115)
18. feature V20 (0.020729)
19. feature V21 (0.018590)
20. feature V22 (0.017915)
21. feature V23 (0.017832)
22. feature V24 (0.015624)
23. feature V25 (0.015516)
24. feature V26 (0.015251)
25. feature V27 (0.014987)
26. feature V28 (0.014979)
27. feature V29 (0.014483)
28. feature V30 (0.013319)
29. feature V31 (0.012706)
30. feature V32 (0.012406)
31. feature V33 (0.010604)
32. feature V34 (0.010548)
33. feature V35 (0.010142)
34. feature V36 (0.010069)
35. feature V37 (0.009936)
36. feature V38 (0.009840)
37. feature V39 (0.007230)


### 3: Gradient Boosting 

In [29]:
# Fitting Gradient Boosting on Training set
from hyperopt import fmin, tpe,hp, STATUS_OK, Trials
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import GradientBoostingClassifier

#### 3.1: Hyper parameter tuning using Bayesian Hyperopt

In [30]:
# Hyperparameter using Bayesian Hyperopt
def acc_model(params):
    gbm=GradientBoostingClassifier(**params)
    return cross_val_score(gbm,x_validation,y_validation).mean()

In [31]:
param_space={'learning_rate':hp.choice('learning_rate',[0.001,0.01,0.1,0.2]),
             'n_estimators':hp.choice('n_estimators',range(200,500)),
            'subsample':hp.choice('subsample',[0.2,0.4,0.6,0.8,1]),
            'max_depth':hp.choice('max_depth',range(1,10)),
            'max_features':hp.choice('max_features',range(5,15)),
            }


In [32]:
best=0
def f(params):
    global best
    acc = acc_model(params)
    if acc>best:
        best=acc
    print('new best:',best,params)
    return {'loss': -acc,'status': STATUS_OK}

In [33]:
trials=Trials()
best=fmin(f,param_space,algo=tpe.suggest,max_evals=25,trials=trials)
print('best: {}'.format(best))

new best:                                                                                                              
0.8484848484848484                                                                                                     
{'learning_rate': 0.2, 'max_depth': 6, 'max_features': 9, 'n_estimators': 240, 'subsample': 0.4}                       
new best:                                                                                                              
0.9434343434343434                                                                                                     
{'learning_rate': 0.01, 'max_depth': 2, 'max_features': 7, 'n_estimators': 213, 'subsample': 0.8}                      
new best:                                                                                                              
0.9434343434343434                                                                                                     
{'learning_rate': 0.01, 'max_depth': 6, 

1: best: {'learning_rate': 0.01, 'max_depth': 4, 'max_features': 12, 'n_estimators': 226, 'subsample': 0.2}

2: best : {'learning_rate': 0.01, 'max_depth': 2, 'max_features': 7, 'n_estimators': 214, 'subsample': 0.8}

#### 3.2: Fitting on the train data with best parameters and predicing on test data

In [34]:
gbm_bayesian=GradientBoostingClassifier(**{'learning_rate': 0.01, 'max_depth': 2, 'max_features': 7, 
                                           'n_estimators': 214, 'subsample': 0.8})
gbm_bayesian.fit(x_train,y_train)

predict_gbm=gbm_bayesian.predict(x_test)
print('roc_auc_score: {}'.format(roc_auc_score(y_test,predict_gbm)))
print('FB2: {}'.format(fbeta_score(y_test,predict_gbm,beta=2)))

roc_auc_score: 0.5
FB2: 0.0


In [35]:
train_score_gbm=gbm_bayesian.predict_proba(x_train)[:,1]
real=y_train
print(gbm_bayesian.classes_)

[0. 1.]


In [36]:
cutoffs=np.linspace(0,1,500)

beta=2
FB_all=[]
for cutoff in cutoffs:
    predicted=(train_score_gbm>cutoff).astype(int)
    
    TP=((predicted==1) & (real==1)).sum()
    TN=((predicted==0) & (real==0)).sum()
    FP=((predicted==1) & (real==0)).sum()
    FN=((predicted==0) & (real==1)).sum()
    
    P=TP+FN
    N=TN+FP
    
    precision=TP/(TP+FP)
    recall=TP/P
    FB=(1+beta**2)*precision*recall/((beta**2)*precision*recall)
    FB_all.append(FB)

In [37]:
mycutoff_gbm=cutoffs[FB_all==max(FB_all)][0]
mycutoff_gbm

0.044088176352705406

In [38]:
test_score_gbm=gbm_bayesian.predict_proba(x_test)[:,1]
test_classes_gbm=(test_score_gbm>mycutoff_gbm).astype(int)

print('roc_auc_score: {}'.format(roc_auc_score(y_test,test_classes_gbm)))
print('FB2: {}'.format(fbeta_score(y_test,test_classes_gbm,beta=2)))

roc_auc_score: 0.6192055734636581
FB2: 0.32019704433497537


### 4: XGBoost

In [55]:
# Fitting xgboost on Trining set
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from sklearn.model_selection import cross_val_score
from xgboost.sklearn import XGBClassifier

#### 4.1: Hyper parameter tuning using Bayesian Hyperopt

In [56]:
# Hyperparameter tuning using Bayesian Hyperopt
def acc_model(params):
    xgb=XGBClassifier(**params)
    return cross_val_score(xgb,x_validation,y_validation).mean()

In [57]:
param_space={'n_estimators':hp.choice('n_estimators',[300,500,600,700,800,1000]),
            'max_depth':hp.choice('max_depth',[2,3,4,5,6,7,8]),
            'learning_rate':hp.choice('learning_rate',[0.001,0.01,0.1,0.2,0.5]),
            'gamma':hp.choice('gamma',[i/10.0 for i in range(0,5)]),
            'min_child_weight':hp.choice('min_child_weight',[4,5,6,7]),
            'subsample':hp.choice('subsample',[i/10.0 for i in range(3,10)]),
            'colsample_bytree':hp.choice('colsample_bytree',[i/10.0 for i in range(3,10)]),
            'reg_alpha':hp.choice('reg_alpha',[0.1,5,10,40,50,70]),
            'reg_lambda':hp.choice('reg_lambda',[1,5,10,15,50]),
            'scale_pos_weight':hp.choice('scale_pos_weight',[2,3,4,5,6,7,8,9])}

In [58]:
best=0
def f(params):
    global best
    acc = acc_model(params)
    if acc>best:
        best=acc
    print('new best:',best,params)
    return {'loss':-acc, 'status':STATUS_OK}

In [59]:
trials=Trials()
best=fmin(f,param_space,algo=tpe.suggest,max_evals=50,trials=trials)
print('best: {}'.format(best))

new best:                                                                                                              
0.9131313131313131                                                                                                     
{'colsample_bytree': 0.4, 'gamma': 0.3, 'learning_rate': 0.1, 'max_depth': 8, 'min_child_weight': 6, 'n_estimators': 1000, 'reg_alpha': 5, 'reg_lambda': 5, 'scale_pos_weight': 9, 'subsample': 0.9}
new best:                                                                                                              
0.9131313131313131                                                                                                     
{'colsample_bytree': 0.4, 'gamma': 0.3, 'learning_rate': 0.1, 'max_depth': 2, 'min_child_weight': 4, 'n_estimators': 800, 'reg_alpha': 10, 'reg_lambda': 5, 'scale_pos_weight': 7, 'subsample': 0.5}
new best:                                                                                                              
0.9282

new best:                                                                                                              
0.9434343434343434                                                                                                     
{'colsample_bytree': 0.5, 'gamma': 0.3, 'learning_rate': 0.5, 'max_depth': 8, 'min_child_weight': 4, 'n_estimators': 1000, 'reg_alpha': 40, 'reg_lambda': 10, 'scale_pos_weight': 6, 'subsample': 0.7}
new best:                                                                                                              
0.9434343434343434                                                                                                     
{'colsample_bytree': 0.5, 'gamma': 0.3, 'learning_rate': 0.001, 'max_depth': 8, 'min_child_weight': 4, 'n_estimators': 700, 'reg_alpha': 40, 'reg_lambda': 10, 'scale_pos_weight': 7, 'subsample': 0.7}
new best:                                                                                                              
0

{'colsample_bytree': 0.4, 'gamma': 0.1, 'learning_rate': 0.1, 'max_depth': 3, 'min_child_weight': 5, 'n_estimators': 800, 'reg_alpha': 40, 'reg_lambda': 15, 'scale_pos_weight': 5, 'subsample': 0.5}

{'colsample_bytree': 0.6, 'gamma': 0.2, 'learning_rate': 0.001, 'max_depth': 6, 'min_child_weight': 6, 'n_estimators': 800, 'reg_alpha': 50, 'reg_lambda': 50, 'scale_pos_weight': 5, 'subsample': 0.4}

#### 4.2: Fitting the on train data with best parameters and predicing on test data

In [60]:
xgb_bayesian=XGBClassifier(**{'colsample_bytree': 0.6, 'gamma': 0.2, 'learning_rate': 0.001, 
                              'max_depth': 6, 'min_child_weight': 6, 'n_estimators': 800, 
                              'reg_alpha': 50, 'reg_lambda': 50, 'scale_pos_weight': 5, 
                              'subsample': 0.4})
xgb_bayesian.fit(x_train,y_train)

predict_xgb=xgb_bayesian.predict(x_test)
print('roc_auc_score: {}'.format(roc_auc_score(y_test,predict_xgb)))
print('FB2: {}'.format(fbeta_score(y_test,predict_xgb,beta=2)))

roc_auc_score: 0.5
FB2: 0.0


In [61]:
train_score_xgb=xgb_bayesian.predict_proba(x_train)[:,1]
real=y_train
print(xgb_bayesian.classes_)

[0. 1.]


In [62]:
cutoffs=np.linspace(0,1,500)

beta=2
FB_all=[]
for cutoff in cutoffs:
    predicted=(train_score_xgb>cutoff).astype(int)
    
    TP=((predicted==1) & (real==1)).sum()
    TN=((predicted==0) & (real==0)).sum()
    FP=((predicted==1) & (real==0)).sum()
    FN=((predicted==0) & (real==1)).sum()
    
    P=TP+FN
    N=TN+FP
    
    precision=TP/(TP+FP)
    recall=TP/P
    FB=(1+beta**2)*precision*recall/((beta**2)*precision*recall)
    FB_all.append(FB)

In [63]:
mycutoff_xgb=cutoffs[FB_all==max(FB_all)][0]
mycutoff_xgb

0.3567134268537074

In [64]:
test_score_xgb=xgb_bayesian.predict_proba(x_test)[:,1]
test_classes_xgb=(test_score_xgb>mycutoff_xgb).astype(int)

print('roc_auc_score: {}'.format(roc_auc_score(y_test,test_classes_xgb)))
print('FB2: {}'.format(fbeta_score(y_test,test_classes_xgb,beta=2)))

roc_auc_score: 0.6414058438182386
FB2: 0.33508541392904073


### 5: KNN

In [49]:
from sklearn.neighbors import KNeighborsClassifier
knn=KNeighborsClassifier()

#### 5.1: Hyper parameter tuning using RandomizedSearchCV

In [50]:
params={'n_neighbors':[10,20,30,40,50],
       'weights':['uniform','distance'],
       'p':[1,2]}

In [51]:
knn_random=RandomizedSearchCV(knn,param_distributions=params,cv=10,scoring='roc_auc',
                             n_iter=100,verbose=1,n_jobs=-1)
knn_random.fit(x_validation,y_validation)

Fitting 10 folds for each of 20 candidates, totalling 200 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    3.4s
[Parallel(n_jobs=-1)]: Done 200 out of 200 | elapsed:    4.6s finished


RandomizedSearchCV(cv=10, estimator=KNeighborsClassifier(), n_iter=100,
                   n_jobs=-1,
                   param_distributions={'n_neighbors': [10, 20, 30, 40, 50],
                                        'p': [1, 2],
                                        'weights': ['uniform', 'distance']},
                   scoring='roc_auc', verbose=1)

In [52]:
report(knn_random.cv_results_,3)

Model with rank: 1
Mean Validation Score: 0.64086593 (std:0.102)
Parameters: {'weights': 'uniform', 'p': 2, 'n_neighbors': 50}

Model with rank: 2
Mean Validation Score: 0.62710288 (std:0.135)
Parameters: {'weights': 'distance', 'p': 2, 'n_neighbors': 30}

Model with rank: 3
Mean Validation Score: 0.62667696 (std:0.111)
Parameters: {'weights': 'distance', 'p': 2, 'n_neighbors': 50}



In [53]:
knn_random.best_estimator_

KNeighborsClassifier(n_neighbors=50)

#### 5.2: Fitting  on the train data with best parameters and predicing on test data

In [54]:
# Fit model on train data and predict

knn_model=KNeighborsClassifier(n_neighbors=50)
knn_model.fit(x_train,y_train)

predicted_knn=knn_model.predict(x_test)
print('roc_auc_score: {}'.format(roc_auc_score(y_test,predicted_knn)))
print('FB2: {}'.format(fbeta_score(y_test,predicted_knn,beta=2)))

roc_auc_score: 0.5
FB2: 0.0


In [67]:
train_score_knn=knn_model.predict_proba(x_train)[:,1]
real=y_train
print(knn_model.classes_)

[0. 1.]


In [68]:
cutoffs=np.linspace(0,1,500)

beta=2
FB_all=[]
for cutoff in cutoffs:
    predicted=(train_score_knn>cutoff).astype(int)
    
    TP=((predicted==1) & (real==1)).sum()
    TN=((predicted==0) & (real==0)).sum()
    FP=((predicted==1) & (real==0)).sum()
    FN=((predicted==0) & (real==1)).sum()
    
    P=TP+FN
    N=TN+FP
    
    precision=TP/(TP+FP)
    recall=TP/P
    FB=(1+beta**2)*precision*recall/((beta**2)*precision*recall)
    FB_all.append(FB)

In [69]:
mycutoff_knn=cutoffs[FB_all==max(FB_all)][0]
mycutoff_knn

0.02004008016032064

In [71]:
test_score_knn=knn_model.predict_proba(x_test)[:,1]
test_classes_knn=(test_score_knn>mycutoff_knn).astype(int)

print('roc_auc_score: {}'.format(roc_auc_score(y_test,test_classes_knn)))
print('FB2: {}'.format(fbeta_score(y_test,test_classes_knn,beta=2)))

roc_auc_score: 0.6101175002599564
FB2: 0.3140394088669951


### 6: Naive Bayes 

In [75]:
from sklearn.naive_bayes import GaussianNB
naive=GaussianNB()

#### 6.2: Fitting on the train data with best parameters and predicing on test data

In [77]:
# Fitting the model on train and predicting on test
naive.fit(x_train,y_train)

predict_naive=naive.predict(x_test)

print('roc_auc_score: {}'.format(roc_auc_score(y_test,predict_naive)))
print('FB2: {}'.format(fbeta_score(y_test,predict_naive,beta=2)))

roc_auc_score: 0.5611937194551315
FB2: 0.2891692954784437


In [78]:
train_score_naive=naive.predict_proba(x_train)[:,1]
real=y_train
print(naive.classes_)

[0. 1.]


In [79]:
cutoffs=np.linspace(0,1,500)

beta=2
FB_all=[]
for cutoff in cutoffs:
    predicted=(train_score_naive>cutoff).astype(int)
    
    TP=((predicted==1) & (real==1)).sum()
    TN=((predicted==0) & (real==0)).sum()
    FP=((predicted==1) & (real==0)).sum()
    FN=((predicted==0) & (real==1)).sum()
    
    P=TP+FN
    N=TN+FP
    
    precision=TP/(TP+FP)
    recall=TP/P
    FB=(1+beta**2)*precision*recall/((beta**2)*precision*recall)
    FB_all.append(FB)

In [80]:
mycutoff_naive=cutoffs[FB_all==max(FB_all)][0]
mycutoff_naive

0.03006012024048096

In [81]:
test_score_naive=naive.predict_proba(x_test)[:,1]
test_classes_naive=(test_score_naive>mycutoff_naive).astype(int)

print('roc_auc_score: {}'.format(roc_auc_score(y_test,test_classes_naive)))
print('FB2: {}'.format(fbeta_score(y_test,test_classes_naive,beta=2)))

roc_auc_score: 0.5483102838723095
FB2: 0.2829218106995884


### 7: SVM

In [89]:
from sklearn import svm
clf=svm.SVC()

#### 7.1: Hyper parameter tuning using RandomizedSearchCV

In [95]:
params={
       'C':[0.2,0.5,0.8,1],
       'kernel' : ['rbf', 'sigmoid'],
       'gamma':[0.2,0.5,0.8,1],
       'class_weight':['balanced',None]}

In [96]:
svm_random=RandomizedSearchCV(clf,param_distributions=params,cv=10,scoring='roc_auc',n_iter=50,
                             verbose=1,n_jobs=-1)
svm_random.fit(x_validation,y_validation)

Fitting 10 folds for each of 50 candidates, totalling 500 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    3.5s
[Parallel(n_jobs=-1)]: Done 416 tasks      | elapsed:    9.1s
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed:   10.5s finished


RandomizedSearchCV(cv=10, estimator=SVC(), n_iter=50, n_jobs=-1,
                   param_distributions={'C': [0.2, 0.5, 0.8, 1],
                                        'class_weight': ['balanced', None],
                                        'gamma': [0.2, 0.5, 0.8, 1],
                                        'kernel': ['rbf', 'sigmoid']},
                   scoring='roc_auc', verbose=1)

In [98]:
report(svm_random.cv_results_,3)

Model with rank: 1
Mean Validation Score: 0.73652635 (std:0.088)
Parameters: {'kernel': 'sigmoid', 'gamma': 0.2, 'class_weight': 'balanced', 'C': 0.2}

Model with rank: 2
Mean Validation Score: 0.72789064 (std:0.083)
Parameters: {'kernel': 'sigmoid', 'gamma': 0.2, 'class_weight': 'balanced', 'C': 0.5}

Model with rank: 3
Mean Validation Score: 0.71043011 (std:0.074)
Parameters: {'kernel': 'sigmoid', 'gamma': 0.2, 'class_weight': 'balanced', 'C': 1}



In [99]:
svm_random.best_estimator_

SVC(C=0.2, class_weight='balanced', gamma=0.2, kernel='sigmoid')

#### 7.2: Fitting  on the train data with best parameters and predicing on test data

In [104]:
# Fitting the model on train and predicting on test
svm_model=svm.SVC(C=0.2, class_weight='balanced', gamma=0.2, kernel='sigmoid',probability=True)
svm_model.fit(x_train,y_train)

predict_svm=svm_model.predict(x_test)

print('roc_auc_score: {}'.format(roc_auc_score(y_test,predict_svm)))
print('FB2: {}'.format(fbeta_score(y_test,predict_svm,beta=2)))

roc_auc_score: 0.6223042528855152
FB2: 0.3183229813664596


In [None]:
# LinearSVC_classifier = SklearnClassifier(SVC(kernel='linear',probability=True))

In [105]:
train_score_svm=svm_model.predict_proba(x_train)[:,1]
real=y_train
print(svm_model.classes_)

[0. 1.]


In [106]:
cutoffs=np.linspace(0,1,500)

beta=2
FB_all=[]
for cutoff in cutoffs:
    predicted=(train_score_svm>cutoff).astype(int)
    
    TP=((predicted==1) & (real==1)).sum()
    TN=((predicted==0) & (real==0)).sum()
    FP=((predicted==1) & (real==0)).sum()
    FN=((predicted==0) & (real==1)).sum()
    
    P=TP+FN
    N=TN+FP
    
    precision=TP/(TP+FP)
    recall=TP/P
    FB=(1+beta**2)*precision*recall/((beta**2)*precision*recall)
    FB_all.append(FB)

In [107]:
mycutoff_svm=cutoffs[FB_all==max(FB_all)][0]
mycutoff_svm

0.004008016032064128

In [108]:
test_score_svm=svm_model.predict_proba(x_test)[:,1]
test_classes_svm=(test_score_svm>mycutoff_svm).astype(int)

print('roc_auc_score: {}'.format(roc_auc_score(y_test,test_classes_svm)))
print('FB2: {}'.format(fbeta_score(y_test,test_classes_svm,beta=2)))

roc_auc_score: 0.5196319018404908
FB2: 0.27365491651205937
