In [1]:
# import dependencies

import pandas as pd
import numpy as np
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score
from sklearn.utils import resample
from sklearn import preprocessing

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import StackingClassifier

In [2]:
# read data
x_train = pd.read_csv('C:/Users/User/Downloads/x_train.csv')
x_val = pd.read_csv('C:/Users/User/Downloads/x_val.csv')
y_train = pd.read_csv('C:/Users/User/Downloads/y_train.csv')
y_val = pd.read_csv('C:/Users/User/Downloads/y_val.csv')
x_test = pd.read_csv('C:/Users/User/Downloads/x_test.csv')

In [3]:
y_train.value_counts()

target_numeric
0                 63433
1                 16159
Name: count, dtype: int64

Logistic regression train dataset

In [4]:
# Logistic regression on unbalanced dataset

# Normalize x data 

scaler = preprocessing.StandardScaler().fit(x_train)
x_train_scaled = scaler.transform(x_train)
x_val_scaled = scaler.transform(x_val)
x_test_scaled = scaler.transform(x_test)

# Model development

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import RandomizedSearchCV

params = {'penalty':['l1','l2'],
          'class_weight':['balanced', None],
          'C':np.linspace(0.01,10,100)}
model = LogisticRegression()
lr_model = RandomizedSearchCV(model, 
                              param_distributions = params, 
                              n_iter = 10, 
                              scoring = 'roc_auc', 
                              cv = 10, 
                              n_jobs = -1, 
                              verbose = 20)
lr_model.fit(x_train_scaled, 
             y_train.values.ravel())

Fitting 10 folds for each of 10 candidates, totalling 100 fits


40 fits failed out of a total of 100.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
40 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\User\myproject\myfirstproject\Lib\site-packages\sklearn\model_selection\_validation.py", line 729, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\User\myproject\myfirstproject\Lib\site-packages\sklearn\base.py", line 1152, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\User\myproject\myfirstproject\Lib\site-packages\sklearn\linear_model\_logistic.py", line 1169, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
            

In [5]:
# selecting the cut-off for determining hardclasses

# prediction on training dataset
predicted_train = lr_model.predict_proba(x_train_scaled)[:,1]
actual = y_train.values.ravel()

# Kolmogorov-Smirnov statistic. 
# The higher the ks_stat, the more efficient is the model at capturing the target (Ones)

cutoffs = np.linspace(0.01,0.99,99)
KS_all = []
for cutoff in cutoffs:
    
    predicted = (predicted_train > cutoff).astype(int)
    len(predicted)
    TP = ((predicted == 1) & (actual == 1)).sum()
    TN = ((predicted == 0) & (actual == 0)).sum()
    FP = ((predicted == 1) & (actual == 0)).sum()
    FN = ((predicted == 0) & (actual == 1)).sum()

    P = TP+FN
    N = TN+FP

    KS = (TP/P)-(FP/N)
    KS_all.append(KS)

# list(zip(cutoffs,KS_all))

cutoff_optimum = cutoffs[KS_all == max(KS_all)][0]

predicted_val = lr_model.predict_proba(x_val_scaled)[:,1]
val_classes = (predicted_val>cutoff_optimum).astype(int)

cm = confusion_matrix(y_val, 
                     val_classes)
class_report = classification_report(y_val, 
                                    val_classes)
print("roc_auc_score:",roc_auc_score(y_val, predicted_val)) # AUROC represents the likelihood of the model distinguishing observations from two classes.
print("confusion_matrix: \n",cm)
print("classification_report: \n",class_report)

roc_auc_score: 0.6687017876528293
confusion_matrix: 
 [[9720 6207]
 [1344 2627]]
classification_report: 
               precision    recall  f1-score   support

           0       0.88      0.61      0.72     15927
           1       0.30      0.66      0.41      3971

    accuracy                           0.62     19898
   macro avg       0.59      0.64      0.57     19898
weighted avg       0.76      0.62      0.66     19898



In [6]:
# RandomForest classifier

# model development

params = {'n_estimators':[100, 500],
          'min_samples_split':[2, 5, None],
          'min_samples_leaf':[1,10,100]}

model = RandomForestClassifier()
rf_model = RandomizedSearchCV(model, 
                              param_distributions = params, 
                              n_iter = 10, 
                              scoring = 'roc_auc', 
                              cv = 10, 
                              n_jobs = -1, 
                              verbose = 20)
rf_model.fit(x_train,
             y_train.values.ravel())

Fitting 10 folds for each of 10 candidates, totalling 100 fits


40 fits failed out of a total of 100.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
40 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\User\myproject\myfirstproject\Lib\site-packages\sklearn\model_selection\_validation.py", line 729, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\User\myproject\myfirstproject\Lib\site-packages\sklearn\base.py", line 1145, in wrapper
    estimator._validate_params()
  File "C:\Users\User\myproject\myfirstproject\Lib\site-packages\sklearn\base.py", line 638, in _validate_params
    validate_parameter_constraints(
  File "C:\Users\User\myproject\myfirstproject\Lib\site-packages\sklearn\utils\_param_validation.py", line 96, in 

In [7]:
# selecting the cut-off for determining hardclasses

# prediction on training dataset
predicted_train = rf_model.predict_proba(x_train)[:,1]
actual = y_train.values.ravel()

# Kolmogorov-Smirnov statistic. 
# The higher the ks_stat, the more efficient is the model at capturing the target (Ones)

cutoffs = np.linspace(0.01,0.99,99)
KS_all = []
for cutoff in cutoffs:
    
    predicted = (predicted_train > cutoff).astype(int)
    len(predicted)
    TP = ((predicted == 1) & (actual == 1)).sum()
    TN = ((predicted == 0) & (actual == 0)).sum()
    FP = ((predicted == 1) & (actual == 0)).sum()
    FN = ((predicted == 0) & (actual == 1)).sum()

    P = TP+FN
    N = TN+FP

    KS = (TP/P)-(FP/N)
    KS_all.append(KS)

# list(zip(cutoffs,KS_all))

cutoff_optimum = cutoffs[KS_all == max(KS_all)][0]

predicted_val = rf_model.predict_proba(x_val)[:,1]
val_classes = (predicted_val>cutoff_optimum).astype(int)

cm = confusion_matrix(y_val, 
                     val_classes)
class_report = classification_report(y_val, 
                                    val_classes)
print("roc_auc_score:", roc_auc_score(y_val, predicted_val)) # AUROC represents the likelihood of the model distinguishing observations from two classes.
print("confusion_matrix: \n",cm)
print("classification_report: \n",class_report)

roc_auc_score: 0.7364860913121355
confusion_matrix: 
 [[10285  5642]
 [ 1164  2807]]
classification_report: 
               precision    recall  f1-score   support

           0       0.90      0.65      0.75     15927
           1       0.33      0.71      0.45      3971

    accuracy                           0.66     19898
   macro avg       0.62      0.68      0.60     19898
weighted avg       0.79      0.66      0.69     19898



In [8]:
# GradientBoostingClassifier classifier

# model development

params = {'n_estimators':[100, 500],
        'learning_rate':[0.1, 0.01],
        'min_samples_leaf':[1,10,100],
        'min_samples_split': [2, 5],
         'tol': [0.001, 0.0001]}

model = GradientBoostingClassifier()
gb_model = RandomizedSearchCV(model, 
                              param_distributions = params, 
                              n_iter = 10, 
                              scoring = 'roc_auc', 
                              cv = 10, 
                              n_jobs = -1, 
                              verbose = 20)
gb_model.fit(x_train,
             y_train.values.ravel())

Fitting 10 folds for each of 10 candidates, totalling 100 fits


In [9]:
# selecting the cut-off for determining hardclasses

# prediction on training dataset
predicted_train = gb_model.predict_proba(x_train)[:,1]
actual = y_train.values.ravel()

# Kolmogorov-Smirnov statistic. 
# The higher the ks_stat, the more efficient is the model at capturing the target (Ones)

cutoffs = np.linspace(0.01,0.99,99)
KS_all = []
for cutoff in cutoffs:
    
    predicted = (predicted_train > cutoff).astype(int)
    len(predicted)
    TP = ((predicted == 1) & (actual == 1)).sum()
    TN = ((predicted == 0) & (actual == 0)).sum()
    FP = ((predicted == 1) & (actual == 0)).sum()
    FN = ((predicted == 0) & (actual == 1)).sum()

    P = TP+FN
    N = TN+FP

    KS = (TP/P)-(FP/N)
    KS_all.append(KS)

# list(zip(cutoffs,KS_all))

cutoff_optimum = cutoffs[KS_all == max(KS_all)][0]

predicted_val = gb_model.predict_proba(x_val)[:,1]
val_classes = (predicted_val>cutoff_optimum).astype(int)

cm = confusion_matrix(y_val, 
                     val_classes)
class_report = classification_report(y_val, 
                                    val_classes)
print("roc_auc_score:", roc_auc_score(y_val, predicted_val)) # AUROC represents the likelihood of the model distinguishing observations from two classes.
print("confusion_matrix: \n",cm)
print("classification_report: \n",class_report)

roc_auc_score: 0.7365187652547902
confusion_matrix: 
 [[9285 6642]
 [ 892 3079]]
classification_report: 
               precision    recall  f1-score   support

           0       0.91      0.58      0.71     15927
           1       0.32      0.78      0.45      3971

    accuracy                           0.62     19898
   macro avg       0.61      0.68      0.58     19898
weighted avg       0.79      0.62      0.66     19898



In [10]:
from sklearn.naive_bayes import GaussianNB

# model development

params = {'var_smoothing':[1e-08, 1e-09]}

model = GaussianNB()
nb_model = RandomizedSearchCV(model, 
                              param_distributions = params, 
                              n_iter = 10, 
                              scoring = 'roc_auc', 
                              cv = 10, 
                              n_jobs = -1, 
                              verbose = 20)
nb_model.fit(x_train,
             y_train.values.ravel())



Fitting 10 folds for each of 2 candidates, totalling 20 fits


In [11]:
# selecting the cut-off for determining hardclasses

# prediction on training dataset
predicted_train = nb_model.predict_proba(x_train)[:,1]
actual = y_train.values.ravel()

# Kolmogorov-Smirnov statistic. 
# The higher the ks_stat, the more efficient is the model at capturing the target (Ones)

cutoffs = np.linspace(0.01,0.99,99)
KS_all = []
for cutoff in cutoffs:
    
    predicted = (predicted_train > cutoff).astype(int)
    len(predicted)
    TP = ((predicted == 1) & (actual == 1)).sum()
    TN = ((predicted == 0) & (actual == 0)).sum()
    FP = ((predicted == 1) & (actual == 0)).sum()
    FN = ((predicted == 0) & (actual == 1)).sum()

    P = TP+FN
    N = TN+FP

    KS = (TP/P)-(FP/N)
    KS_all.append(KS)

# list(zip(cutoffs,KS_all))

cutoff_optimum = cutoffs[KS_all == max(KS_all)][0]

predicted_val = nb_model.predict_proba(x_val)[:,1]
val_classes = (predicted_val>cutoff_optimum).astype(int)

cm = confusion_matrix(y_val, 
                     val_classes)
class_report = classification_report(y_val, 
                                    val_classes)
print("roc_auc_score:", roc_auc_score(y_val, predicted_val)) # AUROC represents the likelihood of the model distinguishing observations from two classes.
print("confusion_matrix: \n",cm)
print("classification_report: \n",class_report)

roc_auc_score: 0.5900805657365495
confusion_matrix: 
 [[8229 7698]
 [1355 2616]]
classification_report: 
               precision    recall  f1-score   support

           0       0.86      0.52      0.65     15927
           1       0.25      0.66      0.37      3971

    accuracy                           0.55     19898
   macro avg       0.56      0.59      0.51     19898
weighted avg       0.74      0.55      0.59     19898



In [22]:
submissions=pd.DataFrame({'Junk':nb_model.predict_proba(x_test)[:,1]})
#submissions.to_csv('output/submission_4.csv',index=False)

In [23]:
(submissions.round(0)==1).sum()

Junk    2329
dtype: int64

In [24]:
10723
131

131

In [25]:
print("roc_auc_score:", roc_auc_score(y_val[:10854], submissions)) 

roc_auc_score: 0.5098390475376198


In [27]:
estimators = [('rf', rf_model),
              ('gb', gb_model)]
final_estimator = lr_model
              
stacking_model = StackingClassifier(estimators = estimators, 
                                  final_estimator = final_estimator)
stacking_model.fit(x_train,
                   y_train.values.ravel())

Fitting 10 folds for each of 10 candidates, totalling 100 fits


20 fits failed out of a total of 100.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
20 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\User\myproject\myfirstproject\Lib\site-packages\sklearn\model_selection\_validation.py", line 729, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\User\myproject\myfirstproject\Lib\site-packages\sklearn\base.py", line 1145, in wrapper
    estimator._validate_params()
  File "C:\Users\User\myproject\myfirstproject\Lib\site-packages\sklearn\base.py", line 638, in _validate_params
    validate_parameter_constraints(
  File "C:\Users\User\myproject\myfirstproject\Lib\site-packages\sklearn\utils\_param_validation.py", line 96, in 

Fitting 10 folds for each of 10 candidates, totalling 100 fits
Fitting 10 folds for each of 10 candidates, totalling 100 fits


30 fits failed out of a total of 100.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
30 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\User\myproject\myfirstproject\Lib\site-packages\sklearn\model_selection\_validation.py", line 729, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\User\myproject\myfirstproject\Lib\site-packages\sklearn\base.py", line 1145, in wrapper
    estimator._validate_params()
  File "C:\Users\User\myproject\myfirstproject\Lib\site-packages\sklearn\base.py", line 638, in _validate_params
    validate_parameter_constraints(
  File "C:\Users\User\myproject\myfirstproject\Lib\site-packages\sklearn\utils\_param_validation.py", line 96, in 

Fitting 10 folds for each of 10 candidates, totalling 100 fits


30 fits failed out of a total of 100.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
30 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\User\myproject\myfirstproject\Lib\site-packages\sklearn\model_selection\_validation.py", line 729, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\User\myproject\myfirstproject\Lib\site-packages\sklearn\base.py", line 1145, in wrapper
    estimator._validate_params()
  File "C:\Users\User\myproject\myfirstproject\Lib\site-packages\sklearn\base.py", line 638, in _validate_params
    validate_parameter_constraints(
  File "C:\Users\User\myproject\myfirstproject\Lib\site-packages\sklearn\utils\_param_validation.py", line 96, in 

Fitting 10 folds for each of 10 candidates, totalling 100 fits


40 fits failed out of a total of 100.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
40 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\User\myproject\myfirstproject\Lib\site-packages\sklearn\model_selection\_validation.py", line 729, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\User\myproject\myfirstproject\Lib\site-packages\sklearn\base.py", line 1145, in wrapper
    estimator._validate_params()
  File "C:\Users\User\myproject\myfirstproject\Lib\site-packages\sklearn\base.py", line 638, in _validate_params
    validate_parameter_constraints(
  File "C:\Users\User\myproject\myfirstproject\Lib\site-packages\sklearn\utils\_param_validation.py", line 96, in 

Fitting 10 folds for each of 10 candidates, totalling 100 fits


40 fits failed out of a total of 100.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
40 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\User\myproject\myfirstproject\Lib\site-packages\sklearn\model_selection\_validation.py", line 729, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\User\myproject\myfirstproject\Lib\site-packages\sklearn\base.py", line 1145, in wrapper
    estimator._validate_params()
  File "C:\Users\User\myproject\myfirstproject\Lib\site-packages\sklearn\base.py", line 638, in _validate_params
    validate_parameter_constraints(
  File "C:\Users\User\myproject\myfirstproject\Lib\site-packages\sklearn\utils\_param_validation.py", line 96, in 

Fitting 10 folds for each of 10 candidates, totalling 100 fits


30 fits failed out of a total of 100.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
30 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\User\myproject\myfirstproject\Lib\site-packages\sklearn\model_selection\_validation.py", line 729, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\User\myproject\myfirstproject\Lib\site-packages\sklearn\base.py", line 1145, in wrapper
    estimator._validate_params()
  File "C:\Users\User\myproject\myfirstproject\Lib\site-packages\sklearn\base.py", line 638, in _validate_params
    validate_parameter_constraints(
  File "C:\Users\User\myproject\myfirstproject\Lib\site-packages\sklearn\utils\_param_validation.py", line 96, in 

Fitting 10 folds for each of 10 candidates, totalling 100 fits
Fitting 10 folds for each of 10 candidates, totalling 100 fits
Fitting 10 folds for each of 10 candidates, totalling 100 fits
Fitting 10 folds for each of 10 candidates, totalling 100 fits
Fitting 10 folds for each of 10 candidates, totalling 100 fits
Fitting 10 folds for each of 10 candidates, totalling 100 fits


90 fits failed out of a total of 100.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
90 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\User\myproject\myfirstproject\Lib\site-packages\sklearn\model_selection\_validation.py", line 729, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\User\myproject\myfirstproject\Lib\site-packages\sklearn\base.py", line 1152, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\User\myproject\myfirstproject\Lib\site-packages\sklearn\linear_model\_logistic.py", line 1169, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
            

In [29]:
# selecting the cut-off for determining hardclasses

# prediction on training dataset
predicted_train = stacking_model.predict_proba(x_train)[:,1]
actual = y_train.values.ravel()

# Kolmogorov-Smirnov statistic. 
# The higher the ks_stat, the more efficient is the model at capturing the target (Ones)

cutoffs = np.linspace(0.01,0.99,99)
KS_all = []
for cutoff in cutoffs:
    
    predicted = (predicted_train > cutoff).astype(int)
    len(predicted)
    TP = ((predicted == 1) & (actual == 1)).sum()
    TN = ((predicted == 0) & (actual == 0)).sum()
    FP = ((predicted == 1) & (actual == 0)).sum()
    FN = ((predicted == 0) & (actual == 1)).sum()

    P = TP+FN
    N = TN+FP

    KS = (TP/P)-(FP/N)
    KS_all.append(KS)

# list(zip(cutoffs,KS_all))

cutoff_optimum = cutoffs[KS_all == max(KS_all)][0]

predicted_val = stacking_model.predict_proba(x_val)[:,1]
val_classes = (predicted_val>cutoff_optimum).astype(int)

cm = confusion_matrix(y_val, 
                     val_classes)
class_report = classification_report(y_val, 
                                    val_classes)
print("roc_auc_score:", roc_auc_score(y_val, predicted_val)) # AUROC represents the likelihood of the model distinguishing observations from two classes.
print("confusion_matrix: \n",cm)
print("classification_report: \n",class_report)

roc_auc_score: 0.7376289124595585
confusion_matrix: 
 [[10252  5675]
 [ 1145  2826]]
classification_report: 
               precision    recall  f1-score   support

           0       0.90      0.64      0.75     15927
           1       0.33      0.71      0.45      3971

    accuracy                           0.66     19898
   macro avg       0.62      0.68      0.60     19898
weighted avg       0.79      0.66      0.69     19898



In [97]:
submissions1=pd.DataFrame({'Junk':stacking_model.predict_proba(x_test)[:,1]})

In [98]:
(submissions1.round(0)==0).sum()

Junk    6131
dtype: int64

In [99]:
from sklearn.metrics import roc_auc_score

In [100]:
y_train

Unnamed: 0,target_numeric
0,1
1,1
2,0
3,0
4,0
...,...
79587,0
79588,0
79589,0
79590,0


In [101]:
print("roc_auc_score:", roc_auc_score(y_train[:10854], submissions1)) 

roc_auc_score: 0.4943039890949973


In [103]:
submissions1.to_csv('saujanyaproject.csv',index=False)