In [1]:
import joblib
import pandas as pd
import numpy as np

In [2]:
# Load the persisted feature matrices and label vectors for the training and
# validation splits (presumably written by an earlier preprocessing step).
# NOTE: joblib.load unpickles its input, so only point it at trusted files.
X_train, y_train = (
    joblib.load("../data/processed/X_train"),
    joblib.load("../data/processed/y_train"),
)
X_val, y_val = (
    joblib.load("../data/processed/X_val"),
    joblib.load("../data/processed/y_val"),
)

In [3]:
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression

# Best validation ROC AUC seen so far in this kernel session; updated by
# get_roc_auc_score. It restarts at 0.0 on a fresh kernel, so the first model
# evaluated after a restart always overwrites the saved model file.
max_roc_auc_score = 0.0


def get_roc_auc_score(model, model_path="../models/kpw_best_model"):
    """Fit ``model`` on the training split, report ROC AUC, and keep the best model.

    The model is fitted on the notebook-level ``X_train`` / ``y_train`` and
    scored on both the training split and ``X_val`` / ``y_val``.

    * Estimators that expose ``predict_proba`` (e.g. ``LogisticRegression``)
      are scored on the probability in the last column of ``predict_proba``.
      For those, the AUC of the hard ``predict`` labels is printed as well.
    * Every other estimator (regressors) is scored on the raw ``predict``
      output, which ``roc_auc_score`` treats as a ranking score.

    If the validation AUC is strictly greater than the module-level
    ``max_roc_auc_score``, the fitted model is written to ``model_path`` with
    ``joblib.dump`` and ``max_roc_auc_score`` is updated.

    Parameters
    ----------
    model : estimator
        Unfitted scikit-learn style estimator with ``fit`` and ``predict``.
    model_path : str, optional
        Where to save the model when it beats the best score so far.

    Returns
    -------
    None
        Results are reported via ``print``.

    Notes
    -----
    The printed labels say "testing", but the scores are computed on the
    validation split (``X_val`` / ``y_val``). The wording is kept so that
    previously recorded outputs stay comparable.
    """
    global max_roc_auc_score
    model.fit(X_train, y_train)

    # Probabilistic classifiers must be ranked by class probability; scoring
    # their hard 0/1 labels collapses the ROC curve to a single point.
    if hasattr(model, "predict_proba"):
        y_pred_train = model.predict_proba(X_train)[:, -1]
        y_pred = model.predict_proba(X_val)[:, -1]
        # AUC of the thresholded labels, printed for comparison only.
        z_pred_train = model.predict(X_train)
        z_pred = model.predict(X_val)
        print(f"roc_auc_score for training classification: {roc_auc_score(y_train, z_pred_train)}")
        print(f"roc_auc_score for testing classification: {roc_auc_score(y_val, z_pred)}")
    else:
        y_pred_train = model.predict(X_train)
        y_pred = model.predict(X_val)

    if isinstance(model, LinearRegression):
        print('Coef_ is :')
        print(model.coef_)

    val_roc_auc_score = roc_auc_score(y_val, y_pred)
    print(f"roc_auc_score for training set: {roc_auc_score(y_train, y_pred_train)}")
    print(f"roc_auc_score for testing set: {val_roc_auc_score}")

    # Strict comparison: a tie with the current best does not re-save.
    if max_roc_auc_score < val_roc_auc_score:
        print(f"The score is better than {max_roc_auc_score} so save the model")
        joblib.dump(model, model_path)
        max_roc_auc_score = val_roc_auc_score

In [4]:
from sklearn.linear_model import Lasso
# Baseline: L1-regularised linear regression, scored on its raw predictions.
# The recorded AUC of exactly 0.5 on both splits is consistent with constant
# predictions; presumably alpha=0.1 shrinks every coefficient to zero —
# TODO confirm by inspecting coef_.
get_roc_auc_score(Lasso(alpha=0.1))


roc_auc_score for training set: 0.5
roc_auc_score for testing set: 0.5
The score is better than 0.0 so save the model


In [5]:
from sklearn.linear_model import LogisticRegression
# Logistic regression with default settings. The helper scores it on
# predict_proba and additionally prints the AUC of the hard class labels.
get_roc_auc_score(LogisticRegression())

roc_auc_score for training classification: 0.5156216893200217
roc_auc_score for testing classification: 0.5044313937957589
roc_auc_score for training set: 0.7055296353715409
roc_auc_score for testing set: 0.7104310866837993
The score is better than 0.5 so save the model


In [6]:
# NOTE(review): scikit-learn has documented l1_ratio as only used when
# penalty='elasticnet' (which requires solver='saga'). The recorded scores
# below are identical to the default model, so this setting appears to have
# had no effect — confirm, and add penalty/solver if elastic-net was intended.
get_roc_auc_score(LogisticRegression(l1_ratio=0.5))

roc_auc_score for training classification: 0.5156216893200217
roc_auc_score for testing classification: 0.5044313937957589
roc_auc_score for training set: 0.7055296353715409
roc_auc_score for testing set: 0.7104310866837993


In [7]:
# Same model with a fixed random_state; the recorded scores match the default
# run exactly, so the seed made no difference to this fit.
get_roc_auc_score(LogisticRegression(random_state=8))

roc_auc_score for training classification: 0.5156216893200217
roc_auc_score for testing classification: 0.5044313937957589
roc_auc_score for training set: 0.7055296353715409
roc_auc_score for testing set: 0.7104310866837993


In [8]:
from sklearn.linear_model import LinearRegression
# Ordinary least-squares regression on the labels; its continuous predictions
# are used directly as ranking scores for ROC AUC. The helper also prints the
# fitted coefficients for this model type.
get_roc_auc_score(LinearRegression())

Coef_ is :
[[ 0.54582578  0.04033382  0.3979464  -0.9912349   0.63396178  0.26307072
  -0.27899192 -0.62348562  0.60276375  0.45935745  0.43770223 -0.66082638
   0.15043927 -0.0373118  -0.34526586 -0.08282391  0.14791064  0.35511374
  -0.10567092  0.22921539  0.07801185  0.07829666  0.0257073   0.01407638
  -0.02019674]]
roc_auc_score for training set: 0.7047794768085447
roc_auc_score for testing set: 0.706531344252226


In [9]:
from sklearn.ensemble import GradientBoostingRegressor
# Gradient boosting with default hyper-parameters and a fixed seed. In the
# recorded run the training AUC is well above the validation AUC, which
# suggests overfitting.
get_roc_auc_score(GradientBoostingRegressor(random_state=0))

roc_auc_score for training set: 0.7886003869846178
roc_auc_score for testing set: 0.6838108538002208


In [10]:
# Gradient boosting with a much smaller learning rate than the previous run,
# to see whether a slower learner narrows the train/validation gap.
get_roc_auc_score(
    GradientBoostingRegressor(
        random_state=8,
        learning_rate=0.005,
    )
)

roc_auc_score for training set: 0.7185720099774061
roc_auc_score for testing set: 0.6907295647412292


In [11]:
from sklearn.ensemble import RandomForestRegressor
# Random forest with default hyper-parameters. The recorded training AUC of
# 1.0 against a much lower validation AUC indicates heavy overfitting.
# NOTE(review): no random_state is set, so these scores will vary between runs.
get_roc_auc_score(RandomForestRegressor())

roc_auc_score for training set: 1.0
roc_auc_score for testing set: 0.6387204440954828


In [12]:
# Constrained forest: more trees, capped depth and a minimum leaf size, with
# a fixed seed so the run is repeatable.
get_roc_auc_score(
    RandomForestRegressor(
        n_estimators=150,
        random_state=8,
        max_depth=8,
        min_samples_leaf=2,
    )
)

roc_auc_score for training set: 0.8990170492807499
roc_auc_score for testing set: 0.6849436913119185


In [13]:
# Shallower forest with larger leaves than the previous run.
get_roc_auc_score(
    RandomForestRegressor(
        n_estimators=100,
        random_state=8,
        max_depth=5,
        min_samples_leaf=5,
    )
)

roc_auc_score for training set: 0.767647344707912
roc_auc_score for testing set: 0.6951392289171985


In [14]:
# Same depth as the previous run, but allowing smaller leaves.
get_roc_auc_score(
    RandomForestRegressor(
        n_estimators=100,
        random_state=8,
        max_depth=5,
        min_samples_leaf=3,
    )
)

roc_auc_score for training set: 0.7684183070157221
roc_auc_score for testing set: 0.6938151707513524


In [15]:
# Very shallow forest with large leaves: the most regularised variant tried.
get_roc_auc_score(
    RandomForestRegressor(
        n_estimators=100,
        random_state=8,
        max_depth=2,
        min_samples_leaf=7,
    )
)

roc_auc_score for training set: 0.7048345455448168
roc_auc_score for testing set: 0.691866748176885


In [16]:
from sklearn.ensemble import VotingRegressor
# Ensemble of the regressors tried above; VotingRegressor averages the
# predictions of its members.

# Four identical linear models: with equal voting weights this gives linear
# regression 4 of the 9 votes in the ensemble.
r1 = LinearRegression()
r2 = LinearRegression()
r5 = LinearRegression()
r6 = LinearRegression()

# Regularised linear model.
r7 = Lasso(alpha=0.1)

# Boosted trees: one default configuration, one with a lower learning rate.
# Alternative tried for r4:
# r4 = RandomForestRegressor(n_estimators = 100, random_state = 8, max_depth = 5, min_samples_leaf = 5)
r4 = GradientBoostingRegressor(random_state=5)
r8 = GradientBoostingRegressor(random_state=0, learning_rate=0.05)

# Random forests: one very shallow, one moderately deep.
r3 = RandomForestRegressor(
    n_estimators=100,
    random_state=8,
    max_depth=2,
    min_samples_leaf=2,
)
r9 = RandomForestRegressor(
    n_estimators=100,
    random_state=8,
    max_depth=5,
    min_samples_leaf=5,
)

# NOTE(review): several estimator labels ('rfh', 'lr5', 'lr6', 'lr7') do not
# match the estimator types they name; they are kept unchanged here.
vr = VotingRegressor(
    [
        ('lr', r1),
        ('lr2', r2),
        ('rf', r3),
        ('rfh', r4),
        ('lr3', r5),
        ('lr4', r6),
        ('lr5', r7),
        ('lr6', r8),
        ('lr7', r9),
    ]
)
# Alternative without the extra linear models r5 and r6:
# vr = VotingRegressor([('lr', r1), ('lr2', r2), ('rf', r3), ('rfh', r4), ('lr5', r7), ('lr6', r8), ('lr7', r9)])
get_roc_auc_score(vr)

roc_auc_score for training set: 0.739891827519823
roc_auc_score for testing set: 0.7045843703190778


In [17]:
# vr = VotingRegressor([('lr', r1), ('lr2', r2), ('rf', r3), ('rfh', r4), ('lr5', r7), ('lr6', r8), ('lr7', r9)])
# get_roc_auc_score(vr)