In [1]:
import joblib
import pandas as pd
import numpy as np

In [2]:
# Training split: the features and targets every model below is fitted on.
# NOTE: joblib.load unpickles its input, so only point it at trusted files.
X_train, y_train = (
    joblib.load("../data/processed/X_train"),
    joblib.load("../data/processed/y_train"),
)
# Validation split: used only for scoring the fitted models.
X_val, y_val = (
    joblib.load("../data/processed/X_val"),
    joblib.load("../data/processed/y_val"),
)

In [3]:
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression

# Best validation ROC AUC seen so far in this kernel session.
# get_roc_auc_score() reads and updates it to decide whether to persist a model.
max_roc_auc_score = 0.0


def get_roc_auc_score(model):
    """Fit ``model``, print its ROC AUC on both splits and keep the best one.

    The model is fitted on the notebook-level ``X_train`` / ``y_train`` and
    scored on ``X_train`` and ``X_val``.

    * ``LogisticRegression`` instances are scored on the last column of
      ``predict_proba``; the AUC of their hard ``predict`` labels is printed
      as an extra "classification" line.
    * Every other estimator is scored on the raw output of ``predict``.
    * ``LinearRegression`` instances additionally have ``coef_`` printed.

    Scores are clipped to ``[0, 1]`` on both splits before the AUC is
    computed, so the training and validation figures are directly comparable.

    Side effects: when the validation AUC is strictly greater than the global
    ``max_roc_auc_score``, the fitted model is dumped to
    ``../models/kpw_best_model`` and the global is updated. Ties do not
    overwrite the saved model.

    Parameters
    ----------
    model : estimator
        Unfitted estimator exposing ``fit`` and ``predict`` (and
        ``predict_proba`` when it is a ``LogisticRegression``).

    Returns
    -------
    None
        Results are reported through ``print``.
    """
    global max_roc_auc_score
    model.fit(X_train, y_train)

    if isinstance(model, LogisticRegression):
        # Rank by the probability of the last class rather than by hard labels.
        y_pred_train = model.predict_proba(X_train)[:, -1]
        y_pred = model.predict_proba(X_val)[:, -1]
        # AUC of the hard labels, printed only for comparison.
        z_pred_train = model.predict(X_train)
        z_pred = model.predict(X_val)
        print("roc_auc_score for training classification: "+ str(roc_auc_score(y_train, z_pred_train)))
        print("roc_auc_score for testing classification: "+ str(roc_auc_score(y_val, z_pred)))
    else:
        y_pred_train = model.predict(X_train)
        y_pred = model.predict(X_val)

    if isinstance(model, LinearRegression):
        print('Coef_ is :')
        print(model.coef_)

    # Regressors can predict outside [0, 1]; clamp both splits the same way.
    # np.clip keeps the input dtype. The previous np.vectorize(lambda ...)
    # inferred its output dtype from the first element, so a leading
    # out-of-range value (mapped to the int 0 or 1) silently truncated every
    # other prediction to an integer, and an empty input raised.
    y_pred_train = np.clip(y_pred_train, 0, 1)
    y_pred = np.clip(y_pred, 0, 1)

    val_roc_auc_score = roc_auc_score(y_val, y_pred)
    print("roc_auc_score for training set: "+ str(roc_auc_score(y_train, y_pred_train)))
    print("roc_auc_score for testing set: "+ str(val_roc_auc_score))

    # Strict comparison: only a genuinely better validation score replaces the
    # model saved on disk.
    if max_roc_auc_score < val_roc_auc_score:
        print("The score is better than "+ str(max_roc_auc_score) + " so save the model")
        joblib.dump(model, "../models/kpw_best_model")
        max_roc_auc_score = val_roc_auc_score

In [4]:
from sklearn.linear_model import Lasso
# Baseline: Lasso with alpha=0.1, fitted and scored through the shared helper.
get_roc_auc_score(model=Lasso(alpha=0.1))


roc_auc_score for training set: 0.5
roc_auc_score for testing set: 0.5
The score is better than 0.0 so save the model


In [5]:
from sklearn.linear_model import LogisticRegression
# Logistic regression with library defaults; the helper scores it on
# predict_proba and also prints the AUC of its hard predictions.
get_roc_auc_score(model=LogisticRegression())

roc_auc_score for training classification: 0.5924046991142659
roc_auc_score for testing classification: 0.5804145793983193
roc_auc_score for training set: 0.7028033075103917
roc_auc_score for testing set: 0.6971727208854581
The score is better than 0.5 so save the model


In [6]:
# Logistic regression with l1_ratio=0.5 and everything else left at defaults.
# NOTE(review): the recorded scores are identical to the default run, so
# l1_ratio presumably had no effect here (it normally requires an elastic-net
# penalty) — confirm before drawing conclusions from this cell.
get_roc_auc_score(model=LogisticRegression(l1_ratio=0.5))

roc_auc_score for training classification: 0.5924046991142659
roc_auc_score for testing classification: 0.5804145793983193
roc_auc_score for training set: 0.7028033075103917
roc_auc_score for testing set: 0.6971727208854581


In [7]:
# Logistic regression with a fixed random_state and otherwise default settings.
# NOTE(review): the recorded scores are identical to the default run, so the
# seed presumably does not influence the default solver — confirm.
get_roc_auc_score(model=LogisticRegression(random_state=8))

roc_auc_score for training classification: 0.5924046991142659
roc_auc_score for testing classification: 0.5804145793983193
roc_auc_score for training set: 0.7028033075103917
roc_auc_score for testing set: 0.6971727208854581


In [8]:
from sklearn.linear_model import LinearRegression
# Ordinary least squares; for this estimator type the helper also prints coef_.
get_roc_auc_score(model=LinearRegression())

Coef_ is :
[[ 0.82869471 -0.25384755  1.64482246 -1.87596627  0.54290857  0.13775574
  -0.15544202 -0.62118787  0.42155907  0.41178312 -0.16076054  0.26213857
  -0.06953298 -0.37159479 -0.1532048   0.37906973  0.47005124 -0.09275599
   0.24418388  0.11303833  0.09796879 -0.04576895 -0.01287444 -0.00670456
  -0.04899562  0.0341028 ]]
roc_auc_score for training set: 0.7022802934929436
roc_auc_score for testing set: 0.6968841531849663


In [9]:
from sklearn.ensemble import GradientBoostingRegressor
# Gradient boosting regressor with default hyper-parameters and a fixed seed.
get_roc_auc_score(model=GradientBoostingRegressor(random_state=0))

roc_auc_score for training set: 0.8060798418575934
roc_auc_score for testing set: 0.7533527460695483
The score is better than 0.6971727208854581 so save the model


In [10]:
# Gradient boosting again, this time with learning_rate=0.08 and a different seed.
get_roc_auc_score(
    model=GradientBoostingRegressor(
        random_state=8,
        learning_rate=0.08,
    )
)

roc_auc_score for training set: 0.791542006121533
roc_auc_score for testing set: 0.7458642431542162


In [11]:
from sklearn.ensemble import RandomForestRegressor
# Random forest regressor with every hyper-parameter left at its default.
# NOTE(review): no random_state is passed, so the forest — and therefore the
# scores and any model saved from this cell — may differ between runs.
get_roc_auc_score(model=RandomForestRegressor())

roc_auc_score for training set: 1.0
roc_auc_score for testing set: 0.9649048068763516
The score is better than 0.7533527460695483 so save the model


In [12]:
# Seeded random forest: 150 trees, depth capped at 8, at least 2 samples per leaf.
get_roc_auc_score(
    model=RandomForestRegressor(
        n_estimators=150,
        random_state=8,
        max_depth=8,
        min_samples_leaf=2,
    )
)

roc_auc_score for training set: 0.9228456976984583
roc_auc_score for testing set: 0.8325565957679779


In [13]:
# Seeded random forest: 100 trees, depth capped at 5, at least 5 samples per leaf.
get_roc_auc_score(
    model=RandomForestRegressor(
        n_estimators=100,
        random_state=8,
        max_depth=5,
        min_samples_leaf=5,
    )
)

roc_auc_score for training set: 0.7768035579520262
roc_auc_score for testing set: 0.7362981668537223


In [14]:
# Same forest as the previous experiment except min_samples_leaf is lowered to 3.
get_roc_auc_score(
    model=RandomForestRegressor(
        n_estimators=100,
        random_state=8,
        max_depth=5,
        min_samples_leaf=3,
    )
)

roc_auc_score for training set: 0.7775002563244917
roc_auc_score for testing set: 0.7367161908151069


In [15]:
# Heavily constrained seeded forest: depth capped at 2, at least 7 samples per leaf.
get_roc_auc_score(
    model=RandomForestRegressor(
        n_estimators=100,
        random_state=8,
        max_depth=2,
        min_samples_leaf=7,
    )
)

roc_auc_score for training set: 0.7055930074118397
roc_auc_score for testing set: 0.6972890604326918


In [16]:
from sklearn.ensemble import VotingRegressor
# Ensemble experiment: a VotingRegressor combining nine regressors.

# Linear members: four separate default LinearRegression instances and a Lasso.
r1, r2, r5, r6 = (LinearRegression() for _ in range(4))
r7 = Lasso(alpha=0.1)

# Random-forest members (shallow and medium depth, same seed).
r3 = RandomForestRegressor(n_estimators=100, random_state=8, max_depth=2, min_samples_leaf=2)
r9 = RandomForestRegressor(n_estimators=100, random_state=8, max_depth=5, min_samples_leaf=5)

# Gradient-boosting members.
r4 = GradientBoostingRegressor(random_state=5)
r8 = GradientBoostingRegressor(random_state=0, learning_rate=0.05)

# The labels are kept exactly as they were; note that several do not describe
# the estimator they are attached to (e.g. 'rfh' is a gradient-boosting model
# and 'lr5'..'lr7' are not plain linear regressions).
vr = VotingRegressor(
    estimators=[
        ('lr', r1),
        ('lr2', r2),
        ('rf', r3),
        ('rfh', r4),
        ('lr3', r5),
        ('lr4', r6),
        ('lr5', r7),
        ('lr6', r8),
        ('lr7', r9),
    ]
)
get_roc_auc_score(model=vr)

roc_auc_score for training set: 0.7406960512581751
roc_auc_score for testing set: 0.7198771819368025


In [17]:
# vr = VotingRegressor([('lr', r1), ('lr2', r2), ('rf', r3), ('rfh', r4), ('lr5', r7), ('lr6', r8), ('lr7', r9)])
# get_roc_auc_score(vr)