In [1]:
import joblib
import pandas as pd
import numpy as np

In [2]:
X_train = joblib.load("../data/processed/X_train")
y_train = joblib.load("../data/processed/y_train")
X_val = joblib.load("../data/processed/X_val")
y_val = joblib.load("../data/processed/y_val")

In [3]:
from sklearn.linear_model import LogisticRegression
reg = LogisticRegression()
reg.fit(X_train, y_train)
y_pred = reg.predict(X_val)
y_pred

array([1, 1, 1, ..., 1, 1, 1])

In [4]:
from sklearn.metrics import roc_auc_score
max_roc_auc_score = 0.0
def get_roc_auc_score(model):
    global max_roc_auc_score
    model.fit(X_train, y_train)
    # If it's logistic regression, use predict_proba
    if isinstance(model, LogisticRegression):
        y_pred_train = model.predict_proba(X_train)[:,-1]
        y_pred = model.predict_proba(X_val)[:,-1]
        z_pred_train = model.predict(X_train)
        z_pred = model.predict(X_val)
        print("roc_auc_score for training classification: "+ str(roc_auc_score(y_train, z_pred_train)))
        print("roc_auc_score for testing classification: "+ str(roc_auc_score(y_val, z_pred)))
    else:
        y_pred_train = model.predict(X_train)
        y_pred = model.predict(X_val)

    sureOne = np.vectorize(lambda x: 1 if x > 0.9 else x )
    sureZero = np.vectorize(lambda x: 0 if x < 0.5 else x )

    y_pred = sureOne(y_pred)
    y_pred = sureZero(y_pred)

    print(y_pred)

    val_roc_auc_score = roc_auc_score(y_val, y_pred)
    print("roc_auc_score for training set: "+ str(roc_auc_score(y_train, y_pred_train)))
    print("roc_auc_score for testing set: "+ str(val_roc_auc_score))

    if max_roc_auc_score < val_roc_auc_score:
        print("The score is better than "+ str(max_roc_auc_score) + " so save the model")
        joblib.dump(model, "../models/kpw_best_model")
        max_roc_auc_score = val_roc_auc_score

In [5]:
from sklearn.linear_model import Lasso
get_roc_auc_score(Lasso(alpha=0.1))


[0.8321875 0.8321875 0.8321875 ... 0.8321875 0.8321875 0.8321875]
roc_auc_score for training set: 0.5
roc_auc_score for testing set: 0.5
The score is better than 0.0 so save the model


In [6]:
from sklearn.linear_model import LogisticRegression
get_roc_auc_score(LogisticRegression())

roc_auc_score for training classification: 0.5135755798300876
roc_auc_score for testing classification: 0.5067492199066496
[0.73836098 0.79777991 0.90993434 ... 0.92091971 0.74308528 0.91430544]
roc_auc_score for training set: 0.7047104223614732
roc_auc_score for testing set: 0.7069108882778841
The score is better than 0.5 so save the model


In [7]:
get_roc_auc_score(LogisticRegression(l1_ratio=0.5))

roc_auc_score for training classification: 0.5135755798300876
roc_auc_score for testing classification: 0.5067492199066496
[0.73836098 0.79777991 0.90993434 ... 0.92091971 0.74308528 0.91430544]
roc_auc_score for training set: 0.7047104223614732
roc_auc_score for testing set: 0.7069108882778841


In [8]:
get_roc_auc_score(LogisticRegression(random_state=0))

roc_auc_score for training classification: 0.5135755798300876
roc_auc_score for testing classification: 0.5067492199066496
[0.73836098 0.79777991 0.90993434 ... 0.92091971 0.74308528 0.91430544]
roc_auc_score for training set: 0.7047104223614732
roc_auc_score for testing set: 0.7069108882778841


In [9]:
from sklearn.linear_model import LinearRegression
get_roc_auc_score(LinearRegression())

[[0.76421958]
 [0.78911927]
 [0.89922436]
 ...
 [0.92562282]
 [0.73115571]
 [0.89588775]]
roc_auc_score for training set: 0.7037546738497277
roc_auc_score for testing set: 0.7034689165032116


In [10]:
from sklearn.ensemble import RandomForestRegressor
get_roc_auc_score(RandomForestRegressor())

[0.72 0.67 0.84 ... 0.77 0.82 0.63]
roc_auc_score for training set: 1.0
roc_auc_score for testing set: 0.6418074987469252


In [11]:
get_roc_auc_score(RandomForestRegressor(n_estimators = 150, random_state = 8, max_depth = 8, min_samples_leaf = 2))

[0.73178629 0.63555479 0.9254413  ... 0.84729232 0.82021343 0.81755706]
roc_auc_score for training set: 0.8966321359467031
roc_auc_score for testing set: 0.6832458836856912


In [12]:
get_roc_auc_score(RandomForestRegressor(n_estimators = 100, random_state = 8, max_depth = 5, min_samples_leaf = 5))

[0.72330667 0.67401038 0.91269408 ... 0.86996529 0.80434905 0.91260824]
roc_auc_score for training set: 0.7669847716587962
roc_auc_score for testing set: 0.6953130658755153


In [13]:
get_roc_auc_score(RandomForestRegressor(n_estimators = 100, random_state = 8, max_depth = 5, min_samples_leaf = 3))

[0.71872603 0.68335577 0.91818066 ... 0.87411779 0.80653069 0.91429383]
roc_auc_score for training set: 0.7668797040064166
roc_auc_score for testing set: 0.6935862854229018


In [14]:
get_roc_auc_score(RandomForestRegressor(n_estimators = 100, random_state = 8, max_depth = 2In, min_samples_leaf = 7))

SyntaxError: invalid syntax (<ipython-input-14-79ba340e1a25>, line 1)

In [None]:
from sklearn.ensemble import GradientBoostingRegressor
get_roc_auc_score(GradientBoostingRegressor(random_state=0))

In [None]:
get_roc_auc_score(GradientBoostingRegressor(random_state=0, learning_rate=0.05))

In [None]:
from sklearn.ensemble import VotingRegressor
r1 = LinearRegression()
r2 = LinearRegression()
r5 = LinearRegression()
r6 = LinearRegression()
r7 = Lasso(alpha=0.1)
r8 = GradientBoostingRegressor(random_state=0, learning_rate=0.05)
r9 = RandomForestRegressor(n_estimators = 100, random_state = 8, max_depth = 5, min_samples_leaf = 5)
r3 = RandomForestRegressor(n_estimators = 100, random_state = 8, max_depth = 2, min_samples_leaf = 2)
# r4 = RandomForestRegressor(n_estimators = 100, random_state = 8, max_depth = 5, min_samples_leaf = 5)
r4 = GradientBoostingRegressor(random_state=5)
vr = VotingRegressor([('lr', r1), ('lr2', r2), ('rf', r3), ('rfh', r4), ('lr3', r5), ('lr4', r6), ('lr5', r7), ('lr6', r8), ('lr7', r9)])
get_roc_auc_score(vr)

In [None]:
vr = VotingRegressor([('lr', r1), ('lr2', r2), ('rf', r3), ('rfh', r4), ('lr5', r7), ('lr6', r8), ('lr7', r9)])
get_roc_auc_score(vr)