# Hyperparameter tuning using Oputa on XGB model and LGBM and Catboost model

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import lightgbm

from sklearn.metrics.pairwise import rbf_kernel

from lightgbm import LGBMClassifier
from sklearn.pipeline import Pipeline, make_pipeline, FunctionTransformer
from sklearn.model_selection import train_test_split,cross_val_score
from sklearn import metrics
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier,RandomForestClassifier
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImPipeline
import xgboost as xgb
import warnings
warnings.simplefilter('ignore')

In [2]:
agg_train = pd.read_csv("C:/Users/Asus/Documents/MMAI/869_MachineLearningAI/Team_assignment/train/agg_train.csv",low_memory=False)
agg_test = pd.read_csv("C:/Users/Asus/Documents/MMAI/869_MachineLearningAI/Team_assignment/test/agg_test.csv",low_memory=False)

In [4]:
agg_train.head()

Unnamed: 0.1,Unnamed: 0,disrict,client_catg,region,consumption_diff_sum,consumption_diff_mean,tarif_type_count,reading_remarque_mean,counter_statue_count,counter_coefficient_mean,consommation_level_1_mean,consommation_level_2_mean,consommation_level_3_mean,consommation_level_4_mean,counter_number_nunique,months_number_mean,tenure_days,invoice_freq,daily_consumption,client_period
0,0,60,11,101,0,0.0,35,6.971429,35,1.0,352.4,10.571429,0.0,0.0,1,4.628571,4901,140.028571,2.592124,25
1,1,69,11,107,0,0.0,37,7.216216,37,1.0,557.540541,0.0,0.0,0.0,1,4.324324,4913,132.783784,4.19886,17
2,2,62,11,301,0,0.0,18,7.055556,18,1.0,798.611111,37.888889,0.0,0.0,1,6.444444,4921,273.388889,3.059744,33
3,3,69,11,105,0,0.0,20,6.15,20,1.0,1.2,0.0,0.0,0.0,1,4.2,2664,133.2,0.009009,16
4,4,62,11,303,0,0.0,14,8.857143,14,1.0,663.714286,104.857143,117.357143,36.714286,1,3.714286,1585,113.214286,7.825237,5


In [5]:
agg_test.head()

Unnamed: 0.1,Unnamed: 0,disrict,client_catg,region,consumption_diff_sum,consumption_diff_mean,tarif_type_count,reading_remarque_mean,counter_statue_count,counter_coefficient_mean,consommation_level_1_mean,consommation_level_2_mean,consommation_level_3_mean,consommation_level_4_mean,counter_number_nunique,months_number_mean,tenure_days,invoice_freq,daily_consumption,client_period
0,0,62,11,307,0,0.0,37,6.810811,37,1.0,488.135135,3.243243,0.0,0.0,1,4.378378,4967,134.243243,3.660358,17
1,1,69,11,103,0,0.0,22,7.636364,22,1.0,1091.409091,843.136364,182.318182,586.318182,1,4.545455,3744,170.181818,12.438835,10
2,2,62,11,310,0,0.0,74,7.459459,74,1.0,554.040541,37.364865,15.743243,0.162162,2,4.0,5022,67.864865,8.946436,15
3,3,60,11,101,0,0.0,40,6.575,40,1.0,244.35,0.0,0.0,0.0,2,3.9,2486,62.15,3.931617,20
4,4,62,11,301,-116,-2.188679,53,7.90566,53,1.0,568.188679,145.056604,33.679245,0.0,3,4.528302,5137,96.924528,7.706249,42


In [3]:
client_train =pd.read_csv("C:/Users/Asus/Documents/MMAI/869_MachineLearningAI/Team_assignment/train/client_train.csv",low_memory=False)
target = client_train['target']

In [6]:
def rbf_transformer(Y):
    return make_pipeline(
        FunctionTransformer(rbf_kernel, kw_args=dict(Y=Y, gamma=0.1),feature_names_out="one-to-one"),
        StandardScaler()
    )
# def num_transformer():
#     return make_pipeline(
#             FunctionTransformer(lambda x: x.astype(int), validate=False)

preprocessing = ColumnTransformer([
 ('onehot', OneHotEncoder(handle_unknown="ignore"), ['disrict']),
 ("rbf_transformer_105", rbf_transformer([[105]]), ["region"]),
 ("rbf_transformer_305", rbf_transformer([[305]]), ["region"]),
 ("rbf_transformer_375", rbf_transformer([[375]]), ["region"]),
 ], remainder="passthrough")

In [2]:
train_df = pd.read_csv("C:/Users/Asus/Documents/MMAI/869_MachineLearningAI/Team_assignment/train/train_df.csv",low_memory=False)
test_df = pd.read_csv("C:/Users/Asus/Documents/MMAI/869_MachineLearningAI/Team_assignment/test/test_df.csv",low_memory=False)

In [3]:
train_df.shape

(135488, 388)

In [4]:
train_df.head()

Unnamed: 0,region_group,creation_day,creation_month,creation_year,duration,target,disrict_60,disrict_62,disrict_63,disrict_69,...,Rem_9_GAZ,consumption_diff_sum,consumption_diff_mean,tenure_days,invoice_freq,daily_consumption,first_invoice_gap_ELEC,last_invoice_gap_ELEC,first_invoice_gap_GAZ,last_invoice_gap_GAZ
0,2,31,12,1994,324,0.0,1.0,0.0,0.0,0.0,...,0.0,2888.0,82.51428,4901,140.0,0.5894,3943.0,8844.0,0.0,0.0
1,2,29,5,2002,235,0.0,0.0,0.0,0.0,1.0,...,0.0,4789.0,129.43243,4913,132.8,0.9746,1239.0,6152.0,0.0,0.0
2,3,13,3,1986,429,0.0,0.0,1.0,0.0,0.0,...,0.0,2286.0,127.0,4921,273.5,0.4646,7182.0,12103.0,0.0,0.0
3,2,11,7,1996,305,0.0,0.0,0.0,0.0,1.0,...,0.0,6.0,0.3,2664,133.2,0.002253,3256.0,5920.0,0.0,0.0
4,3,14,10,2014,86,0.0,0.0,1.0,0.0,0.0,...,0.0,3331.0,237.92857,1585,113.2,2.02,122.0,1707.0,0.0,0.0


# LGBM

In [None]:
import optuna
from sklearn.metrics import accuracy_score
from sklearn.ensemble import GradientBoostingClassifier

def objective(trial,X,y):
 # 'is_unbalance':True,
    lgbm_params = {
        'objective': 'binary',
        'boosting_type': 'gbdt', 
        'metric': 'auc',
        "n_estimators": 1000,
        'scale_pos_weight': trial.suggest_int('scale_pos_weight', 1, 10),
        "learning_rate": trial.suggest_float("learning_rate", 1e-3, 0.1, log=True),
        "num_iterations": 500,
        "num_leaves": trial.suggest_int('num_leaves',20,100),
        "min_child_samples": trial.suggest_int("min_child_samples",20,500),
        "max_depth": trial.suggest_int("max_depth",5,20),
        "subsample": trial.suggest_float("subsample",0.5,1.0),
        "feature_fraction": trial.suggest_float("feature_fraction",0.5,1.0)
    }
    
    light_GBM =ImPipeline([
    ('model', LGBMClassifier(**lgbm_params))
    ])
    
    # Fit the model
    light_GBM.fit(train_df, target)
    cv_scores = cross_val_score(light_GBM, X, y, cv=10, n_jobs=-1, scoring="roc_auc")
    score = np.mean(cv_scores)
    # scorestd = cv_scores.std()
    return score  # Replace with appropriate metric

# Create a study object and specify the direction is 'maximize'.
study = optuna.create_study(direction='maximize')

# Start the optimization
study.optimize(lambda trial: objective(trial, train_df, target), n_trials=100,  gc_after_trial=True)

# Print the optimal parameters
print(study.best_params)

In [50]:
study.best_params

'''
10 mins
Best is trial 7 with value: 0.8369938153532145.
{'scale_pos_weight': 5, 'learning_rate': 0.007708305735559817}

100 trial 72 mins
Best is trial 88 with value: 0.839493765558394.
{'scale_pos_weight': 3, 'learning_rate': 0.023458592474959277, 'num_leaves': 43, 'min_child_samples': 123, 
'max_depth': 16, 'subsample': 0.9637220440391915, 'feature_fraction': 0.566261327139611}

20 hrs
Best is trial 51 with value: 0.9026014242790721.
Trial 51 finished with value: 0.9026014242790721 and parameters: {'scale_pos_weight': 3, 'learning_rate': 0.017044136762318283,
'num_leaves': 79, 'min_child_samples': 98, 'max_depth': 11, 'subsample': 0.8679507197110152, 'feature_fraction': 0.5453545038655571}.
'''



In [51]:
params = {
        'objective': 'binary',
        'boosting_type': 'gbdt', 
        'metric': 'auc',
        "n_estimators": 1000,
        'scale_pos_weight': 3,
        "learning_rate": 0.017044136762318283,
        "num_iterations": 500,
        "num_leaves": 79,
        "min_child_samples": 98,
        "max_depth": 11,
        "subsample": 0.8679507197110152,
        "feature_fraction": 0.5453545038655571
    }
    
light_GBM =ImPipeline([
('model', LGBMClassifier(**params))
])

# Fit the model
light_GBM.fit(train_df, target)

[LightGBM] [Info] Number of positive: 7566, number of negative: 127922
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.048247 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 56368
[LightGBM] [Info] Number of data points in the train set: 135488, number of used features: 310
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.055843 -> initscore=-2.827756
[LightGBM] [Info] Start training from score -2.827756



# Tuning XGB model

In [5]:
target = train_df['target']
train_df.drop(columns=['target'],axis=1,inplace=True)

In [6]:
train_df.head()

Unnamed: 0,region_group,creation_day,creation_month,creation_year,duration,disrict_60,disrict_62,disrict_63,disrict_69,client_catg_11,...,Rem_9_GAZ,consumption_diff_sum,consumption_diff_mean,tenure_days,invoice_freq,daily_consumption,first_invoice_gap_ELEC,last_invoice_gap_ELEC,first_invoice_gap_GAZ,last_invoice_gap_GAZ
0,2,31,12,1994,324,1.0,0.0,0.0,0.0,1.0,...,0.0,2888.0,82.51428,4901,140.0,0.5894,3943.0,8844.0,0.0,0.0
1,2,29,5,2002,235,0.0,0.0,0.0,1.0,1.0,...,0.0,4789.0,129.43243,4913,132.8,0.9746,1239.0,6152.0,0.0,0.0
2,3,13,3,1986,429,0.0,1.0,0.0,0.0,1.0,...,0.0,2286.0,127.0,4921,273.5,0.4646,7182.0,12103.0,0.0,0.0
3,2,11,7,1996,305,0.0,0.0,0.0,1.0,1.0,...,0.0,6.0,0.3,2664,133.2,0.002253,3256.0,5920.0,0.0,0.0
4,3,14,10,2014,86,0.0,1.0,0.0,0.0,1.0,...,0.0,3331.0,237.92857,1585,113.2,2.02,122.0,1707.0,0.0,0.0


In [44]:
train_df.info

<bound method DataFrame.info of         region_group  creation_day  creation_month  creation_year  duration  \
0                  2            31              12           1994       324   
1                  2            29               5           2002       235   
2                  3            13               3           1986       429   
3                  2            11               7           1996       305   
4                  3            14              10           2014        86   
...              ...           ...             ...            ...       ...   
135483             3            26               7           2004       209   
135484             3            25              10           2012       110   
135485             3            22              11           2011       121   
135486             2            22              12           1993       336   
135487             2            18               2           1986       430   

        disrict_60 

In [7]:
target

0         0.0
1         0.0
2         0.0
3         0.0
4         0.0
         ... 
135483    0.0
135484    0.0
135485    0.0
135486    0.0
135487    0.0
Name: target, Length: 135488, dtype: float64

In [None]:
# from sklearn.compose import ColumnTransformer
# from sklearn.pipeline import Pipeline
# from sklearn.impute import SimpleImputer
# from sklearn.preprocessing import OneHotEncoder

# numeric_features = ["", ""]
# numeric_transformer = Pipeline(
#     steps=[("imputer", SimpleImputer(strategy="median")), ("scaler", StandardScaler())]
# )

# categorical_features = ["", "", ""]
# categorical_transformer = Pipeline(
#     steps=[
#         ("encoder", OneHotEncoder(handle_unknown="ignore")),
#         ("selector", SelectPercentile(chi2, percentile=50)),
#     ]
# )

# preprocessor = ColumnTransformer([
#     ('num', numerical_transformer, make_column_selector(dtype_exclude=object)),
#     ('cat', categorical_transformer, make_column_selector(dtype_exclude=object))
#     ])

In [None]:
import optuna
from sklearn.metrics import accuracy_score
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier

        
def objective(trial,X,y):
 # 'is_unbalance':True,
    params = {
            'objective': 'binary:logistic',
            'eval_metric': 'auc',
            'booster': 'gbtree',
            'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
            'max_depth': trial.suggest_int('max_depth', 3, 11),
            'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
            'subsample': trial.suggest_float('subsample', 0.4, 1.0),
            'gamma': trial.suggest_float('gamma', 0, 1),
            'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
            'alpha': trial.suggest_float('alpha', 0, 10),
            'lambda': trial.suggest_float('lambda', 0, 10),
            'scale_pos_weight': trial.suggest_int('scale_pos_weight', 0, 10),
            'n_estimators': 1000
            }
    
    XGBmodel =ImPipeline([
    ('model', XGBClassifier(**params))
    ])
    
    # Fit the model
    XGBmodel.fit(train_df, target)
    cv_scores = cross_val_score(XGBmodel, X, y, cv=10, n_jobs=-1, scoring="roc_auc")
    score = np.mean(cv_scores)
    # scorestd = cv_scores.std()
    return score  # Replace with appropriate metric

# Create a study object and specify the direction is 'maximize'.
study = optuna.create_study(direction='maximize')

# Start the optimization
study.optimize(lambda trial: objective(trial, train_df, target), n_trials=100,  gc_after_trial=True)

# Print the optimal parameters
print(study.best_params)

In [None]:

print(study.best_params)
# 0.9001688929625951 and parameters: {'learning_rate': 0.02039826866042329, 'max_depth': 5, 'min_child_weight': 4, 
# 'subsample': 0.6937845559087417, 'gamma': 0.3651378395261624, 'colsample_bytree': 0.6544744376383287, 'alpha': 2.2996772213995964, 'lambda': 2.588895028771817, 'scale_pos_weight': 7}.

# Trial 34 finished with value: 0.9031123075164114 and parameters: {'learning_rate': 0.01221636279037111, 'max_depth': 9, 'min_child_weight': 6, 
# 'subsample': 0.9388323387748934, 'gamma': 0.8647062123246894, 
# 'colsample_bytree': 0.7644821944288629, 'alpha': 0.8375691885584633, 'lambda': 3.9543889166305703, 'scale_pos_weight': 1}.

Trial 34 finished with value: 0.9031123075164114 and parameters: {'learning_rate': 0.01221636279037111, 'max_depth': 9, 'min_child_weight': 6, 
'subsample': 0.9388323387748934, 'gamma': 0.8647062123246894, 
'colsample_bytree': 0.7644821944288629, 'alpha': 0.8375691885584633, 'lambda': 3.9543889166305703, 'scale_pos_weight': 1}.

In [10]:
params = {
            'objective': 'binary:logistic',
            'eval_metric': 'auc',
            'booster': 'gbtree',
            'learning_rate': 0.01221636279037111,
            'max_depth': 9,
            'min_child_weight': 6,
            'subsample': 0.9388323387748934,
            'gamma': 0.8647062123246894,
            'colsample_bytree': 0.7644821944288629,
            'alpha': 0.8375691885584633,
            'lambda': 3.9543889166305703,
            'scale_pos_weight': 1,
            'n_estimators': 1000
            }
    
XGBmodel =ImPipeline([
('model', XGBClassifier(**params))
])

# Fit the model
XGBmodel.fit(train_df, target)

# Tuning the Catboost model

In [14]:
agg_test.head()

Unnamed: 0,disrict,client_catg,region,consumption_diff_sum,consumption_diff_mean,tarif_type_count,reading_remarque_mean,counter_statue_count,counter_coefficient_mean,consommation_level_1_mean,consommation_level_2_mean,consommation_level_3_mean,consommation_level_4_mean,counter_number_nunique,months_number_mean,tenure_days,invoice_freq,daily_consumption,client_period
0,62,11,307,0,0.0,37,6.810811,37,1.0,488.135135,3.243243,0.0,0.0,1,4.378378,4967,134.243243,3.660358,17
1,69,11,103,0,0.0,22,7.636364,22,1.0,1091.409091,843.136364,182.318182,586.318182,1,4.545455,3744,170.181818,12.438835,10
2,62,11,310,0,0.0,74,7.459459,74,1.0,554.040541,37.364865,15.743243,0.162162,2,4.0,5022,67.864865,8.946436,15
3,60,11,101,0,0.0,40,6.575,40,1.0,244.35,0.0,0.0,0.0,2,3.9,2486,62.15,3.931617,20
4,62,11,301,-116,-2.188679,53,7.90566,53,1.0,568.188679,145.056604,33.679245,0.0,3,4.528302,5137,96.924528,7.706249,42


In [15]:
import optuna
from sklearn.metrics import accuracy_score
from sklearn.ensemble import GradientBoostingClassifier
from catboost import CatBoostClassifier

def objective(trial,X,y):
 # 'is_unbalance':True,
    cat_params = {
        'iterations': trial.suggest_int('iterations', 100, 2000, step=50),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, step=0.01),
        'depth': trial.suggest_int('depth', 4, 10, step=1),
        'l2_leaf_reg': trial.suggest_loguniform('l2_leaf_reg', 1e-3, 10),
        'subsample': trial.suggest_float('subsample', 0.1, 1.0, step=0.1),
        'colsample_bylevel': trial.suggest_float('colsample_bylevel', 0.1, 1.0, step=0.1),
        'min_child_samples': trial.suggest_int('min_child_samples', 1, 20, step=1),
        'bagging_temperature': trial.suggest_float('bagging_temperature', 0, 10, step=1),
        'scale_pos_weight': trial.suggest_float('scale_pos_weight', 1, 10, step=1),
        'border_count': trial.suggest_int('border_count', 5, 255, step=5),
        'eval_metric': 'AUC'
    }

    # Create and train the CatBoostClassifier
    model = CatBoostClassifier(**cat_params, random_state=42, verbose=0)
    
    # Fit the model
    model.fit(agg_train, target)
    cv_scores = cross_val_score(model, X, y, cv=10, n_jobs=-1, scoring="roc_auc")
    score = np.mean(cv_scores)
    # scorestd = cv_scores.std()
    return score  # Replace with appropriate metric

# Create a study object and specify the direction is 'maximize'.
study = optuna.create_study(direction='maximize')

# Start the optimization
study.optimize(lambda trial: objective(trial, agg_train, target), n_trials=100,  gc_after_trial=True)

# Print the optimal parameters
print(study.best_params)

[I 2023-12-10 22:38:15,405] A new study created in memory with name: no-name-9f0a6c19-e1d1-47ba-bd34-9461685e1568


[I 2023-12-10 22:40:05,547] Trial 0 finished with value: 0.7603610449234928 and parameters: {'iterations': 500, 'learning_rate': 0.11, 'depth': 9, 'l2_leaf_reg': 0.00327258649050133, 'subsample': 0.8, 'colsample_bylevel': 0.2, 'min_child_samples': 13, 'bagging_temperature': 7.0, 'scale_pos_weight': 10.0, 'border_count': 35}. Best is trial 0 with value: 0.7603610449234928.
[I 2023-12-10 22:43:20,987] Trial 1 finished with value: 0.831017819910594 and parameters: {'iterations': 950, 'learning_rate': 0.04, 'depth': 7, 'l2_leaf_reg': 0.2704634533819429, 'subsample': 0.5, 'colsample_bylevel': 0.9, 'min_child_samples': 6, 'bagging_temperature': 1.0, 'scale_pos_weight': 6.0, 'border_count': 145}. Best is trial 1 with value: 0.831017819910594.
[I 2023-12-10 22:47:42,963] Trial 2 finished with value: 0.8167361632404825 and parameters: {'iterations': 1550, 'learning_rate': 0.08, 'depth': 5, 'l2_leaf_reg': 0.00284586915899393, 'subsample': 1.0, 'colsample_bylevel': 0.5, 'min_child_samples': 12, '

{'iterations': 2000, 'learning_rate': 0.01, 'depth': 10, 'l2_leaf_reg': 9.736531689548219, 'subsample': 0.8, 'colsample_bylevel': 0.2, 'min_child_samples': 4, 'bagging_temperature': 4.0, 'scale_pos_weight': 2.0, 'border_count': 150}


In [16]:
print(study.best_params)
# Best is trial 75 with value: 0.8394281819451365. {'iterations': 2000, 'learning_rate': 0.01, 'depth': 10, 'l2_leaf_reg': 9.736531689548219, 'subsample': 0.8, 'colsample_bylevel': 0.2, 'min_child_samples': 4, 'bagging_temperature': 4.0, 'scale_pos_weight': 2.0, 'border_count': 150}

{'iterations': 2000, 'learning_rate': 0.01, 'depth': 10, 'l2_leaf_reg': 9.736531689548219, 'subsample': 0.8, 'colsample_bylevel': 0.2, 'min_child_samples': 4, 'bagging_temperature': 4.0, 'scale_pos_weight': 2.0, 'border_count': 150}


In [18]:
params = {
        'iterations': 2000,
        'learning_rate': 0.01,
        'depth': 10,
        'l2_leaf_reg': 9.736531689548219,
        'subsample': 0.8,
        'colsample_bylevel': 0.2,
        'min_child_samples': 4,
        'bagging_temperature': 4.0,
        'scale_pos_weight': 2.0,
        'border_count': 150,
        'eval_metric': 'AUC'
    }

    # Create and train the CatBoostClassifier
model = CatBoostClassifier(**params, random_state=42, verbose=0)

# Fit the model
model.fit(agg_train, target)

<catboost.core.CatBoostClassifier at 0x2386dbddb50>

In [11]:
pred = XGBmodel.predict_proba(test_df)
preds = pd.DataFrame(pred, columns=['target','target2'])

In [12]:
preds = pd.DataFrame(preds, columns=['target2'])

In [13]:
preds.head()

Unnamed: 0,target2
0,0.016889
1,0.118035
2,0.005813
3,0.005299
4,0.03042


In [14]:
preds.shape

(58069, 1)

In [15]:
client_test = pd.read_csv("C:/Users/Asus/Documents/MMAI/869_MachineLearningAI/Team_assignment/test/client_test.csv",low_memory=False)

In [16]:
client_test.shape

(58069, 5)

In [17]:
submission = pd.DataFrame(
    {
        'client_id': client_test['client_id'],
        'target': preds['target2']
    }
)

submission.head()

Unnamed: 0,client_id,target
0,test_Client_0,0.016889
1,test_Client_1,0.118035
2,test_Client_10,0.005813
3,test_Client_100,0.005299
4,test_Client_1000,0.03042


In [66]:
submission.to_csv("C:/Users/Asus/Documents/MMAI/869_MachineLearningAI/Team_assignment/submissionXGB_Final.csv",index=False)

In [25]:
submission.shape

(58069, 2)

In [2]:
import pandas as pd
costData = pd.DataFrame([[2500,20000],[500,0]], index=['Actual Fail', 'Actual No Fail'],columns=['Predicted Fail','Predicted No Fail'])

In [3]:
costData

Unnamed: 0,Predicted Fail,Predicted No Fail
Actual Fail,2500,20000
Actual No Fail,500,0


In [4]:
rfCostData = pd.DataFrame([[201*2500,55*20000],[50*500,0]], index=['Actual Fail', 'Actual No Fail'],columns=['Predicted Fail','Predicted No Fail'])

#The total cost after using the RandomForest model
rfCostData.sum()

Predicted Fail        527500
Predicted No Fail    1100000
dtype: int64

In [5]:
rfCostData

Unnamed: 0,Predicted Fail,Predicted No Fail
Actual Fail,502500,1100000
Actual No Fail,25000,0


In [6]:
#Cost from RNN model
rnnCostData = pd.DataFrame([[226*2500,30*20000],[1200*500,0]], index=['Actual Fail', 'Actual No Fail'],columns=['Predicted Fail','Predicted No Fail'])

#The total cost after using the RNN model
rnnCostData.sum()

Predicted Fail       1165000
Predicted No Fail     600000
dtype: int64

In [7]:
rnnCostData

Unnamed: 0,Predicted Fail,Predicted No Fail
Actual Fail,565000,600000
Actual No Fail,600000,0
