In [None]:
# system
import warnings
import os

# operation
import pandas as pd
import numpy as np
import pickle

# modeling
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from xgboost import XGBRegressor, XGBClassifier
from sklearn.metrics import accuracy_score, classification_report, precision_recall_fscore_support, f1_score, roc_auc_score, r2_score, mean_squared_error

warnings.filterwarnings("ignore")
pd.set_option('display.max_columns', None)
%config InlineBackend.figure_format = 'retina'

In [3]:
df = pd.read_csv('../artifacts/data.csv')
df1 = df.query('Premium_Level == 1')
df2 = df.query('Premium_Level > 3')
df.head()

Unnamed: 0,id,Gender,Age,Driving_License,Region_Code,Previously_Insured,Vehicle_Age,Vehicle_Damage,Annual_Premium,Policy_Sales_Channel,Vintage,Response,Premium_Level,Premium_Log
0,1,1,44,1,28.0,0,3,1,40454.0,26.0,217,1,3.0,4.606961
1,2,1,76,1,3.0,0,2,0,33536.0,26.0,183,0,3.0,4.525511
2,3,1,47,1,28.0,0,3,1,38294.0,26.0,27,1,3.0,4.583131
3,4,1,21,1,11.0,1,1,0,28619.0,152.0,203,0,3.0,4.456654
4,5,0,29,1,41.0,1,1,0,27496.0,152.0,39,0,3.0,4.43927


In [16]:
X, y = df.drop(['id', 'Premium_Level', 'Premium_Log', 'Annual_Premium', 'Response'], axis=1), df[['Annual_Premium']]
Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, test_size=0.3, stratify=df['Premium_Level'], random_state=4)

regress_perf = pd.DataFrame(columns=['model', 'rmse', 'r2'])
class_perf = pd.DataFrame(columns={'model', 'f1', 'roc-auc'})

In [10]:
# Define the parameter grid
dt_grid = {
    'max_depth': [None, 10, 20, 30, 40, 50],
    'min_samples_split': [2, 10, 20],
    'min_samples_leaf': [1, 5, 10]
}

# grid search with cross-validation on decision tree
grid_search = GridSearchCV(estimator=DecisionTreeRegressor(), param_grid=dt_grid, cv=5, scoring='r2', n_jobs=-1)
grid_search.fit(Xtrain, ytrain)
dt1 = grid_search.best_estimator_

# Evaluate the best estimator on the test set
ypred = dt1.predict(Xtest)
score = dt1.score(Xtest, ytest)
print(f'dt score: {score}')

rmse = mean_squared_error(ytest, ypred, squared=False)
r2 = r2_score(ytest, ypred)
regress_perf.loc[regress_perf.index.size] = ['dt1', rmse, r2]

dt score: 0.2556446242119378


In [28]:
# Define the parameter grid
rf_grid = {
    'n_estimators': [None, 50, 100, 200],
    'max_depth': [None, 5, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Perform grid search
grid_search = GridSearchCV(estimator=RandomForestRegressor(), param_grid=rf_grid, cv=5, scoring='r2', n_jobs=-1)
grid_search.fit(Xtrain, ytrain)
rf1 = grid_search.best_estimator_

# Evaluate the best estimator on the test set
ypred = dt1.predict(Xtest)
score = dt1.score(Xtest, ytest)
print(f'dt score: {score}')

rmse = mean_squared_error(ytest, ypred, squared=False)
r2 = r2_score(ytest, ypred)
regress_perf.loc[regress_perf.index.size] = ['rf1', rmse, r2]

NameError: name 'dt1' is not defined

In [29]:
# Evaluate the best estimator on the test set
ypred = rf1.predict(Xtest)
score = rf1.score(Xtest, ytest)
print(f'dt score: {score}')

rmse = mean_squared_error(ytest, ypred, squared=False)
r2 = r2_score(ytest, ypred)
regress_perf.loc[regress_perf.index.size] = ['rf1', rmse, r2]

dt score: 0.26018211734795027


In [None]:
xgb_grid = {
    'n_estimators': [None, 50, 100, 200],
    'max_depth': [None, 5, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

grid_search = GridSearchCV(estimator=XGBRegressor(), param_grid=xgb_grid, cv=5, scoring='r2', n_jobs=-1)
grid_search.fit(Xtrain, ytrain)
xgb1 = grid_search.best_estimator_

# Evaluate the best estimator on the test set
ypred = xgb1.predict(Xtest)
score = xgb1.score(Xtest, ytest)
print(f'dt score: {score}')

rmse = mean_squared_error(ytest, ypred, squared=False)
r2 = r2_score(ytest, ypred)
regress_perf.loc[regress_perf.index.size] = ['xgb1', rmse, r2]

In [None]:
# Try RandomForestRegressor
rf2 = RandomForestRegressor()
rf2.fit(Xtrain, ytrain)
rf2_score = rf2.score(Xtest, ytest)
print(f'RandomForestRegressor R² score: {rf2_score}')

ypred = rf2.predict(Xtest)
rmse = mean_squared_error(ytest, ypred, squared=False)
r2 = r2_score(ytest, ypred)
regress_perf.loc[regress_perf.index.size] = ['rf2', rmse, r2]


# Try GradientBoostingRegressor
xgb2 = GradientBoostingRegressor()
xgb2.fit(Xtrain, ytrain)
xgb2_score = xgb2.score(Xtest, ytest)
print(f'GradientBoostingRegressor R² score: {xgb2_score}')

ypred = xgb2.predict(Xtest)
rmse = mean_squared_error(ytest, ypred, squared=False)
r2 = r2_score(ytest, ypred)
regress_perf.loc[regress_perf.index.size] = ['rf1', rmse, r2]