In [1]:
import xgboost as xg
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import KFold, cross_validate

In [2]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split 
from sklearn.metrics import mean_squared_error as MSE 
  
# Load the data 
data = pd.read_excel("insurance.xlsx") 
data["Height"]= data["Height"]/100
BMI= round(data["Weight"]/(data["Height"]**2) ,1)
data.insert(8,"BMI", BMI)
X=  data[['Age', 'Diabetes', 'BloodPressureProblems', 'AnyTransplants',
       'AnyChronicDiseases', 'BMI' , 'KnownAllergies',
       'HistoryOfCancerInFamily', 'NumberOfMajorSurgeries']]

y= data["PremiumPrice"]

In [3]:

# Splitting 
train_X, test_X, train_y, test_y = train_test_split(X, y, 
                      test_size = 0.11, random_state = 123) 

# Standardizing te data
scaler= StandardScaler()
train_X= pd.DataFrame(scaler.fit_transform(train_X) , columns=train_X.columns)
test_X= pd.DataFrame(scaler.transform(test_X) , columns=test_X.columns)
  


In [16]:
lr_model=LinearRegression()
lr_model.fit(train_X,train_y)
pred= lr_model.predict(test_X)

# RMSE Computation 
rmse = np.sqrt(MSE(test_y, pred)) 
print("RMSE : % f" %(rmse)) 

# coefficient of Determination Computation
print(f"R^2  is : ",lr_model.score(test_X,test_y))

RMSE :  4044.153714
R^2  is :  0.6454840128398805


In [19]:
# plt.plot(pred)
# plt.plot(test_y)

# plt.show()

**Using Decision Tree Regressor**

In [24]:
depths = [1,2,3,4,5,6,7,9,20,22,50]

for depth in depths:
    tree_reg = DecisionTreeRegressor(random_state=7, max_depth = depth )

    kfold = KFold(n_splits=5)
    # scoring is taken as negative mse as we maximize the scoring. maximizing negative MSE is same as minimizing MSE.r
    cv_acc_results = cross_validate(tree_reg, train_X, train_y, cv = kfold, scoring = 'neg_mean_squared_error', return_train_score = True)

    print(f"K-Fold for depth:{depth} -ve MSE Mean: Train: {cv_acc_results['train_score'].mean()*100} Validation: {cv_acc_results['test_score'].mean()*100}")
    print('***************')

K-Fold for depth:1 -ve MSE Mean: Train: -1807964505.6855466 Validation: -1820000472.6007454
***************
K-Fold for depth:2 -ve MSE Mean: Train: -1463600123.805136 Validation: -1610032174.4758058
***************
K-Fold for depth:3 -ve MSE Mean: Train: -1083921164.4518967 Validation: -1337657281.6440732
***************
K-Fold for depth:4 -ve MSE Mean: Train: -852420660.3904899 Validation: -1189958159.443913
***************
K-Fold for depth:5 -ve MSE Mean: Train: -697832925.8560473 Validation: -1288527025.5675764
***************
K-Fold for depth:6 -ve MSE Mean: Train: -476017957.2825326 Validation: -1274790801.2533216
***************
K-Fold for depth:7 -ve MSE Mean: Train: -367243525.1622652 Validation: -1273688507.5636961
***************
K-Fold for depth:9 -ve MSE Mean: Train: -175885466.7856033 Validation: -1405322626.4807646
***************
K-Fold for depth:20 -ve MSE Mean: Train: 0.0 Validation: -1550549350.6493504
***************
K-Fold for depth:22 -ve MSE Mean: Train: 0.0 Valid

In [35]:
tree_reg = DecisionTreeRegressor(random_state=7, max_depth = 6 , min_samples_split = 7 )
tree_reg.fit(train_X, train_y)
pred = tree_reg.predict(test_X)
print("Model Train acc",tree_reg.score(train_X, train_y))   # r2_score
print("Model Test acc",tree_reg.score(test_X, test_y))  # r2_score

Model Train acc 0.8424483373379567
Model Test acc 0.7104564142569318


In [30]:
from sklearn.model_selection import RandomizedSearchCV

rcv = RandomizedSearchCV(tree_reg, { 'max_depth':range(2,25) , "min_samples_split" : range(2,20)})

rcv.fit(train_X, train_y)

In [31]:
rcv.best_score_

0.6954792189891317

In [32]:
rcv.best_params_

{'min_samples_split': 7, 'max_depth': 6}

**Using Random Forest Regressor**

In [8]:
regr = RandomForestRegressor(n_estimators=300,max_depth = 9 , min_samples_split = 10 , random_state=0 , criterion='squared_error')
regr.fit(train_X, train_y)
pred = regr.predict(test_X)
print("Model acc",regr.score(train_X, train_y))
print("Model acc",regr.score(test_X, test_y))

Model acc 0.8773205220094025
Model acc 0.7619345183646712


In [5]:
from sklearn.model_selection import RandomizedSearchCV

rcv = RandomizedSearchCV(regr, {'n_estimators': [100,120,130,140,150,200,250,300,400], 'max_depth':range(2,25) , "min_samples_split" : range(2,14)})

rcv.fit(train_X, train_y)

In [6]:
rcv.best_score_

0.7448663971879248

In [7]:
rcv.best_params_

{'n_estimators': 300, 'min_samples_split': 10, 'max_depth': 9}

**Using Gradient Boosting Regressor**

In [47]:
 # Instantiation 
gbc = GradientBoostingRegressor(n_estimators=100, max_depth=3, loss = 'squared_error')

gbc.fit(train_X, train_y)
print("Model acc",gbc.score(train_X, train_y))
print("Model acc",gbc.score(test_X, test_y))

Model acc 0.8611129364880129
Model acc 0.7673365139927816


In [43]:
from sklearn.model_selection import RandomizedSearchCV

rcv = RandomizedSearchCV(gbc, {'n_estimators': [50,100,150,200,250,300,350,400,450], 'max_depth':range(2,25)})

rcv.fit(train_X, train_y)

In [44]:
rcv.best_score_

0.45995432675720005

In [45]:
rcv.best_params_

{'n_estimators': 200, 'max_depth': 7}

**Using XGBoost**

In [59]:
 # Instantiation 
xgb_r = xg.XGBRegressor( objective ='reg:squarederror',
                  n_estimators = 19 , seed = 123,learning_rate=0.2, max_depth=4 )

  
# Fitting the model 
xgb_r.fit(train_X, train_y) 
  
# Predict the model 
pred = xgb_r.predict(test_X) 
  
# RMSE Computation 
rmse = np.sqrt(MSE(test_y, pred)) 
print("RMSE : % f" %(rmse)) 

RMSE :  3196.076083


In [60]:
print("Model acc", xgb_r.score(train_X, train_y))
print("Model acc",xgb_r.score(test_X, test_y))

Model acc 0.8454205024586654
Model acc 0.7785810767604657


In [50]:
from sklearn.model_selection import RandomizedSearchCV

rcv = RandomizedSearchCV(gbc, {'n_estimators': [100,120,130,140,150,200,250,300,400], 'max_depth':[1,2,3,4,5,6,7,8,9,10]})

rcv.fit(train_X, train_y)

In [51]:
rcv.best_score_

0.7097416398633746

In [52]:
rcv.best_params_

{'n_estimators': 300, 'max_depth': 3}

In [61]:
# saving the model

import pickle 
pickle_out= open("regression.pkl" , mode= "wb")
pickle.dump(xgb_r,pickle_out)
pickle_out.close()