In [1]:
import pandas as pd
import xgboost as xgb


In [2]:
Dataset=pd.read_csv("insurance_pre.csv")
Dataset

Unnamed: 0,age,sex,bmi,children,smoker,charges
0,19,female,27.900,0,yes,16884.92400
1,18,male,33.770,1,no,1725.55230
2,28,male,33.000,3,no,4449.46200
3,33,male,22.705,0,no,21984.47061
4,32,male,28.880,0,no,3866.85520
...,...,...,...,...,...,...
1333,50,male,30.970,3,no,10600.54830
1334,18,female,31.920,0,no,2205.98080
1335,18,female,36.850,0,no,1629.83350
1336,21,female,25.800,0,no,2007.94500


In [3]:
Dataset.columns


Index(['age', 'sex', 'bmi', 'children', 'smoker', 'charges'], dtype='object')

In [4]:
# Here categorical data(state) is available in the dataset which is converted to numerical data 
# As the categorical data available is nominal, one hot coding method is used to convert it into numerical data
Dataset=pd.get_dummies(Dataset,dtype=int,drop_first=True)
Dataset

Unnamed: 0,age,bmi,children,charges,sex_male,smoker_yes
0,19,27.900,0,16884.92400,0,1
1,18,33.770,1,1725.55230,1,0
2,28,33.000,3,4449.46200,1,0
3,33,22.705,0,21984.47061,1,0
4,32,28.880,0,3866.85520,1,0
...,...,...,...,...,...,...
1333,50,30.970,3,10600.54830,1,0
1334,18,31.920,0,2205.98080,0,0
1335,18,36.850,0,1629.83350,0,0
1336,21,25.800,0,2007.94500,0,0


In [5]:
Dataset.columns

Index(['age', 'bmi', 'children', 'charges', 'sex_male', 'smoker_yes'], dtype='object')

In [6]:
#Input and output split
independent=Dataset[['age', 'bmi', 'children','sex_male','smoker_yes']]
independent

Unnamed: 0,age,bmi,children,sex_male,smoker_yes
0,19,27.900,0,0,1
1,18,33.770,1,1,0
2,28,33.000,3,1,0
3,33,22.705,0,1,0
4,32,28.880,0,1,0
...,...,...,...,...,...
1333,50,30.970,3,1,0
1334,18,31.920,0,0,0
1335,18,36.850,0,0,0
1336,21,25.800,0,0,0


In [7]:
Dependent=Dataset[["charges"]]
Dependent

Unnamed: 0,charges
0,16884.92400
1,1725.55230
2,4449.46200
3,21984.47061
4,3866.85520
...,...
1333,10600.54830
1334,2205.98080
1335,1629.83350
1336,2007.94500


In [63]:
# model creation using GridsearchCV
from sklearn.model_selection import GridSearchCV
param_grid={'eta':[0.05,0.07,0.1],
            'max_depth':[3],
             'subsample':[0.7,0.9],
             'colsample_bytree':[0.7,0.9],
             'tree_method':['exact','auto','hist','approx']}
grid=GridSearchCV(xgb.XGBRegressor(),param_grid,refit=True,verbose=3,n_jobs=-1)
grid.fit(independent,Dependent)

Fitting 5 folds for each of 48 candidates, totalling 240 fits


In [65]:
re=grid.cv_results_

In [67]:
table=pd.DataFrame.from_dict(re)

In [69]:
table

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_colsample_bytree,param_eta,param_max_depth,param_subsample,param_tree_method,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.493432,0.257637,0.006045,0.000422,0.7,0.05,3,0.7,exact,"{'colsample_bytree': 0.7, 'eta': 0.05, 'max_de...",0.873655,0.805918,0.873617,0.843646,0.856799,0.850727,0.025078,41
1,0.287349,0.035589,0.006287,0.000677,0.7,0.05,3,0.7,auto,"{'colsample_bytree': 0.7, 'eta': 0.05, 'max_de...",0.870123,0.80411,0.868229,0.838384,0.855412,0.847252,0.02437,45
2,0.20115,0.170657,0.007482,0.00364,0.7,0.05,3,0.7,hist,"{'colsample_bytree': 0.7, 'eta': 0.05, 'max_de...",0.870123,0.80411,0.868229,0.838384,0.855412,0.847252,0.02437,45
3,0.130928,0.01207,0.016588,0.003796,0.7,0.05,3,0.7,approx,"{'colsample_bytree': 0.7, 'eta': 0.05, 'max_de...",0.869459,0.80362,0.867421,0.839531,0.856173,0.847241,0.02426,47
4,0.170899,0.01284,0.013215,0.002468,0.7,0.05,3,0.9,exact,"{'colsample_bytree': 0.7, 'eta': 0.05, 'max_de...",0.873878,0.802931,0.875536,0.8446,0.858955,0.85118,0.026609,38
5,0.16927,0.021219,0.024961,0.005824,0.7,0.05,3,0.9,auto,"{'colsample_bytree': 0.7, 'eta': 0.05, 'max_de...",0.869131,0.806127,0.867957,0.837617,0.85648,0.847462,0.023564,43
6,0.228825,0.027071,0.022646,0.006073,0.7,0.05,3,0.9,hist,"{'colsample_bytree': 0.7, 'eta': 0.05, 'max_de...",0.869131,0.806127,0.867957,0.837617,0.85648,0.847462,0.023564,43
7,0.178091,0.018525,0.018221,0.002553,0.7,0.05,3,0.9,approx,"{'colsample_bytree': 0.7, 'eta': 0.05, 'max_de...",0.868065,0.801593,0.867829,0.839711,0.856166,0.846673,0.024803,48
8,0.216221,0.018027,0.031392,0.016953,0.7,0.07,3,0.7,exact,"{'colsample_bytree': 0.7, 'eta': 0.07, 'max_de...",0.875083,0.804822,0.88121,0.84693,0.861558,0.85392,0.027241,25
9,0.291147,0.047034,0.036268,0.017508,0.7,0.07,3,0.7,auto,"{'colsample_bytree': 0.7, 'eta': 0.07, 'max_de...",0.872822,0.804213,0.876765,0.840963,0.862136,0.85138,0.026657,36


In [71]:
table.to_excel("grid_search_results_XG.xlsx",index=False)

In [73]:
grid_predict=grid.predict(independent)

In [75]:
# evaluating the model performance
from sklearn.metrics import r2_score
r_score=r2_score(Dependent,grid_predict)

In [77]:
print("The best value of R_score parameters:{}".format(grid.best_params_),r_score)

The best value of R_score parameters:{'colsample_bytree': 0.9, 'eta': 0.05, 'max_depth': 3, 'subsample': 0.9, 'tree_method': 'auto'} 0.8790478706359863


In [79]:
age_input=float(input("Age:"))
bmi_input=float(input("BMI:"))
children_input=float(input("children:"))
sex_male_input=int(input("Sex Male 0 or 1:"))
smoker_yes_input=int(input("Smoker yes 0 or 1:"))

Age: 50
BMI: 37
children: 2
Sex Male 0 or 1: 1
Smoker yes 0 or 1: 1


In [81]:
Prediction=grid.predict([[age_input,bmi_input,children_input,sex_male_input,smoker_yes_input]])
print("Prediction={}".format(Prediction))

Prediction=[44315.95]
