In [1]:
#impoting the libaries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [2]:
#Here we load the dataset from the insurance.csv file
dataset = pd.read_csv("insurance.csv")
dataset

Unnamed: 0,age,sex,bmi,children,smoker,charges
0,19,female,27.900,0,yes,16884.92400
1,18,male,33.770,1,no,1725.55230
2,28,male,33.000,3,no,4449.46200
3,33,male,22.705,0,no,21984.47061
4,32,male,28.880,0,no,3866.85520
...,...,...,...,...,...,...
1333,50,male,30.970,3,no,10600.54830
1334,18,female,31.920,0,no,2205.98080
1335,18,female,36.850,0,no,1629.83350
1336,21,female,25.800,0,no,2007.94500


In [3]:
#pandas.get_dummies() is used for data manipulation. It converts categorical data into dummy or indicator variables.
#drop_first=True - Remove first level to get n-1 dummies out of n categorical level
dataset=pd.get_dummies(dataset,drop_first=True)
dataset

Unnamed: 0,age,bmi,children,charges,sex_male,smoker_yes
0,19,27.900,0,16884.92400,0,1
1,18,33.770,1,1725.55230,1,0
2,28,33.000,3,4449.46200,1,0
3,33,22.705,0,21984.47061,1,0
4,32,28.880,0,3866.85520,1,0
...,...,...,...,...,...,...
1333,50,30.970,3,10600.54830,1,0
1334,18,31.920,0,2205.98080,0,0
1335,18,36.850,0,1629.83350,0,0
1336,21,25.800,0,2007.94500,0,0


In [4]:
# it returns the column labels of the dataset
dataset.columns

Index(['age', 'bmi', 'children', 'charges', 'sex_male', 'smoker_yes'], dtype='object')

In [5]:
#Split the inputs
independent=dataset[['age', 'bmi', 'children', 'sex_male', 'smoker_yes']]
independent

Unnamed: 0,age,bmi,children,sex_male,smoker_yes
0,19,27.900,0,0,1
1,18,33.770,1,1,0
2,28,33.000,3,1,0
3,33,22.705,0,1,0
4,32,28.880,0,1,0
...,...,...,...,...,...
1333,50,30.970,3,1,0
1334,18,31.920,0,0,0
1335,18,36.850,0,0,0
1336,21,25.800,0,0,0


In [6]:
#Split the output
dependent=dataset[["charges"]]
dependent

Unnamed: 0,charges
0,16884.92400
1,1725.55230
2,4449.46200
3,21984.47061
4,3866.85520
...,...
1333,10600.54830
1334,2205.98080
1335,1629.83350
1336,2007.94500


In [7]:
from sklearn.tree import DecisionTreeRegressor

In [8]:
from sklearn.model_selection import GridSearchCV

param_grid = {'criterion':['squared_error','friedman_mse','absolute_error','poisson'],
               'max_features':['auto','sqrt','log2'],'splitter':['best','random']}

grid = GridSearchCV(DecisionTreeRegressor(), param_grid, refit = True, verbose=3, n_jobs=-1)
grid.fit(independent,dependent)

Fitting 5 folds for each of 24 candidates, totalling 120 fits


GridSearchCV(estimator=DecisionTreeRegressor(), n_jobs=-1,
             param_grid={'criterion': ['squared_error', 'friedman_mse',
                                       'absolute_error', 'poisson'],
                         'max_features': ['auto', 'sqrt', 'log2'],
                         'splitter': ['best', 'random']},
             verbose=3)

In [9]:
#print the best parameter after tunning
re = grid.cv_results_
print("R score value for best parameter {}:".format(grid.best_params_))

R score value for best parameter {'criterion': 'squared_error', 'max_features': 'sqrt', 'splitter': 'best'}:


In [10]:
table=pd.DataFrame.from_dict(re)

In [11]:
table

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_criterion,param_max_features,param_splitter,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.015622,9.464947e-07,0.0,0.0,squared_error,auto,best,"{'criterion': 'squared_error', 'max_features':...",0.721034,0.609674,0.728883,0.698592,0.678568,0.68735,0.042654,7
1,0.053317,0.09127796,0.00625,0.007654305,squared_error,auto,random,"{'criterion': 'squared_error', 'max_features':...",0.656005,0.623053,0.697219,0.66512,0.675446,0.663369,0.02439,13
2,0.006248,0.007652787,0.009373,0.007653176,squared_error,sqrt,best,"{'criterion': 'squared_error', 'max_features':...",0.772337,0.687363,0.761547,0.735384,0.78124,0.747574,0.033811,1
3,0.0,0.0,0.015623,7.448435e-07,squared_error,sqrt,random,"{'criterion': 'squared_error', 'max_features':...",0.688101,0.575555,0.529219,0.687678,0.660152,0.628141,0.064367,19
4,0.012497,0.01169008,0.006249,0.007652903,squared_error,log2,best,"{'criterion': 'squared_error', 'max_features':...",0.65714,0.328256,0.744588,0.625053,0.737855,0.618578,0.152264,21
5,0.003124,0.006248474,0.012497,0.006248522,squared_error,log2,random,"{'criterion': 'squared_error', 'max_features':...",0.676487,0.558988,0.635593,0.699067,0.732004,0.660428,0.059634,14
6,0.006249,0.007653254,0.009373,0.007653293,friedman_mse,auto,best,"{'criterion': 'friedman_mse', 'max_features': ...",0.711312,0.652715,0.733786,0.745964,0.691831,0.707121,0.032958,4
7,0.009373,0.007652787,0.00625,0.007654247,friedman_mse,auto,random,"{'criterion': 'friedman_mse', 'max_features': ...",0.6584,0.548618,0.71113,0.719795,0.699973,0.667583,0.063101,11
8,0.009374,0.007653526,0.009373,0.007653137,friedman_mse,sqrt,best,"{'criterion': 'friedman_mse', 'max_features': ...",0.706115,0.667815,0.553379,0.744395,0.725138,0.679369,0.06789,9
9,0.006249,0.007653371,0.009374,0.007654149,friedman_mse,sqrt,random,"{'criterion': 'friedman_mse', 'max_features': ...",0.622117,0.580069,0.729557,0.738151,0.583955,0.65077,0.069465,16


In [12]:
age_input= float(input("Age:"))
bmi_input = float(input("BMI:"))
children_input = float(input("Children:"))
sex_male_input = int(input("Sex Male 0 or 1:"))
smoker_yes_input = int(input("Smoker Yes 0 or 1:"))

Age:32
BMI:43
Children:2
Sex Male 0 or 1:0
Smoker Yes 0 or 1:1


In [13]:
Future_Prediction=grid.predict([[age_input,bmi_input,children_input,sex_male_input,smoker_yes_input]])
print("Future Prediction {}:".format(Future_Prediction))

Future Prediction [40932.4295]:


  "X does not have valid feature names, but"
