In [1]:
import pandas as pd #import pandas library to handle file, database

In [2]:
# Load the insurance records from the CSV file into a DataFrame.
csv_path = "insurance_pre.csv"
dataset = pd.read_csv(csv_path)

In [3]:
# Display the freshly loaded frame to inspect its columns and sample rows.
dataset

Unnamed: 0,age,sex,bmi,children,smoker,charges
0,19,female,27.900,0,yes,16884.92400
1,18,male,33.770,1,no,1725.55230
2,28,male,33.000,3,no,4449.46200
3,33,male,22.705,0,no,21984.47061
4,32,male,28.880,0,no,3866.85520
...,...,...,...,...,...,...
1333,50,male,30.970,3,no,10600.54830
1334,18,female,31.920,0,no,2205.98080
1335,18,female,36.850,0,no,1629.83350
1336,21,female,25.800,0,no,2007.94500


In [7]:
# Data preprocessing: the categorical (nominal) columns must become numeric before
# modelling, so one-hot encode them. drop_first=True keeps one indicator column
# per two-level category instead of two redundant ones.
dataset = pd.get_dummies(data=dataset, drop_first=True)

In [9]:
# Display the frame after one-hot encoding to check the new indicator columns.
dataset

Unnamed: 0,age,bmi,children,charges,sex_male,smoker_yes
0,19,27.900,0,16884.92400,False,True
1,18,33.770,1,1725.55230,True,False
2,28,33.000,3,4449.46200,True,False
3,33,22.705,0,21984.47061,True,False
4,32,28.880,0,3866.85520,True,False
...,...,...,...,...,...,...
1333,50,30.970,3,10600.54830,True,False
1334,18,31.920,0,2205.98080,False,False
1335,18,36.850,0,1629.83350,False,False
1336,21,25.800,0,2007.94500,False,False


In [11]:
# Collect the labels of every boolean-typed column in the dataset.
boolean_columns = dataset.select_dtypes(include='bool').columns
print(boolean_columns)

# Cast those columns to integers so True/False become 1/0, then write them back.
as_integers = dataset[boolean_columns].astype(int)
dataset[boolean_columns] = as_integers
print(dataset[boolean_columns])

Index(['sex_male', 'smoker_yes'], dtype='object')
      sex_male  smoker_yes
0            0           1
1            1           0
2            1           0
3            1           0
4            1           0
...        ...         ...
1333         1           0
1334         0           0
1335         0           0
1336         0           0
1337         0           1

[1338 rows x 2 columns]


In [13]:
# Display the frame again to confirm the indicator columns now hold 0/1 integers.
dataset

Unnamed: 0,age,bmi,children,charges,sex_male,smoker_yes
0,19,27.900,0,16884.92400,0,1
1,18,33.770,1,1725.55230,1,0
2,28,33.000,3,4449.46200,1,0
3,33,22.705,0,21984.47061,1,0
4,32,28.880,0,3866.85520,1,0
...,...,...,...,...,...,...
1333,50,30.970,3,10600.54830,1,0
1334,18,31.920,0,2205.98080,0,0
1335,18,36.850,0,1629.83350,0,0
1336,21,25.800,0,2007.94500,0,0


In [15]:
# List the column labels; the next cells pick the inputs and the output from them.
dataset.columns

Index(['age', 'bmi', 'children', 'charges', 'sex_male', 'smoker_yes'], dtype='object')

In [17]:
# Input (independent) variables: every column except the 'charges' target.
feature_columns = ['age', 'bmi', 'children', 'sex_male', 'smoker_yes']
independent = dataset[feature_columns]

In [19]:
# Display the input-variable frame.
independent

Unnamed: 0,age,bmi,children,sex_male,smoker_yes
0,19,27.900,0,0,1
1,18,33.770,1,1,0
2,28,33.000,3,1,0
3,33,22.705,0,1,0
4,32,28.880,0,1,0
...,...,...,...,...,...
1333,50,30.970,3,1,0
1334,18,31.920,0,0,0
1335,18,36.850,0,0,0
1336,21,25.800,0,0,0


In [21]:
# Output (dependent) variable, kept as a one-column DataFrame.
target_columns = ['charges']
dependent = dataset[target_columns]

In [23]:
# Display the output-variable frame.
dependent

Unnamed: 0,charges
0,16884.92400
1,1725.55230
2,4449.46200
3,21984.47061
4,3866.85520
...,...
1333,10600.54830
1334,2205.98080
1335,1629.83350
1336,2007.94500


In [25]:
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeRegressor

# Hyper-parameter candidates: 4 criteria x 2 splitters x 2 max_features settings
# = 16 combinations, each scored with cross-validation.
param_grid = {
    'criterion': ['squared_error', 'friedman_mse', 'absolute_error', 'poisson'],
    'splitter': ['best', 'random'],
    'max_features': [None, 'sqrt'],
}

# Fix the tree's random_state: splitter='random' and max_features='sqrt' draw
# random numbers, so without a seed the scores and the selected best parameters
# change from one run of the notebook to the next.
grid = GridSearchCV(
    DecisionTreeRegressor(random_state=0),
    param_grid,
    refit=True,   # refit the best candidate on all the data passed to fit()
    verbose=3,
    n_jobs=-1,    # use every available CPU core
    scoring='r2',
)

# The search is fitted on the full dataset (no separate hold-out split), so the
# cross-validated best_score_ is the only out-of-sample estimate available.
grid.fit(independent, dependent)

Fitting 5 folds for each of 16 candidates, totalling 80 fits


In [27]:
from sklearn.metrics import r2_score  # evaluation metric

# Keep the raw cross-validation results; a later cell turns them into a table.
re = grid.cv_results_

# Predict on the same rows the search was fitted on. The R^2 printed below is
# therefore a training-set score and is expected to be more optimistic than the
# cross-validated best_score_ reported after it.
grid_pred = grid.predict(independent)
r_score = r2_score(y_true=dependent, y_pred=grid_pred)
print("R score value is : ", r_score)

# Mean cross-validated score of the winning candidate, and its parameters.
print("Best Score:", grid.best_score_)
print("Best Parameters:", grid.best_params_)

R score value is :  0.998667156135576
Best Score: 0.7093405606128359
Best Parameters: {'criterion': 'poisson', 'max_features': None, 'splitter': 'best'}


In [29]:
# Turn the cv_results_ dictionary into a DataFrame (one row per candidate).
table = pd.DataFrame(re)

In [31]:
# Display the per-candidate cross-validation results.
table

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_criterion,param_max_features,param_splitter,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.023133,0.00574,0.010933,0.006759,squared_error,,best,"{'criterion': 'squared_error', 'max_features':...",0.725038,0.657537,0.715526,0.693345,0.658763,0.690042,0.028,6
1,0.021828,0.01038,0.007988,0.006018,squared_error,,random,"{'criterion': 'squared_error', 'max_features':...",0.678822,0.549057,0.695683,0.70325,0.753527,0.676068,0.06821,8
2,0.017431,0.008066,0.009612,0.003611,squared_error,sqrt,best,"{'criterion': 'squared_error', 'max_features':...",0.734932,0.56052,0.756006,0.682249,0.73747,0.694235,0.071224,4
3,0.014949,0.005858,0.008897,0.005628,squared_error,sqrt,random,"{'criterion': 'squared_error', 'max_features':...",0.743532,0.502015,0.712094,0.668956,0.587398,0.642799,0.087775,12
4,0.015248,0.004708,0.004479,0.00261,friedman_mse,,best,"{'criterion': 'friedman_mse', 'max_features': ...",0.720725,0.651143,0.730525,0.708997,0.675866,0.697451,0.029595,3
5,0.009848,0.003409,0.010512,0.001947,friedman_mse,,random,"{'criterion': 'friedman_mse', 'max_features': ...",0.677931,0.634059,0.636454,0.720737,0.700913,0.674019,0.034436,9
6,0.013453,0.00389,0.005793,0.002871,friedman_mse,sqrt,best,"{'criterion': 'friedman_mse', 'max_features': ...",0.597275,0.586649,0.704431,0.774839,0.468335,0.626306,0.10539,14
7,0.016269,0.010814,0.011486,0.003824,friedman_mse,sqrt,random,"{'criterion': 'friedman_mse', 'max_features': ...",0.63621,0.547983,0.571684,0.587901,0.555089,0.579773,0.031421,16
8,0.083207,0.019014,0.007402,0.004065,absolute_error,,best,"{'criterion': 'absolute_error', 'max_features'...",0.743261,0.587522,0.717544,0.723097,0.747081,0.703701,0.059182,2
9,0.058523,0.005801,0.003809,0.003867,absolute_error,,random,"{'criterion': 'absolute_error', 'max_features'...",0.703209,0.659247,0.71171,0.721512,0.674884,0.694113,0.02336,5


In [33]:
def _read_binary_flag(prompt):
    """Prompt for an integer and accept it only if it is 0 or 1.

    Raises:
        ValueError: if the entered text is not an integer, or is an integer
            other than 0 or 1.
    """
    value = int(input(prompt))
    if value not in (0, 1):
        raise ValueError("Expected 0 or 1, got {}".format(value))
    return value


# Collect one customer's details for a prediction.
age = int(input("Enter the age:"))
bmi = float(input("Enter the BMI:"))
children = int(input("Enter the children number:"))
# sex and smoker feed the sex_male / smoker_yes indicator columns, which only
# ever hold 0 or 1 in the training data, so reject anything else up front
# instead of silently predicting from a mistyped value.
sex = _read_binary_flag("Enter sex (0 for female and 1 for male):")
smoker = _read_binary_flag("Enter 0 for non smoker and 1 for smoker:")

Enter the age: 26
Enter the BMI: 20
Enter the children number: 0
Enter sex (0 for female and 1 for male): 0
Enter 0 for non smoker and 1 for smoker: 1


In [35]:
# Wrap the inputs in a one-row DataFrame whose column names and order match the
# frame the model was fitted on. Passing a bare list makes scikit-learn warn
# that X has no valid feature names and relies purely on positional order.
new_customer = pd.DataFrame(
    [[age, bmi, children, sex, smoker]],
    columns=['age', 'bmi', 'children', 'sex_male', 'smoker_yes'],
)
Final_prediction = grid.predict(new_customer)
print("Final_prediction of DT: {}".format(Final_prediction))

Final_prediction of DT: [14571.8908]




In [37]:
import pickle

fileName = "DT_Grid_Insurance.sav"

# Persist the fitted grid search. The with-block guarantees the file is flushed
# and closed even if pickling raises part-way through.
with open(fileName, 'wb') as model_file:
    pickle.dump(grid, model_file)

In [39]:
# Reload the saved model; the with-block closes the file handle afterwards.
# NOTE: pickle.load can execute arbitrary code, so only load files you created
# yourself or otherwise trust.
with open(fileName, 'rb') as model_file:
    load_model = pickle.load(model_file)

# Predict for a sample customer, using named columns in the same order as the
# training frame so scikit-learn can check the features line up.
sample_customer = pd.DataFrame(
    [[30, 25.899, 2, 1, 1]],
    columns=['age', 'bmi', 'children', 'sex_male', 'smoker_yes'],
)
result = load_model.predict(sample_customer)
result



array([18972.495])