In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV

In [3]:
df=pd.read_csv('datasets_13720_18513_insurance.csv')

In [4]:
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [9]:
## Checking if null values present
df.isnull().sum()

age         0
sex         0
bmi         0
children    0
smoker      0
region      0
charges     0
dtype: int64

In [11]:
#Dtale tool for EDA
import dtale
dtale.show(df)



In [12]:
#Splitting X and y
X=df.iloc[:,:-1]
y=df.iloc[:,6:]

In [13]:
dummy=pd.get_dummies(X[['sex','smoker']],drop_first=True)
X['Sex']=dummy.iloc[:,0].values
X['Smoker']=dummy.iloc[:,1].values

In [14]:
dict={'southeast':0,'northwest':1,'southwest':2,'northeast':3}
X['Region']=X['region'].map(dict)

In [15]:
X.drop(['sex','smoker','region'],axis=1,inplace=True)

In [16]:
X

Unnamed: 0,age,bmi,children,Sex,Smoker,Region
0,19,27.900,0,0,1,2
1,18,33.770,1,1,0,0
2,28,33.000,3,1,0,0
3,33,22.705,0,1,0,1
4,32,28.880,0,1,0,1
...,...,...,...,...,...,...
1333,50,30.970,3,1,0,1
1334,18,31.920,0,0,0,3
1335,18,36.850,0,0,0,0
1336,21,25.800,0,0,0,2


In [17]:
y

Unnamed: 0,charges
0,16884.92400
1,1725.55230
2,4449.46200
3,21984.47061
4,3866.85520
...,...
1333,10600.54830
1334,2205.98080
1335,1629.83350
1336,2007.94500


In [18]:
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=0)

In [19]:
params={'n_estimators':[50,100,200,300],'criterion':["mse", "mae"],
        'max_depth':[1,2,3,4,5,6,7,8],'min_samples_split':[2,4,6,8,10],
        'min_samples_leaf':[1,2,3,4,5,6,7]}
regressor=RandomForestRegressor()
grid=GridSearchCV(regressor,param_grid=params,n_jobs=-1,cv=5,verbose=10)
grid.fit(X_train,y_train)
grid

Fitting 5 folds for each of 2240 candidates, totalling 11200 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    3.1s
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:    3.6s
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:    4.4s
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:    5.0s
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    6.0s
[Parallel(n_jobs=-1)]: Done  45 tasks      | elapsed:    6.8s
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:    7.9s
[Parallel(n_jobs=-1)]: Done  69 tasks      | elapsed:    8.8s
[Parallel(n_jobs=-1)]: Done  82 tasks      | elapsed:   10.0s
[Parallel(n_jobs=-1)]: Done  97 tasks      | elapsed:   11.5s
[Parallel(n_jobs=-1)]: Done 112 tasks      | elapsed:   12.7s
[Parallel(n_jobs=-1)]: Done 129 tasks      | elapsed:   14.0s
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed:   15.6s
[Parallel(n_jobs=-1)]: Done 165 tasks      | elapsed:   17.2s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:   

[Parallel(n_jobs=-1)]: Done 9232 tasks      | elapsed: 69.4min
[Parallel(n_jobs=-1)]: Done 9369 tasks      | elapsed: 70.5min
[Parallel(n_jobs=-1)]: Done 9506 tasks      | elapsed: 71.9min
[Parallel(n_jobs=-1)]: Done 9645 tasks      | elapsed: 73.4min
[Parallel(n_jobs=-1)]: Done 9784 tasks      | elapsed: 74.9min
[Parallel(n_jobs=-1)]: Done 9925 tasks      | elapsed: 76.6min
[Parallel(n_jobs=-1)]: Done 10066 tasks      | elapsed: 78.2min
[Parallel(n_jobs=-1)]: Done 10209 tasks      | elapsed: 79.8min
[Parallel(n_jobs=-1)]: Done 10352 tasks      | elapsed: 81.4min
[Parallel(n_jobs=-1)]: Done 10497 tasks      | elapsed: 83.0min
[Parallel(n_jobs=-1)]: Done 10642 tasks      | elapsed: 84.8min
[Parallel(n_jobs=-1)]: Done 10789 tasks      | elapsed: 86.5min
[Parallel(n_jobs=-1)]: Done 10936 tasks      | elapsed: 88.3min
[Parallel(n_jobs=-1)]: Done 11085 tasks      | elapsed: 90.0min
[Parallel(n_jobs=-1)]: Done 11200 out of 11200 | elapsed: 91.5min finished

A column-vector y was passed when 

GridSearchCV(cv=5, error_score=nan,
             estimator=RandomForestRegressor(bootstrap=True, ccp_alpha=0.0,
                                             criterion='mse', max_depth=None,
                                             max_features='auto',
                                             max_leaf_nodes=None,
                                             max_samples=None,
                                             min_impurity_decrease=0.0,
                                             min_impurity_split=None,
                                             min_samples_leaf=1,
                                             min_samples_split=2,
                                             min_weight_fraction_leaf=0.0,
                                             n_estimators=100, n_jobs=None,
                                             oob_score=False, random_state=None,
                                             verbose=0, warm_start=False),
             iid='deprecated', n_jo

In [44]:
grid.best_params_

{'criterion': 'mse',
 'max_depth': 4,
 'min_samples_leaf': 4,
 'min_samples_split': 2,
 'n_estimators': 100}

In [21]:
y_pred1=grid.predict(X_train)
mse=mean_squared_error(y_train,y_pred1)
print(mse)

19346900.069736782


In [22]:
y_pred=grid.predict(X_test)
mse=mean_squared_error(y_test,y_pred)
print(mse)

15909103.676038329


In [36]:
import math
rmse=math.sqrt(mse)
print(rmse)

3988.6217765085635
Executing shutdown...


2020-10-29 22:12:31,108 - INFO     - Executing shutdown...


In [38]:
#Saving the model
import pickle

In [47]:
pickle.dump(grid, open('model.pkl','wb'))

In [46]:
import requests
url = 'https://pycaret-insurance.herokuapp.com/predict_api'
pred = requests.post(url,json={'age':55, 'sex':'male', 'bmi':59, 'children':1, 'smoker':'male', 'region':'northwest'})
print(pred.json())

75714.0
