## Decision Tree Algorithm using Regression on Insurance dataset to predict EMI Charges

Decision tree is a huge grand nested if else structure

In [5]:
import pandas as pd

In [7]:
df = pd.read_csv('insurance.csv')
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [11]:
#Checking NULL Values
df.isnull().sum()

age         0
sex         0
bmi         0
children    0
smoker      0
region      0
charges     0
dtype: int64

In [17]:
#Checking Duplicate Values
df.duplicated().sum()

1

In [19]:
df.drop_duplicates(inplace=True)

In [21]:
df.duplicated().sum()

0

In [25]:
#Checking the type of the data
df.dtypes

age           int64
sex          object
bmi         float64
children      int64
smoker       object
region       object
charges     float64
dtype: object

In [27]:
df2 = pd.get_dummies(df,drop_first='if_binary').astype('int')
df2.head()

Unnamed: 0,age,bmi,children,charges,sex_male,smoker_yes,region_northwest,region_southeast,region_southwest
0,19,27,0,16884,0,1,0,0,1
1,18,33,1,1725,1,0,0,1,0
2,28,33,3,4449,1,0,0,1,0
3,33,22,0,21984,1,0,1,0,0
4,32,28,0,3866,1,0,1,0,0


In [33]:
#Splitting the data into X and Y
x = df2.drop('charges',axis='columns')
y = df2[['charges']]

In [35]:
from sklearn.model_selection import train_test_split

In [37]:
x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=0.2)

In [39]:
from sklearn.tree import DecisionTreeRegressor

In [41]:
dtr = DecisionTreeRegressor()

In [43]:
dtr.fit(x_train,y_train)

In [45]:
#Evaluation of Model
dtr.score(x_train,y_train)

0.9986051803764089

In [47]:
dtr.score(x_test,y_test)

0.6590488689679904

In [49]:
#Predictions made by Algorithm for training data
pred_train = dtr.predict(x_train)
y_train['pred_charges'] = pred_train

In [51]:
y_train

Unnamed: 0,charges,pred_charges
272,7265,7265.0
402,14692,14692.0
362,13844,13844.0
1188,21771,21771.0
24,6203,6203.0
...,...,...
274,2523,2523.0
161,36149,36149.0
1172,11093,11093.0
797,4719,4719.0


In [55]:
#Predictions made by Algorithm for testing data
pred_test = dtr.predict(x_test)
y_test['pred_charges'] = pred_test

In [57]:
y_test

Unnamed: 0,charges,pred_charges
1298,5261,5257.0
1174,4433,4433.0
473,20878,12797.0
219,25081,3046.0
1125,14254,14256.0
...,...,...
119,6686,6933.0
291,20277,3947.0
595,8823,8023.0
1281,24535,23401.0


In [61]:
#This model is an overfit model as the training score is very high and testing score is low

In [63]:
#Saving the model

In [65]:
import joblib

In [67]:
joblib.dump(dtr,'decision_tree_regression_insurance.lb')

['decision_tree_regression_insurance.lb']