### Random Forest Regression - Medical Insurance

In [18]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeRegressor, plot_tree
from sklearn.ensemble import RandomForestRegressor

from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
df = pd.read_csv('medical_insurance.csv')
df

Unnamed: 0,age,gender,bmi,children,smoker,region,charges
0,19,female,27.900,0,yes,southwest,16884.92400
1,18,male,33.770,1,no,southeast,1725.55230
2,28,male,33.000,3,no,southeast,4449.46200
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.880,0,no,northwest,3866.85520
...,...,...,...,...,...,...,...
1333,50,male,30.970,3,no,northwest,10600.54830
1334,18,female,31.920,0,no,northeast,2205.98080
1335,18,female,36.850,0,no,southeast,1629.83350
1336,21,female,25.800,0,no,southwest,2007.94500


### EDA

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1338 non-null   int64  
 1   gender    1338 non-null   object 
 2   bmi       1338 non-null   float64
 3   children  1338 non-null   int64  
 4   smoker    1338 non-null   object 
 5   region    1338 non-null   object 
 6   charges   1338 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 73.3+ KB


In [4]:
df['gender'].value_counts()

gender
male      676
female    662
Name: count, dtype: int64

In [5]:
df['gender'].value_counts().to_dict()

{'male': 676, 'female': 662}

In [6]:
df['gender'].replace({'male': 1, 'female': 0}, inplace = True)

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1338 non-null   int64  
 1   gender    1338 non-null   int64  
 2   bmi       1338 non-null   float64
 3   children  1338 non-null   int64  
 4   smoker    1338 non-null   object 
 5   region    1338 non-null   object 
 6   charges   1338 non-null   float64
dtypes: float64(2), int64(3), object(2)
memory usage: 73.3+ KB


In [8]:
df['smoker'].value_counts()

smoker
no     1064
yes     274
Name: count, dtype: int64

In [9]:
df['smoker'].replace({'no':0, 'yes':1}, inplace = True)

In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1338 non-null   int64  
 1   gender    1338 non-null   int64  
 2   bmi       1338 non-null   float64
 3   children  1338 non-null   int64  
 4   smoker    1338 non-null   int64  
 5   region    1338 non-null   object 
 6   charges   1338 non-null   float64
dtypes: float64(2), int64(4), object(1)
memory usage: 73.3+ KB


In [11]:
df['region'].value_counts()

region
southeast    364
southwest    325
northwest    325
northeast    324
Name: count, dtype: int64

In [12]:
pd.get_dummies(df['region'], dtype = int)

Unnamed: 0,northeast,northwest,southeast,southwest
0,0,0,0,1
1,0,0,1,0
2,0,0,1,0
3,0,1,0,0
4,0,1,0,0
...,...,...,...,...
1333,0,1,0,0
1334,1,0,0,0
1335,0,0,1,0
1336,0,0,0,1


In [13]:
df = pd.get_dummies(df, columns = ['region'], dtype = int)

In [14]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 10 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   age               1338 non-null   int64  
 1   gender            1338 non-null   int64  
 2   bmi               1338 non-null   float64
 3   children          1338 non-null   int64  
 4   smoker            1338 non-null   int64  
 5   charges           1338 non-null   float64
 6   region_northeast  1338 non-null   int32  
 7   region_northwest  1338 non-null   int32  
 8   region_southeast  1338 non-null   int32  
 9   region_southwest  1338 non-null   int32  
dtypes: float64(2), int32(4), int64(4)
memory usage: 83.8 KB


### 4. Feature Selection

In [15]:
Embedded Method: 
    Tree Based model Feature Importance

SyntaxError: invalid syntax (3616170025.py, line 1)

### 5. Model Training

In [16]:
x = df.drop('charges', axis = 1)
y = df.charges
x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=0.25, random_state=0)
x_train.shape

(1003, 9)

In [17]:
x_test.shape

(335, 9)

In [20]:
rf_reg = RandomForestRegressor(random_state = 1)
rf_reg.fit(x_train, y_train)

In [21]:
rf_reg.feature_importances_

array([0.13889898, 0.00682488, 0.20957303, 0.02152079, 0.60324082,
       0.00515951, 0.00631193, 0.0046453 , 0.00382476])

### 6. Model Evaluation 

In [22]:
# Train Data Evaluation
y_pred_train = rf_reg.predict(x_train)

y_pred_train[50:55]

array([ 2193.4127275, 11347.2263871,  7114.2981089,  6114.5256324,
        6699.9614502])

In [23]:
y_train[50:55]

1334     2205.98080
16      10797.33620
861      7151.09200
432      5969.72300
905      4564.19145
Name: charges, dtype: float64

In [24]:
# Train Data Evaluation
y_pred_train = rf_reg.predict(x_train)

mse = mean_squared_error(y_train, y_pred_train)
print("MSE :",mse)

rmse = np.sqrt(mse)
print("RMSE :",rmse)

mae = mean_absolute_error(y_train, y_pred_train)
print('MAE :',mae)

r2_value = r2_score(y_train, y_pred_train)
print("R-squared Value :",r2_value)

MSE : 3686864.822630927
RMSE : 1920.1210437446196
MAE : 1070.8610148932448
R-squared Value : 0.9741984325427527


In [25]:
# Test Data Evaluation
y_pred = dt_reg.predict(x_test)

mse = mean_squared_error(y_test, y_pred)
print("MSE :",mse)

rmse = np.sqrt(mse)
print("RMSE :",rmse)

mae = mean_absolute_error(y_test, y_pred)
print('MAE :',mae)

r2_value = r2_score(y_test, y_pred)
print("R-squared Value :",r2_value)

MSE : 46579848.7166457
RMSE : 6824.943129187649
MAE : 3185.511135698508
R-squared Value : 0.7041319092556013


### Hyperparameter Tuning

In [None]:
rf_reg = RandomForestRegressor(max_features='sqrt',random_state=0,
                               bootstrap=True,
                               oob_score=True,
                              n_jobs = -1)
hyp_grid = {"n_estimators":np.arange(10,200,10),
            "criterion" : ['squared_error','absolute_error'],
            "max_depth" : np.arange(4,8),
            "min_samples_split" : range(10,15),
            "min_samples_leaf" : range(6,10)}

gscv_rf_reg = GridSearchCV(rf_reg, hyp_grid , cv = 3, n_jobs=-1)
gscv_rf_reg.fit(x_train, y_train)

In [None]:
190 * 6 * 7 * 6 * 2

In [None]:
gscv_rf_reg.best_estimator_

In [None]:
# Train Data Evaluation
rf_reg = gscv_rf_reg.best_estimator_
rf_reg.fit(x_train, y_train)

y_pred_train = rf_reg.predict(x_train)

mse = mean_squared_error(y_train, y_pred_train)
print("MSE :",mse)

rmse = np.sqrt(mse)
print("RMSE :",rmse)

mae = mean_absolute_error(y_train, y_pred_train)
print('MAE :',mae)

r2_value = r2_score(y_train, y_pred_train)
print("R-squared Value :",r2_value)

In [None]:
# Test Data Evaluation
y_pred = rf_reg.predict(x_test)

mse = mean_squared_error(y_test, y_pred)
print("MSE :",mse)

rmse = np.sqrt(mse)
print("RMSE :",rmse)

mae = mean_absolute_error(y_test, y_pred)
print('MAE :',mae)

r2_value = r2_score(y_test, y_pred)
print("R-squared Value :",r2_value)