In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

In [2]:
df = pd.read_csv(r'E:\datasets_insurance.csv')

In [3]:
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [4]:
df.shape

(1338, 7)

In [5]:
df.describe()

Unnamed: 0,age,bmi,children,charges
count,1338.0,1338.0,1338.0,1338.0
mean,39.207025,30.663397,1.094918,13270.422265
std,14.04996,6.098187,1.205493,12110.011237
min,18.0,15.96,0.0,1121.8739
25%,27.0,26.29625,0.0,4740.28715
50%,39.0,30.4,1.0,9382.033
75%,51.0,34.69375,2.0,16639.912515
max,64.0,53.13,5.0,63770.42801


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1338 non-null   int64  
 1   sex       1338 non-null   object 
 2   bmi       1338 non-null   float64
 3   children  1338 non-null   int64  
 4   smoker    1338 non-null   object 
 5   region    1338 non-null   object 
 6   charges   1338 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 73.3+ KB


In [7]:
df.isna().sum()
#no NaN values

age         0
sex         0
bmi         0
children    0
smoker      0
region      0
charges     0
dtype: int64

In [34]:
df1 = df.copy()
x = df1.iloc[:, :6]
y = df1.iloc[:, 6]

In [35]:
x

Unnamed: 0,age,sex,bmi,children,smoker,region
0,19,female,27.900,0,yes,southwest
1,18,male,33.770,1,no,southeast
2,28,male,33.000,3,no,southeast
3,33,male,22.705,0,no,northwest
4,32,male,28.880,0,no,northwest
...,...,...,...,...,...,...
1333,50,male,30.970,3,no,northwest
1334,18,female,31.920,0,no,northeast
1335,18,female,36.850,0,no,southeast
1336,21,female,25.800,0,no,southwest


In [36]:
x['sex'].unique()

array(['female', 'male'], dtype=object)

In [37]:
df1.corr().round(2)

Unnamed: 0,age,bmi,children,charges
age,1.0,0.11,0.04,0.3
bmi,0.11,1.0,0.01,0.2
children,0.04,0.01,1.0,0.07
charges,0.3,0.2,0.07,1.0


In [38]:
from sklearn.preprocessing import LabelEncoder
x[['sex']] = x[['sex']].apply(LabelEncoder().fit_transform)

In [39]:
x[['smoker']] = x[['smoker']].apply(LabelEncoder().fit_transform)
x[['region']] = x[['region']].apply(LabelEncoder().fit_transform)

In [40]:
x

Unnamed: 0,age,sex,bmi,children,smoker,region
0,19,0,27.900,0,1,3
1,18,1,33.770,1,0,2
2,28,1,33.000,3,0,2
3,33,1,22.705,0,0,1
4,32,1,28.880,0,0,1
...,...,...,...,...,...,...
1333,50,1,30.970,3,0,1
1334,18,0,31.920,0,0,0
1335,18,0,36.850,0,0,2
1336,21,0,25.800,0,0,3


In [49]:
x = x.values
y = y.values

In [50]:
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(x,y, test_size=0.3, random_state=0)

In [51]:
len(x_train), len(x_test), len(y_train), len(y_test)

(936, 402, 936, 402)

##### applying linear regression

In [52]:
from sklearn.linear_model import LinearRegression
lr_model = LinearRegression()
lr_model.fit(x_train, y_train)

LinearRegression()

In [54]:
y_pred_lr = lr_model.predict(x_test)

In [55]:
from sklearn.metrics import r2_score, mean_squared_error

In [60]:
r2_lr = r2_score(y_test, y_pred_lr)
r2_lr

0.7911113876316933

In [57]:
y.mean()

13270.422265141257

In [80]:
rmse_lr = (mean_squared_error(y_test, y_pred_lr))**0.5
rmse_lr

5771.599022962351

In [70]:
from sklearn.metrics import mean_absolute_error
mae_lr = mean_absolute_error(y_test, y_pred_lr)
mae_lr

4013.6929857812042

In [81]:
metric_dict = {}

metric_dict['linear_regression_model'] = {'r2_score':r2_lr, 'rmse':rmse_lr, 'mae':mae_lr}

In [82]:
metric_dict

{'linear_regression_model': {'r2_score': 0.7911113876316933,
  'rmse': 5771.599022962351,
  'mae': 4013.6929857812042}}

###### applying decision tree

In [65]:
from sklearn.tree import DecisionTreeRegressor
dt_model = DecisionTreeRegressor()
dt_model.fit(x_train, y_train)

DecisionTreeRegressor()

In [66]:
y_pred_dt = dt_model.predict(x_test)

In [68]:
r2_dt = r2_score(y_test, y_pred_dt)
r2_dt

0.7222824954039493

In [83]:
rmse_dt = (mean_squared_error(y_test, y_pred_dt))**0.5
rmse_dt

6654.88256166136

In [73]:
mae_dt = mean_absolute_error(y_test, y_pred_dt)
mae_dt

3088.354669624378

In [84]:
metric_dict['decision_tree_model'] = {'r2_score':r2_dt, 'rmse':rmse_dt, 'mae':mae_dt}

##### applying random forest

In [77]:
from sklearn.ensemble import RandomForestRegressor
rf_model = RandomForestRegressor(n_estimators=10)
rf_model.fit(x_train, y_train)

RandomForestRegressor(n_estimators=10)

In [78]:
y_pred_rf = rf_model.predict(x_test)

In [79]:
r2_rf = r2_score(y_test, y_pred_rf)
r2_rf

0.8396626009984314

In [85]:
rmse_rf = (mean_squared_error(y_test, y_pred_rf))**0.5
rmse_rf

5056.571887736226

In [87]:
mae_rf = mean_absolute_error(y_test, y_pred_rf)
mae_rf

2969.8070102027364

In [88]:
metric_dict['random_forest_model'] = {'r2_score':r2_rf, 'rmse':rmse_rf, 'mae':mae_rf}

In [89]:
metric_dict

{'linear_regression_model': {'r2_score': 0.7911113876316933,
  'rmse': 5771.599022962351,
  'mae': 4013.6929857812042},
 'decision_tree_model': {'r2_score': 0.7222824954039493,
  'rmse': 6654.88256166136,
  'mae': 3088.354669624378},
 'random_forest_model': {'r2_score': 0.8396626009984314,
  'rmse': 5056.571887736226,
  'mae': 2969.8070102027364}}

#### applying grid search to check if we can improve accuracy

In [94]:
sklearn.metrics.SCORERS.keys()

dict_keys(['explained_variance', 'r2', 'max_error', 'neg_median_absolute_error', 'neg_mean_absolute_error', 'neg_mean_squared_error', 'neg_mean_squared_log_error', 'neg_root_mean_squared_error', 'neg_mean_poisson_deviance', 'neg_mean_gamma_deviance', 'accuracy', 'roc_auc', 'roc_auc_ovr', 'roc_auc_ovo', 'roc_auc_ovr_weighted', 'roc_auc_ovo_weighted', 'balanced_accuracy', 'average_precision', 'neg_log_loss', 'neg_brier_score', 'adjusted_rand_score', 'homogeneity_score', 'completeness_score', 'v_measure_score', 'mutual_info_score', 'adjusted_mutual_info_score', 'normalized_mutual_info_score', 'fowlkes_mallows_score', 'precision', 'precision_macro', 'precision_micro', 'precision_samples', 'precision_weighted', 'recall', 'recall_macro', 'recall_micro', 'recall_samples', 'recall_weighted', 'f1', 'f1_macro', 'f1_micro', 'f1_samples', 'f1_weighted', 'jaccard', 'jaccard_macro', 'jaccard_micro', 'jaccard_samples', 'jaccard_weighted'])

In [95]:
from sklearn.model_selection import GridSearchCV
param = [{'n_estimators':list(range(1,20))}]

grid_search = GridSearchCV(estimator=rf_model, param_grid=param, scoring='r2', n_jobs=-1)
grid_search.fit(x_train, y_train)
best_r2 = grid_search.best_score_
best_param = grid_search.best_params_

In [96]:
best_r2

0.8070121147148344

In [97]:
best_param

{'n_estimators': 11}

In [101]:
pd.concat([pd.DataFrame(y_test), pd.DataFrame(y_pred_rf)], axis=1)

Unnamed: 0,0,0.1
0,9724.53000,11550.187514
1,8547.69130,11024.268252
2,45702.02235,43808.717815
3,12950.07120,13183.533615
4,9644.25250,9599.272010
...,...,...
397,3277.16100,6529.559452
398,17942.10600,17082.140460
399,10226.28420,14820.577143
400,14418.28040,18135.608859
