In [1]:
import numpy as np
import pandas as pd

In [2]:
dataset = pd.read_csv(r'C:\Users\LENOVO\Downloads\Datasets\insurance - Regression.csv')

In [3]:
df = dataset.copy()

In [4]:
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [5]:
df.shape

(1338, 7)

In [6]:
df.isna().sum()

age         0
sex         0
bmi         0
children    0
smoker      0
region      0
charges     0
dtype: int64

In [7]:
df.columns

Index(['age', 'sex', 'bmi', 'children', 'smoker', 'region', 'charges'], dtype='object')

In [8]:
df1 = pd.get_dummies(df, columns = ['sex', 'smoker', 'region'], drop_first = True)
df1.head()

Unnamed: 0,age,bmi,children,charges,sex_male,smoker_yes,region_northwest,region_southeast,region_southwest
0,19,27.9,0,16884.924,0,1,0,0,1
1,18,33.77,1,1725.5523,1,0,0,1,0
2,28,33.0,3,4449.462,1,0,0,1,0
3,33,22.705,0,21984.47061,1,0,1,0,0
4,32,28.88,0,3866.8552,1,0,1,0,0


In [9]:
X = df1.drop('charges', axis = 1)
y = df1.drop(['age', 'bmi', 'children', 'sex_male', 'smoker_yes', 'region_northwest', 
              'region_southeast', 'region_southwest'], axis = 1)

In [10]:
X.children.value_counts()

0    574
1    324
2    240
3    157
4     25
5     18
Name: children, dtype: int64

In [11]:
X

Unnamed: 0,age,bmi,children,sex_male,smoker_yes,region_northwest,region_southeast,region_southwest
0,19,27.900,0,0,1,0,0,1
1,18,33.770,1,1,0,0,1,0
2,28,33.000,3,1,0,0,1,0
3,33,22.705,0,1,0,1,0,0
4,32,28.880,0,1,0,1,0,0
...,...,...,...,...,...,...,...,...
1333,50,30.970,3,1,0,1,0,0
1334,18,31.920,0,0,0,0,0,0
1335,18,36.850,0,0,0,0,1,0
1336,21,25.800,0,0,0,0,0,1


In [12]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X[['age', 'bmi']] = sc.fit_transform(X[['age', 'bmi']])

In [13]:
X

Unnamed: 0,age,bmi,children,sex_male,smoker_yes,region_northwest,region_southeast,region_southwest
0,-1.438764,-0.453320,0,0,1,0,0,1
1,-1.509965,0.509621,1,1,0,0,1,0
2,-0.797954,0.383307,3,1,0,0,1,0
3,-0.441948,-1.305531,0,1,0,1,0,0
4,-0.513149,-0.292556,0,1,0,1,0,0
...,...,...,...,...,...,...,...,...
1333,0.768473,0.050297,3,1,0,1,0,0
1334,-1.509965,0.206139,0,0,0,0,0,0
1335,-1.509965,1.014878,0,0,0,0,1,0
1336,-1.296362,-0.797813,0,0,0,0,0,1


In [14]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size = .2, random_state = 42)

In [15]:
from sklearn.ensemble import RandomForestRegressor
rfr = RandomForestRegressor(n_estimators=100, random_state = 0)
rfr.fit(X_train, y_train)

  rfr.fit(X_train, y_train)


In [16]:
y_pred = rfr.predict(X_test)

In [17]:
y_test

Unnamed: 0,charges
764,9095.06825
887,5272.17580
890,29330.98315
1293,9301.89355
259,33750.29180
...,...
109,47055.53210
575,12222.89830
535,6067.12675
543,63770.42801


In [18]:
from sklearn.metrics import r2_score, mean_squared_error

In [19]:
r2_score(y_test, y_pred)

0.8640099135651829

In [20]:
importance = rfr.feature_importances_
feat_name = X.columns
tree_result = pd.DataFrame({'Feature':feat_name, 'importance':importance})
tree_result_sort = tree_result.sort_values(by = 'importance', ascending = False)

In [21]:
tree_result_sort

Unnamed: 0,Feature,importance
4,smoker_yes,0.610743
1,bmi,0.211784
0,age,0.133965
2,children,0.020734
3,sex_male,0.006783
5,region_northwest,0.005847
6,region_southeast,0.005693
7,region_southwest,0.004452


### Hyperparameter Tuning

In [22]:
from sklearn.ensemble import RandomForestRegressor
rfr = RandomForestRegressor(n_estimators=100, random_state = 0)

In [23]:
print (rfr.get_params())

{'bootstrap': True, 'ccp_alpha': 0.0, 'criterion': 'squared_error', 'max_depth': None, 'max_features': 1.0, 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 100, 'n_jobs': None, 'oob_score': False, 'random_state': 0, 'verbose': 0, 'warm_start': False}


### Random hyperparameter grid

In [24]:
from sklearn.model_selection import RandomizedSearchCV
n_estimator = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
max_features = ['auto', 'sqrt']
max_depth = [int(x) for x in np.linspace(start = 10, stop = 110, num = 10)]
max_depth.append(None)
min_samples_split = [2,5,10]
min_samples_leaf = [1,2,4]
bootstrap = [True, False]

random_grid = {'n_estimators': n_estimator,
              'max_features':max_features,
              'max_depth' : max_depth,
               'min_samples_split' :min_samples_split,
               'min_samples_leaf' : min_samples_leaf,
               'bootstrap' : bootstrap}

In [25]:
random_grid

{'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000],
 'max_features': ['auto', 'sqrt'],
 'max_depth': [10, 21, 32, 43, 54, 65, 76, 87, 98, 110, None],
 'min_samples_split': [2, 5, 10],
 'min_samples_leaf': [1, 2, 4],
 'bootstrap': [True, False]}

In [26]:
%%time
from sklearn.ensemble import RandomForestRegressor
rfr = RandomForestRegressor()
rf_random = RandomizedSearchCV(estimator = rfr, param_distributions = random_grid, n_iter=5, 
                               cv=3, verbose=2)
rf_random.fit(X_train, y_train)

Fitting 3 folds for each of 5 candidates, totalling 15 fits


  estimator.fit(X_train, y_train, **fit_params)


[CV] END bootstrap=True, max_depth=32, max_features=sqrt, min_samples_leaf=4, min_samples_split=10, n_estimators=1800; total time=   4.0s


  estimator.fit(X_train, y_train, **fit_params)


[CV] END bootstrap=True, max_depth=32, max_features=sqrt, min_samples_leaf=4, min_samples_split=10, n_estimators=1800; total time=   3.6s


  estimator.fit(X_train, y_train, **fit_params)


[CV] END bootstrap=True, max_depth=32, max_features=sqrt, min_samples_leaf=4, min_samples_split=10, n_estimators=1800; total time=   3.6s


  estimator.fit(X_train, y_train, **fit_params)


[CV] END bootstrap=True, max_depth=54, max_features=sqrt, min_samples_leaf=2, min_samples_split=5, n_estimators=2000; total time=   4.4s


  estimator.fit(X_train, y_train, **fit_params)


[CV] END bootstrap=True, max_depth=54, max_features=sqrt, min_samples_leaf=2, min_samples_split=5, n_estimators=2000; total time=   4.4s


  estimator.fit(X_train, y_train, **fit_params)


[CV] END bootstrap=True, max_depth=54, max_features=sqrt, min_samples_leaf=2, min_samples_split=5, n_estimators=2000; total time=   4.4s


  estimator.fit(X_train, y_train, **fit_params)
  warn(


[CV] END bootstrap=False, max_depth=21, max_features=auto, min_samples_leaf=4, min_samples_split=5, n_estimators=800; total time=   2.3s


  estimator.fit(X_train, y_train, **fit_params)
  warn(


[CV] END bootstrap=False, max_depth=21, max_features=auto, min_samples_leaf=4, min_samples_split=5, n_estimators=800; total time=   2.3s


  estimator.fit(X_train, y_train, **fit_params)
  warn(


[CV] END bootstrap=False, max_depth=21, max_features=auto, min_samples_leaf=4, min_samples_split=5, n_estimators=800; total time=   2.3s


  estimator.fit(X_train, y_train, **fit_params)


[CV] END bootstrap=False, max_depth=65, max_features=sqrt, min_samples_leaf=1, min_samples_split=10, n_estimators=800; total time=   1.4s


  estimator.fit(X_train, y_train, **fit_params)


[CV] END bootstrap=False, max_depth=65, max_features=sqrt, min_samples_leaf=1, min_samples_split=10, n_estimators=800; total time=   1.4s


  estimator.fit(X_train, y_train, **fit_params)


[CV] END bootstrap=False, max_depth=65, max_features=sqrt, min_samples_leaf=1, min_samples_split=10, n_estimators=800; total time=   1.4s


  estimator.fit(X_train, y_train, **fit_params)
  warn(


[CV] END bootstrap=False, max_depth=110, max_features=auto, min_samples_leaf=4, min_samples_split=5, n_estimators=1000; total time=   2.9s


  estimator.fit(X_train, y_train, **fit_params)
  warn(


[CV] END bootstrap=False, max_depth=110, max_features=auto, min_samples_leaf=4, min_samples_split=5, n_estimators=1000; total time=   2.9s


  estimator.fit(X_train, y_train, **fit_params)
  warn(


[CV] END bootstrap=False, max_depth=110, max_features=auto, min_samples_leaf=4, min_samples_split=5, n_estimators=1000; total time=   3.2s


  self.best_estimator_.fit(X, y, **fit_params)


CPU times: total: 46.7 s
Wall time: 48 s


In [27]:
rf_random.best_params_

{'n_estimators': 800,
 'min_samples_split': 10,
 'min_samples_leaf': 1,
 'max_features': 'sqrt',
 'max_depth': 65,
 'bootstrap': False}

In [28]:
from sklearn.ensemble import RandomForestRegressor
rfr = RandomForestRegressor(n_estimators = 600, min_samples_split = 5, min_samples_leaf = 1,
                            max_features = 'sqrt', max_depth = 65, bootstrap = True)
rfr.fit(X_train, y_train)

  rfr.fit(X_train, y_train)


In [29]:
y_pred = rfr.predict(X_test)

In [30]:
r2_score(y_test, y_pred)

0.8555880684224417

In [31]:
importance = rfr.feature_importances_
feat_name = X.columns
tree_result = pd.DataFrame({'Feature':feat_name, 'importance':importance})
tree_result_sort = tree_result.sort_values(by = 'importance', ascending = False)

In [32]:
tree_result_sort

Unnamed: 0,Feature,importance
4,smoker_yes,0.615647
1,bmi,0.162299
0,age,0.160352
2,children,0.029412
3,sex_male,0.010327
6,region_southeast,0.008498
5,region_northwest,0.006764
7,region_southwest,0.006701


## Grid SearchCV

In [33]:
from sklearn.ensemble import RandomForestRegressor
rfr = RandomForestRegressor()

In [34]:
from sklearn.model_selection import GridSearchCV
n_estimator = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 20)]
max_features = [2,3]
max_depth = [int(x) for x in np.linspace(start = 10, stop = 110, num = 10)]
max_depth.append(None)
min_samples_split = [2,5,10, 12]
min_samples_leaf = [1,2,4, 5]
bootstrap = [True, False]

param_grid = {'n_estimators': n_estimator,
              'max_features':max_features,
              'max_depth' : max_depth,
               'min_samples_split' :min_samples_split,
               'min_samples_leaf' : min_samples_leaf,
               'bootstrap' : bootstrap}

In [35]:
%%time
grid_search = GridSearchCV(estimator = rfr, param_grid = param_grid, cv = 3, verbose = 2)

CPU times: total: 0 ns
Wall time: 0 ns


In [36]:
# grid_search.fit(X_train, y_train)

In [37]:
# param_grid = {'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000],
#               'max_features':[2,3],
#               'max_depth' : [10,20,30,40,50,60,70,80,90,100,110],
#                'min_samples_split' :[2,5,10, 12],
#                'min_samples_leaf' : [1,2,4, 5],
#                'bootstrap' : [True, False]}

In [38]:
# grid_search.best_params_

In [40]:
grid_search = {'bootstrap': True, 'max_depth': 98, 'max_features': 3, 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 294}

In [41]:
from sklearn.ensemble import RandomForestRegressor
rfr = RandomForestRegressor(bootstrap = True, max_depth = 98, max_features = 3, min_samples_leaf = 2, min_samples_split = 2, n_estimators = 294)
rfr.fit(X_train, y_train)

  rfr.fit(X_train, y_train)


In [42]:
y_pred = rfr.predict(X_test)

In [43]:
r2_score(y_test, y_pred)

0.8709195280574131