Medical Cost Personal Dataset – Insurance Forecast: predicting insurance costs with machine-learning regression algorithms.

### Model Building and Tuning

#### Import the Libraries

In [None]:
import pandas as pd
import numpy as np

#### Load the dataset

In [None]:
# Read the insurance CSV from the Kaggle input directory
insurance = pd.read_csv(
    '../input/medical-insurance-dataset/insurance_cleaned.csv'
)
# Preview the first rows
insurance.head()

#### Features and Target variables

In [None]:
# Columns 0-5 are used as features and column 6 as the target.
# Slicing 6:7 (rather than indexing 6) keeps y two-dimensional: (n_rows, 1).
feature_cols = slice(0, 6)
target_cols = slice(6, 7)
X = insurance.iloc[:, feature_cols].values
y = insurance.iloc[:, target_cols].values

#### Splitting the data into train and test



In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)

# Report the shape of each piece of the split
for label, part in (('X_train: ', X_train), ('X_test: ', X_test),
                    ('y_train: ', y_train), ('y_test: ', y_test)):
    print(label, part.shape)

#### Linear Regression Model Building


##### Linear Regression

In [None]:
from sklearn.linear_model import LinearRegression

# Ordinary least squares on the training split, fitted with two parallel jobs.
# The `normalize` argument is intentionally not passed: it was removed from
# scikit-learn's linear models (passing it raises TypeError on current
# releases), and rescaling the features does not change the predictions of an
# unpenalised least-squares fit with an intercept.
lin_reg = LinearRegression(n_jobs = 2)
lin_reg.fit(X_train, y_train)

In [None]:
from sklearn.metrics import mean_squared_error, r2_score

# R^2 of the linear model on the data it was fitted on
lin_train_r2 = lin_reg.score(X_train, y_train)
print('Score on training set: ', lin_train_r2)

In [None]:
# Predict the target for the held-out features, then show the predictions
# (column 0) side by side with the true test targets (column 1).
y_pred = lin_reg.predict(X_test)
np.set_printoptions(precision = 2)   # two decimals when NumPy arrays are printed
n_pred, n_true = len(y_pred), len(y_test)
df = pd.DataFrame(np.hstack((y_pred.reshape(n_pred, 1), y_test.reshape(n_true, 1))))
df

In [None]:
from sklearn.metrics import r2_score

# Test-set R^2 and RMSE of the linear model
lin_test_r2 = r2_score(y_test, y_pred)
lin_test_rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print('r2 score: ' , lin_test_r2)
print('Linear model RMSE: {}'.format(lin_test_rmse))

Cross Validation of Linear Regression model

In [None]:
from sklearn.model_selection import cross_val_predict, cross_val_score

# 3-fold CV on the training split: average the per-fold MSE, then take the root
fold_neg_mse = cross_val_score(lin_reg, X_train, y_train, cv = 3, scoring = 'neg_mean_squared_error')
cv_lin_reg = -fold_neg_mse.mean()
print('RMSE of tuned model - Training: {}'.format(np.sqrt(cv_lin_reg)))

# Out-of-fold predictions from a 5-fold CV run on the test split.
# NOTE(review): this refits the estimator on folds of the test data instead of
# scoring the model fitted on the training split - confirm this is intended.
y_pred_cv = cross_val_predict(lin_reg, X_test, y_test, cv = 5)
cv_test_rmse = np.sqrt(mean_squared_error(y_test, y_pred_cv))
print('RMSE of tuned model - Testing: {}'.format(cv_test_rmse))

##### Ridge Regression

In [None]:
from sklearn.linear_model import Ridge

# Baseline ridge regression with the library's default settings
ridge_reg = Ridge().fit(X_train, y_train)
ridge_reg

In [None]:
from sklearn.metrics import mean_squared_error, r2_score

# R^2 of the baseline ridge model on its own training data
ridge_train_r2 = ridge_reg.score(X_train, y_train)
print('Score on training set: ', ridge_train_r2)

In [None]:
# Test-set predictions, R^2 and RMSE for the baseline ridge model
y_pred_ridge = ridge_reg.predict(X_test)
ridge_test_r2 = r2_score(y_test, y_pred_ridge)
ridge_test_rmse = np.sqrt(mean_squared_error(y_test, y_pred_ridge))
print('r2 score: ' , ridge_test_r2)
print('Ridge model RMSE: {}'.format(ridge_test_rmse))

Model Tuning for Ridge Regression

In [None]:
from sklearn.linear_model import RidgeCV
lambdas = 10**np.linspace(-5,5,500)*0.5
ridge_cv = RidgeCV(alphas = lambdas, scoring = 'neg_mean_squared_error', normalize = True,)
ridge_cv.fit(X_train, y_train)

In [None]:
ridge_cv.alpha_

In [None]:
ridge_reg_tuned = Ridge(alpha= ridge_cv.alpha_, normalize= True,random_state= 0)
ridge_reg_tuned.fit(X_train, y_train)

In [None]:
# Test-set predictions, R^2 and RMSE for the tuned ridge model
y_pred_ridge_tune = ridge_reg_tuned.predict(X_test)
ridge_tuned_r2 = r2_score(y_test, y_pred_ridge_tune)
ridge_tuned_rmse = np.sqrt(mean_squared_error(y_test, y_pred_ridge_tune))
print('r2 score: ' , ridge_tuned_r2)
print('Ridge Tuned model RMSE: {}'.format(ridge_tuned_rmse))

##### Lasso Regression

In [None]:
from sklearn.linear_model import Lasso

# Baseline lasso regression, capped at 1000 solver iterations
lasso_reg = Lasso(max_iter=1000).fit(X_train, y_train)
lasso_reg

In [None]:
from sklearn.metrics import mean_squared_error, r2_score

# R^2 of the baseline lasso model on its own training data
lasso_train_r2 = lasso_reg.score(X_train, y_train)
print('Score on training set: ', lasso_train_r2)

# Test-set predictions, R^2 and RMSE
y_pred_lasso = lasso_reg.predict(X_test)
lasso_test_r2 = r2_score(y_test, y_pred_lasso)
lasso_test_rmse = np.sqrt(mean_squared_error(y_test, y_pred_lasso))
print('r2 score: ' , lasso_test_r2)
print('Lasso model RMSE: {}'.format(lasso_test_rmse))

Model Tuning for Lasso Regression

In [None]:
from sklearn.linear_model import LassoCV

# Select the lasso penalty with 15-fold cross-validation on the training split.
# `normalize=True` is intentionally not passed: the argument was removed from
# scikit-learn's linear models, and the Lasso models in this notebook are
# fitted on the raw features, so alpha has to be selected on that same scale
# for `lasso_cv_reg.alpha_` to be meaningful when it is reused.
lasso_cv_reg = LassoCV(cv = 15, max_iter = 1000)
# LassoCV is a single-target estimator; pass the (n, 1) target as a 1-D vector.
lasso_cv_reg.fit(X_train, y_train.ravel())

In [None]:
# Show the penalty strength chosen by the cross-validated lasso fit
lasso_cv_reg.alpha_

In [None]:
# Refit a lasso model using the penalty strength selected by lasso_cv_reg
best_lasso_alpha = lasso_cv_reg.alpha_
lasso_tuned = Lasso(alpha = best_lasso_alpha, max_iter = 5000, random_state = 0).fit(X_train, y_train)
lasso_tuned

In [None]:
# Test-set predictions, R^2 and RMSE for the tuned lasso model
y_pred_lasso_tune = lasso_tuned.predict(X_test)
lasso_tuned_r2 = r2_score(y_test, y_pred_lasso_tune)
lasso_tuned_rmse = np.sqrt(mean_squared_error(y_test, y_pred_lasso_tune))
print('r2 score: ' , lasso_tuned_r2)
print('Lasso Tuned model RMSE: {}'.format(lasso_tuned_rmse))

#### Random Forest Regression Model Building

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Baseline random forest: 100 trees split on mean absolute error.
# 'absolute_error' is the current name of the criterion formerly spelled
# 'mae'; the old spelling is rejected by current scikit-learn releases.
# A fixed random_state makes the fitted forest (and its scores) repeatable.
rf_reg = RandomForestRegressor(n_estimators = 100, criterion = 'absolute_error', random_state = 0)
# The target is stored as an (n, 1) column; hand the forest a 1-D vector.
rf_reg.fit(X_train, y_train.ravel())

In [None]:
# Predict the target for the held-out features with the baseline forest, then
# show the predictions (column 0) next to the true test targets (column 1).
y_pred_rf = rf_reg.predict(X_test)
np.set_printoptions(precision = 2)   # two decimals when NumPy arrays are printed
n_pred_rf, n_true_rf = len(y_pred_rf), len(y_test)
df = pd.DataFrame(np.hstack((y_pred_rf.reshape(n_pred_rf, 1), y_test.reshape(n_true_rf, 1))))
df

In [None]:
from sklearn.metrics import mean_squared_error, r2_score

# R^2 of the baseline forest on its own training data
rf_train_r2 = rf_reg.score(X_train, y_train)
print('Score on training set: ', rf_train_r2)

# Test-set R^2 and RMSE
rf_test_r2 = r2_score(y_test, y_pred_rf)
rf_test_rmse = np.sqrt(mean_squared_error(y_test, y_pred_rf))
print('r2 score: ' , rf_test_r2)
print('Random Forest Regression model RMSE: {}'.format(rf_test_rmse))

##### Hyperparameter Tuning for Random Forest

In [None]:
# Candidate values for the randomized random-forest search.
# Number of trees: 10 evenly spaced values from 100 to 2000
n_estimators = [int(x) for x in np.linspace(start = 100, stop = 2000, num = 10)]
# Features considered per split. 1.0 means "all features", which is what the
# former 'auto' option meant for regressors; 'auto' itself is no longer
# accepted by current scikit-learn releases.
max_features = [1.0, 'sqrt']
# Tree depth: 11 evenly spaced values from 10 to 220, plus None (unlimited)
max_depth = [int(x) for x in np.linspace(10, 220, num = 11)]
max_depth.append(None)
min_samples_split = [2, 5, 7, 10]
min_samples_leaf = [1, 2, 4,5]
bootstrap = [True, False]

# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

In [None]:
from sklearn.model_selection import RandomizedSearchCV

# Randomized search over random_grid: 100 sampled settings, each scored with
# 3-fold cross-validation, using all available cores.
rf_tune = RandomForestRegressor()
search_options = dict(n_iter = 100, cv = 3, verbose = 2, random_state = 42, n_jobs = -1)
rf_random = RandomizedSearchCV(estimator = rf_tune, param_distributions = random_grid, **search_options)
rf_random.fit(X_train , y_train)

In [None]:
# Show the best parameter setting found by the randomized search
rf_random.best_params_

In [None]:
# Random forest with a fixed set of tuned hyperparameters (hard-coded here,
# not read from the search object).
# max_features = 1.0 uses every feature at each split - the meaning the former
# 'auto' option had for regressors; 'auto' is no longer accepted by current
# scikit-learn releases. random_state makes the fit repeatable.
rf_tuned = RandomForestRegressor(bootstrap = True,
 max_depth = 94,
 max_features = 1.0,
 min_samples_leaf = 5,
 min_samples_split = 2,
 n_estimators = 1155,
 random_state = 0)
# The target is stored as an (n, 1) column; hand the forest a 1-D vector.
rf_tuned.fit(X_train, y_train.ravel())

In [None]:
from sklearn.metrics import mean_squared_error, r2_score

# Predict with the tuned forest itself, so that the test metrics printed below
# describe the same model (rf_tuned) as the training score does.
y_pred_rf_tune = rf_tuned.predict(X_test)

print('Score on training set: ',rf_tuned.score(X_train, y_train))
print('r2 score: ' , r2_score(y_test, y_pred_rf_tune))
print('Random Forest Regression Tuned model RMSE: {}'.format(np.sqrt(mean_squared_error(y_test, y_pred_rf_tune))))

##### Grid Search with Cross Validation

In [None]:
from sklearn.model_selection import GridSearchCV

# Exhaustive grid for the random forest; every combination below is tried
param_grid = dict(
    bootstrap = [True],
    max_depth = [80, 100, 110,120, 150],
    max_features = [ 3, 4, 5],
    min_samples_leaf = [3, 5],
    min_samples_split = [8, 10, 12],
    n_estimators = [100, 200,500, 1000],
)
rf_grid = RandomForestRegressor()

# 3-fold cross-validated grid search on all cores, with progress output
grid_options = dict(cv = 3, n_jobs = -1, verbose = 2)
grid_search = GridSearchCV(estimator = rf_grid, param_grid = param_grid, **grid_options)

In [None]:
# Run the grid search on the training split and display the winning setting
# (fit returns the search object itself, so the attribute can be chained)
grid_search.fit(X_train, y_train).best_params_

In [None]:
# Random forest with a fixed set of grid-tuned hyperparameters (hard-coded
# here, not read from the search object)
grid_tuned_params = dict(
    bootstrap = True,
    max_depth = 150,
    max_features = 4,
    min_samples_leaf = 5,
    min_samples_split = 12,
    n_estimators = 200,
)
rf_grid_tuned = RandomForestRegressor(**grid_tuned_params)
rf_grid_tuned.fit(X_train, y_train)

In [None]:
from sklearn.metrics import mean_squared_error, r2_score

# Test-set predictions from the grid-tuned forest
y_pred_rf_tune_grid = rf_grid_tuned.predict(X_test)

# Training R^2, then test-set R^2 and RMSE
grid_train_r2 = rf_grid_tuned.score(X_train, y_train)
print('Score on training set: ', grid_train_r2)
grid_test_r2 = r2_score(y_test, y_pred_rf_tune_grid)
print('r2 score: ' , grid_test_r2)
grid_test_rmse = np.sqrt(mean_squared_error(y_test, y_pred_rf_tune_grid))
print('Random Forest Regression Grid Cross CV Tuned model RMSE: {}'.format(grid_test_rmse))