In [631]:
from sklearn.preprocessing import LabelEncoder #import the relevant libraries
import numpy as np
import pandas as pd
import seaborn as sns 
import matplotlib.pyplot as plt
import numpy as np
%matplotlib inline

from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn import metrics

In [632]:
df=pd.read_csv("insurance.csv") # read the dataset
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [633]:
le_sex = LabelEncoder()
le_smoker = LabelEncoder()
le_region = LabelEncoder()

# Label encoding - encode the data
df['sex_encoded'] = le_sex.fit_transform(df.sex)
df['smoker_encoded'] = le_smoker.fit_transform(df.smoker)
df['region_encoded'] = le_region.fit_transform(df.region)

df_new  = df
df_new = df_new.drop(['sex','smoker','region'], axis=1)

In [634]:
df_new.head()

Unnamed: 0,age,bmi,children,charges,sex_encoded,smoker_encoded,region_encoded
0,19,27.9,0,16884.924,0,1,3
1,18,33.77,1,1725.5523,1,0,2
2,28,33.0,3,4449.462,1,0,2
3,33,22.705,0,21984.47061,1,0,1
4,32,28.88,0,3866.8552,1,0,1


In [635]:
## Scaling the data 

In [636]:
from sklearn.preprocessing import StandardScaler
data_pre = df_new.copy() # copying the current encoded data it will undergo further processing to make it more suitable
 
Bmi_v2 = data_pre.bmi
Bmi_v2 = Bmi_v2.values.reshape(-1,1)
data_pre['bmi'] = StandardScaler().fit_transform(Bmi_v2)
 
Age_v2 = data_pre.age
Age_v2 = Age_v2.values.reshape(-1,1)
data_pre['age'] = StandardScaler().fit_transform(Age_v2)
 
Charges_v2 = data_pre.charges
Charges_v2 = Charges_v2.values.reshape(-1,1)
data_pre['charges'] = StandardScaler().fit_transform(Charges_v2)
 
data_pre.head()

Unnamed: 0,age,bmi,children,charges,sex_encoded,smoker_encoded,region_encoded
0,-1.438764,-0.45332,0,0.298584,0,1,3
1,-1.509965,0.509621,1,-0.953689,1,0,2
2,-0.797954,0.383307,3,-0.728675,1,0,2
3,-0.441948,-1.305531,0,0.719843,1,0,1
4,-0.513149,-0.292556,0,-0.776802,1,0,1


In the below section is where the machine learning algorithms are used to predict the cost of healthcare. The following code shows a step-by-step process.

### Linear Regression

The first machine learning algorithm is Linear regression. This is used to predict the value of an explanatory variable based on the value of the dependent variable.

In [637]:
from sklearn.linear_model import LinearRegression # import the relevant Library 

In [638]:
X = data_pre.drop('charges', axis = 1) 
y = data_pre['charges'] # dependent variable

X_train, X_test, y_train, y_test= train_test_split(X, y, 
                                                   test_size=0.2, random_state=42)

scaler= StandardScaler()
scaler.fit(X_train)

X_train_scaled= scaler.transform(X_train)
X_test_scaled= scaler.transform(X_test)

In [639]:
linear_reg_model= LinearRegression() 
plr = linear_reg_model.fit(X_train, y_train)

In [640]:
#y_pred = linear_reg_model.predict(X_test_scaled)

y_pred = linear_reg_model.predict(X_test) 
y_pred = pd.DataFrame(y_pred)

MAE_li_reg= metrics.mean_absolute_error(y_test, y_pred) # calculate the metric values
MSE_li_reg = metrics.mean_squared_error(y_test, y_pred)
RMSE_li_reg =np.sqrt(MSE_li_reg)

df_linearreg = pd.DataFrame([MAE_li_reg, MSE_li_reg, RMSE_li_reg], 
             index=['MAE Linear Regression', 'MSE Linear Regression', 'RMSE Linear Regression'], columns=['Metrics'])
df_linearreg

Unnamed: 0,Metrics
MAE Linear Regression,0.345836
MSE Linear Regression,0.229525
RMSE Linear Regression,0.479088


In [641]:
r_lr = r2_score(y_test, y_pred)
r_lr

0.7833463107364538

In [642]:
lr_accuracy = linear_reg_model.score(X_test, y_test)
print('Accuracy = '+ str(lr_accuracy)) # this is a different formula to make note of the r2

Accuracy = 0.7833463107364538


### Ridge Regression

The next algorithm used was Ridge regression, this is an extension of linear regression adds a regularization penalty to the loss function during training. 

In [699]:
from sklearn.linear_model import Ridge #import library

In [700]:
X = data_pre.drop('charges', axis = 1)
y = data_pre['charges']

X_train,X_test,y_train,y_test=train_test_split(X, y, 
                                               test_size=0.2,random_state=46)
                                              

In [701]:
ridge_model=Ridge().fit(X_train,y_train)

In [702]:
y_pred = ridge_model.predict(X_test)
y_pred = pd.DataFrame(y_pred)

MAE_ridge_model= metrics.mean_absolute_error(y_test, y_pred)
MSE_ridge_model = metrics.mean_squared_error(y_test, y_pred)
RMSE_ridge_model =np.sqrt(MSE_ridge_model)

df_ridge = pd.DataFrame([MAE_ridge_model, MSE_ridge_model, RMSE_ridge_model], 
             index=['MAE Ridge', 'MSE Ridge', 'RMSE Ridge'], columns=['Metrics'])
df_ridge # ouptut a dataframe with the metrics that will be analysed

Unnamed: 0,Metrics
MAE Ridge,0.339
MSE Ridge,0.232647
RMSE Ridge,0.482335


In [703]:
r_ridge = r2_score(y_test, y_pred)
r_ridge

0.7667018581222799

In [706]:
ridge_accuracy = ridge_model.score(X_test, y_test)
print('Accuracy = '+ str(ridge_accuracy))

Accuracy = 0.7667018581222799


### Lasso Regression

In [649]:
from sklearn.linear_model import Lasso

In [650]:
X = data_pre.drop('charges', axis = 1)
y = data_pre['charges']

X_train,X_test,y_train,y_test=train_test_split(X, y, 
                                               test_size=0.2,random_state=100)

In [651]:
lasso_model = Lasso(alpha=1.0)
lasso=lasso_model.fit(X_train, y_train)

In [652]:
lasso_predict = lasso.predict(X_test)
y_pred = pd.DataFrame(y_pred)
MAE_lasso_predict = metrics.mean_absolute_error(y_test, y_pred)
MSE_lasso_predict = metrics.mean_squared_error(y_test, y_pred)
RMSE_lasso_predict =np.sqrt(MSE_lasso_predict)

df_lasso = pd.DataFrame([MAE_lasso_predict, MSE_lasso_predict, RMSE_lasso_predict], 
             index=['MAE lasso', 'MSE', 'RMSE'], columns=['Metrics'])
df_lasso

Unnamed: 0,Metrics
MAE lasso,0.923032
MSE,1.665488
RMSE,1.290538


In [653]:
r_lasso = r2_score(y_test, y_pred)
r_lasso

-0.5564581088565381

### XGBRegressor

In [654]:
from sklearn.model_selection import train_test_split
import xgboost as xgb

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)

In [655]:
X = data_pre.drop('charges', axis = 1)
y = data_pre['charges']

X_train, X_test, y_train, y_test= train_test_split(X, y, 
                                                   test_size=0.2, random_state=123)

In [656]:
xgb = xgb.XGBRegressor(objective='reg:squarederror',
    n_estimators=1000,
    learning_rate=0.12,
    subsample=0.5,
    colsample_bytree=1, 
    max_depth=5,
)
xgb.fit(X_train, y_train)

XGBRegressor(base_score=0.5, booster='gbtree', callbacks=None,
             colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
             early_stopping_rounds=None, enable_categorical=False,
             eval_metric=None, gamma=0, gpu_id=-1, grow_policy='depthwise',
             importance_type=None, interaction_constraints='',
             learning_rate=0.12, max_bin=256, max_cat_to_onehot=4,
             max_delta_step=0, max_depth=5, max_leaves=0, min_child_weight=1,
             missing=nan, monotone_constraints='()', n_estimators=1000,
             n_jobs=0, num_parallel_tree=1, predictor='auto', random_state=0,
             reg_alpha=0, reg_lambda=1, ...)

In [657]:
xgb.fit(X_train,y_train)
predictions = xgb.predict(X_test)

In [658]:
y_pred = xgb.predict(X_test)
y_pred = pd.DataFrame(y_pred)
MAE_xgb = metrics.mean_absolute_error(y_test, y_pred)
MSE_xgb= metrics.mean_squared_error(y_test, y_pred)
RMSE_xgb =np.sqrt(MSE_xgb)

df_xgb = pd.DataFrame([MAE_xgb, MSE_xgb, RMSE_xgb], 
             index=['MAE XGB', 'MSE XGB', 'RMSE  XGB'], columns=['Metrics'])
df_xgb

Unnamed: 0,Metrics
MAE XGB,0.266169
MSE XGB,0.16076
RMSE XGB,0.400949


In [659]:
r_xgb = r2_score(y_test, y_pred)
r_xgb

0.845919321582291

In [660]:
xgb_accuracy = xgb.score(X_test, y_test)
print('Accuracy = '+ str(xgb_accuracy))

Accuracy = 0.845919321582291


from sklearn.metrics import r2_score
r2 = np.sqrt(r2_score(y_test, predictions))
print("R_Squared Score : %f" % (r2))

### Random Forest Regressor

In [661]:
X = data_pre.drop('charges', axis = 1)
y = data_pre['charges']

#X_train, X_test, y_train, y_test= train_test_split(X, y, 
                                                  # test_size=0.2, random_state=100)

In [662]:
from sklearn.ensemble import RandomForestRegressor
  
 # create regressor object
random_forest = RandomForestRegressor(n_estimators = 100, random_state = 10)
  
# fit the regressor with x and y data
random_forest.fit(X, y)  

RandomForestRegressor(random_state=10)

In [663]:
y_pred = random_forest.predict(X_test)
y_pred = pd.DataFrame(y_pred)
MAE_random_forest= metrics.mean_absolute_error(y_test, y_pred)
MSE_random_forest = metrics.mean_squared_error(y_test, y_pred)
RMSE_random_forest =np.sqrt(MSE_random_forest)

df_randomforest = pd.DataFrame([MAE_random_forest, MSE_random_forest, RMSE_random_forest], 
             index=['MAE RF', 'MSE RF', 'RMSE RF'], columns=['Metrics'])
df_randomforest

Unnamed: 0,Metrics
MAE RF,0.074547
MSE RF,0.017117
RMSE RF,0.130831


In [664]:
r_rf = r2_score(y_test, y_pred)

In [665]:
rf_accuracy = random_forest.score(X_test, y_test)
print('Accuracy = '+ str(rf_accuracy))

Accuracy = 0.9835944783501921


### Decision Tree

In [666]:
from sklearn.tree import DecisionTreeRegressor

In [667]:
X = data_pre.drop('charges', axis = 1)
y = data_pre['charges']

decision_tree = DecisionTreeRegressor(random_state = 10)
decision_tree.fit(X_train, y_train)

DecisionTreeRegressor(random_state=10)

In [668]:
y_pred = decision_tree.predict(X_test)
y_pred = pd.DataFrame(y_pred)
MAE_decision_tree= metrics.mean_absolute_error(y_test, y_pred)
MSE_decision_tree = metrics.mean_squared_error(y_test, y_pred)
RMSE_decision_tree =np.sqrt(MSE_decision_tree)

df_decisiontree = pd.DataFrame([MAE_decision_tree, MSE_decision_tree, RMSE_decision_tree], 
             index=['MAE DT', 'MSE DT', 'RMSE DT'], columns=['Metrics'])
df_decisiontree

Unnamed: 0,Metrics
MAE DT,0.234997
MSE DT,0.256039
RMSE DT,0.506003


In [669]:
r_dt = r2_score(y_test, y_pred)
r_dt

0.7545991225593807

In [670]:
dt_accuracy = decision_tree.score(X_test, y_test)
print('Accuracy = '+ str(dt_accuracy))

Accuracy = 0.7545991225593807


### CatBoost

In [671]:
from catboost import CatBoostRegressor

In [672]:
X = data_pre.drop('charges', axis = 1)
y = data_pre['charges']

In [673]:
catboost_model = CatBoostRegressor(iterations=200, verbose=False)

In [674]:
catboost_model.fit(X_train, y_train)

y_pred = catboost_model.predict(X_test)
y_pred = pd.DataFrame(y_pred)
MAE_catboost_model= metrics.mean_absolute_error(y_test, y_pred)
MSE_catboost_model = metrics.mean_squared_error(y_test, y_pred)
RMSE_catboost_model =np.sqrt(MSE_regressor)

df_catboost = pd.DataFrame([MAE_catboost_model, MSE_catboost_model, RMSE_catboost_model], 
             index=['MAE Catboost', 'MSE Catboost', 'RMSE Catboost'], columns=['Metrics'])
df_catboost

Unnamed: 0,Metrics
MAE Catboost,0.204078
MSE Catboost,0.117276
RMSE Catboost,0.267274


In [675]:
r_cat = r2_score(y_test, y_pred)
r_cat

0.8875970077887563

In [676]:
cat_accuracy = catboost_model.score(X_test, y_test)
print('Accuracy = '+ str(cat_accuracy))

Accuracy = 0.8875970077887563


In [707]:
overall = [(r_cat, cat_accuracy), (r_dt,dt_accuracy), (r_rf,rf_accuracy ),
       (r_xgb, xgb_accuracy), (r_ridge, ridge_accuracy),  (r_lasso,  'NAN'), 
       (r_lr,lr_accuracy)]
  
# Create the pandas DataFrame with column names
r_and_accuracy = pd.DataFrame(overall, columns=['R squared', 'Accuracy'], index = ['Catboost', 'Decision Tree', 'Random Forest',
                                                                   'XGB', 'Ridge Regression', 'Lasso Regression',
                                                                   'Linear Regression'])
  
r_and_accuracy #print dataframe.

Unnamed: 0,R squared,Accuracy
Catboost,0.887597,0.887597
Decision Tree,0.754599,0.754599
Random Forest,0.983594,0.983594
XGB,0.845919,0.845919
Ridge Regression,0.766702,0.766702
Lasso Regression,-0.556458,NAN
Linear Regression,0.783346,0.783346


### Joined Datasets

In [678]:
joined = pd.concat((df_decisiontree, df_catboost, df_decisiontree, df_randomforest,
           df_linearreg, df_xgb, df_lasso,df_ridge), axis=1, keys=[ 'df_decisiontree', 'df_catboost',
                                                                   'df_randomforest','df_linearreg',
                                                                  'df_xgb', 'df_lasso','df_ridge'])
joined # this is the joined dataframe

Unnamed: 0_level_0,df_decisiontree,df_catboost,df_randomforest,df_linearreg,df_xgb,df_lasso,df_ridge
Unnamed: 0_level_1,Metrics,Metrics,Metrics,Metrics,Metrics,Metrics,Metrics
MAE DT,0.234997,,0.234997,,,,
MSE DT,0.256039,,0.256039,,,,
RMSE DT,0.506003,,0.506003,,,,
MAE Catboost,,0.204078,,,,,
MSE Catboost,,0.117276,,,,,
RMSE Catboost,,0.267274,,,,,
MAE RF,,,,0.074547,,,
MSE RF,,,,0.017117,,,
RMSE RF,,,,0.130831,,,
MAE Linear Regression,,,,,0.345836,,


### ML Models Without Duplicates and the Columns

In [679]:
df_new  = df
df_new = df_new.drop(['sex','smoker','region'], axis=1)

le_sex = LabelEncoder()
le_smoker = LabelEncoder()
le_region = LabelEncoder()

# Label encoding
df['sex_encoded'] = le_sex.fit_transform(df.sex)
df['smoker_encoded'] = le_smoker.fit_transform(df.smoker)
df['region_encoded'] = le_region.fit_transform(df.region)

In [680]:
df_new = df_new.drop(['region_encoded','sex_encoded', 'children'], axis=1)
df_new

Unnamed: 0,age,bmi,charges,smoker_encoded
0,19,27.900,16884.92400,1
1,18,33.770,1725.55230,0
2,28,33.000,4449.46200,0
3,33,22.705,21984.47061,0
4,32,28.880,3866.85520,0
...,...,...,...,...
1333,50,30.970,10600.54830,0
1334,18,31.920,2205.98080,0
1335,18,36.850,1629.83350,0
1336,21,25.800,2007.94500,0


In [681]:
df_new. drop_duplicates()

Unnamed: 0,age,bmi,charges,smoker_encoded
0,19,27.900,16884.92400,1
1,18,33.770,1725.55230,0
2,28,33.000,4449.46200,0
3,33,22.705,21984.47061,0
4,32,28.880,3866.85520,0
...,...,...,...,...
1333,50,30.970,10600.54830,0
1334,18,31.920,2205.98080,0
1335,18,36.850,1629.83350,0
1336,21,25.800,2007.94500,0


## Top 3 Algorithms 

Based on the results in the first ML ALgorithms section I will be using the top 3 and removing the duplicated data and based on feature selction will identify whether there is a positive, negative or no effect when changing those factors.

### - Random Forest 

In [682]:
from sklearn.preprocessing import StandardScaler
df_new2 = df_new.copy()
 
tempBmi = data_pre.bmi
tempBmi = tempBmi.values.reshape(-1,1)
df_new2['bmi'] = StandardScaler().fit_transform(tempBmi)
 
tempAge = data_pre.age
tempAge = tempAge.values.reshape(-1,1)
df_new2['age'] = StandardScaler().fit_transform(tempAge)
 
tempCharges = data_pre.charges
tempCharges = tempCharges.values.reshape(-1,1)
df_new2['charges'] = StandardScaler().fit_transform(tempCharges)
 
df_new2.head()

Unnamed: 0,age,bmi,charges,smoker_encoded
0,-1.438764,-0.45332,0.298584,1
1,-1.509965,0.509621,-0.953689,0
2,-0.797954,0.383307,-0.728675,0
3,-0.441948,-1.305531,0.719843,0
4,-0.513149,-0.292556,-0.776802,0


In [687]:
X = df_new2.drop('charges', axis = 1)
y = df_new2['charges']

X_train, X_test, y_train, y_test= train_test_split(X, y, 
                                                   test_size=0.2, random_state = 100)

In [688]:
from sklearn.ensemble import RandomForestRegressor
  
random_forest = RandomForestRegressor(n_estimators = 100, random_state = 10)
random_forest.fit(X, y)  

RandomForestRegressor(random_state=10)

In [689]:
y_pred = random_forest.predict(X_test)
y_pred = pd.DataFrame(y_pred)
MAE_random_forest= metrics.mean_absolute_error(y_test, y_pred)
MSE_random_forest = metrics.mean_squared_error(y_test, y_pred)
RMSE_random_forest =np.sqrt(MSE_random_forest)

pd.DataFrame([MAE_random_forest, MSE_random_forest, RMSE_random_forest], 
             index=['MAE Random Forest', 'MSE Random Forest', 'RMSE Random Forest'], columns=['Metrics'])

Unnamed: 0,Metrics
MAE Random Forest,0.088473
MSE Random Forest,0.027698
RMSE Random Forest,0.166427


In [690]:
r2_score(y_test, y_pred)

0.9741152893564211

In [691]:
accuracy = random_forest.score(X_test, y_test)
print('Accuracy = '+ str(accuracy))

Accuracy = 0.9741152893564211


### - CatBoost

In [597]:
X = df_new2.drop('charges', axis = 1)
y = df_new2['charges']

X_train, X_test, y_train, y_test= train_test_split(X, y, 
                                                   test_size=0.2, random_state = 100)

In [598]:
catboost_model = CatBoostRegressor(iterations=200, verbose=False)

In [621]:
catboost_model.fit(X_train, y_train)

y_pred = catboost_model.predict(X_test)
y_pred = pd.DataFrame(y_pred)
MAE_catboost_model= metrics.mean_absolute_error(y_test, y_pred)
MSE_catboost_model = metrics.mean_squared_error(y_test, y_pred)
RMSE_catboost_model =np.sqrt(MSE_catboost_model)

pd.DataFrame([MAE_catboost_model, MSE_catboost_model, RMSE_catboost_model], 
             index=['MAE Catboost New', 'MSE Catboost New', 'RMSECatboost New'], columns=['Metrics'])

Unnamed: 0,Metrics
MAE Catboost New,0.206783
MSE Catboost New,0.142204
RMSECatboost New,0.3771


In [320]:
r2_score(y_test, y_pred)

0.8821109375889608

In [321]:
accuracy = catboost_model.score(X_test, y_test)
print('Accuracy = '+ str(accuracy))

Accuracy = 0.8821109375889608


### - XGB

In [602]:
from sklearn.model_selection import train_test_split
import xgboost as xgb

In [603]:
X = df_new2.drop('charges', axis = 1)
y = df_new2['charges']

X_train, X_test, y_train, y_test= train_test_split(X, y, 
                                                   test_size=0.2, random_state=123)

In [604]:
xgb = xgb.XGBRegressor(objective='reg:squarederror',
    n_estimators=1000,
    learning_rate=0.12,
    subsample=0.5,
    colsample_bytree=1, 
    max_depth=5,
)
xgb.fit(X_train, y_train)

XGBRegressor(base_score=0.5, booster='gbtree', callbacks=None,
             colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
             early_stopping_rounds=None, enable_categorical=False,
             eval_metric=None, gamma=0, gpu_id=-1, grow_policy='depthwise',
             importance_type=None, interaction_constraints='',
             learning_rate=0.12, max_bin=256, max_cat_to_onehot=4,
             max_delta_step=0, max_depth=5, max_leaves=0, min_child_weight=1,
             missing=nan, monotone_constraints='()', n_estimators=1000,
             n_jobs=0, num_parallel_tree=1, predictor='auto', random_state=0,
             reg_alpha=0, reg_lambda=1, ...)

In [605]:
xgb.fit(X_train,y_train)
predictions = xgb.predict(X_test)

In [608]:
y_pred = xgb.predict(X_test)
y_pred = pd.DataFrame(y_pred)
MAE_xgb = metrics.mean_absolute_error(y_test, y_pred)
MSE_xgb= metrics.mean_squared_error(y_test, y_pred)
RMSE_xgb =np.sqrt(MSE_xgb)

df_xgb = pd.DataFrame([MAE_xgb, MSE_xgb, RMSE_xgb], 
             index=['MAE XGB', 'MSE XGB', 'RMSE  XGB'], columns=['Metrics'])
df_xgb

Unnamed: 0,Metrics
MAE XGB,0.278482
MSE XGB,0.184807
RMSE XGB,0.429892


In [609]:
r2_score(y_test, y_pred)

0.8228712582723372

In [610]:
accuracy = xgb.score(X_test, y_test)
print('Accuracy = '+ str(accuracy))

Accuracy = 0.8228712582723372
