In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import xgboost as xgb

from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error as mae
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.tree import DecisionTreeRegressor

In [None]:
# Relative path to the insurance CSV. Despite its name, `data_dir` holds a file
# path, not a directory.
data_dir = "../input/insurance/insurance.csv"
# Read the whole file into the working DataFrame used by every later cell.
df = pd.read_csv(filepath_or_buffer=data_dir)

In [None]:
# Preview the first five rows (five is the default for `head`).
df.head()

Purpose of the model: predict the insurance price charged to an individual based on the information they provide.

# EDA

In [None]:
# Spearman rank-correlation heatmap of the numeric columns.
plt.figure(figsize=(10,10))
# Correlation is only defined for numeric data. Selecting the numeric columns
# explicitly keeps this cell working on pandas >= 2.0, where `DataFrame.corr`
# no longer silently drops string columns such as sex/smoker/region.
corr = df.select_dtypes(include="number").corr(method='spearman')
ax = sns.heatmap(corr, square=True, linewidths=.5, annot=True, fmt='.2f', cmap='coolwarm')
ax.set_title("Spearman correlation between numeric features")
plt.show()

Age appears to be highly correlated with the insurance charges, whereas BMI and number of children show very low correlation values.

In [None]:
# Explore whether age, sex and smoker status relate to the price charged:
# colour encodes sex, marker style encodes smoker status.
fig, ax = plt.subplots(figsize=(15, 10))
sns.scatterplot(x="charges", y="age", hue="sex", style="smoker", data=df, ax=ax)
ax.set_title("Age vs. charges by sex and smoker status")
# Render the figure explicitly; a bare `ax.plot()` draws nothing and only
# echoes an empty list as the cell output.
plt.show()

The plot shows a general trend of charges increasing with age, although the points also split into three groups (0–10,000, 10,000–25,000 and 30,000–45,000) plus some outliers. The first group consists mostly of non-smokers, with some non-smokers in the second group and none in the third group.

In [None]:
# Explore whether the number of children relates to the price charged
# (colour encodes the number of children).
fig, ax = plt.subplots(figsize=(15, 10))
sns.scatterplot(x="charges", y="age", hue="children", data=df, ax=ax)
ax.set_title("Age vs. charges by number of children")
# Render the figure explicitly; a bare `ax.plot()` draws nothing and only
# echoes an empty list as the cell output.
plt.show()

For the first group, the number of children seems to have only a small correlation with the insurance charges.

In [None]:
# Explore whether BMI relates to the price charged (colour encodes BMI).
fig, ax = plt.subplots(figsize=(15, 10))
sns.scatterplot(x="charges", y="age", hue="bmi", data=df, ax=ax)
ax.set_title("Age vs. charges by BMI")
# Render the figure explicitly; a bare `ax.plot()` draws nothing and only
# echoes an empty list as the cell output.
plt.show()

Similarly, BMI seems to affect the price mainly at the higher end of the insurance charges.

In [None]:
# Distribution of BMI within each region, with quartile lines drawn inside
# each violin.
fig, ax = plt.subplots(figsize=(15, 10))
sns.violinplot(x="region", y="bmi", data=df, inner="quartile", ax=ax)
ax.set_title("BMI distribution by region")
# Render the figure explicitly; a bare `ax.plot()` draws nothing and only
# echoes an empty list as the cell output.
plt.show()

People living in the northern regions have a lower BMI than people living in the southern regions.

In [None]:
# Per-region averages of the numeric columns. `numeric_only=True` is required
# on pandas >= 2.0, where averaging the string columns (sex, smoker) raises
# instead of being skipped silently.
df.groupby(["region"]).mean(numeric_only=True)

People living in the eastern regions seem to be charged more on average than those in the western regions. This is not explained by age, since the average age is similar across all regions; it might be due to smoking status or sex.

# To Summarize
Preliminary analysis suggests that age and smoking status are highly correlated with the insurance price, sex is moderately correlated, and number of children and BMI are only weakly correlated.

In [None]:
# Separate the data into the feature columns (X) and the label column (y).
df_X = df.loc[
    :,
    ["age", "sex", "bmi", "children", "smoker", "region"],
]
df_y = df.loc[:, "charges"]

In [None]:
# Column groups used by the preprocessing cells.
cat_col = ["sex", "region", "smoker"]
num_col = ["age", "bmi", "children"]

# Quick look at what one-hot encoding produces for the categorical columns.
ohe = OneHotEncoder()
encoded_cat = ohe.fit_transform(df[cat_col])
# Only the last expression of a cell is displayed, so the dense preview (first
# five rows) and the learned categories are shown together as one tuple.
encoded_cat.toarray()[:5], ohe.categories_

In [None]:
def OHE_attribs(df, cat_attribs):
    """One-hot encode the given columns of ``df``.

    Args:
        df: DataFrame containing the columns listed in ``cat_attribs``.
        cat_attribs: List of column names to encode.

    Returns:
        A ``(column_names, encoded)`` tuple. ``column_names`` holds one
        ``"<attribute>_<category>"`` label per encoded column, in the order the
        encoder emits them. ``encoded`` is the dense array returned by
        ``toarray()`` on the encoder output.
    """
    encoder = OneHotEncoder()
    sparse_encoded = encoder.fit_transform(df[cat_attribs])
    # `categories_` holds one array of category values per input column, in the
    # same order as `cat_attribs`, so the two can be walked in lockstep.
    column_names = [
        attrib + '_' + str(category)
        for attrib, categories in zip(cat_attribs, encoder.categories_)
        for category in categories
    ]
    return column_names, sparse_encoded.toarray()

# Encode the categorical columns, then display the generated column labels
# alongside the dense encoded matrix.
cat_cols, encoded_cols = OHE_attribs(df, cat_attribs=cat_col)
(cat_cols, encoded_cols)

In [None]:
# Wrap the encoded matrix in a frame labelled with the generated column names,
# then attach it to the numeric columns by row index.
# NOTE(review): the alignment relies on `df` and the new frame sharing the same
# index labels — confirm `df` still has its default index here.
encoded_frame = pd.DataFrame(data=encoded_cols, columns=cat_cols)
df_X = pd.merge(
    df[num_col],
    encoded_frame,
    how="left",
    left_index=True,
    right_index=True,
)
df_X.head()

In [None]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable.
X, y = df_X, df_y
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=0,
)

In [None]:
# Fit each candidate regressor on the training split and report its mean
# absolute error (MAE) on the held-out test split.
# The estimator classes, `xgb` and `mae` are imported in the notebook's first cell.
# Seeds are fixed on the randomized models so that Restart & Run All reproduces
# the reported metrics.
lr_model = LinearRegression()
dt_model = DecisionTreeRegressor(max_leaf_nodes=900, random_state=1)
rf_model = RandomForestRegressor(random_state=0)
xgb_model = xgb.XGBRegressor(random_state=0)

for model_label, candidate in [
    ("Linear Regression", lr_model),
    ("Decision Tree", dt_model),
    ("Random Forest", rf_model),
    ("XGBoost", xgb_model),
]:
    candidate.fit(X_train, y_train)
    pred = candidate.predict(X_test)
    # mean_absolute_error expects (y_true, y_pred), in that order.
    error = mae(y_test, pred)
    print('MAE of {} is: {:,.0f}'.format(model_label, error))

In [None]:
# Report each fitted model's test-set score, scaled to a percentage.
# `.score()` on a regressor is the coefficient of determination (R^2), not a
# classification accuracy, so the output is labelled as R^2.
for model_label, fitted_model in [
    ("Linear Regression", lr_model),
    ("Decision Tree", dt_model),
    ("Random Forest", rf_model),
    ("XGBoost", xgb_model),
]:
    print(f"{model_label} Model R^2 (%): {(fitted_model.score(X_test, y_test)*100):.2f}")

In [None]:
# Predicted vs. true charges on the test split for the random forest, which
# this notebook treats as the best model.
pred = rf_model.predict(X_test)

plt.figure(figsize=(10, 10))
plt.scatter(x=y_test, y=pred, c='crimson')

# Diagonal reference line spanning the combined range of predictions and true
# values; points on this line are perfect predictions.
p1 = max(pred.max(), y_test.max())
p2 = min(pred.min(), y_test.min())
diagonal = [p1, p2]
plt.plot(diagonal, diagonal, 'b-')

plt.xlabel('True Values', fontsize=15)
plt.ylabel('Predictions', fontsize=15)
plt.axis('equal')
plt.show()

What if the columns with low correlation were dropped?

In [None]:
# Remove the number-of-children and region indicator columns, then redo the
# 70/30 split on the reduced feature set. No scaling is applied here.
drop_columns = [
    "children",
    "region_northeast",
    "region_northwest",
    "region_southeast",
    "region_southwest",
]
df_X2 = df_X.drop(columns=drop_columns)
X, y = df_X2, df_y
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=0,
)

In [None]:
# Preview the first five rows of the reduced feature frame (five is the default).
df_X2.head()

In [None]:
# Refit each candidate regressor on the reduced feature set and report its
# mean absolute error (MAE) on the held-out test split.
# The estimator classes, `xgb` and `mae` are imported in the notebook's first cell.
# Seeds are fixed on the randomized models so that Restart & Run All reproduces
# the reported metrics.
lr_model = LinearRegression()
dt_model = DecisionTreeRegressor(max_leaf_nodes=900, random_state=1)
rf_model = RandomForestRegressor(random_state=0)
xgb_model = xgb.XGBRegressor(random_state=0)

for model_label, candidate in [
    ("Linear Regression", lr_model),
    ("Decision Tree", dt_model),
    ("Random Forest", rf_model),
    ("XGBoost", xgb_model),
]:
    candidate.fit(X_train, y_train)
    pred = candidate.predict(X_test)
    # mean_absolute_error expects (y_true, y_pred), in that order.
    error = mae(y_test, pred)
    print('MAE of {} is: {:,.0f}'.format(model_label, error))

In [None]:
# Report each refitted model's test-set score, scaled to a percentage.
# `.score()` on a regressor is the coefficient of determination (R^2), not a
# classification accuracy, so the output is labelled as R^2.
for model_label, fitted_model in [
    ("Linear Regression", lr_model),
    ("Decision Tree", dt_model),
    ("Random Forest", rf_model),
    ("XGBoost", xgb_model),
]:
    print(f"{model_label} Model R^2 (%): {(fitted_model.score(X_test, y_test)*100):.2f}")

The difference in performance after dropping the low-correlation columns is minimal.

In [None]:
# Predicted vs. true charges on the test split for the random forest, which
# this notebook treats as the best model.
pred = rf_model.predict(X_test)

plt.figure(figsize=(10, 10))
plt.scatter(x=y_test, y=pred, c='crimson')

# Diagonal reference line spanning the combined range of predictions and true
# values; points on this line are perfect predictions.
p1 = max(pred.max(), y_test.max())
p2 = min(pred.min(), y_test.min())
diagonal = [p1, p2]
plt.plot(diagonal, diagonal, 'b-')

plt.xlabel('True Values', fontsize=15)
plt.ylabel('Predictions', fontsize=15)
plt.axis('equal')
plt.show()

# Conclusion
The best-performing model achieved an R² score of 86.35% with a mean absolute error (MAE) of 2,752.