# Multiple linear regression for Insurance charges prediction

<ul style="list-style-type:none;">
    <li>1. Exploratory data analysis</li>
    <li>2. Feature encoding and feature engineering</li>
    <li>3. Model performance contrast</li>
    <li>4. Conclusions</li>
</ul>

In [None]:
import numpy as np
import pandas as pd
import math
import random
import matplotlib.pyplot as plt
import seaborn as sns
from mpl_toolkits.mplot3d import Axes3D
import plotly.express as px
from sklearn.linear_model import LinearRegression
from scipy.stats import f
from scipy.stats import t
from scipy.stats import shapiro
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error

In [None]:
def linearRegression(X,y):
    """Fit an ordinary least-squares model and assemble its summary tables.

    Parameters
    ----------
    X : pandas.DataFrame
        Predictor columns. No intercept column is expected; one is added
        internally when the coefficient covariance is computed.
    y : pandas.DataFrame or pandas.Series
        A single response variable with one value per row of ``X``.

    Returns
    -------
    reg_statistics : pandas.DataFrame
        Indexed by 'Statistics'; the 'values' column holds R square,
        adjusted R square, the residual standard error and the number of
        observations.
    anova : pandas.DataFrame
        Indexed by 'Source' (Model, Error, Total) with the columns
        'Degrees of freedom', 'Sum of squares' and 'Mean square'.
    F : float
        Model mean square divided by error mean square.
    p_value : float
        Upper-tail probability of ``F`` under an F(p, n-p-1) distribution.
    coefs_df : pandas.DataFrame
        Indexed by 'variables' ('intercept' first, then the columns of ``X``)
        with the columns 'betas', 'betas_int', 'standard error', 't stat'
        and 'significance' (two-sided p-value).
    residuals : pandas.DataFrame
        Indexed by 'idx' (the index of ``X``) with one 'residuals' column.

    Raises
    ------
    ValueError
        If there are no residual degrees of freedom (n <= p + 1).
    """
    n, p = X.shape
    dof_error = n - p - 1
    if dof_error <= 0:
        raise ValueError(
            "linearRegression needs more observations than parameters: "
            "got n=%d rows for p=%d predictors plus an intercept" % (n, p)
        )

    reg = LinearRegression().fit(X, y)
    y_obs = np.asarray(y, dtype=float).reshape(n)
    y_fit = np.asarray(reg.predict(X), dtype=float).reshape(n)
    error = y_obs - y_fit
    residuals = pd.DataFrame({'idx': X.index, 'residuals': error})
    residuals.set_index('idx', inplace=True)

    # Sums of squares are kept as floats: truncating them to integers before
    # dividing distorts the mean squares and F on small-scale responses.
    SSE = (error**2).sum()
    SSTO = ((y_obs - y_obs.mean())**2).sum()
    SSR = SSTO - SSE
    r_squared = 1 - SSE/SSTO
    adjusted_r_squared = 1 - (1 - r_squared)*(n - 1)/dof_error

    # Unbiased residual variance: divide by the residual degrees of freedom,
    # not by n, otherwise every standard error below is understated.
    var = SSE/dof_error
    std = math.sqrt(var)

    reg_statistics = pd.DataFrame({'Statistics': ['R square', 'Adjusted R square','Standard error','Observations'],
                   'values': [r_squared,adjusted_r_squared,std,n]})
    reg_statistics.set_index('Statistics',inplace=True)

    anova = pd.DataFrame({'Source': ['Model','Error','Total'],
                   'Degrees of freedom': [p,dof_error,n-1],
                    'Sum of squares':[SSR,SSE,SSTO]
                     })
    anova['Mean square'] = anova['Sum of squares']/anova['Degrees of freedom']
    anova.set_index('Source',inplace=True)

    F = anova['Mean square'].iloc[0]/anova['Mean square'].iloc[1]
    # The survival function keeps precision where 1 - cdf rounds to exactly 0.
    p_value = f.sf(F, p, dof_error)

    # np.ravel copes with both the (1,)/(1, p) shapes produced for a
    # DataFrame response and the scalar/(p,) shapes produced for a Series.
    betas = np.concatenate([np.ravel(reg.intercept_), np.ravel(reg.coef_)])
    coefs_df = pd.DataFrame({'variables': ['intercept', *X.columns],
                             'betas': betas})
    coefs_df['betas_int'] = coefs_df['betas'].astype('int64')

    # Coefficient covariance: sigma^2 * (X'X)^-1 on the design matrix that
    # includes the intercept column.
    x_mat = np.column_stack([np.ones(n), X.to_numpy(dtype=float)])
    C = var*np.linalg.inv(np.matmul(x_mat.transpose(), x_mat))
    # abs() guards against tiny negative diagonal entries from round-off.
    coefs_df['standard error'] = np.sqrt(np.abs(np.diag(C)))
    coefs_df.set_index('variables',inplace=True)
    coefs_df['t stat'] = coefs_df['betas']/coefs_df['standard error']
    coefs_df['significance'] = 2*t.sf(np.abs(coefs_df['t stat']), dof_error)
    return reg_statistics, anova, F, p_value, coefs_df, residuals

def predict(coefs_df, values_df):
    """Evaluate a fitted linear model on new rows.

    ``coefs_df`` must carry a 'betas' column whose first entry is the
    intercept and whose remaining entries follow the column order of
    ``values_df`` (matching is positional, not by name).  Returns a 1-D
    NumPy array with one prediction per row of ``values_df``.
    """
    row_count = values_df.shape[0]
    # Leading column of ones so the intercept takes part in the product.
    ones = pd.DataFrame({'intercept': [1] * row_count}, index=values_df.index)
    design = pd.concat([ones, values_df], axis=1).values
    beta_column = coefs_df['betas'].values.reshape(-1, 1)
    return np.matmul(design, beta_column).reshape(-1)

## 1. Exploratory data analysis

In [None]:
# Directory that holds the dataset, relative to the working directory.
path = "../input/insurance"
# Read the raw table; the later cells all work from this DataFrame.
insurance_df = pd.read_csv(f"{path}/insurance.csv")

In [None]:
# Show the loaded table as the cell output.
insurance_df

In [None]:
# Print column names, dtypes and non-null counts.
insurance_df.info()

##### From the above table we can see that there are no missing values; there are 4 numeric variables (age, bmi, children, charges) and 3 categorical variables (sex, smoker, and region).

##### Charges is the dependent variable and the rest are the independent variables.

In [None]:
# Summary statistics of the numeric columns.
insurance_df.describe()

In [None]:
# Summary of the object-dtype columns only.
insurance_df.describe(include=['O'])

In [None]:
# Switch on seaborn's default theme, then draw the age histogram.
sns.set()
fig = plt.figure(figsize=(16, 5))
ax = sns.histplot(insurance_df['age'])
ax.set_title('Age distribution')

In [None]:
# Histogram of the bmi column.
fig = plt.figure(figsize=(16, 5))
ax = sns.histplot(insurance_df['bmi'])
ax.set_title('BMI distribution')

In [None]:
# Row counts per smoker category.
fig = plt.figure(figsize=(5, 5))
ax = sns.countplot(data=insurance_df, x='smoker')
ax.set_title('Smoker distribution')

In [None]:
# Row counts per sex category.
fig = plt.figure(figsize=(5, 5))
ax = sns.countplot(data=insurance_df, x='sex')
ax.set_title('Sex distribution')

In [None]:
# Row counts per value of the children column.
fig = plt.figure(figsize=(15, 5))
ax = sns.countplot(data=insurance_df, x='children')
ax.set_title('Children distribution')

In [None]:
# Histogram over the region column.
fig = plt.figure(figsize=(14, 5))
ax = sns.histplot(insurance_df['region'])
ax.set_title('Region distribution')

In [None]:
# Histogram of the response variable (charges).
fig = plt.figure(figsize=(16, 5))
ax = sns.histplot(insurance_df['charges'])
ax.set_title('Cost distribution')

In [None]:
# Box plots of charges by sex, split by smoker status.
# catplot is figure-level: it creates its own figure sized by height/aspect,
# so no plt.figure() is opened first (that only produced an empty figure).
sns.catplot(x="sex", y="charges",
                hue="smoker",
                data=insurance_df, kind="box",
                height=8, aspect=2)

## 2. Feature encoding and feature engineering

##### We need to encode the categorical variables and we create two more variables to see if they improve the performance of the model.

##### We create a variable that flags smokers with a BMI greater than 30.0.

##### We create another variable for males with children.

In [None]:
# 0/1 indicators for the two-level categoricals.
insurance_df['sex_binary'] = (insurance_df['sex'] == 'male').astype('int64')
insurance_df['smoker_binary'] = (insurance_df['smoker'] == 'yes').astype('int64')
# Region both as an integer code and as one indicator column per region.
insurance_df['region_ordinal'] = insurance_df['region'].map(
    {'northeast':0, 'northwest':1, 'southeast':2, 'southwest':3}
)
region_dummies = pd.get_dummies(insurance_df['region'])
insurance_df = pd.concat([insurance_df, region_dummies], axis=1)
# Engineered flags: smoker with bmi above 30.0, and male with at least one child.
insurance_df['not_healthy'] = (
    (insurance_df['bmi'] > 30.0).astype('int64') * insurance_df['smoker_binary']
)
insurance_df['father'] = (
    (insurance_df['children'] > 0).astype('int64') * insurance_df['sex_binary']
)

In [None]:
# Fixed column layout: raw categoricals, encodings, numeric predictors, response last.
cols_order = [
    'region', 'sex', 'smoker',
    'region_ordinal', 'northeast', 'northwest', 'southeast', 'southwest',
    'children', 'father', 'not_healthy', 'smoker_binary', 'sex_binary',
    'age', 'bmi', 'charges',
]
insurance_df = insurance_df.loc[:, cols_order]
# Independent copy from which the predictor matrix is carved.
X = insurance_df.copy()

In [None]:
# Response as a one-column DataFrame.
y = pd.DataFrame(insurance_df['charges'])
# Remove the response, the raw string columns and the region dummies from X;
# region stays in X only through region_ordinal.
X = X.drop(
    columns=['charges', 'region', 'sex', 'smoker',
             'northeast', 'northwest', 'southeast', 'southwest'],
)

In [None]:
# Correlate numeric/boolean columns only: the frame still holds the raw string
# columns (region, sex, smoker), which DataFrame.corr() rejects on pandas >= 2.0.
corr = insurance_df.select_dtypes(include=['number', 'bool']).corr()
plt.figure(figsize=(13,10))
# Mask the upper triangle (diagonal included); the matrix is symmetric.
mask = np.triu(np.ones_like(corr, dtype=bool))
ax = sns.heatmap(data=corr, annot=True, mask=mask)
# NOTE(review): presumably a workaround for heatmap rows being clipped on some
# matplotlib releases — confirm it is still needed in the target environment.
bottom, top = ax.get_ylim()
ax.set_ylim(bottom + 0.5, top - 0.5)

In [None]:
# Charges against bmi; colour and marker size follow smoker_binary, marker shape follows sex_binary.
fig = plt.figure(figsize=(16, 8))
sns.scatterplot(
    x=insurance_df['bmi'],
    y=insurance_df['charges'],
    hue=insurance_df['smoker_binary'],
    style=insurance_df['sex_binary'],
    size=5 + 5*insurance_df['smoker_binary'],
)
plt.gca().set_title('BMI vs Insurance charges segmented by smoke')
plt.gca().set_xlabel('BMI')
plt.gca().set_ylabel('Insurance charges')

In [None]:
# Charges against age; colour follows bmi, marker shape sex_binary, marker size smoker_binary.
fig = plt.figure(figsize=(16, 8))
sns.scatterplot(
    x=insurance_df['age'],
    y=insurance_df['charges'],
    hue=insurance_df['bmi'],
    style=insurance_df['sex_binary'],
    size=5 + 5*insurance_df['smoker_binary'],
)
plt.gca().set_title('Age vs Insurance charges segmented by BMI')
plt.gca().set_xlabel('Age')
plt.gca().set_ylabel('Insurance charges')

In [None]:
# Charges against the region code; colour follows sex, marker shape sex_binary, marker size bmi.
fig = plt.figure(figsize=(16, 8))
sns.scatterplot(
    x=insurance_df['region_ordinal'],
    y=insurance_df['charges'],
    hue=insurance_df['sex'],
    style=insurance_df['sex_binary'],
    size=insurance_df['bmi'],
)
plt.gca().set_title('Region vs Insurance charges segmented by BMI')
plt.gca().set_xlabel('Region')
plt.gca().set_ylabel('Insurance charges')

In [None]:
# Charges against number of children; colour follows sex, marker size follows bmi.
# The title and x label now name the plotted variable (children); they
# previously said "Age", copied from the age plot.
fig = plt.figure(figsize=(16,8))
sns.scatterplot(x=insurance_df['children'],y=insurance_df['charges'],hue=insurance_df['sex'],style=insurance_df['sex_binary'],size=5 + 5*insurance_df['bmi'])
plt.title('Children vs Insurance charges segmented by BMI')
plt.xlabel('Number of children')
plt.ylabel('Insurance charges')

In [None]:
# Interactive 3-D view of charges over bmi and age; smokers get the larger markers.
fig = px.scatter_3d(
    insurance_df,
    x='bmi',
    y='age',
    z='charges',
    color='smoker_binary',
    size=10*(0.1+insurance_df['smoker_binary']),
    size_max=20,
    symbol='sex_binary',
    opacity=1,
    range_color=(0,2),
)

# Drop all outer margins so the scene fills the output area.
fig.update_layout(margin={'l': 0, 'r': 0, 'b': 0, 't': 0})

#### Train, test split sets

In [None]:
# Hold out 20% of the rows for testing; random_state is fixed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)

## 3. Model performance contrast

##### First we use all the variables for a performance benchmark

In [None]:
# Baseline model (model 0): fit on every column of X_train.
reg_statistics, anova, F, p_value, coefs_df, res = linearRegression(X_train,y_train)

In [None]:
# Show the fit statistics table of model 0.
reg_statistics

In [None]:
# Show the coefficient table of model 0.
coefs_df

In [None]:
# Training residuals of model 0 against a running row counter.
fig = plt.figure(figsize=(16, 8))
sns.scatterplot(x=list(range(len(res))), y=res['residuals'])
plt.gca().set_title('Model residuals error')
plt.gca().set_xlabel('Index')
plt.gca().set_ylabel('Residual error')

In [None]:
# Held-out MAE and R^2 of model 0.
y_test_pred = predict(coefs_df, X_test)
model0_mae = mean_absolute_error(y_true=y_test, y_pred=y_test_pred)
r0_test = metrics.r2_score(y_true=y_test, y_pred=y_test_pred)
print(model0_mae, r0_test)

##### From the coefficients table of the above model, we can see that some variables are not statistically significant (father, sex_binary).

In [None]:
# Model 1 predictors: the baseline set without father and sex_binary.
aux1_cols = [
    'region_ordinal', 'children', 'not_healthy',
    'smoker_binary', 'age', 'bmi',
]
X_train1 = X_train.loc[:, aux1_cols]

In [None]:
# Fit model 1 on the reduced predictor set.
reg_statistics1, anova1, F1, p_value1, coefs_df1, res1 = linearRegression(X_train1,y_train)

In [None]:
# Show the fit statistics table of model 1.
reg_statistics1

In [None]:
# Show the coefficient table of model 1.
coefs_df1

In [None]:
# Training residuals of model 1 against a running row counter.
fig = plt.figure(figsize=(16, 8))
sns.scatterplot(x=list(range(len(res1))), y=res1['residuals'])
plt.gca().set_title('Model residuals error')
plt.gca().set_xlabel('Index')
plt.gca().set_ylabel('Residual error')

In [None]:
# Held-out MAE and R^2 of model 1 (same column order as used for fitting).
y1_test_pred = predict(coefs_df1, X_test[aux1_cols])
model1_mae = mean_absolute_error(y_true=y_test, y_pred=y1_test_pred)
r1_test = metrics.r2_score(y_true=y_test, y_pred=y1_test_pred)
print(model1_mae, r1_test)

##### We are going to try two more models taking out more variables, and see if we can improve the performance.

In [None]:
# Model 2 predictors: model 1 without region_ordinal and children.
aux2_cols = ['not_healthy', 'smoker_binary', 'age', 'bmi']
X_train2 = X_train.loc[:, aux2_cols]

In [None]:
# Fit model 2 on its four predictors.
reg_statistics2, anova2, F2, p_value2, coefs_df2, res2 = linearRegression(X_train2,y_train)

In [None]:
# Show the fit statistics table of model 2.
reg_statistics2

In [None]:
# Show the coefficient table of model 2.
coefs_df2

In [None]:
# Training residuals of model 2 against a running row counter.
fig = plt.figure(figsize=(16, 8))
sns.scatterplot(x=list(range(len(res2))), y=res2['residuals'])
plt.gca().set_title('Model residuals error')
plt.gca().set_xlabel('Index')
plt.gca().set_ylabel('Residual error')

In [None]:
# Held-out MAE and R^2 of model 2.
y2_test_pred = predict(coefs_df2, X_test[aux2_cols])
model2_mae = mean_absolute_error(y_true=y_test, y_pred=y2_test_pred)
r2_test = metrics.r2_score(y_true=y_test, y_pred=y2_test_pred)
print(model2_mae, r2_test)

In [None]:
# Model 3 predictors: model 2 without the engineered not_healthy flag.
aux3_cols = ['smoker_binary', 'age', 'bmi']
X_train3 = X_train.loc[:, aux3_cols]

In [None]:
# Fit model 3 on its three predictors.
reg_statistics3, anova3, F3, p_value3, coefs_df3, res3 = linearRegression(X_train3,y_train)

In [None]:
# Show the fit statistics table of model 3.
reg_statistics3

In [None]:
# Show the coefficient table of model 3.
coefs_df3

In [None]:
# Training residuals of model 3 against a running row counter.
fig = plt.figure(figsize=(16, 8))
sns.scatterplot(x=list(range(len(res3))), y=res3['residuals'])
plt.gca().set_title('Model residuals error')
plt.gca().set_xlabel('Index')
plt.gca().set_ylabel('Residual error')

In [None]:
# Held-out MAE and R^2 of model 3.
y3_test_pred = predict(coefs_df3, X_test[aux3_cols])
model3_mae = mean_absolute_error(y_true=y_test, y_pred=y3_test_pred)
r3_test = metrics.r2_score(y_true=y_test, y_pred=y3_test_pred)
print(model3_mae, r3_test)

In [None]:
# Gather the held-out MAE and R^2 of the four models into one table for plotting.
models = ['model_0','model_1','model_2','model_3']
maes = [model0_mae,model1_mae,model2_mae,model3_mae]
r2s = [r0_test,r1_test,r2_test,r3_test]
mae_df = pd.DataFrame({'model':models,'mae':maes,'r2':r2s})

In [None]:
# Held-out MAE per model.
fig = plt.figure(figsize=(5, 5))
ax = sns.barplot(data=mae_df, x='model', y='mae')
ax.set_title('Model performance')

In [None]:
# Held-out R^2 per model.
fig = plt.figure(figsize=(5, 5))
ax = sns.barplot(data=mae_df, x='model', y='r2')
ax.set_title('Model performance')

## 4. Conclusions

##### The best performance came from model 2, which uses the variables (not_healthy, smoker_binary, age, bmi), so the feature we constructed (not_healthy) did improve the model.

In [None]:
# Model 2 predictions for every row of X (training and test rows alike),
# followed by the MAE over that full set.
y_pred = predict(coefs_df2, X[aux2_cols])
mean_absolute_error(y, y_pred)

In [None]:
# R^2 over the full dataset (training rows included). It gets its own name so
# that r2_test, the held-out score of model 2, is not overwritten.
r2_full = metrics.r2_score(y, y_pred)
r2_full

In [None]:
# Observed charges on x against the model 2 predictions on y.
fig = plt.figure(figsize=(16, 8))
sns.scatterplot(x=y['charges'], y=y_pred)
plt.gca().set_title('Actual insurance charges vs Predicted insurance charges')
plt.gca().set_xlabel('Actual insurance charges')
plt.gca().set_ylabel('Predicted insurance charges')

In [None]:
# Observed charges against bmi, for comparison with the predicted-charges plot.
fig = plt.figure(figsize=(16, 8))
sns.scatterplot(
    x=insurance_df['bmi'],
    y=insurance_df['charges'],
    hue=insurance_df['smoker_binary'],
    style=insurance_df['sex_binary'],
    size=5 + 5*insurance_df['smoker_binary'],
)
plt.gca().set_title('BMI vs Insurance charges segmented by smoke')
plt.gca().set_xlabel('BMI')
plt.gca().set_ylabel('Insurance charges')

In [None]:
# Predicted charges against bmi, with the same encodings as the observed-charges plot.
fig = plt.figure(figsize=(16, 8))
sns.scatterplot(
    x=insurance_df['bmi'],
    y=y_pred,
    hue=insurance_df['smoker_binary'],
    style=insurance_df['sex_binary'],
    size=5 + 5*insurance_df['smoker_binary'],
)
plt.gca().set_title('BMI vs Predicted insurance charges segmented by smoke')
plt.gca().set_xlabel('BMI')
plt.gca().set_ylabel('Predicted insurance charges')

#### From the last two plots we see that the predictions are similar to the real values, and because the model uses only a few variables it respects the principle of parsimony.