In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

import statsmodels.formula.api as smf
import statsmodels.api as sm

from sklearn.model_selection import KFold, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
from sklearn.linear_model import Lasso, LassoLarsIC, LassoCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_percentage_error, make_scorer
from sklearn.pipeline import make_pipeline

from sklearn.linear_model import Ridge
from sklearn.linear_model import RidgeCV, Ridge

### Functions used to summaries results:

In [2]:
def accuracy(model, y_test, x_test):
    pred = model.predict(x_test)
    mse = mean_squared_error(y_test, pred)
    mape = mean_absolute_percentage_error(y_test, pred)
    if model.__class__.__name__ == 'RegressionResultsWrapper':
        print('Multiple linear regression\n---------------------')
    
    else:
        print('Lasso regression')
        
    
    print(f'Mean Squared Error: {round(mse,2)} \nMean Absolute Percentage Error: {round(mape*100,2)}%')

In [3]:
df = pd.read_csv('insurance_dataset.csv')
df

Unnamed: 0,age,gender,bmi,children,smoker,region,medical_history,family_medical_history,exercise_frequency,occupation,coverage_level,charges
0,46,male,21.45,5,yes,southeast,Diabetes,,Never,Blue collar,Premium,20460.307669
1,25,female,25.38,2,yes,northwest,Diabetes,High blood pressure,Occasionally,White collar,Premium,20390.899218
2,38,male,44.88,2,yes,southwest,,High blood pressure,Occasionally,Blue collar,Premium,20204.476302
3,25,male,19.89,0,no,northwest,,Diabetes,Rarely,White collar,Standard,11789.029843
4,49,male,38.21,3,yes,northwest,Diabetes,High blood pressure,Rarely,White collar,Standard,19268.309838
...,...,...,...,...,...,...,...,...,...,...,...,...
999995,59,male,46.67,2,no,northeast,High blood pressure,,Frequently,Student,Basic,11584.134900
999996,33,male,36.83,2,no,northeast,,High blood pressure,Frequently,Unemployed,Basic,9834.871456
999997,39,male,39.84,0,yes,northeast,Heart disease,High blood pressure,Rarely,Blue collar,Standard,22076.632856
999998,37,female,45.06,4,yes,northeast,High blood pressure,Diabetes,Occasionally,Unemployed,Premium,20297.618728


## Medical insurance charges
- **Age**: The age of the insured individual.
- **Gender**: The gender of the insured individual.
- **BMI** (Body Mass Index): A measure of body fat based on height and weight.
- **Children**: The number of children covered by the insurance plan.
- **Smoking Status**: Indicates whether the individual is a smoker.
- **Region**: The geographical region of the insured individual.
- **Medical History**: Information about the individual's old medical problems.
- **Family Medical History**: Information about the family's medical record.
- **Exercise Frequency**: The frequency of the individual's exercise routine.
- **Occupation**: The occupation of the insured individual.
- **Coverage Level**: The type of insurance plan.
- **Charges**: The health insurance charges for the individual

Data is from Kaggle: https://www.kaggle.com/datasets/sridharstreaks/insurance-data-for-machine-learning/data


## Regressions

In [4]:
# Divide into dependent and independent vairables
X = df.drop('charges', axis=1)
y = df['charges']

# Split data into train and test.
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size= 0.30)
X_train['charges'] = y_train
training = X_train.copy()

In [5]:
# Good way to code regression with categorical variables
formula = 'charges ~ age + C(gender) + bmi + children + C(smoker) + C(region) + C(medical_history) + C(family_medical_history) + C(exercise_frequency) + C(occupation) + C(coverage_level)'
model = smf.ols(formula = formula, data = training).fit()
model.summary()

0,1,2,3
Dep. Variable:,charges,R-squared:,0.996
Model:,OLS,Adj. R-squared:,0.996
Method:,Least Squares,F-statistic:,7422000.0
Date:,"Fri, 01 Nov 2024",Prob (F-statistic):,0.0
Time:,11:07:21,Log-Likelihood:,-4958800.0
No. Observations:,700000,AIC:,9918000.0
Df Residuals:,699977,BIC:,9918000.0
Df Model:,22,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,1.05e+04,2.305,4553.435,0.000,1.05e+04,1.05e+04
C(gender)[T.male],999.6774,0.690,1449.003,0.000,998.325,1001.030
C(smoker)[T.yes],5000.2168,0.690,7247.585,0.000,4998.865,5001.569
C(region)[T.northwest],-699.4313,0.976,-716.990,0.000,-701.343,-697.519
C(region)[T.southeast],-498.2446,0.976,-510.592,0.000,-500.157,-496.332
C(region)[T.southwest],-797.8655,0.975,-818.195,0.000,-799.777,-795.954
C(medical_history)[T.Heart disease],3000.2458,0.976,3072.807,0.000,2998.332,3002.159
C(medical_history)[T.High blood pressure],-1001.6801,0.976,-1026.128,0.000,-1003.593,-999.767
C(medical_history)[T.None],-1999.8154,0.976,-2050.000,0.000,-2001.727,-1997.903

0,1,2,3
Omnibus:,630324.236,Durbin-Watson:,2.0
Prob(Omnibus):,0.0,Jarque-Bera (JB):,42102.596
Skew:,0.001,Prob(JB):,0.0
Kurtosis:,1.799,Cond. No.,397.0


In [6]:
# Out-of-sample testing
accuracy(model, y_test, X_test)

Multiple linear regression
---------------------
Mean Squared Error: 83509.42 
Mean Absolute Percentage Error: 1.62%


## Regularization with Lasso


Previously we used multiple linear regression to make our model by the OLS method. The model follows the format:
$$y = \alpha + \beta_1 x_1 + \beta_2 x_2 + ... \beta_n x_n$$
Where the $\beta$ parameters are set to minimize its cost function:
$$ \sum_{i=1}^n(y_i - \hat{y}_i)^2 $$
In regularization we add a penalty term to the cost function and try to minimize it. With Lasso, the cost function we will work with is:
$$ \sum_{i=1}^n(y_i - \hat{y}_i)^2 + \alpha \sum_{i=1}^n |\beta_i| $$ 

If a feature is less significant, it is shrunk towards zero. As a result, features whos coefficients have been shrunk to zero are functionally removed from the model, essentially performing feature selection. 


In [7]:
X = df.drop('charges', axis=1)
y = df['charges']

# Convert categorical variables to dummy variables
X = pd.get_dummies(X, drop_first=True)
X

Unnamed: 0,age,bmi,children,gender_male,smoker_yes,region_northwest,region_southeast,region_southwest,medical_history_Heart disease,medical_history_High blood pressure,...,family_medical_history_High blood pressure,family_medical_history_None,exercise_frequency_Never,exercise_frequency_Occasionally,exercise_frequency_Rarely,occupation_Student,occupation_Unemployed,occupation_White collar,coverage_level_Premium,coverage_level_Standard
0,46,21.45,5,1,1,0,1,0,0,0,...,0,1,1,0,0,0,0,0,1,0
1,25,25.38,2,0,1,1,0,0,0,0,...,1,0,0,1,0,0,0,1,1,0
2,38,44.88,2,1,1,0,0,1,0,0,...,1,0,0,1,0,0,0,0,1,0
3,25,19.89,0,1,0,1,0,0,0,0,...,0,0,0,0,1,0,0,1,0,1
4,49,38.21,3,1,1,1,0,0,0,0,...,1,0,0,0,1,0,0,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
999995,59,46.67,2,1,0,0,0,0,0,1,...,0,1,0,0,0,1,0,0,0,0
999996,33,36.83,2,1,0,0,0,0,0,0,...,1,0,0,0,0,0,1,0,0,0
999997,39,39.84,0,1,1,0,0,0,1,0,...,1,0,0,0,1,0,0,0,0,1
999998,37,45.06,4,0,1,0,0,0,0,1,...,0,0,0,1,0,0,1,0,1,0


In [8]:

lasso = Lasso(max_iter=10000)

# Lasso cross validation
test_alphas = np.linspace(0.01,4, 50)
lassocv = LassoCV(alphas=test_alphas, cv=10, max_iter=100000)

lassocv.fit(X, y)

In [16]:
# Alpha value choosen by the cross validation
# Choose the smallest alpha given as an option, showing little willingness to put less importance on any variables
lassocv.alpha_

0.01

In [17]:
lasso.set_params(alpha = lassocv.alpha_)

X_train_l, X_test_l, y_train_l, y_test_l = train_test_split(X,y, test_size= 0.30)

lasso.fit(X_train_l, y_train_l)

accuracy(lasso, y_test_l, X_test_l)

Lasso regression
Mean Squared Error: 83453.31 
Mean Absolute Percentage Error: 1.62%


#### Interpretation

Seems Lasso yields little improvement from the basic OLS model. This is of no suprise as on the summary for the OLS model it seemed to strongly indicate that all variables are important. Since Lasso is a linear regression with a penalty term that helps to prevent overfitting and facilitate automatic feature selection, it is not suprising that it did not yield much improvements as all features in the dataset seems important.

## Regression with Ridge regularization 

In [28]:
alphas = np.logspace(-4, 1, 100)
ridge_cv = make_pipeline(StandardScaler(), RidgeCV(alphas=alphas, scoring='neg_mean_squared_error')).fit(X_train_l, y_train_l)

best_alpha_ridge = ridge_cv[-1].alpha_
ridge_model = Ridge(alpha=best_alpha_ridge).fit(X_train_l, y_train_l)

y_pred_ridge = ridge_model.predict(X_test_l)
ridge_mse = mean_squared_error(y_test_l, y_pred_ridge)

print(f"Best alpha for Ridge Regression: {round(best_alpha_ridge,5)}")
print(f"Mean Squared Error on test data: {round(ridge_mse,2)}")

Best alpha for Ridge Regression: 0.08498
Mean Squared Error on test data: 83453.26


In [30]:
#Ridge with same alpha as for Lasso regression
ridge_cv = make_pipeline(StandardScaler(), RidgeCV(alphas=alphas, scoring='neg_mean_squared_error')).fit(X_train_l, y_train_l)
ridge_model = Ridge(alpha=best_alpha_ridge).fit(X_train_l, y_train_l)
y_pred_ridge = ridge_model.predict(X_test_l)
ridge_mse = mean_squared_error(y_test_l, y_pred_ridge)
accuracy(ridge_model, y_test_l, X_test_l)

Lasso regression
Mean Squared Error: 83453.26 
Mean Absolute Percentage Error: 1.62%
