# Linear Regression Model 1: All Features

In [165]:
# import pandas for data manipulation
import pandas as pd

# read medical_prepared_data.csv into a DataFrame,
# using the first CSV column as the row index
df = pd.read_csv(filepath_or_buffer='medical_prepared_data.csv', index_col=0)

In [166]:
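# install statsmodels if it is not already available (uncomment to run);
# %pip installs into the environment of the running kernel
# %pip install statsmodels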

In [167]:
# pandas for data manipulation (also imported in the data-loading cell)
import pandas as pd

# statsmodels provides the OLS linear regression used below
import statsmodels.api as sm

# dependent variable: the Additional_charges column
y = df['Additional_charges']

# independent variables: every other column of df, plus a constant column
# of ones named 'const' so the fitted model includes an intercept term
x = df.drop('Additional_charges', axis=1).assign(const=1)

# fit ordinary least squares of y on x and display the regression summary
model1 = sm.OLS(endog=y, exog=x).fit()
model1.summary()

0,1,2,3
Dep. Variable:,Additional_charges,R-squared:,0.939
Model:,OLS,Adj. R-squared:,0.939
Method:,Least Squares,F-statistic:,4457.0
Date:,"Wed, 22 Mar 2023",Prob (F-statistic):,0.0
Time:,21:17:30,Log-Likelihood:,-82196.0
No. Observations:,9331,AIC:,164500.0
Df Residuals:,9298,BIC:,164700.0
Df Model:,32,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Population,0.0006,0.001,0.414,0.679,-0.002,0.003
Children,18.4695,8.863,2.084,0.037,1.095,35.844
Age,225.5985,0.816,276.420,0.000,223.999,227.198
Income,0.0002,0.001,0.256,0.798,-0.001,0.001
VitD_levels,-12.9507,8.472,-1.529,0.126,-29.558,3.656
Doc_visits,-11.1326,16.190,-0.688,0.492,-42.868,20.603
Full_meals_eaten,22.9622,17.219,1.334,0.182,-10.790,56.715
vitD_supp,9.6960,28.743,0.337,0.736,-46.647,66.039
Soft_drink,13.0740,38.464,0.340,0.734,-62.324,88.472

0,1,2,3
Omnibus:,1472.775,Durbin-Watson:,2.007
Prob(Omnibus):,0.0,Jarque-Bera (JB):,336.224
Skew:,-0.024,Prob(JB):,9.77e-74
Kurtosis:,2.071,Cond. No.,553000.0


In [168]:
# evaluate model 1 with the root-mean-square error (RMSE)
from statsmodels.tools.eval_measures import rmse

# predictions from model 1 for the rows in x
y_pred = model1.predict(exog=x)

# RMSE between the observed y and the predictions
RMSE1 = rmse(y, y_pred)
RMSE1

1619.679345691746

In [169]:
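# install tabulate if it is not already available (uncomment to run);
# %pip installs into the environment of the running kernel
# %pip install tabulate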

In [170]:
# calculate the variance inflation factor (VIF) to detect multicollinearity
from statsmodels.stats.outliers_influence import variance_inflation_factor

# tabulate is imported here but is not referenced in this cell
import tabulate

# build the VIF table in one step: one row per column of x, with the VIF
# computed from the column's positional index in x.values
vif_data = pd.DataFrame({
    "feature": x.columns,
    "VIF": [variance_inflation_factor(x.values, pos)
            for pos, _ in enumerate(x.columns)],
})

vif_data

Unnamed: 0,feature,VIF
0,Population,1.002979
1,Children,1.003222
2,Age,1.004033
3,Income,1.00295
4,VitD_levels,1.004465
5,Doc_visits,1.003769
6,Full_meals_eaten,1.005218
7,vitD_supp,1.004086
8,Soft_drink,1.004003
9,HighBlood,1.003691


# Linear Regression Model 2: Reduced Features

In [185]:
# scikit-learn tools used for feature reduction
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import SequentialFeatureSelector

# rebuild the independent variables from df: every column except the
# target, this time without the added 'const' column
x = df.drop(labels='Additional_charges', axis='columns')

# linear regression estimator to be used in feature selection; it is also
# fitted here on the full feature set
# NOTE(review): SequentialFeatureSelector is documented to fit clones of the
# estimator, so this up-front fit may be unnecessary - confirm before removing
reg = LinearRegression().fit(x, y)

In [186]:
# select 16 features with SequentialFeatureSelector, using reg as the
# estimator; every other selector option is left at its default
sfs = SequentialFeatureSelector(estimator=reg, n_features_to_select=16)
sfs.fit(X=x, y=y)

In [187]:
# display the names of the features kept by the fitted selector
sfs.get_feature_names_out()

array(['Children', 'Age', 'VitD_levels', 'Full_meals_eaten', 'HighBlood',
       'Stroke', 'Complication_risk', 'Arthritis', 'Diabetes',
       'Reflux_esophagitis', 'Asthma', 'Gender_Male',
       'Initial_admin_Emergency Admission',
       'Initial_admin_Observation Admission', 'Services_Intravenous',
       'Services_MRI'], dtype=object)

In [188]:
# reduce x to the features chosen by the fitted selector. The names are read
# directly from sfs rather than from a hand-copied list, so this cell cannot
# drift out of sync if the data or the selector settings change. A constant
# column of ones named 'const' is appended for the OLS intercept.
x = df[list(sfs.get_feature_names_out())].assign(const=1)

In [189]:
# refit ordinary least squares of y on the current x and display the summary
model2 = sm.OLS(endog=y, exog=x).fit()
model2.summary()

0,1,2,3
Dep. Variable:,Additional_charges,R-squared:,0.939
Model:,OLS,Adj. R-squared:,0.939
Method:,Least Squares,F-statistic:,8923.0
Date:,"Wed, 22 Mar 2023",Prob (F-statistic):,0.0
Time:,21:22:56,Log-Likelihood:,-82199.0
No. Observations:,9331,AIC:,164400.0
Df Residuals:,9314,BIC:,164600.0
Df Model:,16,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Children,18.3131,8.850,2.069,0.039,0.965,35.661
Age,225.5533,0.815,276.822,0.000,223.956,227.150
VitD_levels,-12.8290,8.463,-1.516,0.130,-29.419,3.761
Full_meals_eaten,22.9799,17.186,1.337,0.181,-10.709,56.669
HighBlood,8632.3051,34.183,252.530,0.000,8565.298,8699.312
Stroke,356.2724,42.006,8.481,0.000,273.931,438.614
Complication_risk,273.3831,23.025,11.874,0.000,228.250,318.516
Arthritis,-61.9628,35.045,-1.768,0.077,-130.658,6.732
Diabetes,45.4298,37.678,1.206,0.228,-28.427,119.287

0,1,2,3
Omnibus:,1472.445,Durbin-Watson:,2.006
Prob(Omnibus):,0.0,Jarque-Bera (JB):,336.117
Skew:,-0.023,Prob(JB):,1.0300000000000001e-73
Kurtosis:,2.071,Cond. No.,602.0


In [190]:
# re-evaluate: RMSE of model 2
# predictions from model 2 for the rows in x (reuses the name y_pred)
y_pred = model2.predict(exog=x)

# RMSE between the observed y and the predictions
RMSE2 = rmse(y, y_pred)
RMSE2

1620.2192432588993

In [191]:
# re-create the VIF table for the current x: one row per column, with the
# VIF computed from the column's positional index
vif_data = pd.DataFrame({
    "feature": list(x.columns),
    "VIF": [variance_inflation_factor(x.to_numpy(), idx)
            for idx in range(x.shape[1])],
})

vif_data

Unnamed: 0,feature,VIF
0,Children,1.00126
1,Age,1.001771
2,VitD_levels,1.003501
3,Full_meals_eaten,1.002494
4,HighBlood,1.001293
5,Stroke,1.001214
6,Complication_risk,1.001731
7,Arthritis,1.001301
8,Diabetes,1.002033
9,Reflux_esophagitis,1.001311
