# Logistic Regression Model 1: All Features

In [2]:
# read csv into DataFrame
# import pandas for data manipulation

# Title: panda-dev/pandas
# Author: The pandas development team
# Date: 2023
# Code Version: latest
# Availability: https://doi.org/10.5281/zenodo.7741580

import pandas as pd

df = pd.read_csv('medical_prepared_data2.csv', index_col=0)

In [3]:
# install statsmodels
# !pip install statsmodels

In [4]:
# import statsmodels to perform linear regression
import statsmodels.api as sm

# Title: statsmodels/statsmodels
# Author: Seabold, et al.
# Date: 2017
# Code Version: latest
# Availability: https://doi.org/10.5281/zenodo.275519
 
# define the variables
x = df.drop(columns='ReAdmis').assign(const=1)
y = df['ReAdmis']
 
# perform the regression and fitting the model
model1 = sm.GLM(y, x, family=sm.families.Binomial()).fit()
model1.summary()

0,1,2,3
Dep. Variable:,ReAdmis,No. Observations:,9331.0
Model:,GLM,Df Residuals:,9297.0
Model Family:,Binomial,Df Model:,33.0
Link Function:,Logit,Scale:,1.0
Method:,IRLS,Log-Likelihood:,-308.78
Date:,"Mon, 27 Mar 2023",Deviance:,617.56
Time:,20:18:23,Pearson chi2:,1320.0
No. Iterations:,13,Pseudo R-squ. (CS):,0.7129
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Population,3.309e-06,8.96e-06,0.369,0.712,-1.42e-05,2.09e-05
Children,0.0931,0.053,1.767,0.077,-0.010,0.196
Age,0.0016,0.005,0.310,0.757,-0.008,0.011
Income,-7.68e-07,3.99e-06,-0.192,0.848,-8.6e-06,7.06e-06
VitD_levels,0.0328,0.052,0.633,0.527,-0.069,0.134
Doc_visits,-0.0022,0.103,-0.022,0.983,-0.204,0.199
Full_meals_eaten,-0.0173,0.108,-0.160,0.873,-0.230,0.195
vitD_supp,0.0138,0.185,0.074,0.941,-0.349,0.377
Soft_drink,0.3146,0.248,1.266,0.206,-0.172,0.802


## Evaluation

In [32]:
# import sklearn for metrics
from sklearn import metrics

# generate predictions; round to 0 or 1
y_pred = round(model1.predict(x))

#calculate accuracy
metrics.accuracy_score(y, y_pred)

0.9861751152073732

In [33]:
# import sklearn for metrics
from sklearn import metrics

# generate confusion matrix
metrics.confusion_matrix(y, y_pred)

array([[5843,   67],
       [  62, 3359]], dtype=int64)

In [34]:
# calculate VIF to detect multicollinearity
from statsmodels.stats.outliers_influence import variance_inflation_factor

# create VIF dataframe
vif_data = pd.DataFrame()
vif_data["feature"] = x.columns
  
# calculate VIF for each feature
vif_data["VIF"] = [variance_inflation_factor(x.values, i)
                          for i in range(len(x.columns))]
  
vif_data

Unnamed: 0,feature,VIF
0,Population,1.002998
1,Children,1.003669
2,Age,1.004163
3,Income,1.003025
4,VitD_levels,1.00447
5,Doc_visits,1.003784
6,Full_meals_eaten,1.005529
7,vitD_supp,1.004694
8,Soft_drink,1.004004
9,HighBlood,1.00384


# Logistic Regression Model 2: Reduced Features

In [40]:
# import packages from scikit-learn for feature reduction
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import SequentialFeatureSelector

# Title: scikit-learn/scikit-learn
# Author: Grisel, et al.
# Date: 2022
# Code Version: latest
# Availability: https://doi.org/10.5281/zenodo.6543413

# re-define independent variables
x = df.drop(columns='ReAdmis')

# logistic regression estimator to be used in feature selection
reg = LogisticRegression(max_iter=1000).fit(x, y)

In [41]:
# reduce features using SequentialFeatureSelector
sfs = SequentialFeatureSelector(reg, n_features_to_select=8)
sfs.fit(x, y)

In [42]:
# return selected features
sfs.get_feature_names_out()

array(['Stroke', 'Complication_risk', 'Arthritis', 'Asthma',
       'Initial_days', 'Initial_admin_Emergency Admission',
       'Services_CT Scan', 'Services_MRI'], dtype=object)

In [43]:
# reduce x to selected features
x = df[['Stroke', 'Complication_risk', 'Arthritis', 'Asthma',
       'Initial_days', 'Initial_admin_Emergency Admission',
       'Services_CT Scan', 'Services_MRI']].assign(const=1)

In [44]:
# re-perform linear regression 
# perform the regression and fitting the model
model2 = sm.GLM(y, x, family=sm.families.Binomial()).fit()
model2.summary()

0,1,2,3
Dep. Variable:,ReAdmis,No. Observations:,9331.0
Model:,GLM,Df Residuals:,9322.0
Model Family:,Binomial,Df Model:,8.0
Link Function:,Logit,Scale:,1.0
Method:,IRLS,Log-Likelihood:,-343.14
Date:,"Mon, 27 Mar 2023",Deviance:,686.29
Time:,20:58:00,Pearson chi2:,1230.0
No. Iterations:,12,Pseudo R-squ. (CS):,0.7108
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Stroke,1.5458,0.259,5.959,0.000,1.037,2.054
Complication_risk,0.8125,0.140,5.784,0.000,0.537,1.088
Arthritis,-1.2324,0.216,-5.714,0.000,-1.655,-0.810
Asthma,-1.1419,0.224,-5.090,0.000,-1.582,-0.702
Initial_days,1.3273,0.072,18.372,0.000,1.186,1.469
Initial_admin_Emergency Admission,1.9499,0.220,8.846,0.000,1.518,2.382
Services_CT Scan,1.4073,0.337,4.172,0.000,0.746,2.069
Services_MRI,2.6500,0.487,5.446,0.000,1.696,3.604
const,-73.7939,4.024,-18.340,0.000,-81.680,-65.908


## Evaluation

In [45]:
# re-evaluate model
# generate predictions; round to 0 or 1
y_pred = round(model2.predict(x))

#calculate accuracy
metrics.accuracy_score(y, y_pred)

0.9839245525667131

In [46]:
# re-generate confusion matrix
metrics.confusion_matrix(y, y_pred)

array([[5831,   79],
       [  71, 3350]], dtype=int64)

In [47]:
# re-create VIF dataframe
vif_data = pd.DataFrame()
vif_data["feature"] = x.columns
  
# re-calculate VIF for each feature
vif_data["VIF"] = [variance_inflation_factor(x.values, i)
                          for i in range(len(x.columns))]
  
vif_data

Unnamed: 0,feature,VIF
0,Stroke,1.000699
1,Complication_risk,1.000917
2,Arthritis,1.001204
3,Asthma,1.00052
4,Initial_days,1.001277
5,Initial_admin_Emergency Admission,1.000458
6,Services_CT Scan,1.006297
7,Services_MRI,1.005848
8,const,7.618545


In [48]:
# import matplotlib to create visualizations

# Title: matplotlib/matplotlib
# Author: Caswell, et al.
# Date: 2023
# Code Version: latest
# Availability: https://doi.org/10.5281/zenodo.7697899

# Title: seaborn: statistical data visualization
# Author: Michael Waskom
# Date: 2021
# Code Version: latest
# Availability: https://doi.org/10.5281/zenodo.4645478

import matplotlib.pyplot as plt
import seaborn as sns

# TODO