# Logistic Regression Model 1: All Features

In [1]:
# read csv into DataFrame
# import pandas for data manipulation

# Title: pandas-dev/pandas
# Author: The pandas development team
# Date: 2023
# Code Version: latest
# Availability: https://doi.org/10.5281/zenodo.7741580

import pandas as pd

# load the prepared dataset; the first CSV column becomes the row index
df = pd.read_csv(
    'medical_prepared_data2.csv',
    index_col=0,
)

In [2]:
# install statsmodels (uncomment if it is not already available in the kernel's environment)
# %pip install statsmodels

In [3]:
# import statsmodels to perform logistic regression
import statsmodels.api as sm

# Title: statsmodels/statsmodels
# Author: Seabold, et al.
# Date: 2017
# Code Version: latest
# Availability: https://doi.org/10.5281/zenodo.275519
 
# target: the 'ReAdmis' column
# predictors: every other column, plus a constant column of ones
# so the fitted model has an intercept term
y = df['ReAdmis']
x = df.drop(columns='ReAdmis').assign(const=1)

# fit a binomial-family GLM on all predictors and display the fit summary
model1 = sm.GLM(endog=y, exog=x, family=sm.families.Binomial()).fit()
model1.summary()

0,1,2,3
Dep. Variable:,ReAdmis,No. Observations:,9331.0
Model:,GLM,Df Residuals:,9297.0
Model Family:,Binomial,Df Model:,33.0
Link Function:,Logit,Scale:,1.0
Method:,IRLS,Log-Likelihood:,-308.78
Date:,"Thu, 30 Mar 2023",Deviance:,617.56
Time:,15:51:30,Pearson chi2:,1320.0
No. Iterations:,13,Pseudo R-squ. (CS):,0.7129
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Population,3.309e-06,8.96e-06,0.369,0.712,-1.42e-05,2.09e-05
Children,0.0931,0.053,1.767,0.077,-0.010,0.196
Age,0.0016,0.005,0.310,0.757,-0.008,0.011
Income,-7.68e-07,3.99e-06,-0.192,0.848,-8.6e-06,7.06e-06
VitD_levels,0.0328,0.052,0.633,0.527,-0.069,0.134
Doc_visits,-0.0022,0.103,-0.022,0.983,-0.204,0.199
Full_meals_eaten,-0.0173,0.108,-0.160,0.873,-0.230,0.195
vitD_supp,0.0138,0.185,0.074,0.941,-0.349,0.377
Soft_drink,0.3146,0.248,1.266,0.206,-0.172,0.802


## Evaluation

### Accuracy

In [4]:
# import sklearn for metrics
from sklearn import metrics

# predicted probabilities from model 1, rounded to a 0/1 class label
# NOTE: x holds the same rows the model was fitted on, so this is in-sample
y_pred = model1.predict(x).round()

# fraction of rows whose predicted label equals the observed label
metrics.accuracy_score(y_true=y, y_pred=y_pred)

0.9861751152073732

### Confusion Matrix

In [5]:
# import sklearn for metrics
from sklearn import metrics

# cross-tabulate the observed labels against model 1's predicted labels
metrics.confusion_matrix(y_true=y, y_pred=y_pred)

array([[5843,   67],
       [  62, 3359]], dtype=int64)

### Multicollinearity

In [6]:
# calculate VIF to detect multicollinearity
from statsmodels.stats.outliers_influence import variance_inflation_factor

# build the VIF table in one step: one row per column of x,
# holding the column name and its variance inflation factor
vif_data = pd.DataFrame({
    "feature": x.columns,
    "VIF": [
        variance_inflation_factor(x.values, col_idx)
        for col_idx in range(x.shape[1])
    ],
})

vif_data

Unnamed: 0,feature,VIF
0,Population,1.002998
1,Children,1.003669
2,Age,1.004163
3,Income,1.003025
4,VitD_levels,1.00447
5,Doc_visits,1.003784
6,Full_meals_eaten,1.005529
7,vitD_supp,1.004694
8,Soft_drink,1.004004
9,HighBlood,1.00384


# Logistic Regression Model 2: Reduced Features

In [7]:
# import packages from scikit-learn for feature reduction
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import SequentialFeatureSelector

# Title: scikit-learn/scikit-learn
# Author: Grisel, et al.
# Date: 2022
# Code Version: latest
# Availability: https://doi.org/10.5281/zenodo.6543413

# rebuild the predictor frame from every column except the target
# (this replaces the earlier x, so the constant column is no longer present)
x = df.drop(columns='ReAdmis')

# base logistic-regression estimator that is handed to the feature selector
# NOTE(review): SequentialFeatureSelector is documented to take an unfitted
# estimator, so this up-front fit looks unnecessary for the selection step;
# confirm nothing else relies on the fitted `reg` before removing it
reg = LogisticRegression(max_iter=1000)
reg = reg.fit(x, y)

In [9]:
# sequential feature selection driven by the logistic-regression estimator;
# keep 4 features and leave every other selector option at its default
sfs = SequentialFeatureSelector(estimator=reg, n_features_to_select=4)
sfs.fit(x, y)

In [10]:
# display the names of the features kept by the fitted selector;
# these names are copied by hand into the next cell to build the reduced x
sfs.get_feature_names_out()

array(['Stroke', 'Complication_risk', 'Initial_days',
       'Initial_admin_Emergency Admission'], dtype=object)

In [11]:
# keep only the four features reported by the selector and append
# a constant column of ones so the refitted model has an intercept term
x = df.loc[
    :,
    [
        'Stroke',
        'Complication_risk',
        'Initial_days',
        'Initial_admin_Emergency Admission',
    ],
].assign(const=1)

In [12]:
# refit the logistic regression (binomial-family GLM) on the reduced
# feature set and display the fit summary
model2 = sm.GLM(endog=y, exog=x, family=sm.families.Binomial()).fit()
model2.summary()

0,1,2,3
Dep. Variable:,ReAdmis,No. Observations:,9331.0
Model:,GLM,Df Residuals:,9326.0
Model Family:,Binomial,Df Model:,4.0
Link Function:,Logit,Scale:,1.0
Method:,IRLS,Log-Likelihood:,-396.11
Date:,"Thu, 30 Mar 2023",Deviance:,792.22
Time:,15:55:22,Pearson chi2:,2010.0
No. Iterations:,12,Pseudo R-squ. (CS):,0.7075
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Stroke,1.3984,0.240,5.832,0.000,0.928,1.868
Complication_risk,0.6286,0.127,4.941,0.000,0.379,0.878
Initial_days,1.1260,0.057,19.813,0.000,1.015,1.237
Initial_admin_Emergency Admission,1.6313,0.198,8.244,0.000,1.243,2.019
const,-62.9604,3.186,-19.764,0.000,-69.204,-56.717


## Evaluation

### Accuracy

In [13]:
# re-evaluate using the reduced model
# predicted probabilities from model 2, rounded to a 0/1 class label
# NOTE: x holds the same rows the model was fitted on, so this is in-sample
y_pred = model2.predict(x).round()

# fraction of rows whose predicted label equals the observed label
metrics.accuracy_score(y_true=y, y_pred=y_pred)

0.9814596506269424

### Confusion Matrix

In [14]:
# cross-tabulate the observed labels against model 2's predicted labels
metrics.confusion_matrix(y_true=y, y_pred=y_pred)

array([[5824,   86],
       [  87, 3334]], dtype=int64)

### Multicollinearity

In [15]:
# rebuild the VIF table for the reduced design matrix: one row per column
# of x (including the constant), with its name and variance inflation factor
vif_data = pd.DataFrame({
    "feature": x.columns,
    "VIF": [
        variance_inflation_factor(x.values, col_idx)
        for col_idx in range(x.shape[1])
    ],
})

vif_data

Unnamed: 0,feature,VIF
0,Stroke,1.000261
1,Complication_risk,1.000282
2,Initial_days,1.000373
3,Initial_admin_Emergency Admission,1.00029
4,const,6.454001
