## Standard imports
### Some may not be needed, but it's best to overprepare now

In [None]:
import statistics
import warnings

import numpy as np
import pandas as pd
from pandas import Series, DataFrame

import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

import pylab
from pylab import rcParams
import statsmodels.api as sm
import statsmodels.formula.api as smf
from statsmodels.api import add_constant
from scipy import stats
from scipy.stats import chisquare
from scipy.stats import chi2_contingency

import sklearn
from sklearn import preprocessing
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import classification_report

warnings.filterwarnings('ignore')


## Data Understanding (wrangling, renaming, etc.)

In [None]:
# Read the medical data CSV into a DataFrame and, in the same chained step,
# give the eight survey columns (Item1..Item8) descriptive names.
# NOTE(review): the CSV location is an absolute, machine-specific path.
df = pd.read_csv(
    '/Users/Spence604/Library/CloudStorage/OneDrive-WesternGovernorsUniversity/Docs/Med Data/medical_clean.csv'
).rename(
    columns={
        'Item1': 'Timely Admission',
        'Item2': 'Timely Treatment',
        'Item3': 'Timely Visits',
        'Item4': 'Reliability',
        'Item5': 'Options',
        'Item6': 'Hours of Treatment',
        'Item7': 'Courteous Staff',
        'Item8': 'Doc_listen',
    }
)

In [None]:
# Preview the first rows of the loaded frame, with the survey columns renamed.
df.head()

In [None]:
# Print the frame's structural summary (info() writes straight to the cell output).
df.info()

In [None]:
# Summary statistics for the frame before any columns are removed.
df.describe()

In [None]:
# List every column name before any columns are dropped.
df.columns

In [None]:
# These columns are not needed for the analysis: record identifiers,
# location fields, job/marital status and seven of the eight survey items
# (Doc_listen is kept). Drop them, then summarise what remains.
df = df.drop(
    [
        'CaseOrder', 'Customer_id', 'Interaction', 'UID',
        'City', 'State', 'County', 'Zip', 'Lat', 'Lng',
        'Population', 'Area', 'TimeZone',
        'Job', 'Marital',
        'Timely Admission', 'Timely Treatment', 'Timely Visits',
        'Reliability', 'Options', 'Hours of Treatment', 'Courteous Staff',
    ],
    axis='columns',
)
df.describe()


In [None]:
# Narrow the frame to the twelve columns used from here on
# (Overweight plus the eleven other variables).
df = df[[
    'Income', 'Children', 'Age', 'Full_meals_eaten', 'Initial_days',
    'Doc_listen', 'Overweight', 'Diabetes', 'HighBlood', 'Gender',
    'Asthma', 'Soft_drink',
]]

In [None]:
# Confirm the column set after narrowing the frame to the analysis columns.
df.columns

In [None]:
# Summary statistics for the narrowed frame.
df.describe()

In [None]:
# Count the missing values in each column and print the per-column totals.
Nulls = df.isna().sum()
print(Nulls)


In [None]:
# Column names once more; the null check does not modify the frame.
df.columns

In [None]:
# Correlation heatmap of the numeric columns.
# The Yes/No and Gender columns are only recoded to numbers in later cells,
# so they are still text here. DataFrame.corr() raises on text columns in
# pandas >= 2.0 (older versions dropped them silently), so the numeric
# columns are selected explicitly to behave the same on every version.
fig, ax = plt.subplots(figsize=(5, 5))
sns.heatmap(df.select_dtypes(include='number').corr(), ax=ax)
ax.set_title('Correlation of numeric variables')
plt.show()


In [None]:
# Preview rows before the Yes/No values are recoded in the following cell.
df.head()

In [None]:
# Recode every 'Yes'/'No' value in the frame to 1/0.
# replace() on text columns relies on silent downcasting to end up with
# integer columns, which pandas has deprecated; infer_objects() makes the
# conversion explicit so fully recoded columns become numeric on every
# version. Columns that still hold text (Gender is recoded later) are left
# untouched. Rebinding df instead of inplace=True keeps the cell chainable.
df = df.replace({'No': 0, 'Yes': 1}).infer_objects()

In [None]:
# Summary statistics after the Yes/No recode.
df.describe()

In [None]:
# Preview rows after the Yes/No recode; Gender is recoded in the following cell.
df.head()

In [None]:
# Recode the gender labels to a 0/1 indicator: Female -> 1, Male -> 0,
# Nonbinary -> 0. The values are replaced wherever they occur in the frame.
# NOTE(review): Nonbinary shares code 0 with Male, so the resulting variable
# means "Female vs. everyone else" -- confirm that this is intended.
# infer_objects() turns the recoded column into a numeric dtype explicitly
# instead of relying on replace()'s deprecated silent downcasting.
df = df.replace({'Female': 1, 'Male': 0, 'Nonbinary': 0}).infer_objects()

In [None]:
# Column names after recoding (the replace calls change values, not names).
df.columns

In [None]:
# Histograms of every analysis variable (continuous and 0/1-coded alike).
# The survey column is named 'Doc_listen' in this frame; the long label
# 'Evidence of active listening from doctor' does not exist and raised a
# KeyError.
df[['Income', 'Children', 'Age', 'Full_meals_eaten', 'Initial_days',
    'Doc_listen', 'Overweight', 'Diabetes', 'HighBlood',
    'Gender', 'Asthma', 'Soft_drink']].hist(figsize=(12, 10))
# Tighten the layout before saving so the saved image gets the adjusted
# spacing too, not only the inline rendering.
plt.tight_layout()
plt.savefig('pyplot.jpg')


In [None]:
# Variables to pair with Overweight; one horizontal bar chart of the joint
# value counts is written to disk per variable.
tst = ['Income', 'Children', 'Age', 'Full_meals_eaten',
       'Doc_listen', 'Diabetes', 'HighBlood',
       'Gender', 'Asthma', 'Soft_drink']
for column in tst:
    # Draw on a dedicated figure so the chart never lands on whatever figure
    # happens to be current, and close exactly that figure afterwards to
    # keep memory flat across iterations.
    fig, ax = plt.subplots()
    df[['Overweight', column]].value_counts().plot(kind='barh', ax=ax)
    fig.savefig('barh%s.jpg' % (column))
    plt.close(fig)
# These are bar charts, not scatterplots, so the message says so.
print('bar charts done')

In [None]:
# Preview the frame again after the plotting cells (they do not modify df).
df.head()

In [None]:
# Box plot of Initial_days.
# The column must be passed as x=...: since seaborn 0.12 the only positional
# parameter of boxplot() is `data`, so a positional column name collides
# with data=df and raises a TypeError.
sns.boxplot(x='Initial_days', data=df)
plt.show()


In [100]:
# Initial logistic regression through the formula API: Overweight against
# all eleven other variables. The formula interface adds the intercept term
# itself. `smf` is imported in the import cell at the top of the notebook.
test = smf.logit(
    formula="Overweight ~ Doc_listen + Income + Children + Age + Full_meals_eaten + Initial_days + Diabetes + HighBlood + Gender + Asthma + Soft_drink ",
    data=df,
).fit()

print(test.summary())

Optimization terminated successfully.
         Current function value: 0.601765
         Iterations 5
                           Logit Regression Results                           
Dep. Variable:             Overweight   No. Observations:                10000
Model:                          Logit   Df Residuals:                     9988
Method:                           MLE   Df Model:                           11
Date:                Tue, 29 Nov 2022   Pseudo R-squ.:                0.001532
Time:                        00:19:12   Log-Likelihood:                -6017.6
converged:                       True   LL-Null:                       -6026.9
Covariance Type:            nonrobust   LLR p-value:                   0.07143
                       coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------------
Intercept            0.9541      0.116      8.250      0.000       0.727       1.181
Doc_listen 

In [102]:
# Reduced logistic regression through the formula API: Overweight against
# Income and HighBlood only. `smf` is imported in the import cell at the
# top of the notebook, so it is not re-imported here.
test2 = smf.logit(
    formula="Overweight ~ Income + HighBlood",
    data=df,
).fit()

print(test2.summary())

Optimization terminated successfully.
         Current function value: 0.602169
         Iterations 5
                           Logit Regression Results                           
Dep. Variable:             Overweight   No. Observations:                10000
Model:                          Logit   Df Residuals:                     9997
Method:                           MLE   Df Model:                            2
Date:                Tue, 29 Nov 2022   Pseudo R-squ.:               0.0008614
Time:                        00:19:45   Log-Likelihood:                -6021.7
converged:                       True   LL-Null:                       -6026.9
Covariance Type:            nonrobust   LLR p-value:                  0.005562
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept      0.9032      0.042     21.375      0.000       0.820       0.986
Income      -1.43e-06   7.64e

In [None]:
# Display the frame as the cell output.
# NOTE(review): a bare `df` renders the whole frame (truncated under pandas'
# default display options); df.head() would likely be enough here.
df

In [None]:
# Fit the full model through the array-style API. Unlike the formula API,
# sm.Logit does not add a constant on its own, so one is created by hand.
df['intercept'] = 1
# One-hot encodes any columns that are still text; numeric columns pass
# through unchanged.
df = pd.get_dummies(df, drop_first=True)

# The 'intercept' column is part of the regressors: it was created above but
# previously left out of this list, which fitted the model without a
# constant term.
logit = sm.Logit(
    df['Overweight'],
    df[['intercept', 'Income', 'Children', 'Age', 'Full_meals_eaten',
        'Initial_days', 'Doc_listen', 'Diabetes', 'HighBlood', 'Gender',
        'Asthma', 'Soft_drink']],
).fit()
print(logit.summary())


In [None]:
# Full model with an explicit constant column: Overweight against all eleven
# other variables. add_constant is imported in the import cell at the top of
# the notebook rather than mid-notebook.
Y = df['Overweight']
X = add_constant(
    df[['Income', 'Children', 'Age', 'Full_meals_eaten', 'Initial_days',
        'Doc_listen', 'Diabetes', 'HighBlood', 'Gender', 'Asthma',
        'Soft_drink']]
)
result = sm.Logit(Y, X).fit()
result.summary()

In [None]:
# Reduced model: Overweight against five variables plus a constant column.
# Y, X and result are rebound, replacing the values from the previous fit.
Y = df['Overweight']
X = add_constant(
    df[['Income', 'HighBlood', 'Asthma', 'Children', 'Initial_days']]
)
result = sm.Logit(Y, X).fit()
result.summary()