### Standard Imports

In [None]:
import os
# Point Matplotlib's config/cache directory at ./configs/ under the current
# working directory. This is set before matplotlib is imported below so the
# variable is already in the environment at import time.
os.environ['MPLCONFIGDIR'] = os.getcwd() + "/configs/"
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import statsmodels.formula.api as smf
from scipy import stats
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
import warnings
# NOTE(review): this silences every warning for the whole session, including
# convergence and deprecation warnings raised by the modelling libraries.
warnings.filterwarnings(action='ignore')
# Turn off pandas' chained-assignment (SettingWithCopy) check; the cells below
# assign columns on a filtered frame.
pd.options.mode.chained_assignment = None 

In [None]:
# Read the cleaned medical CSV from disk into the working DataFrame.
# NOTE(review): the path is absolute and machine-specific.
df = pd.read_csv(
    filepath_or_buffer='/Users/Spence604/Library/CloudStorage/OneDrive-WesternGovernorsUniversity/Docs/medical_clean.csv'
)

In [None]:
# Report whether any cell anywhere in the frame is null.
# The first .any() reduces each column to "has at least one null"; the second
# collapses those per-column flags into a single boolean. (The previous
# .all(axis=1).any() only detected rows in which *every* value was null.)
print("Are there any columns with null values: " + str(df.isnull().any().any()))

In [None]:
# Flag every row that exactly repeats an earlier row, then print how many rows
# fall under each flag value.
dupes = df.duplicated()
dupe_tally = dupes.value_counts()
print("Are there duplicates? ")
print(dupe_tally)

In [None]:
# Narrow the frame to the variables used in the analysis, then re-check the
# narrowed frame for duplicate rows and for missing values per column.
analysis_columns = [
    'Children', 'HighBlood', 'Age', 'Income', 'Gender', 'ReAdmis',
    'VitD_levels', 'Doc_visits', 'Stroke', 'Initial_days',
    'Additional_charges', 'Overweight', 'Anxiety',
]
df = df[analysis_columns]

duplicate_rows = df.loc[df.duplicated()]
print(duplicate_rows)

missing_per_column = df.isnull().sum()
print(missing_per_column)

In [None]:
# Drop outlier rows: keep a row only if every numeric column lies strictly
# within 3 standard deviations of that column's mean. The shape is printed
# before and after so the number of removed rows is visible.
print(df.shape)
numeric_part = df.select_dtypes(include=np.number)
abs_z = np.abs(stats.zscore(numeric_part))
within_three_sd = (abs_z < 3).all(axis=1)
df = df[within_three_sd]
print(df.shape)

In [None]:
# Preview the first five rows of the filtered frame.
df.head(n=5)

In [None]:
# Recode the Anxiety flag from 'Yes'/'No' text to 1/0. Any other value becomes
# NaN, so this cell must not be re-run on an already-encoded column.
df['Anxiety'] = df['Anxiety'].map(dict(Yes=1, No=0))

In [None]:
# Recode the Overweight flag from 'Yes'/'No' text to 1/0 (other values -> NaN).
df['Overweight'] = df['Overweight'].map(dict(Yes=1, No=0))

In [None]:
# Recode the HighBlood flag from 'Yes'/'No' text to 1/0 (other values -> NaN).
df['HighBlood'] = df['HighBlood'].map(dict(Yes=1, No=0))

In [None]:
# Recode the Stroke flag from 'Yes'/'No' text to 1/0 (other values -> NaN).
df['Stroke'] = df['Stroke'].map(dict(Yes=1, No=0))

In [None]:
# Recode the ReAdmis flag from 'Yes'/'No' text to 1/0 (other values -> NaN).
df['ReAdmis'] = df['ReAdmis'].map(dict(Yes=1, No=0))

In [None]:
# Show how many rows fall under each Gender label (including Nonbinary) before
# the column is collapsed to a 0/1 code.
df.Gender.value_counts()

In [None]:
# Collapse Gender to a binary indicator: Male -> 1, everything else listed -> 0.
# NOTE(review): Nonbinary is coded 0 together with Female, so the resulting
# column really means "is Male"; labels outside these three become NaN.
df['Gender'] = df['Gender'].map(dict(Male=1, Female=0, Nonbinary=0))

In [None]:
# Annotated correlation heatmap of the frame as it stands at this point,
# written to Final_heatmap.jpg.
# plt.subplots returns a (figure, axes) pair; unpack it so `ax` really is the
# Axes, and draw the heatmap on that Axes explicitly instead of relying on
# whatever the current axes happens to be.
fig, ax = plt.subplots(figsize=(16,16))
sns.heatmap(df.corr(), annot=True, ax=ax)
fig.savefig('Final_heatmap.jpg')
# Close the figure so it does not linger in memory or render inline.
plt.close(fig)

In [None]:
# One histogram per column, saved as a single image.
df.hist(figsize = (16,16))
# Adjust subplot spacing BEFORE saving; calling tight_layout after savefig
# leaves the written file with the unadjusted layout.
plt.tight_layout()
plt.savefig('pyplot.jpg')
plt.close()
print('Histogram done')

In [None]:
# Columns to chart; each one produces a horizontal bar chart saved as
# barh<column>.jpg.
testList = ['Children', 'HighBlood', 'Overweight', 'Age', 'Income', 'Gender', 'ReAdmis', 'VitD_levels', 'Doc_visits', 'Stroke', 'Initial_days', 'Additional_charges', 'Anxiety']
for i in testList:
    if i == 'Anxiety':
        # Pairing the target with itself would select the same label twice,
        # which is not a meaningful (Anxiety, column) count and can make
        # value_counts fail. Chart the target's own distribution instead so
        # barhAnxiety.jpg is still produced.
        counts = df['Anxiety'].value_counts()
    else:
        # Frequency of every (Anxiety, column value) combination.
        counts = df[['Anxiety', i]].value_counts()
    counts.plot(kind='barh')
    plt.savefig('barh%s.jpg' % (i))
    plt.close()
# The charts are horizontal bar charts, so the completion message says so.
print('bar charts done')

In [None]:
# Preview the first five rows of the frame after the recoding cells above.
df.head(n=5)

In [None]:
# Initial logistic regression: Anxiety modelled on every other retained column.
full_formula = "Anxiety ~ Children + HighBlood + Age + Gender + Income + ReAdmis + VitD_levels + Doc_visits + Stroke + Overweight + Additional_charges + Initial_days"
test = smf.logit(formula=full_formula, data=df).fit()
print(test.summary())

In [None]:
# Build a table of the full model's coefficient confidence bounds plus the
# point estimates. The values stored in `conf` are on the coefficient scale;
# they are exponentiated only when printed, which is what turns the 'OR'
# column into odds ratios.
conf = test.conf_int()
conf.columns = ['2.5%', '97.5%']
conf['OR'] = test.params
print(np.exp(conf))

In [None]:
# Reduced logistic regression using only four of the predictors.
reduced_formula = 'Anxiety ~ ReAdmis + Stroke + Initial_days + Overweight'
reduced = smf.logit(formula=reduced_formula, data=df).fit()
print(reduced.summary())

#### Output saved as jpg

In [None]:
# Two residual-vs-fitted panels stacked in one figure, saved to residual.jpg.
fig, (axL, axR) = plt.subplots(2, figsize=(15, 15))

# Top panel: deviance residuals of the FULL model (`test`) against its fitted
# values, with a LOWESS trend line.
# x and y are passed by keyword: recent seaborn releases make them
# keyword-only, so the positional form raises a TypeError there.
sns.regplot(x=test.fittedvalues, y=test.resid_dev, ax=axL,
            color="black", scatter_kws={"s": 5},
            line_kws={"color":"b", "alpha":1, "lw":2}, lowess=True)
# Bottom panel: Pearson residuals of the REDUCED model (`reduced`) against its
# fitted values. (resid_pearson is the plain Pearson residual, not a
# studentized one.)
sns.regplot(x=reduced.fittedvalues, y=reduced.resid_pearson, ax=axR,
            color="black", scatter_kws={"s": 5},
            line_kws={"color":"g", "alpha":1, "lw":2}, lowess=True)
fig.savefig('residual.jpg')
plt.close(fig)

In [None]:
# Write the cleaned, encoded frame to disk (the row index is written too,
# since to_csv is left at its defaults).
df.to_csv(path_or_buf='cleaned_med_SS.csv')

In [None]:
# Export only the four predictors used by the reduced model.
# NOTE(review): the Anxiety target column is not part of this export.
reduced_columns = ['ReAdmis', 'Stroke', 'Initial_days', 'Overweight']
reduced_med_SS = df[reduced_columns]
reduced_med_SS.to_csv('reduced_cleaned_data.csv')

In [None]:
# Annotated correlation heatmap of the reduced predictor set, written to
# reduced_heat.jpg.
# plt.subplots returns a (figure, axes) pair; unpack it so `ax` really is the
# Axes, and draw the heatmap on that Axes explicitly.
fig, ax = plt.subplots(figsize=(12,12))
sns.heatmap(reduced_med_SS.corr(), annot=True, ax=ax)
fig.savefig('reduced_heat.jpg')
plt.close(fig)
print('Reduced Heatmap Complete')

In [None]:
# Fit a scikit-learn logistic regression on a train/test split and print the
# confusion matrix for the held-out set.
# Features: every column except the target.
X = df.loc[:, df.columns != 'Anxiety']
# Target as a 1-D Series. Selecting it with a boolean column mask produced a
# one-column DataFrame, which scikit-learn has to ravel (with a warning).
y = df['Anxiety']
# Fixed random_state keeps the split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
logreg = LogisticRegression()
logreg.fit(X_train, y_train)
y_pred = logreg.predict(X_test)
# Rows are true classes, columns are predicted classes.
print(confusion_matrix(y_test, y_pred))

In [None]:
# Re-create the split (same random_state, so the same partition as before),
# refit the classifier and report its accuracy on the held-out set.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
logreg = LogisticRegression()
logreg.fit(X_train, y_train)
# Predict once; the earlier duplicate predict call recomputed the same array.
y_pred = logreg.predict(X_test)
print('Accuracy of logistic regression classifier on test set: {:.2f}'.format(logreg.score(X_test, y_test)))

In [None]:
# Held-out accuracy followed by the per-class precision/recall/F1 report.
# NOTE(review): 'Class 1' / 'Class 2' are display labels only; presumably they
# correspond to Anxiety == 0 and Anxiety == 1 respectively - confirm.
test_accuracy = accuracy_score(y_test, y_pred)
print('Accuracy: {:.2f}\n'.format(test_accuracy))

print('Classification Report\n')
report_text = classification_report(y_test, y_pred, target_names=['Class 1', 'Class 2'])
print(report_text)

In [None]:
# Refresh the test-set predictions and print the classifier's mean accuracy on
# the held-out data.
y_pred = logreg.predict(X_test)
holdout_score = logreg.score(X_test, y_test)
print('Accuracy of logistic regression classifier on test set: {:.2f}'.format(holdout_score))
