In [70]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import linear_model
import tensorflow as tf
import keras
import seaborn as sns
import scipy.stats as stats
import statsmodels.api as sm
from statsmodels.formula.api import ols

In [212]:
#Only use this if you want to read the whole 2016 Core File
#The exploratory cells further down (code searches, CT/MRI counts, charges by i10_npr) all read
#from df_core, so they need this cell to have been run first.
df_core = pd.read_csv("Data16/NIS_2016_Core.csv")

In [101]:
#Read Filtered Data
#df_new is the input to the procedure filter below, which produces refined_df.
df_new = pd.read_csv("results new/NIS_Core_Hospitalized_grouped_cci.csv")

In [102]:
#Collect the row numbers whose first procedure code (i10_pr1) is a string;
#rows with no procedure recorded are not strings and are left out.
ind_val = [
    row
    for row, code in enumerate(df_new["i10_pr1"])
    if isinstance(code, str)
]

In [103]:
#Dataframe now only has stroke cases where surgery was performed
#.loc keeps the original df_new row labels (no reset_index), so refined_df.index may be
#non-contiguous from here on.
refined_df = df_new.loc[ind_val]
refined_df.shape

In [104]:
#Show the row labels that survived the procedure filter
np.array(refined_df.index)

In [105]:
#Keep only the rows where length of stay is stored (los >= 0).
#A missing los is NaN, fails the comparison, and is therefore dropped.
has_los = np.asarray(refined_df['los'] >= 0)
indices_cor = np.flatnonzero(has_los)

#Translate the surviving positions back into row labels before selecting with .loc
indices = np.asarray(refined_df.index)[indices_cor]

refined_df = refined_df.loc[indices]

In [130]:
#Variables to consider in the ANOVA
#Every column is copied out of refined_df with np.array(), so the recoding below only
#changes the standalone arrays and never the frame itself. Missing values (NaN) fail
#every comparison used here and therefore stay NaN.

#Key Independent Variables
## Number of Days from Admission to Procedure 1
Ad_Pr1 = np.array(refined_df['prday1'])

#Key Baseline Variables
## Race (white (1) vs non-white (0)): every code above 1 is collapsed to 0
race = np.array(refined_df['race'])
is_non_white = race > 1
race[is_non_white] = 0

## Urban-Rural Classification: categories 1-3 (places with >250K) -> 1, categories 4-6 (<250K) -> 0
urb = np.array(refined_df['pl_nchs'])
is_smaller_area = urb > 3
is_larger_area = urb <= 3
urb[is_smaller_area] = 0
urb[is_larger_area] = 1

## Hospital NIS (left as a Series; converted to an array where the hospital file is joined)
hnis = refined_df['hosp_nis']

##CCI
cci = np.array(refined_df['cci'])

## Gender
gender = np.array(refined_df['female'])

## Number of Procedures (continuous var)
num_pro = np.array(refined_df['i10_npr'])

## Age (continuous var)
age = np.array(refined_df['age'])

## Primary Payer (Categorical: Medicare (1) vs Everything Else (0))
## Codes above 1.5 become 0; codes in (0.1, 1.5] become 1; anything at or below 0.1 is left as is
payer = np.array(refined_df['pay1'])
is_other_payer = payer > 1.5
is_medicare = (payer > 0.1) & (payer <= 1.5)
payer[is_other_payer] = 0
payer[is_medicare] = 1

#Key Outcomes Variables
## Length of Stay: the full recorded los (prday1 is NOT subtracted here)
los_follow = np.array(refined_df['los'])

## Death
dead = np.array(refined_df['died'])

## Total Charges
totchg = np.array(refined_df['totchg'])

In [131]:
#Add Hospital Data to refined_df
#This cell only loads the hospital file; the join onto refined_df (by hosp_nis) happens in a later cell.
hosp_df = pd.read_csv('Data16/NIS_2016_Hospital.csv')

In [132]:
#Display the filtered frame for a visual check before the hospital columns are added
refined_df

In [133]:
#Attach hospital-level attributes (bedsize, teaching status, control) to every discharge
#by matching the discharge's hosp_nis against the hospital file.
hosp_nis = list(hosp_df['hosp_nis'])
hosp_bed = list(hosp_df['hosp_bedsize'])
hosp_teach = list(hosp_df['hosp_locteach'])
hosp_contrl = list(hosp_df['h_contrl'])

ar_nis = np.array(hnis)

#Map each hospital id to the row where it first appears in the hospital file. This is the
#same row list.index() would pick, but it is built once, so every discharge is a constant-time
#dictionary lookup instead of a scan over the whole hospital list.
first_row_of = {}
for row, hospital_id in enumerate(hosp_nis):
    first_row_of.setdefault(hospital_id, row)

#A discharge whose hospital id is absent from the hospital file raises KeyError here.
matched_rows = [first_row_of[hospital_id] for hospital_id in ar_nis]

ar_bed = [hosp_bed[row] for row in matched_rows]
ar_teach = [hosp_teach[row] for row in matched_rows]
ar_contrl = [hosp_contrl[row] for row in matched_rows]

refined_df['bedsize'] = ar_bed
refined_df['teach'] = ar_teach
refined_df['control'] = ar_contrl

In [134]:
#Hospital Variables
#As with the patient variables, each column is copied into its own array before recoding.

##Bedsize is a categorical variable (small vs medium + big)
##Codes above 1.9 -> 0; codes in (0.9, 1.9] -> 1
bedsize = np.array(refined_df['bedsize'])
is_medium_or_big = bedsize > 1.9
is_small = (bedsize > 0.9) & (bedsize <= 1.9)
bedsize[is_medium_or_big] = 0
bedsize[is_small] = 1

##Teach is a categorical variable with urban/rural distinction but let's treat it as "Teaching vs Non-Teaching"
##Codes below 2.5 -> 0; code 3 -> 1
teach = np.array(refined_df['teach'])
is_non_teaching = teach < 2.5
is_teaching = teach == 3
teach[is_non_teaching] = 0
teach[is_teaching] = 1

##Control is a categorical variable that we will be dividing into government vs private
##Codes above 1.1 -> 0; code 1 already holds its target value, so it needs no assignment
control = np.array(refined_df['control'])
is_above_one = control > 1.1
control[is_above_one] = 0

In [145]:
#TODO: use num diagnoses and num procedures as well.
#'Length of Stay' holds the actual stay length, not the time between procedure 1 and discharge.

#Assemble the modelling table in one step; the key order below is the column order.
analysis_df = pd.DataFrame({
    'Time to Procedure': Ad_Pr1,
    'Race': race,
    'Urban': urb,
    'Gender': gender,
    'Age': age,
    'Payer Type': payer,
    'Length of Stay': los_follow,
    'Death': dead,
    'Charges': totchg,
    'Hospital Bedsize': bedsize,
    'Teaching': teach,
    'CCI': cci,
    'Control': control,
})

In [147]:
#Replace every missing value with the mean of its column.
#NOTE(review): this also applies to the columns recoded to 0/1 upstream (Race, Urban, Payer Type, ...),
#so an imputed row gets a fractional value in those indicators -- confirm this is intended.
analysis_df = analysis_df.fillna(analysis_df.mean())

#Design matrix (predictors) and response for the length-of-stay regression
X = analysis_df.loc[:, ['Race','Urban','Gender','Age','Payer Type','Time to Procedure','Hospital Bedsize','Teaching','Control','CCI']]
y = analysis_df.loc[:, 'Length of Stay']

In [148]:
#Save the imputed modelling table next to the notebook; index=False leaves the row index out of the file
analysis_df.to_csv("analysis_df.csv", index=False)

In [163]:
from stargazer.stargazer import Stargazer
from pystout import pystout

In [165]:
pystout(models=[model],
       file='A.tex')

In [173]:
from statsmodels.api import OLS
model = OLS(y,X).fit()
print(model.summary())

In [156]:
#Observed vs. predicted length of stay, with the identity line (red) as a reference
y_pred = model.predict(X)

fig, ax = plt.subplots()
ax.scatter(y, y_pred, marker='.', s=1)

#Reference line where prediction equals observation
diagonal = range(1, 90)
ax.plot(diagonal, diagonal, color='red')

ax.set_xlim([0, 100])
ax.set_ylim([0, 100])
ax.set_xlabel("Length of Stay")
ax.set_ylabel("Length of Stay Predictions")
plt.show()

In [198]:
#NOTE(review): A is not defined anywhere in the visible part of this notebook -- confirm its source.
len(A)

In [197]:
#Number of elements of A that also occur in B (a value repeated in A is counted each time)
a = sum(1 for element in A if element in B)
a

In [192]:
def intersection(lst1, lst2):
    """Return the distinct values present in both inputs, as a list.

    Both arguments are converted to sets first, so duplicates collapse and the
    order of the returned list is not tied to the order of either input.
    """
    first = set(lst1)
    second = set(lst2)
    shared = first.intersection(second)
    return list(shared)
#NOTE(review): A and B are not defined anywhere in the visible part of this notebook -- confirm their source.
intersection(A, B)

In [126]:
#NOTE(review): newdf is not defined anywhere in the visible part of this notebook -- confirm its source.
len(newdf['i10_pr1'])

In [166]:
#Count the rows of newdf whose i10_pr1 entry is exactly a str.
#The column is walked once, rather than being converted to a fresh list for every row.
#NOTE(review): newdf is not defined anywhere in the visible part of this notebook -- confirm its source.
a = sum(1 for value in newdf['i10_pr1'] if type(value) == str)

In [172]:
#Show the i10_dx1 value of the first four rows of newdf
newdf.head(4)['i10_dx1']

In [160]:
#np.isnan(np.array(list(newdf['i10_pr1'])[0]))
#Scratch cell: the line above is commented out, so the only thing evaluated is an empty list.
list()

In [120]:
#Row positions in df_core whose i10_dx1 code starts with 'S320'
dx1_prefix = df_core['i10_dx1'].str.slice(0, 4)
np.where(dx1_prefix == 'S320')[0]

In [94]:
#For each of the 30 columns at positions 18-47 of df_new, print how many rows equal 'Z9282'
candidate_columns = df_new.columns[18:48]
for position in range(30):
    matching_rows = np.where(df_new[candidate_columns[position]] == 'Z9282')[0]
    print(len(matching_rows))

In [51]:
#Show which column names sit at positions 55-70 of df_new
df_new.columns[55:71]

In [57]:
#Display the age column of df_new
df_new['age']

In [53]:
#Per-column count of rows equal to "B020ZZZ" over df_core columns at positions 55-70,
#followed by the total across those columns
CT_use = [
    len(np.where(df_core[column] == "B020ZZZ")[0])
    for column in df_core.columns[55:71]
]
sum(CT_use)

In [54]:
#Per-column count of rows equal to "B030ZZZ" over df_core columns at positions 54-68,
#followed by the total across those columns
#NOTE(review): the CT count scans columns[55:71] while this one scans columns[54:69] --
#confirm which positional range actually covers the procedure-code columns.
MRI_use = [
    len(np.where(df_core[column] == "B030ZZZ")[0])
    for column in df_core.columns[54:69]
]
sum(MRI_use)

In [65]:
#charges[k] = mean totchg over the rows where i10_npr == k, for k = 0 .. (number of distinct i10_npr values - 1)
#NOTE(review): this assumes the distinct i10_npr values are exactly 0, 1, 2, ... with no gaps
#and no NaN -- confirm, otherwise some entries are NaN or misaligned.
n_levels = len(np.unique(df_core["i10_npr"]))
charges = [
    np.mean(df_core["totchg"].where(df_core["i10_npr"] == level))
    for level in range(n_levels)
]

In [70]:
#Mean total charges against i10_npr value. charges[k] is the mean for i10_npr == k, so the
#x positions are generated from charges itself; the two sequences then always have the
#same length instead of relying on a hand-typed list of 16 positions.
plt.scatter(range(len(charges)), charges)

In [13]:
#Take the first 50000 rows of df_core and display their i10_dx10 column
lol = df_core.head(50000)
lol["i10_dx10"]