In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from pandasql import *
import requests
import numpy as np
import math
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.metrics import accuracy_score
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn import metrics
from sklearn.tree import plot_tree
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import plot_confusion_matrix
from collections import Counter 
from imblearn.under_sampling import RandomUnderSampler

In [None]:
import os

# Show what the input directory contains so the dataset paths used below can be checked.
input_entries = os.listdir("../input")
print(input_entries)

In [None]:
import warnings            
warnings.filterwarnings("ignore") 

In [None]:
# Load the raw 2015 SPARCS de-identified inpatient discharge records.
SPARCS_CSV_PATH = '../input/2015-deidentified-ny-inpatient-discharge-sparcs/Hospital_Inpatient_Discharges__SPARCS_De-Identified___2015.csv'
hos_in_pt_dis_df = pd.read_csv(SPARCS_CSV_PATH)

In [None]:
# Preview the first rows of the raw discharge table.
hos_in_pt_dis_df.head()

In [None]:
# Render the raw table with the notebook's rich display.
display(hos_in_pt_dis_df)

In [None]:
# Column dtypes of the raw table, to see which columns were read as text (object).
hos_in_pt_dis_df.dtypes

In [None]:
# Summary statistics of the raw table (describe() with its default settings).
hos_in_pt_dis_df.describe()

# **Data Wrangling**

Data Wrangling
Next, let's check for the presence of null values in our table and deal with them appropriately. Here, I find that some columns have a few null values, which can be dropped easily, while other columns have large number of null values. For these columns, I simply drop them as they do not provide much information.

In [None]:
# Number of missing (NaN) values in each column of the raw table.
hos_in_pt_dis_df.isna().sum()

Let's drop Other Provider License Number, since almost two thirds of its rows have NaN values. Let's also drop the other columns with a significant number of NaNs, since it will be difficult to extract value from them.

In [None]:
# 'Other Provider License Number' is NaN in almost two thirds of the rows, and the other
# columns listed here also contain a significant number of NaNs, so it would be difficult
# to extract value from any of them.
sparse_columns = [
    'Other Provider License Number',
    'Payment Typology 2',
    'Payment Typology 3',
    'Operating Provider License Number',
]
hos_df = hos_in_pt_dis_df.drop(columns=sparse_columns)



In [None]:
# Re-check the per-column NaN counts after dropping the sparse columns.
hos_df.isna().sum()

In [None]:
# These columns have only a relatively small number of NaN values, so removing the
# affected rows loses little data.
required_columns = [
    'APR Severity of Illness Description',
    'APR Risk of Mortality',
    'Zip Code - 3 digits',
    'Facility Id',
    'Attending Provider License Number',
]
hos_df = hos_df.dropna(subset=required_columns)

In [None]:
# Per-column NaN counts after the row-level dropna.
hos_df.isna().sum()

Now, I would like to convert some of these object columns to other data types where applicable. First I will look at the length of stay column, which is currently an object. Although I would like to convert it to int to allow for the possibility of regression analysis, some entries are written as '120 +', which creates problems for this conversion. Thus, I will treat all entries of '120 +' as 120. I follow a similar process for the other numeric columns.

In [None]:
# Turn the numeric-looking text columns into real numbers.

# Length of Stay: keep only the text before the first space, so '120 +' becomes 120.
stay_text = hos_df['Length of Stay'].astype(str).str.split(' ').str[0]
hos_df['Length of Stay'] = pd.to_numeric(stay_text)

# Total Costs / Total Charges: strip the dollar sign before parsing.
for money_col in ('Total Costs', 'Total Charges'):
    money_text = hos_df[money_col].astype(str).str.replace('$', '', regex=False)
    hos_df[money_col] = pd.to_numeric(money_text)

# Zip code: some entries hold the string 'OOS' (out of state) instead of a number. The author
# counted about 67,000 such rows, which is too many to drop. Out-of-state patients may differ
# in length of stay, so these rows are kept and marked with the sentinel 999.
zip_text = hos_df['Zip Code - 3 digits'].astype(str).str.replace('OOS', '999', regex=False)
hos_df['Zip Code - 3 digits'] = pd.to_numeric(zip_text)

display(hos_df)

In [None]:

# Summary statistics of the cleaned table, to check the result of the conversions above.
hos_df.describe()

In [None]:
# Pairwise correlation of the numeric columns (the heatmap itself is drawn in the next cell).
# The numeric columns are selected explicitly because DataFrame.corr() raises on text columns
# in pandas versions that no longer skip them silently.
hos_df.select_dtypes(include='number').corr()

In [None]:
# Heatmap of the pairwise correlations between the numeric columns.
f, ax = plt.subplots(figsize=(11, 9))
# Select the numeric columns explicitly: DataFrame.corr() raises on text columns in pandas
# versions that no longer skip them silently.
corr = hos_df.select_dtypes(include='number').corr()
sns.heatmap(
    corr,
    vmin=-1, vmax=1, center=0,  # fix the colour scale to the full correlation range
    cmap=sns.diverging_palette(20, 220, n=200),
    square=True,
    ax=ax,
)
ax.set_title('Correlation Matrix of Numeric Features')
# Slant the column labels so long feature names stay readable.
ax.set_xticklabels(
    ax.get_xticklabels(),
    rotation=45,
    horizontalalignment='right'
);

From this initial look at a correlation matrix of our data, we can see that a couple of features have relatively strong positive correlation with each other, and more importantly, length of stay. Although the correlation matrix does not make sense to look at for all of our columns, we can see from it that APR Severity of Illness Code has a strong positive correlation with length of stay, along with total charges and total costs. CCS Diagnosis code also seems to have a slight positive correlation with length of stay. We can see other positive correlations in the dataset between features such as CCS Diagnosis codes and APR DRG codes.

# **Exploratory Data Analysis & Data Visualization**
In this section, I will explore and visualize underlying relationships in the data. For the following analysis, I keep length of stay as the primary variable along the y-axis since it is the target variable (the quantity being predicted) for this project. Thus, I am most interested in looking at how features in this dataset affect length of stay. Some of the features found to have the most variance in length of stay are listed below:

Payment Typology
Severity of Illness
Age Group
Type of Admission
First, let's look at the univariate distribution of length of stay values in the dataset

In [None]:
# Univariate distribution of the target variable.
# NOTE(review): sns.distplot is deprecated in recent seaborn releases; kept here because the
# seaborn version this notebook targets is not visible from this cell.
sns.set(font_scale=1.2)
sns.set_style("white")
f, ax = plt.subplots(figsize=(11, 9))
sns.distplot(hos_df['Length of Stay'], norm_hist=False, ax=ax)
ax.set_ylabel('Probability Density')
ax.set_title('Univariate Distribution Plot of Length of Stay')
plt.show()

We can see this distribution is very skewed. Let's take a look at birthweight next as I noticed that the range of the value seems odd. From the df.describe() output, we can see that the 25%, 50% and 75% quartile of values are all 0. The histogram below confirms that most of the birth weights are 0. This makes sense when we consider that this column is probably only relevant for new born babies, and is entered as 0 for all other patients. Let's confirm this with some further analysis

In [None]:
# Check whether birth weight is only filled in for the youngest patients by looking at
# its mean per admission type and per age group.
birth_weight_df = hos_df.groupby('Type of Admission')[['Birth Weight']].mean()
display(birth_weight_df)

weight_age_df = hos_df.groupby('Age Group')[['Birth Weight']].mean()
display(weight_age_df)

In [None]:
# Birth weight vs. length of stay, restricted to newborn admissions because the feature only
# makes sense for that group.
# na=False counts a missing admission type as "not newborn" instead of producing a NaN mask.
is_newborn = hos_df['Type of Admission'].str.contains('Newborn', na=False)
# .copy() makes an independent frame, so the unit conversion below does not assign into a
# slice of hos_df (which triggers SettingWithCopyWarning and may not take effect).
birth_youngest_stay = hos_df[is_newborn].copy()
birth_youngest_stay['Birth Weight'] = birth_youngest_stay['Birth Weight'] / 454  # convert from grams to pounds
f, ax = plt.subplots(figsize=(11, 9))
sns.scatterplot(x="Birth Weight", y="Length of Stay",
                data=birth_youngest_stay, ax=ax)

When looking at a scatter plot of birth weight and length of stay, we can see two distinct clusters - one at the lower left hand corner and one at the upper right. It is interesting to see some newborns with a birth weight of approximately 20 pounds (newborns must be a loose term here and it is most likely referring to children under a certain age). These babies seem to have particularly long lengths of stay as they are clustered at the top right hand corner of the graph. This indicates to me that birth weight might be an important feature for predicting length of stay for newborns. However, for the rest of patients, the data does not contain much value as over 75% of the values are 0. This column will be dropped for the bulk of the analysis, with the option of being added back in to create a model separately for new born babies.

Next, I am curious what the babies who have the longest lengths of stay are diagnosed with.

In [None]:
# Newborns whose recorded length of stay equals 120, the value used for '120 +' entries.
at_max_stay = birth_youngest_stay['Length of Stay'].eq(120)
longest_newborn_df = birth_youngest_stay.loc[at_max_stay]
display(longest_newborn_df)

I am interested in seeing what procedures the babies who stayed in the hospital for 120 days or longer had to receive. A frequency plot for each procedure description in this group of newborns is shown below.

In [None]:
# Frequency of each procedure description among the longest-staying newborns.
f, ax = plt.subplots(figsize=(25, 15))
sns.countplot(data=longest_newborn_df, x='CCS Procedure Description', ax=ax)
plt.xticks(rotation=90)
ax.set_title('Procedure Descriptions for Newborns Who Stay 120 Days or Longer')
plt.show()

Most of the babies in this group required a ventilator. This may imply that if a baby requires to be put on a ventilator and is having respiratory issues, there is a high probability that they will have a long stay at the hospital (120+ days)

Below, I begin the bulk of the exploratory data visualization for this project as I analyze the relationships between various features and the target feature for this project: length of stay. Major takeaways and interpretations are described below each figure. However, many of the graphs are pretty intuitive and simply show an interesting underlying relationship between features.

In [None]:
# Length of stay by gender, split by race, over the full y-range.
f, ax = plt.subplots(figsize=(11, 9))
sns.boxplot(data=hos_df, x="Gender", y="Length of Stay", hue="Race", ax=ax)

We can see the length of stay data is very skewed across all genders and races, which makes it difficult to see relationships. I will limit the range of the y axis from now on in order to more clearly see all underlying distributions

In [None]:

# Same plot as above with the y-axis limited to 0-30 so the boxes themselves are visible.
f, ax = plt.subplots(figsize=(11, 9))
sns.boxplot(data=hos_df, x="Gender", y="Length of Stay", hue="Race", ax=ax)
ax.set(ylim=(0, 30))

In [None]:
# Length of stay per age group, in whatever order the groups appear in the data.
f, ax = plt.subplots(figsize=(11, 9))
sns.boxplot(data=hos_df, x="Age Group", y="Length of Stay", ax=ax)
ax.set(ylim=(0, 30))

In [None]:
# Length of stay per age group, with the groups laid out from youngest to oldest.
age_group_order = ['0 to 17', '18 to 29', '30 to 49', '50 to 69', '70 or Older']
f, ax = plt.subplots(figsize=(11, 9))
sns.boxplot(
    data=hos_df,
    x="Age Group",
    y="Length of Stay",
    order=age_group_order,
    palette="Set1",
    ax=ax,
)
ax.set(ylim=(0, 30))
ax.set_title('Length of Stay vs. Age Group')
plt.show()

Here, we can see some significant variance in length of stay across different age groups. It is evident that age groups 50-69 and 70 or older tend to have the longest stay distributions

In [None]:

# Length of stay per patient race, y-axis limited to 0-30.
f, ax = plt.subplots(figsize=(11, 9))
sns.boxplot(data=hos_df, x="Race", y="Length of Stay", palette='Set1', ax=ax)
ax.set_title('Length of Stay vs. Patient Race')
ax.set(ylim=(0, 30))
plt.show()

What is the average length of stay for each diagnosis description in the dataset?

In [None]:
# Average length of stay for every diagnosis description, one bar per diagnosis.
mean_stay_by_diagnosis = (
    hos_df[['CCS Diagnosis Description', 'Length of Stay']]
    .groupby('CCS Diagnosis Description', as_index=False)
    .mean()
)
f, ax = plt.subplots(figsize=(35, 20))
sns.barplot(data=mean_stay_by_diagnosis, x="CCS Diagnosis Description", y="Length of Stay", ax=ax)
ax.set(ylim=(0, 20))
plt.xticks(rotation=90)
plt.show()

This plot makes it nice to visualize the spread of length of stay values for different diagnoses, but there are a lot of categories which makes it difficult to read. Let's create a dataframe and list diagnoses in descending order of average length of stay to see the diagnosis descriptions that have the longest length of stay.

In [None]:
# Average length of stay per diagnosis, longest first, with a fresh 0..n-1 index.
diag_stay_df = (
    hos_df[['CCS Diagnosis Description', 'Length of Stay']]
    .groupby('CCS Diagnosis Description', as_index=False)
    .mean()
    .sort_values(by='Length of Stay', ascending=False, ignore_index=True)
)
display(diag_stay_df)

In [None]:
# Horizontal bars for the first five rows of diag_stay_df (sorted longest stay first).
sns.set(font_scale=1.2)
sns.set_style("white")
top_five_diagnoses = diag_stay_df.head(5)
f, ax = plt.subplots(figsize=(8, 9))
sns.barplot(data=top_five_diagnoses, x="Length of Stay", y="CCS Diagnosis Description", palette='Set1', ax=ax)
ax.set_title('Top 5 Diagnoses with Longest Average Length of Stay')
plt.show()

Let's see which diagnosis descriptions cost the hospital the most money

In [None]:
# Average total cost for every diagnosis description, one bar per diagnosis.
mean_cost_by_diagnosis = (
    hos_df[['CCS Diagnosis Description', 'Total Costs']]
    .groupby('CCS Diagnosis Description', as_index=False)
    .mean()
)
f, ax = plt.subplots(figsize=(30, 15))
sns.barplot(data=mean_cost_by_diagnosis, x="CCS Diagnosis Description", y="Total Costs", ax=ax)
plt.xticks(rotation=90)
plt.show()

Again, this plot is hard to read so lets print out a dataframe with the rows listed in descending order of cost

In [None]:
# Average total cost per diagnosis, most expensive first, with a fresh 0..n-1 index.
diag_costs_df = (
    hos_df[['CCS Diagnosis Description', 'Total Costs']]
    .groupby('CCS Diagnosis Description', as_index=False)
    .mean()
    .sort_values(by='Total Costs', ascending=False, ignore_index=True)
)
display(diag_costs_df)

Next, through the use of an inner join, let's see how the diagnosis descriptions that cost the most compare with their length of stay

In [None]:
# Turn the current row index of the cost table into an 'index' column. When this runs right
# after the cost sort, that column is the 0-based cost rank. Then inner-join on the diagnosis
# description so each diagnosis shows its average stay next to its average cost.
# NOTE(review): re-running this cell applies reset_index() again to the already-reset frame.
diag_costs_df = diag_costs_df.reset_index()
joined_df = pd.merge(diag_stay_df, diag_costs_df, how='inner', on='CCS Diagnosis Description')
display(joined_df)

We can see that there is definitely a strong relationship between the most expensive diagnoses and a longer inpatient length of stay. However, for example, the diagnosis description with the second longest average length of stay is only the fifth most expensive to the hospital

In [None]:
# Length of stay per primary payment typology, y-axis limited to 0-20.
sns.set(font_scale=1.25)
sns.set_style("white")
f, ax = plt.subplots(figsize=(15, 9))
sns.boxplot(data=hos_df, x="Payment Typology 1", y="Length of Stay", palette='Set1', ax=ax)
ax.set(ylim=(0, 20))
plt.xticks(rotation=80)
ax.set_title('Length of Stay vs. Primary Payment Typology')
plt.show()

Here, we see another feature that has a significant amount of variance in length of stay across its categories. Medicare patients tend to have one of the longest length of stays. This may be because most medicare patients are in an older age bracket. Let's see if we can visually support this hypothesis

In [None]:
# How many Medicare patients fall into each age group, youngest group first.
medicare_patients = hos_df[hos_df['Payment Typology 1'] == 'Medicare']
age_group_order = ['0 to 17', '18 to 29', '30 to 49', '50 to 69', '70 or Older']
f, ax = plt.subplots(figsize=(11, 9))
sns.countplot(data=medicare_patients, x='Age Group', order=age_group_order, palette='Set1', ax=ax)
ax.set_title('Number of Medicare Patients in Each Age Group')
plt.show()

This hypothesis is supported by the graph above. Age/Payment typology seem to be correlated in some ways, and these seem like they have an effect on the length of stay of the patient

Next, let's take a look at how length of stay varies across different patient zip code areas

In [None]:
# Length of stay per 3-digit patient zip code, y-axis limited to 0-20.
f, ax = plt.subplots(figsize=(25, 9))
sns.boxplot(data=hos_df, x="Zip Code - 3 digits", y="Length of Stay", ax=ax)
ax.set(ylim=(0, 20))

The plots above make me wonder how much of an effect income has on length of stay, since we can see differences across zipcodes and health insurance programs, which are highly correlated with income differences. Let's see if we can webscrape the income for each 3-digit zipcode provided and see if a relationship between these variables truly does exist.

# **Webscraping**

This section primarily serves to extract a more useful feature than raw zipcode for the models I will train: webscraped average income data for each zipcode. Following this section, I will be able to make use of this feature instead of having to encode the zipcode column, which could have the result of drastically increasing the dimensionality of my dataset if the column is one hot encoded. Furthermore, patterns in average income may be more informative for any given model than a raw zipcode

In [None]:
#The data can be scraped from the web (see the commented-out code below), but I have downloaded the file and uploaded it as a dataset instead.

#Scraped income data by zipcode from web (data from 2006-2010)

#dls = "https://www.psc.isr.umich.edu/dis/census/Features/tract2zip/MeanZIP-3.xlsx"
#resp = requests.get(dls)

#output = open('zip_incomes.xlsx', 'wb')
#output.write(resp.content)
#output.close()

In [None]:
# Load the per-zip-code income table from the uploaded copy of the spreadsheet.
ZIP_INCOME_PATH = '../input/mean-zip/MeanZIP-3.xlsx'
zip_income_df = pd.read_excel(ZIP_INCOME_PATH)
display(zip_income_df)

Now, we only want 5 digit zipcodes at first (since some zipcodes in the table have 4 digits. These zipcodes are assumed to have a 0 at the beginning, which would place them in areas such as Massachusetts). Next, I will truncate the zipcodes to only contain the first three digits. This is because HIPAA regulations result in only the 3-digit zipcode data being available. Therefore, the zipcodes in the dataset only have 3 digits. As such, I will do an aggregate mean on the zipcodes with the same first three digits to get an average income for all zipcodes which share the same first three digits.

In [None]:
# Make sure the zip codes are integers before comparing and truncating them.
zip_income_df['Zip'] = zip_income_df['Zip'].astype(int)
# Keep only zip codes of at least 5 digits. .copy() makes income_df an independent frame, so
# the assignment below does not write into a slice of zip_income_df (SettingWithCopyWarning).
income_df = zip_income_df[zip_income_df['Zip'] > 9999].copy()
# Integer floor division drops the last two digits, leaving the 3-digit prefix.
income_df['Zip'] = income_df['Zip'] // 100
display(income_df)

In [None]:
# Column dtypes of the truncated income table.
income_df.dtypes

Now, let's group by and join using SQL commands. Not only is this intuitive to write, but it also lets the SQL engine apply whatever query optimization it has built in. The median income column is used since income distributions are known to be skewed, and the median provides a better measure of central tendency for these distributions. For each 3-digit prefix, the query averages the median incomes of the 5-digit zipcodes that share it.

In [None]:
# Collapse the income table to one row per 3-digit zip prefix by averaging the MEDIAN column
# within each prefix. sqldf runs the query against the frames passed in via locals().
# NOTE(review): the next cell reads avg_income['Zip'], so the result column presumably keeps
# the table's own 'Zip' spelling rather than the 'ZIP' written in the query — confirm.
query = '''SELECT ZIP, AVG(MEDIAN) as median FROM income_df GROUP BY ZIP'''
avg_income = sqldf(query, locals())
display(avg_income)

In [None]:
# Attach the averaged income to every discharge row: inner join of hos_df with avg_income on
# the 3-digit zip code, exposing the income as a new 'AvgIncome' column.

# Cast the join key to object dtype before handing the frame to sqldf.
# NOTE(review): the reason for this cast is not evident from this cell — confirm it is needed.
avg_income['Zip'] = avg_income['Zip'].astype('object')

# NOTE(review): being an inner join, discharge rows whose zip prefix has no match in
# avg_income are dropped from hos_sql_df — confirm this is intended.
# NOTE(review): the join column is written with single quotes (d.'Zip Code - 3 digits').
# This presumably relies on SQLite reading a quoted string as an identifier in this position;
# double quotes are the standard identifier quoting — TODO confirm.
query = '''SELECT d.*, a.median as AvgIncome FROM hos_df d inner join avg_income a on d.'Zip Code - 3 digits' = a.Zip'''
hos_sql_df = sqldf(query, locals())
display(hos_sql_df)


However, there is still one more issue we need to take care of. If you remember from above, I changed out of state zipcodes (encoded as OOS) to the number 999. Now, these zipcodes got joined with the average income for zipcodes associated with 999, even though they are not actually associated with this location. In order to deal with this, I will first find the most common health insurance program(s) for out of state patients (zipcode 999). Then, I will find the average income for this health insurance program(s) and use it for patients with zipcode 999 (out of state patients).

In [None]:
# Which payment typologies are most common among the out-of-state (zip code 999) patients.
out_of_state_rows = hos_sql_df[hos_sql_df['Zip Code - 3 digits'] == 999]
f, ax = plt.subplots(figsize=(11, 9))
sns.countplot(data=out_of_state_rows, x="Payment Typology 1", ax=ax)
plt.xticks(rotation=90)

As illustrated by the plot above, medicare and private health insurance are the two most common health insurance programs. I will find the average income for these two programs and use that number as the income for all patients with zipcode 999.

In [None]:
# Average income of Medicare / privately insured patients, computed without the zip code 999
# rows so that the income joined onto that placeholder does not feed into the estimate.
df_no999 = hos_sql_df.loc[hos_sql_df['Zip Code - 3 digits'].ne(999)]
is_medicare_or_private = df_no999['Payment Typology 1'].isin(['Medicare', 'Private Health Insurance'])
insurance_df = df_no999.loc[is_medicare_or_private]
mean_zip999 = insurance_df['AvgIncome'].mean()

In [None]:
# Round the income column (not the zip codes) to two decimals so values can be matched exactly.
hos_sql_df['AvgIncome'] = hos_sql_df['AvgIncome'].round(2)
# Show the income that the join attached to the placeholder zip code 999.
hos_sql_df.loc[hos_sql_df['Zip Code - 3 digits'] == 999, 'AvgIncome']

In [None]:
# Out-of-state patients carry the placeholder zip code 999, so the income joined onto them
# belongs to real zip codes starting with 999 rather than to the patients themselves.
# Overwrite AvgIncome for exactly those rows with the Medicare / private-insurance average.
# Selecting the rows by zip code, instead of replacing a hard-coded 47010.32 wherever it
# occurs, leaves other columns that happen to hold the same number untouched. It also keeps
# working if the income file yields a different value for that prefix.
is_out_of_state = hos_sql_df['Zip Code - 3 digits'] == 999
hos_sql_df = hos_sql_df.assign(
    AvgIncome=hos_sql_df['AvgIncome'].mask(is_out_of_state, round(mean_zip999, 2))
)

Below, I plot the zip code against average income to visualize the distribution of the data

In [None]:
# Income attached to each 3-digit patient zip code.
f, ax = plt.subplots(figsize=(11, 9))
sns.barplot(data=hos_sql_df, x="Zip Code - 3 digits", y="AvgIncome", palette='Set1', ax=ax)
ax.set_ylabel('Median Income')
ax.set_title('Income Distribution Across 3 Digit Patient Zipcodes')
plt.xticks(rotation=90)
plt.show()

In [None]:
# One box per distinct income value, with the rows sorted by increasing income first
# (the x-axis labels themselves are hard to read).
rows_by_income = hos_sql_df.sort_values(by='AvgIncome', ascending=True)
f, ax = plt.subplots(figsize=(25, 9))
sns.boxplot(data=rows_by_income, x="AvgIncome", y="Length of Stay", ax=ax)
plt.xticks(rotation=90)
ax.set(ylim=(0, 20))

We can see that there is not a huge amount of variation in length of stay across the zipcodes. However, they will still be more useful than the raw zipcode column, so I use average income as a feature instead in the modeling section of this project

# **More Data Visualization**

Now let's use this new webscraped feature to visualize its correlation with other features in our dataset.

In [None]:
# Income per primary payment typology.
sns.set(font_scale=1.25)
sns.set_style("white")
f, ax = plt.subplots(figsize=(15, 9))
sns.barplot(data=hos_sql_df, x="Payment Typology 1", y="AvgIncome", palette='Set1', ax=ax)
plt.xticks(rotation=80)
ax.set_title('Median Income vs. Primary Payment Typology')
ax.set_ylabel('Median Income')
plt.show()


We can see some variance in average income across this feature, which makes sense as different insurance programs are more common for people from different economic backgrounds. This gives me confidence that my webscrape was at least somewhat useful, supporting the notion of proceeding with it as a feature

In [None]:
# Length of stay per health service area, y-axis limited to 0-20.
f, ax = plt.subplots(figsize=(11, 9))
sns.boxplot(data=hos_sql_df, x="Health Service Area", y="Length of Stay", ax=ax)
plt.xticks(rotation=90)
ax.set(ylim=(0, 20))

In [None]:
# Length of stay per APR severity-of-illness description, y-axis limited to 0-40.
f, ax = plt.subplots(figsize=(11, 9))
sns.boxplot(data=hos_sql_df, x="APR Severity of Illness Description", y="Length of Stay", palette='Reds', ax=ax)
ax.set(ylim=(0, 40))
ax.set_title('Length of Stay vs. Severity of Illness')
plt.show()

There is high amounts of variance in length of stay against the severity of illness feature. This will be an important feature to include in the model and will improve its predictive ability

In [None]:
# Length of stay per ethnicity, y-axis limited to 0-20.
f, ax = plt.subplots(figsize=(11, 9))
sns.boxplot(data=hos_sql_df, x="Ethnicity", y="Length of Stay", ax=ax)
ax.set(ylim=(0, 20))

Here we can see the Ethnicity column encodes less information than the Race column and does not show much variance in length of stay across different ethnicities, so we will drop it later.

In [None]:
# Length of stay per type of admission, y-axis limited to 0-20 (default palette).
f, ax = plt.subplots(figsize=(11, 9))
sns.boxplot(data=hos_sql_df, x="Type of Admission", y="Length of Stay", ax=ax)
ax.set(ylim=(0, 20))

In [None]:
# Presentation version of the admission-type plot: Set1 palette and a title.
f, ax = plt.subplots(figsize=(11, 9))
sns.boxplot(data=hos_sql_df, x="Type of Admission", y="Length of Stay", palette='Set1', ax=ax)
ax.set_title('Length of Stay vs. Type of Admission')
ax.set(ylim=(0, 20))
plt.show()

Let's visualize the positive linear relationship between length of stay and total costs, just out of curiosity about the slope of this correlation.

In [None]:
# Scatter plot with a fitted regression line of total costs against length of stay,
# using only the first 200,000 rows.
first_rows = hos_sql_df.iloc[:200000]
f, ax = plt.subplots(figsize=(11, 9))
sns.regplot(data=first_rows, x="Length of Stay", y="Total Costs", ax=ax)

Let's also plot a frequency plot of severity of illness for all patients who stay for 120 days or longer

In [None]:
# Severity-of-illness frequencies among patients whose recorded stay equals 120.
longest_stays = hos_sql_df[hos_sql_df['Length of Stay'] == 120]
f, ax = plt.subplots(figsize=(10, 8))
sns.countplot(data=longest_stays, x='APR Severity of Illness Description', ax=ax)
ax.set_title('Severity of Illness vs. Length of Stay for Patients With 120+ Lengths of Stay')
plt.show()

We can see that of the approximately 1900 patients who have stays of 120 days or longer, most of them were extreme illnesses, which makes sense. Now, lets visualize the univariate distributions of each of the variables through a pairplot, which provides a quick way of seeing this for all our numeric variables

# **Wordclouds**

In this section, I perform text parsing and analysis on diagnosis descriptions, followed by the creation of wordclouds to visualize these results. From this analysis, we can see stark differences in the diagnosis descriptions for each illness severity type. These differences are interesting to compare, and some of the results are summarized below:

Minor Illness: Pregnancy, disease, liveborn, complications, osteoarthritis
Moderate Illness: Schizophrenia, disease, complications, psychotic
Major Illness: Failure, disorders, acute
Extreme Illness: Septicemia, cerebrovascular, tuberculosis

In [None]:
from collections import Counter

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from wordcloud import WordCloud

# Work on an independent copy of the first 10,000 descriptions, so the column assignments
# below do not write into a slice of hos_sql_df (SettingWithCopyWarning).
diagnosis_desc_df = hos_sql_df[['CCS Diagnosis Description']][0:10000].copy()
diagnosis_desc_df['CCS Diagnosis Description'] = (
    diagnosis_desc_df['CCS Diagnosis Description'].astype('str', errors = 'ignore').str.lower()
)

# Not used in the visible part of the notebook; kept in case a later cell refers to it.
list_of_titles = []

# Split every description into runs of word characters.
tokenizer = RegexpTokenizer(r'\w+')
diagnosis_desc_df['Tokens'] = diagnosis_desc_df['CCS Diagnosis Description'].apply(tokenizer.tokenize)
list_of_tokens = diagnosis_desc_df['Tokens'].tolist()

# Stop words such as "a", "and" and "the" carry no diagnostic meaning.
# `stop_words` is also read by generate_cloud() further down.
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

# Flatten the token lists and drop the stop words in a single pass. The earlier version
# called list.remove() once per stop word, which rescans the list every time.
diagnosis_words = [
    word
    for tokens in list_of_tokens
    for word in tokens
    if word not in stop_words
]

# The 30 most frequent remaining words.
Counter1 = Counter(diagnosis_words)
most_occur = Counter1.most_common(30)
diagnosis_counter = [word for word, _count in most_occur]

# The word cloud input is each of those words once, separated by spaces.
cloud_words = ' ' + ''.join(word + ' ' for word in diagnosis_counter)

diagnosis_word_plot = WordCloud(width = 800, height = 800).generate(cloud_words)
plt.figure(figsize = (8, 8), facecolor = None)
plt.imshow(diagnosis_word_plot)
plt.show()

Let's make one of these for each severity type to see if we can notice any differences.

In [None]:
#Minor
# Lower-case every diagnosis description in hos_sql_df itself, so all the severity subsets
# taken from it share the same casing.
description_col = 'CCS Diagnosis Description'
description_text = hos_sql_df[description_col].astype('str', errors = 'ignore')
hos_sql_df[description_col] = description_text.map(lambda text: text.lower())

# First 40,000 descriptions of the patients whose severity of illness is 'Minor'.
is_minor = hos_sql_df['APR Severity of Illness Description'] == 'Minor'
minor_df = hos_sql_df.loc[is_minor, [description_col]].iloc[:40000]

#lets create a function for this
def generate_cloud(type_df):
    """Draw a word cloud of the 30 most frequent non-stop-words in the diagnosis descriptions.

    Parameters
    ----------
    type_df : DataFrame
        Must have a 'CCS Diagnosis Description' column whose values are strings.
        The frame is only read. Earlier versions added a 'Tokens' column to it, which
        modified the caller's frame and triggered SettingWithCopyWarning on sliced input.

    Returns
    -------
    None
        The figure is shown with plt.show().

    Notes
    -----
    Reads the notebook-level `stop_words` set and uses the notebook-level `Counter` and `plt`.
    """
    from nltk.tokenize import RegexpTokenizer
    from wordcloud import WordCloud

    tokenizer = RegexpTokenizer(r'\w+')

    # Tokenise each description and drop stop words in a single pass, instead of calling
    # list.remove() once per stop word, which rescans the list every time.
    type_words = [
        word
        for description in type_df['CCS Diagnosis Description']
        for word in tokenizer.tokenize(description)
        if word not in stop_words
    ]

    most_occur = Counter(type_words).most_common(30)

    # The word cloud input is each of the top words once, separated by spaces.
    cloud_words = ' ' + ''.join(word + ' ' for word, _count in most_occur)

    type_word_plot = WordCloud(width = 800, height = 800).generate(cloud_words)
    plt.figure(figsize = (8, 8), facecolor = None)
    plt.imshow(type_word_plot)
    plt.show()

# Draw the word cloud for the minor-severity diagnosis descriptions.
generate_cloud(minor_df)


In [None]:
# First 40,000 diagnosis descriptions of the 'Moderate' severity patients, then their word cloud.
is_moderate = hos_sql_df['APR Severity of Illness Description'] == 'Moderate'
moderate_df = hos_sql_df.loc[is_moderate, ['CCS Diagnosis Description']].iloc[:40000]

generate_cloud(moderate_df)

In [None]:
# First 40,000 diagnosis descriptions of the 'Major' severity patients, then their word cloud.
is_major = hos_sql_df['APR Severity of Illness Description'] == 'Major'
major_df = hos_sql_df.loc[is_major, ['CCS Diagnosis Description']].iloc[:40000]

generate_cloud(major_df)


In [None]:
# First 40,000 diagnosis descriptions of the 'Extreme' severity patients, then their word cloud.
is_extreme = hos_sql_df['APR Severity of Illness Description'] == 'Extreme'
extreme_df = hos_sql_df.loc[is_extreme, ['CCS Diagnosis Description']].iloc[:40000]

generate_cloud(extreme_df)

Drop Columns
Now, in preparation for the modeling component of this project and following the data visualization section where I used some of these columns, I will now drop all the columns that won't help our model predict length of stay. In this section, I end up dropping the following columns:

Zip Code
Facility Name
CCS Diagnosis Description
CCS Procedure Description
APR DRG Description
APR MDC Description
Discharge Year
Operating Certificate Number
Ethnicity
Hospital County
Birth Weight
APR Severity of Illness Description
Attending Provider License Number
Patient Disposition
Most of these columns have corresponding columns with numeric values, which I use instead in order to avoid one-hot encoding columns with string values, which would dramatically increase the dimensionality of the dataset. I drop these types of columns after comparing them with their corresponding "code" columns to ensure they have the same number of distinct values and thus contain the same information.

Although there are a few other columns that might not be strong predictors, I decide to keep them after noticing slight decreases in model accuracy when the columns are dropped. Attending provider license number is the exception: as listed above, it is dropped so that the model can generalize to any attending provider.

In [None]:
# Before dropping the severity description, check that the severity code carries the same
# information by comparing the two frequency distributions side by side.
fig, ax = plt.subplots(1, 2, figsize=(14, 5))
# Pass the column by keyword together with the frame: a Series given positionally is taken
# as `data` (not `x`) by current seaborn releases.
sns.countplot(x='APR Severity of Illness Description', data=hos_sql_df, ax=ax[0])
sns.countplot(x='APR Severity of Illness Code', data=hos_sql_df, ax=ax[1])
# plt.show() renders with the inline backend; fig.show() expects a GUI backend.
plt.show()

In [None]:
# The illness code column was confirmed to encode the same information, so only the
# description is dropped.
hos_sql_df_1 = hos_sql_df.drop(columns=['APR Severity of Illness Description'])


In [None]:
# Compare the number of distinct hospital counties with the number of distinct zip codes.
num_county = hos_sql_df_1['Hospital County'].unique().tolist()
num_zip = hos_sql_df_1['Zip Code - 3 digits'].unique().tolist()
for label, distinct_values in (
    ("Number of Hospital County's:", num_county),
    ("Number of Zipcodes:", num_zip),
):
    print(label, len(distinct_values))
# Author's conclusion from this output: hospital county and zip code do not encode the same info.

In [None]:
# Compare the number of distinct facility ids with the number of distinct facility names.
num_facilities = hos_sql_df_1['Facility Id'].unique().tolist()
num_facname = hos_sql_df_1['Facility Name'].unique().tolist()
for label, distinct_values in (
    ("Number of Facility Ids:", num_facilities),
    ("Number of Facility Names:", num_facname),
):
    print(label, len(distinct_values))
# Author's conclusion: the two most likely encode the same info even though the counts are
# off by one, so the names column will be dropped.

In [None]:
# Compare the number of distinct diagnosis codes with the number of distinct descriptions.
num_diag_code = hos_sql_df_1['CCS Diagnosis Code'].unique().tolist()
num_diag_desc = hos_sql_df_1['CCS Diagnosis Description'].unique().tolist()
for label, distinct_values in (
    ("Number of Diagnosis Codes:", num_diag_code),
    ("Number of Diagnosis Descriptions:", num_diag_desc),
):
    print(label, len(distinct_values))
# Author's conclusion: codes and descriptions encode the same info, so the descriptions
# will be dropped.

In [None]:
# List the possible procedure descriptions first, since many of them say NO PROC.
num_proc_desc = hos_sql_df_1['CCS Procedure Description'].unique().tolist()
print("Types of Procedure Descriptions:",hos_sql_df_1['CCS Procedure Description'].unique().tolist())

# Then check that the procedure code has as many distinct values as the description.
num_proc_code = hos_sql_df_1['CCS Procedure Code'].unique().tolist()
for label, distinct_values in (
    ("Number of Procedure Codes:", num_proc_code),
    ("Number of Procedure Descriptions:", num_proc_desc),
):
    print(label, len(distinct_values))
# Author's conclusion: they contain the same information, so the descriptions will be dropped.

In [None]:
# Compare the number of distinct APR DRG codes with the number of distinct descriptions.
num_drg_code = hos_sql_df_1['APR DRG Code'].unique().tolist()
num_drg_desc = hos_sql_df_1['APR DRG Description'].unique().tolist()
for label, distinct_values in (
    ("Number of DRG Codes:", num_drg_code),
    ("Number of DRG Descriptions:", num_drg_desc),
):
    print(label, len(distinct_values))
# Author's conclusion: same number of unique values, so the descriptions will be dropped.

In [None]:
# Compare the number of distinct APR MDC codes with the number of distinct descriptions.
num_mdc_code = hos_sql_df_1['APR MDC Code'].unique().tolist()
num_mdc_desc = hos_sql_df_1['APR MDC Description'].unique().tolist()
for label, distinct_values in (
    ("Number of MDC Codes:", num_mdc_code),
    ("Number of MDC Descriptions:", num_mdc_desc),
):
    print(label, len(distinct_values))
# Author's conclusion: same number of unique values, so the descriptions will be dropped.

In [None]:
#Count the distinct attending providers (the person responsible for the overall care of the inpatient)
attending_license_numbers = hos_sql_df_1['Attending Provider License Number'].unique().tolist()
print("Number of Attending Provider License Numbers:", len(attending_license_numbers))
#The attending provider might play a large role in how long a patient stays (27,085 distinct providers were reported).
#NOTE(review): the original note said this column is kept, but the drop cell that follows removes it - confirm which is intended

In [None]:

#Run this once all of the checks above have been reviewed!
#Every column removed from the modeling table, with the reason it is removed
columns_to_drop = [
    'Zip Code - 3 digits',                #use average income as a feature instead
    'Facility Name',
    'CCS Diagnosis Description',
    'CCS Procedure Description',
    'APR DRG Description',
    'APR MDC Description',
    'Discharge Year',                     #these are all 2015 since the dataset is from 2015 inpatient records
    'Operating Certificate Number',       #should not be a predictor for inpatient length of stay
    'Ethnicity',                          #contains less information than and is contained within the Race column
    'Hospital County',                    #dropped for computational efficiency
    'Birth Weight',                       #looked at up above
    'Attending Provider License Number',  #to enable generalization of model to any attending providers
    'Patient Disposition',                #data leakage feature
]
hos_sql_df_1 = hos_sql_df_1.drop(columns_to_drop, axis=1)

# **Feature Encoding**

Now we are ready to format our dataframe for modeling. However, for the data cleaning process, I converted a few columns that represent categorical features into numerical values to facilitate some processes. Now, these must be converted back to categorical columns.

In [None]:
#Show the dtype of every remaining column so we can see which ones still need encoding
hos_sql_df_1.dtypes

In [None]:
#Ordinal lookups: each ordered category string is mapped to an increasing integer rank
mort_string_index = {'Minor': 1, 'Moderate': 2, 'Major': 3, 'Extreme': 4}
age_string_index = {'0 to 17': 1, '18 to 29': 2, '30 to 49': 3, '50 to 69': 4, '70 or Older': 5}

#Replace the strings in place; a value missing from its lookup raises a KeyError
for ordinal_column, string_index in (('Age Group', age_string_index),
                                     ('APR Risk of Mortality', mort_string_index)):
    hos_sql_df_1[ordinal_column] = hos_sql_df_1[ordinal_column].apply(lambda raw_value: string_index[raw_value])
display(hos_sql_df_1)

One-hot encoding of all applicable columns is performed

In [None]:
#pd.get_dummies with no `columns` argument one-hot encodes the object/category columns and passes the other columns through unchanged
encoded_df = pd.get_dummies(hos_sql_df_1)
display(encoded_df)

In [None]:
#Mark the identifier/code columns as categorical.
#NOTE(review): this happens after get_dummies, so these columns are not one-hot encoded - confirm that is intended
category_columns = [
    'Facility Id',
    'CCS Diagnosis Code',
    'CCS Procedure Code',
    'APR DRG Code',
    'APR MDC Code',
    'APR Severity of Illness Code',
    #'Attending Provider License Number',
]
for category_column in category_columns:
    encoded_df[category_column] = encoded_df[category_column].astype('category')

Let's make the correlation matrix one more time to assess for correlation between our categorical variables. Here we can see some patches of strong positive and negative correlations in our data. This motivates the use of dimensionality reduction of correlated variables later on

In [None]:
#Pairwise correlation of the encoded columns, drawn on a diverging palette centred on zero
corr = encoded_df.corr()
f, ax = plt.subplots(figsize=(11, 9))
heatmap_palette = sns.diverging_palette(20, 220, n=200)
ax = sns.heatmap(corr, vmin=-1, vmax=1, center=0, cmap=heatmap_palette, square=True, ax=ax)
#Tilt the x tick labels so the long column names do not overlap
ax.set_xticklabels(ax.get_xticklabels(), rotation=45, horizontalalignment='right');

# **Preliminary Modeling**

Here, I prepare my data for predictive modeling using machine learning methods. This preliminary process consists of two main steps:

First, the training data is separated from the labels by dropping the label column and dropping columns that would not be present at the time of length of stay prediction in a real-world scenario (such as total charges and total costs). This prevents data leakage from occurring
Then, the data is split into train and test sets for use in the remainder of the project

In [None]:
#Features: everything except the label and the columns that would not be known at prediction time
X = encoded_df.drop(['Length of Stay','Total Charges','Total Costs'], axis=1) #remove data leakage features
#Label: the raw length of stay in days (kept as a single-column DataFrame)
y = encoded_df[['Length of Stay']]
#Seed the 70/30 split so the metrics reported below are reproducible on a fresh kernel
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

For our first model, let's try a simple linear regression model to predict length of stay. This means treating length of stay as a somewhat continuous variable that takes integer values from 1 to 120

# **Linear Regression**

Below, I initialize, train, and test a linear regression model on the data. I output the model's mean squared error, in addition to an accuracy score obtained by rounding the prediction outputs. This allows for easier comparison with the classification models tested later on.

In [None]:
#Fit ordinary least squares on the raw features and predict both splits
reg = LinearRegression()
reg.fit(X_train, y_train)
y_pred = reg.predict(X_test)
y_train_pred = reg.predict(X_train)

#Regression error on each split
mse_test = mean_squared_error(y_test, y_pred)
mse_train = mean_squared_error(y_train, y_train_pred)

#Round the predictions to whole days so an exact-match accuracy can be computed
y_round_pred = y_pred.round()
y_round_train_pred = y_train_pred.round()
test_acc = accuracy_score(y_test, y_round_pred)
train_acc = accuracy_score(y_train, y_round_train_pred)

print(mse_test)
print(mse_train)
print('Test accuracy:', test_acc)
print('Train accuracy:', train_acc)

We can see the linear regression model does not perform particularly well. These results motivate me to move forward with machine learning multi-class classification models as opposed to regression models

# **PCA**

Next, I perform principal component analysis (PCA) to see if this can improve future model results and efficiency by scaling and reducing the dimensionality of the data. As is shown below, I am able to eliminate ~20 columns from the dataframe while retaining 95% explained variance in the data.

In [None]:
#Learn the scaling statistics from the training split only, then apply the same transform to both splits.
#Fitting a second scaler on the test split would scale train and test differently and leak test statistics.
scaler = StandardScaler().fit(X_train)
x_train = scaler.transform(X_train)
x_test = scaler.transform(X_test)

#Full PCA (all components) on the scaled training data, used only to inspect the explained variance
pca1 = PCA()
pca1.fit(x_train)
explained_variance_ratio = pca1.explained_variance_ratio_
#Cumulative explained variance: entry k is the variance explained by the first k+1 components
pc_vs_variance = np.cumsum(pca1.explained_variance_ratio_)
plt.plot(pc_vs_variance)
plt.xlabel('Number of Components')
plt.ylabel('% Explained Variance')
plt.title('PCA Explained Variance vs. Number of Components')
plt.show()

In [None]:
#Zero-based position of the first cumulative-variance entry above 95% (IndexError if there is none)
first_index_above_95 = int(np.flatnonzero(pc_vs_variance > 0.95)[0])
variance_95 = pc_vs_variance[first_index_above_95]
#pc_vs_variance[k] is the variance explained by the first k+1 components, so the component COUNT is index + 1.
#Using the bare index as n_components keeps one component too few and falls short of 95%.
component_95 = first_index_above_95 + 1
print(component_95)

#perform the dimensionality reduction (fit on train only, then project test with the same components)
pca2 = PCA(n_components=component_95)
x_train = pca2.fit_transform(x_train)
x_test = pca2.transform(x_test)

Which are the most important features?

Below, I briefly explore the relative feature importance in the dataset using the PCA results from above. This is done because it will help answer questions such as: which factors actually influence how long patients stay in a hospital? This information could prove useful for hospitals as it can highlight important factors to pay attention to

I calculate the features of greatest importance based on the magnitude of their contribution (loading) to the first principal component of the dataset

In [None]:
#One row per retained principal component, one column per original feature (the loadings)
pc_df = pd.DataFrame(data=pca2.components_, columns=X_train.columns)
#Keep only the first principal component, as a one-row frame
top_pc = pc_df.iloc[:1]
display(top_pc)

In [None]:
#Repeatedly pick the feature with the largest absolute loading on the first component,
#record it, and remove it, until ten features have been collected
new_top_pc = top_pc.abs()
features = []
while len(features) < 10:
    strongest = new_top_pc.idxmax(axis=1).tolist()  #single-element list holding the column name
    features.append(strongest)
    new_top_pc = new_top_pc.drop(columns=strongest)
print(features)

In [None]:
#Absolute loading of every feature on the first principal component
top_pc = top_pc.abs()
labels = top_pc.columns.tolist()
list_magnitude = top_pc.loc[0].tolist()
feature_importance_df = pd.DataFrame({'Feature': labels, 'Relative Importance': list_magnitude})
#feature_importance_df = feature_importance_df.nlargest(10, 'Relative Importance')

#Bar chart with one bar per feature; labels are rotated so they stay readable
fig, ax = plt.subplots(figsize=(18,15))
sns.barplot(x='Feature', y='Relative Importance', data=feature_importance_df.reset_index(), ax=ax)
plt.xticks(rotation=90)
plt.show()

From this figure, we can see that features such as the illness code, age group, type of admission, risk of mortality, payment typology and emergency department indicator play a large role in explaining the variance in the dataset. This supports the findings suggested by the figures in the data visualization section.

# **Decision Tree**

For our next model, let's train a decision tree classifier. Decision trees are one of the most common models for classification problems, so let's see how one performs on this data

In [None]:
#Decision tree with default hyperparameters, trained on the non-PCA features
dtree = DecisionTreeClassifier()
dtree.fit(X=X_train, y=y_train)

In [None]:
#Score the fitted tree on the split it was trained on and on the held-out split
train_predictions = dtree.predict(X_train)
test_predictions = dtree.predict(X_test)
for split_caption, y_true, y_hat in (("Train Accuracy:", y_train, train_predictions),
                                     ("Test Accuracy:", y_test, test_predictions)):
    print(split_caption, metrics.accuracy_score(y_true, y_hat))

Without parameter optimization, we can see that we have definitely overfit. This is evident in the extremely high training accuracy, but with a much lower test accuracy. However, we are performing better than the linear regression model. Let's see if we can do better by setting some model hyperparameters

In [None]:
#Depth- and leaf-limited tree, trained on the PCA-transformed features
dtree = DecisionTreeClassifier(max_leaf_nodes=150, max_depth=10)
dtree.fit(X=x_train, y=y_train)

#Accuracy on the training split and on the held-out split
train_predictions = dtree.predict(x_train)
test_predictions = dtree.predict(x_test)
for split_caption, y_true, y_hat in (("Train Accuracy:", y_train, train_predictions),
                                     ("Test Accuracy:", y_test, test_predictions)):
    print(split_caption, metrics.accuracy_score(y_true, y_hat))

By setting the max depth and max leaf nodes parameters, I have successfully prevented overfitting of the decision tree. Below is a visualization of the tree, which allows us to assess model complexity visually by looking at the depth of the tree and the number of leaf nodes present

In [None]:
#Draw the fitted tree on a large canvas so the depth and leaf count can be judged visually
f, ax = plt.subplots(figsize=(50, 30))
plot_tree(dtree, ax=ax)
plt.show()

To confirm the use of classification for the remainder of the modeling, I test out a decision tree regressor. We can see that it performs worse than its classification counterpart

In [None]:
#Regression counterpart of the limited tree, trained on the PCA-transformed features
clf = DecisionTreeRegressor(max_leaf_nodes=150, max_depth=10)
clf.fit(x_train, y_train)
test_pred = clf.predict(x_test)
train_pred = clf.predict(x_train)

#Regression error on each split
mse_test = mean_squared_error(y_test, test_pred)
mse_train = mean_squared_error(y_train, train_pred)

#Round to whole days so the result can be compared with the classifier's accuracy
y_round_pred = test_pred.round()
y_round_train_pred = train_pred.round()
test_acc = accuracy_score(y_test, y_round_pred)
train_acc = accuracy_score(y_train, y_round_train_pred)

print(mse_test)
print(mse_train)
print('Test accuracy:', test_acc)
print('Train accuracy:', train_acc)

We could keep tuning parameters with the decision tree classifier to get better results, but we will get better performance using a random forest - an ensemble of decision trees. However, before attempting to implement another model, I decide to look at another way of increasing model performance.

After observing that decision tree classification seems to perform better than the regression, I am inclined to continue with classification. Along these lines, I will create bins of length of stay to construct a new categorical predictor column. Not only do these bins make sense given the fact that there is not a large difference between staying at a hospital for 57 days vs. 58 days, for example, but this will also increase the prediction accuracy of the model I build. This is evident in that the model can now predict that a given patient will stay within a given range of days, instead of having to predict an exact number of days. Below, I will perform this construction of new categories

# **Label/Predictor Construction**

Summary of findings from this section:

From the exploratory data analysis section of this project, I discovered the huge class imbalance present in the dataset given that most of the length of stays are clustered in the 1-5 day range
Class imbalance can have significant negative effects on model performance. For example, model accuracy can be misleadingly high if the model simply only predicts the classes that appear a lot, in the process disregarding features that may point to that data row belonging to another, less common class. I find that this is the case with my models.
In this section, I attempt to deal with this class imbalance in a few different ways after observing the effects of this class imbalance on model performance. These effects are discussed in this section.
After trying numerous different labeling schemes that involve length of stay bins of different sizes, and different numbers of length of stay bins that allowed for more/less specificity in predictions, I came to an optimal binning format that had a high accuracy score for most models trained on it, while not losing large amounts of specificity in predictions
The bins I decided to use for my final models are listed here: [0-3, 3-6, 6-9, 9-13, 13-20, 20-50, 50-120+] where all units are in days
This bin format results in a baseline accuracy of 1/7, or ~14.3% accuracy if the model were to guess randomly
Below shows some snippets of the iterative process I went through in order to determine this optimal bin format

As I create these categories, I will keep in mind that I want to create a somewhat even distribution of the number of data rows within each bin. The below label construction was made with the fact that small differences in length of stay matter less for patients who have long lengths of stay than for patients who are only staying 1 or 2 days. In other words, the difference between staying at a hospital for 1 day versus 2 days is more significant than staying 119 days vs. 120 days

In [None]:
#Bin edges in days: single-day bins up to 10, then progressively wider bins up to 120
bins = [0,1,2,3,4,5,6,7,8,9,10,15,20,25,30,35,40,45,50,60,70,80,90,100,120]
#Interval form of the bin, used for plotting
encoded_df['stay_bin'] = pd.cut(encoded_df['Length of Stay'], bins)
#Integer form of the bin (0 .. number of bins - 1), used as the class label
encoded_df['stay_label'] = pd.cut(encoded_df['Length of Stay'], bins,
                                  labels=list(range(len(bins) - 1)))
#Prettify the interval text: "a, b" becomes "a - b", and the top edge reads "120+" to be more descriptive
encoded_df['stay_bin'] = encoded_df['stay_bin'].apply(
    lambda interval: str(interval).replace(',',' -').replace('120','120+'))
display(encoded_df)

I visualize my new class distribution below

In [None]:
#Number of rows that fall into each length-of-stay bin
f, ax = plt.subplots(figsize=(20, 15))
sns.countplot(data=encoded_df, x='stay_bin', ax=ax)
plt.xticks(rotation=90)
plt.title('Class Distribution')
plt.show()

Let's run another decision tree classifier and see how much our classification improves now with these bins

In [None]:
#create train and test sets for the binned label (seeded so the split is reproducible)
new_X = encoded_df.drop(['Length of Stay','Total Charges','Total Costs','stay_bin','stay_label'], axis=1)
new_y = encoded_df[['stay_label']]
X_train, X_test, y_train, y_test = train_test_split(new_X, new_y, test_size=0.3, random_state=0)

#standardise with statistics learned from the training split only, then reuse them for the test split
#(fitting a second scaler on the test split would scale the two splits differently)
scaler = StandardScaler().fit(X_train)
x_train = scaler.transform(X_train)
x_test = scaler.transform(X_test)

#perform pca
pca = PCA(n_components=29) #29 components; the earlier "#50 components" comment did not match the code
x_train = pca.fit_transform(x_train)
x_test = pca.transform(x_test)

#Fit the tree on the binned labels here. Without this, the confusion matrix in the next cell would use
#the stale `dtree` that was trained on the raw length of stay. Hyperparameters match the earlier limited tree.
dtree = DecisionTreeClassifier(max_depth=10, max_leaf_nodes=150)
dtree.fit(x_train, y_train)

train_predictions = dtree.predict(x_train)
test_predictions = dtree.predict(x_test)
print("Train Accuracy:",metrics.accuracy_score(y_train, train_predictions))
print("Test Accuracy:",metrics.accuracy_score(y_test, test_predictions))

We are getting a similar accuracy as with our decision tree above. This caused me to raise an eyebrow and hypothesize that there is an issue with class imbalance, as the model may just be predicting shorter lengths of stay in both models since there are the most instances of these classes. Let's plot a confusion matrix to see if we can confirm this

In [None]:
#Confusion matrix on the test split, with each row normalised over the true label
f, ax = plt.subplots(figsize=(20, 20))
plot_confusion_matrix(estimator=dtree, X=x_test, y_true=y_test,
                      normalize='true', cmap=plt.cm.Blues, ax=ax)
plt.title('Confusion Matrix with Normalization')
plt.show()

We can see the dramatic effects of the class imbalance on this model from this confusion matrix. As we can see, the model is mostly predicting labels 0-4. Then, it almost never predicts any values between 5 and 9 or 11 and 23. Instead, it classifies the remainder of the predictions in the 10 label. An idea for the reason behind this is that there is a spike in class count in this bin as seen from the class distribution plot above. Thus, the model predicts rows with features that correspond to longer lengths of stay entirely in this bin, as most of these instances correspond to this label. This result illustrates the danger of class imbalance in a modeling problem

Next, I explore two ways of dealing with class imbalance: undersampling and assigning class weights that penalize overpredicting the most common label

# **Undersampling Techniques Following Label Construction**

In [None]:
#We have major class imbalance issues; start by looking at how many training rows each label has
label_frequencies = y_train['stay_label'].value_counts()
counts = label_frequencies.tolist()
print(counts)



In [None]:
#Undersample the first four classes down to the size of the (10-15] bucket, which is label 10.
#The target size is read from the current training split rather than hard-coded, because a literal
#count is only valid for one particular random split.
train_labels = y_train['stay_label']
bucket_10_15_size = int((train_labels == 10).sum())
#Never request more rows than a class actually has (the sampler rejects that)
resample_dict = {label: min(bucket_10_15_size, int((train_labels == label).sum()))
                 for label in range(4)}
rus = RandomUnderSampler(random_state=0, sampling_strategy=resample_dict)
x_resampled, y_resampled = rus.fit_resample(x_train, y_train)
#Print the label values next to their counts after resampling
unique_elements, counts_elements = np.unique(y_resampled, return_counts=True)
print(np.asarray((unique_elements, counts_elements)))

In [None]:
#Plot the new class distribution in the (resampled) train set
y_resampled_df = pd.DataFrame(y_resampled, columns=['stay_label'])

f, ax = plt.subplots(figsize=(20, 15))
sns.countplot(data=y_resampled_df, x='stay_label', ax=ax)
plt.title('Class Distribution')
plt.show()

In [None]:
#Refit the limited decision tree, this time on the undersampled training data
dtree = DecisionTreeClassifier(max_leaf_nodes=300, max_depth=10)
dtree.fit(X=x_resampled, y=y_resampled)



In [None]:
#Accuracy is measured on the full (not resampled) training split and on the test split
train_predictions = dtree.predict(x_train)
test_predictions = dtree.predict(x_test)
for split_caption, y_true, y_hat in (("Train Accuracy:", y_train, train_predictions),
                                     ("Test Accuracy:", y_test, test_predictions)):
    print(split_caption, metrics.accuracy_score(y_true, y_hat))

In [None]:
#Confusion matrix on the test split for the tree trained on undersampled data (rows normalised over the true label)
f, ax = plt.subplots(figsize=(20, 20))
plot_confusion_matrix(estimator=dtree, X=x_test, y_true=y_test,
                      normalize='true', cmap=plt.cm.Blues, ax=ax)
plt.title('Confusion Matrix with Normalization')
plt.show()


We are still over-predicting in certain bins. I undersample the original training data even further until I reach a distribution that causes the trend visualized in the confusion matrix above to disappear.

In [None]:
#Downsample every class to at most a fixed number of training rows
label_frequencies = y_train['stay_label'].value_counts()
counts = label_frequencies.tolist()
print(counts)

#Cap applied to every class. Classes that already have fewer rows keep all of them, which is what
#the original hand-written dictionary achieved by leaving those labels out.
samples_per_class = 1000
#Built from the observed counts so the cell keeps working when the split (and so the counts) changes;
#labels with no rows in the training split are skipped because the sampler only accepts classes present in y
resample_dict = {label: min(int(n_rows), samples_per_class)
                 for label, n_rows in label_frequencies.items() if n_rows > 0}
rus = RandomUnderSampler(random_state=0, sampling_strategy=resample_dict)
x_resampled, y_resampled = rus.fit_resample(x_train, y_train)
#Print the label values next to their counts after resampling
unique_elements, counts_elements = np.unique(y_resampled, return_counts=True)
print(np.asarray((unique_elements, counts_elements)))

In [None]:
#Refit a deeper, larger tree on the heavily undersampled training data
dtree = DecisionTreeClassifier(max_leaf_nodes=500, max_depth=30)
dtree.fit(X=x_resampled, y=y_resampled)



In [None]:
#Accuracy is measured on the full (not resampled) training split and on the test split
train_predictions = dtree.predict(x_train)
test_predictions = dtree.predict(x_test)
for split_caption, y_true, y_hat in (("Train Accuracy:", y_train, train_predictions),
                                     ("Test Accuracy:", y_test, test_predictions)):
    print(split_caption, metrics.accuracy_score(y_true, y_hat))

In [None]:
#Confusion matrix on the test split after the second round of undersampling (rows normalised over the true label)
f, ax = plt.subplots(figsize=(20, 20))
plot_confusion_matrix(estimator=dtree, X=x_test, y_true=y_test,
                      normalize='true', cmap=plt.cm.Blues, ax=ax)
plt.title('Confusion Matrix with Normalization')
plt.show()

Finally, we can see that our training accuracy has gone down, but we no longer see over-prediction of any particular category bins. Thus, we have successfully dealt with the negative effect of class imbalance on model performance. Next, I will explore the second method of dealing with the class imbalance and compare the results between the two in order to proceed with the optimal method. The below results are on an iteration of the dataset with fewer labels than used above. While the thought process behind this is explained later, the main result of dealing with class balance is emphasized here.

In [None]:
#Decision tree classification with no undersampling, on the non-PCA features
dtree = DecisionTreeClassifier(max_leaf_nodes=300, max_depth=10)
dtree.fit(X=X_train, y=y_train)

#Accuracy on the training split and on the held-out split
train_predictions = dtree.predict(X_train)
test_predictions = dtree.predict(X_test)
for split_caption, y_true, y_hat in (("Train Accuracy:", y_train, train_predictions),
                                     ("Test Accuracy:", y_test, test_predictions)):
    print(split_caption, metrics.accuracy_score(y_true, y_hat))

Here is where I implement the weights method of dealing with class imbalance. The “balanced” mode for the class weight parameter of the decision tree uses the values of y to automatically adjust weights inversely proportional to class frequencies in the input data as n_samples / (n_classes * np.bincount(y))

In [None]:
#Decision tree classification with the balanced class weight parameter, on the PCA-transformed features
dtree = DecisionTreeClassifier(class_weight='balanced', max_leaf_nodes=300, max_depth=15)
dtree.fit(X=x_train, y=y_train)

#Accuracy on the training split and on the held-out split
train_predictions = dtree.predict(x_train)
test_predictions = dtree.predict(x_test)
for split_caption, y_true, y_hat in (("Train Accuracy:", y_train, train_predictions),
                                     ("Test Accuracy:", y_test, test_predictions)):
    print(split_caption, metrics.accuracy_score(y_true, y_hat))

In [None]:
#Confusion matrix for the class-weighted tree trained without undersampling (rows normalised over the true label)
f, ax = plt.subplots(figsize=(15, 15))
plot_confusion_matrix(estimator=dtree, X=x_test, y_true=y_test,
                      normalize='true', cmap=plt.cm.Blues, ax=ax)
plt.title('Confusion Matrix for Balanced Class Weights no Undersampling with Normalization')
plt.show()

We can see that we get similar results with class weights as with when we perform class balancing - interesting and useful to know

# **Class Balancing Pt. 2**

In [None]:
#Alternative binning that was tried:
#bins = [0,3,6,9,13,20,50,120]
#labels = [3,6,9,13,20,50,120]

#Coarse bin edges in days; each bin is labelled with its upper edge
bins = [0,5,10,20,30,50,120]
labels = bins[1:]
#encoded_df is the raw dataframe following one-hot encoding; both columns are overwritten in place
encoded_df['stay_bin'] = pd.cut(encoded_df['Length of Stay'], bins)
#the labels are now the upper bin edges, which is more descriptive since the bins are much larger
encoded_df['stay_label'] = pd.cut(encoded_df['Length of Stay'], bins, labels=labels)
#Prettify the interval text: "a, b" becomes "a - b", and the top edge reads "120+" to be more descriptive
encoded_df['stay_bin'] = encoded_df['stay_bin'].apply(
    lambda interval: str(interval).replace(',',' -').replace('120','120+'))
display(encoded_df)

In [None]:
#Number of rows that fall into each of the coarse length-of-stay bins
f, ax = plt.subplots(figsize=(15, 11))
sns.countplot(data=encoded_df, x='stay_bin', palette='Reds', ax=ax)
plt.xticks(rotation=90)
plt.title('Class Distribution')
plt.xlabel('Length of Stay Bins')
plt.ylabel('Patient Count (millions)')
plt.show()

In [None]:
#create train and test sets for the coarse bins (seeded so the split is reproducible)
new_X = encoded_df.drop(['Length of Stay','Total Charges','Total Costs','stay_bin','stay_label'], axis=1)
new_y = encoded_df[['stay_label']]
X_train, X_test, y_train, y_test = train_test_split(new_X, new_y, test_size=0.3, random_state=0)

#standardise with statistics learned from the training split only, then reuse them for the test split
#(fitting a second scaler on the test split would scale the two splits differently)
scaler = StandardScaler().fit(X_train)
x_train = scaler.transform(X_train)
x_test = scaler.transform(X_test)

#perform pca: fit the projection on train, apply the same projection to test
pca = PCA(n_components=29) #29 components, as found above
x_train = pca.fit_transform(x_train)
x_test = pca.transform(x_test)

In [None]:
#Logistic regression (one-vs-rest) without class balancing, on the PCA-transformed features
from sklearn.linear_model import LogisticRegression

log_reg = LogisticRegression(multi_class='ovr')
log_reg.fit(x_train, y_train)
y_pred = log_reg.predict(x_test)
y_train_pred = log_reg.predict(x_train)

#Exact-match accuracy on each split
train_acc = accuracy_score(y_train, y_train_pred)
test_acc = accuracy_score(y_test, y_pred)

print('Test accuracy:', test_acc)
print('Train accuracy:', train_acc)

In [None]:
#Confusion matrix for the logistic regression fitted without class balancing (rows normalised over the true label)
f, ax = plt.subplots(figsize=(11, 9))
plot_confusion_matrix(estimator=log_reg, X=x_test, y_true=y_test,
                      normalize='true', cmap=plt.cm.Blues, ax=ax)
plt.title('Confusion Matrix Without Class Balancing')
plt.show()

In [None]:
#Text report of the main classification metrics, comparing y_test with the most recent y_pred
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))

In [None]:
#Logistic regression (one-vs-rest) with balanced class weights.
#NOTE(review): the original comment said this works better on standardized data with no PCA, but the
#fit below uses the PCA-transformed x_train - confirm which is intended
log_reg = LogisticRegression(multi_class='ovr', class_weight='balanced')
log_reg.fit(x_train, y_train)
y_pred = log_reg.predict(x_test)
y_train_pred = log_reg.predict(x_train)

#Exact-match accuracy on each split
train_acc = accuracy_score(y_train, y_train_pred)
test_acc = accuracy_score(y_test, y_pred)

print('Test accuracy:', test_acc)
print('Train accuracy:', train_acc)

In [None]:
#Text report of the main classification metrics, comparing y_test with the most recent y_pred
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))

In [None]:
#Another example of the importance of class balancing:
#confusion matrix for the class-weighted logistic regression (rows normalised over the true label)
f, ax = plt.subplots(figsize=(11, 9))
plot_confusion_matrix(estimator=log_reg, X=x_test, y_true=y_test,
                      normalize='true', cmap=plt.cm.Blues, ax=ax)
plt.title('Confusion Matrix With Class Balancing')
plt.show()

As we can see, the balanced class weight parameter has again prevented over-prediction in specific label values

In [None]:
#Unconstrained (overfit) tree with balanced class weights, on the non-PCA features
dtree = DecisionTreeClassifier(class_weight='balanced')
dtree.fit(X=X_train, y=y_train)

#Accuracy on the training split and on the held-out split
train_predictions = dtree.predict(X_train)
test_predictions = dtree.predict(X_test)
for split_caption, y_true, y_hat in (("Train Accuracy:", y_train, train_predictions),
                                     ("Test Accuracy:", y_test, test_predictions)):
    print(split_caption, metrics.accuracy_score(y_true, y_hat))

Let's optimize parameters to prevent overfitting

In [None]:
#Validation curve over max_depth for the current tree, on the PCA-transformed training data (3-fold CV)
from sklearn.model_selection import validation_curve

param_range = [15,20,30,35,100]
train_scores, test_scores = validation_curve(dtree, x_train, y_train,
                                             param_name='max_depth', param_range=param_range,
                                             cv=3, scoring="accuracy")
#Mean and spread across the folds for every candidate depth
train_scores_mean, train_scores_std = np.mean(train_scores, axis=1), np.std(train_scores, axis=1)
test_scores_mean, test_scores_std = np.mean(test_scores, axis=1), np.std(test_scores, axis=1)

lw = 2
plt.subplots(figsize=(11,9))
plt.title("Validation Curve with Decision Tree Classification")
plt.xlabel('Max Depth')
plt.ylabel("Score")
plt.ylim(0.0, 1.1)
#One line plus a one-standard-deviation band for each of the two score series
for curve_caption, curve_color, mean_scores, std_scores in (
        ("Training score", "darkorange", train_scores_mean, train_scores_std),
        ("Cross-validation score", "navy", test_scores_mean, test_scores_std)):
    plt.plot(param_range, mean_scores, label=curve_caption, color=curve_color, lw=lw)
    plt.fill_between(param_range, mean_scores - std_scores, mean_scores + std_scores,
                     alpha=0.2, color=curve_color, lw=lw)
plt.legend(loc="best")
plt.show()

In [None]:
#Validation curve over max_depth for a balanced tree, this time on the non-PCA training data (3-fold CV)
from sklearn.model_selection import validation_curve

dtree = DecisionTreeClassifier(class_weight='balanced')
dtree.fit(X_train, y_train)
param_range = [15,20,30,35,100]
train_scores, test_scores = validation_curve(dtree, X_train, y_train,
                                             param_name='max_depth', param_range=param_range,
                                             cv=3, scoring="accuracy")
#Mean and spread across the folds for every candidate depth
train_scores_mean, train_scores_std = np.mean(train_scores, axis=1), np.std(train_scores, axis=1)
test_scores_mean, test_scores_std = np.mean(test_scores, axis=1), np.std(test_scores, axis=1)

lw = 2
plt.subplots(figsize=(11,9))
plt.title("Validation Curve with Decision Tree Classification")
plt.xlabel('Max Depth')
plt.ylabel("Score")
plt.ylim(0.0, 1.1)
#One line plus a one-standard-deviation band for each of the two score series
for curve_caption, curve_color, mean_scores, std_scores in (
        ("Training score", "darkorange", train_scores_mean, train_scores_std),
        ("Cross-validation score", "navy", test_scores_mean, test_scores_std)):
    plt.plot(param_range, mean_scores, label=curve_caption, color=curve_color, lw=lw)
    plt.fill_between(param_range, mean_scores - std_scores, mean_scores + std_scores,
                     alpha=0.2, color=curve_color, lw=lw)
plt.legend(loc="best")
plt.show()

Here, we can see the effect of the max depth parameter on decision tree performance. As max depth is increased, the decision tree becomes more prone to overfitting, resulting in a somewhat linear increase in training accuracy, but a constant cross-validation score throughout. Next, let's perform a grid search on this max depth parameter along with max leaf nodes in order to find the optimal combination of parameters for the model

Now I will test a random forest model on the data. With this large of a dataset, the random forest classifier is slow to run, and thus a hyperparameter optimization search is not the most feasible. Instead, let's try a randomized grid search on the decision tree classifier following the random forest model test

In [None]:
#try a randomized search on decision tree with 3-fold cross validation
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import RandomizedSearchCV

dtree = DecisionTreeClassifier(class_weight='balanced')
#Candidate values; the search samples combinations from this grid rather than trying all of them
search_vals = dict(max_depth=[35,50,75,100], max_leaf_nodes=[800,1000,1500,2000])
#random_state fixes which combinations are sampled, so best_params_ is reproducible on a fresh kernel
dtree_search = RandomizedSearchCV(dtree, search_vals, cv=3, random_state=0)
search = dtree_search.fit(X_train,y_train)
search.best_params_

Let's discover the improvement in model accuracy we can achieve when implementing these optimal parameters. The optimal parameters found were all on the largest end of the search parameter distributions, which suggests that larger values for max depth and max leaf nodes are most beneficial. For further improvement, a randomized grid search could be conducted over a wider distribution of parameter values. However, due to the large size of this dataset, computational efficiency is a limiting factor.

In [None]:
#Decision tree with the chosen max depth / max leaf nodes and balanced class weights, on the non-PCA features
from sklearn.tree import DecisionTreeClassifier
from sklearn import metrics

dtree = DecisionTreeClassifier(class_weight='balanced', max_leaf_nodes=1000, max_depth=50)
dtree.fit(X=X_train, y=y_train)

#Accuracy on the training split and on the held-out split
train_predictions = dtree.predict(X_train)
test_predictions = dtree.predict(X_test)
for split_caption, y_true, y_hat in (("Train Accuracy:", y_train, train_predictions),
                                     ("Test Accuracy:", y_test, test_predictions)):
    print(split_caption, metrics.accuracy_score(y_true, y_hat))

In [None]:
from sklearn.metrics import classification_report

# Per-class precision / recall / F1 for the most recent test-set predictions.
dtree_report = classification_report(y_test, test_predictions)
print(dtree_report)

In [None]:
#randomized search on random forest with 3-fold CV
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV

# Fixed seeds keep the bootstrap sampling of the forest and the sampled
# parameter combinations repeatable across a kernel restart.
rf = RandomForestClassifier(class_weight='balanced', random_state=0)
# Candidate values the randomized search samples combinations from.
search_vals = dict(max_depth=[15,25,50], max_leaf_nodes=[600,800,1400], n_estimators=[100,300,500])
# NOTE: despite its name, `dtree_search` wraps the random forest in this cell;
# the name is kept so the notebook's existing variables stay bound as before.
dtree_search = RandomizedSearchCV(rf, search_vals, cv=3, random_state=0)
search = dtree_search.fit(X_train,y_train)
search.best_params_

In [None]:
from sklearn.ensemble import RandomForestClassifier

# random_state pins the bootstrap samples and feature sub-sampling so the
# accuracies and the feature importances read from `rf` are reproducible.
rf = RandomForestClassifier(n_estimators=150, max_depth=15, class_weight='balanced',
                            random_state=0)
rf.fit(X_train,y_train)

# Score the fitted forest on both splits to expose the train/test gap.
train_predictions = rf.predict(X_train)
test_predictions = rf.predict(X_test)
print("Train Accuracy:",metrics.accuracy_score(y_train, train_predictions))
print("Test Accuracy:",metrics.accuracy_score(y_test, test_predictions))

In [None]:
from sklearn.metrics import classification_report

# Per-class precision / recall / F1 for the most recent test-set predictions.
rf_report = classification_report(y_test, test_predictions)
print(rf_report)


Model accuracy ultimately did not increase significantly with the hyperparameter search. Now let's look at the importance of the features in this dataset. These are calculated through the use of a metric called mean decrease impurity. This metric is defined as the total decrease in node impurity (weighted by the probability of reaching that node, which is approximated by the proportion of samples reaching that node) averaged over all trees of the ensemble.

In [None]:
# Impurity-based importance of every input column, as exposed by the fitted forest.
feat_names = X_train.columns.tolist()
feat_importances = rf.feature_importances_

# One bar per feature; labels are rotated so the long column names stay readable.
fig, ax = plt.subplots(figsize=(18, 11))
plt.xticks(rotation=90)
ax.bar(x=feat_names, height=feat_importances)
ax.set_title('Importance of Input Features on Length of Stay Predictor in Random Forest Model')
ax.set_ylabel('Feature Importance')
plt.show()

This provides yet another metric for feature importance, which allows for comparison with the results found in the PCA section.

In [None]:
#Adaboost classifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

# Depth-limited, class-balanced tree used as the estimator that AdaBoost boosts.
dtree = DecisionTreeClassifier(
    max_depth=15,
    class_weight="balanced",
    random_state=1,
)
boost = AdaBoostClassifier(dtree, n_estimators=75, random_state=0)
boost.fit(X_train, y_train)

# Predict on both splits, then report accuracy for each.
train_predictions = boost.predict(X_train)
test_predictions = boost.predict(X_test)
for accuracy_label, y_true, y_pred in (
    ("Train Accuracy:", y_train, train_predictions),
    ("Test Accuracy:", y_test, test_predictions),
):
    print(accuracy_label, accuracy_score(y_true, y_pred))

In [None]:
from sklearn.metrics import classification_report

# Per-class precision / recall / F1 for the most recent test-set predictions.
boost_report = classification_report(y_test, test_predictions)
print(boost_report)

# **Additional Data Insights**

In [None]:
# Print a summary of hos_df: its columns, non-null counts and dtypes.
hos_df.info()

In [None]:
# Print the distinct values of several categorical columns of hos_df.
# The headings are spelled to match the column they describe (APR, County).

# Distinct admission types
print("Type of Admission in Dataset:\n")
print(hos_df['Type of Admission'].unique())

# Distinct age groups
print("\n\nAge Group in Dataset:\n")
print(hos_df['Age Group'].unique())

# Distinct APR risk-of-mortality levels
print("\n\nAPR Risk of Mortality:\n")
print(hos_df['APR Risk of Mortality'].unique())

# Distinct hospital counties; unique() is computed once and reused for the count.
hospital_counties = hos_df['Hospital County'].unique()
print("\n\nHospital County in Dataset:\n")
print("There are {} different values\n".format(len(hospital_counties)))
print(hospital_counties)

# Distinct APR MDC descriptions; unique() is computed once and reused for the count.
mdc_descriptions = hos_df['APR MDC Description'].unique()
print("\n\nAPR MDC Description(disease diagnosis) in Dataset:\n")
print("There are {} different values\n".format(len(mdc_descriptions)))
print(mdc_descriptions)

In [None]:
# Number of records per admission type.
# dropna=False keeps a separate row for missing values, if there are any.
admission_type_counts = hos_df['Type of Admission'].value_counts(dropna=False)
admission_type_counts

In [None]:
# Number of patients in each age group.
# dropna=False keeps a separate row for missing values, if there are any.
age_group_counts = hos_df['Age Group'].value_counts(dropna=False)
age_group_counts

In [None]:
# Record counts per Abortion Edit Indicator value.
# dropna=False keeps a separate row for missing values, if there are any.
abortion_indicator_counts = hos_df['Abortion Edit Indicator'].value_counts(dropna=False)
print("Patients with or without abortion:\n")
print(abortion_indicator_counts)

In [None]:
#filtering
# Boolean mask: True for the rows whose admission type is 'Newborn'.
hos_df_newborn=hos_df['Type of Admission']=='Newborn'
# sum() counts the True entries of the mask. count() would instead return the
# number of non-null entries, i.e. every row regardless of admission type.
print("Total Newborns:",hos_df_newborn.sum())
hos_df[hos_df_newborn].head()

In [None]:
#number of records for each APR Severity of Illness Description value
#dropna=False keeps a separate row for missing values, if there are any
hos_df['APR Severity of Illness Description'].value_counts(dropna=False)

# **Tidy Data(Melting)**

    Here we use the melt() method to reshape the first five rows of our dataset into long format: 'APR MDC Description' is kept as the identifier column, while 'Age Group' and 'Type of Admission' are unpivoted into variable/value pairs.


In [None]:
# Work on the first five rows only so the long-format result stays readable.
hos_df_new = hos_df.head()
# Keep the diagnosis description as identifier; unpivot the two listed columns
# into variable/value pairs.
hos_df_melted = hos_df_new.melt(
    id_vars='APR MDC Description',
    value_vars=['Age Group', 'Type of Admission'],
)
hos_df_melted

Concatenating Data

    Concatenate the diagnosis description and the patient's age group side by side, as two columns.


In [None]:
# Take the last five entries of two columns as separate Series ...
hos_df_data1 = hos_df['APR MDC Description'].tail()
hos_df_data2 = hos_df['Age Group'].tail()

# ... and place them next to each other as the columns of one frame.
conc_hos_df_col = pd.concat(
    [hos_df_data1, hos_df_data2],
    axis="columns",
)
conc_hos_df_col

Building Data Frames From Scratch

In [None]:
#data frames from dictionary
# First five entries of each source column, as plain Python lists.
Hospital, Facility, Year, Costs = (
    list(hos_df[source_col].head())
    for source_col in ('Hospital County', 'Facility Name', 'Discharge Year', 'Total Costs')
)

# Output column labels, paired positionally with the lists above.
list_label = ["hospital_country", "facility_name", "discharge_year", "total_costs"]
list_col = [Hospital, Facility, Year, Costs]
zipped = list(zip(list_label, list_col))
hos_df_dict = dict(zipped)

# Build the new frame from the label -> values mapping.
hos_df_diff = pd.DataFrame(hos_df_dict)
hos_df_diff



# **Visual Exploratory Data Analysis**

In [None]:

# Numeric columns to visualise; note this rebinds hos_df_data1 to a four-column frame.
numeric_cols = ["Total Costs", "Total Charges", "Birth Weight", "Length of Stay"]
hos_df_data1 = hos_df[numeric_cols]
# One line per column, all drawn on a shared axis.
hos_df_data1.plot()
plt.show()

In [None]:
# Same data as the previous plot, but each column is drawn in its own subplot.
hos_df_data1.plot(subplots=True)
plt.show()

In [None]:
# Histogram of Total Costs with 50 bins limited to the interval 0-250.
# NOTE(review): values outside 0-250 are not drawn — confirm this range matches
# the scale of Total Costs in hos_df.
hos_df_data1.plot(kind="hist",y="Total Costs",bins=50,range=(0,250))
plt.show()

In [None]:
# Total Costs histogram twice: plain counts on the top axis, cumulative counts below.
fig, axes = plt.subplots(nrows=2, ncols=1)

for target_ax, is_cumulative in zip(axes, (False, True)):
    hos_df_data1.plot(
        kind="hist",
        y="Total Costs",
        bins=50,
        range=(0, 250),
        ax=target_ax,
        cumulative=is_cumulative,
    )

# Save the figure to disk before showing it.
plt.savefig("Graph.png")
plt.show()

In [None]:
print(hos_df['Discharge Year'])
# Parse the year with an explicit format. Without it, an integer such as 2015 is
# read as an offset from the Unix epoch, so every row would become a timestamp
# on 1970-01-01 instead of a date in that year.
hos_df['Discharge Year'] = pd.to_datetime(hos_df['Discharge Year'], format='%Y')
#lets make discharge_year as index
hos_df_dis=hos_df.set_index("Discharge Year")
hos_df_dis

In [None]:
# Look up the APR DRG Description of the row labelled 85 (label-based, not positional).
# NOTE(review): assumes index label 85 still exists in hos_df — confirm.
print(hos_df.loc[85,['APR DRG Description']])
#selecting only some columns
hos_df[["APR DRG Description","Age Group","Length of Stay"]].head(20)

In [None]:
# Rows labelled 1 through 10 (.loc slices include both ends), columns from
# "Race" through "Length of Stay".
print(hos_df.loc[1:10,"Race":"Length of Stay"])

# Same rows, every column from "Gender" to the last one.
hos_df.loc[1:10,"Gender":]

In [None]:
# Sum Length of Stay over the rows admitted as 'Urgent'.
urgent_mask = hos_df['Type of Admission'] == 'Urgent'
urgent_total_stay = hos_df.loc[urgent_mask, 'Length of Stay'].sum()
print("Total hospitalization times for patients admitted to the hospital as Urgent:",
      urgent_total_stay)

# Show the first value of each column for every Race group.
first_by_race = hos_df.groupby("Race").first()
first_by_race

In [None]:
# Sum Length of Stay over the rows admitted as 'Emergency'.
emergency_mask = hos_df['Type of Admission'] == 'Emergency'
emergency_total_stay = hos_df.loc[emergency_mask, 'Length of Stay'].sum()
print("Total hospitalization times for patients admitted to the hospital as Emergency:",
      emergency_total_stay)

# Show the first value of each column for every Race group.
first_by_race = hos_df.groupby("Race").first()
first_by_race