<a href="https://colab.research.google.com/github/rhailper/milestoneII/blob/main/notebooks/SIADS696_DataExplorationAndCleaning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Connect to drive and pull from github

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
%cd /content/drive/MyDrive/Colab Notebooks

/content/drive/MyDrive/Colab Notebooks


In [4]:
from config import *

In [6]:
#!git clone https://{token}@github.com/rhailper/milestoneII.git

In [23]:
%cd /content/drive/MyDrive/Colab Notebooks/milestoneII

/content/drive/MyDrive/Colab Notebooks/milestoneII


In [24]:
!git pull

From https://github.com/rhailper/milestoneII
   150aecb..59eaef5  main       -> origin/main
Already up to date.


# Import and do basic cleaning to files

In [88]:
import pandas as pd
import numpy as numpy
import itertools
from ast import literal_eval
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import MultiLabelBinarizer

## Client information

In [30]:
# import client info - this file has basic demographic information about the client
demo = pd.read_csv('data/input_data/CLIENT_INFORMATION.csv') 

In [31]:
# drop row with na values 
# If a client has na values in this value that means they died or stopped 
# receiving services from the organization 
# (it might also be interesting to look at these client)
#client_info = client_info.dropna()

Due to the deidentification process, this dataset was not allowed to disclose the exact age of any client over the age of 90. Any client over the age of 90 was coded as 90+ so this needs to be changed in order to make the feature numerical.

In [32]:
# replace '90+' with 90
demo['Age'] = demo['Age'].str.replace('90+','90',regex=False)#.astype(int)

In [33]:
# convert categorical variables into one hot encoded dummy variables
df = pd.get_dummies(demo, columns=['Gender','Federal Poverty','Race','Primary Funding Source','Multiple Funding Sources?'])

#### Client services


In [75]:
# import client services - this files contains service utilization
serv = pd.read_csv('data/input_data/CLIENT_SERVICES.csv') 

In [76]:
# get only 2022
serv = serv[serv['Year']==2022]

In [77]:
# get the total service cost per client 2022
service_costs_total = serv.groupby('ID')['Cost of Serivces'].sum()

In [78]:
# get the avg service cost per month per client 2022
service_costs_avg = serv.groupby('ID')['Cost of Serivces'].mean()

In addition to the costs of services for each client, from this table we can get the number and type of services each client recieves. The types of services will need to be one hot encoded. 

In [83]:
# get list of all unique services a client had in 2022
# convert service column from string to list
serv['Services'] = serv['Services'].apply(literal_eval)
# get a list of unique services by client
num_unique_services = serv.groupby('ID').apply(lambda x : len(set(x['Services'].sum())))

In [94]:
# get the list of services as one hot encodings
unique_services = serv.groupby('ID').apply(lambda x : set(x['Services'].sum()))
mlb = MultiLabelBinarizer()

unique_services_one_hot = pd.DataFrame(mlb.fit_transform(unique_services),
                   columns=mlb.classes_,
                   index=unique_services.index)

#### Diagnoses

In [None]:
# import diagnoses - this gile contains client diagnoses based on ICD-10 codes
diag = pd.read_csv('//content/milestoneII/data/DIAGNOSES.csv') 

#### Questionaire

In [None]:
# import questionaire - this file contains information about clients ability to complete daily activities 
quest = pd.read_csv('/content/milestoneII/data/QUESTIONAIRE.csv') 

In [None]:
# get the most recent annual questionaire for each client 

In [None]:
# get the difference in scores between the most recent annual and the previous annual

In [None]:
#quest['InterRAI Period'].unique()

#### Hospitalzations (will be outcome variable for supervised learning)

In [None]:
def convert_hosp(hosp,year,emerg_only=True):
  '''Helper function to conver the hospitalzation dataset 
     into useable features or outcomes'''
  if emerg_only == True:
    df = hosp[(hosp['Year']==year)&(hosp['Admittype']=='Emergency')]
    total_hosp_emerg_only = df.groupby('ID')['Number Hospitalzations'].sum()
    avg_hosp_emerg_only = df.groupby('ID')['Number Hospitalzations'].mean()
    return  total_hosp_emerg_only, avg_hosp_emerg_only
  else:
    df = hosp[(hosp['Year']==year)]
    total_hosp = df.groupby('ID')['Number Hospitalzations'].sum()
    avg_hosp = df.groupby('ID')['Number Hospitalzations'].mean()
    return  total_hosp, avg_hosp

In [None]:
# import hospitalzations - this file contains information about client hospitalzations in the past 2 years
hosp = pd.read_csv('data/HOSPITALIZATIONS.csv') 

In [None]:
hosp_features_and_outcomes = []
for k,v in {'emerg_total_2022':[2022,True],'health_visits_total_2022':[2022,True],'emerg_avg_2022':,
                 'health_visits_avg_2022':,'emerg_total_2023':,'emerg_avg_2023':,
                 'health_visits_total_2023':,'health_visits_avg_2023':}:
  convert_hosp(hosp,year,emerg_only=True)  

In [None]:
# get the total number of emergecy room visits in 2022 per client (feature)
emerg_total_2022.name = 'emerg_total_2022'

In [None]:
# get the total number of emergecy room visits in 2022 per client (feature)
emerg_total_2022 = hosp[(hosp['Year']==2022)&(hosp['Admittype']=='Emergency')].groupby('ID')['Number Hospitalzations'].sum()

# get the total number of health care visits in 2022 per client (feature)
health_visits_total_2022 = hosp[(hosp['Year']==2022)].groupby('ID')['Number Hospitalzations'].sum()

# get the average emergency room visits per month in 2022 per client (feature)
emerg_avg_2022 = hosp[(hosp['Year']==2022)&(hosp['Admittype']=='Emergency')].groupby('ID')['Number Hospitalzations'].mean()

# get the average health care visits per month in 2022 per client (feature)
health_visits_avg_2022 = hosp[(hosp['Year']==2022)].groupby('ID')['Number Hospitalzations'].mean()

# get total number of emergecy room visits per client in Jan,Feb,Mar 2023 (outcome)
emerg_total_2023 = hosp[(hosp['Year']==2023)&(hosp['Admittype']=='Emergency')].groupby('ID')['Number Hospitalzations'].sum()

# get average number of emergecy room visits per client in Jan,Feb,Mar 2023 (outcome)
emerg_avg_2023 = hosp[(hosp['Year']==2023)&(hosp['Admittype']=='Emergency')].groupby('ID')['Number Hospitalzations'].mean()

# get total number of health care visits per client in Jan,Feb,Mar 2023 (outcome)
health_visits_total_2023 = hosp[(hosp['Year']==2023)].groupby('ID')['Number Hospitalzations'].sum()

# get average number of health care visits per client in Jan,Feb,Mar 2023 (outcome)
health_visits_avg_2023 = hosp[(hosp['Year']==2023)].groupby('ID')['Number Hospitalzations'].mean()

In [None]:
pd.concat([emerg_total_2022,health_visits_total_2022,emerg_avg_2022,
           emerg_total_2023, emerg_avg_2023,
           health_visits_total_2023,health_visits_avg_2023],axis=1)

Unnamed: 0_level_0,emerg_total_2022,Number Hospitalzations,Number Hospitalzations,Number Hospitalzations,Number Hospitalzations,Number Hospitalzations,Number Hospitalzations
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
62,1.0,19.0,1.0,,,6.0,3.000000
112,3.0,12.0,1.0,,,5.0,1.666667
167,2.0,10.0,1.0,,,4.0,1.333333
210,1.0,3.0,1.0,,,,
234,4.0,9.0,1.0,,,3.0,1.500000
...,...,...,...,...,...,...,...
225490,,,,,,1.0,1.000000
225730,,,,,,1.0,1.000000
225990,,,,,,1.0,1.000000
226131,,,,,,1.0,1.000000


In [27]:
#%cd /content/drive/MyDrive/Colab Notebooks/milestoneII

In [21]:
#!git config --global user.email "rhailper@umich.edu"
#!git config --global user.name "rhailper"
#!git pull

In [None]:
#!git remote add origin https://{token}@github.com/rhailper/milestoneII.git

In [20]:
#!git pull

In [19]:
#!git checkout main

In [95]:
#!git add .
#!git commit -m 'Updates to folder structure'
#!git push https://{token}@github.com/rhailper/milestoneII.git

On branch main
Your branch is up to date with 'origin/main'.

nothing to commit, working tree clean
Everything up-to-date
