<a href="https://colab.research.google.com/github/rhailper/milestoneII/blob/main/SIADS696_DataExplorationAndCleaning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Questions to discuss as a team  
Should we all use the same input dataset so that the models we use are comparable?


In [96]:
%cd /content/drive/MyDrive/Colab Notebooks

/content/drive/MyDrive/Colab Notebooks


In [98]:
from config import *

In [77]:
#!git clone https://{token}@github.com/rhailper/milestoneII.git

In [None]:
import pandas as pd
import numpy as numpy
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import make_column_transformer

### Import and do basic cleaning to files

#### Client information

In [None]:
# import client info - this file has basic demographic information about the client
demo = pd.read_csv('/content/milestoneII/data/CLIENT_INFORMATION.csv') 

In [None]:
# drop row with na values
# If a client has na values in this value that means they died or stopped 
# receiving services from the organization
#client_info = client_info.dropna()

Due to the deidentification process, this dataset was not allowed to disclose the exact age of any client over the age of 90. Any client over the age of 90 was coded as 90+ so this needs to be changed in order to make the feature numerical.

In [None]:
# replace '90+' with 90
demo['Age'] = demo['Age'].str.replace('90+','90',regex=False)#.astype(int)

In [None]:
# convert categorical variables into one hot encoded dummy variables
df = pd.get_dummies(demo, columns=['Gender','Federal Poverty','Race','Primary Funding Source','Multiple Funding Sources?'])

In [None]:
df

Unnamed: 0,ID,Age,Federal % of Poverty,ADL Count,Critical Need Count,IADL Count,Skilled Need Count,Nutrition Score,Gender_Female,Gender_Male,...,Race_Asian,Race_Black or African American,Race_Multiracial,Race_Native Hawaiian or Other Pacific Islander,Race_Unknown/Missing,Race_White,Primary Funding Source_Non-waiver,Primary Funding Source_Waiver,Multiple Funding Sources?_No,Multiple Funding Sources?_Yes
0,10,38,87.0,4.0,0.0,5.0,0.0,0.0,0,1,...,0,0,0,0,1,0,0,1,1,0
1,100035,52,73.0,6.0,1.0,8.0,0.0,3.0,1,0,...,0,1,0,0,0,0,0,1,1,0
2,100048,90,124.0,,,,,,0,1,...,0,0,0,0,0,1,0,1,1,0
3,100061,53,51.0,6.0,1.0,6.0,0.0,3.0,1,0,...,0,1,0,0,0,0,0,1,1,0
4,100073,69,65.0,0.0,2.0,1.0,0.0,8.0,0,1,...,0,1,0,0,0,0,1,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13598,99910,67,145.0,5.0,1.0,7.0,0.0,3.0,1,0,...,0,0,0,0,0,1,0,1,1,0
13599,99919,90,96.0,7.0,1.0,8.0,0.0,6.0,1,0,...,0,1,0,0,0,0,0,1,1,0
13600,99963,55,76.0,5.0,2.0,6.0,0.0,9.0,1,0,...,0,0,0,0,0,1,0,1,1,0
13601,99967,60,72.0,5.0,1.0,5.0,0.0,8.0,1,0,...,0,1,0,0,0,0,0,1,1,0


#### Client services


In [None]:
# import client services - this files contains service utilization
serv = pd.read_csv('/content/milestoneII/data/CLIENT_SERVICES.csv') 

In [None]:
client_info

Unnamed: 0,ID,Month,Year,Services,Cost of Serivces
0,10,April,2022,"['Structured Family Care - Level 2', 'Case Man...",4531.06
1,10,August,2021,"['Structured Family Care - Level 2', 'Case Man...",4673.14
2,10,December,2021,"['Structured Family Care - Level 2', 'Case Man...",4673.14
3,10,February,2022,"['Structured Family Care - Level 2', 'Case Man...",4246.90
4,10,January,2022,"['Structured Family Care - Level 2', 'Case Man...",4673.14
...,...,...,...,...,...
127089,99976,March,2022,"['Attendant Care', 'Case Management - flat rat...",2782.90
127090,99976,May,2022,"['Attendant Care', 'Case Management - flat rat...",4226.26
127091,99976,November,2021,"['Attendant Care', 'Case Management - flat rat...",2643.22
127092,99976,October,2021,"['Attendant Care', 'Case Management - flat rat...",2782.90


#### Diagnoses

In [None]:
# import diagnoses - this gile contains client diagnoses based on ICD-10 codes
diag = pd.read_csv('//content/milestoneII/data/DIAGNOSES.csv') 

#### Questionaire

In [None]:
# import questionaire - this file contains information about clients ability to complete daily activities 
quest = pd.read_csv('/content/milestoneII/data/QUESTIONAIRE.csv') 

In [None]:
# get the most recent annual questionaire for each client 

In [None]:
# get the difference in scores between the most recent annual and the previous annual

In [80]:
#quest['InterRAI Period'].unique()

#### Hospitalzations (will be outcome variable for supervised learning)

In [62]:
def convert_hosp(hosp,year,emerg_only=True):
  '''Helper function to conver the hospitalzation dataset 
     into useable features or outcomes'''
  if emerg_only == True:
    df = hosp[(hosp['Year']==year)&(hosp['Admittype']=='Emergency')]
    total_hosp_emerg_only = df.groupby('ID')['Number Hospitalzations'].sum()
    avg_hosp_emerg_only = df.groupby('ID')['Number Hospitalzations'].mean()
    return  total_hosp_emerg_only, avg_hosp_emerg_only
  else:
    df = hosp[(hosp['Year']==year)]
    total_hosp = df.groupby('ID')['Number Hospitalzations'].sum()
    avg_hosp = df.groupby('ID')['Number Hospitalzations'].mean()
    return  total_hosp, avg_hosp

In [23]:
# import hospitalzations - this file contains information about client hospitalzations in the past 2 years
hosp = pd.read_csv('data/HOSPITALIZATIONS.csv') 

In [None]:
hosp_features_and_outcomes = []
for k,v in {'emerg_total_2022':[2022,True],'health_visits_total_2022':[2022,True],'emerg_avg_2022':,
                 'health_visits_avg_2022':,'emerg_total_2023':,'emerg_avg_2023':,
                 'health_visits_total_2023':,'health_visits_avg_2023':}:
  convert_hosp(hosp,year,emerg_only=True)  

In [70]:
# get the total number of emergecy room visits in 2022 per client (feature)
emerg_total_2022.name = 'emerg_total_2022'

In [66]:
# get the total number of emergecy room visits in 2022 per client (feature)
emerg_total_2022 = hosp[(hosp['Year']==2022)&(hosp['Admittype']=='Emergency')].groupby('ID')['Number Hospitalzations'].sum()

# get the total number of health care visits in 2022 per client (feature)
health_visits_total_2022 = hosp[(hosp['Year']==2022)].groupby('ID')['Number Hospitalzations'].sum()

# get the average emergency room visits per month in 2022 per client (feature)
emerg_avg_2022 = hosp[(hosp['Year']==2022)&(hosp['Admittype']=='Emergency')].groupby('ID')['Number Hospitalzations'].mean()

# get the average health care visits per month in 2022 per client (feature)
health_visits_avg_2022 = hosp[(hosp['Year']==2022)].groupby('ID')['Number Hospitalzations'].mean()

# get total number of emergecy room visits per client in Jan,Feb,Mar 2023 (outcome)
emerg_total_2023 = hosp[(hosp['Year']==2023)&(hosp['Admittype']=='Emergency')].groupby('ID')['Number Hospitalzations'].sum()

# get average number of emergecy room visits per client in Jan,Feb,Mar 2023 (outcome)
emerg_avg_2023 = hosp[(hosp['Year']==2023)&(hosp['Admittype']=='Emergency')].groupby('ID')['Number Hospitalzations'].mean()

# get total number of health care visits per client in Jan,Feb,Mar 2023 (outcome)
health_visits_total_2023 = hosp[(hosp['Year']==2023)].groupby('ID')['Number Hospitalzations'].sum()

# get average number of health care visits per client in Jan,Feb,Mar 2023 (outcome)
health_visits_avg_2023 = hosp[(hosp['Year']==2023)].groupby('ID')['Number Hospitalzations'].mean()

In [72]:
pd.concat([emerg_total_2022,health_visits_total_2022,emerg_avg_2022,
           emerg_total_2023, emerg_avg_2023,
           health_visits_total_2023,health_visits_avg_2023],axis=1)

Unnamed: 0_level_0,emerg_total_2022,Number Hospitalzations,Number Hospitalzations,Number Hospitalzations,Number Hospitalzations,Number Hospitalzations,Number Hospitalzations
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
62,1.0,19.0,1.0,,,6.0,3.000000
112,3.0,12.0,1.0,,,5.0,1.666667
167,2.0,10.0,1.0,,,4.0,1.333333
210,1.0,3.0,1.0,,,,
234,4.0,9.0,1.0,,,3.0,1.500000
...,...,...,...,...,...,...,...
225490,,,,,,1.0,1.000000
225730,,,,,,1.0,1.000000
225990,,,,,,1.0,1.000000
226131,,,,,,1.0,1.000000


In [101]:
%cd /content/drive/MyDrive/Colab Notebooks/milestoneII

/content/drive/MyDrive/Colab Notebooks/milestoneII


In [82]:
#!git config --global user.email "rhailper@umich.edu"
#!git config --global user.name "rhailper"
#!git pull

Already up to date.


In [104]:
#!git remote add origin https://{token}@github.com/rhailper/milestoneII.git

In [102]:
!git add .
!git commit -m 'Updates to data cleaning'
!git push https://{token}@github.com/rhailper/milestoneII.git

On branch main
Your branch is ahead of 'origin/main' by 1 commit.
  (use "git push" to publish your local commits)

nothing to commit, working tree clean
Enumerating objects: 3, done.
Counting objects: 100% (3/3), done.
Delta compression using up to 2 threads
Compressing objects: 100% (2/2), done.
Writing objects: 100% (2/2), 225 bytes | 75.00 KiB/s, done.
Total 2 (delta 1), reused 0 (delta 0)
remote: Resolving deltas: 100% (1/1), completed with 1 local object.[K
To https://github.com/rhailper/milestoneII.git
   7cc4161..0b4821f  main -> main
