In [59]:
# load library
from datetime import datetime
import math
import numpy as np
import pandas as pd

In [60]:
# Load data.
# Episodes.csv supplies the PID <-> EpisodeID pairing.
csv_file = "Data/Episodes.csv"
csv_df = pd.DataFrame(pd.read_csv(csv_file, low_memory=False))
Episodes_df1 = csv_df.loc[:, ["PID", "EpisodeID"]]

# Episodes_new.csv is the original Excel file "Data/Episodes" re-saved as CSV
# for better formatting; it supplies the contract type and the stay dates.
csv_file = "Data/Episodes_new.csv"
csv_data = pd.read_csv(csv_file, low_memory=False)
csv_df = pd.DataFrame(csv_data)
Episodes_df2 = csv_df.loc[:, ["EpisodeID", "ContractType", "AdmitDate", "DischargeDate"]]

# Left join on EpisodeID: every row of Episodes_df1 is kept, and episodes that
# have no match in Episodes_df2 get NaN in the three joined columns.
Episodes_df = Episodes_df1.merge(Episodes_df2, how="left", on="EpisodeID")

In [61]:
# Build Episodes_df['month_range']: for each episode either the tuple
# (admit_month, discharge_month + 1) -- a half-open range that later cells
# expand with range() -- or the marker '#' when a date is missing.


def _admit_month(date):
    """Return the leading 'M' of an 'M/...' date string as an int.

    Returns None when the value is missing (NaN). The left merge that builds
    Episodes_df can leave AdmitDate empty, and calling .split() on that NaN
    float would raise AttributeError.
    """
    if str(date) == "nan":
        return None
    return int(str(date).split('/')[0])


def _discharge_month(date):
    """Return the discharge month as an int, or None when the date is missing.

    Dates whose last character is not '9' are mapped to 12.
    NOTE(review): presumably dates look like 'M/D/2019' and this clamps stays
    that end after 2019 to December -- confirm against the raw data; the rule
    would misfire for any other year ending in 9 or a different date format.
    """
    if str(date) == "nan":
        return None
    if str(date[-1]) != "9":
        return 12
    return int(str(date).split('/')[0])


# get month information from AdmitDate
Admitmonth_list = [_admit_month(date) for date in Episodes_df["AdmitDate"]]

# get month information from DischargeDate
Dischargemonth_list = [_discharge_month(date) for date in Episodes_df["DischargeDate"]]

# Episodes with a missing admit or discharge month get '#', which the
# aggregation cell skips; all others get (first_month, last_month + 1).
new_column = []
for admit_month, discharge_month in zip(Admitmonth_list, Dischargemonth_list):
    if admit_month is None or discharge_month is None:
        new_column.append('#')
    else:
        new_column.append((admit_month, discharge_month + 1))
Episodes_df['month_range'] = new_column

In [62]:
# Preview: number of episodes per (PID, ContractType) pair, first five groups only.
(
    Episodes_df
    .groupby(['PID', 'ContractType'])[['EpisodeID']]
    .count()
    .head()
)

Unnamed: 0_level_0,Unnamed: 1_level_0,EpisodeID
PID,ContractType,Unnamed: 2_level_1
6,CT1,1
6,CT2,65
8,CT1,25
8,CT2,82
9,CT1,61


In [63]:
# Collect, per PID and contract type, the list of month numbers covered by
# that PID's episodes. A month appears once per episode that spans it, so
# counting occurrences of a month later gives the episode count for that month.
_dict_CT1 = dict()
_dict_CT2 = dict()
_months_by_contract = {'CT1': _dict_CT1, 'CT2': _dict_CT2}

# Iterate the three columns positionally instead of Series[i] label lookups:
# this does not depend on the index being 0..n-1 and avoids repeated
# per-element Series indexing.
for pid, contract_type, month_range in zip(
    Episodes_df["PID"], Episodes_df["ContractType"], Episodes_df['month_range']
):
    # '#' marks episodes whose month range could not be derived.
    if month_range == '#':
        continue
    target = _months_by_contract.get(contract_type)
    # Any contract type other than CT1/CT2 is ignored.
    if target is None:
        continue
    first_month, stop_month = month_range
    # setdefault creates the PID entry on first sight (even for an empty range).
    target.setdefault(pid, []).extend(range(first_month, stop_month))

In [64]:
# Sort by PID, ascending. After this cell both names hold a list of
# (PID, months) tuples rather than a dict -- later cells index them by position.
# dict(...) accepts either the original dict or an already-sorted list of
# pairs, so re-running this cell no longer fails on list.items().
_dict_CT1 = sorted(dict(_dict_CT1).items(), key=lambda pair: pair[0])
_dict_CT2 = sorted(dict(_dict_CT2).items(), key=lambda pair: pair[0])

In [65]:
# Placeholder 2-D arrays (one row per PID, 13 columns: PID + 12 monthly counts)
# that the next cells turn into DataFrames and fill in.
# Zero-initialised integers replace the former random floats, so a row that is
# never overwritten holds zeros rather than random values in PID and the counts.
# NOTE(review): 722 and 1191 are presumably len(_dict_CT1) and len(_dict_CT2)
# for the current data -- confirm whenever the input files change.
data_CT1 = np.zeros((722, 13), dtype=np.int64)
data_CT2 = np.zeros((1191, 13), dtype=np.int64)

In [66]:
# Create the per-contract-type tables from the placeholder arrays.
# Columns: 'PID' followed by 'M1_<CT>' ... 'M12_<CT>'.
_columns_CT1 = ['PID'] + ['M{}_CT1'.format(month) for month in range(1, 13)]
_columns_CT2 = ['PID'] + ['M{}_CT2'.format(month) for month in range(1, 13)]

# Cast with astype() instead of passing dtype= to the constructor: newer pandas
# versions no longer silently truncate non-integer floats given with an
# integer dtype. The default 0..n-1 index follows the array length, so no row
# count has to be repeated here.
df_CT1 = pd.DataFrame(np.asarray(data_CT1).astype(np.int64), columns=_columns_CT1)
df_CT2 = pd.DataFrame(np.asarray(data_CT2).astype(np.int64), columns=_columns_CT2)

In [67]:
# Fill df_CT1: row i gets the i-th (PID, months) pair of the sorted CT1 list,
# and column 'M<k>_CT1' gets how many times month k occurs in that PID's month
# list (one occurrence per CT1 episode spanning the month).
# .at writes directly into the frame; the previous chained form
# df_CT1[col][i] = value assigns through a temporary Series, which triggers
# SettingWithCopy warnings and does not modify the frame under Copy-on-Write.
for i, (pid, months) in enumerate(_dict_CT1):
    df_CT1.at[i, 'PID'] = int(pid)
    for month in range(1, 13):
        df_CT1.at[i, 'M{}_CT1'.format(month)] = int(months.count(month))
df_CT1.head()

Unnamed: 0,PID,M1_CT1,M2_CT1,M3_CT1,M4_CT1,M5_CT1,M6_CT1,M7_CT1,M8_CT1,M9_CT1,M10_CT1,M11_CT1,M12_CT1
0,6,0,0,0,1,0,0,0,0,0,0,0,0
1,8,1,3,5,6,8,4,5,2,1,4,4,6
2,9,7,6,9,6,11,7,9,5,9,6,9,6
3,11,0,1,1,1,1,1,1,0,0,0,0,0
4,12,2,0,4,3,1,1,1,1,1,1,2,1


In [68]:
# Fill df_CT2: row i gets the i-th (PID, months) pair of the sorted CT2 list,
# and column 'M<k>_CT2' gets how many times month k occurs in that PID's month
# list (one occurrence per CT2 episode spanning the month).
# .at writes directly into the frame; the previous chained form
# df_CT2[col][i] = value assigns through a temporary Series, which triggers
# SettingWithCopy warnings and does not modify the frame under Copy-on-Write.
for i, (pid, months) in enumerate(_dict_CT2):
    df_CT2.at[i, 'PID'] = int(pid)
    for month in range(1, 13):
        df_CT2.at[i, 'M{}_CT2'.format(month)] = int(months.count(month))
df_CT2.head()

Unnamed: 0,PID,M1_CT2,M2_CT2,M3_CT2,M4_CT2,M5_CT2,M6_CT2,M7_CT2,M8_CT2,M9_CT2,M10_CT2,M11_CT2,M12_CT2
0,6,11,5,7,5,4,3,3,5,6,5,3,11
1,8,19,9,9,6,8,13,10,6,5,2,0,1
2,9,3,4,6,6,0,4,3,2,1,0,1,0
3,10,2,0,1,0,1,0,0,0,0,0,0,0
4,11,4,5,4,5,2,3,0,1,0,0,0,0


In [69]:
# Read the curated dataset; it is loaded here to obtain its PID column.
final_dataset_path = 'Data/curr_complete_dataset_delete_columns.csv'
dateset_final = pd.read_csv(final_dataset_path)

In [70]:
# Keep only the PID column (a Series) to drive the merges below.
dateset_final_PID = dateset_final.loc[:, 'PID']

In [71]:
# Attach the monthly CT1 and then CT2 counts to every PID of the final dataset.
# Both joins are left joins on PID, so a PID missing from either table keeps
# its row and gets NaN in that table's columns.
_merge_options = dict(how='left', on='PID')
result = pd.merge(dateset_final_PID, df_CT1, **_merge_options)
result2 = result.merge(df_CT2, **_merge_options)

In [72]:
# Lift pandas' display limits on both columns and rows so the wide merged
# table can be inspected without truncation.
for display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(display_option, None)

In [73]:
# Show the first five rows of the merged table.
result2.head(n=5)

Unnamed: 0,PID,M1_CT1,M2_CT1,M3_CT1,M4_CT1,M5_CT1,M6_CT1,M7_CT1,M8_CT1,M9_CT1,M10_CT1,M11_CT1,M12_CT1,M1_CT2,M2_CT2,M3_CT2,M4_CT2,M5_CT2,M6_CT2,M7_CT2,M8_CT2,M9_CT2,M10_CT2,M11_CT2,M12_CT2
0,6,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,11.0,5.0,7.0,5.0,4.0,3.0,3.0,5.0,6.0,5.0,3.0,11.0
1,8,1.0,3.0,5.0,6.0,8.0,4.0,5.0,2.0,1.0,4.0,4.0,6.0,19.0,9.0,9.0,6.0,8.0,13.0,10.0,6.0,5.0,2.0,0.0,1.0
2,9,7.0,6.0,9.0,6.0,11.0,7.0,9.0,5.0,9.0,6.0,9.0,6.0,3.0,4.0,6.0,6.0,0.0,4.0,3.0,2.0,1.0,0.0,1.0,0.0
3,10,,,,,,,,,,,,,2.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,11,0.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,4.0,5.0,4.0,5.0,2.0,3.0,0.0,1.0,0.0,0.0,0.0,0.0


In [74]:
# Replace every NaN with 0, then preview the first rows of the filled table.
result2 = result2.fillna(value=0)
result2.head()

Unnamed: 0,PID,M1_CT1,M2_CT1,M3_CT1,M4_CT1,M5_CT1,M6_CT1,M7_CT1,M8_CT1,M9_CT1,M10_CT1,M11_CT1,M12_CT1,M1_CT2,M2_CT2,M3_CT2,M4_CT2,M5_CT2,M6_CT2,M7_CT2,M8_CT2,M9_CT2,M10_CT2,M11_CT2,M12_CT2
0,6,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,11.0,5.0,7.0,5.0,4.0,3.0,3.0,5.0,6.0,5.0,3.0,11.0
1,8,1.0,3.0,5.0,6.0,8.0,4.0,5.0,2.0,1.0,4.0,4.0,6.0,19.0,9.0,9.0,6.0,8.0,13.0,10.0,6.0,5.0,2.0,0.0,1.0
2,9,7.0,6.0,9.0,6.0,11.0,7.0,9.0,5.0,9.0,6.0,9.0,6.0,3.0,4.0,6.0,6.0,0.0,4.0,3.0,2.0,1.0,0.0,1.0,0.0
3,10,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,11,0.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,4.0,5.0,4.0,5.0,2.0,3.0,0.0,1.0,0.0,0.0,0.0,0.0


In [75]:
# Write the final table to CSV without the row index.
output_path = "Data/curr_CT1_CT2_eachmonth.csv"
result2.to_csv(output_path, index=False)