In [97]:
%matplotlib inline

import pandas as pd
import numpy as np
import datetime
from reference import get_file_names

In [98]:
df_ref = pd.read_csv('./vidas_data/cross_ref_cols_tabs.csv')
# df_ref.head()

In [99]:
# get the data frame giving the list of IDEHRs who have passed
get_file_names(['IDEHR', 'IDDecesso'], df_ref)

['wH_DirectTable_Decesso', 'wH_DirectTable_C_Decesso']

In [100]:
# get the data frame giving the list of IDEHRs with the start date of hospitalization
get_file_names(['IDEHR', 'DateTherapy'], df_ref)

['wH_Therapy_JCM_Therapy', 'wH_Therapy_JCM_TherapyWhenNeeded']

In [101]:
# get the data frame giving the list of IDEHRs with the diagnosis
get_file_names(['IDEHR', 'Diagnosi'], df_ref)

['wH_DirectTable_Informazioni_SS', 'wH_DirectTable_C_Informazioni_SS']

In [102]:
# data frame with deathID
df_death = pd.read_csv('./vidas_data/wH_DirectTable_Decesso.csv', parse_dates=['Data'])
df_death = df_death[['IDEHR', 'Data', 'PatientId', 'EHRType', 'IDDecesso']]
print(df_death.shape)
df_death.head()

(6526, 5)


Unnamed: 0,IDEHR,Data,PatientId,EHRType,IDDecesso
0,4,2015-01-09 16:15:00,59,EHR,1
1,1022,2015-01-11 00:15:00,90,EHR,2
2,16,2015-01-15 01:45:00,71,EHR,3
3,1025,2015-01-15 06:44:00,91,EHR,4
4,6,2015-01-20 14:50:00,61,EHR,5


In [103]:
# check NaNs
df_death.isna().sum()

IDEHR         0
Data         87
PatientId     0
EHRType       0
IDDecesso     0
dtype: int64

In [104]:
# data frame with start of hospitalization
df_soh = pd.read_csv('./vidas_data/wH_Therapy_JCM_Therapy.csv', parse_dates=['DateTherapy'])
print(df_soh.shape)
df_soh = df_soh[['IDEHR', 'DateTherapy']].drop_duplicates(subset=['IDEHR'], keep='first')
print(df_soh.shape)
df_soh.head()

(292176, 39)
(2341, 2)


Unnamed: 0,IDEHR,DateTherapy
0,2,2015-01-09
9,1020,2015-01-10
34,3,2015-01-10
41,5,2015-01-10
50,6,2015-01-10


In [105]:
# check NaNs
df_soh.isna().sum()

IDEHR          0
DateTherapy    0
dtype: int64

In [106]:
# data frame with diagnosis
df_diag = pd.read_csv('./vidas_data/wH_DirectTable_Informazioni_SS.csv')
df_diag = df_diag[['IDEHR', 'Diagnosi', 'PatientId', 'EHRType']]
print(df_diag.shape)
df_diag.head()

(10217, 4)


Unnamed: 0,IDEHR,Diagnosi,PatientId,EHRType
0,1239,"Trachea, bronchi e polmoni#909=1",91,AMB
1,1241,Stomaco#912=1,93,AMB
2,1242,Colon#877=1,92,AMB
3,1244,"Trachea, bronchi e polmoni#909=1",94,AMB
4,1244,"Trachea, bronchi e polmoni#909=1",94,AMB


In [107]:
# check NaNs
df_diag.isna().sum()

IDEHR         0
Diagnosi     57
PatientId     0
EHRType       0
dtype: int64

In [108]:
df_diag = df_diag.dropna()
print(df_diag.shape)
df_diag.isna().sum()

(10160, 4)


IDEHR        0
Diagnosi     0
PatientId    0
EHRType      0
dtype: int64

In [109]:
df_diag.Diagnosi.value_counts()

Trachea, bronchi e polmoni#909=1                                         1762
Pancreas#906=1                                                            853
Mammella#901=1                                                            710
Colon#877=1                                                               659
Fegato, dotti biliari intraepatici#891=1                                  556
Stomaco#912=1                                                             453
Encefalo#889=1                                                            404
Diagnosi non oncologica#881=1                                             403
Prostata#910=1                                                            346
Vescica#925=1                                                             300
Consulenza/Supporto#935=3                                                 277
Assistenza al Lutto#933=1                                                 262
Rene e altri e non specificati organi urinari#911=1             

In [110]:
# do the merge
from functools import reduce
dfs = [df_death, df_soh, df_diag]
df_referral = reduce(lambda left, right: pd.merge(left, right, on='IDEHR'), dfs)

In [111]:
# check why the same IDEHR has different PatientID + EHRType
df_referral

Unnamed: 0,IDEHR,Data,PatientId_x,EHRType_x,IDDecesso,DateTherapy,Diagnosi,PatientId_y,EHRType_y
0,16,2015-01-15 01:45:00,71,EHR,3,2015-01-11,Consulenza/Supporto#935=3,6,AMB
1,1047,2015-02-03 09:00:00,116,EHR,22,2015-01-28,Mammella#901=1,116,EHR
2,7,2015-03-12 14:45:00,62,EHR,59,2015-01-11,Consulenza/Supporto#935=3,3,AMB
3,10,2015-03-18 22:45:00,65,EHR,64,2015-01-10,Prostata#910=1,4,AMB
4,1223,2015-06-07 18:00:00,30546,EHR,147,2015-06-04,"Trachea, bronchi e polmoni#909=1",83,AMB
...,...,...,...,...,...,...,...,...,...
756,5247,2020-07-04 00:35:00,59630,EHR,7433,2020-06-26,Rene e altri e non specificati organi urinari#...,41563,AMB
757,5257,2020-07-08 13:00:00,59803,EHR,7452,2020-07-07,Lingua#917=5,41566,AMB
758,5245,2020-07-16 00:40:00,59725,EHR,7492,2020-06-27,Colon#877=1,41562,AMB
759,5267,2020-07-20 01:55:00,59250,EHR,7510,2020-07-13,Mammella#901=1,41569,AMB


In [112]:
# get the survival time (in days) for each patient
df_referral['survival_time_days'] = pd.to_timedelta(df_referral['Data'] - df_referral['DateTherapy']).dt.days
df_referral.head()

Unnamed: 0,IDEHR,Data,PatientId_x,EHRType_x,IDDecesso,DateTherapy,Diagnosi,PatientId_y,EHRType_y,survival_time_days
0,16,2015-01-15 01:45:00,71,EHR,3,2015-01-11,Consulenza/Supporto#935=3,6,AMB,4.0
1,1047,2015-02-03 09:00:00,116,EHR,22,2015-01-28,Mammella#901=1,116,EHR,6.0
2,7,2015-03-12 14:45:00,62,EHR,59,2015-01-11,Consulenza/Supporto#935=3,3,AMB,60.0
3,10,2015-03-18 22:45:00,65,EHR,64,2015-01-10,Prostata#910=1,4,AMB,67.0
4,1223,2015-06-07 18:00:00,30546,EHR,147,2015-06-04,"Trachea, bronchi e polmoni#909=1",83,AMB,3.0


In [113]:
print(df_referral.shape)
df_referral.survival_time_days.value_counts()

(761, 10)


1.0      102
2.0       71
4.0       60
3.0       46
5.0       45
        ... 
125.0      1
102.0      1
65.0       1
53.0       1
37.0       1
Name: survival_time_days, Length: 69, dtype: int64

In [115]:
df_referral.isna().sum()
df_referral = df_referral.dropna()

In [117]:
# translate the diagnosis to English
# from googletrans import Translator
# df_referral['Diagnosis'] = df_referral.Diagnosi.str.findall('(.+)#').to_frame()
# df_referral['Diagnosis'] = df_referral['Diagnosis'].apply(lambda x: x if isinstance(x, float) else x[0])

# translator = Translator()
# df_referral['Diagnosi'] = df_referral['Diagnosis'].apply(translator.translate, src='it', dest='en').apply(getattr, args=('text',))
# df_referral = df_referral.drop(columns = ['Diagnosis'])
# df_referral.head()

In [None]:
# get the survival time histogram for all patients


In [None]:
# get the survival time histogram for patients in each diagnosis


In [None]:
# First Definition: Split patients into early/late referral by using the criteria of +- 2 stddev for each diagnosis
