In [2]:
import pandas as pd
import pandas.io.sql as psql
from datetime import timedelta

import warnings
warnings.filterwarnings('ignore')

Connect to the database and fetch the person_visit_death_with_concepts table

In [9]:
concept_dir = './v2/app/concept_codes_final/'
training_dir = '../mortality_prediction_docker_model/v2/train/'
scratch_dir = '../mortality_prediction_docker_model/v2/scratch/'

In [4]:
filepath = training_dir + 'person.csv'
df_person = pd.read_csv(filepath, usecols = ['year_of_birth',
                                             'ethnicity_concept_id',
                                             'person_id',
                                             'month_of_birth',
                                             'day_of_birth',
                                             'race_concept_id',
                                             'gender_concept_id'])

In [5]:
filepath = filepath = training_dir + 'visit_occurrence.csv'
df_visits = pd.read_csv(filepath, usecols=['person_id',
                                           'visit_start_date',
                                           'preceding_visit_occurrence_id',
                                           'visit_occurrence_id',
                                           'visit_end_date',
                                           'visit_concept_id',
                                           'visit_type_concept_id',
                                           'discharge_to_concept_id'])

In [6]:
df_person_visits = pd.merge(df_person, df_visits, on=['person_id'], how='left')

In [7]:
del df_person
del df_visits

In [10]:
filepath = concept_dir + 'all_concepts.csv'
df_concepts = pd.read_csv(filepath, usecols=['concept_name',
                                             'concept_id',
                                             'vocabulary_id'])

In [11]:
df_concepts_race = df_concepts[df_concepts.vocabulary_id=='Race']
df_concepts_race = df_concepts_race.drop(columns=['vocabulary_id'])
df_concepts_race = df_concepts_race.rename(columns={'concept_id': 'race_concept_id',
                                                    'concept_name': 'race_concept_name'})

In [12]:
df_concepts_race

Unnamed: 0,race_concept_id,race_concept_name
11370,8515,Asian
11371,8516,Black or African American
11372,8527,White
11373,8552,Unknown
11374,8557,Native Hawaiian or Other Pacific Islander
11375,8657,American Indian or Alaska Native


In [13]:
df_person_visits_race = pd.merge(df_person_visits, df_concepts_race, on=['race_concept_id'], how='left')

In [14]:
del df_person_visits

In [15]:
df_concepts_visit = df_concepts[df_concepts.vocabulary_id=='Visit']
df_concepts_visit = df_concepts_visit.drop(columns=['vocabulary_id'])
df_concepts_visit = df_concepts_visit.rename(columns={'concept_id': 'visit_concept_id',
                                                      'concept_name': 'visit_concept_name'})

In [16]:
df_concepts_visit

Unnamed: 0,visit_concept_id,visit_concept_name
904,9201,Inpatient Visit
905,9202,Outpatient Visit
906,9203,Emergency Room Visit


In [17]:
df_person_visits_race_concepts = \
pd.merge(df_person_visits_race, df_concepts_visit, on=['visit_concept_id'], how='left')

In [18]:
filepath = training_dir + 'death.csv'
df_death = pd.read_csv(filepath, usecols=['person_id',
                                          'death_date',
                                          'death_datetime',
                                          'death_type_concept_id'])

In [19]:
df = pd.merge(df_person_visits_race_concepts, df_death, on=['person_id'], how='left')

Convert dates to the correct datatype

In [20]:
df.columns

Index(['year_of_birth', 'ethnicity_concept_id', 'person_id', 'month_of_birth',
       'day_of_birth', 'race_concept_id', 'gender_concept_id',
       'visit_start_date', 'preceding_visit_occurrence_id',
       'visit_occurrence_id', 'visit_end_date', 'visit_concept_id',
       'visit_type_concept_id', 'discharge_to_concept_id', 'race_concept_name',
       'visit_concept_name', 'death_date', 'death_datetime',
       'death_type_concept_id'],
      dtype='object')

In [21]:
df.dtypes

year_of_birth                    float64
ethnicity_concept_id             float64
person_id                          int64
month_of_birth                     int64
day_of_birth                       int64
race_concept_id                  float64
gender_concept_id                  int64
visit_start_date                  object
preceding_visit_occurrence_id    float64
visit_occurrence_id              float64
visit_end_date                    object
visit_concept_id                 float64
visit_type_concept_id            float64
discharge_to_concept_id          float64
race_concept_name                 object
visit_concept_name                object
death_date                        object
death_datetime                    object
death_type_concept_id            float64
dtype: object

In [22]:
df[['visit_start_date','visit_end_date', 'death_date']] = \
df[['visit_start_date','visit_end_date', 'death_date']].apply(pd.to_datetime, format='%Y-%m-%d')

In [23]:
len(df)

1108650

Add visit_duration columns

In [24]:
df['visit_duration'] = df['visit_end_date'] - df['visit_start_date']

In [25]:
df.head()

Unnamed: 0,year_of_birth,ethnicity_concept_id,person_id,month_of_birth,day_of_birth,race_concept_id,gender_concept_id,visit_start_date,preceding_visit_occurrence_id,visit_occurrence_id,visit_end_date,visit_concept_id,visit_type_concept_id,discharge_to_concept_id,race_concept_name,visit_concept_name,death_date,death_datetime,death_type_concept_id,visit_duration
0,1937.0,38003564.0,19076,8,1,8557.0,8532,2010-04-23,,1714345.0,NaT,0.0,44818517.0,,Native Hawaiian or Other Pacific Islander,,NaT,,,NaT
1,1971.0,38003564.0,63615,12,1,,8532,2008-07-27,,2428649.0,NaT,9202.0,44818517.0,,,Outpatient Visit,NaT,,,NaT
2,1971.0,38003564.0,63615,12,1,,8532,2009-01-21,,5504867.0,NaT,0.0,44818517.0,,,,NaT,,,NaT
3,1971.0,38003564.0,63615,12,1,,8532,2008-09-08,,2693895.0,NaT,0.0,44818517.0,,,,NaT,,,NaT
4,1971.0,38003564.0,63615,12,1,,8532,2009-04-08,,4692392.0,NaT,0.0,44818517.0,44814705.0,,,NaT,,,NaT


In [26]:
df['visit_end_date'] = df['visit_end_date'].fillna(df['visit_start_date']) 

In [27]:
df['death_date'] = df['death_date'].fillna(pd.Timestamp.max)

In [28]:
df[df['death_date'] != pd.Timestamp.max].death_date

1841      2010-07-17
1842      2010-07-17
7167      2010-06-01
7954      2010-06-14
9405      2010-07-13
             ...    
1104402   2011-01-14
1104599   2009-10-18
1105529   2009-08-02
1105530   2009-08-02
1105531   2009-08-02
Name: death_date, Length: 12593, dtype: datetime64[ns]

In [29]:
df.head()

Unnamed: 0,year_of_birth,ethnicity_concept_id,person_id,month_of_birth,day_of_birth,race_concept_id,gender_concept_id,visit_start_date,preceding_visit_occurrence_id,visit_occurrence_id,visit_end_date,visit_concept_id,visit_type_concept_id,discharge_to_concept_id,race_concept_name,visit_concept_name,death_date,death_datetime,death_type_concept_id,visit_duration
0,1937.0,38003564.0,19076,8,1,8557.0,8532,2010-04-23,,1714345.0,2010-04-23,0.0,44818517.0,,Native Hawaiian or Other Pacific Islander,,2262-04-11 23:47:16.854775807,,,NaT
1,1971.0,38003564.0,63615,12,1,,8532,2008-07-27,,2428649.0,2008-07-27,9202.0,44818517.0,,,Outpatient Visit,2262-04-11 23:47:16.854775807,,,NaT
2,1971.0,38003564.0,63615,12,1,,8532,2009-01-21,,5504867.0,2009-01-21,0.0,44818517.0,,,,2262-04-11 23:47:16.854775807,,,NaT
3,1971.0,38003564.0,63615,12,1,,8532,2008-09-08,,2693895.0,2008-09-08,0.0,44818517.0,,,,2262-04-11 23:47:16.854775807,,,NaT
4,1971.0,38003564.0,63615,12,1,,8532,2009-04-08,,4692392.0,2009-04-08,0.0,44818517.0,44814705.0,,,2262-04-11 23:47:16.854775807,,,NaT


In [30]:
max_visit_start_date =  df['visit_start_date'].max()
min_visit_start_date =  df['visit_start_date'].min()
print(max_visit_start_date)
print(min_visit_start_date)

2010-05-25 00:00:00
2007-11-28 00:00:00


In [31]:
def check_death_flag(x, window_size):
    if x.death_date - x.visit_start_date < window_size and x.death_date - x.visit_start_date >= timedelta(days = 0):
        return 1
    return 0

Generate windows of training data with window_id being the identifier. Every row has a death_in_next_window field that informs us whether the person dies in the next window.

In [32]:
def visit_types_count(x):
    return pd.Series(dict(
        inpatient_visit_count  = (x.visit_concept_name == 'Inpatient Visit').sum(),
        outpatient_visit_count = (x.visit_concept_name == 'Outpatient Visit').sum(),
        er_visit_count         = (x.visit_concept_name == 'Emergency Room Visit').sum()
        ))

In [33]:
def window_data(df, window_size, window_start, group_by_var, date_var, agg_dict, rename_dict, apply_func, calc_death=0):
    window_id = 0
    while window_start < max_visit_start_date:
        df_window = df[(df[date_var] >= window_start) & (df[date_var] < window_start + window_size)]
        if(calc_death):
            df_window['death_in_next_window'] = df_window.apply(lambda x: check_death_flag(x, window_size), axis=1)
            df_window['old'] = window_start.year - df_window.year_of_birth

        df_window[date_var] = (window_start + window_size) - df_window[date_var]
        agg_df = df_window.groupby(group_by_var).agg(agg_dict).rename(columns=rename_dict)
        apply_cols = df_window.groupby(group_by_var).apply(lambda x: apply_func(x))    
        agg_df = agg_df.join(apply_cols)
        agg_df['window_id'] = window_id
        agg_df.reset_index(drop=True)
        if not window_id:
            windowed_data = agg_df.copy()
        else:
            windowed_data = pd.concat([windowed_data, agg_df], ignore_index=True)
        window_id += 1
        window_start += window_size
    return windowed_data

In [34]:
window_size = timedelta(days = 180)
window_start = min_visit_start_date

In [36]:
%%time
agg_dict = {'person_id': 'max',
            'year_of_birth': 'max',
            'visit_start_date': 'min',
            'ethnicity_concept_id': 'max',
            'race_concept_id': 'max',
            'gender_concept_id': 'max',
            'race_concept_name': 'max',
            'visit_occurrence_id': 'nunique',
            'visit_concept_name': 'count',
            'visit_duration': 'sum',
            'death_in_next_window': 'max',
            'old': 'max'}

rename_dict = {'visit_occurrence_id': 'number_of_visits',
               'visit_start_date': 'days_since_latest_visit'}

group_by_var = 'person_id'
date_var = 'visit_start_date'
apply_func = visit_types_count

training_data = \
window_data(df, window_size, window_start, group_by_var, date_var, agg_dict, rename_dict, apply_func, 1)

CPU times: user 6min 7s, sys: 1.35 s, total: 6min 9s
Wall time: 5min 59s


In [37]:
training_data.head()

Unnamed: 0,person_id,year_of_birth,days_since_latest_visit,ethnicity_concept_id,race_concept_id,gender_concept_id,race_concept_name,number_of_visits,visit_concept_name,visit_duration,death_in_next_window,old,inpatient_visit_count,outpatient_visit_count,er_visit_count,window_id
0,0,1923.0,45 days,,8552.0,8507,Unknown,4,0,0 days,0,84.0,0,0,0,0
1,5,1935.0,31 days,,8557.0,8507,Native Hawaiian or Other Pacific Islander,1,1,0 days,0,72.0,0,1,0,0
2,6,1976.0,103 days,,8527.0,8532,White,3,0,0 days,0,31.0,0,0,0,0
3,10,1936.0,84 days,38003564.0,8552.0,8532,Unknown,1,0,0 days,0,71.0,0,0,0,0
4,12,1919.0,2 days,,8516.0,8532,Black or African American,2,0,0 days,0,88.0,0,0,0,0


In [38]:
training_data.dtypes

person_id                            int64
year_of_birth                      float64
days_since_latest_visit    timedelta64[ns]
ethnicity_concept_id               float64
race_concept_id                    float64
gender_concept_id                    int64
race_concept_name                   object
number_of_visits                     int64
visit_concept_name                   int64
visit_duration             timedelta64[ns]
death_in_next_window                 int64
old                                float64
inpatient_visit_count                int64
outpatient_visit_count               int64
er_visit_count                       int64
window_id                            int64
dtype: object

In [39]:
training_data.days_since_latest_visit

0         45 days
1         31 days
2        103 days
3         84 days
4          2 days
           ...   
278993   179 days
278994   174 days
278995   174 days
278996   180 days
278997   171 days
Name: days_since_latest_visit, Length: 278998, dtype: timedelta64[ns]

In [40]:
training_data.head()

Unnamed: 0,person_id,year_of_birth,days_since_latest_visit,ethnicity_concept_id,race_concept_id,gender_concept_id,race_concept_name,number_of_visits,visit_concept_name,visit_duration,death_in_next_window,old,inpatient_visit_count,outpatient_visit_count,er_visit_count,window_id
0,0,1923.0,45 days,,8552.0,8507,Unknown,4,0,0 days,0,84.0,0,0,0,0
1,5,1935.0,31 days,,8557.0,8507,Native Hawaiian or Other Pacific Islander,1,1,0 days,0,72.0,0,1,0,0
2,6,1976.0,103 days,,8527.0,8532,White,3,0,0 days,0,31.0,0,0,0,0
3,10,1936.0,84 days,38003564.0,8552.0,8532,Unknown,1,0,0 days,0,71.0,0,0,0,0
4,12,1919.0,2 days,,8516.0,8532,Black or African American,2,0,0 days,0,88.0,0,0,0,0


In [41]:
training_data = training_data.drop(['year_of_birth'], axis=1)

In [42]:
training_data[training_data.person_id == 2225]

Unnamed: 0,person_id,days_since_latest_visit,ethnicity_concept_id,race_concept_id,gender_concept_id,race_concept_name,number_of_visits,visit_concept_name,visit_duration,death_in_next_window,old,inpatient_visit_count,outpatient_visit_count,er_visit_count,window_id
658,2225,87 days,38003564.0,8515.0,8532,Asian,1,0,2 days,0,69.0,0,0,0,0
46625,2225,61 days,38003564.0,8515.0,8532,Asian,5,1,0 days,0,70.0,0,1,0,1
104347,2225,83 days,38003564.0,8515.0,8532,Asian,3,1,2 days,0,70.0,1,0,0,2
162225,2225,30 days,38003564.0,8515.0,8532,Asian,3,1,0 days,0,71.0,0,1,0,3
219608,2225,160 days,38003564.0,8515.0,8532,Asian,1,0,0 days,0,71.0,0,0,0,4
272446,2225,179 days,38003564.0,8515.0,8532,Asian,2,0,0 days,0,72.0,0,0,0,5


In [43]:
training_data.window_id.unique()

array([0, 1, 2, 3, 4, 5])

In [44]:
training_data.er_visit_count.unique()

array([0])

In [45]:
import re 

f = open("features.txt", "r")
features = ''
for x in f:
    features += x
    
important_conditions = re.findall(r"condition_concept_([0-9]+)", features)
important_procedures = re.findall(r"procedure_concept_([0-9]+)", features)
important_drugs = re.findall(r"drug_concept_([0-9]+)", features)
important_observations = re.findall(r"observation_concept_([0-9]+)", features)

### Merge with condition_occurrence

In [46]:
filepath = training_dir + 'condition_occurrence.csv'
df = pd.read_csv(filepath, usecols = ['condition_occurrence_id',
                                      'person_id', 
                                      'condition_concept_id',
                                      'condition_start_date', 
                                      'condition_end_date',
                                      'condition_type_concept_id',
                                      'condition_status_concept_id',
                                      'visit_occurrence_id'])

In [47]:
df['condition_end_date'] = df['condition_end_date'] if not 'NaT' else df['condition_start_date']

In [48]:
df['condition_concept_id'] = df['condition_concept_id'].astype('Int64')
df['condition_type_concept_id'] = df['condition_type_concept_id'].astype('Int64')
df['condition_status_concept_id'] = df['condition_status_concept_id'].astype('Int64')

In [49]:
df['condition_concept_id'] = df['condition_concept_id'].apply(str)
df['condition_type_concept_id'] = df['condition_type_concept_id'].apply(str)
df['condition_status_concept_id'] = df['condition_status_concept_id'].apply(str)

In [50]:
df[['condition_start_date','condition_end_date']] = \
df[['condition_start_date','condition_end_date']].apply(pd.to_datetime, format='%Y-%m-%d')

In [51]:
max_condition_start_date =  df['condition_start_date'].max()
min_condition_start_date =  df['condition_start_date'].min()
print(max_condition_start_date)
print(min_condition_start_date)

2010-05-25 00:00:00
2007-11-28 00:00:00


In [52]:
def agg_condition_concept_id(x, important_features_set):
    return pd.Series(dict(
        condition_concept_id_list  = ', '.join(set(x.condition_concept_id).intersection(important_features_set)),
        condition_type_concept_id_list  = ', '.join(set(x.condition_type_concept_id))
        ))

In [53]:
%%time
agg_dict = {'person_id': 'max',
            'condition_start_date': 'min',
            'condition_status_concept_id': 'max'}

rename_dict = {'condition_start_date': 'days_since_latest_condition'}

group_by_var = 'person_id'
date_var = 'condition_start_date'
important_features_set = set(important_conditions)
apply_func = lambda x: agg_condition_concept_id(x, important_features_set)

df.condition_start_date = pd.to_datetime(df.condition_start_date, format='%Y-%m-%d')
cond_occur_data = \
window_data(df, window_size, window_start, group_by_var, date_var, agg_dict, rename_dict, apply_func)

In [54]:
cond_occur_data.head()

Unnamed: 0,person_id,days_since_latest_condition,condition_status_concept_id,condition_concept_id_list,condition_type_concept_id_list,window_id
0,6.0,1 days,,"443776, 197508","38000200, 38000230",0
1,7.0,1 days,,,38000230,0
2,8.0,23 days,,133810,38000230,0
3,10.0,41 days,,,38000230,0
4,18.0,2 days,,,38000230,0


In [55]:
training_data = pd.merge(training_data, cond_occur_data, on=['person_id', 'window_id'], how='left')
training_data.head()

Unnamed: 0,person_id,days_since_latest_visit,ethnicity_concept_id,race_concept_id,gender_concept_id,race_concept_name,number_of_visits,visit_concept_name,visit_duration,death_in_next_window,old,inpatient_visit_count,outpatient_visit_count,er_visit_count,window_id,days_since_latest_condition,condition_status_concept_id,condition_concept_id_list,condition_type_concept_id_list
0,0,45 days,,8552.0,8507,Unknown,4,0,0 days,0,84.0,0,0,0,0,NaT,,,
1,5,31 days,,8557.0,8507,Native Hawaiian or Other Pacific Islander,1,1,0 days,0,72.0,0,1,0,0,NaT,,,
2,6,103 days,,8527.0,8532,White,3,0,0 days,0,31.0,0,0,0,0,1 days,,"443776, 197508","38000200, 38000230"
3,10,84 days,38003564.0,8552.0,8532,Unknown,1,0,0 days,0,71.0,0,0,0,0,41 days,,,38000230
4,12,2 days,,8516.0,8532,Black or African American,2,0,0 days,0,88.0,0,0,0,0,NaT,,,


In [56]:
del cond_occur_data

### Merge with procedure_occurrence

In [57]:
filepath = training_dir + 'procedure_occurrence.csv'
df = pd.read_csv(filepath, usecols = ['procedure_occurrence_id',
                                      'person_id',
                                      'procedure_concept_id',
                                      'procedure_date',
                                      'procedure_type_concept_id',
                                      'visit_occurrence_id'])

In [58]:
df['procedure_concept_id'] = df['procedure_concept_id'].astype('Int64')
df['procedure_type_concept_id'] = df['procedure_type_concept_id'].astype('Int64')

In [59]:
df['procedure_concept_id'] = df['procedure_concept_id'].apply(str)
df['procedure_type_concept_id'] = df['procedure_type_concept_id'].apply(str)

In [60]:
def agg_procedure_concept_id(x, important_features_set):
    return pd.Series(dict(
        procedure_concept_id_list  = ', '.join(set(x.procedure_concept_id).intersection(important_features_set)),
        procedure_type_concept_id_list  = ', '.join(set(x.procedure_type_concept_id))
        ))

In [61]:
%%time
agg_dict = {'person_id': 'max',
            'procedure_date': 'min'}

rename_dict = {'procedure_date': 'days_since_latest_procedure'}

group_by_var = 'person_id'
date_var = 'procedure_date'
important_features_set = set(important_procedures)
apply_func = lambda x: agg_procedure_concept_id(x, important_features_set)

df.procedure_date = pd.to_datetime(df.procedure_date, format='%Y-%m-%d')
procedure_occur_data = \
window_data(df, window_size, window_start, group_by_var, date_var, agg_dict, rename_dict, apply_func)

In [62]:
training_data = pd.merge(training_data, procedure_occur_data, on=['person_id', 'window_id'], how='left')
training_data.head()

Unnamed: 0,person_id,days_since_latest_visit,ethnicity_concept_id,race_concept_id,gender_concept_id,race_concept_name,number_of_visits,visit_concept_name,visit_duration,death_in_next_window,...,outpatient_visit_count,er_visit_count,window_id,days_since_latest_condition,condition_status_concept_id,condition_concept_id_list,condition_type_concept_id_list,days_since_latest_procedure,procedure_concept_id_list,procedure_type_concept_id_list
0,0,45 days,,8552.0,8507,Unknown,4,0,0 days,0,...,0,0,0,NaT,,,,23 days,,38000269.0
1,5,31 days,,8557.0,8507,Native Hawaiian or Other Pacific Islander,1,1,0 days,0,...,1,0,0,NaT,,,,3 days,2002291.0,38000269.0
2,6,103 days,,8527.0,8532,White,3,0,0 days,0,...,0,0,0,1 days,,"443776, 197508","38000200, 38000230",NaT,,
3,10,84 days,38003564.0,8552.0,8532,Unknown,1,0,0 days,0,...,0,0,0,41 days,,,38000230,15 days,0.0,38000269.0
4,12,2 days,,8516.0,8532,Black or African American,2,0,0 days,0,...,0,0,0,NaT,,,,55 days,2414397.0,38000269.0


In [63]:
del procedure_occur_data

### Merge with drug_exposure

In [64]:
filepath = training_dir + 'drug_exposure.csv'
df = pd.read_csv(filepath, usecols = ['drug_exposure_id',
                                      'person_id',
                                      'drug_concept_id',
                                      'drug_exposure_start_date',
                                      'drug_type_concept_id',
                                      'quantity',
                                      'visit_occurrence_id'])

In [65]:
df['drug_concept_id'] = df['drug_concept_id'].astype('Int64')
df['drug_type_concept_id'] = df['drug_type_concept_id'].astype('Int64')

In [66]:
df['drug_concept_id'] = df['drug_concept_id'].apply(str)
df['drug_type_concept_id'] = df['drug_type_concept_id'].apply(str)

In [67]:
def agg_drug_concept_id(x, important_features_set):
    return pd.Series(dict(
        drug_concept_id_list  = ', '.join(set(x.drug_concept_id).intersection(important_features_set)),
        drug_type_concept_id_list  = ', '.join(set(x.drug_type_concept_id))
        ))

In [68]:
%%time
agg_dict = {'person_id': 'max',
            'drug_exposure_start_date': 'min',
            'quantity': 'sum'}

rename_dict = {'drug_exposure_start_date': 'days_since_latest_drug_exposure',
               'quantity': 'total_quantity_of_drugs'}

group_by_var = 'person_id'
date_var = 'drug_exposure_start_date'
important_features_set = set(important_drugs)
apply_func = lambda x: agg_drug_concept_id(x, important_features_set)

df.drug_exposure_start_date = pd.to_datetime(df.drug_exposure_start_date, format='%Y-%m-%d')
drug_exposure_data = \
window_data(df, window_size, window_start, group_by_var, date_var, agg_dict, rename_dict, apply_func)

In [69]:
training_data = pd.merge(training_data, drug_exposure_data, on=['person_id', 'window_id'], how='left')
training_data.head()

Unnamed: 0,person_id,days_since_latest_visit,ethnicity_concept_id,race_concept_id,gender_concept_id,race_concept_name,number_of_visits,visit_concept_name,visit_duration,death_in_next_window,...,condition_status_concept_id,condition_concept_id_list,condition_type_concept_id_list,days_since_latest_procedure,procedure_concept_id_list,procedure_type_concept_id_list,days_since_latest_drug_exposure,total_quantity_of_drugs,drug_concept_id_list,drug_type_concept_id_list
0,0,45 days,,8552.0,8507,Unknown,4,0,0 days,0,...,,,,23 days,,38000269.0,NaT,,,
1,5,31 days,,8557.0,8507,Native Hawaiian or Other Pacific Islander,1,1,0 days,0,...,,,,3 days,2002291.0,38000269.0,NaT,,,
2,6,103 days,,8527.0,8532,White,3,0,0 days,0,...,,"443776, 197508","38000200, 38000230",NaT,,,NaT,,,
3,10,84 days,38003564.0,8552.0,8532,Unknown,1,0,0 days,0,...,,,38000230,15 days,0.0,38000269.0,141 days,30.0,,38000175.0
4,12,2 days,,8516.0,8532,Black or African American,2,0,0 days,0,...,,,,55 days,2414397.0,38000269.0,111 days,90.0,,38000175.0


In [70]:
del drug_exposure_data

### Merge with observation

In [71]:
filepath = training_dir + 'observation.csv'
df = pd.read_csv(filepath, usecols = ['observation_id',
                                      'person_id',
                                      'observation_concept_id',
                                      'observation_date',
                                      'observation_type_concept_id',
                                      'value_as_string',
                                      'value_as_concept_id'])

#### Dont know what to do with the columns value_as_string, value_as_concept_id

In [72]:
df['observation_concept_id'] = df['observation_concept_id'].astype('Int64')
df['observation_type_concept_id'] = df['observation_type_concept_id'].astype('Int64')

In [73]:
df['observation_concept_id'] = df['observation_concept_id'].apply(str)
df['observation_type_concept_id'] = df['observation_type_concept_id'].apply(str)

In [74]:
def agg_observation_concept_id(x, important_features_set):
    return pd.Series(dict(
        observation_concept_id_list  = ', '.join(set(x.observation_concept_id).intersection(important_features_set)),
        observation_type_concept_id_list  = ', '.join(set(x.observation_type_concept_id))
        ))

In [75]:
%%time
agg_dict = {'person_id': 'max',
            'observation_date': 'min'}

rename_dict = {'observation_date': 'days_since_latest_observation'}

group_by_var = 'person_id'
date_var = 'observation_date'
important_features_set = set(important_observations)
apply_func = lambda x: agg_observation_concept_id(x, important_features_set)

df.observation_date = pd.to_datetime(df.observation_date, format='%Y-%m-%d')
observation_data = \
window_data(df, window_size, window_start, group_by_var, date_var, agg_dict, rename_dict, apply_func)

In [76]:
training_data = pd.merge(training_data, observation_data, on=['person_id', 'window_id'], how='left')
training_data.head()

Unnamed: 0,person_id,days_since_latest_visit,ethnicity_concept_id,race_concept_id,gender_concept_id,race_concept_name,number_of_visits,visit_concept_name,visit_duration,death_in_next_window,...,days_since_latest_procedure,procedure_concept_id_list,procedure_type_concept_id_list,days_since_latest_drug_exposure,total_quantity_of_drugs,drug_concept_id_list,drug_type_concept_id_list,days_since_latest_observation,observation_concept_id_list,observation_type_concept_id_list
0,0,45 days,,8552.0,8507,Unknown,4,0,0 days,0,...,23 days,,38000269.0,NaT,,,,3 days,4214956.0,38000282.0
1,5,31 days,,8557.0,8507,Native Hawaiian or Other Pacific Islander,1,1,0 days,0,...,3 days,2002291.0,38000269.0,NaT,,,,NaT,,
2,6,103 days,,8527.0,8532,White,3,0,0 days,0,...,NaT,,,NaT,,,,24 days,,38000282.0
3,10,84 days,38003564.0,8552.0,8532,Unknown,1,0,0 days,0,...,15 days,0.0,38000269.0,141 days,30.0,,38000175.0,NaT,,
4,12,2 days,,8516.0,8532,Black or African American,2,0,0 days,0,...,55 days,2414397.0,38000269.0,111 days,90.0,,38000175.0,NaT,,


In [77]:
training_data.columns

Index(['person_id', 'days_since_latest_visit', 'ethnicity_concept_id',
       'race_concept_id', 'gender_concept_id', 'race_concept_name',
       'number_of_visits', 'visit_concept_name', 'visit_duration',
       'death_in_next_window', 'old', 'inpatient_visit_count',
       'outpatient_visit_count', 'er_visit_count', 'window_id',
       'days_since_latest_condition', 'condition_status_concept_id',
       'condition_concept_id_list', 'condition_type_concept_id_list',
       'days_since_latest_procedure', 'procedure_concept_id_list',
       'procedure_type_concept_id_list', 'days_since_latest_drug_exposure',
       'total_quantity_of_drugs', 'drug_concept_id_list',
       'drug_type_concept_id_list', 'days_since_latest_observation',
       'observation_concept_id_list', 'observation_type_concept_id_list'],
      dtype='object')

In [78]:
del observation_data

In [79]:
len(training_data)
training_data.shape 

(278998, 29)

In [80]:
# training_data.to_pickle("./training_data.pkl")

In [81]:
# pickle the training_data
import pickle
pickle.dump(training_data, open( "training_data.pkl", "wb" ))

Unroll _list columns

In [82]:
training_data = pickle.load( open( "training_data.pkl", "rb" ) )

In [83]:
# make a copy, preserve the original
train = training_data.copy()
col_num = train.shape[1]
train.shape

(278998, 29)

In [84]:
%%time
# unroll the _list columns and one-hot encode them
lists = [c for c in train.columns if '_list' in c]
for idx, row in train.iterrows():
    for l in lists:
        l_str = '_'.join(l.split('_')[:2])+'_'
        l_items = row[l]
        if isinstance(l_items, str):
            l_items = l_items.split(',')
            if isinstance(l_items, list) and l_items != ['']:
                for c in l_items:
                        train.loc[idx,l_str+str(c).strip()] = 1

CPU times: user 55min 54s, sys: 56 s, total: 56min 50s
Wall time: 14min 20s


In [85]:
train[col_num:].fillna(0, inplace=True)
train.shape

(278998, 242)

In [86]:
train = train.drop(lists, axis=1)
train.shape

(278998, 234)

In [87]:
date_cols = [c for c in train.columns if 'days' in c]

for c in date_cols:
    train[c] = pd.to_timedelta(train[c]).dt.days

train.visit_duration = pd.to_timedelta(train.visit_duration).dt.days
train.race_concept_name = train.race_concept_name.replace(to_replace=0, value='Unknown')
train.race_concept_name = train.race_concept_name.fillna('Unknown')

In [88]:
train.to_csv(scratch_dir+'train_all.csv', index=False)