In [1]:
import psycopg2 as pg
import pandas as pd
import pandas.io.sql as psql
from datetime import timedelta

import warnings
warnings.filterwarnings('ignore')

  """)


Connect to the database and fetch the person_visit_death_with_concepts table

In [4]:
concept_dir = '../mortality_prediction_docker_model/v1/data/concept_codes_final/'
training_dir = '../mortality_prediction_docker_model/v1/infer/'

In [5]:
filepath = training_dir + 'person.csv'
df_person = pd.read_csv(filepath, usecols = ['year_of_birth',
                                             'ethnicity_concept_id',
                                             'person_id',
                                             'month_of_birth',
                                             'day_of_birth',
                                             'race_concept_id',
                                             'gender_concept_id'])

In [6]:
filepath = filepath = training_dir + 'visit_occurrence.csv'
df_visits = pd.read_csv(filepath, usecols=['person_id',
                                           'visit_start_date',
                                           'preceding_visit_occurrence_id',
                                           'visit_occurrence_id',
                                           'visit_end_date',
                                           'visit_concept_id',
                                           'visit_type_concept_id',
                                           'discharge_to_concept_id'])

In [7]:
df_person_visits = pd.merge(df_person, df_visits, on=['person_id'], how='left')

In [8]:
del df_person
del df_visits

In [9]:
filepath = concept_dir + 'all_concepts.csv'
df_concepts = pd.read_csv(filepath, usecols=['concept_name',
                                             'concept_id',
                                             'vocabulary_id'])

In [10]:
df_concepts_race = df_concepts[df_concepts.vocabulary_id=='Race']
df_concepts_race = df_concepts_race.drop(columns=['vocabulary_id'])
df_concepts_race = df_concepts_race.rename(columns={'concept_id': 'race_concept_id',
                                                    'concept_name': 'race_concept_name'})

In [11]:
df_concepts_race

Unnamed: 0,race_concept_id,race_concept_name
11370,8515,Asian
11371,8516,Black or African American
11372,8527,White
11373,8552,Unknown
11374,8557,Native Hawaiian or Other Pacific Islander
11375,8657,American Indian or Alaska Native


In [12]:
df_person_visits_race = pd.merge(df_person_visits, df_concepts_race, on=['race_concept_id'], how='left')

In [13]:
del df_person_visits

In [14]:
df_concepts_visit = df_concepts[df_concepts.vocabulary_id=='Visit']
df_concepts_visit = df_concepts_visit.drop(columns=['vocabulary_id'])
df_concepts_visit = df_concepts_visit.rename(columns={'concept_id': 'visit_concept_id',
                                                      'concept_name': 'visit_concept_name'})

In [15]:
df_concepts_visit

Unnamed: 0,visit_concept_id,visit_concept_name
904,9201,Inpatient Visit
905,9202,Outpatient Visit
906,9203,Emergency Room Visit


In [16]:
df_person_visits_race_concepts = \
pd.merge(df_person_visits_race, df_concepts_visit, on=['visit_concept_id'], how='left')

In [17]:
filepath = training_dir + 'death.csv'
df_death = pd.read_csv(filepath, usecols=['person_id',
                                          'death_date',
                                          'death_datetime',
                                          'death_type_concept_id'])

In [18]:
df = pd.merge(df_person_visits_race_concepts, df_death, on=['person_id'], how='left')

Convert dates to the correct datatype

In [19]:
df.columns

Index(['person_id', 'ethnicity_concept_id', 'race_concept_id',
       'month_of_birth', 'gender_concept_id', 'year_of_birth', 'day_of_birth',
       'visit_end_date', 'preceding_visit_occurrence_id',
       'visit_occurrence_id', 'discharge_to_concept_id', 'visit_start_date',
       'visit_concept_id', 'visit_type_concept_id', 'race_concept_name',
       'visit_concept_name', 'death_date', 'death_datetime',
       'death_type_concept_id'],
      dtype='object')

In [20]:
df.dtypes

person_id                          int64
ethnicity_concept_id             float64
race_concept_id                  float64
month_of_birth                     int64
gender_concept_id                  int64
year_of_birth                    float64
day_of_birth                       int64
visit_end_date                    object
preceding_visit_occurrence_id    float64
visit_occurrence_id              float64
discharge_to_concept_id          float64
visit_start_date                  object
visit_concept_id                 float64
visit_type_concept_id            float64
race_concept_name                 object
visit_concept_name                object
death_date                        object
death_datetime                    object
death_type_concept_id            float64
dtype: object

In [21]:
df[['visit_start_date','visit_end_date', 'death_date']] = \
df[['visit_start_date','visit_end_date', 'death_date']].apply(pd.to_datetime, format='%Y-%m-%d')

In [22]:
len(df)

882579

Add visit_duration columns

In [23]:
df['visit_duration'] = df['visit_end_date'] - df['visit_start_date']

In [24]:
df.head()

Unnamed: 0,person_id,ethnicity_concept_id,race_concept_id,month_of_birth,gender_concept_id,year_of_birth,day_of_birth,visit_end_date,preceding_visit_occurrence_id,visit_occurrence_id,discharge_to_concept_id,visit_start_date,visit_concept_id,visit_type_concept_id,race_concept_name,visit_concept_name,death_date,death_datetime,death_type_concept_id,visit_duration
0,9879,38003564.0,,12,8532,1921.0,1,2009-03-09,,4021389.0,,2009-03-09,0.0,44818517.0,,,NaT,,,0 days
1,640,38003564.0,8552.0,4,8532,1919.0,1,NaT,,2067328.0,44814705.0,2008-06-15,0.0,44818517.0,Unknown,,NaT,,,NaT
2,35370,38003564.0,8557.0,12,8507,1942.0,1,2009-10-21,,4615299.0,44814705.0,2009-10-21,0.0,44818517.0,Native Hawaiian or Other Pacific Islander,,NaT,,,0 days
3,35370,38003564.0,8557.0,12,8507,1942.0,1,2009-07-24,,423303.0,,2009-07-24,0.0,44818517.0,Native Hawaiian or Other Pacific Islander,,NaT,,,0 days
4,35370,38003564.0,8557.0,12,8507,1942.0,1,2008-03-16,,5168679.0,,2008-03-16,0.0,44818517.0,Native Hawaiian or Other Pacific Islander,,NaT,,,0 days


In [25]:
df['visit_end_date'] = df['visit_end_date'].fillna(df['visit_start_date']) 

In [26]:
df['death_date'] = df['death_date'].fillna(pd.Timestamp.max)

In [27]:
df[df['death_date'] != pd.Timestamp.max].death_date

227      2009-05-06
228      2009-05-06
4815     2010-05-04
4816     2010-05-04
4817     2010-05-04
            ...    
881037   2010-07-30
881038   2010-07-30
881039   2010-07-30
881040   2010-07-30
881041   2010-07-30
Name: death_date, Length: 9072, dtype: datetime64[ns]

In [28]:
df.head()

Unnamed: 0,person_id,ethnicity_concept_id,race_concept_id,month_of_birth,gender_concept_id,year_of_birth,day_of_birth,visit_end_date,preceding_visit_occurrence_id,visit_occurrence_id,discharge_to_concept_id,visit_start_date,visit_concept_id,visit_type_concept_id,race_concept_name,visit_concept_name,death_date,death_datetime,death_type_concept_id,visit_duration
0,9879,38003564.0,,12,8532,1921.0,1,2009-03-09,,4021389.0,,2009-03-09,0.0,44818517.0,,,2262-04-11 23:47:16.854775807,,,0 days
1,640,38003564.0,8552.0,4,8532,1919.0,1,2008-06-15,,2067328.0,44814705.0,2008-06-15,0.0,44818517.0,Unknown,,2262-04-11 23:47:16.854775807,,,NaT
2,35370,38003564.0,8557.0,12,8507,1942.0,1,2009-10-21,,4615299.0,44814705.0,2009-10-21,0.0,44818517.0,Native Hawaiian or Other Pacific Islander,,2262-04-11 23:47:16.854775807,,,0 days
3,35370,38003564.0,8557.0,12,8507,1942.0,1,2009-07-24,,423303.0,,2009-07-24,0.0,44818517.0,Native Hawaiian or Other Pacific Islander,,2262-04-11 23:47:16.854775807,,,0 days
4,35370,38003564.0,8557.0,12,8507,1942.0,1,2008-03-16,,5168679.0,,2008-03-16,0.0,44818517.0,Native Hawaiian or Other Pacific Islander,,2262-04-11 23:47:16.854775807,,,0 days


In [29]:
max_visit_start_date =  df['visit_start_date'].max()
min_visit_start_date =  df['visit_start_date'].min()
print(max_visit_start_date)
print(min_visit_start_date)

2010-06-01 00:00:00
2007-12-06 00:00:00


In [30]:
def check_death_flag(x, window_size):
    if x.death_date - x.visit_start_date < window_size and x.death_date - x.visit_start_date >= timedelta(days = 0):
        return 1
    return 0

Generate windows of training data with window_id being the identifier. Every row has a death_in_next_window field that informs us whether the person dies in the next window.

In [31]:
def visit_types_count(x):
    return pd.Series(dict(
        inpatient_visit_count  = (x.visit_concept_name == 'Inpatient Visit').sum(),
        outpatient_visit_count = (x.visit_concept_name == 'Outpatient Visit').sum(),
        er_visit_count         = (x.visit_concept_name == 'Emergency Room Visit').sum()
        ))

In [32]:
def window_data(df, window_size, window_start, group_by_var, date_var, agg_dict, rename_dict, apply_func, calc_death=0):
    window_id = 0
    while window_start < max_visit_start_date:
        df_window = df[(df[date_var] >= window_start) & (df[date_var] < window_start + window_size)]
        if(calc_death):
            df_window['death_in_next_window'] = df_window.apply(lambda x: check_death_flag(x, window_size), axis=1)
            df_window['old'] = window_start.year - df_window.year_of_birth

        df_window[date_var] = (window_start + window_size) - df_window[date_var]
        agg_df = df_window.groupby(group_by_var).agg(agg_dict).rename(columns=rename_dict)
        apply_cols = df_window.groupby(group_by_var).apply(lambda x: apply_func(x))    
        agg_df = agg_df.join(apply_cols)
        agg_df['window_id'] = window_id
        agg_df.reset_index(drop=True)
        if not window_id:
            windowed_data = agg_df.copy()
        else:
            windowed_data = pd.concat([windowed_data, agg_df], ignore_index=True)
        window_id += 1
        window_start += window_size
    return windowed_data

In [33]:
window_size = timedelta(days = 180)
window_start = min_visit_start_date

In [34]:
agg_dict = {'person_id': 'max',
            'year_of_birth': 'max',
            'visit_start_date': 'min',
            'ethnicity_concept_id': 'max',
            'race_concept_id': 'max',
            'gender_concept_id': 'max',
            'race_concept_name': 'max',
            'visit_occurrence_id': 'nunique',
            'visit_concept_name': 'count',
            'visit_duration': 'sum',
            'death_in_next_window': 'max',
            'old': 'max'}

rename_dict = {'visit_occurrence_id': 'number_of_visits',
               'visit_start_date': 'days_since_latest_visit'}

group_by_var = 'person_id'
date_var = 'visit_start_date'
apply_func = visit_types_count

training_data = \
window_data(df, window_size, window_start, group_by_var, date_var, agg_dict, rename_dict, apply_func, 1)

In [35]:
training_data.head()

Unnamed: 0,person_id,year_of_birth,days_since_latest_visit,ethnicity_concept_id,race_concept_id,gender_concept_id,race_concept_name,number_of_visits,visit_concept_name,visit_duration,death_in_next_window,old,inpatient_visit_count,outpatient_visit_count,er_visit_count,window_id
0,5,1940.0,37 days,,8557.0,8532,Native Hawaiian or Other Pacific Islander,2,0,0 days,0,67.0,0,0,0,0
1,8,1922.0,80 days,38003564.0,8552.0,8532,Unknown,1,0,0 days,0,85.0,0,0,0,0
2,11,1964.0,6 days,38003564.0,8552.0,8532,Unknown,9,2,2 days,0,43.0,0,2,0,0
3,12,1920.0,131 days,38003564.0,8527.0,8532,White,1,0,1 days,0,87.0,0,0,0,0
4,13,1954.0,34 days,38003564.0,8657.0,8507,American Indian or Alaska Native,4,1,0 days,0,53.0,0,1,0,0


In [36]:
training_data.dtypes

person_id                            int64
year_of_birth                      float64
days_since_latest_visit    timedelta64[ns]
ethnicity_concept_id               float64
race_concept_id                    float64
gender_concept_id                    int64
race_concept_name                   object
number_of_visits                     int64
visit_concept_name                   int64
visit_duration             timedelta64[ns]
death_in_next_window                 int64
old                                float64
inpatient_visit_count                int64
outpatient_visit_count               int64
er_visit_count                       int64
window_id                            int64
dtype: object

In [37]:
training_data.days_since_latest_visit

0         37 days
1         80 days
2          6 days
3        131 days
4         34 days
           ...   
118776   176 days
118777   172 days
118778   173 days
118779   178 days
118780   175 days
Name: days_since_latest_visit, Length: 118781, dtype: timedelta64[ns]

In [38]:
training_data.head()

Unnamed: 0,person_id,year_of_birth,days_since_latest_visit,ethnicity_concept_id,race_concept_id,gender_concept_id,race_concept_name,number_of_visits,visit_concept_name,visit_duration,death_in_next_window,old,inpatient_visit_count,outpatient_visit_count,er_visit_count,window_id
0,5,1940.0,37 days,,8557.0,8532,Native Hawaiian or Other Pacific Islander,2,0,0 days,0,67.0,0,0,0,0
1,8,1922.0,80 days,38003564.0,8552.0,8532,Unknown,1,0,0 days,0,85.0,0,0,0,0
2,11,1964.0,6 days,38003564.0,8552.0,8532,Unknown,9,2,2 days,0,43.0,0,2,0,0
3,12,1920.0,131 days,38003564.0,8527.0,8532,White,1,0,1 days,0,87.0,0,0,0,0
4,13,1954.0,34 days,38003564.0,8657.0,8507,American Indian or Alaska Native,4,1,0 days,0,53.0,0,1,0,0


In [39]:
training_data = training_data.drop(['year_of_birth'], axis=1)

In [40]:
training_data[training_data.person_id == 2225]

Unnamed: 0,person_id,days_since_latest_visit,ethnicity_concept_id,race_concept_id,gender_concept_id,race_concept_name,number_of_visits,visit_concept_name,visit_duration,death_in_next_window,old,inpatient_visit_count,outpatient_visit_count,er_visit_count,window_id
945,2225,4 days,38003564.0,8516.0,8532,Black or African American,15,4,0 days,0,80.0,0,4,0,0
21198,2225,2 days,38003564.0,8516.0,8532,Black or African American,27,5,1 days,0,81.0,0,5,0,1
43702,2225,7 days,38003564.0,8516.0,8532,Black or African American,14,0,9 days,0,81.0,0,0,0,2
66361,2225,3 days,38003564.0,8516.0,8532,Black or African American,25,3,2 days,0,82.0,0,3,0,3
88774,2225,4 days,38003564.0,8516.0,8532,Black or African American,18,3,0 days,0,82.0,0,3,0,4
110295,2225,175 days,38003564.0,8516.0,8532,Black or African American,1,0,0 days,0,83.0,0,0,0,5


In [41]:
training_data.window_id.unique()

array([0, 1, 2, 3, 4, 5])

In [42]:
training_data.er_visit_count.unique()

array([0])

In [43]:
import re 

f = open("features.txt", "r")
features = ''
for x in f:
    features += x
    
important_conditions = re.findall(r"condition_concept_([0-9]+)", features)
important_procedures = re.findall(r"procedure_concept_([0-9]+)", features)
important_drugs = re.findall(r"drug_concept_([0-9]+)", features)
important_observations = re.findall(r"observation_concept_([0-9]+)", features)

### Merge with condition_occurrence

In [44]:
filepath = training_dir + 'condition_occurrence.csv'
df = pd.read_csv(filepath, usecols = ['condition_occurrence_id',
                                      'person_id', 
                                      'condition_concept_id',
                                      'condition_start_date', 
                                      'condition_end_date',
                                      'condition_type_concept_id',
                                      'condition_status_concept_id',
                                      'visit_occurrence_id'])

In [45]:
df['condition_end_date'] = df['condition_end_date'] if not 'NaT' else df['condition_start_date']

In [46]:
df['condition_concept_id'] = df['condition_concept_id'].apply(str)
df['condition_type_concept_id'] = df['condition_type_concept_id'].apply(str)
df['condition_status_concept_id'] = df['condition_status_concept_id'].apply(str)

In [47]:
df[['condition_start_date','condition_end_date']] = \
df[['condition_start_date','condition_end_date']].apply(pd.to_datetime, format='%Y-%m-%d')

In [48]:
max_condition_start_date =  df['condition_start_date'].max()
min_condition_start_date =  df['condition_start_date'].min()
print(max_condition_start_date)
print(min_condition_start_date)

2010-06-01 00:00:00
2007-11-27 00:00:00


In [49]:
def agg_condition_concept_id(x, important_features_set):
    return pd.Series(dict(
        condition_concept_id_list  = ', '.join(set(x.condition_concept_id).intersection(important_features_set)),
        condition_type_concept_id_list  = ', '.join(set(x.condition_type_concept_id))
        ))

In [50]:
agg_dict = {'person_id': 'max',
            'condition_start_date': 'min',
            'condition_status_concept_id': 'max'}

rename_dict = {'condition_start_date': 'days_since_latest_condition'}

group_by_var = 'person_id'
date_var = 'condition_start_date'
important_features_set = set(important_conditions)
apply_func = lambda x: agg_condition_concept_id(x, important_features_set)

df.condition_start_date = pd.to_datetime(df.condition_start_date, format='%Y-%m-%d')
cond_occur_data = \
window_data(df, window_size, window_start, group_by_var, date_var, agg_dict, rename_dict, apply_func)

In [51]:
cond_occur_data.head()

Unnamed: 0,person_id,days_since_latest_condition,condition_status_concept_id,condition_concept_id_list,condition_type_concept_id_list,window_id
0,3.0,13 days,,,38000230.0,0
1,5.0,32 days,,,"38000230.0, 38000200.0",0
2,8.0,101 days,,,38000230.0,0
3,11.0,29 days,,,"38000230.0, 38000200.0",0
4,12.0,134 days,,,38000230.0,0


In [52]:
training_data = pd.merge(training_data, cond_occur_data, on=['person_id', 'window_id'], how='left')
training_data.head()

Unnamed: 0,person_id,days_since_latest_visit,ethnicity_concept_id,race_concept_id,gender_concept_id,race_concept_name,number_of_visits,visit_concept_name,visit_duration,death_in_next_window,old,inpatient_visit_count,outpatient_visit_count,er_visit_count,window_id,days_since_latest_condition,condition_status_concept_id,condition_concept_id_list,condition_type_concept_id_list
0,5,37 days,,8557.0,8532,Native Hawaiian or Other Pacific Islander,2,0,0 days,0,67.0,0,0,0,0,32 days,,,"38000230.0, 38000200.0"
1,8,80 days,38003564.0,8552.0,8532,Unknown,1,0,0 days,0,85.0,0,0,0,0,101 days,,,38000230.0
2,11,6 days,38003564.0,8552.0,8532,Unknown,9,2,2 days,0,43.0,0,2,0,0,29 days,,,"38000230.0, 38000200.0"
3,12,131 days,38003564.0,8527.0,8532,White,1,0,1 days,0,87.0,0,0,0,0,134 days,,,38000230.0
4,13,34 days,38003564.0,8657.0,8507,American Indian or Alaska Native,4,1,0 days,0,53.0,0,1,0,0,1 days,,,"38000230.0, 38000200.0"


In [53]:
del cond_occur_data

### Merge with procedure_occurrence

In [54]:
filepath = training_dir + 'procedure_occurrence.csv'
df = pd.read_csv(filepath, usecols = ['procedure_occurrence_id',
                                      'person_id',
                                      'procedure_concept_id',
                                      'procedure_date',
                                      'procedure_type_concept_id',
                                      'visit_occurrence_id'])

In [55]:
df['procedure_concept_id'] = df['procedure_concept_id'].apply(str)
df['procedure_type_concept_id'] = df['procedure_type_concept_id'].apply(str)

In [56]:
def agg_procedure_concept_id(x, important_features_set):
    return pd.Series(dict(
        procedure_concept_id_list  = ', '.join(set(x.procedure_concept_id).intersection(important_features_set)),
        procedure_type_concept_id_list  = ', '.join(set(x.procedure_type_concept_id))
        ))

In [57]:
agg_dict = {'person_id': 'max',
            'procedure_date': 'min'}

rename_dict = {'procedure_date': 'days_since_latest_procedure'}

group_by_var = 'person_id'
date_var = 'procedure_date'
important_features_set = set(important_procedures)
apply_func = lambda x: agg_procedure_concept_id(x, important_features_set)

df.procedure_date = pd.to_datetime(df.procedure_date, format='%Y-%m-%d')
procedure_occur_data = \
window_data(df, window_size, window_start, group_by_var, date_var, agg_dict, rename_dict, apply_func)

In [58]:
training_data = pd.merge(training_data, procedure_occur_data, on=['person_id', 'window_id'], how='left')
training_data.head()

Unnamed: 0,person_id,days_since_latest_visit,ethnicity_concept_id,race_concept_id,gender_concept_id,race_concept_name,number_of_visits,visit_concept_name,visit_duration,death_in_next_window,...,outpatient_visit_count,er_visit_count,window_id,days_since_latest_condition,condition_status_concept_id,condition_concept_id_list,condition_type_concept_id_list,days_since_latest_procedure,procedure_concept_id_list,procedure_type_concept_id_list
0,5,37 days,,8557.0,8532,Native Hawaiian or Other Pacific Islander,2,0,0 days,0,...,0,0,0,32 days,,,"38000230.0, 38000200.0",NaT,,
1,8,80 days,38003564.0,8552.0,8532,Unknown,1,0,0 days,0,...,0,0,0,101 days,,,38000230.0,2 days,,38000269.0
2,11,6 days,38003564.0,8552.0,8532,Unknown,9,2,2 days,0,...,2,0,0,29 days,,,"38000230.0, 38000200.0",106 days,,38000269.0
3,12,131 days,38003564.0,8527.0,8532,White,1,0,1 days,0,...,0,0,0,134 days,,,38000230.0,2 days,,38000269.0
4,13,34 days,38003564.0,8657.0,8507,American Indian or Alaska Native,4,1,0 days,0,...,1,0,0,1 days,,,"38000230.0, 38000200.0",8 days,,38000269.0


In [59]:
del procedure_occur_data

### Merge with drug_exposure

In [60]:
filepath = training_dir + 'drug_exposure.csv'
df = pd.read_csv(filepath, usecols = ['drug_exposure_id',
                                      'person_id',
                                      'drug_concept_id',
                                      'drug_exposure_start_date',
                                      'drug_type_concept_id',
                                      'quantity',
                                      'visit_occurrence_id'])

In [61]:
df['drug_concept_id'] = df['drug_concept_id'].apply(str)
df['drug_type_concept_id'] = df['drug_type_concept_id'].apply(str)

In [62]:
def agg_drug_concept_id(x, important_features_set):
    return pd.Series(dict(
        drug_concept_id_list  = ', '.join(set(x.drug_concept_id).intersection(important_features_set)),
        drug_type_concept_id_list  = ', '.join(set(x.drug_type_concept_id))
        ))

In [63]:
agg_dict = {'person_id': 'max',
            'drug_exposure_start_date': 'min',
            'quantity': 'sum'}

rename_dict = {'drug_exposure_start_date': 'days_since_latest_drug_exposure',
               'quantity': 'total_quantity_of_drugs'}

group_by_var = 'person_id'
date_var = 'drug_exposure_start_date'
important_features_set = set(important_drugs)
apply_func = lambda x: agg_drug_concept_id(x, important_features_set)

df.drug_exposure_start_date = pd.to_datetime(df.drug_exposure_start_date, format='%Y-%m-%d')
drug_exposure_data = \
window_data(df, window_size, window_start, group_by_var, date_var, agg_dict, rename_dict, apply_func)

In [64]:
training_data = pd.merge(training_data, drug_exposure_data, on=['person_id', 'window_id'], how='left')
training_data.head()

Unnamed: 0,person_id,days_since_latest_visit,ethnicity_concept_id,race_concept_id,gender_concept_id,race_concept_name,number_of_visits,visit_concept_name,visit_duration,death_in_next_window,...,condition_status_concept_id,condition_concept_id_list,condition_type_concept_id_list,days_since_latest_procedure,procedure_concept_id_list,procedure_type_concept_id_list,days_since_latest_drug_exposure,total_quantity_of_drugs,drug_concept_id_list,drug_type_concept_id_list
0,5,37 days,,8557.0,8532,Native Hawaiian or Other Pacific Islander,2,0,0 days,0,...,,,"38000230.0, 38000200.0",NaT,,,137 days,10.0,,38000175.0
1,8,80 days,38003564.0,8552.0,8532,Unknown,1,0,0 days,0,...,,,38000230.0,2 days,,38000269.0,NaT,,,
2,11,6 days,38003564.0,8552.0,8532,Unknown,9,2,2 days,0,...,,,"38000230.0, 38000200.0",106 days,,38000269.0,NaT,,,
3,12,131 days,38003564.0,8527.0,8532,White,1,0,1 days,0,...,,,38000230.0,2 days,,38000269.0,9 days,30.0,,38000175.0
4,13,34 days,38003564.0,8657.0,8507,American Indian or Alaska Native,4,1,0 days,0,...,,,"38000230.0, 38000200.0",8 days,,38000269.0,NaT,,,


In [65]:
del drug_exposure_data

### Merge with observation

In [66]:
filepath = training_dir + 'observation.csv'
df = pd.read_csv(filepath, usecols = ['observation_id',
                                      'person_id',
                                      'observation_concept_id',
                                      'observation_date',
                                      'observation_type_concept_id',
                                      'value_as_string',
                                      'value_as_concept_id'])

#### Dont know what to do with the columns value_as_string, value_as_concept_id

In [67]:
df['observation_concept_id'] = df['observation_concept_id'].apply(str)
df['observation_type_concept_id'] = df['observation_type_concept_id'].apply(str)

In [68]:
def agg_observation_concept_id(x, important_features_set):
    return pd.Series(dict(
        observation_concept_id_list  = ', '.join(set(x.observation_concept_id).intersection(important_features_set)),
        observation_type_concept_id_list  = ', '.join(set(x.observation_type_concept_id))
        ))

In [69]:
agg_dict = {'person_id': 'max',
            'observation_date': 'min'}

rename_dict = {'observation_date': 'days_since_latest_observation'}

group_by_var = 'person_id'
date_var = 'observation_date'
important_features_set = set(important_observations)
apply_func = lambda x: agg_observation_concept_id(x, important_features_set)

df.observation_date = pd.to_datetime(df.observation_date, format='%Y-%m-%d')
observation_data = \
window_data(df, window_size, window_start, group_by_var, date_var, agg_dict, rename_dict, apply_func)

In [70]:
training_data = pd.merge(training_data, observation_data, on=['person_id', 'window_id'], how='left')
training_data.head()

Unnamed: 0,person_id,days_since_latest_visit,ethnicity_concept_id,race_concept_id,gender_concept_id,race_concept_name,number_of_visits,visit_concept_name,visit_duration,death_in_next_window,...,days_since_latest_procedure,procedure_concept_id_list,procedure_type_concept_id_list,days_since_latest_drug_exposure,total_quantity_of_drugs,drug_concept_id_list,drug_type_concept_id_list,days_since_latest_observation,observation_concept_id_list,observation_type_concept_id_list
0,5,37 days,,8557.0,8532,Native Hawaiian or Other Pacific Islander,2,0,0 days,0,...,NaT,,,137 days,10.0,,38000175.0,22 days,,38000282.0
1,8,80 days,38003564.0,8552.0,8532,Unknown,1,0,0 days,0,...,2 days,,38000269.0,NaT,,,,NaT,,
2,11,6 days,38003564.0,8552.0,8532,Unknown,9,2,2 days,0,...,106 days,,38000269.0,NaT,,,,6 days,,38000282.0
3,12,131 days,38003564.0,8527.0,8532,White,1,0,1 days,0,...,2 days,,38000269.0,9 days,30.0,,38000175.0,64 days,,38000282.0
4,13,34 days,38003564.0,8657.0,8507,American Indian or Alaska Native,4,1,0 days,0,...,8 days,,38000269.0,NaT,,,,121 days,,38000282.0


In [71]:
training_data.columns

Index(['person_id', 'days_since_latest_visit', 'ethnicity_concept_id',
       'race_concept_id', 'gender_concept_id', 'race_concept_name',
       'number_of_visits', 'visit_concept_name', 'visit_duration',
       'death_in_next_window', 'old', 'inpatient_visit_count',
       'outpatient_visit_count', 'er_visit_count', 'window_id',
       'days_since_latest_condition', 'condition_status_concept_id',
       'condition_concept_id_list', 'condition_type_concept_id_list',
       'days_since_latest_procedure', 'procedure_concept_id_list',
       'procedure_type_concept_id_list', 'days_since_latest_drug_exposure',
       'total_quantity_of_drugs', 'drug_concept_id_list',
       'drug_type_concept_id_list', 'days_since_latest_observation',
       'observation_concept_id_list', 'observation_type_concept_id_list'],
      dtype='object')

In [72]:
del observation_data

In [73]:
len(training_data)
training_data.shape 

(118781, 29)

In [74]:
# training_data.to_pickle("./training_data.pkl")

In [75]:
# pickle the training_data
import pickle
pickle.dump(training_data, open( "test_data.pkl", "wb" ))

Unroll _list columns

In [76]:
training_data = pickle.load( open( "test_data.pkl", "rb" ) )

In [77]:
# make a copy, preserve the original
train = training_data.copy()
col_num = train.shape[1]
train.shape

(118781, 29)

In [78]:
%%time
# unroll the _list columns and one-hot encode them
lists = [c for c in train.columns if '_list' in c]
for idx, row in train.iterrows():
    for l in lists:
        l_str = '_'.join(l.split('_')[:2])+'_'
        l_items = row[l]
        if isinstance(l_items, str):
            l_items = l_items.split(',')
            if isinstance(l_items, list) and l_items != ['']:
                for c in l_items:
                        train.loc[idx,l_str+str(c).strip()] = 1

CPU times: user 2min 40s, sys: 0 ns, total: 2min 40s
Wall time: 2min 40s


In [79]:
train[col_num:].fillna(0, inplace=True)
train.shape

(118781, 35)

In [80]:
train = train.drop(lists, axis=1)
train.shape

(118781, 27)

In [81]:
train.to_csv('test_all.csv', index=False)

In [82]:
train.columns

Index(['person_id', 'days_since_latest_visit', 'ethnicity_concept_id',
       'race_concept_id', 'gender_concept_id', 'race_concept_name',
       'number_of_visits', 'visit_concept_name', 'visit_duration',
       'death_in_next_window', 'old', 'inpatient_visit_count',
       'outpatient_visit_count', 'er_visit_count', 'window_id',
       'days_since_latest_condition', 'condition_status_concept_id',
       'days_since_latest_procedure', 'days_since_latest_drug_exposure',
       'total_quantity_of_drugs', 'days_since_latest_observation',
       'condition_type_38000230.0', 'condition_type_38000200.0',
       'drug_type_38000175.0', 'observation_type_38000282.0',
       'procedure_type_38000269.0', 'procedure_type_38000251.0'],
      dtype='object')