In [1]:
import psycopg2 as pg
import pandas as pd
import pandas.io.sql as psql
from datetime import timedelta

import warnings
warnings.filterwarnings('ignore')

Connect to the database and fetch the person_visit_death_with_concepts table

In [2]:
def fetch_data(dbname, user, password, table):
    connection = pg.connect('host=localhost dbname='+dbname+' user='+user+' password=\''+password+'\'')
    df = pd.read_sql_query('SELECT * FROM '+ table,con=connection)
    connection.close()
    return df

In [3]:
dbname = 'commondatamodel'
user = 'romirmoza'
password = ''
table = 'person_visit_death_with_concepts'
df = fetch_data(dbname, user, password, table)

Convert dates to the correct datatype

In [4]:
df.columns

Index(['year_of_birth', 'ethnicity_concept_id', 'person_id', 'month_of_birth',
       'day_of_birth', 'race_concept_id', 'gender_concept_id',
       'birth_datetime', 'visit_start_date', 'preceding_visit_occurrence_id',
       'visit_occurrence_id', 'visit_end_date', 'visit_concept_id',
       'visit_type_concept_id', 'discharge_to_concept_id', 'race_concept_name',
       'visit_concept_name', 'death_date', 'death_datetime',
       'death_type_concept_id'],
      dtype='object')

In [5]:
df[['visit_start_date','visit_end_date', 'death_date']] = \
df[['visit_start_date','visit_end_date', 'death_date']].apply(pd.to_datetime, format='%Y-%m-%d')

Add visit_duration columns

In [6]:
df['visit_duration'] = df['visit_end_date'] - df['visit_start_date']

In [7]:
df['visit_end_date'] = df['visit_end_date'] if not 'NaT' else df['visit_start_date']

In [8]:
df.head()

Unnamed: 0,year_of_birth,ethnicity_concept_id,person_id,month_of_birth,day_of_birth,race_concept_id,gender_concept_id,birth_datetime,visit_start_date,preceding_visit_occurrence_id,...,visit_end_date,visit_concept_id,visit_type_concept_id,discharge_to_concept_id,race_concept_name,visit_concept_name,death_date,death_datetime,death_type_concept_id,visit_duration
0,1930,38003564.0,26570,5,1,8552.0,8532,NaT,2008-09-21,,...,2008-09-21,0.0,44818517.0,,Unknown,,2010-04-09,2010-04-09 04:04:58,38003565,NaT
1,1930,38003564.0,26570,5,1,8552.0,8532,NaT,2010-02-08,,...,2010-02-08,0.0,44818517.0,,Unknown,,2010-04-09,2010-04-09 04:04:58,38003565,NaT
2,1930,38003564.0,26570,5,1,8552.0,8532,NaT,2008-10-06,,...,2008-10-06,0.0,44818517.0,44814705.0,Unknown,,2010-04-09,2010-04-09 04:04:58,38003565,0 days
3,1930,38003564.0,26570,5,1,8552.0,8532,NaT,2009-02-27,,...,2009-02-27,9202.0,44818517.0,44814705.0,Unknown,Outpatient Visit,2010-04-09,2010-04-09 04:04:58,38003565,NaT
4,1930,38003564.0,26570,5,1,8552.0,8532,NaT,2009-06-03,,...,2009-06-03,0.0,44818517.0,44814705.0,Unknown,,2010-04-09,2010-04-09 04:04:58,38003565,0 days


In [9]:
max_visit_start_date =  df['visit_start_date'].max()
min_visit_start_date =  df['visit_start_date'].min()
print(max_visit_start_date)
print(min_visit_start_date)

2010-05-25 00:00:00
2007-12-23 00:00:00


In [10]:
def check_death_flag(x, window_size):
    if x.death_date - x.visit_start_date < window_size and x.death_date - x.visit_start_date >= timedelta(days = 0):
        return 1
    return 0

Generate windows of training data with window_id being the identifier. Every row has a death_in_next_window field that informs us whether the person dies in the next window.

In [11]:
def visit_types_count(x):
    return pd.Series(dict(
        inpatient_visit_count  = (x.visit_concept_name == 'Inpatient Visit').sum(),
        outpatient_visit_count = (x.visit_concept_name == 'Outpatient Visit').sum(),
        er_visit_count         = (x.visit_concept_name == 'Emergency Room Visit').sum()
        ))

In [12]:
def window_data(df, window_size, window_start, group_by_var, date_var, agg_dict, rename_dict, apply_func, calc_death=0):
    window_id = 0
    while window_start < max_visit_start_date:
        df_window = df[(df[date_var] >= window_start) & (df[date_var] < window_start + window_size)]
        if(calc_death):
            df_window['death_in_next_window'] = df_window.apply(lambda x: check_death_flag(x, window_size), axis=1)

        agg_df = df_window.groupby(group_by_var).agg(agg_dict).rename(columns=rename_dict)

        apply_cols = df_window.groupby(group_by_var).apply(lambda x: apply_func(x))    
        agg_df = agg_df.join(apply_cols)
        agg_df['window_id'] = window_id
        agg_df.reset_index(drop=True)
        if not window_id:
            windowed_data = agg_df.copy()
        else:
            windowed_data = pd.concat([windowed_data, agg_df], ignore_index=True)
        window_id += 1
        window_start += window_size
    return windowed_data

In [13]:
window_size = timedelta(days = 180)
window_start = min_visit_start_date

In [14]:
agg_dict = {'person_id': 'max',
            'year_of_birth': 'max',
            'birth_datetime': 'max',
            'visit_start_date': 'max',
            'ethnicity_concept_id': 'max',
            'race_concept_id': 'max',
            'gender_concept_id': 'max',
            'race_concept_name': 'max',
            'visit_occurrence_id': 'nunique',
            'visit_concept_name': 'count',
            'visit_duration': 'sum',
            'death_in_next_window': 'max'}

rename_dict = {'visit_occurrence_id': 'number_of_visits',
               'visit_start_date': 'latest_visit_start_date'}
group_by_var = 'person_id'
date_var = 'visit_start_date'
apply_func = visit_types_count

training_data = \
window_data(df, window_size, window_start, group_by_var, date_var, agg_dict, rename_dict, apply_func, 1)

In [15]:
training_data.head()

Unnamed: 0,person_id,year_of_birth,birth_datetime,latest_visit_start_date,ethnicity_concept_id,race_concept_id,gender_concept_id,race_concept_name,number_of_visits,visit_concept_name,visit_duration,death_in_next_window,inpatient_visit_count,outpatient_visit_count,er_visit_count,window_id
0,126,1932,NaT,2008-02-12,38003564.0,8552.0,8532,Unknown,1,0,0 days,1,0,0,0,0
1,1362,1937,NaT,2008-05-08,38003564.0,8557.0,8507,Native Hawaiian or Other Pacific Islander,1,0,0 days,0,0,0,0,0
2,2225,1938,NaT,2008-06-13,38003564.0,8515.0,8532,Asian,3,1,2 days,0,0,1,0,0
3,2249,1942,NaT,2008-02-16,,8516.0,8532,Black or African American,1,0,0 days,1,0,0,0,0
4,2580,1919,NaT,2008-03-13,,8557.0,8507,Native Hawaiian or Other Pacific Islander,2,1,0 days,0,0,1,0,0


In [16]:
training_data.dtypes

person_id                            int64
year_of_birth                        int64
birth_datetime              datetime64[ns]
latest_visit_start_date     datetime64[ns]
ethnicity_concept_id               float64
race_concept_id                    float64
gender_concept_id                    int64
race_concept_name                   object
number_of_visits                     int64
visit_concept_name                   int64
visit_duration             timedelta64[ns]
death_in_next_window                 int64
inpatient_visit_count                int64
outpatient_visit_count               int64
er_visit_count                       int64
window_id                            int64
dtype: object

In [17]:
training_data.latest_visit_start_date.dt.year

0       2008
1       2008
2       2008
3       2008
4       2008
        ... 
2962    2010
2963    2010
2964    2010
2965    2010
2966    2010
Name: latest_visit_start_date, Length: 2967, dtype: int64

In [18]:
training_data.head()

Unnamed: 0,person_id,year_of_birth,birth_datetime,latest_visit_start_date,ethnicity_concept_id,race_concept_id,gender_concept_id,race_concept_name,number_of_visits,visit_concept_name,visit_duration,death_in_next_window,inpatient_visit_count,outpatient_visit_count,er_visit_count,window_id
0,126,1932,NaT,2008-02-12,38003564.0,8552.0,8532,Unknown,1,0,0 days,1,0,0,0,0
1,1362,1937,NaT,2008-05-08,38003564.0,8557.0,8507,Native Hawaiian or Other Pacific Islander,1,0,0 days,0,0,0,0,0
2,2225,1938,NaT,2008-06-13,38003564.0,8515.0,8532,Asian,3,1,2 days,0,0,1,0,0
3,2249,1942,NaT,2008-02-16,,8516.0,8532,Black or African American,1,0,0 days,1,0,0,0,0
4,2580,1919,NaT,2008-03-13,,8557.0,8507,Native Hawaiian or Other Pacific Islander,2,1,0 days,0,0,1,0,0


In [19]:
training_data['old'] = training_data.latest_visit_start_date.dt.year - training_data.year_of_birth

training_data = training_data.drop(['year_of_birth', 'birth_datetime'], axis=1)

In [20]:
training_data[training_data.person_id == 2225]

Unnamed: 0,person_id,latest_visit_start_date,ethnicity_concept_id,race_concept_id,gender_concept_id,race_concept_name,number_of_visits,visit_concept_name,visit_duration,death_in_next_window,inpatient_visit_count,outpatient_visit_count,er_visit_count,window_id,old
2,2225,2008-06-13,38003564.0,8515.0,8532,Asian,3,1,2 days,0,0,1,0,0,70
561,2225,2008-12-03,38003564.0,8515.0,8532,Asian,4,1,2 days,0,1,0,0,1,70
1193,2225,2009-02-27,38003564.0,8515.0,8532,Asian,1,0,0 days,0,0,0,0,2,71
1799,2225,2009-12-07,38003564.0,8515.0,8532,Asian,4,1,0 days,0,0,1,0,3,71
2423,2225,2010-05-17,38003564.0,8515.0,8532,Asian,2,0,0 days,0,0,0,0,4,72


In [21]:
training_data.window_id.unique()

array([0, 1, 2, 3, 4])

In [22]:
training_data.er_visit_count.unique()

array([0])

### Merge with condition_occurrence

In [23]:
table = 'condition_occurrence'
df = fetch_data(dbname, user, password, table)

In [24]:
df['condition_end_date'] = df['condition_end_date'] if not 'NaT' else df['condition_start_date']
df = df.drop(['condition_start_datetime',
              'condition_end_datetime',
              'stop_reason',
              'provider_id',
             'visit_detail_id',
             'condition_source_value',
             'condition_status_source_value',
             'condition_source_concept_id'], axis=1, errors='ignore')

In [25]:
df['condition_concept_id'] = df['condition_concept_id'].apply(str)
df['condition_type_concept_id'] = df['condition_type_concept_id'].apply(str)
df['condition_status_concept_id'] = df['condition_status_concept_id'].apply(str)

In [26]:
df.head()

Unnamed: 0,condition_occurrence_id,person_id,condition_concept_id,condition_start_date,condition_end_date,condition_type_concept_id,condition_status_concept_id,visit_occurrence_id
0,8901459,6,72993,2009-08-05,2009-08-05,38000230,4230359.0,979697.0
1,8891634,6,313217,2009-08-15,2009-08-15,38000230,4230359.0,2279847.0
2,9765782,6,257011,2009-01-12,2009-01-12,38000230,,
3,13768748,6,201826,2009-05-15,2009-05-15,38000230,,
4,6882817,6,73553,2009-01-02,2009-01-02,38000230,4230359.0,


In [27]:
max_condition_start_date =  df['condition_start_date'].max()
min_condition_start_date =  df['condition_start_date'].min()
print(max_condition_start_date)
print(min_condition_start_date)

2010-05-25
2007-11-28


In [28]:
def agg_condition_concept_id(x):
    return pd.Series(dict(
        condition_concept_id_list  = ', '.join(set(x.condition_concept_id)),
        condition_type_concept_id_list  = ', '.join(set(x.condition_type_concept_id))
        ))

In [29]:
agg_dict = {'person_id': 'max',
            'condition_start_date': 'max',
            'condition_status_concept_id': 'max'}

rename_dict = {'condition_start_date': 'latest_condition_start_date'}

group_by_var = 'person_id'
date_var = 'condition_start_date'
apply_func = agg_condition_concept_id

cond_occur_data = \
window_data(df, window_size, window_start, group_by_var, date_var, agg_dict, rename_dict, apply_func)

In [30]:
cond_occur_data.head()

Unnamed: 0,person_id,latest_condition_start_date,condition_status_concept_id,condition_concept_id_list,condition_type_concept_id_list,window_id
0,6,2008-06-19,,"374009, 313217, 374915, 439620, 440005, 444382...","38000230, 38000200",0
1,7,2008-05-25,,"313217, 201826, 439297, 141932",38000230,0
2,8,2008-05-03,,"313217, 195195, 133810, 4188191",38000230,0
3,10,2008-04-15,,"4093672, 138994",38000230,0
4,16,2008-06-03,,372892,38000230,0


In [31]:
training_data = pd.merge(training_data, cond_occur_data, on=['person_id', 'window_id'], how='left')
training_data.head()

Unnamed: 0,person_id,latest_visit_start_date,ethnicity_concept_id,race_concept_id,gender_concept_id,race_concept_name,number_of_visits,visit_concept_name,visit_duration,death_in_next_window,inpatient_visit_count,outpatient_visit_count,er_visit_count,window_id,old,latest_condition_start_date,condition_status_concept_id,condition_concept_id_list,condition_type_concept_id_list
0,126,2008-02-12,38003564.0,8552.0,8532,Unknown,1,0,0 days,1,0,0,0,0,76,,,,
1,1362,2008-05-08,38003564.0,8557.0,8507,Native Hawaiian or Other Pacific Islander,1,0,0 days,0,0,0,0,0,71,2008-03-25,,73008,38000230.0
2,2225,2008-06-13,38003564.0,8515.0,8532,Asian,3,1,2 days,0,0,1,0,0,70,2008-06-17,,"140648, 4007453, 74132, 31317, 4117413, 379784...",38000230.0
3,2249,2008-02-16,,8516.0,8532,Black or African American,1,0,0 days,1,0,0,0,0,66,2008-04-14,,77670,38000230.0
4,2580,2008-03-13,,8557.0,8507,Native Hawaiian or Other Pacific Islander,2,1,0 days,0,0,1,0,0,89,,,,


In [32]:
del cond_occur_data

### Merge with procedure_occurrence

In [33]:
table = 'procedure_occurrence'
df = fetch_data(dbname, user, password, table)

In [34]:
df.head()

Unnamed: 0,procedure_occurrence_id,person_id,procedure_concept_id,procedure_date,procedure_datetime,procedure_type_concept_id,modifier_concept_id,quantity,provider_id,visit_occurrence_id,visit_detail_id,procedure_source_value,procedure_source_concept_id,modifier_source_value
0,11177430,0,4306780,2008-05-03,2008-05-03,38000269,,,,1435523.0,,V7231,44828600,
1,5077664,0,2005317,2009-08-28,2009-08-28,38000269,,,203540.0,462902.0,,7862,2005317,
2,7547623,0,2314269,2009-03-16,2009-03-16,38000269,,,,680782.0,,97012,2314269,
3,462612,0,2313881,2008-07-07,2008-07-07,38000269,,,113568.0,5484318.0,,93325,2313881,
4,4592206,0,2003090,2009-03-13,2009-03-13,38000269,,,,4011904.0,,496,2003090,


In [35]:
df = df.drop(['procedure_source_value',
              'procedure_source_concept_id',
              'modifier_source_value',
              'visit_detail_id',
              'provider_id',
              'quantity',
              'modifier_concept_id',
              'procedure_datetime'], axis=1, errors='ignore')

In [36]:
df['procedure_concept_id'] = df['procedure_concept_id'].apply(str)
df['procedure_type_concept_id'] = df['procedure_type_concept_id'].apply(str)

In [37]:
def agg_procedure_concept_id(x):
    return pd.Series(dict(
        procedure_concept_id_list  = ', '.join(set(x.procedure_concept_id)),
        procedure_type_concept_id_list  = ', '.join(set(x.procedure_type_concept_id))
        ))

In [38]:
agg_dict = {'person_id': 'max',
            'procedure_date': 'max'}

rename_dict = {'procedure_date': 'latest_procedure_date'}

group_by_var = 'person_id'
date_var = 'procedure_date'
apply_func = agg_procedure_concept_id

procedure_occur_data = \
window_data(df, window_size, window_start, group_by_var, date_var, agg_dict, rename_dict, apply_func)

In [39]:
training_data = pd.merge(training_data, procedure_occur_data, on=['person_id', 'window_id'], how='left')
training_data.head()

Unnamed: 0,person_id,latest_visit_start_date,ethnicity_concept_id,race_concept_id,gender_concept_id,race_concept_name,number_of_visits,visit_concept_name,visit_duration,death_in_next_window,...,er_visit_count,window_id,old,latest_condition_start_date,condition_status_concept_id,condition_concept_id_list,condition_type_concept_id_list,latest_procedure_date,procedure_concept_id_list,procedure_type_concept_id_list
0,126,2008-02-12,38003564.0,8552.0,8532,Unknown,1,0,0 days,1,...,0,0,76,,,,,2008-06-01,"2514408, 2100657, 2008332, 2313635, 2002290, 2...",38000269.0
1,1362,2008-05-08,38003564.0,8557.0,8507,Native Hawaiian or Other Pacific Islander,1,0,0 days,0,...,0,0,71,2008-03-25,,73008,38000230.0,2008-06-15,"2003877, 2211799, 2314286, 4147961, 2003879, 2...",38000269.0
2,2225,2008-06-13,38003564.0,8515.0,8532,Asian,3,1,2 days,0,...,0,0,70,2008-06-17,,"140648, 4007453, 74132, 31317, 4117413, 379784...",38000230.0,,,
3,2249,2008-02-16,,8516.0,8532,Black or African American,1,0,0 days,1,...,0,0,66,2008-04-14,,77670,38000230.0,2008-06-16,"2002290, 2002291, 2414397, 4141759",38000269.0
4,2580,2008-03-13,,8557.0,8507,Native Hawaiian or Other Pacific Islander,2,1,0 days,0,...,0,0,89,,,,,2008-06-19,"2005300, 2002291, 2108115, 2514434, 2314294, 2...",38000269.0


In [40]:
del procedure_occur_data

### Merge with drug_exposure

In [41]:
table = 'drug_exposure'
df = fetch_data(dbname, user, password, table)

In [42]:
df.head()

Unnamed: 0,drug_exposure_id,person_id,drug_concept_id,drug_exposure_start_date,drug_exposure_start_datetime,drug_exposure_end_date,drug_exposure_end_datetime,verbatim_end_date,drug_type_concept_id,stop_reason,...,sig,route_concept_id,lot_number,provider_id,visit_occurrence_id,visit_detail_id,drug_source_value,drug_source_concept_id,route_source_value,dose_unit_source_value
0,3744808,8,43560452,2010-02-18,2010-02-18 23:49:27,,,2010-11-21,38000175,,...,,,,,3000253.0,,54868094800,45054001,,
1,3866327,10,1363057,2009-09-22,2009-09-22 12:54:50,,,,38000175,,...,,,,,3886276.0,,61392065460,45056800,,
2,1084422,10,19078924,2008-08-25,2008-08-25 10:05:57,,,,38000175,,...,,,,,,,64679076203,44887586,,
3,4796682,10,19077498,2009-03-09,2009-03-09 14:22:34,,,,38000175,,...,,,,,,,66267023360,45194371,,
4,88654,10,1545997,2009-12-19,2009-12-19 23:35:16,,,2014-05-21,38000175,,...,,,,,,,58016025221,45072463,,


In [43]:
df = df.drop(['drug_exposure_start_datetime',
              'drug_exposure_end_date',
              'drug_exposure_end_datetime',
              'verbatim_end_date',
              'stop_reason',
              'lot_number',
              'provider_id',
              'visit_detail_id',
              'drug_source_value',
              'drug_source_concept_id',
              'route_source_value',
              'dose_unit_source_value',
              'route_concept_id',
              'sig',
              'refills',
              'days_supply'], axis=1, errors='ignore')

In [44]:
df['drug_concept_id'] = df['drug_concept_id'].apply(str)
df['drug_type_concept_id'] = df['drug_type_concept_id'].apply(str)

In [45]:
def agg_drug_concept_id(x):
    return pd.Series(dict(
        drug_concept_id_list  = ', '.join(set(x.drug_concept_id)),
        drug_type_concept_id_list  = ', '.join(set(x.drug_type_concept_id))
        ))

In [46]:
agg_dict = {'person_id': 'max',
            'drug_exposure_start_date': 'max',
            'quantity': 'sum'}

rename_dict = {'drug_exposure_start_date': 'latest_drug_exposure_start_date',
               'quantity': 'total_quantity_of_drugs'}

group_by_var = 'person_id'
date_var = 'drug_exposure_start_date'
apply_func = agg_drug_concept_id

drug_exposure_data = \
window_data(df, window_size, window_start, group_by_var, date_var, agg_dict, rename_dict, apply_func)

In [47]:
training_data = pd.merge(training_data, drug_exposure_data, on=['person_id', 'window_id'], how='left')
training_data.head()

Unnamed: 0,person_id,latest_visit_start_date,ethnicity_concept_id,race_concept_id,gender_concept_id,race_concept_name,number_of_visits,visit_concept_name,visit_duration,death_in_next_window,...,condition_status_concept_id,condition_concept_id_list,condition_type_concept_id_list,latest_procedure_date,procedure_concept_id_list,procedure_type_concept_id_list,latest_drug_exposure_start_date,total_quantity_of_drugs,drug_concept_id_list,drug_type_concept_id_list
0,126,2008-02-12,38003564.0,8552.0,8532,Unknown,1,0,0 days,1,...,,,,2008-06-01,"2514408, 2100657, 2008332, 2313635, 2002290, 2...",38000269.0,2008-05-01,120.0,"19098438, 40185305, 40221862",38000175.0
1,1362,2008-05-08,38003564.0,8557.0,8507,Native Hawaiian or Other Pacific Islander,1,0,0 days,0,...,,73008,38000230.0,2008-06-15,"2003877, 2211799, 2314286, 4147961, 2003879, 2...",38000269.0,,,,
2,2225,2008-06-13,38003564.0,8515.0,8532,Asian,3,1,2 days,0,...,,"140648, 4007453, 74132, 31317, 4117413, 379784...",38000230.0,,,,2008-05-01,30.0,"0, 40032317",38000175.0
3,2249,2008-02-16,,8516.0,8532,Black or African American,1,0,0 days,1,...,,77670,38000230.0,2008-06-16,"2002290, 2002291, 2414397, 4141759",38000269.0,,,,
4,2580,2008-03-13,,8557.0,8507,Native Hawaiian or Other Pacific Islander,2,1,0 days,0,...,,,,2008-06-19,"2005300, 2002291, 2108115, 2514434, 2314294, 2...",38000269.0,,,,


In [48]:
del drug_exposure_data

### Merge with observation

In [49]:
table = 'observation'
df = fetch_data(dbname, user, password, table)

In [50]:
df.head()

Unnamed: 0,observation_id,person_id,observation_concept_id,observation_date,observation_datetime,observation_type_concept_id,value_as_number,value_as_string,value_as_concept_id,qualifier_concept_id,...,provider_id,visit_occurrence_id,visit_detail_id,observation_source_value,observation_source_concept_id,unit_source_value,qualifier_source_value,observation_event_id,obs_event_field_concept_id,value_as_datetime
0,1802194,0,440927,2009-06-01,2009-06-01 07:35:03,38000282,,14-20,4069776.0,,...,46319.0,,,V5863,44830822.0,,,,,
1,583362,0,4015724,2009-09-27,2009-09-27 04:50:11,38000282,,1-3,,,...,274541.0,,,V0481,44837741.0,,,,,
2,1132254,0,439405,2008-05-23,2008-05-23 05:35:48,38000282,,48,,,...,270459.0,,,7197,44829217.0,,,,,
3,760346,0,440922,2008-12-24,2008-12-24 08:04:37,38000282,,.2,,,...,310343.0,,,V5867,,,,,,
4,1414625,0,2614666,2010-04-26,2010-04-26 12:09:46,38000282,,.1,4180790.0,,...,,,,A0425,,,,,,


#### Dont know what to do with the columns value_as_string, value_as_concept_id

In [51]:
df = df.drop(['observation_datetime',
              'value_as_number',
              'qualifier_concept_id',
              'visit_occurrence_id',
              'visit_detail_id',
              'observation_source_value',
              'observation_source_concept_id',
              'observation_event_id',
              'unit_source_value',
              'qualifier_source_value',
              'observation_event_id	',
              'obs_event_field_concept_id',
              'value_as_datetime',
              'unit_concept_id',
              'provider_id'], axis=1, errors='ignore')

In [52]:
df['observation_concept_id'] = df['observation_concept_id'].apply(str)
df['observation_type_concept_id'] = df['observation_type_concept_id'].apply(str)

In [53]:
def agg_observation_concept_id(x):
    return pd.Series(dict(
        observation_concept_id_list  = ', '.join(set(x.observation_concept_id)),
        observation_type_concept_id_list  = ', '.join(set(x.observation_type_concept_id))
        ))

In [54]:
agg_dict = {'person_id': 'max',
            'observation_date': 'max'}

rename_dict = {'observation_date': 'latest_observation_date'}

group_by_var = 'person_id'
date_var = 'observation_date'
apply_func = agg_observation_concept_id

observation_data = \
window_data(df, window_size, window_start, group_by_var, date_var, agg_dict, rename_dict, apply_func)

In [55]:
training_data = pd.merge(training_data, observation_data, on=['person_id', 'window_id'], how='left')
training_data.head()

Unnamed: 0,person_id,latest_visit_start_date,ethnicity_concept_id,race_concept_id,gender_concept_id,race_concept_name,number_of_visits,visit_concept_name,visit_duration,death_in_next_window,...,latest_procedure_date,procedure_concept_id_list,procedure_type_concept_id_list,latest_drug_exposure_start_date,total_quantity_of_drugs,drug_concept_id_list,drug_type_concept_id_list,latest_observation_date,observation_concept_id_list,observation_type_concept_id_list
0,126,2008-02-12,38003564.0,8552.0,8532,Unknown,1,0,0 days,1,...,2008-06-01,"2514408, 2100657, 2008332, 2313635, 2002290, 2...",38000269.0,2008-05-01,120.0,"19098438, 40185305, 40221862",38000175.0,2008-06-14,"440927, 2614669, 40481022, 4059467, 40479553, ...",38000282.0
1,1362,2008-05-08,38003564.0,8557.0,8507,Native Hawaiian or Other Pacific Islander,1,0,0 days,0,...,2008-06-15,"2003877, 2211799, 2314286, 4147961, 2003879, 2...",38000269.0,,,,,,,
2,2225,2008-06-13,38003564.0,8515.0,8532,Asian,3,1,2 days,0,...,,,,2008-05-01,30.0,"0, 40032317",38000175.0,,,
3,2249,2008-02-16,,8516.0,8532,Black or African American,1,0,0 days,1,...,2008-06-16,"2002290, 2002291, 2414397, 4141759",38000269.0,,,,,2008-03-24,4214956,38000282.0
4,2580,2008-03-13,,8557.0,8507,Native Hawaiian or Other Pacific Islander,2,1,0 days,0,...,2008-06-19,"2005300, 2002291, 2108115, 2514434, 2314294, 2...",38000269.0,,,,,,,


In [56]:
del observation_data