In [2]:
import psycopg2 as pg
import pandas as pd
import pandas.io.sql as psql
from datetime import timedelta

import warnings
warnings.filterwarnings('ignore')

Connect to the database and fetch the person_visit_death_with_concepts table

In [3]:
def fetch_data(dbname, user, table):
    """Read an entire database table into a pandas DataFrame.

    Parameters
    ----------
    dbname : str
        Name of the database, placed in the psycopg2 connection string.
    user : str
        Database user, placed in the psycopg2 connection string.
    table : str
        Name of the table to read; every row and column is selected.

    Returns
    -------
    pandas.DataFrame
        Result of ``SELECT * FROM <table>``.

    Notes
    -----
    The connection is closed even if the query raises, so a failed read
    does not leave a session open on the server.
    """
    connection = pg.connect('dbname='+dbname+' user='+user)
    try:
        # NOTE(review): `table` is concatenated straight into the SQL text, so
        # it must only ever come from trusted, hard-coded values.
        return pd.read_sql_query('SELECT * FROM '+ table,con=connection)
    finally:
        connection.close()

In [4]:
# Database, user and table to read; the whole table is loaded into `df`.
dbname, user, table = 'commondatamodel', 'romirmoza', 'person_visit_death_with_concepts'
df = fetch_data(dbname=dbname, user=user, table=table)

Convert dates to the correct datatype

In [6]:
# Parse each date column with the explicit '%Y-%m-%d' format, replacing it in place.
for date_column in ('visit_start_date', 'visit_end_date', 'death_date'):
    df[date_column] = pd.to_datetime(df[date_column], format='%Y-%m-%d')

Add a visit_duration column (visit_end_date minus visit_start_date)

In [7]:
# Length of each visit as a timedelta: end date minus start date.
df['visit_duration'] = df['visit_end_date'].sub(df['visit_start_date'])

In [8]:
# Earliest and latest visit start dates; they bound the windowing loop further down.
visit_starts = df['visit_start_date']
min_visit_start_date, max_visit_start_date = visit_starts.min(), visit_starts.max()
print(max_visit_start_date, min_visit_start_date, sep='\n')

2010-05-25 00:00:00
2007-12-23 00:00:00


In [9]:
def check_death_flag(x, window_size):
    """Return 1 if death falls within `window_size` after the visit start, else 0.

    `x` must expose `death_date` and `visit_start_date`. The flag is 1 only
    when 0 <= death_date - visit_start_date < window_size. A missing death
    date (NaT) fails both comparisons and therefore yields 0.
    """
    time_to_death = x.death_date - x.visit_start_date
    # Keep both checks positive: comparisons against NaT are always False,
    # which is what makes a missing death date come out as 0.
    not_before_visit = time_to_death >= timedelta(days = 0)
    inside_window = time_to_death < window_size
    return int(not_before_visit and inside_window)

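Generate windows of training data, each identified by a window_id. Visits are grouped into consecutive windows of window_size (180 days) by visit_start_date and aggregated to one row per person per window. Every row has a death_in_next_window field, which is 1 when the person's death_date falls within window_size days after the start of at least one of their visits in that window, and 0 otherwise.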

In [21]:
# Build the training set: walk a fixed-width window across the visit timeline
# and emit one row per person per window with demographics, visit statistics
# and the death flag.
#
# NOTE(review): check_death_flag marks death within `window_size` of a visit's
# start date, which is not literally "dies in the next window" - confirm that
# this is the intended label.
window_size = timedelta(days = 180)
window_start = min_visit_start_date
window_id = 0
window_frames = []  # per-window aggregates, concatenated once after the loop

# `<=` (not `<`) so visits on max_visit_start_date are still covered when the
# date span is an exact multiple of window_size.
while window_start <= max_visit_start_date:
    window_end = window_start + window_size
    in_window = (df['visit_start_date'] >= window_start) & (df['visit_start_date'] < window_end)
    # Copy the slice so the new column is added to an independent frame
    # rather than to a view of `df`.
    df_window = df[in_window].copy()

    # A window without visits contributes no rows; it still consumes a
    # window_id so ids keep matching the window's position in time.
    if not df_window.empty:
        df_window['death_in_next_window'] = df_window.apply(lambda x: check_death_flag(x, window_size), axis=1)

        agg_df = df_window.groupby('person_id').agg({'year_of_birth': 'max',
                                                     'ethnicity_concept_id': 'max',
                                                     'person_id': 'max',
                                                     'month_of_birth': 'max',
                                                     'day_of_birth': 'max',
                                                     'race_concept_id': 'max',
                                                     'gender_concept_id': 'max',
                                                     'birth_datetime': 'max',
                                                     'race_concept_name': 'max',
                                                     'visit_occurrence_id': 'nunique',
                                                     'visit_concept_name': 'count',
                                                     'visit_duration': 'sum',
                                                     'death_in_next_window': 'max'
                                                    }).rename(columns={'visit_occurrence_id': 'number_of_visits'})

        # Per-person counts of each visit type seen inside this window.
        visit_counts = df_window.groupby('person_id').apply(lambda x: pd.Series(dict(
            inpatient_visit_count  = (x.visit_concept_name == 'Inpatient Visit').sum(),
            outpatient_visit_count = (x.visit_concept_name == 'Outpatient Visit').sum(),
            er_visit_count         = (x.visit_concept_name == 'Emergency Room Visit').sum()
            )))
        agg_df = agg_df.join(visit_counts)
        agg_df['window_id'] = window_id
        window_frames.append(agg_df)

    window_id += 1
    window_start = window_end

# One concat at the end; ignore_index drops the person_id index (person_id is
# also kept as a regular column by the aggregation above).
training_data = pd.concat(window_frames, ignore_index=True) if window_frames else pd.DataFrame()

In [22]:
# Display the assembled per-person, per-window training set.
training_data

Unnamed: 0,year_of_birth,ethnicity_concept_id,person_id,month_of_birth,day_of_birth,race_concept_id,gender_concept_id,birth_datetime,race_concept_name,number_of_visits,visit_concept_name,visit_duration,death_in_next_window,inpatient_visit_count,outpatient_visit_count,er_visit_count,window_id
0,1932,38003564.0,126,6,1,8552.0,8532,NaT,Unknown,1,0,0 days,1,0,0,0,0
1,1937,38003564.0,1362,7,1,8557.0,8507,NaT,Native Hawaiian or Other Pacific Islander,1,0,0 days,0,0,0,0,0
2,1938,38003564.0,2225,6,1,8515.0,8532,NaT,Asian,3,1,2 days,0,0,1,0,0
3,1942,,2249,10,1,8516.0,8532,NaT,Black or African American,1,0,0 days,1,0,0,0,0
4,1919,,2580,1,1,8557.0,8507,NaT,Native Hawaiian or Other Pacific Islander,2,1,0 days,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2962,1924,38003564.0,151075,8,1,,8532,NaT,,1,0,0 days,1,0,0,0,4
2963,1926,38003564.0,151182,7,1,8516.0,8507,NaT,Black or African American,6,1,0 days,1,0,1,0,4
2964,1939,38003564.0,151393,9,1,8552.0,8532,NaT,Unknown,1,0,0 days,1,0,0,0,4
2965,1939,,151657,4,1,8527.0,8532,NaT,White,1,0,0 days,1,0,0,0,4
