In [1]:
import psycopg2 as pg
import pandas as pd
import pandas.io.sql as psql
from datetime import timedelta

import warnings
warnings.filterwarnings('ignore')

Connect to the database and fetch the person_visit_death_with_concepts table

In [2]:
def fetch_data(dbname, user, table, password=None, host='localhost'):
    """Load an entire PostgreSQL table into a DataFrame.

    Parameters
    ----------
    dbname : str
        Name of the database to connect to.
    user : str
        Database role used for the connection.
    table : str
        Table to read. It is concatenated verbatim into the SQL text, so
        only pass trusted identifiers.
    password : str, optional
        Password for ``user``. When omitted no password is sent explicitly
        and the driver's own lookup (``PGPASSWORD`` / ``~/.pgpass``) applies.
        Previously this function read an undefined global ``password`` and
        raised ``NameError`` on a fresh kernel.
    host : str, optional
        Server host name; defaults to ``'localhost'``.

    Returns
    -------
    pandas.DataFrame
        Every row and column of ``table``.
    """
    # Keyword arguments avoid hand-building a DSN string, which breaks when a
    # value (typically the password) contains quotes or spaces.
    conn_params = {'host': host, 'dbname': dbname, 'user': user}
    if password is not None:
        conn_params['password'] = password

    connection = pg.connect(**conn_params)
    try:
        return pd.read_sql_query('SELECT * FROM ' + table, con=connection)
    finally:
        # Always release the connection, even when the query raises.
        connection.close()

In [4]:
# Connection settings and the source table used throughout the notebook.
dbname, user, table = 'commondatamodel', 'romirmoza', 'person_visit_death_with_concepts'
df = fetch_data(dbname=dbname, user=user, table=table)

Convert dates to the correct datatype

In [5]:
# List the columns of the fetched table to see which fields are available.
df.columns

Index(['year_of_birth', 'ethnicity_concept_id', 'person_id', 'month_of_birth',
       'day_of_birth', 'race_concept_id', 'gender_concept_id',
       'birth_datetime', 'visit_start_date', 'preceding_visit_occurrence_id',
       'visit_occurrence_id', 'visit_end_date', 'visit_concept_id',
       'visit_type_concept_id', 'discharge_to_concept_id', 'race_concept_name',
       'visit_concept_name', 'death_date', 'death_datetime',
       'death_type_concept_id'],
      dtype='object')

In [6]:
# Parse the ISO-formatted (YYYY-MM-DD) date columns into datetime values so
# that they can be compared and subtracted.
date_columns = ['visit_start_date', 'visit_end_date', 'death_date']
df[date_columns] = df[date_columns].apply(pd.to_datetime, format='%Y-%m-%d')

Add the visit_duration column

In [7]:
# Length of each visit as a timedelta (end date minus start date).
df['visit_duration'] = df['visit_end_date'].sub(df['visit_start_date'])

In [8]:
# Latest and earliest visit start dates in the data set.
visit_starts = df['visit_start_date']
max_visit_start_date = visit_starts.max()
min_visit_start_date = visit_starts.min()
print(max_visit_start_date, min_visit_start_date, sep='\n')

2010-05-25 00:00:00
2007-12-23 00:00:00


In [9]:
def check_death_flag(x, window_size):
    """Flag whether death occurs within ``window_size`` of the visit start.

    Parameters
    ----------
    x : row-like
        Object exposing ``death_date`` and ``visit_start_date`` attributes.
    window_size : timedelta
        Exclusive upper bound on the time from visit start to death.

    Returns
    -------
    int
        1 when ``0 <= death_date - visit_start_date < window_size``,
        otherwise 0 (including when the difference is not a valid duration).
    """
    time_to_death = x.death_date - x.visit_start_date
    died_within_window = timedelta(days=0) <= time_to_death < window_size
    return 1 if died_within_window else 0

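Generate windows of training data, with window_id as the window identifier. Each row aggregates one person's visits within one window. Its death_in_next_window field is 1 when the person dies within window_size (180 days) after the start of any of those visits, and 0 otherwise.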

In [10]:
# Split the visits into consecutive, non-overlapping windows of `window_size`
# starting at the earliest visit, and aggregate each window to one row per
# person. The per-window frames are collected in a list and concatenated once
# at the end instead of growing `training_data` inside the loop.
window_size = timedelta(days = 180)
window_start = min_visit_start_date
window_id = 0
window_frames = []

# `<=` (not `<`): if the latest visit date falls exactly on a window boundary
# it must still open a final window, otherwise those visits are dropped.
while window_start <= max_visit_start_date:
    window_end = window_start + window_size
    in_window = (df['visit_start_date'] >= window_start) & (df['visit_start_date'] < window_end)
    # Explicit copy so that adding a column below does not write into a slice of df.
    df_window = df[in_window].copy()

    # A window without visits contributes no rows but still consumes its
    # window_id so that ids keep tracking calendar position.
    if not df_window.empty:
        # 1 when the person dies within window_size after this visit's start.
        df_window['death_in_next_window'] = df_window.apply(
            lambda x: check_death_flag(x, window_size), axis=1)

        # person_id is the group key; it becomes the first column again via
        # reset_index() below.
        agg_df = df_window.groupby('person_id').agg({'year_of_birth': 'max',
                                                     'birth_datetime': 'max',
                                                     'visit_start_date': 'max',
                                                     'ethnicity_concept_id': 'max',
                                                     'race_concept_id': 'max',
                                                     'gender_concept_id': 'max',
                                                     'race_concept_name': 'max',
                                                     'visit_occurrence_id': 'nunique',
                                                     'visit_concept_name': 'count',
                                                     'visit_duration': 'sum',
                                                     'death_in_next_window': 'max'
                                                    }).rename(columns={'visit_occurrence_id': 'number_of_visits'})

        # Per-person counts of each visit type, computed column-wise rather
        # than with a Python-level apply per group.
        # NOTE(review): the labels must match visit_concept_name exactly;
        # er_visit_count was all zeros in the recorded run, so confirm the
        # emergency-room label used in the data.
        visit_type = df_window['visit_concept_name']
        visit_counts = pd.DataFrame({
            'inpatient_visit_count': visit_type == 'Inpatient Visit',
            'outpatient_visit_count': visit_type == 'Outpatient Visit',
            'er_visit_count': visit_type == 'Emergency Room Visit',
        }).astype('int64').groupby(df_window['person_id']).sum()

        agg_df = agg_df.join(visit_counts)
        agg_df['window_id'] = window_id
        # The original called reset_index(drop=True) and discarded the result;
        # here the reset is kept and person_id is restored as a column.
        window_frames.append(agg_df.reset_index())

    window_id += 1
    window_start += window_size

if not window_frames:
    raise ValueError('No visits found in df; cannot build training windows.')

training_data = pd.concat(window_frames, ignore_index=True)

In [11]:
# Check the dtype of every column in the assembled training data.
training_data.dtypes

person_id                           int64
year_of_birth                       int64
birth_datetime             datetime64[ns]
visit_start_date           datetime64[ns]
ethnicity_concept_id              float64
race_concept_id                   float64
gender_concept_id                   int64
race_concept_name                  object
number_of_visits                    int64
visit_concept_name                  int64
visit_duration            timedelta64[ns]
death_in_next_window                int64
inpatient_visit_count               int64
outpatient_visit_count              int64
er_visit_count                      int64
window_id                           int64
dtype: object

In [12]:
# Calendar year of each row's visit_start_date.
training_data.visit_start_date.dt.year

0       2008
1       2008
2       2008
3       2008
4       2008
        ... 
2962    2010
2963    2010
2964    2010
2965    2010
2966    2010
Name: visit_start_date, Length: 2967, dtype: int64

In [13]:
# Preview the first rows of the training data.
training_data.head()

Unnamed: 0,person_id,year_of_birth,birth_datetime,visit_start_date,ethnicity_concept_id,race_concept_id,gender_concept_id,race_concept_name,number_of_visits,visit_concept_name,visit_duration,death_in_next_window,inpatient_visit_count,outpatient_visit_count,er_visit_count,window_id
0,126,1932,NaT,2008-02-12,38003564.0,8552.0,8532,Unknown,1,0,0 days,1,0,0,0,0
1,1362,1937,NaT,2008-05-08,38003564.0,8557.0,8507,Native Hawaiian or Other Pacific Islander,1,0,0 days,0,0,0,0,0
2,2225,1938,NaT,2008-06-13,38003564.0,8515.0,8532,Asian,3,1,2 days,0,0,1,0,0
3,2249,1942,NaT,2008-02-16,,8516.0,8532,Black or African American,1,0,0 days,1,0,0,0,0
4,2580,1919,NaT,2008-03-13,,8557.0,8507,Native Hawaiian or Other Pacific Islander,2,1,0 days,0,0,1,0,0


In [14]:
# Approximate age in whole years at the row's visit_start_date (calendar-year
# difference only), then discard the birth fields it was derived from.
training_data = (
    training_data
    .assign(old=training_data['visit_start_date'].dt.year - training_data['year_of_birth'])
    .drop(columns=['year_of_birth', 'birth_datetime'])
)

In [15]:
# Spot-check all rows belonging to a single person (person_id 2225).
training_data[training_data.person_id == 2225]

Unnamed: 0,person_id,visit_start_date,ethnicity_concept_id,race_concept_id,gender_concept_id,race_concept_name,number_of_visits,visit_concept_name,visit_duration,death_in_next_window,inpatient_visit_count,outpatient_visit_count,er_visit_count,window_id,old
2,2225,2008-06-13,38003564.0,8515.0,8532,Asian,3,1,2 days,0,0,1,0,0,70
561,2225,2008-12-03,38003564.0,8515.0,8532,Asian,4,1,2 days,0,1,0,0,1,70
1193,2225,2009-02-27,38003564.0,8515.0,8532,Asian,1,0,0 days,0,0,0,0,2,71
1799,2225,2009-12-07,38003564.0,8515.0,8532,Asian,4,1,0 days,0,0,1,0,3,71
2423,2225,2010-05-17,38003564.0,8515.0,8532,Asian,2,0,0 days,0,0,0,0,4,72


In [16]:
# Distinct window identifiers present in the training data.
training_data.window_id.unique()

array([0, 1, 2, 3, 4])

In [17]:
# Distinct values of er_visit_count.
# NOTE(review): the recorded output is array([0]), i.e. no visit was counted
# as an emergency-room visit — confirm that the label compared against
# visit_concept_name matches the values actually present in the data.
training_data.er_visit_count.unique()

array([0])