In [1]:
import numpy as np
import os
import pandas as pd
import time
import scipy.stats as stats
import matplotlib.pyplot as plt
from datetime import datetime
from tqdm import tqdm
from collections import Counter
from datetime import datetime
import sys
import gc
import pickle 
import math
from itertools import product

sys.path.append('../')
from preprocessing_utils import *

In [2]:

# Directory (relative to this notebook) that holds the concept-code mapping CSVs read below
codes_path = '../../codes_mappings/'
# When True, a later cell zero-fills any column listed in `list_mdcd_cols` that is missing from the output
validation_set = False

# how much data we are using to make predictions
forward_iterations = 14 # 3 years  (NOTE(review): 14 windows of 90 days is ~3.45 years -- confirm intended horizon)
backwards_iterations = 19 # full history: 65; median history: 19 (4.3 years)
days_per_iter = 90 # interval size (days) of each iteration window

# the censor date is set to cohort start date minus this many days
num_days_prediction = 90

# NOTE(review): `int_path` and `dataset_prefix` are not defined in this notebook; presumably they are
# provided by the star import from `preprocessing_utils` -- confirm.
# Load the pickled column names that are later passed to `drop_unshared_features`.
# pickle.load can execute arbitrary code, so this file must come from a trusted location.
with open(f'{int_path}/{dataset_prefix}du_snomed_colnames', "rb") as fp:   # unpickling
    data_columns = pickle.load(fp)

# Description
1. Load data and run checks on dates + duplicates + presence of schizophrenia information
2. Map data to the rolled up concepts where appropriate
3. Create a population dataframe with **sequence length** entries per patient. This should have the start date (inclusive) and end date (exclusive) for each subsequence
4. Limit to the date range for the given iteration for each person and call the function that creates the features
   - Conditions, procedures, and labs: counts per subsequence time
   - Medications: total days (end day-start day + 1 to account for single-day medications)
   - Visits: number of visits (frequency) and mean + summed length of stay (end day - start day to count overnights)
5. Populate time since psychosis
6. Fill in "blank" iterations

### Load Data
Also restrict to patients whose psychosis diagnosis date is at least `num_days_prediction` days (90 days) before the cohort start date

In [3]:
def _describe_population(pop):
    """Print row count, share of rows with `sz_flag` set, and number of distinct person_ids."""
    print(len(pop), pop['sz_flag'].sum()/len(pop), pop['person_id'].nunique(dropna=False))


# Population dataframe with the diagnosis / cohort dates parsed as datetimes
df_pop = pd.read_csv(
    f'{data_path}/population_2dx.csv',
    parse_dates=['psychosis_diagnosis_date', 'scz_diagnosis_date', 'cohort_start_date'],
)
_describe_population(df_pop)

# Keep only people whose psychosis diagnosis precedes cohort start by at least `num_days_prediction` days
days_psychosis_to_cohort = (df_pop['cohort_start_date'] - df_pop['psychosis_diagnosis_date']).dt.days
df_pop = df_pop.loc[days_psychosis_to_cohort >= num_days_prediction]
_describe_population(df_pop)

# Nothing dated after the censor date may be used as a feature
df_pop['censor_date'] = df_pop['cohort_start_date'] - pd.Timedelta(days=num_days_prediction)

# Attach each person's first visit date
count_visits = pd.read_csv(f'{int_path}/hcu_visit_counts.csv', parse_dates=['first_visit'])
first_visit_by_person = count_visits[['person_id', 'first_visit']]
df_pop = df_pop.merge(first_visit_by_person, how='left', on='person_id')

# For the 1-year dataset, exclude anyone who also appears in the 3-year population file
if dataset == 'mdcd_1yr':
    df_3yr_pop = pd.read_csv(f'{path}/raw_data_mdcd_3yrs/population.csv')
    print(len(df_pop))
    in_3yr_population = df_pop['person_id'].isin(df_3yr_pop['person_id'])
    df_pop = df_pop.loc[~in_3yr_population]
    print(len(df_pop))

107626 0.1280452678720755 107626
102739 0.11423120723386446 102739


In [4]:
def _load_censored(file_name, date_columns, censor_on):
    """Read one temporal CSV from `data_path` and pass it through `pre_censor_data`.

    `date_columns` are parsed as datetimes; `censor_on` is the date column handed to
    `pre_censor_data` together with `df_pop`.
    """
    frame = pd.read_csv(f'{data_path}/{file_name}', parse_dates=date_columns)
    return pre_censor_data(frame, df_pop, censor_on)


def _cap_at_censor(frame, end_col):
    """In place: overwrite `end_col` with `censor_date` on rows where it falls after `censor_date`."""
    ends_late = frame[end_col] > frame['censor_date']
    frame.loc[ends_late, end_col] = frame.loc[ends_late, 'censor_date']


# Visits: end dates are capped at the censor date
all_visits = _load_censored('temporal_visits.csv', ['cohort_start_date', 'visit_start_date', 'visit_end_date'], 'visit_start_date')
_cap_at_censor(all_visits, 'visit_end_date')
print('Duplicate Visits (should be True)', 'Unnamed: 0' not in all_visits.columns, all_visits['visit_occurrence_id'].is_unique)

# Medications: drug-era end dates are capped at the censor date
all_meds = _load_censored('temporal_medications.csv', ['cohort_start_date', 'drug_era_start_date', 'drug_era_end_date'], 'drug_era_start_date')
_cap_at_censor(all_meds, 'drug_era_end_date')

# Point-in-time events only have a single date column
all_conds = _load_censored('temporal_conditions.csv', ['cohort_start_date', 'condition_start_date'], 'condition_start_date')

all_procedures = _load_censored('temporal_procedures.csv', ['cohort_start_date', 'procedure_date'], 'procedure_date')

all_labs = _load_censored('temporal_labs.csv', ['cohort_start_date', 'measurement_date'], 'measurement_date')

Duplicate Visits (should be True) True True


In [5]:
# check for schizophrenia in fine-grained conditions
# Count condition rows whose concept id is one of the schizophrenia codes (expected: 0)
scz_codes = pd.read_csv(f'{codes_path}all_scz_codes.csv')
is_scz_condition = all_conds['condition_concept_id'].isin(scz_codes['standard_concept_id'])
print('Check granular presence of SCZ:', int(is_scz_condition.sum()))

Check granular presence of SCZ: 0


### Map data to other concepts

In [6]:
# Attach the rolled-up medication concept to every drug era
rolled_medications = pd.read_csv(f'{codes_path}rolled_medications.csv')
med_rollup = rolled_medications[['descendant_concept_id', 'rolled_concept_name', 'rolled_concept_id']]
all_meds = all_meds.merge(med_rollup, how='left', left_on='drug_concept_id', right_on='descendant_concept_id')

med_keep_cols = [
    'person_id', 'drug_era_id', 'drug_era_start_date', 'drug_era_end_date', 'cohort_start_date',
    'drug_concept_id', 'rolled_concept_name', 'drug_exposure_count', 'censor_date',
]
all_meds = all_meds[med_keep_cols].drop_duplicates()

# Drugs without a rolled-up concept fall back to their own concept id
med_unmapped = all_meds['rolled_concept_name'].isna()
all_meds.loc[med_unmapped, 'rolled_concept_name'] = all_meds.loc[med_unmapped, 'drug_concept_id']

# Suffix the feature name so medication features cannot collide with other feature types
list_med_concepts = [str(concept) + '_meds' for concept in all_meds['rolled_concept_name']]
all_meds['rolled_concept_name'] = list_med_concepts

med_key_cols = ['person_id', 'rolled_concept_name', 'drug_concept_id', 'drug_era_start_date', 'drug_era_end_date']
print('Duplicate Meds (Expect True)', 'Unnamed: 0' not in all_meds.columns, not all_meds.duplicated(subset=med_key_cols).any())

Duplicate Meds (Expect True) True True


In [7]:
# Attach the rolled-up (level 4) condition concept to every condition occurrence
rolled_conditions = pd.read_csv(f'{codes_path}rolled_conditions_level4.csv')
cond_rollup = rolled_conditions[['descendant_concept_id', 'rolled_concept_name', 'rolled_concept_id']]
all_conds = all_conds.merge(cond_rollup, how='left', left_on='condition_concept_id', right_on='descendant_concept_id')

cond_keep_cols = [
    'person_id', 'condition_occurrence_id', 'condition_start_date', 'condition_concept_id', 'concept_name',
    'rolled_concept_name', 'cohort_start_date', 'visit_occurrence_id', 'censor_date',
]
all_conds = all_conds[cond_keep_cols].drop_duplicates()

# Conditions without a rolled-up concept fall back to their own concept name
cond_unmapped = all_conds['rolled_concept_name'].isna()
all_conds.loc[cond_unmapped, 'rolled_concept_name'] = all_conds.loc[cond_unmapped, 'concept_name']

# Suffix the feature name so condition features cannot collide with other feature types
list_cond_concepts = [str(concept) + '_conds' for concept in all_conds['rolled_concept_name']]
all_conds['rolled_concept_name'] = list_cond_concepts

# now check in more granular conditions: print any feature name that mentions schizophrenia
for concept in list_cond_concepts:
    if 'schizo' in concept.lower():
        print(concept)

# check uniqueness
cond_key_cols = ['person_id', 'rolled_concept_name', 'condition_start_date', 'condition_occurrence_id']
print('Duplicate Conds (Expect True)', 'Unnamed: 0' not in all_conds.columns, not all_conds.duplicated(subset=cond_key_cols).any())

Duplicate Conds (Expect True) True True


In [8]:
# Attach the rolled-up (level 4) procedure concept to every procedure occurrence
rolled_procedures = pd.read_csv(f'{codes_path}rolled_procedures_level4.csv')
procedure_rollup = rolled_procedures[['descendant_concept_id', 'rolled_concept_name', 'rolled_concept_id']]
all_procedures = all_procedures.merge(procedure_rollup, how='left', left_on='procedure_concept_id', right_on='descendant_concept_id')

procedure_keep_cols = [
    'person_id', 'procedure_occurrence_id', 'procedure_date', 'procedure_concept_id', 'concept_name',
    'rolled_concept_name', 'cohort_start_date', 'censor_date',
]
all_procedures = all_procedures[procedure_keep_cols].drop_duplicates()

# Procedures without a rolled-up concept fall back to their own concept name
procedure_unmapped = all_procedures['rolled_concept_name'].isna()
all_procedures.loc[procedure_unmapped, 'rolled_concept_name'] = all_procedures.loc[procedure_unmapped, 'concept_name']

# Suffix the feature name so procedure features cannot collide with other feature types
list_procedure_concepts = [str(concept) + '_procedure' for concept in all_procedures['rolled_concept_name']]
all_procedures['rolled_concept_name'] = list_procedure_concepts

procedure_key_cols = ['person_id', 'rolled_concept_name', 'procedure_concept_id', 'procedure_date', 'procedure_occurrence_id']
print('Duplicate Procedures (Expect True)', 'Unnamed: 0' not in all_procedures.columns, not all_procedures.duplicated(subset=procedure_key_cols).any())

Duplicate Procedures (Expect True) True True


In [9]:
# Labs are not rolled up: the feature name is the concept name plus a '_lab' suffix
lab_concept_names = all_labs['concept_name'].astype(str)
all_labs['rolled_concept_name'] = lab_concept_names + '_lab'

lab_keep_cols = ['person_id', 'measurement_concept_id', 'measurement_date', 'measurement_id', 'rolled_concept_name', 'censor_date']
all_labs = all_labs[lab_keep_cols].drop_duplicates()

lab_key_cols = ['person_id', 'measurement_concept_id', 'measurement_date', 'measurement_id']
print('Duplicate Labs (Expect True)', 'Unnamed: 0' not in all_labs.columns, not all_labs.duplicated(subset=lab_key_cols).any())

Duplicate Labs (Expect True) True True


In [10]:
# Give each event table a common `start_date` column so the three can be stacked
event_date_columns = (
    (all_conds, 'condition_start_date'),
    (all_procedures, 'procedure_date'),
    (all_labs, 'measurement_date'),
)
for event_frame, event_date_col in event_date_columns:
    event_frame['start_date'] = event_frame[event_date_col].copy()
# drop the temporary references so they do not keep the large frames alive later
del event_date_columns, event_frame, event_date_col

stacked_cols = ['person_id', 'rolled_concept_name', 'cohort_start_date', 'start_date', 'censor_date']
all_cond_lab_pro = pd.concat([all_conds, all_procedures, all_labs])[stacked_cols]

### Only keep columns that we had in the data

In [11]:
# Row counts before and after filtering both tables through `drop_unshared_features` with `data_columns`
print(len(all_cond_lab_pro), len(all_meds))
all_cond_lab_pro, all_meds = (
    drop_unshared_features(feature_frame, 'rolled_concept_name', data_columns)
    for feature_frame in (all_cond_lab_pro, all_meds)
)
print(len(all_cond_lab_pro), len(all_meds))

138917603 39703694
133881232 39654646


### Check for Data Leakage: 
Minimum times should be at least 90 days and cohort start date should be same across all dfs

In [12]:
def _print_days_before_cohort_start(label, frame, date_col):
    """Print the min and max number of days between `date_col` and `cohort_start_date` in `frame`."""
    check = (frame['cohort_start_date'] - frame[date_col]).dt.days
    print(label, check.min(), check.max())


_print_days_before_cohort_start('Labs, conditions, procedures:', all_cond_lab_pro, 'start_date')

_print_days_before_cohort_start('Meds (Start of prescription):', all_meds, 'drug_era_start_date')
_print_days_before_cohort_start('Meds (End of prescription):', all_meds, 'drug_era_end_date')

_print_days_before_cohort_start('Visits (Start of visit):', all_visits, 'visit_start_date')
_print_days_before_cohort_start('Visits (End of visit):', all_visits, 'visit_end_date')

Labs, conditions, procedures: 90.0 5478.0
Meds (Start of prescription): 90 5478
Meds (End of prescription): 90 5478
Visits (Start of visit): 90 5478
Visits (End of visit): 90 5478


In [13]:
def _dates_by_source(date_col):
    """Return one row per person_id holding `date_col` as seen in df_pop, cond/lab/procedure, visits and meds."""
    wide = df_pop[['person_id', date_col]]
    sources = (
        (all_cond_lab_pro, ('_pop', '_cond')),
        (all_visits, ('_old1', '_visits')),
        (all_meds, ('_old4', '_meds')),
    )
    for source_frame, suffixes in sources:
        per_person_dates = source_frame[['person_id', date_col]].drop_duplicates()
        wide = wide.merge(per_person_dates, how='left', on='person_id', suffixes=suffixes)
    return wide.set_index('person_id')


# More than one distinct non-null date in a row means the sources disagree for that person
cohort_start_wide = _dates_by_source('cohort_start_date')
num_unique = cohort_start_wide.nunique(axis=1)
check_cohort_start = cohort_start_wide.T
print('Number of places where cohort start date doesnt align:',(num_unique>1).sum())

censor_wide = _dates_by_source('censor_date')
num_unique = censor_wide.nunique(axis=1)
check_censor = censor_wide.T
print('Number of places where censor date doesnt align:',(num_unique>1).sum())

Number of places where cohort start date doesnt align: 0
Number of places where censor date doesnt align: 0


### "Psychiatric" visits

In [14]:
# Visits that carry at least one mental-health condition are duplicated under an "_MH" visit type,
# so every visit counts once as "<type>_ALL" and psychiatric visits additionally as "<type>_MH".
mental_health_conds = pd.read_csv(f'{codes_path}/mental_disorder_descendants.csv')
mh_visits = all_conds.loc[all_conds['condition_concept_id'].isin(mental_health_conds['descendant_concept_id']), 'visit_occurrence_id']
# Explicit copy: the frame is modified below, and assigning into a bare .loc slice raises
# SettingWithCopyWarning (and is not guaranteed to leave `all_visits` untouched).
df_mh_visits = all_visits.loc[all_visits['visit_occurrence_id'].isin(mh_visits)].copy()

# NOTE: the "_MH" copy is taken before the "_ALL" suffix is applied, so it holds the raw visit type.
# Re-running this cell without reloading `all_visits` would stack suffixes.
all_visits['visit_concept_id'] = all_visits['visit_concept_id'].astype(str) + '_ALL'
df_mh_visits['visit_concept_id'] = df_mh_visits['visit_concept_id'].astype(str) + '_MH'
all_visits = pd.concat([all_visits, df_mh_visits], axis=0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_mh_visits['visit_concept_id'] = df_mh_visits['visit_concept_id'].astype(str) + '_MH'


In [15]:
# Row count of the visit table after the "_MH" copies were appended to the "_ALL" rows
len(all_visits)

49134238

# Start Processing Timedelta

In [16]:
# One row per (person, feature, date), ordered so consecutive rows of a feature are consecutive in time
cpl_key_cols = ['person_id', 'rolled_concept_name', 'start_date']
all_cond_lab_pro = all_cond_lab_pro[cpl_key_cols].drop_duplicates().sort_values(cpl_key_cols)

# Days since the previous occurrence of the same feature for the same person (NaN for the first one)
all_cond_lab_pro['timedelta'] = (
    all_cond_lab_pro
    .groupby(['person_id', 'rolled_concept_name'])['start_date']
    .diff()
    .dt.days
)

# Switch to the generic feature_* column names shared with visits and medications
all_cond_lab_pro.rename(columns={'rolled_concept_name': 'feature_name', 'start_date': 'feature_start_date'}, inplace=True)
list_unique_cpl = all_cond_lab_pro['feature_name'].unique().tolist()

all_cond_lab_pro.head(50)

Unnamed: 0,person_id,feature_name,feature_start_date,timedelta
1029,20000000000.0,Abdominal mass_conds,2017-06-30,
1051,20000000000.0,Abdominal mass_conds,2017-09-27,89.0
551,20000000000.0,Abdominal organ finding_conds,2013-12-18,
567,20000000000.0,Abdominal organ finding_conds,2014-01-08,21.0
570,20000000000.0,Abdominal organ finding_conds,2014-01-09,1.0
703,20000000000.0,Abdominal organ finding_conds,2015-04-04,450.0
1031,20000000000.0,Abdominal organ finding_conds,2017-06-30,818.0
1053,20000000000.0,Abdominal organ finding_conds,2017-09-27,89.0
613229,20000000000.0,Active or passive immunization_procedure,2016-02-29,
613230,20000000000.0,Active or passive immunization_procedure,2016-05-11,72.0


In [17]:
# Bring visits and medications onto the shared feature_name / feature_start_date / feature_end_date layout
visit_renames = {
    'visit_concept_id': 'feature_name',
    'visit_start_date': 'feature_start_date',
    'visit_end_date': 'feature_end_date',
}
all_visits = all_visits[['person_id', *visit_renames]].drop_duplicates()
all_visits.rename(columns=visit_renames, inplace=True)

med_renames = {
    'rolled_concept_name': 'feature_name',
    'drug_era_start_date': 'feature_start_date',
    'drug_era_end_date': 'feature_end_date',
}
all_meds = all_meds[['person_id', *med_renames]].drop_duplicates()
all_meds.rename(columns=med_renames, inplace=True)

# Interval-type features (with a start and an end date) are processed together
visit_meds = pd.concat([all_visits, all_meds])
list_unique_vm = visit_meds['feature_name'].unique().tolist()

visit_meds.head(30)

Unnamed: 0,person_id,feature_name,feature_start_date,feature_end_date
0,20000010000.0,581458_ALL,2008-12-03,2008-12-03
1,20000010000.0,9202_ALL,2009-02-06,2009-02-06
2,20000010000.0,9202_ALL,2009-03-18,2009-03-18
3,20000010000.0,9202_ALL,2009-03-23,2009-03-23
4,20000010000.0,9202_ALL,2009-05-09,2009-05-09
5,20000010000.0,9202_ALL,2009-08-31,2009-08-31
6,20000010000.0,9202_ALL,2009-09-16,2009-09-16
7,20000010000.0,9202_ALL,2010-01-06,2010-01-06
8,20000010000.0,581458_ALL,2010-03-04,2010-03-04
9,20000010000.0,9202_ALL,2010-03-26,2010-03-26


### Create the "iterative" population dataframe: 
- Start at the date of initial psychosis diagnosis, then go every `days_per_iter` days (90 days), cutting off at the censor date (if you go over the censor date, chop to the censor date).
- Practically, this means that start date 1 is first visit & end date 1 is psychosis; then start date 2 is psychosis dx date and end date 2 is psychosis dx + 90... 
- Then, starting at the date of psychosis, go back in `days_per_iter`-day increments for up to `backwards_iterations` iterations (19 iterations of 90 days). The earliest iteration (furthest away from psychosis date) should consist of all prior data, and if a person has less data than that pre-psychosis, they should have fewer early iterations.
- Note that start dates are inclusive and end dates are exclusive
- **KEEP IN MIND FOR LATER: WE WANT TO PAD AT THE BEGINNING, NOT AT THE END. So then we move each person to be aligned at the end**

In [18]:
df_pop = df_pop[['person_id', 'first_visit', 'cohort_start_date', 'psychosis_diagnosis_date', 'censor_date']]

iter_length = pd.Timedelta(days_per_iter, 'days')
nat = np.datetime64('NaT')

# Iteration 0: the `days_per_iter` days leading up to the psychosis diagnosis, capped at the censor date
df_pop['0_end'] = df_pop['psychosis_diagnosis_date']
df_pop['0_start'] = df_pop['psychosis_diagnosis_date'] - iter_length
print(len(df_pop))
zero_end_late = df_pop['0_end'] > df_pop['censor_date']
df_pop.loc[zero_end_late, '0_end'] = df_pop.loc[zero_end_late, 'censor_date']
print(len(df_pop))

# after the loops, remove people for whom 0_start-0_end > 0

# FORWARD LOOP: starting at psychosis dx, one window every `days_per_iter` days until the censor date
for count in range(1, forward_iterations+1):
    start_col, end_col = f'{count}_start', f'{count}_end'

    # each window starts where the previous one ended and spans `days_per_iter` days
    df_pop[start_col] = df_pop[f'{count-1}_end']
    df_pop[end_col] = df_pop[start_col] + iter_length

    # neither boundary may lie past the censor date
    start_late = df_pop[start_col] > df_pop['censor_date']
    df_pop.loc[start_late, start_col] = df_pop.loc[start_late, 'censor_date']
    end_late = df_pop[end_col] > df_pop['censor_date']
    df_pop.loc[end_late, end_col] = df_pop.loc[end_late, 'censor_date']

    # a window that would begin on the censor date does not exist: blank it out
    starts_at_censor = df_pop[start_col] == df_pop['censor_date']
    df_pop.loc[starts_at_censor, [start_col, end_col]] = [nat, nat]

# BACKWARD LOOP: walk back from iteration 0 in `days_per_iter`-day windows until the first visit
for count in range(-1, -backwards_iterations, -1):
    start_col, end_col = f'{count}_start', f'{count}_end'

    # each window ends where the following (later) one starts
    df_pop[end_col] = df_pop[f'{count+1}_start']
    df_pop[start_col] = df_pop[end_col] - iter_length

    # neither boundary may lie before the first visit
    start_early = df_pop[start_col] < df_pop['first_visit']
    df_pop.loc[start_early, start_col] = df_pop.loc[start_early, 'first_visit']
    end_early = df_pop[end_col] < df_pop['first_visit']
    df_pop.loc[end_early, end_col] = df_pop.loc[end_early, 'first_visit']

    # a window that would end on the first visit date does not exist: blank it out
    ends_at_first_visit = df_pop[end_col] == df_pop['first_visit']
    df_pop.loc[ends_at_first_visit, [start_col, end_col]] = [nat, nat]

    df_pop[end_col] = pd.to_datetime(df_pop[end_col], format = '%Y-%m-%d')

# the earliest backward window absorbs all remaining history: where it exists, start it at the first visit
earliest_start_col = f'{-backwards_iterations + 1}_start'
has_earliest_window = df_pop[earliest_start_col].notna()
df_pop.loc[has_earliest_window, earliest_start_col] = df_pop.loc[has_earliest_window, 'first_visit']
print(df_pop[earliest_start_col].isna().sum()/len(df_pop)) # check that this is ~50% or 1

102739
102739
0.5115778818170315


In [19]:
# Columns of df_pop in which every value is missing
remove_cols = [col for col in df_pop.columns if df_pop[col].isna().all()]
print(remove_cols)

[]


In [20]:
# Make sure every "timestep" spans at least 1 day: a window whose start equals its end is blanked (NaT).
# (Clipping to the censor date is done where the windows are built; this cell does not re-check it.)
# The stop is forward_iterations + 1 because np.arange excludes its stop value and the windows were
# created for iterations up to and including `forward_iterations`.
for i in np.arange(-1*backwards_iterations+1, forward_iterations+1):
    start_col, end_col = str(i)+'_start', str(i)+'_end'
    zero_length = df_pop[end_col] == df_pop[start_col]
    df_pop.loc[zero_length, [start_col, end_col]] = [np.datetime64('NaT'), np.datetime64('NaT')]

# Drop the all-NaN columns found earlier; errors='ignore' keeps the cell safe to re-run
# after the columns have already been removed.
df_pop.drop(remove_cols, axis=1, inplace=True, errors='ignore')

### Loop through the iterative population dataframe to get the features for each person in each iteration

In [21]:
# Iteration numbers, recovered from the "<n>_end" column names of df_pop
iter_cols = [int(col.split('_')[0]) for col in df_pop.columns if '_end' in col]
print(iter_cols)

# Every table should cover the same number of people
n_people_visit_meds = visit_meds['person_id'].nunique(dropna=False)
n_people_cond_lab_pro = all_cond_lab_pro['person_id'].nunique(dropna=False)
print(n_people_visit_meds, len(df_pop), n_people_cond_lab_pro)

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, -1, -2, -3, -4, -5, -6, -7, -8, -9, -10, -11, -12, -13, -14, -15, -16, -17, -18]
102739 102739 102739


In [22]:
# For every iteration window, build one row per person holding, for each feature, the number of days
# between the most recent pre-window occurrence and the first occurrence inside the window (or the
# window end when the feature does not occur inside it). One dataframe per iteration is collected.
list_timedeltas = []
# NOTE(review): np.arange excludes its stop value, so the largest iteration in `iter_cols` is never
# processed here even though its <n>_start/<n>_end columns exist -- confirm this is intended and
# consistent with any companion feature file that must align row-for-row with this output.
for iteration in np.arange(np.min(iter_cols), np.max(iter_cols), 1): 
    temp_df_iter_pop = df_pop.copy()
    temp_df_iter_pop['iter_start_date'] = temp_df_iter_pop[str(iteration)+'_start']
    temp_df_iter_pop['iter_end_date'] = temp_df_iter_pop[str(iteration)+'_end']

    # constrict to people with a valid iteration
    temp_df_iter_pop = temp_df_iter_pop.loc[~(temp_df_iter_pop['iter_start_date'].isna())]
    # NOTE(review): `years_obs` is not read anywhere else in this cell
    temp_df_iter_pop['years_obs'] = (temp_df_iter_pop['iter_end_date']-temp_df_iter_pop['iter_start_date']).dt.days/365

    temp_df_iter_pop['iteration'] = iteration
    # (person_id, iteration) pairs that must each end up as one output row
    all_rows = temp_df_iter_pop[['person_id', 'iteration']]
    
    # CONDITIONS LABS PROCEDURES
    # for conditions, labs, procedures, just compare the start_date to the cutoff date
    # within-iteration events: iter_start_date <= feature_start_date < iter_end_date
    within_iter_cond_pro_labs = all_cond_lab_pro.loc[all_cond_lab_pro['person_id'].isin(temp_df_iter_pop['person_id'])]
    within_iter_cond_pro_labs = within_iter_cond_pro_labs.merge(temp_df_iter_pop[['person_id','iter_start_date', 'iter_end_date']], how = 'left', left_on = 'person_id', right_on = 'person_id')
    within_iter_cond_pro_labs = within_iter_cond_pro_labs.loc[within_iter_cond_pro_labs['feature_start_date']>= within_iter_cond_pro_labs['iter_start_date']]
    within_iter_cond_pro_labs = within_iter_cond_pro_labs.loc[within_iter_cond_pro_labs['feature_start_date']< within_iter_cond_pro_labs['iter_end_date']]

    # pre-iteration events: feature_start_date < iter_start_date
    pre_iter_cond_pro_labs = all_cond_lab_pro.loc[all_cond_lab_pro['person_id'].isin(temp_df_iter_pop['person_id'])]
    pre_iter_cond_pro_labs = pre_iter_cond_pro_labs.merge(temp_df_iter_pop[['person_id','iter_start_date', 'iter_end_date']], how = 'left', left_on = 'person_id', right_on = 'person_id')
    pre_iter_cond_pro_labs = pre_iter_cond_pro_labs.loc[pre_iter_cond_pro_labs['feature_start_date']< pre_iter_cond_pro_labs['iter_start_date']]

    # sanity checks on the two filters above; they only print, they do not stop the run
    if len(within_iter_cond_pro_labs.loc[within_iter_cond_pro_labs['feature_start_date']>=within_iter_cond_pro_labs['iter_end_date']])+len(within_iter_cond_pro_labs.loc[within_iter_cond_pro_labs['feature_start_date']<within_iter_cond_pro_labs['iter_start_date']]) > 0:
        print('Leakage in conds/labs/procedures')
    if len(pre_iter_cond_pro_labs.loc[pre_iter_cond_pro_labs['feature_start_date']>pre_iter_cond_pro_labs['iter_start_date']]) > 0:
        print('Leakage in conds/labs/procedures')

    # Build every (person, feature) combination for the people valid in this iteration.
    # This is a full cartesian product held in memory (people x features rows).
    combinations = list(product(list(temp_df_iter_pop['person_id'].unique()), list_unique_cpl))
    full_pt_feature_df = pd.DataFrame(columns = ['person_id', 'feature_name'], data = combinations)

    # IF SOMETHING EXISTS IN THE ITERATION: GET IT
    # (sort so that groupby().first() returns the earliest in-window occurrence per person/feature)
    within_iter_cond_pro_labs = within_iter_cond_pro_labs.sort_values(['person_id', 'feature_name', 'feature_start_date'])
    within_iter_cond_pro_labs = within_iter_cond_pro_labs.groupby(['person_id', 'feature_name']).first().reset_index()
    full_pt_feature_df = full_pt_feature_df.merge(within_iter_cond_pro_labs[['person_id', 'feature_name', 'feature_start_date']], how='outer', on=['person_id', 'feature_name'])
    # ELSE put in the last date in the timestep
    full_pt_feature_df = full_pt_feature_df.merge(temp_df_iter_pop[['person_id', 'iter_end_date']], how='outer', on='person_id')
    full_pt_feature_df.loc[full_pt_feature_df['feature_start_date'].isna(), 'feature_start_date'] = full_pt_feature_df.loc[full_pt_feature_df['feature_start_date'].isna(), 'iter_end_date']

    # Next get the most recent occurrence of a pre-timestep value
    pre_iter_cond_pro_labs = pre_iter_cond_pro_labs.groupby(['person_id', 'feature_name']).max()['feature_start_date'].reset_index()
    pre_iter_cond_pro_labs.rename({'feature_start_date':'pre_iter_feature_date'}, axis=1, inplace=True)
    # inner merge: only person/feature pairs that have a pre-iteration occurrence are kept
    full_pt_feature_df = full_pt_feature_df.merge(pre_iter_cond_pro_labs, how='inner', on = ['person_id', 'feature_name'])
    full_pt_feature_df['time_since_last_feat'] = (full_pt_feature_df['feature_start_date']-full_pt_feature_df['pre_iter_feature_date']).dt.days

    # Pivot the table to get person_id as rows and feature_name as columns with timedelta as values
    pivot_temp_cond_pro_labs = full_pt_feature_df.pivot(index='person_id', columns='feature_name', values='time_since_last_feat')
    pivot_temp_cond_pro_labs.columns.name = None  # Remove the pivot column name
    pivot_temp_cond_pro_labs = pivot_temp_cond_pro_labs.reset_index()  # Reset index to bring person_id back as a column
    # pairs removed by the inner merge are NaN in the pivot and are encoded as 0 here
    pivot_temp_cond_pro_labs.fillna(0, inplace=True)
    
    # VISITS AND MEDICATIONS: 
    """create a list of dfs that has patient, iteration and 
    per feature first and last engagement (first visit start date and 
    last visit end date per feature)
    """
    # note: for meds and visits, replace iter_end_date with equal_end_date, which is the 
    # day before the actual end date since is the last day that we are allowing the visit to "equal"
    # ie visit < iter_end_date but visit <= equal_end_date
    
    
    #for medications and visits, we want to look at 
    #1. med start date needs to be before iteration end date
    #2. med end date needs to be on or after iteration start date
    
    #3. if med start date is before iteration start date -- make med start date iteration start date
    #4. if med end date is after iteration end date -- make med end date iteration end date

    # get all the minimum visit/med start dates
    temp_visit_meds = visit_meds.loc[visit_meds['person_id'].isin(temp_df_iter_pop['person_id'])]
    temp_visit_meds = temp_visit_meds.merge(temp_df_iter_pop[['person_id','iter_start_date', 'iter_end_date']], how = 'left', left_on = 'person_id', right_on = 'person_id')
    temp_visit_meds['equal_end_date'] = temp_visit_meds['iter_end_date']-pd.Timedelta(1, 'days')

    # keep intervals overlapping the window, then clip them to [iter_start_date, equal_end_date]
    temp_visit_meds = temp_visit_meds.loc[(temp_visit_meds['feature_start_date']<temp_visit_meds['iter_end_date'])&(temp_visit_meds['feature_end_date']>=temp_visit_meds['iter_start_date'])]
    temp_visit_meds.loc[temp_visit_meds['feature_start_date']<temp_visit_meds['iter_start_date'], 'feature_start_date'] = temp_visit_meds.loc[temp_visit_meds['feature_start_date']<temp_visit_meds['iter_start_date'], 'iter_start_date']
    temp_visit_meds.loc[temp_visit_meds['feature_end_date']>temp_visit_meds['equal_end_date'], 'feature_end_date'] = temp_visit_meds.loc[temp_visit_meds['feature_end_date']>temp_visit_meds['equal_end_date'], 'equal_end_date']

    # sanity checks on the clipping above; they only print, they do not stop the run
    if len(temp_visit_meds.loc[temp_visit_meds['feature_start_date']>temp_visit_meds['iter_end_date']])+len(temp_visit_meds.loc[temp_visit_meds['feature_end_date']>temp_visit_meds['iter_end_date']]) > 0:
        print('Leakage in visit/med ends')
    if len(temp_visit_meds.loc[temp_visit_meds['feature_end_date']<temp_visit_meds['iter_start_date']])+len(temp_visit_meds.loc[temp_visit_meds['feature_start_date']<temp_visit_meds['iter_start_date']]) > 0:
        print('Leakage in visit/med starts')

    # earliest clipped start and latest clipped end per person/feature inside the window
    temp_visit_meds = temp_visit_meds.groupby(['person_id', 'feature_name']).agg({'feature_start_date':'min', 'feature_end_date':'max'}).reset_index()

    # every (person, visit/med feature) combination for the people valid in this iteration
    combinations = list(product(list(temp_df_iter_pop['person_id'].unique()), list_unique_vm))
    full_pt_vm_df = pd.DataFrame(columns = ['person_id', 'feature_name'], data = combinations)
    temp_visit_meds['feature_name'] = temp_visit_meds['feature_name'].astype('object')
    full_pt_vm_df = full_pt_vm_df.merge(temp_visit_meds[['person_id', 'feature_name', 'feature_start_date']], how='outer', on=['person_id', 'feature_name'])
    # ELSE put in the last date in the timestep
    full_pt_vm_df = full_pt_vm_df.merge(temp_df_iter_pop[['person_id', 'iter_end_date']], how='outer', on='person_id')
    full_pt_vm_df.loc[full_pt_vm_df['feature_start_date'].isna(), 'feature_start_date'] = full_pt_vm_df.loc[full_pt_vm_df['feature_start_date'].isna(), 'iter_end_date']

    # get all the pre-iteration visit/med start dates
    pre_visit_meds = visit_meds.loc[visit_meds['person_id'].isin(temp_df_iter_pop['person_id'])]
    pre_visit_meds = pre_visit_meds.merge(temp_df_iter_pop[['person_id','iter_start_date', 'iter_end_date']], how = 'left', left_on = 'person_id', right_on = 'person_id')
    # here equal_end_date is the day before the window starts
    pre_visit_meds['equal_end_date'] = pre_visit_meds['iter_start_date']-pd.Timedelta(1, 'days')

    # intervals that started before the window; their end is capped at the day before the window start
    pre_visit_meds = pre_visit_meds.loc[(pre_visit_meds['feature_start_date']<pre_visit_meds['iter_start_date'])]
    pre_visit_meds.loc[pre_visit_meds['feature_end_date']>pre_visit_meds['equal_end_date'], 'feature_end_date'] = pre_visit_meds.loc[pre_visit_meds['feature_end_date']>pre_visit_meds['equal_end_date'], 'equal_end_date']

    if len(pre_visit_meds.loc[pre_visit_meds['feature_start_date']>pre_visit_meds['iter_start_date']])+len(pre_visit_meds.loc[pre_visit_meds['feature_end_date']>pre_visit_meds['iter_start_date']]) > 0:
        print('Leakage in visit/med pre-iter')

    # latest (capped) pre-window end date per person/feature
    pre_visit_meds = pre_visit_meds.groupby(['person_id', 'feature_name']).agg({'feature_end_date':'max'}).reset_index()
    pre_visit_meds['feature_name'] = pre_visit_meds['feature_name'].astype('object')
    # inner merge: only person/feature pairs with a pre-window interval are kept
    full_pt_vm_df = full_pt_vm_df.merge(pre_visit_meds, how='inner', on=['person_id', 'feature_name'])

    # days from the latest pre-window end to the first in-window start (or to iter_end_date if none)
    full_pt_vm_df['time_since_last_feat'] = (full_pt_vm_df['feature_start_date']-full_pt_vm_df['feature_end_date']).dt.days

    # Pivot the table to get person_id as rows and feature_name as columns with timedelta as values
    pivot_temp_meds_visits = full_pt_vm_df.pivot(index='person_id', columns='feature_name', values='time_since_last_feat')
    pivot_temp_meds_visits.columns.name = None  # Remove the pivot column name
    pivot_temp_meds_visits = pivot_temp_meds_visits.reset_index()  # Reset index to bring person_id back as a column
    pivot_temp_meds_visits.fillna(0, inplace=True)

    # combine both feature families into one wide row per person and tag the iteration
    full_pivot_table = pivot_temp_cond_pro_labs.merge(pivot_temp_meds_visits, how='outer', on='person_id')
    full_pivot_table['iteration'] = iteration
    print(iteration)
        
    # right merge so that every person valid in this iteration gets a row, even with no features
    full_pivot_table = full_pivot_table.merge(all_rows, how = 'right', on = ['person_id', 'iteration'])
    full_pivot_table.fillna(0, inplace=True)

    if len(full_pivot_table) != len(temp_df_iter_pop):
        print('Mismatched patient size in iteration', iteration)
    
    list_timedeltas.append(full_pivot_table)

-18
-17
-16
-15
-14
-12
-11
-10
-9
-8
-7
-6
-5
-4
-3
-2
-1
0
1
2
3
4
5
6
7
8
9
10
11
12
13


In [23]:
# Stack the per-iteration tables row-wise; columns absent from an iteration's table become NaN
# for that iteration's rows. The original per-table index values are kept (not reset).
all_timedeltas = pd.concat(list_timedeltas)

In [24]:
# Release the large intermediates that are no longer needed, then force a garbage-collection pass
del list_timedeltas, all_visits, all_conds, all_meds, all_cond_lab_pro, visit_meds

gc.collect()

0

In [25]:
# For a validation set, add any expected column that is absent from the output as an all-zero column.
# NOTE(review): `list_mdcd_cols` is not defined anywhere in this notebook -- presumably it has to be
# provided before setting validation_set = True; confirm.
if validation_set:
    expected_cols = set(list_mdcd_cols)
    missing_cols = list(expected_cols.difference(all_timedeltas.columns))
    all_timedeltas.loc[:,missing_cols] = 0

In [26]:
# Replace the remaining NaNs with 0, then confirm: no NaNs left, no missing and no zero person_id
all_timedeltas.fillna(0, inplace=True)
print(all_timedeltas.isna().sum().sum())

output_person_ids = all_timedeltas['person_id']
print(int(output_person_ids.isna().sum()))
print(int((output_person_ids == 0).sum()))

0
0
0


In [27]:
# Years elapsed since the psychosis diagnosis at each iteration; iterations at or before the
# diagnosis (iteration <= 0) get 0 because negative iteration numbers are clipped to 0 first.
iteration_from_dx = all_timedeltas['iteration'].clip(lower=0)
all_timedeltas['time_since_psychosis'] = iteration_from_dx * days_per_iter/365

# check that time_since_psychosis is correct
is_pre_psychosis = all_timedeltas['iteration'] <= 0
is_post_psychosis = all_timedeltas['iteration'] > 0
print(all_timedeltas['time_since_psychosis'].unique())
print('\n\nPre-psychosis tsp', all_timedeltas.loc[is_pre_psychosis, 'time_since_psychosis'].unique())
print('\n\nPost-psychosis tsp', all_timedeltas.loc[is_post_psychosis, 'time_since_psychosis'].unique())

[0.         0.24657534 0.49315068 0.73972603 0.98630137 1.23287671
 1.47945205 1.7260274  1.97260274 2.21917808 2.46575342 2.71232877
 2.95890411 3.20547945]


Pre-psychosis tsp [0.]


Post-psychosis tsp [0.24657534 0.49315068 0.73972603 0.98630137 1.23287671 1.47945205
 1.7260274  1.97260274 2.21917808 2.46575342 2.71232877 2.95890411
 3.20547945]


  all_timedeltas['time_since_psychosis'] = all_timedeltas['iteration'] * days_per_iter/365


In [28]:
# (rows, columns) of the final table: one row per person per processed iteration
print(all_timedeltas.shape)

(2375123, 1567)


In [29]:
# Write the final table to `int_path` as CSV without the index column
all_timedeltas.to_csv(f'{int_path}/{dataset_prefix}timedeltas.csv', index=False)