In [None]:
import numpy as np
import os
import pandas as pd
import time
import scipy.stats as stats
import matplotlib.pyplot as plt
from datetime import datetime
from tqdm import tqdm
from collections import Counter
from datetime import datetime
import sys
import gc
import pickle 
import math

from preprocessing_utils import *

In [None]:
# NOTE(review): data_path, int_path, path, dataset, dataset_prefix and train_pids are
# used by later cells but are not assigned in this cell; presumably they come from
# `from preprocessing_utils import *` -- confirm.

# --- Run switches ---
# validation_set: when True, a previously pickled feature list is reused instead of
#   dropping rare features.
# remove_corr_features: when True, highly correlated feature columns are pruned.
validation_set, remove_corr_features = False, False

# Directory holding the concept-mapping CSVs read throughout the notebook.
codes_path = "../../codes_mappings/"

# --- Interval layout: how much data we are using to make predictions ---
days_per_iter = 90         # interval size, in days
forward_iterations = 13    # intervals after the psychosis diagnosis (3 years)
backwards_iterations = 14  # intervals before it; full history: 65; median history: 19 (4.3 years)

# Number of days between the censor date and the cohort start date.
num_days_prediction = 90

# Description
1. Load data and run checks on dates + duplicates + presence of schizophrenia information
2. Map data to the rolled up concepts where appropriate
3. Create a population dataframe with **sequence length** entries per patient. This should have the start date (inclusive) and end date (exclusive) for each subsequence
4. Limit to the date range for the given iteration for each person and call the function that creates the features
   - Conditions, procedures, and labs: counts per subsequence time
   - Medications: total days (end day-start day + 1 to account for single-day medications)
   - Visits: number of visits (frequency) and mean + summed length of stay (end day - start day to count overnights)
5. Populate time since psychosis
6. Fill in "blank" iterations

### Load Data
Also restrict to patients whose psychosis diagnosis falls at least `num_days_prediction` days (currently 90) before the cohort start date

In [None]:
# Read the cohort table; the three date columns are parsed to datetimes.
# NOTE(review): data_path / int_path / path / dataset are not defined in the visible
# cells -- presumably provided by preprocessing_utils; confirm.
df_pop = pd.read_csv(f'{data_path}/population_2dx.csv', parse_dates = ['psychosis_diagnosis_date', 'scz_diagnosis_date', 'cohort_start_date'])
print(len(df_pop), df_pop['sz_flag'].sum()/len(df_pop), len(df_pop['person_id'].unique()))

# Keep people whose psychosis diagnosis is at least num_days_prediction days before
# the cohort start date. The explicit .copy() makes the filtered frame independent,
# so adding 'censor_date' below is not a write to a slice (no SettingWithCopyWarning).
df_pop = df_pop.loc[(df_pop['cohort_start_date']-df_pop['psychosis_diagnosis_date']).dt.days >= num_days_prediction].copy()
print(len(df_pop), df_pop['sz_flag'].sum()/len(df_pop), len(df_pop['person_id'].unique()))

# Nothing dated after the censor date may be used as a feature.
df_pop['censor_date'] = df_pop['cohort_start_date'] - pd.Timedelta(days=num_days_prediction)

# Attach each person's first visit date (left join keeps everyone in df_pop).
count_visits = pd.read_csv(f'{int_path}/hcu_visit_counts.csv', parse_dates = ['first_visit'])
df_pop = df_pop.merge(count_visits[['person_id', 'first_visit']], how = 'left', on = 'person_id')

# For the 1-year dataset, exclude anyone who also appears in the 3-year population.
if dataset == 'mdcd_1yr': 
    df_3yr_pop = pd.read_csv(f'{path}/raw_data_mdcd_3yrs/population_2dx.csv')
    print(len(df_pop))
    df_pop = df_pop.loc[~df_pop['person_id'].isin(df_3yr_pop['person_id'])]
    print(len(df_pop))

In [None]:
# pre_censor_data comes from preprocessing_utils; the code below relies on it leaving a
# 'censor_date' column on each returned frame. Presumably it also drops rows dated after
# the censor date -- confirm against the helper.

# Visits: censor on the start date, then cap stays that run past the censor date.
all_visits = pd.read_csv(
    f'{data_path}/temporal_visits.csv',
    parse_dates=['cohort_start_date', 'visit_start_date', 'visit_end_date'],
)
all_visits = pre_censor_data(all_visits, df_pop, 'visit_start_date')
visit_overruns = all_visits['visit_end_date'] > all_visits['censor_date']
all_visits.loc[visit_overruns, 'visit_end_date'] = all_visits.loc[visit_overruns, 'censor_date']
print(
    'Duplicate Visits (should be True)',
    'Unnamed: 0' not in all_visits.columns,
    len(all_visits) == len(all_visits['visit_occurrence_id'].unique()),
)

# Medications: same treatment, capping drug eras at the censor date.
all_meds = pd.read_csv(
    f'{data_path}/temporal_medications.csv',
    parse_dates=['cohort_start_date', 'drug_era_start_date', 'drug_era_end_date'],
)
all_meds = pre_censor_data(all_meds, df_pop, 'drug_era_start_date')
era_overruns = all_meds['drug_era_end_date'] > all_meds['censor_date']
all_meds.loc[era_overruns, 'drug_era_end_date'] = all_meds.loc[era_overruns, 'censor_date']

# Conditions, procedures and labs are point events: only the single date is censored.
all_conds = pd.read_csv(
    f'{data_path}/temporal_conditions.csv',
    parse_dates=['cohort_start_date', 'condition_start_date'],
)
all_conds = pre_censor_data(all_conds, df_pop, 'condition_start_date')

all_procedures = pd.read_csv(
    f'{data_path}/temporal_procedures.csv',
    parse_dates=['cohort_start_date', 'procedure_date'],
)
all_procedures = pre_censor_data(all_procedures, df_pop, 'procedure_date')

all_labs = pd.read_csv(
    f'{data_path}/temporal_labs.csv',
    parse_dates=['cohort_start_date', 'measurement_date'],
)
all_labs = pre_censor_data(all_labs, df_pop, 'measurement_date')

In [None]:
# Count condition rows whose fine-grained concept id is one of the listed
# schizophrenia codes.
scz_codes = pd.read_csv(codes_path+'all_scz_codes.csv')
is_scz_code = all_conds['condition_concept_id'].isin(scz_codes['standard_concept_id'])
print('Check granular presence of SCZ:', len(all_conds.loc[is_scz_code]))

### Map data to other concepts

In [None]:
# Roll each drug concept up to its mapped concept via a left join on the descendant id.
rolled_medications = pd.read_csv(codes_path + 'rolled_medications.csv')
med_rollup = rolled_medications[['descendant_concept_id', 'rolled_concept_name', 'rolled_concept_id']]
all_meds = all_meds.merge(med_rollup, how='left', left_on='drug_concept_id', right_on='descendant_concept_id')

med_keep_cols = [
    'person_id', 'drug_era_id', 'drug_era_start_date', 'drug_era_end_date',
    'cohort_start_date', 'drug_concept_id', 'rolled_concept_name',
    'drug_exposure_count', 'censor_date',
]
all_meds = all_meds[med_keep_cols].drop_duplicates()

# Drugs without a roll-up keep their own concept id as the feature name.
med_unmapped = all_meds['rolled_concept_name'].isna()
all_meds.loc[med_unmapped, 'rolled_concept_name'] = all_meds.loc[med_unmapped, 'drug_concept_id']

# Suffix every name so medication features are distinguishable from other sources.
list_med_concepts = [str(name) + '_meds' for name in all_meds['rolled_concept_name']]
all_meds['rolled_concept_name'] = list_med_concepts

med_key_cols = ['person_id', 'rolled_concept_name', 'drug_concept_id', 'drug_era_start_date', 'drug_era_end_date']
print(
    'Duplicate Meds (Expect True)',
    'Unnamed: 0' not in all_meds.columns,
    len(all_meds) == len(all_meds[med_key_cols].drop_duplicates()),
)

In [None]:
# Roll each condition concept up to its level-4 ancestor via a left join on the descendant id.
rolled_conditions = pd.read_csv(codes_path + 'rolled_conditions_level4.csv')
all_conds = all_conds.merge(rolled_conditions[['descendant_concept_id', 'rolled_concept_name', 'rolled_concept_id']], how='left', left_on = 'condition_concept_id', right_on = 'descendant_concept_id')
all_conds = all_conds[['person_id','condition_occurrence_id','condition_start_date', 'condition_concept_id', 'concept_name', 'rolled_concept_name', 'cohort_start_date', 'visit_occurrence_id', 'censor_date']].drop_duplicates()

# Conditions without a roll-up fall back to their own concept name.
unmapped_conds = all_conds['rolled_concept_name'].isna()
all_conds.loc[unmapped_conds, 'rolled_concept_name'] = all_conds.loc[unmapped_conds, 'concept_name']

# Suffix every name so condition features are distinguishable from other sources.
list_cond_concepts = list(all_conds['rolled_concept_name'])
list_cond_concepts = [str(i) + '_conds' for i in list_cond_concepts]
all_conds['rolled_concept_name'] = list_cond_concepts

# Now check the rolled-up names for schizophrenia. Scan each distinct name once:
# list_cond_concepts has one entry per condition row, so looping over it directly
# prints the same concept once for every row that carries it.
for concept in sorted(set(list_cond_concepts)):
    if 'schizo' in concept.lower():
        print(concept)
# check uniqueness
print('Duplicate Conds (Expect True)', 'Unnamed: 0' not in all_conds.columns, len(all_conds) == len(all_conds[['person_id', 'rolled_concept_name', 'condition_start_date', 'condition_occurrence_id']].drop_duplicates()))

In [None]:
# Roll each procedure concept up to its level-4 ancestor via a left join on the descendant id.
rolled_procedures = pd.read_csv(codes_path + 'rolled_procedures_level4.csv')
procedure_rollup = rolled_procedures[['descendant_concept_id', 'rolled_concept_name', 'rolled_concept_id']]
all_procedures = all_procedures.merge(
    procedure_rollup, how='left', left_on='procedure_concept_id', right_on='descendant_concept_id'
)

procedure_keep_cols = [
    'person_id', 'procedure_occurrence_id', 'procedure_date', 'procedure_concept_id',
    'concept_name', 'rolled_concept_name', 'cohort_start_date', 'censor_date',
]
all_procedures = all_procedures[procedure_keep_cols].drop_duplicates()

# Procedures without a roll-up fall back to their own concept name.
procedure_unmapped = all_procedures['rolled_concept_name'].isna()
all_procedures.loc[procedure_unmapped, 'rolled_concept_name'] = all_procedures.loc[procedure_unmapped, 'concept_name']

# Suffix every name so procedure features are distinguishable from other sources.
list_procedure_concepts = [str(name) + '_procedure' for name in all_procedures['rolled_concept_name']]
all_procedures['rolled_concept_name'] = list_procedure_concepts

procedure_key_cols = ['person_id', 'rolled_concept_name', 'procedure_concept_id', 'procedure_date', 'procedure_occurrence_id']
print(
    'Duplicate Procedures (Expect True)',
    'Unnamed: 0' not in all_procedures.columns,
    len(all_procedures) == len(all_procedures[procedure_key_cols].drop_duplicates()),
)

In [None]:
# Labs are not rolled up: the feature name is the measurement concept name plus a suffix.
all_labs['rolled_concept_name'] = all_labs['concept_name'].astype(str) + '_lab'
# Keep cohort_start_date, as the condition and procedure tables do. Without it, lab rows
# get NaT in that column once the three tables are concatenated, and the later
# "days before cohort start" leakage check silently skips every lab.
all_labs = all_labs[['person_id', 'measurement_concept_id', 'measurement_date', 'measurement_id', 'rolled_concept_name', 'cohort_start_date', 'censor_date']].drop_duplicates()
print('Duplicate Labs (Expect True)', 'Unnamed: 0' not in all_labs.columns, len(all_labs) == len(all_labs[['person_id', 'measurement_concept_id', 'measurement_date', 'measurement_id']].drop_duplicates()))

In [None]:
# Give the three point-event tables a common 'start_date' column ...
for events, date_col in (
    (all_conds, 'condition_start_date'),
    (all_procedures, 'procedure_date'),
    (all_labs, 'measurement_date'),
):
    events['start_date'] = events[date_col].copy()

# ... then stack them and keep only the columns shared by the downstream steps.
stacked_cols = ['person_id', 'rolled_concept_name', 'cohort_start_date', 'start_date', 'censor_date']
all_cond_lab_pro = pd.concat([all_conds, all_procedures, all_labs])[stacked_cols]

### Delete Rare Features: anything that does not occur in at least 1% of patients

In [None]:
# drop_rare_occurrences / drop_unshared_features come from preprocessing_utils.
if not validation_set:
    # Filter each table with a 0.01 threshold relative to the number of rows in df_pop,
    # printing the row count before and after.
    n_patients = len(df_pop)

    print(len(all_cond_lab_pro))
    all_cond_lab_pro = drop_rare_occurrences(all_cond_lab_pro, 'rolled_concept_name', 'person_id', n_patients, threshold=0.01)
    print(len(all_cond_lab_pro))

    print(len(all_meds))
    all_meds = drop_rare_occurrences(all_meds, 'rolled_concept_name', 'person_id', n_patients, threshold=0.01)
    print(len(all_meds))

    print(len(all_visits))
    all_visits = drop_rare_occurrences(all_visits, 'visit_concept_id', 'person_id', n_patients, threshold=0.01)
    print(len(all_visits))
else:
    # Load the stored column list and restrict events/meds to it.
    # NOTE(review): visits are not filtered in this branch -- confirm that is intended.
    # NOTE(review): pickle.load executes arbitrary code; only open files you trust.
    colnames_file = f'{path}/intermediate_data_mdcd_3yrs/9_26_mdcd_2dx_fullhistory_du_snomed_colnames'
    with open(colnames_file, "rb") as fp:
        list_mdcd_cols = pickle.load(fp)
    all_cond_lab_pro = drop_unshared_features(all_cond_lab_pro, 'rolled_concept_name', list_mdcd_cols)
    all_meds = drop_unshared_features(all_meds, 'rolled_concept_name', list_mdcd_cols)

### Check for Data Leakage: 
Minimum times should be at least 90 days and cohort start date should be same across all dfs

In [None]:
# For every event date, print the smallest and largest number of days between that
# date and the cohort start date.
leakage_checks = (
    ('Labs, conditions, procedures:', all_cond_lab_pro, 'start_date'),
    ('Meds (Start of prescription):', all_meds, 'drug_era_start_date'),
    ('Meds (End of prescription):', all_meds, 'drug_era_end_date'),
    ('Visits (Start of visit):', all_visits, 'visit_start_date'),
    ('Visits (End of visit):', all_visits, 'visit_end_date'),
)
for label, events, date_col in leakage_checks:
    check = (events['cohort_start_date'] - events[date_col]).dt.days
    print(label, check.min(), check.max())

In [None]:
def count_misaligned_dates(pop, event_tables, date_col):
    """Count joined rows where `date_col` takes more than one distinct value.

    The population's (person_id, date_col) pairs are left-joined with the distinct
    (person_id, date_col) pairs of every table in `event_tables`; a row counts as
    misaligned when its date columns hold more than one distinct non-null value.

    Parameters
    ----------
    pop : DataFrame with 'person_id' and `date_col`.
    event_tables : dict mapping a short label to a DataFrame with 'person_id' and `date_col`.
    date_col : name of the date column to compare.

    Returns
    -------
    Number of joined rows with more than one distinct date. A person who has several
    different dates inside one table appears in several joined rows.
    """
    aligned = pop[['person_id', date_col]].rename(columns={date_col: f'{date_col}_pop'})
    for label, events in event_tables.items():
        per_person = (
            events[['person_id', date_col]]
            .drop_duplicates()
            .rename(columns={date_col: f'{date_col}_{label}'})
        )
        aligned = aligned.merge(per_person, how='left', on='person_id')
    # nunique ignores missing values, so people absent from a table are not flagged.
    distinct_dates = aligned.set_index('person_id').nunique(axis=1)
    return (distinct_dates > 1).sum()


tables_to_compare = {'cond': all_cond_lab_pro, 'visits': all_visits, 'meds': all_meds}
print('Number of places where cohort start date doesnt align:', count_misaligned_dates(df_pop, tables_to_compare, 'cohort_start_date'))
print('Number of places where censor date doesnt align:', count_misaligned_dates(df_pop, tables_to_compare, 'censor_date'))

In [None]:
# Number of features
print('Num Features Conds/Labs/Procedures', len(all_cond_lab_pro['rolled_concept_name'].unique()))
print('Num Features Meds', len(all_meds['rolled_concept_name'].unique()))
print('Num Features Visits', len(all_visits['visit_concept_id'].unique()))

### "Psychiatric" visits

In [None]:
# Visits that carry at least one condition listed as a mental-disorder descendant.
mental_health_conds = pd.read_csv(f'{codes_path}/mental_disorder_descendants.csv')
mh_visits = all_conds.loc[all_conds['condition_concept_id'].isin(mental_health_conds['descendant_concept_id']), 'visit_occurrence_id']
# .copy() so relabelling the subset below writes to an independent frame rather than
# to a slice of all_visits (avoids SettingWithCopyWarning).
df_mh_visits = all_visits.loc[all_visits['visit_occurrence_id'].isin(mh_visits)].copy()

# Every visit is kept under an '_ALL' label; the mental-health subset is appended a
# second time under an '_MH' label so both become separate visit features.
# NOTE: this cell is not idempotent -- re-running it appends the suffix and the
# mental-health rows again.
all_visits['visit_concept_id'] = all_visits['visit_concept_id'].astype(str) + '_ALL'
df_mh_visits['visit_concept_id'] = df_mh_visits['visit_concept_id'].astype(str) + '_MH'
all_visits = pd.concat([all_visits, df_mh_visits], axis=0)

In [None]:
# Row count of the visits table.
all_visits.shape[0]

# Function for processing features

In [None]:
def _per_year_of_observation(features, temp_pop):
    """Divide every column of `features` (indexed by person_id) by that person's `years_obs`."""
    years = temp_pop[['person_id', 'years_obs']].set_index('person_id')
    joined = features.merge(years, how='left', left_index=True, right_index=True)
    return joined.div(joined['years_obs'], axis=0).drop(columns=['years_obs'])


def make_static_df(temp_pop, temp_cond_pro_labs, temp_meds, temp_visits):
    """Build one row of features per person from the events of a single time window.

    Parameters
    ----------
    temp_pop : DataFrame with 'person_id' and 'years_obs' (length of the window in years).
    temp_cond_pro_labs : DataFrame with 'person_id' and 'rolled_concept_name'; one row per
        condition / procedure / lab event.
    temp_meds : DataFrame with 'person_id', 'rolled_concept_name', 'drug_era_start_date'
        and 'drug_era_end_date'.
    temp_visits : DataFrame with 'person_id', 'visit_concept_id', 'visit_start_date',
        'visit_end_date' and 'cohort_start_date'.

    Returns
    -------
    DataFrame indexed by person_id (outer join of the three feature groups) with:
      * one column per event concept: event count per year of observation,
      * one column per medication concept: exposure days per year of observation,
      * '<visit_concept_id>_num_visits': visits per year of observation,
      * '<visit_concept_id>_los': summed length of stay in days (not divided by years).
    People missing from a feature group get NaN in that group's columns.

    The input frames are not modified.
    """
    ### CONDITIONS/PROCEDURES/LABS: rows per (person, concept), then per year
    condprolab_features = temp_cond_pro_labs.pivot_table(index='person_id', columns='rolled_concept_name', aggfunc='size', fill_value=0)
    condprolab_features = _per_year_of_observation(condprolab_features, temp_pop)

    ### MEDICATIONS: summed exposure days per (person, concept), then per year
    # +1 so a 1-day prescription will not be 0 days. The value is kept in a local
    # Series instead of being written back onto the caller's frame.
    exposure_days = (temp_meds['drug_era_end_date'] - temp_meds['drug_era_start_date']).dt.days + 1
    count_meds = (
        temp_meds[['person_id', 'rolled_concept_name']]
        .assign(drug_exposure_days=exposure_days)
        .groupby(['person_id', 'rolled_concept_name'])
        .sum()
        .reset_index()
    )
    meds_features = count_meds.pivot_table(index='person_id', columns='rolled_concept_name', values='drug_exposure_days', fill_value=0)
    meds_features = _per_year_of_observation(meds_features, temp_pop)

    ### VISITS
    # Number of visits per (person, visit type), counted through the non-null
    # cohort_start_date values, then per year of observation.
    num_visits = temp_visits.groupby(['person_id', 'visit_concept_id']).count()['cohort_start_date'].reset_index()
    num_visits = num_visits.pivot_table(index='person_id', columns='visit_concept_id', values='cohort_start_date', fill_value=0)
    num_visits.columns = [str(col) + '_num_visits' for col in num_visits.columns]
    num_visits = _per_year_of_observation(num_visits, temp_pop)

    # Length of stay: end - start in days, summed per (person, visit type).
    los_days = (temp_visits['visit_end_date'] - temp_visits['visit_start_date']).dt.days
    los = (
        temp_visits[['person_id', 'visit_concept_id']]
        .assign(los=los_days)
        .groupby(['person_id', 'visit_concept_id'])['los']
        .sum()
        .reset_index()
    )
    los = los.pivot_table(index='person_id', columns='visit_concept_id', values='los', fill_value=0)
    # str() mirrors the '_num_visits' naming above and keeps non-string ids working.
    los.columns = [str(col) + '_los' for col in los.columns]

    visits_features = num_visits.merge(los, how='outer', left_index=True, right_index=True)

    # All three feature groups are indexed by person_id, so they are joined on the index.
    atemporal_features = condprolab_features.merge(meds_features, how='outer', left_index=True, right_index=True)
    atemporal_features = atemporal_features.merge(visits_features, how='outer', left_index=True, right_index=True)

    return atemporal_features

### Create the "iterative" population dataframe: 
- Start at the date of initial psychosis diagnosis, then step forward every `days_per_iter` days (currently 90), cutting off at the censor date (if you go over the censor date, chop to the censor date).
- Practically, this means that start date 1 is first visit & end date 1 is psychosis; then start date 2 is psychosis dx date and end date 2 is psychosis dx + 90... 
- Then, starting at the date of psychosis, go back in `days_per_iter`-day increments for `backwards_iterations` intervals (14 × 90 days, roughly 3.5 years, with the current settings). The earliest iteration (furthest away from psychosis date) should consist of all prior data, and if a person has less than that much data pre-psychosis, they should have fewer early visits.
- Note that start dates are inclusive and end dates are exclusive
- **KEEP IN MIND FOR LATER: WE WANT TO PAD AT THE BEGINNING, NOT AT THE END. So then we move each person to be aligned at the end**

In [None]:
# Keep only the id and date columns needed to lay out the windows. The explicit .copy()
# makes this an independent frame, so the many column assignments below are not writes
# to a slice of the previous df_pop (avoids SettingWithCopyWarning).
df_pop = df_pop[['person_id', 'first_visit', 'cohort_start_date', 'psychosis_diagnosis_date', 'censor_date']].copy()

# Iteration 0 is the days_per_iter days that end at the psychosis diagnosis
# (start inclusive, end exclusive); its end is capped at the censor date.
df_pop['0_end'] = df_pop['psychosis_diagnosis_date']
df_pop['0_start'] = df_pop['psychosis_diagnosis_date'] - pd.Timedelta(days_per_iter, 'days')
print(len(df_pop))
end_after_censor = df_pop['0_end'] > df_pop['censor_date']
df_pop.loc[end_after_censor, '0_end'] = df_pop.loc[end_after_censor, 'censor_date']
print(len(df_pop))

# TODO (from the original notes, not implemented in this cell):
# after the loops, remove people for whom 0_start-0_end > 0

# FORWARD LOOP: starting at psychosis dx, every days_per_iter days till censor date
for count in range(1, forward_iterations+1):
    start_col, end_col = f'{count}_start', f'{count}_end'

    # start = previous window's end; end = start + days_per_iter days
    df_pop[start_col] = df_pop[f'{count-1}_end']
    df_pop[end_col] = df_pop[start_col] + pd.Timedelta(days_per_iter, 'days')

    # cap start/end dates so neither is later than the censor date
    start_after_censor = df_pop[start_col] > df_pop['censor_date']
    df_pop.loc[start_after_censor, start_col] = df_pop.loc[start_after_censor, 'censor_date']
    end_after_censor = df_pop[end_col] > df_pop['censor_date']
    df_pop.loc[end_after_censor, end_col] = df_pop.loc[end_after_censor, 'censor_date']

    # a window that would start on the censor date is empty: set start and end to NaT
    df_pop.loc[df_pop[start_col] == df_pop['censor_date'], [start_col, end_col]] = [np.datetime64('NaT'), np.datetime64('NaT')]

# BACKWARD LOOP: from iteration 0 back in days_per_iter-day steps, stopping at first_visit
for count in np.arange(-1, -1*backwards_iterations, -1):
    start_col, end_col = f'{count}_start', f'{count}_end'

    df_pop[end_col] = df_pop[f'{count+1}_start']
    df_pop[start_col] = df_pop[end_col]-pd.Timedelta(days_per_iter, 'days')

    # if the window starts or ends prior to first_visit, set start/end to first_visit
    start_before_first = df_pop[start_col] < df_pop['first_visit']
    df_pop.loc[start_before_first, start_col] = df_pop.loc[start_before_first, 'first_visit']
    end_before_first = df_pop[end_col] < df_pop['first_visit']
    df_pop.loc[end_before_first, end_col] = df_pop.loc[end_before_first, 'first_visit']

    # if end date == first visit date the window is empty: set start and end to NaT
    df_pop.loc[df_pop[end_col] == df_pop['first_visit'], [start_col, end_col]] = [np.datetime64('NaT'), np.datetime64('NaT')]

    df_pop[end_col] = pd.to_datetime(df_pop[end_col], format = '%Y-%m-%d')

# The earliest backward window absorbs all earlier history: move its start to the
# first visit, but ONLY where that window exists (start is not NaT).
earliest_start_col = f'{-1*backwards_iterations+1}_start'
has_earliest = ~(df_pop[earliest_start_col].isna())
df_pop.loc[has_earliest, earliest_start_col] = df_pop.loc[has_earliest, 'first_visit']
print(df_pop[earliest_start_col].isna().sum()/len(df_pop)) # check that this is ~50% or 1

In [None]:
# Columns that are missing for every person (e.g. windows nobody reaches).
remove_cols = [col for col in df_pop.columns if df_pop[col].isna().all()]
print(remove_cols)

In [None]:
# Blank out any window whose start and end coincide, so every remaining "timestep"
# spans at least 1 day, then drop the columns that are empty for everyone.
# NOTE(review): the upper bound is exclusive, so the window numbered
# `forward_iterations` itself is not examined here -- confirm that is intended.
earliest_step = -1*backwards_iterations + 1
for step in range(earliest_step, forward_iterations):
    start_col, end_col = f'{step}_start', f'{step}_end'
    zero_length = df_pop[end_col] == df_pop[start_col]
    df_pop.loc[zero_length, [start_col, end_col]] = [np.datetime64('NaT'), np.datetime64('NaT')]
df_pop.drop(columns=remove_cols, inplace=True)

### Loop through the iterative population dataframe to get the features for each person in each iteration

In [None]:
# Iteration numbers that still have an '<n>_end' column in df_pop.
iter_cols = [int(col.split('_')[0]) for col in df_pop.columns if '_end' in col]
print(iter_cols)

In [None]:
# Build one feature frame per iteration (time window) and collect them in a list.
list_feature_dfs = []

# NOTE(review): np.arange excludes its stop value, so the window numbered
# np.max(iter_cols) is never processed -- confirm that this is intended.
for iteration in np.arange(np.min(iter_cols), np.max(iter_cols), 1): 
    # Per-iteration copy of the population with this window's bounds in fixed column names.
    temp_df_iter_pop = df_pop.copy()
    temp_df_iter_pop['iter_start_date'] = temp_df_iter_pop[str(iteration)+'_start']
    temp_df_iter_pop['iter_end_date'] = temp_df_iter_pop[str(iteration)+'_end']
    
    # constrict to people with a valid iteration (a non-missing start date)
    temp_df_iter_pop = temp_df_iter_pop.loc[~(temp_df_iter_pop['iter_start_date'].isna())]
    # window length in years; make_static_df divides counts by this
    temp_df_iter_pop['years_obs'] = (temp_df_iter_pop['iter_end_date']-temp_df_iter_pop['iter_start_date']).dt.days/365

    temp_df_iter_pop['iteration'] = iteration
    # every (person, iteration) pair that must appear in the output, even with no events
    all_rows = temp_df_iter_pop[['person_id', 'iteration']]
    
    # for conditions, labs, procedures, just compare the start_date to the window:
    # keep events with iter_start_date <= start_date < iter_end_date
    temp_cond_lab_pro = all_cond_lab_pro.loc[all_cond_lab_pro['person_id'].isin(temp_df_iter_pop['person_id'])]
    temp_cond_lab_pro = temp_cond_lab_pro.merge(temp_df_iter_pop[['person_id','iter_start_date', 'iter_end_date']], how = 'left', left_on = 'person_id', right_on = 'person_id')
    temp_cond_lab_pro = temp_cond_lab_pro.loc[temp_cond_lab_pro['start_date']>= temp_cond_lab_pro['iter_start_date']]
    temp_cond_lab_pro = temp_cond_lab_pro.loc[temp_cond_lab_pro['start_date']< temp_cond_lab_pro['iter_end_date']]
    
    # note: for meds and visits, end dates are clipped to equal_end_date, which is the
    # day before iter_end_date: the last day an interval is allowed to "equal",
    # ie date < iter_end_date but date <= equal_end_date
    
    # for medications and visits (which span a range of days) we want:
    # 1. start date before the iteration end date
    # 2. end date on or after the iteration start date
    
    # 3. if the start date is before the iteration start date -- clip it to the iteration start date
    # 4. if the end date is after equal_end_date -- clip it to equal_end_date
        
    temp_meds = all_meds.loc[all_meds['person_id'].isin(temp_df_iter_pop['person_id'])]
    temp_meds = temp_meds.merge(temp_df_iter_pop[['person_id','iter_start_date', 'iter_end_date']], how = 'left', left_on = 'person_id', right_on = 'person_id')
    temp_meds['equal_end_date'] = temp_meds['iter_end_date']-pd.Timedelta(1, 'days')
    
    # keep drug eras that overlap the window, then clip them to it
    temp_meds = temp_meds.loc[(temp_meds['drug_era_start_date']<temp_meds['iter_end_date'])&(temp_meds['drug_era_end_date']>=temp_meds['iter_start_date'])]
    temp_meds.loc[temp_meds['drug_era_start_date']<temp_meds['iter_start_date'], 'drug_era_start_date'] = temp_meds.loc[temp_meds['drug_era_start_date']<temp_meds['iter_start_date'], 'iter_start_date']
    temp_meds.loc[temp_meds['drug_era_end_date']>temp_meds['equal_end_date'], 'drug_era_end_date'] = temp_meds.loc[temp_meds['drug_era_end_date']>temp_meds['equal_end_date'], 'equal_end_date']

    # not read by make_static_df or by the rest of this loop
    temp_meds['days_to_cohort_start'] = (temp_meds['cohort_start_date']-temp_meds['drug_era_start_date']).dt.days
    
    # Repeat for visits: keep visits that overlap the window, then clip them to it
    temp_visits = all_visits.loc[all_visits['person_id'].isin(temp_df_iter_pop['person_id'])]
    temp_visits = temp_visits.merge(temp_df_iter_pop[['person_id','iter_start_date', 'iter_end_date']], how = 'left', left_on = 'person_id', right_on = 'person_id')
    temp_visits['equal_end_date'] = temp_visits['iter_end_date']-pd.Timedelta(1, 'days')
    
    temp_visits = temp_visits.loc[(temp_visits['visit_start_date']<temp_visits['iter_end_date'])&(temp_visits['visit_end_date']>=temp_visits['iter_start_date'])]
    temp_visits.loc[temp_visits['visit_start_date']<temp_visits['iter_start_date'], 'visit_start_date'] = temp_visits.loc[temp_visits['visit_start_date']<temp_visits['iter_start_date'], 'iter_start_date']
    temp_visits.loc[temp_visits['visit_end_date']>temp_visits['equal_end_date'], 'visit_end_date'] = temp_visits.loc[temp_visits['visit_end_date']>temp_visits['equal_end_date'], 'equal_end_date']
    
    # not read by make_static_df or by the rest of this loop
    temp_visits['days_to_cohort_start'] = (temp_visits['cohort_start_date']-temp_visits['visit_start_date']).dt.days

    
    # Sanity checks: after filtering/clipping nothing should fall outside the window.
    # A message is printed (no exception is raised) if any row does.
    if len(temp_cond_lab_pro.loc[temp_cond_lab_pro['start_date']>=temp_cond_lab_pro['iter_end_date']])+len(temp_cond_lab_pro.loc[temp_cond_lab_pro['start_date']<temp_cond_lab_pro['iter_start_date']]) > 0:
        print('Leakage in conds')        
        
    if len(temp_meds.loc[temp_meds['drug_era_start_date']>temp_meds['iter_end_date']])+len(temp_meds.loc[temp_meds['drug_era_end_date']>temp_meds['iter_end_date']]) > 0:
        print('Leakage in med ends')
    if len(temp_meds.loc[temp_meds['drug_era_end_date']<temp_meds['iter_start_date']])+len(temp_meds.loc[temp_meds['drug_era_start_date']<temp_meds['iter_start_date']]) > 0:
        print('Leakage in med starts')
        
    if len(temp_visits.loc[temp_visits['visit_start_date']>temp_visits['iter_end_date']])+len(temp_visits.loc[temp_visits['visit_end_date']>temp_visits['iter_end_date']]) > 0:
        print('Leakage in visit ends')
    if len(temp_visits.loc[temp_visits['visit_end_date']<temp_visits['iter_start_date']])+len(temp_visits.loc[temp_visits['visit_start_date']<temp_visits['iter_start_date']]) > 0:
        print('Leakage in visit starts')

    # Aggregate this window's events into one row per person (see make_static_df).
    all_features = make_static_df(temp_df_iter_pop, temp_cond_lab_pro, temp_meds, temp_visits)
    all_features['iteration'] = iteration 

    # Right join onto all_rows so people with a valid window but no events still get
    # a row; their missing feature values are then filled with 0.
    all_features = all_features.merge(all_rows, how = 'right', on = ['person_id', 'iteration'])
    all_features.fillna(0, inplace=True)
    
    list_feature_dfs.append(all_features)
    # progress indicator
    print(iteration)


In [None]:
# Stack the per-iteration feature frames row-wise into one long table.
df_all_iters = pd.concat(list_feature_dfs, axis=0)

In [None]:
# The source tables and the per-iteration list are no longer needed once
# df_all_iters exists; unbind them and force a garbage-collection pass.
del (
    list_feature_dfs,
    all_visits,
    all_conds,
    all_meds,
    all_procedures,
    all_labs,
    all_cond_lab_pro,
)

gc.collect()

In [None]:
# Features that never occur in a given iteration are NaN after the concat; treat them as 0.
df_all_iters.fillna(value=0, inplace=True)

In [None]:
if validation_set == True:
    # Add, filled with 0, every column from the stored list that this dataset lacks.
    # The columns are added in the order of list_mdcd_cols (first occurrence only)
    # instead of the arbitrary order of a set, so the output column order is the same
    # on every run.
    present_cols = set(df_all_iters.columns)
    missing_cols = [col for col in dict.fromkeys(list_mdcd_cols) if col not in present_cols]
    print(len(missing_cols))
    df_all_iters.loc[:,missing_cols] = 0
    
# remove correlated features: cumc
# NOTE(review): train_pids is not defined in the visible cells -- confirm where it comes from.
if remove_corr_features == True:
    # Correlations are estimated on the training people only.
    train_df = df_all_iters.loc[df_all_iters['person_id'].isin(train_pids)]
    corr_df = train_df.corr()
    # person_id / iteration are identifiers, not features
    corr_df.drop(['person_id', 'iteration'], axis=1, inplace=True)
    corr_df.drop(['person_id', 'iteration'], axis=0, inplace=True)

    # Two passes of remove_highcorr_cols (from preprocessing_utils): threshold 1 first,
    # then 0.95 on the surviving columns.
    print(len(corr_df.columns))
    reduced_cols_1 = remove_highcorr_cols(1, corr_df.columns, corr_df)
    print(len(reduced_cols_1))
    corr_df = corr_df.loc[reduced_cols_1, reduced_cols_1]
    reduced_cols_2 = remove_highcorr_cols(0.95, corr_df.columns, corr_df)
    print(len(reduced_cols_2))
    
    df_all_iters = df_all_iters.loc[:, ['person_id', 'iteration']+reduced_cols_2]

In [None]:
# Years elapsed since the psychosis diagnosis: iteration * days_per_iter / 365 for
# positive iterations, 0 for iteration 0 and everything before it.
elapsed_iterations = df_all_iters['iteration'].clip(lower=0)
df_all_iters['time_since_psychosis'] = elapsed_iterations * days_per_iter/365
df_all_iters['iteration'].max()

In [None]:
# Sanity checks on time_since_psychosis and on person_id.
at_or_before_dx = df_all_iters['iteration'] <= 0
after_dx = df_all_iters['iteration'] > 0
print('Pre-psychosis tsp', df_all_iters.loc[at_or_before_dx, 'time_since_psychosis'].unique())
print('Post-psychosis tsp', df_all_iters.loc[after_dx, 'time_since_psychosis'].unique())
# rows with a missing person_id, then rows whose person_id is 0
print(df_all_iters['person_id'].isna().sum())
print((df_all_iters['person_id'] == 0).sum())

In [None]:
# (rows, columns) of the long feature table.
n_rows, n_cols = df_all_iters.shape
print((n_rows, n_cols))

In [None]:
# create the time to event vector (in 2d form):
# .copy() so the new column below is added to an independent frame, not to a slice
# of df_all_iters (avoids SettingWithCopyWarning).
df_just_iters = df_all_iters[['person_id', 'iteration']].copy()
# ranked_iteration is 0 for the earliest iteration present in the data
df_just_iters['ranked_iteration'] = df_just_iters['iteration'] - df_just_iters['iteration'].min()

print('Len of all iterations', len(df_just_iters))
# Collect each iteration's window end date per person in long form.
list_end_dates = []
min_iteration = df_just_iters['iteration'].min()
max_iteration = df_just_iters['iteration'].max()
for i in range(min_iteration, max_iteration + 1):
    end_dates = df_pop[['person_id', f'{i}_end']].copy()
    end_dates['iteration'] = i
    # people without this window have a missing end date: drop them
    end_dates = end_dates.dropna()
    end_dates = end_dates.rename({f'{i}_end':'interval_end_date'}, axis=1)
    list_end_dates.append(end_dates)

end_dates = pd.concat(list_end_dates)
end_dates['interval_end_date'] = pd.to_datetime(end_dates['interval_end_date'])
df_just_iters = df_just_iters.merge(end_dates, how = 'left', on = ['person_id', 'iteration'])
df_just_iters = df_just_iters.merge(df_pop[['person_id', 'cohort_start_date']], how = 'inner', on = 'person_id')
# days from the end of each window to the cohort start date
df_just_iters['time_to_event'] = (df_just_iters['cohort_start_date']-df_just_iters['interval_end_date']).dt.days
print('Len of all iterations', len(df_just_iters))
# only iteration 0 and later are written out
df_just_iters = df_just_iters.loc[df_just_iters['iteration'] >= 0]
df_just_iters.to_csv(f'{int_path}/{dataset_prefix}time_to_event.csv', index=False)
print('Len of positive iterations', len(df_just_iters))

# CHECK: COHORT START DATE MATCHES IN DF_POP
check_cohort_start = df_just_iters.merge(df_pop[['person_id', 'cohort_start_date']], how = 'inner', on = 'person_id')
print('Len of positive iterations', len(check_cohort_start))
# Count the rows where the two dates agree. Taking len() of the boolean comparison
# returns the row count whether or not anything matches, so .sum() is used instead.
print('Len of cohort matching', (check_cohort_start['cohort_start_date_x'] == check_cohort_start['cohort_start_date_y']).sum())

# CHECK: EACH PERSON HAS A SINGLE INTERVAL END DATE THAT CORRESPONDS TO PSYCHOSIS DIAGNOSIS DATE
df_just_iters = df_just_iters.merge(df_pop[['person_id', 'psychosis_diagnosis_date']], how = 'inner', on = 'person_id')
print('All ppl should have psychosis interval', sum(df_just_iters['psychosis_diagnosis_date'] == df_just_iters['interval_end_date'])/len(df_just_iters['person_id'].unique()))
# AND NO PERSON HAS AN INTERVAL END DATE FROM BEFORE PSYCHOSIS DIAGNOSIS DATE
print('no pre-psychosis interval',len(df_just_iters.loc[df_just_iters['interval_end_date'] < df_just_iters['psychosis_diagnosis_date']]))
# AND THIS SHOULD ALL CORRESPOND TO A SINGLE RANKED ITERATION THAT WE WILL USE IN CREATE DATALOADERS
print(df_just_iters.loc[df_just_iters['interval_end_date'] == df_just_iters['psychosis_diagnosis_date'], 'ranked_iteration'].unique())
print(df_just_iters['ranked_iteration'].min())
# CHECK: what is the minimum number of iterations a person has
print(df_just_iters.groupby(['person_id'])['ranked_iteration'].count().min())

# find_largest_diff comes from preprocessing_utils.
df_just_iters.set_index(['person_id','ranked_iteration'], inplace=True)
df_just_iters.sort_index(inplace=True)
print('Check largest difference', find_largest_diff(df_just_iters)['largest_diff'].max()) # should be 1

In [None]:
# Feature column names = every column except the two identifier columns.
data_columns = df_all_iters.columns.tolist()
for id_col in ('person_id', 'iteration'):
    data_columns.remove(id_col)

# Pickle the feature-name list next to the other intermediate outputs.
colnames_path = f'{int_path}/{dataset_prefix}du_snomed_colnames'
with open(colnames_path, "wb") as fp:
    pickle.dump(data_columns, fp)

In [None]:
# Write the long feature table (one row per person and iteration).
snomed_csv_path = f'{int_path}/{dataset_prefix}snomed_data.csv'
df_all_iters.to_csv(snomed_csv_path, index=False)

In [None]:
# Write df_pop, which now holds the start/end date columns of every iteration.
iteration_dates_path = f'{int_path}/{dataset_prefix}iteration_dates.csv'
df_pop.to_csv(iteration_dates_path, index=False)