In [None]:
import numpy as np
import os
import pandas as pd
import time
import scipy.stats as stats
import matplotlib.pyplot as plt
from datetime import datetime
from tqdm import tqdm
from collections import Counter
from datetime import datetime
import sys
import gc
import pymssql
import pickle 
import math

In [None]:
# Placeholder: replace with a live database connection before running.
# This object is handed to pd.io.sql.read_sql for every vocabulary/visit query below.
# NOTE(review): pymssql is imported above, so presumably this should be a pymssql connection - confirm.
conn = "connection to SQL server"

In [None]:
# Run-level configuration (all values are placeholders to be set per environment).
# ccae_validation_set: False -> rare features are dropped from this dataset;
#                      True  -> features are restricted to a previously pickled column list.
ccae_validation_set = False
path = 'PATH'
# Directories are built by plain string concatenation, so `path` must carry its own trailing separator.
raw_path = ''.join((path, 'RAW_DATA_PATH'))
int_path = ''.join((path, 'INTERMEDIATE_PATH'))

# Create Data
1. Read in data
2. Create a population dataframe with N (N = len_seq) entries per patient. This should have the start date (inclusive) and end date (exclusive) for each "iteration" 
3. In a loop: 
- Limit data to that date-range for each person
- Run a modified version of the function used by the XGBoost model to generate features. 
- Divide by the number of years in the time chunk

### Load Data
Also restrict the population to patients whose psychosis diagnosis date is at least 90 days before the index (cohort start) date

In [None]:
# Minimum gap (in days) required between any record and the cohort start date.
num_days_prediction = 90

# Load the cohort and normalise the name of the psychosis diagnosis date column.
df_pop = pd.read_csv(raw_path+'population.csv').rename(columns={'psychosis_dx_date': 'psychosis_diagnosis_date'})
for date_col in ('psychosis_diagnosis_date', 'cohort_start_date'):
    df_pop[date_col] = pd.to_datetime(df_pop[date_col], format="mixed", dayfirst=False)

# Keep only patients diagnosed at least `num_days_prediction` days before cohort start.
days_dx_to_index = (df_pop['cohort_start_date'] - df_pop['psychosis_diagnosis_date']).dt.days
df_pop = df_pop.loc[days_dx_to_index >= num_days_prediction]

In [None]:
all_visits = pd.read_csv(raw_path+'temporal_visits.csv')

# Parse the visit start date BEFORE taking the per-person minimum: on the raw CSV strings
# min() is a lexical comparison, and first_visit is later compared against datetime columns.
all_visits['visit_start_date'] = pd.to_datetime(all_visits['visit_start_date'], format="mixed", dayfirst=False)

# Earliest visit per person, aggregated on the one column that is needed.
first_visit = all_visits.groupby('person_id')['visit_start_date'].min().rename('first_visit')
df_pop = df_pop.merge(first_visit, how='left', left_on='person_id', right_index=True)
df_pop.head()

In [None]:
# Load the remaining raw event tables; each is filtered to the cohort and date-parsed in its own cell below.
all_conds = pd.read_csv(raw_path+'temporal_conditions.csv')
all_meds = pd.read_csv(raw_path+'temporal_medications.csv')
all_procedures = pd.read_csv(raw_path+'temporal_procedures.csv')
all_labs = pd.read_csv(raw_path+'temporal_labs.csv')

### Restrict all data to appropriate time periods

In [None]:
# Keep only cohort members and parse the date columns.
all_meds = all_meds.loc[all_meds['person_id'].isin(df_pop['person_id'])]
all_meds['cohort_start_date'] = pd.to_datetime(all_meds['cohort_start_date'], format="mixed", dayfirst=False)
all_meds['drug_era_start_date'] = pd.to_datetime(all_meds['drug_era_start_date'], format="mixed", dayfirst=False)
all_meds['drug_era_end_date'] = pd.to_datetime(all_meds['drug_era_end_date'], format="mixed", dayfirst=False)
# A drug era is usable only if it ENDS at least num_days_prediction days before cohort start.
all_meds = all_meds.loc[(all_meds['cohort_start_date']-all_meds['drug_era_end_date']).dt.days >= num_days_prediction]
all_meds['days_to_cohort_start'] = (all_meds['cohort_start_date']-all_meds['drug_era_start_date']).dt.days

# medications mapping: roll every standard drug concept up to its ATC 3rd-level ancestor
medications_mapping_query = ("SELECT c_atc.concept_id as rolled_concept_id, c_atc.concept_name as rolled_concept_name, c_standard.concept_id as descendant_concept_id, c_standard.concept_name as descendant_concept_name "+
                             "FROM dbo.concept as c_atc "+
                             "LEFT JOIN dbo.concept_ancestor as ca on ancestor_concept_id=c_atc.concept_id "+
                             "LEFT JOIN dbo.concept as c_standard on c_standard.concept_id = descendant_concept_id "+
                             "WHERE c_atc.concept_class_id = 'ATC 3rd' AND c_standard.standard_concept = 'S'")

medications_mapping = pd.io.sql.read_sql(medications_mapping_query, conn)

# medications mapping: move Lithium to the antiepileptics category, then relabel that
# whole category as mood stabilizers.
# NOTE(review): generate_code_list is not defined in the visible part of this notebook -
# it must be defined/imported before this cell for Restart & Run All to work.
lithium_list = generate_code_list('Lithium', 'ATC 4th')
medications_mapping.loc[(medications_mapping['descendant_concept_id'].isin(lithium_list))&(medications_mapping['rolled_concept_name']=='ANTIPSYCHOTICS'), 'rolled_concept_name'] = 'ANTIEPILEPTICS'
# Assign the result back: an in-place replace on a column selection is a chained
# assignment and does not update the frame when pandas Copy-on-Write is active.
medications_mapping['rolled_concept_name'] = medications_mapping['rolled_concept_name'].replace({'ANTIEPILEPTICS': 'MOOD STABILIZERS'})

# Attach the rolled-up category to each drug era and drop the helper columns.
all_meds = all_meds.merge(medications_mapping[['descendant_concept_id', 'rolled_concept_name', 'rolled_concept_id']], how='left', left_on = 'drug_concept_id', right_on = 'descendant_concept_id')
all_meds = all_meds[['person_id','drug_era_id','drug_era_start_date', 'drug_era_end_date', 'cohort_start_date', 'drug_concept_id', 'rolled_concept_name', 'drug_exposure_count']].drop_duplicates()
# Drugs without an ATC 3rd-level ancestor fall back to their own concept id as the feature name.
all_meds.loc[all_meds['rolled_concept_name'].isna(), 'rolled_concept_name'] = all_meds.loc[all_meds['rolled_concept_name'].isna(), 'drug_concept_id']

# Suffix every feature name so medication columns cannot collide with other feature types.
list_med_concepts = list(all_meds['rolled_concept_name'])
list_med_concepts = [str(i) + '_meds' for i in list_med_concepts]
all_meds['rolled_concept_name'] = list_med_concepts

In [None]:
# Keep only cohort members and parse the three date columns.
in_cohort = all_visits['person_id'].isin(df_pop['person_id'])
all_visits = all_visits.loc[in_cohort]
for date_col in ('cohort_start_date', 'visit_start_date', 'visit_end_date'):
    all_visits[date_col] = pd.to_datetime(all_visits[date_col], format="mixed", dayfirst=False)

# A visit is usable only if it ENDS at least num_days_prediction days before cohort start.
days_end_to_index = (all_visits['cohort_start_date'] - all_visits['visit_end_date']).dt.days
all_visits = all_visits.loc[days_end_to_index >= num_days_prediction]
all_visits['days_to_cohort_start'] = (all_visits['cohort_start_date'] - all_visits['visit_start_date']).dt.days

In [None]:
# Keep only cohort members, parse dates, and drop anything recorded inside the prediction gap.
in_cohort = all_conds['person_id'].isin(df_pop['person_id'])
all_conds = all_conds.loc[in_cohort]
for date_col in ('cohort_start_date', 'condition_start_date'):
    all_conds[date_col] = pd.to_datetime(all_conds[date_col], format="mixed", dayfirst=False)
all_conds['days_to_cohort_start'] = (all_conds['cohort_start_date'] - all_conds['condition_start_date']).dt.days
all_conds = all_conds.loc[all_conds['days_to_cohort_start'] >= num_days_prediction]

# Roll each condition up to the concepts sitting 4 levels below concept 441840.
rollup_conds = """WITH rolled_conditions AS (SELECT descendant_concept_id FROM dbo.concept_ancestor WHERE ancestor_concept_id = 441840 AND max_levels_of_separation = 4)
                        SELECT rolled_conditions.descendant_concept_id as rolled_concept_id, ca.descendant_concept_id, concept_name as rolled_concept_name
                        FROM rolled_conditions
                        LEFT JOIN dbo.concept_ancestor as ca ON ca.ancestor_concept_id = rolled_conditions.descendant_concept_id
                        LEFT JOIN dbo.concept on rolled_conditions.descendant_concept_id = concept_id"""

conditions_mapping = pd.read_sql(rollup_conds, conn)
rollup_cols = ['descendant_concept_id', 'rolled_concept_name', 'rolled_concept_id']
all_conds = all_conds.merge(conditions_mapping[rollup_cols], how='left', left_on='condition_concept_id', right_on='descendant_concept_id')

kept_cols = ['person_id','condition_occurrence_id','condition_start_date', 'condition_concept_id', 'concept_name', 'rolled_concept_name', 'cohort_start_date', 'days_to_cohort_start']
all_conds = all_conds[kept_cols].drop_duplicates()
# Conditions with no rolled-up ancestor keep their own concept name.
all_conds['rolled_concept_name'] = all_conds['rolled_concept_name'].fillna(all_conds['concept_name'])

# Suffix the feature names so condition columns cannot collide with other feature types.
list_cond_concepts = (all_conds['rolled_concept_name'].map(str) + '_conds').tolist()
all_conds['rolled_concept_name'] = list_cond_concepts

In [None]:
# Keep only cohort members, parse dates, and drop anything recorded inside the prediction gap.
in_cohort = all_procedures['person_id'].isin(df_pop['person_id'])
all_procedures = all_procedures.loc[in_cohort]
for date_col in ('cohort_start_date', 'procedure_date'):
    all_procedures[date_col] = pd.to_datetime(all_procedures[date_col], format="mixed", dayfirst=False)
all_procedures['days_to_cohort_start'] = (all_procedures['cohort_start_date'] - all_procedures['procedure_date']).dt.days
all_procedures = all_procedures.loc[all_procedures['days_to_cohort_start'] >= num_days_prediction]

# Roll each procedure up to the concepts sitting 4 levels below concept 45889197.
rollup_procedures = """WITH rolled_procedures AS (SELECT descendant_concept_id FROM dbo.concept_ancestor WHERE ancestor_concept_id = 45889197 AND max_levels_of_separation = 4)
                        SELECT rolled_procedures.descendant_concept_id as rolled_concept_id, ca.descendant_concept_id, concept_name as rolled_concept_name
                        FROM rolled_procedures
                        LEFT JOIN dbo.concept_ancestor as ca ON ancestor_concept_id = rolled_procedures.descendant_concept_id
                        LEFT JOIN dbo.concept on rolled_procedures.descendant_concept_id = concept_id"""

procedures_mapping = pd.read_sql(rollup_procedures, conn)
rollup_cols = ['descendant_concept_id', 'rolled_concept_name', 'rolled_concept_id']
all_procedures = all_procedures.merge(procedures_mapping[rollup_cols], how='left', left_on='procedure_concept_id', right_on='descendant_concept_id')

kept_cols = ['person_id','procedure_occurrence_id','procedure_date', 'procedure_concept_id','concept_name', 'rolled_concept_name', 'cohort_start_date', 'days_to_cohort_start']
all_procedures = all_procedures[kept_cols].drop_duplicates()
# Procedures with no rolled-up ancestor keep their own concept name.
all_procedures['rolled_concept_name'] = all_procedures['rolled_concept_name'].fillna(all_procedures['concept_name'])

# Suffix the feature names so procedure columns cannot collide with other feature types.
list_procedure_concepts = (all_procedures['rolled_concept_name'].map(str) + '_procedure').tolist()
all_procedures['rolled_concept_name'] = list_procedure_concepts

In [None]:
# Keep only cohort members, parse dates, and drop anything recorded inside the prediction gap.
in_cohort = all_labs['person_id'].isin(df_pop['person_id'])
all_labs = all_labs.loc[in_cohort]
for date_col in ('cohort_start_date', 'measurement_date'):
    all_labs[date_col] = pd.to_datetime(all_labs[date_col], format="mixed", dayfirst=False)
all_labs['days_to_cohort_start'] = (all_labs['cohort_start_date'] - all_labs['measurement_date']).dt.days
all_labs = all_labs.loc[all_labs['days_to_cohort_start'] >= num_days_prediction]
# Labs are not rolled up; the concept name itself (suffixed) is the feature name.
all_labs['concept_name'] = all_labs['concept_name'].astype(str) + '_lab'

### Delete Rare Features: anything that does not occur in at least 1% of patients

In [None]:
# NOTE(review): drop_rare_occurrences / drop_unshared_features are not defined in the
# visible part of this notebook - they must be defined/imported before this cell.
if ccae_validation_set:
    # Validation run: keep only features that also exist in the pickled column list.
    # NOTE(review): pickle.load executes arbitrary code - only load a trusted file.
    with open(path + "raw_data_3yrs/intermediate_data_mdcd/MDCD_11_15_dl_colnames_snomed", "rb") as fp:   #Pickling
        list_mdcd_cols = pickle.load(fp)
    all_conds = drop_unshared_features(all_conds, 'rolled_concept_name', list_mdcd_cols)
    all_meds = drop_unshared_features(all_meds, 'rolled_concept_name', list_mdcd_cols)
    all_procedures = drop_unshared_features(all_procedures, 'rolled_concept_name', list_mdcd_cols)
    all_labs = drop_unshared_features(all_labs, 'concept_name', list_mdcd_cols)
else:
    # Development run: drop rare features, measured against the cohort size.
    rare_kwargs = dict(col_id='person_id', size_pop=len(df_pop))
    all_conds = drop_rare_occurrences(all_conds, 'rolled_concept_name', **rare_kwargs)
    all_meds = drop_rare_occurrences(all_meds, 'rolled_concept_name', **rare_kwargs)
    all_procedures = drop_rare_occurrences(all_procedures, 'rolled_concept_name', **rare_kwargs)
    all_labs = drop_rare_occurrences(all_labs, 'measurement_concept_id', **rare_kwargs)
    all_visits = drop_rare_occurrences(all_visits, 'visit_concept_id', **rare_kwargs)

### Check for Data Leakage: 
The minimum gap between any record and the cohort start date should be at least 90 days, and each patient's cohort start date should be the same across all dataframes

In [None]:
# For every table, print the smallest and largest gap (in days) between a record and
# the cohort start date; the smallest gap should never be below the prediction window.
gap_checks = [
    ('Labs:', all_labs, 'measurement_date'),
    ('Procedures:', all_procedures, 'procedure_date'),
    ('Conditions:', all_conds, 'condition_start_date'),
    ('Meds (Start of prescription):', all_meds, 'drug_era_start_date'),
    ('Meds (End of prescription):', all_meds, 'drug_era_end_date'),
    ('Visits (Start of visit):', all_visits, 'visit_start_date'),
    ('Visits (End of visit):', all_visits, 'visit_end_date'),
]
for label, frame, date_col in gap_checks:
    check = (frame['cohort_start_date'] - frame[date_col]).dt.days
    print(label, check.min(), check.max())

# Count condition rows whose concept name is one of the two schizophrenia labels.
scz_names = ['Schizophrenia', 'Paranoid schizophrenia']
print('Check presence of SCZ:', len(all_conds.loc[all_conds['concept_name'].isin(scz_names)]))

In [None]:
# Collect every table's cohort_start_date next to the population's, one column per source.
# The suffix pairs only take effect when both sides already have a 'cohort_start_date' column.
check_cohort_start = df_pop[['person_id','cohort_start_date']]
merge_plan = [
    (all_conds, ['_pop','_cond']),
    (all_visits, ['_old1','_visits']),
    (all_procedures, ['_old2','_pro']),
    (all_labs, ['_old3','_labs']),
    (all_meds, ['_old4','_meds']),
]
for frame, suffixes in merge_plan:
    per_person_dates = frame[['person_id','cohort_start_date']].drop_duplicates()
    check_cohort_start = check_cohort_start.merge(per_person_dates, how='left', on='person_id', suffixes=suffixes)
check_cohort_start = check_cohort_start.set_index('person_id')

# A row with more than one distinct (non-null) date means the sources disagree for that person.
num_unique = check_cohort_start.nunique(axis=1)
check_cohort_start = check_cohort_start.T
print('Number of places where cohort start date doesnt align:',(num_unique>1).sum())

In [None]:
# Sanity checks per table: (1) no stray 'Unnamed: 0' index column from a CSV round-trip,
# (2) no duplicated rows on the identifying columns. Both values should print True.
def _no_duplicates(frame, key_cols):
    """True when no two rows of `frame` share the same values in `key_cols`."""
    return not frame.duplicated(subset=key_cols).any()

print('Conds', 'Unnamed: 0' not in all_conds.columns, _no_duplicates(all_conds, ['person_id', 'rolled_concept_name', 'condition_start_date', 'condition_occurrence_id']))
print('Meds', 'Unnamed: 0' not in all_meds.columns, _no_duplicates(all_meds, ['person_id', 'rolled_concept_name', 'drug_concept_id', 'drug_era_start_date', 'drug_era_end_date']))
print('Visits', 'Unnamed: 0' not in all_visits.columns, all_visits['visit_occurrence_id'].is_unique)
print('Procedures', 'Unnamed: 0' not in all_procedures.columns, _no_duplicates(all_procedures, ['person_id','rolled_concept_name', 'procedure_concept_id', 'procedure_date', 'procedure_occurrence_id']))
print('Labs', 'Unnamed: 0' not in all_labs.columns, _no_duplicates(all_labs, ['person_id', 'measurement_concept_id', 'measurement_date', 'measurement_id']))

### Make SQL queries for inpatient psych visits

In [None]:
# INPATIENT PSYCH VISITS
# Select visits with visit_concept_id 9201 that carry at least one condition whose concept is
# the 'Maps to' target of an ICD10CM code starting with 'F'. The LEFT JOINs are effectively
# inner joins for conditions, because the WHERE clause filters on condition_concept_id.
query = ("SELECT vo.person_id, vo.visit_occurrence_id, vo.visit_concept_id, co.condition_start_date, vo.visit_start_date, vo.visit_end_date, co.condition_concept_id, c.concept_name as condition_name, p.race_concept_id, p.gender_concept_id "+
         "FROM dbo.visit_occurrence as vo LEFT JOIN dbo.condition_occurrence as co on co.visit_occurrence_id = vo.visit_occurrence_id "+
         "LEFT JOIN dbo.concept as c on c.concept_id = co.condition_concept_id "+
         "LEFT JOIN dbo.person as p on p.person_id = vo.person_id "+
         "WHERE vo.visit_concept_id = 9201 AND condition_concept_id IN "+
         "(SELECT DISTINCT concept_id_2 FROM dbo.concept as c LEFT JOIN dbo.concept_relationship on concept_id_1 = concept_id WHERE c.concept_code LIKE 'F%' AND c.vocabulary_id = 'ICD10CM' AND relationship_id = 'Maps to')")

psych_hosp = pd.io.sql.read_sql(query, conn)
# Unique visit ids; make_static_df reads this module-level list to flag psychiatric hospitalizations.
list_psych_visits = list(psych_hosp['visit_occurrence_id'].unique())

# Function for processing features

In [None]:
# temp pop needs to have a "years obs" column
def _per_year(features, temp_pop):
    """Divide every feature column by the person's ``years_obs`` (index of `features` is person_id)."""
    years = temp_pop[['person_id', 'years_obs']].set_index('person_id')
    merged = features.merge(years, how='left', left_index=True, right_index=True)
    return merged.div(merged['years_obs'], axis=0).drop(columns=['years_obs'])


def _length_of_stay(visits):
    """Visit length in whole days (end date minus start date)."""
    return (visits['visit_end_date'] - visits['visit_start_date']).dt.days


def make_static_df(temp_pop, temp_conds, temp_meds, temp_visits, temp_procedures, temp_labs, psych_visit_ids=None):
    """Build one feature row per person from the events of a single time window.

    Parameters
    ----------
    temp_pop : DataFrame with 'person_id' and 'years_obs' (window length in years).
    temp_conds, temp_procedures : DataFrames with 'person_id' and 'rolled_concept_name'.
    temp_meds : DataFrame with 'person_id', 'rolled_concept_name',
        'drug_era_start_date' and 'drug_era_end_date' (datetimes).
    temp_visits : DataFrame with 'person_id', 'visit_occurrence_id', 'visit_concept_id',
        'visit_start_date', 'visit_end_date' (datetimes) and 'cohort_start_date'.
    temp_labs : DataFrame with 'person_id' and 'concept_name'.
    psych_visit_ids : optional iterable of visit_occurrence_id values that count as
        psychiatric hospitalizations; defaults to the module-level ``list_psych_visits``.

    Returns
    -------
    DataFrame indexed by person_id: per-year counts of conditions, procedures, labs and
    visits, per-year medication exposure days, and length-of-stay summaries. A person who
    has no rows in one of the input tables gets NaN for that table's columns.

    None of the input frames is modified.
    """
    if psych_visit_ids is None:
        psych_visit_ids = list_psych_visits

    ### CONDITIONS: occurrences per year
    conditions_features = temp_conds.pivot_table(index='person_id', columns='rolled_concept_name', aggfunc='size', fill_value=0)
    conditions_features = _per_year(conditions_features, temp_pop)

    ### MEDICATIONS: exposure days per year
    # +1 so a 1-day prescription will not be 0 days
    exposure_days = (temp_meds['drug_era_end_date'] - temp_meds['drug_era_start_date']).dt.days + 1
    count_meds = (temp_meds[['person_id', 'rolled_concept_name']]
                  .assign(drug_exposure_days=exposure_days)
                  .groupby(['person_id', 'rolled_concept_name']).sum().reset_index())
    meds_features = count_meds.pivot_table(index='person_id', columns='rolled_concept_name', values='drug_exposure_days', fill_value=0)
    meds_features = _per_year(meds_features, temp_pop)

    ### VISITS
    # Number of visits of each type, per year of observation
    num_visits = temp_visits.groupby(['person_id', 'visit_concept_id']).count()['cohort_start_date'].reset_index()
    num_visits = num_visits.pivot_table(index='person_id', columns='visit_concept_id', values='cohort_start_date', fill_value=0)
    num_visits.columns = [str(i)+'_num_visits' for i in num_visits.columns]
    num_visits = _per_year(num_visits, temp_pop)

    # Length of stay per visit type, for everything except visit concept 9202
    non_outpatient = temp_visits.loc[temp_visits['visit_concept_id'] != 9202]
    non_outpatient = non_outpatient.assign(los=_length_of_stay(non_outpatient))
    los = non_outpatient.groupby(['person_id', 'visit_concept_id']).agg({'los': ['sum', 'max', 'min', 'mean']})
    los = los.reset_index()
    los.columns = [' '.join(col).strip() for col in los.columns.values]
    los = los.pivot_table(index='person_id', columns='visit_concept_id', values=['los sum', 'los max', 'los min', 'los mean'], fill_value=0)
    # Flatten the (statistic, visit_concept_id) pairs into plain string column names.
    los.columns = [str(col).strip() for col in los.columns.values]

    visits_features = num_visits.merge(los, how='outer', left_index=True, right_index=True)

    #### VISITS: PSYCHIATRIC HOSPITALIZATIONS
    psych_hospitalizations = temp_visits.loc[temp_visits['visit_occurrence_id'].isin(psych_visit_ids)]

    # Number of psychiatric hospitalizations per year of observation
    num_psych = psych_hospitalizations.groupby('person_id').count()['cohort_start_date'].reset_index()
    num_psych = num_psych.rename({'cohort_start_date': 'num_psych_hospitalizations'}, axis=1).set_index('person_id')
    num_psych = _per_year(num_psych, temp_pop)
    visits_features = visits_features.merge(num_psych, how='left', right_index=True, left_index=True).fillna(0)

    # Length of stay of the psychiatric hospitalizations only. (This used to be computed
    # over every visit, so the 'los psych *' columns did not describe psychiatric stays.)
    psych_los = (psych_hospitalizations
                 .assign(los=_length_of_stay(psych_hospitalizations))
                 .groupby(['person_id']).agg({'los': ['sum', 'max', 'min', 'mean']}))
    psych_los.columns = ['los psych sum', 'los psych max', 'los psych min', 'los psych mean']
    visits_features = visits_features.merge(psych_los, how='left', right_index=True, left_index=True).fillna(0)

    ### PROCEDURES: occurrences per year
    procedures_features = temp_procedures.pivot_table(index='person_id', columns='rolled_concept_name', aggfunc='size', fill_value=0)
    procedures_features = _per_year(procedures_features, temp_pop)

    ### LABS: occurrences per year
    lab_features = temp_labs.pivot_table(index='person_id', columns='concept_name', aggfunc='size', fill_value=0)
    lab_features = _per_year(lab_features, temp_pop)

    atemporal_features = pd.concat([conditions_features, meds_features, procedures_features, lab_features, visits_features], axis=1)
    return atemporal_features

### Create the "iterative" population dataframe: 
- Start at the date of initial psychosis diagnosis, then go every XX days (120 days), cutting off at the censor date (if you go over the censor date, chop to the censor date).
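- Practically, this means that start date 1 is first visit & end date 1 is psychosis; then start date 2 is psychosis dx date and end date 2 is psychosis dx + 120 days (the iteration length used in the code below), and each later iteration starts where the previous one ended...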
- Then, starting at the date of psychosis, go back in XX-day increments for 3 years (9 iterations). The earliest iteration (furthest away from psychosis date) should consist of all prior data, and if a person has less than 3 years of data pre-psychosis, they should have fewer early visits.
- Note that start dates are inclusive and end dates are exclusive
- **KEEP IN MIND FOR LATER: WE WANT TO PAD AT THE BEGINNING, NOT AT THE END. So then we move each person to be aligned at the end**

In [None]:
# Work on an independent frame so the column assignments below are not chained assignments.
df_pop = df_pop[['person_id', 'first_visit', 'cohort_start_date', 'psychosis_diagnosis_date']].copy()
# first_visit must be a datetime for the comparisons in the backward loop (no-op if already parsed).
df_pop['first_visit'] = pd.to_datetime(df_pop['first_visit'], format="mixed", dayfirst=False)
# No data from the last num_days_prediction days before cohort start may be used.
df_pop['censor_date'] = df_pop['cohort_start_date'] - pd.Timedelta(num_days_prediction, 'days')

time_per_iter = 120        # length of one window, in days
num_backward_iters = 9     # fixed-length windows before the psychosis diagnosis
iter_length = pd.Timedelta(time_per_iter, 'days')
censor = df_pop['censor_date']
first_visit = df_pop['first_visit']

# Window 0 starts at the psychosis diagnosis and is cut off at the censor date.
# Start dates are inclusive, end dates exclusive. Rows where a date comparison involves
# NaT never satisfy the mask condition and are therefore left untouched by .mask().
first_end = df_pop['psychosis_diagnosis_date'] + iter_length
df_pop['0_start'] = df_pop['psychosis_diagnosis_date']
df_pop['0_end'] = first_end.mask(first_end > censor, censor)

# FORWARD LOOP: starting at psychosis dx, every time_per_iter days till the censor date.
# Zero-length windows (start == end) are cleared in a later cell.
num_forward_iters = int(np.ceil(((censor - df_pop['psychosis_diagnosis_date']).dt.days / time_per_iter).max()))
for count in range(1, num_forward_iters):
    # each window starts where the previous one ended
    start = df_pop[str(count-1)+'_end']
    end = start + iter_length

    # neither boundary may pass the censor date
    start = start.mask(start > censor, censor)
    end = end.mask(end > censor, censor)

    # a window that would start at the censor date does not exist for that person
    exhausted = start == censor
    df_pop[str(count)+'_start'] = start.mask(exhausted)
    df_pop[str(count)+'_end'] = end.mask(exhausted)

# BACKWARD LOOP: fixed-length windows going back in time from the psychosis diagnosis.
for count in range(-1, -(num_backward_iters + 1), -1):
    end = df_pop[str(count+1)+'_start']
    start = end - iter_length

    # neither boundary may precede the person's first visit
    start = start.mask(start < first_visit, first_visit)
    end = end.mask(end < first_visit, first_visit)

    # a window that would end at the first visit does not exist for that person
    exhausted = end == first_visit
    df_pop[str(count)+'_start'] = start.mask(exhausted)
    df_pop[str(count)+'_end'] = end.mask(exhausted)

# The earliest window collects ALL remaining history: it ends where the last backward
# window starts and (if that window exists) begins at the first visit.
earliest = str(-(num_backward_iters + 1))
df_pop[earliest+'_end'] = df_pop[str(-num_backward_iters)+'_start']
df_pop[earliest+'_start'] = first_visit.where(df_pop[earliest+'_end'].notna())

In [None]:
# Inspect the most recently added window columns ('<iteration>_start' / '<iteration>_end').
df_pop.columns[-50:]

In [None]:
# make sure all "timesteps" are at least 1 day: a window whose end is not after its start
# is blanked out (NaT) so the person is skipped for that iteration
for i in np.arange(-10, 30):
    start_col, end_col = str(i)+'_start', str(i)+'_end'
    # the number of forward windows depends on the data, so some iterations may not exist
    if start_col not in df_pop.columns or end_col not in df_pop.columns:
        continue
    empty_window = df_pop[end_col] <= df_pop[start_col]
    df_pop.loc[empty_window, [start_col, end_col]] = [np.datetime64('NaT'), np.datetime64('NaT')]

### Loop through the iterative population dataframe to get the features for each person in each iteration

In [None]:
def _events_in_window(events, iter_pop, date_col):
    """Keep point-in-time events whose `date_col` falls in [iter_start_date, iter_end_date)."""
    events = events.loc[events['person_id'].isin(iter_pop['person_id'])]
    events = events.merge(iter_pop[['person_id', 'iter_start_date', 'iter_end_date']], how='left', on='person_id')
    in_window = (events[date_col] >= events['iter_start_date']) & (events[date_col] < events['iter_end_date'])
    return events.loc[in_window]


def _spans_in_window(spans, iter_pop, start_col, end_col):
    """Keep spans (drug eras, visits) overlapping the window and clip them to the window.

    The exclusive iter_end_date is turned into an inclusive 'equal_end_date' (one day
    earlier), because span end dates are themselves inclusive.
    """
    spans = spans.loc[spans['person_id'].isin(iter_pop['person_id'])]
    spans = spans.merge(iter_pop[['person_id', 'iter_start_date', 'iter_end_date']], how='left', on='person_id')
    spans['equal_end_date'] = spans['iter_end_date'] - pd.Timedelta(1, 'days')

    # 1. span must start before the window ends; 2. span must end on/after the window start
    overlaps = (spans[start_col] < spans['iter_end_date']) & (spans[end_col] >= spans['iter_start_date'])
    spans = spans.loc[overlaps].copy()

    # 3./4. clip the span to the window boundaries
    spans[start_col] = spans[start_col].mask(spans[start_col] < spans['iter_start_date'], spans['iter_start_date'])
    spans[end_col] = spans[end_col].mask(spans[end_col] > spans['equal_end_date'], spans['equal_end_date'])

    spans['days_to_cohort_start'] = (spans['cohort_start_date'] - spans[start_col]).dt.days
    return spans


list_feature_dfs = []

for iteration in np.arange(-10, 30):
    # the number of forward windows depends on the data, so some iterations may not exist
    if str(iteration)+'_start' not in df_pop.columns:
        continue

    temp_df_iter_pop = df_pop.copy()
    temp_df_iter_pop['iter_start_date'] = temp_df_iter_pop[str(iteration)+'_start']
    temp_df_iter_pop['iter_end_date'] = temp_df_iter_pop[str(iteration)+'_end']

    # constrict to people with a valid iteration
    temp_df_iter_pop = temp_df_iter_pop.loc[temp_df_iter_pop['iter_start_date'].notna()].copy()
    if temp_df_iter_pop.empty:
        continue
    temp_df_iter_pop['years_obs'] = (temp_df_iter_pop['iter_end_date']-temp_df_iter_pop['iter_start_date']).dt.days/365

    # for conditions, labs, procedures, just compare the event date to the window
    temp_conds = _events_in_window(all_conds, temp_df_iter_pop, 'condition_start_date')
    temp_labs = _events_in_window(all_labs, temp_df_iter_pop, 'measurement_date')
    temp_procedures = _events_in_window(all_procedures, temp_df_iter_pop, 'procedure_date')

    # medications and visits span a period, so keep the overlap with the window
    temp_meds = _spans_in_window(all_meds, temp_df_iter_pop, 'drug_era_start_date', 'drug_era_end_date')
    temp_visits = _spans_in_window(all_visits, temp_df_iter_pop, 'visit_start_date', 'visit_end_date')

    # Leakage checks: nothing may lie outside the window after filtering/clipping
    event_checks = [
        ('Leakage in conds', temp_conds, 'condition_start_date'),
        ('Leakage in labs', temp_labs, 'measurement_date'),
        ('Leakage in procedures', temp_procedures, 'procedure_date'),
    ]
    for message, frame, date_col in event_checks:
        if ((frame[date_col] >= frame['iter_end_date']) | (frame[date_col] < frame['iter_start_date'])).any():
            print(message)

    span_checks = [
        ('Leakage in med ends', 'Leakage in med starts', temp_meds, 'drug_era_start_date', 'drug_era_end_date'),
        ('Leakage in visit ends', 'Leakage in visit starts', temp_visits, 'visit_start_date', 'visit_end_date'),
    ]
    for end_message, start_message, frame, start_col, end_col in span_checks:
        if ((frame[start_col] > frame['iter_end_date']) | (frame[end_col] > frame['iter_end_date'])).any():
            print(end_message)
        if ((frame[end_col] < frame['iter_start_date']) | (frame[start_col] < frame['iter_start_date'])).any():
            print(start_message)

    all_features = make_static_df(temp_df_iter_pop, temp_conds, temp_meds, temp_visits, temp_procedures, temp_labs)
    all_features['iteration'] = iteration

    # add: time since psychosis = iter_start_date-psychosis_diagnosis_date (in years)
    if iteration > 0:
        # all_features is indexed by person_id, so the values must be indexed the same way;
        # indexed by the population's row labels they would misalign and end up as NaN.
        # Assumes one row per person_id in df_pop.
        iter_pop_by_person = temp_df_iter_pop.set_index('person_id')
        all_features['time_since_psychosis'] = (iter_pop_by_person['iter_start_date']-iter_pop_by_person['psychosis_diagnosis_date']).dt.days/365
    else:
        all_features['time_since_psychosis'] = 0

    list_feature_dfs.append(all_features)
    print(iteration)

In [None]:
# Stack the per-iteration feature frames; a person appears once per iteration they have data in,
# and features absent from an iteration become NaN (zero-filled in a later cell).
df_all_iters = pd.concat(list_feature_dfs)

In [None]:
if ccae_validation_set:
    # Add every reference column that this dataset lacks, filled with 0. The order of
    # list_mdcd_cols is preserved (instead of going through an unordered set) so the
    # column order of the saved file is the same on every run.
    existing_cols = set(df_all_iters.columns)
    missing_cols = list(dict.fromkeys(col for col in list_mdcd_cols if col not in existing_cols))
    if missing_cols:
        df_all_iters.loc[:, missing_cols] = 0

In [None]:
# Rank each patient's iterations in ascending order; rank(method='first') is 1-based,
# so the earliest iteration of a patient gets 1.0 and the latest gets that patient's count.
iterations_by_person = df_all_iters.reset_index().groupby('person_id')['iteration']
ranked_vals = iterations_by_person.rank(method='first').values
df_all_iters['ranked_iteration'] = ranked_vals

# Features that never occurred for a person/iteration count as 0.
df_all_iters = df_all_iters.fillna(0)

print(df_all_iters['ranked_iteration'].max())
df_all_iters.to_csv(int_path + 'CCAE_11_15_dl_data_snomed_individualfeats.csv')