In [None]:
import pickle
import random

import gensim
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import torch
from gensim.models import Word2Vec
from gensim.models.callbacks import CallbackAny2Vec
from scipy.stats import pearsonr
from tqdm.notebook import tqdm

In [None]:
# Load the CSV extracts of the OMOP tables, dropping the exported index column
# and any row whose concept id is 0.
conds = pd.read_csv('./data/raw_data/EHR/EHR_cohort_conditions.csv').drop(columns='Unnamed: 0')
conds = conds.loc[conds['condition_concept_id'] != 0]
drugs = pd.read_csv('./data/raw_data/EHR/EHR_cohort_drugs.csv').drop(columns='Unnamed: 0')
drugs = drugs.loc[drugs['drug_concept_id'] != 0]
procs = pd.read_csv('./data/raw_data/EHR/EHR_cohort_procedures.csv').drop(columns='Unnamed: 0')
procs = procs.loc[procs['procedure_concept_id'] != 0]
obs = pd.read_csv('./data/raw_data/EHR/EHR_cohort_observations.csv').drop(columns='Unnamed: 0')
obs = obs.loc[obs['observation_concept_id'] != 0]

measurements = pd.read_csv('./data/raw_data/EHR/EHR_cohort_measurements.csv').drop(columns='Unnamed: 0')
# Measurements are kept only when they carry a numeric value.
measurements = measurements.loc[measurements['value_as_number'].notna()]
measurements = measurements.loc[measurements['measurement_concept_id'] != 0]

# Parse each table's event timestamp and the child's birth date into datetimes.
for ehr_table, event_time_col in (
    (conds, 'condition_start_DATETIME'),
    (procs, 'procedure_DATETIME'),
    (drugs, 'drug_exposure_start_DATETIME'),
    (obs, 'observation_DATETIME'),
    (measurements, 'measurement_DATETIME'),
):
    ehr_table[event_time_col] = pd.to_datetime(ehr_table[event_time_col])
    ehr_table['child_birth_date'] = pd.to_datetime(ehr_table['child_birth_date'])

In [None]:
# Person ids of the mothers in the omics cohort, as a plain array for isin() filtering.
OOL_cohort_omop = pd.read_csv('./data/ool_EHR_features.csv').loc[:, 'mom_person_id'].values

In [None]:
# Keep only the EHR rows that belong to mothers in the omics cohort.
def _restrict_to_omics_cohort(ehr_table):
    """Return the rows of ehr_table whose mom_person_id appears in OOL_cohort_omop."""
    return ehr_table[ehr_table['mom_person_id'].isin(OOL_cohort_omop)]

conds = _restrict_to_omics_cohort(conds)
drugs = _restrict_to_omics_cohort(drugs)
procs = _restrict_to_omics_cohort(procs)
measurements = _restrict_to_omics_cohort(measurements)
obs = _restrict_to_omics_cohort(obs)

In [None]:
# OMOP concept table, used to attach human-readable concept details to the coded data.
# NOTE(review): this path points into a sibling '../cancer' directory, unlike the other inputs.
concepts = pd.read_csv('../cancer/data/raw_data/EHR/concepts.csv').drop(columns='Unnamed: 0')

In [None]:
# Attach the concept table's columns to every drug row (matched on the drug concept id).
# Re-running this cell merges the concept columns in a second time.
drugs = pd.merge(
    drugs,
    concepts,
    how='left',
    left_on='drug_concept_id',
    right_on='concept_id',
)

In [None]:
def filter_df(df, birth_time, time_col, time_range_days=280):
    """
    Keep only the events that fall inside a look-back window ending at delivery.

    df: The dataframe of events to filter. Must contain a column called person_id,
        which is matched against maternal_OMOP in birth_time.
    birth_time: A dataframe that contains two columns: maternal_OMOP and birth_DATETIME
    time_col: The label of the column in df holding the event datetime
    time_range_days: An event is kept when the whole-day part of
        (birth_DATETIME - event time) is at least 1 and at most time_range_days

    Returns the kept rows of the inner join of df and birth_time, without the
    maternal_OMOP column and with birth_DATETIME and diff (the time difference)
    added. The number of distinct person_id values before and after is printed.
    """
    patients_before = len(df['person_id'].unique())
    print(f'There were {patients_before} patients before filtering.')

    joined = df.merge(birth_time, how='inner', left_on='person_id', right_on='maternal_OMOP')
    joined['diff'] = joined['birth_DATETIME'] - joined[time_col]

    # Whole days between the event and delivery; positive means the event came first.
    days_before_birth = joined['diff'].dt.days
    inside_window = days_before_birth.between(1, time_range_days)
    new_df = joined[inside_window].drop('maternal_OMOP', axis=1)

    patients_after = len(new_df['person_id'].unique())
    print(f'There were {patients_after} patients after filtering.')
    return new_df

def generate_features_EHR_cohort(proteomics, input_df, time_col_name, concept_id_col, indicator, binary=True):
    """
    Attach EHR events to each sample and keep the events dated before the sample's DOS.

    proteomics: dataframe with DOS, mom_person_id, child_person_id and sample_ID columns
    input_df: EHR table with mom_person_id, child_person_id, child_birth_date and
        the event datetime column named by time_col_name
    time_col_name: label of the event datetime column in input_df
    concept_id_col, indicator, binary: accepted but not used by this function

    Returns the left join of the samples with their events plus a delta column
    (whole days from child_birth_date to the event), restricted to rows where
    delta < DOS. Samples with no matching event have a missing delta and are dropped.
    """
    sample_keys = proteomics[['DOS','mom_person_id','child_person_id','sample_ID']]
    events = sample_keys.merge(input_df, how='left', on=['mom_person_id','child_person_id'])

    days_from_birth = (events[time_col_name] - events['child_birth_date']).dt.days
    events['delta'] = days_from_birth

    before_cutoff = events['delta'] < events['DOS']
    return events[before_cutoff]
    

In [None]:
# Key file used to map proteomics samples to the mother's person_id.
patient_indices = pd.read_csv('./data/processed_data/sampleID_indices.csv').drop(columns='Unnamed: 0')

In [None]:
#load and clean proteomics data
# Only three source columns feed the result, so read just those instead of loading
# every column and then dropping/renaming the rest.
proteomics_raw = pd.read_csv(
    './data/processed_data/ool_proteomics_omop_id.csv',
    usecols=['maternal_person_id', 'Timepoint', 'DOS'],
)
OOL_proteomics = pd.DataFrame({
    # sample_ID is "<maternal_person_id>_<Timepoint>"
    'sample_ID': proteomics_raw['maternal_person_id'].astype(str) + '_' + proteomics_raw['Timepoint'].astype(str),
    'DOS_sampling_time': proteomics_raw['DOS'],
    # Take the id straight from its column rather than slicing the first 7 characters
    # of sample_ID, which only works when every id is exactly 7 characters long.
    'mom_person_id': proteomics_raw['maternal_person_id'].astype(int),
})
del proteomics_raw

In [None]:
## This block of code creates a dataframe with mom_person_id, child_person_id, min_delta, max_delta
## (based on the range of EHR data available), days to onset, and a combined sample_ID col which is used as an identifier

# Fixed widths used when slicing sample IDs.
# NOTE(review): 15 presumably corresponds to "<7-char mom id>_<7-char child id>" and 3 to
# the trailing timepoint tag of the proteomics sample_ID -- confirm against the ID format.
SAMPLE_ID_PREFIX_LEN = 15
TIMEPOINT_SUFFIX_LEN = 3

# Days from the child's birth date to each condition record. This is computed as a
# standalone Series so the shared `conds` table is not modified through an alias.
time_col_name = 'condition_start_DATETIME'
condition_delta = (conds[time_col_name] - conds['child_birth_date']).dt.days

# Earliest and latest condition day for every mother/child pair.
ool = condition_delta.groupby([conds['mom_person_id'], conds['child_person_id']]).agg(['min', 'max'])
ool.columns = ['min_delta', 'max_delta']

# Keep pairs whose condition records span at least 7 days.
sampling_df = ool[ool['max_delta'] - ool['min_delta'] >= 7].reset_index()

# Initial sample_ID: "<mom_person_id>_<child_person_id>"
sampling_df['sample_ID'] = sampling_df['mom_person_id'].astype(str) + '_' + sampling_df['child_person_id'].astype(str)

# Keep only pairs whose ID prefix appears in the key file.
OOL_sample_IDs = np.unique([sample_id[:SAMPLE_ID_PREFIX_LEN] for sample_id in patient_indices['0']])
sampling_df = sampling_df[sampling_df['sample_ID'].str[:SAMPLE_ID_PREFIX_LEN].isin(OOL_sample_IDs)]

# One row per (mother/child pair, proteomics sample). The join is on mom_person_id only,
# so each proteomics sample of a mother is paired with each of her children in sampling_df.
columns_to_drop = ['sample_ID_x', 'sample_ID_y', 'DOS_sampling_time']
sampling_df = (
    sampling_df
    .merge(OOL_proteomics, how='inner', on='mom_person_id', suffixes=('_x', '_y'))
    .assign(
        # DOS is the proteomics sampling value
        DOS=lambda merged: merged['DOS_sampling_time'],
        # Final sample_ID: the pair ID followed by the proteomics sample's trailing tag
        sample_ID=lambda merged: merged['sample_ID_x'] + merged['sample_ID_y'].str[-TIMEPOINT_SUFFIX_LEN:],
    )
    .drop(columns_to_drop, axis=1, errors='ignore')
)

In [None]:
# Join every EHR table to the samples and keep the rows dated before each sample's DOS.
# NOTE(review): generate_features_EHR_cohort only applies the upper bound (delta < DOS);
# a "beginning of pregnancy" lower bound is not enforced there -- confirm this is intended.
condition_features_EHR = generate_features_EHR_cohort(
    proteomics=sampling_df, input_df=conds,
    time_col_name='condition_start_DATETIME', concept_id_col='condition_concept_id', indicator='C',
)
procedure_features_EHR = generate_features_EHR_cohort(
    proteomics=sampling_df, input_df=procs,
    time_col_name='procedure_DATETIME', concept_id_col='procedure_concept_id', indicator='P',
)
drug_features_EHR = generate_features_EHR_cohort(
    proteomics=sampling_df, input_df=drugs,
    time_col_name='drug_exposure_start_DATETIME', concept_id_col='drug_concept_id', indicator='D',
)
measurement_features_EHR = generate_features_EHR_cohort(
    proteomics=sampling_df, input_df=measurements,
    time_col_name='measurement_DATETIME', concept_id_col='measurement_concept_id', indicator='M',
)
observation_features_EHR = generate_features_EHR_cohort(
    proteomics=sampling_df, input_df=obs,
    time_col_name='observation_DATETIME', concept_id_col='observation_concept_id', indicator='O',
)


In [None]:
class EpochLogger(CallbackAny2Vec):
    """Training callback that prints a line at the start and at the end of every epoch."""

    def __init__(self):
        # Zero-based number of the epoch being reported.
        self.epoch = 0

    def on_epoch_begin(self, model):
        """Announce the epoch that is about to run."""
        print("Starting epoch #{}".format(self.epoch))

    def on_epoch_end(self, model):
        """Announce the epoch that just finished, then advance the counter."""
        print("Finished epoch #{}".format(self.epoch))
        self.epoch += 1

In [None]:
# Train word2vec model (or load a previously saved one)
# NOTE(review): the earlier note on this cell said training skips the date filtering and uses
# all pregnancy data, but the *_features_EHR frames used below are outputs of
# generate_features_EHR_cohort, which keeps only rows with delta < DOS. Confirm which is intended.
# Alternative pre-trained model (full pregnancy cohort):
#   "./models/word2vec/word2vec_full_pregnancy_cohort_measurements_observations_full_pregnancy_sampling_400dim.model"
W2V_MODEL_PATH = "./models/word2vec_OOL_cohort_measurements_observations_full_pregnancy_sampling_400dim.model"

try:
    model = Word2Vec.load(W2V_MODEL_PATH)
except FileNotFoundError:
    # Only a missing model file triggers retraining; any other load failure is surfaced
    # instead of being silently replaced by a freshly trained model.
    epoch_logger = EpochLogger()

    # (table, concept id column, event datetime column) for every EHR domain
    word2vec_sources = [
        (condition_features_EHR, 'condition_concept_id', 'condition_start_DATETIME'),
        (procedure_features_EHR, 'procedure_concept_id', 'procedure_DATETIME'),
        (drug_features_EHR, 'drug_concept_id', 'drug_exposure_start_DATETIME'),
        (measurement_features_EHR, 'measurement_concept_id', 'measurement_DATETIME'),
        (observation_features_EHR, 'observation_concept_id', 'observation_DATETIME'),
    ]

    # Stack all domains into one long table: sample_ID, concept_id, ts
    word2vec_data = pd.concat(
        [
            source[['sample_ID', concept_col, time_col]].rename(
                columns={concept_col: 'concept_id', time_col: 'ts'}
            )
            for source, concept_col, time_col in word2vec_sources
        ],
        axis=0,
    )

    # Reduce timestamps to calendar dates, drop rows without a code, store codes as ints.
    word2vec_data = (
        word2vec_data
        .assign(date=lambda frame: pd.to_datetime(frame['ts']).dt.date)
        .drop(columns='ts')
        .dropna(subset=['concept_id'])
        .astype({'concept_id': int})
    )

    # One "sentence" per sample and day: that day's codes in random order.
    # NOTE: the shuffle is unseeded, so sentences differ between runs.
    grouped_data = word2vec_data.groupby(['sample_ID', 'date'])
    sentences = []
    for _, group in tqdm(grouped_data):
        codes = group['concept_id'].tolist()
        random.shuffle(codes)
        sentences.append(codes)

    print('starting training')
    # Passing `sentences` to the constructor builds the vocabulary AND trains, so the
    # epoch count and callback go here. The previous code followed the constructor with
    # a separate model.train(..., epochs=20), which trained a second time on top of the
    # constructor's default-epoch pass with the learning rate restarted.
    model = Word2Vec(
        sentences,
        vector_size=400,
        window=1000,
        min_count=5,
        workers=64,
        epochs=20,
        callbacks=[epoch_logger],
    )
    model.save(W2V_MODEL_PATH)


In [None]:
# Lookup table from each code in the trained vocabulary to its learned vector.
code_to_embedding = {}
for code in model.wv.index_to_key:
    code_to_embedding[code] = model.wv[code]

In [None]:
# Map condition rows to their learned word2vec embedding.
# Rows with a missing or zero concept id are excluded; codes absent from the
# vocabulary get None as their embedding.
has_condition_code = (
    (condition_features_EHR['condition_concept_id'] != 0)
    & condition_features_EHR['condition_concept_id'].notna()
)
embedded_conds = condition_features_EHR[has_condition_code]
embedded_conds['embedding'] = list(map(code_to_embedding.get, embedded_conds['condition_concept_id']))


In [None]:
# Map procedure rows to their learned word2vec embedding.
# Rows with a missing or zero concept id are excluded; codes absent from the
# vocabulary get None as their embedding.
has_procedure_code = (
    (procedure_features_EHR['procedure_concept_id'] != 0)
    & procedure_features_EHR['procedure_concept_id'].notna()
)
embedded_procs = procedure_features_EHR[has_procedure_code]
embedded_procs['embedding'] = list(map(code_to_embedding.get, embedded_procs['procedure_concept_id']))


In [None]:
# Map drug rows to their learned word2vec embedding.
# Rows with a missing or zero concept id are excluded; codes absent from the
# vocabulary get None as their embedding.
has_drug_code = (
    (drug_features_EHR['drug_concept_id'] != 0)
    & drug_features_EHR['drug_concept_id'].notna()
)
embedded_drugs = drug_features_EHR[has_drug_code]
embedded_drugs['embedding'] = list(map(code_to_embedding.get, embedded_drugs['drug_concept_id']))


In [None]:
# Map measurement rows to their learned word2vec embedding.
# Rows with a missing or zero concept id are excluded; codes absent from the
# vocabulary get None as their embedding.
has_measurement_code = (
    (measurement_features_EHR['measurement_concept_id'] != 0)
    & measurement_features_EHR['measurement_concept_id'].notna()
)
embedded_measurements = measurement_features_EHR[has_measurement_code]
embedded_measurements['embedding'] = list(map(code_to_embedding.get, embedded_measurements['measurement_concept_id']))


In [None]:
# Map observation rows to their learned word2vec embedding.
# Rows with a missing or zero concept id are excluded; codes absent from the
# vocabulary get None as their embedding.
has_observation_code = (
    (observation_features_EHR['observation_concept_id'] != 0)
    & observation_features_EHR['observation_concept_id'].notna()
)
embedded_obs = observation_features_EHR[has_observation_code]
embedded_obs['embedding'] = list(map(code_to_embedding.get, embedded_obs['observation_concept_id']))


In [None]:
# Add a 'date' column to every embedded table: the event timestamp reduced to its calendar day.
for embedded_table, event_time_col in (
    (embedded_procs, 'procedure_DATETIME'),
    (embedded_conds, 'condition_start_DATETIME'),
    (embedded_drugs, 'drug_exposure_start_DATETIME'),
    (embedded_measurements, 'measurement_DATETIME'),
    (embedded_obs, 'observation_DATETIME'),
):
    embedded_table['date'] = pd.to_datetime(embedded_table[event_time_col].dt.date)


In [None]:
%%time
#combine all EHR data tables together
# Drop rows with no embedding (the lookup returned None for that code).
embedded_conds = embedded_conds[embedded_conds['embedding'].notna()]
embedded_procs = embedded_procs[embedded_procs['embedding'].notna()]
embedded_drugs = embedded_drugs[embedded_drugs['embedding'].notna()]
embedded_measurements = embedded_measurements[embedded_measurements['embedding'].notna()]
embedded_obs = embedded_obs[embedded_obs['embedding'].notna()]

# Only these three columns survive into all_data, so select them from each table up
# front. This replaces per-table hard-coded drop lists, which failed with KeyError if
# any listed column was absent and made concat build the union of every leftover column.
shared_columns = ['sample_ID', 'date', 'embedding']
all_data = pd.concat(
    [
        embedded_table[shared_columns]
        for embedded_table in (
            embedded_conds,
            embedded_procs,
            embedded_drugs,
            embedded_measurements,
            embedded_obs,
        )
    ],
    ignore_index=True,
)

# Expand the per-row embedding vectors into one numeric column per dimension.
expanded_embedding_df = pd.DataFrame(all_data['embedding'].tolist())
print('done making interim dataframe')
# Both frames carry the same 0..n-1 index, so they line up row for row.
all_data = pd.concat([all_data.drop('embedding', axis=1), expanded_embedding_df], axis=1)


In [None]:
%%time
# Patient-day embeddings: average every embedding dimension over all codes
# recorded for the same sample on the same date.
patient_day_keys = ['sample_ID', 'date']
patient_day_embeddings = all_data.groupby(patient_day_keys).agg('mean')

In [None]:
# Turn the (sample_ID, date) group keys back into ordinary columns.
patient_day_embeddings = patient_day_embeddings.reset_index(drop=False)

In [None]:
# Order each sample's days chronologically; the tensor fill below assigns
# time steps in row order.
patient_day_embeddings = (
    patient_day_embeddings
    .assign(date=pd.to_datetime(patient_day_embeddings['date']))
    .sort_values(by=['sample_ID', 'date'])
)

In [None]:
# Number of distinct samples (patient axis of the tensor).
unique_patients = patient_day_embeddings['sample_ID'].dropna().unique().size
# Every column except sample_ID and date is an embedding dimension.
num_features = patient_day_embeddings.shape[1] - 2


In [None]:
# Longest per-sample sequence of days actually present in the data. It is kept under its
# own name for reference; previously it was assigned to max_dates and immediately overwritten.
observed_max_dates = patient_day_embeddings.groupby('sample_ID')['date'].count().max()

# Fixed number of time steps in the RNN input. Samples with more days than this are
# truncated when the tensor is filled.
max_dates = 32

In [None]:
# Give every sample_ID a position (0, 1, 2, ...) along the patient axis of the
# input matrix, in order of first appearance.
ordered_sample_ids = patient_day_embeddings['sample_ID'].unique()
patient_id_to_index = dict(zip(ordered_sample_ids, range(len(ordered_sample_ids))))


In [None]:
# Input tensor laid out as (feature, day position, patient); NaN marks cells with no data.
rnn_shape = (num_features, max_dates, unique_patients)
RNN_data = np.full(shape=rnn_shape, fill_value=np.nan)


In [None]:
%%time
#fill in input data matrix with person-day EHR data embeddings
# Vectorised replacement for a row-by-row, feature-by-feature Python loop: every
# patient-day row is written with a single indexed assignment.
# Expects RNN_data in its (feature, day position, patient) layout, i.e. before the transpose.
feature_columns = patient_day_embeddings.columns.drop(['sample_ID', 'date'])

# Position of each row within its sample's sequence (0 for the first row of a sample,
# 1 for the next, ...), following the current row order of patient_day_embeddings.
day_positions = patient_day_embeddings.groupby('sample_ID').cumcount().to_numpy()
patient_positions = patient_day_embeddings['sample_ID'].map(patient_id_to_index).to_numpy()

# Rows at position >= max_dates do not fit in the tensor and are skipped.
fits_in_tensor = day_positions < max_dates
feature_values = patient_day_embeddings.loc[fits_in_tensor, feature_columns].to_numpy(dtype=float)
RNN_data[:, day_positions[fits_in_tensor], patient_positions[fits_in_tensor]] = feature_values.T

# Last day position seen for every sample, as the loop-based version left behind.
date_position = (patient_day_embeddings.groupby('sample_ID').size() - 1).to_dict()


In [None]:
# Shape check before the transpose: the array was created as (num_features, max_dates, unique_patients)
RNN_data.shape

In [None]:
# Reverse the axis order: (feature, day, patient) -> (patient, day, feature).
# Re-running this cell reverses the axes back again.
RNN_data = np.transpose(RNN_data, axes=(2, 1, 0))

In [None]:
# Shape check after the transpose(2,1,0) above: axes are now in reversed order
RNN_data.shape

In [None]:
# Align outcome data with the tensor: attach every sample's patient index.
# The lookup frame has integer column labels 0 (sample_ID) and 1 (index); the right
# join keeps exactly the samples present in patient_id_to_index, in that order.
index_lookup = pd.DataFrame(
    [list(patient_id_to_index.keys()), list(patient_id_to_index.values())]
).T
sampling_df = pd.merge(sampling_df, index_lookup, how='right', left_on='sample_ID', right_on=0)


In [None]:
# Attach the number of patient-day rows available for each sample; the count lands
# in a column named 'date'.
days_per_sample = patient_day_embeddings.groupby('sample_ID').count()[['date']]
sampling_df = sampling_df.merge(days_per_sample, how='left', on='sample_ID')

In [None]:
# DOS values ordered by patient index (column 1), matching the patient axis of RNN_data.
outcomes_by_index = sampling_df[['DOS', 1]].sort_values(by=1)
DOS_outcomes = np.array(outcomes_by_index['DOS'])

In [None]:
# Shape check of inputs and outcomes.
# NOTE(review): the first dimension of both should presumably be equal (one outcome per patient) -- confirm.
RNN_data.shape, DOS_outcomes.shape

In [None]:
#Save processed data below

In [None]:
# Persist the embedded patient-day input array.
np.save('./data/processed_data/RNN_data_codes_with_obs_word2vec_from_ool.npy', RNN_data)

In [None]:
# Persist the DOS outcome vector (ordered by patient index).
np.save('./data/processed_data/RNN_data_outcomes_with_obs_word2vec_from_ool.npy', DOS_outcomes)

In [None]:
# Outcomes as a float tensor.
patient_outcomes = torch.tensor(DOS_outcomes).float()

# Number of patient-days per sample, ordered by patient index (column 1) exactly like
# DOS_outcomes, and capped at the sequence length of RNN_data (max_dates) instead of a
# separately hard-coded 32.
num_patient_visits = np.minimum(np.array(sampling_df.sort_values(1)['date']), max_dates)

In [None]:
# Persist the per-sample sequence lengths.
np.save('./data/processed_data/RNN_data_lengths_with_obs_word2vec_from_ool.npy', num_patient_visits)

In [None]:
# Persist the sample_ID -> patient index mapping as a two-column table
# (column 0: sample_ID, column 1: index).
sample_index_table = pd.DataFrame(
    [list(patient_id_to_index.keys()), list(patient_id_to_index.values())]
).T
sample_index_table.to_csv('./data/processed_data/sampleID_indices_with_obs_word2vec_from_ool.csv')
