In [1]:
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
from scipy.stats import pearsonr
import gensim
from gensim.models import Word2Vec
from gensim.models.callbacks import CallbackAny2Vec
import random
import pickle
import matplotlib.pyplot as plt
import seaborn as sns
import dask
import dask.dataframe as dd

In [2]:
%%time
#load data
#the person_id, concept_id, and date columns are extracted from OMOP tables for people who delivered babies at Stanford
def _load_ehr_table(path, concept_col, time_col):
    """Read one EHR extract, drop concept-id-0 rows and parse its date columns.

    path: CSV file to read.
    concept_col: name of the concept-id column; rows where it equals 0 are dropped.
    time_col: name of the event timestamp column, converted with pd.to_datetime.

    Returns a new DataFrame in which `time_col` and `child_birth_date` are datetimes.
    """
    table = pd.read_csv(path)
    # .copy() so the assignments below write to an independent frame rather than
    # to a filtered slice of the raw table (avoids SettingWithCopyWarning).
    table = table[table[concept_col] != 0].copy()
    table[time_col] = pd.to_datetime(table[time_col])
    table['child_birth_date'] = pd.to_datetime(table['child_birth_date'])
    return table


conds = _load_ehr_table('./data/raw_data/EHR/full_EHR_cohort_conditions.csv',
                        'condition_concept_id', 'condition_start_DATETIME')
drugs = _load_ehr_table('./data/raw_data/EHR/full_EHR_cohort_drugs.csv',
                        'drug_concept_id', 'drug_exposure_start_DATETIME')
procs = _load_ehr_table('./data/raw_data/EHR/full_EHR_cohort_procedures.csv',
                        'procedure_concept_id', 'procedure_DATETIME')
obs = _load_ehr_table('./data/raw_data/EHR/full_EHR_cohort_observations.csv',
                      'observation_concept_id', 'observation_DATETIME')

CPU times: user 19.3 ms, sys: 4.52 ms, total: 23.8 ms
Wall time: 22.1 ms


In [3]:
%%time
# Measurements: keep rows that have a numeric value and a non-zero concept id,
# then parse the two timestamp columns.
measurements = (
    pd.read_csv('./data/raw_data/EHR/full_EHR_cohort_measurements.csv')
    .loc[lambda frame: frame['value_as_number'].notna()]
    .loc[lambda frame: frame['measurement_concept_id'] != 0]
    .assign(
        measurement_DATETIME=lambda frame: pd.to_datetime(frame['measurement_DATETIME']),
        child_birth_date=lambda frame: pd.to_datetime(frame['child_birth_date']),
    )
)

CPU times: user 8.55 ms, sys: 0 ns, total: 8.55 ms
Wall time: 6.78 ms


In [4]:
# IDs of mothers who have omics data; these people are EXCLUDED from the
# pre-training cohort in the next step.
OOL_cohort_omop = pd.read_csv('./data/ool_EHR_features.csv').loc[:, 'mom_person_id'].values

In [5]:
# Drop every EHR row that belongs to a mother in the omics cohort.
conds, procs, drugs, measurements, obs = (
    table[~table['mom_person_id'].isin(OOL_cohort_omop)]
    for table in (conds, procs, drugs, measurements, obs)
)

In [6]:
# Number of distinct mothers left in the conditions table after the exclusion.
conds.loc[:, 'mom_person_id'].nunique()

230

In [7]:

def filter_df(df, birth_time, time_col, time_range_days=280):
    """
    Keep only the events that fall inside a window before the birth.

    df: The dataframe to filter. Must contain a column called person_id with the OMOP ID of the mother
    birth_time: A dataframe that contains two columns: maternal_OMOP and birth_DATETIME
    time_col: The name of the column in df that holds the date of the event
    time_range_days: An event is kept when (birth_DATETIME - event date), in whole days,
        is greater than 0 and at most time_range_days

    Returns the merged frame restricted to that window, with the extra columns
    birth_DATETIME and diff, and without maternal_OMOP. Patient counts before and
    after filtering are printed.
    """
    n_before = df['person_id'].nunique(dropna=False)
    print('There were {} patients before filtering.'.format(n_before))

    # The inner join drops people that have no recorded birth time.
    merged = df.merge(birth_time, how='inner', left_on='person_id', right_on='maternal_OMOP')
    merged['diff'] = merged['birth_DATETIME'] - merged[time_col]

    days_before_birth = merged['diff'].dt.days
    in_window = (days_before_birth > 0) & (days_before_birth <= time_range_days)
    new_df = merged.loc[in_window].drop(columns='maternal_OMOP')

    n_after = new_df['person_id'].nunique(dropna=False)
    print('There were {} patients after filtering.'.format(n_after))
    return new_df

#function to help with appropriate sample / patient labeling
def generate_features_EHR_cohort(proteomics, input_df, time_col_name, concept_id_col, indicator, binary=True):
    """
    Attach EHR events to each sampled pregnancy and keep those before the sampling day.

    proteomics: dataframe with columns DOS, mom_person_id and child_person_id
        (one row per sample; DOS is compared against days relative to birth)
    input_df: EHR table with mom_person_id, child_person_id, child_birth_date
        and the column named by time_col_name
    time_col_name: name of the event timestamp column in input_df
    concept_id_col, indicator, binary: accepted for call compatibility; not used here

    Returns a new dataframe with the merged rows whose `delta`
    (event date - child_birth_date, in whole days) is strictly less than DOS,
    plus a `sample_ID` column of the form "<mom_person_id>_<child_person_id>".
    Samples with no matching event have a missing delta and are therefore dropped.
    """
    key_cols = ['mom_person_id', 'child_person_id']
    df = proteomics[['DOS'] + key_cols].merge(input_df, how='left', on=key_cols)
    df['delta'] = (df[time_col_name] - df['child_birth_date']).dt.days
    # .copy() makes the filtered result an independent frame, so adding
    # sample_ID below does not write into a slice of the merged frame.
    df = df[df['delta'] < df['DOS']].copy()
    df['sample_ID'] = df['mom_person_id'].astype(str) + '_' + df['child_person_id'].astype(str)
    return df
    

In [8]:
#pick a random date during pregnancy for the women so we can create an artificial sampling time
#we will use EHR data from beginning of pregnancy up until this sampling time for features
#and number of days from this sampling time to birth as the "time to onset of labor" pre-training problem
time_col_name = 'condition_start_DATETIME'
# NOTE(review): `df` is the same object as `conds` (no copy), so the 'delta' column
# added on the next line is also added to `conds`.
df = conds
# delta = whole days from child_birth_date to the condition start (negative = before birth)
df['delta'] = (df[time_col_name]-df['child_birth_date']).dt.days
# earliest and latest delta per (mother, child) pair
min_ool = df[['mom_person_id','child_person_id','delta']].groupby(['mom_person_id','child_person_id']).min()
min_ool.columns = ['min_delta']
max_ool = df[['mom_person_id','child_person_id','delta']].groupby(['mom_person_id','child_person_id']).max()
max_ool.columns = ['max_delta']
sampling_df = pd.concat([min_ool, max_ool],axis=1)

# Seed numpy's global RNG so the draws below are repeatable.
np.random.seed(3)
# keep pairs whose condition records span at least 7 days
sampling_df = sampling_df[((sampling_df['max_delta'] - sampling_df['min_delta']) >= 7) == True]
# NOTE(review): this DOS is overwritten a few lines below, so its values are never used.
# The np.random.rand call still advances the seeded RNG, so removing this line
# would change the DOS values drawn afterwards.
sampling_df['DOS'] = ((sampling_df['max_delta'] - sampling_df['min_delta'] - 7) * np.random.rand(sampling_df.shape[0]) + sampling_df['min_delta']).astype(int)
# require a record later than day -100 so the interval (-100, max_delta) used below is non-empty
sampling_df = sampling_df[sampling_df['max_delta'] > -100]
#sample from last 100 days of pregnancy to mirror design of omics study
# DOS is drawn uniformly between -100 and max_delta for each row; int() truncates toward zero
sampling_df['DOS'] = sampling_df.apply(lambda row: int(np.random.uniform(-100, row['max_delta'])), axis=1)
# turn the (mom_person_id, child_person_id) index back into ordinary columns
sampling_df = sampling_df.reset_index()

In [9]:
%%time
# Build one labelled event table per EHR domain; every call shares the same
# sampling frame and differs only in the source table and its column names.
(
    condition_features_EHR,
    procedure_features_EHR,
    drug_features_EHR,
    measurement_features_EHR,
    observation_features_EHR,
) = (
    generate_features_EHR_cohort(sampling_df, table, time_col, concept_col, tag)
    for table, time_col, concept_col, tag in (
        (conds, 'condition_start_DATETIME', 'condition_concept_id', 'C'),
        (procs, 'procedure_DATETIME', 'procedure_concept_id', 'P'),
        (drugs, 'drug_exposure_start_DATETIME', 'drug_concept_id', 'D'),
        (measurements, 'measurement_DATETIME', 'measurement_concept_id', 'M'),
        (obs, 'observation_DATETIME', 'observation_concept_id', 'O'),
    )
)


CPU times: user 30 ms, sys: 341 µs, total: 30.3 ms
Wall time: 28.5 ms


In [10]:
class EpochLogger(CallbackAny2Vec):
    """Training callback that prints progress and records the loss after every epoch.

    epoch: zero-based counter, incremented at the end of each epoch.
    losses: the values returned by model.get_latest_training_loss(), one per epoch.
    """

    def __init__(self):
        self.losses = []
        self.epoch = 0

    def on_epoch_begin(self, model):
        """Announce the epoch that is about to start."""
        print(f"Starting epoch #{self.epoch}")

    def on_epoch_end(self, model):
        """Store and print the latest training loss, then advance the counter."""
        print(f"Finished epoch #{self.epoch}")
        # NOTE(review): whether this value is per-epoch or a running total depends on
        # gensim's loss bookkeeping - confirm before comparing epochs.
        self.losses.append(model.get_latest_training_loss())
        print(self.losses)
        print(f'  Loss: {self.losses[-1]}')
        self.epoch += 1

In [11]:
%%time
#learn word2vec embeddings
model_path = "./models/word2vec_full_pregnancy_cohort_measurements_observations_full_pregnancy_sampling_400dim.model"
try:
    model = Word2Vec.load(model_path)
except FileNotFoundError:
    # Only a missing model file triggers retraining. Any other failure
    # (corrupt file, interrupt, ...) is raised instead of silently retraining.
    print('training new model!')
    epoch_logger = EpochLogger()

    # (feature table, concept-id column, timestamp column) for every EHR domain
    table_specs = [
        (condition_features_EHR, 'condition_concept_id', 'condition_start_DATETIME'),
        (procedure_features_EHR, 'procedure_concept_id', 'procedure_DATETIME'),
        (drug_features_EHR, 'drug_concept_id', 'drug_exposure_start_DATETIME'),
        (measurement_features_EHR, 'measurement_concept_id', 'measurement_DATETIME'),
        (observation_features_EHR, 'observation_concept_id', 'observation_DATETIME'),
    ]

    # Stack all domains under the common column names sample_ID / concept_id / ts.
    word2vec_data = pd.concat(
        [
            features[['sample_ID', concept_col, time_col]].rename(
                columns={concept_col: 'concept_id', time_col: 'ts'}
            )
            for features, concept_col, time_col in table_specs
        ],
        axis=0,
    )
    # Calendar date of each event; one "sentence" is built per sample per date.
    word2vec_data['date'] = pd.to_datetime(word2vec_data['ts']).dt.date
    word2vec_data = (
        word2vec_data
        .drop('ts', axis=1)
        .dropna(subset=['concept_id'])
        .astype({'concept_id': int})
    )

    # One sentence = all concept ids a sample has on one date, in random order.
    # NOTE(review): `random` is not seeded here, so the shuffle order differs between runs.
    sentences = []
    for _, group in tqdm(word2vec_data.groupby(['sample_ID', 'date'])):
        codes = group['concept_id'].tolist()
        random.shuffle(codes)
        sentences.append(codes)

    print('starting training')
    # NOTE(review): passing `sentences` to the constructor already trains the model
    # (gensim default number of epochs); the explicit train() call below then runs
    # 5 further epochs. Kept as-is so the resulting embeddings do not change.
    model = Word2Vec(sentences, vector_size=400, window=100, min_count=5, workers=64)
    # compute_loss=True makes gensim track the training loss; without it
    # get_latest_training_loss() in EpochLogger reports 0.0 every epoch.
    model.train(sentences, total_examples=len(sentences), epochs=5,
                compute_loss=True, callbacks=[epoch_logger])
    model.save(model_path)


CPU times: user 530 µs, sys: 3.41 ms, total: 3.94 ms
Wall time: 2.19 ms


In [12]:
# Lookup table: vocabulary entry (concept id) -> its learned Word2Vec vector.
code_to_embedding = dict(
    (code, model.wv[code])
    for code in model.wv.index_to_key
)

In [13]:
#replace EHR data with learned embeddings
# Keep rows with a real (non-null, non-zero) concept id. .copy() yields an independent
# frame, so adding the column below does not write into a slice of condition_features_EHR.
embedded_conds = condition_features_EHR.loc[
    lambda frame: frame['condition_concept_id'].notna() & (frame['condition_concept_id'] != 0)
].copy()
# dict.get returns None for codes that have no entry in code_to_embedding.
embedded_conds['embedding'] = [code_to_embedding.get(code) for code in embedded_conds['condition_concept_id']]


In [14]:
# Keep rows with a real (non-null, non-zero) concept id. .copy() yields an independent
# frame, so adding the column below does not write into a slice of procedure_features_EHR.
embedded_procs = procedure_features_EHR.loc[
    lambda frame: frame['procedure_concept_id'].notna() & (frame['procedure_concept_id'] != 0)
].copy()
# dict.get returns None for codes that have no entry in code_to_embedding.
embedded_procs['embedding'] = [code_to_embedding.get(code) for code in embedded_procs['procedure_concept_id']]


In [15]:
# Keep rows with a real (non-null, non-zero) concept id. .copy() yields an independent
# frame, so adding the column below does not write into a slice of drug_features_EHR.
embedded_drugs = drug_features_EHR.loc[
    lambda frame: frame['drug_concept_id'].notna() & (frame['drug_concept_id'] != 0)
].copy()
# dict.get returns None for codes that have no entry in code_to_embedding.
embedded_drugs['embedding'] = [code_to_embedding.get(code) for code in embedded_drugs['drug_concept_id']]


In [16]:
# Keep rows with a real (non-null, non-zero) concept id. .copy() yields an independent
# frame, so adding the column below does not write into a slice of measurement_features_EHR.
embedded_measurements = measurement_features_EHR.loc[
    lambda frame: frame['measurement_concept_id'].notna() & (frame['measurement_concept_id'] != 0)
].copy()
# dict.get returns None for codes that have no entry in code_to_embedding.
embedded_measurements['embedding'] = [code_to_embedding.get(code) for code in embedded_measurements['measurement_concept_id']]


In [17]:
# Keep rows with a real (non-null, non-zero) concept id. .copy() yields an independent
# frame, so adding the column below does not write into a slice of observation_features_EHR.
embedded_obs = observation_features_EHR.loc[
    lambda frame: frame['observation_concept_id'].notna() & (frame['observation_concept_id'] != 0)
].copy()
# dict.get returns None for codes that have no entry in code_to_embedding.
embedded_obs['embedding'] = [code_to_embedding.get(code) for code in embedded_obs['observation_concept_id']]


In [18]:
# Add a day-resolution 'date' column (time of day removed) to every embedded table.
for _frame, _time_col in (
    (embedded_procs, 'procedure_DATETIME'),
    (embedded_conds, 'condition_start_DATETIME'),
    (embedded_drugs, 'drug_exposure_start_DATETIME'),
    (embedded_measurements, 'measurement_DATETIME'),
    (embedded_obs, 'observation_DATETIME'),
):
    _frame['date'] = pd.to_datetime(_frame[_time_col].dt.date)


In [19]:
%%time
# Stack the embedded rows of every EHR table into one long frame that holds only the
# three columns used downstream, skipping rows whose code has no learned embedding.
# The inputs are already in-memory pandas frames, so they are concatenated directly
# instead of being partitioned into dask frames and computed straight back.
# Selecting the needed columns also avoids per-table lists of columns to drop.
all_data = pd.concat(
    [
        table.loc[table['embedding'].notnull(), ['sample_ID', 'date', 'embedding']]
        for table in (embedded_conds, embedded_procs, embedded_drugs,
                      embedded_measurements, embedded_obs)
    ],
    ignore_index=True,
)

# Maximum number of distinct dates kept per sample.
max_dates = 32

# Within each sample, most recent date first.
all_data = all_data.sort_values(by=['sample_ID', 'date'], ascending=[True, False])

# Dense rank of the dates inside each sample_ID: 1 = most recent date, 2 = the
# next most recent, ...; rows that share a date share a rank.
all_data['date_rank'] = all_data.groupby('sample_ID')['date'].rank(method='dense', ascending=False)

# Keep only the max_dates most recent dates of every sample.
filtered_data = all_data[all_data['date_rank'].between(1, max_dates)].drop(columns='date_rank')


CPU times: user 1.65 s, sys: 221 ms, total: 1.88 s
Wall time: 1.68 s


In [20]:
%%time
#create patient-day embeddings
# Collapse filtered_data to one row per (sample_ID, date) by taking the group mean
# of the remaining column(s); the result is indexed by (sample_ID, date).
# NOTE(review): 'embedding' presumably holds one vector per row, so this relies on
# pandas averaging an object column element-wise - confirm on the pandas version in use.
patient_day_embeddings = filtered_data.groupby(['sample_ID','date']).mean()

CPU times: user 156 ms, sys: 95 µs, total: 156 ms
Wall time: 154 ms


In [21]:
# Move the (sample_ID, date) group keys from the index back into ordinary columns.
patient_day_embeddings = patient_day_embeddings.reset_index(drop=False)

In [22]:
# Ensure 'date' is a datetime column, then order rows by sample and ascending date.
patient_day_embeddings = (
    patient_day_embeddings
    .assign(date=lambda frame: pd.to_datetime(frame['date']))
    .sort_values(by=['sample_ID', 'date'])
)

In [23]:
# Sizes of two axes of the data matrix: number of distinct samples, and the
# length of one embedding (read from the first row).
unique_patients = patient_day_embeddings.loc[:, 'sample_ID'].nunique()
num_features = len(patient_day_embeddings['embedding'].iat[0])


In [24]:
# Largest number of dates any single sample has (one row per sample/date here).
observed_max_dates = patient_day_embeddings.groupby('sample_ID')['date'].count().max()
# Fixed length of the date axis of the data matrix.
max_dates = 32
# Every sample must fit into the date axis; otherwise filling the matrix would
# index past its end.
assert observed_max_dates <= max_dates, (
    f'a sample has {observed_max_dates} dates but the matrix only holds {max_dates}'
)

In [25]:
# Give every sample_ID a position along the patient axis of the data matrix,
# numbered in order of first appearance.
patient_id_to_index = dict(
    (patient_id, position)
    for position, patient_id in enumerate(patient_day_embeddings['sample_ID'].unique())
)


In [26]:
# Allocate the data matrix as (features, dates, patients), pre-filled with NaN so
# that slots which are never written stay NaN.
RNN_data = np.full(
    shape=(num_features, max_dates, unique_patients),
    fill_value=np.nan,
)


In [27]:
%%time
# Fill the data matrix: the k-th row seen for a sample goes to date slot k.
date_position = {}  # sample_ID -> last date slot written for that sample
for _, day_row in tqdm(patient_day_embeddings.iterrows()):
    sample_id = day_row['sample_ID']
    # First row of a sample lands in slot 0, each later row in the next slot.
    slot = date_position.get(sample_id, -1) + 1
    date_position[sample_id] = slot
    RNN_data[:, slot, patient_id_to_index[sample_id]] = day_row['embedding']


0it [00:00, ?it/s]

CPU times: user 103 ms, sys: 17 ms, total: 120 ms
Wall time: 103 ms


In [28]:
# Reverse the axis order: (features, dates, patients) -> (patients, dates, features).
# NOTE(review): running this cell a second time reverses the axes back again.
RNN_data = np.transpose(RNN_data, axes=(2, 1, 0))

In [29]:
# Shape check after the transpose: (patients, dates, features).
np.shape(RNN_data)

(113, 32, 400)

In [30]:
# Same "<mom_person_id>_<child_person_id>" key that the feature tables use.
sampling_df['sample_ID'] = sampling_df['mom_person_id'].astype(str).str.cat(
    sampling_df['child_person_id'].astype(str), sep='_'
)

In [31]:
# Attach each sample's matrix position. The right-hand frame has column 0 = sample_ID
# and column 1 = index; the right join keeps exactly the samples present in the matrix.
sampling_df = sampling_df.merge(
    pd.DataFrame([patient_id_to_index.keys(), patient_id_to_index.values()]).transpose(),
    how='right',
    left_on='sample_ID',
    right_on=0,
)


In [32]:
# Align outcome data with the feature matrix: add a 'date' column that holds the
# number of patient-day rows each sample has.
sampling_df = sampling_df.merge(
    patient_day_embeddings.groupby('sample_ID')[['date']].count(),
    how='left',
    on='sample_ID',
)

In [33]:
# Outcome (DOS) per sample, ordered by the matrix-index column (1).
DOS_outcomes = np.array(sampling_df.sort_values(by=1)['DOS'])

In [34]:
# Save the processed feature matrix and the matching outcome vector.
for _path, _array in (
    ('./data/processed_data/RNN_data_full_EHR_cohort_with_obs_fixed.npy', RNN_data),
    ('./data/processed_data/RNN_data_outcomes_full_EHR_cohort_with_obs_fixed.npy', DOS_outcomes),
):
    np.save(_path, _array)


In [35]:
# Number of filled date slots per sample, capped at the date axis of the matrix.
# Rows are ordered by the matrix-index column (1), exactly as DOS_outcomes is, so
# lengths, outcomes and matrix rows stay aligned even if sampling_df's row order changes.
num_patient_visits = np.minimum(np.array(sampling_df.sort_values(1)['date']), max_dates)

In [36]:
# Save the per-sample sequence lengths next to the feature matrix.
np.save(
    file='./data/processed_data/RNN_data_lengths_full_EHR_cohort_with_obs_fixed.npy',
    arr=num_patient_visits,
)

In [37]:
# Save the sample_ID -> matrix-index mapping (column 0 = sample_ID, column 1 = index).
df = pd.DataFrame([patient_id_to_index.keys(), patient_id_to_index.values()]).transpose()
df.to_csv(path_or_buf='./data/processed_data/sampleID_indices_full_cohort_with_obs_fixed.csv')