In [2]:
import numpy as np
import pandas as pd
import os
from matplotlib import pyplot as plt
%matplotlib inline

In [3]:
# Long-format clinical table; later cells select rows from it by projid and fu_year.
clin_df = pd.read_csv("../DATA/Raw/dataset_495_long.csv")

# Restrict the static table to the variables that are known pre-mortem.
static_df = pd.read_csv("../DATA/Raw/ROSMAP_clinical.csv").loc[
    :, ["projid", "msex", "educ", "apoe_genotype", "race", "spanish"]
]

# Distinct participant ids, in order of first appearance in clin_df.
unique_projids = pd.unique(clin_df["projid"])
print("# of unique patients: ", len(unique_projids))

# of unique patients:  3194


In [4]:
# Demographic columns from the "basic" dataset file
# (presumably one row per projid — TODO confirm against the raw file).
demographics_df = pd.read_csv("../DATA/Raw/dataset_495_basic.csv").loc[
    :, ["projid", "msex", "educ", "apoe_genotype", "race", "spanish"]
]

In [5]:
# Attach the demographic columns to the clinical rows; the left join keeps every clin_df row.
# NOTE(review): no `on=` is given, so pandas joins on every column name the two
# frames share — presumably just "projid"; confirm against the raw files.
clin_df = pd.merge(clin_df, demographics_df, how="left")

### Set requirements:
- the minimum number of observations (visits) needed to keep a person in our dataset
- for now, require that individuals start out with no dementia diagnosis (DX)
- another possibility: require that people start off with no dementia AND no mild cognitive impairment (MCI)

In [6]:
# Number of years after the current visit that are searched for a dementia diagnosis.
predict_ahead = 3
# Number of earlier yearly visits required in addition to the current one.
require_previous = 2
# Tag embedded in the output file names written below; it encodes both settings.
file_suffix = "%iyrprev_within%i" % (require_previous, predict_ahead)

In [7]:
# Visits needed for a positive example:
# the required previous years + the current year + one later year (in which dementia is diagnosed).
min_obvs_pos = require_previous + 2
# Visits needed for a negative example:
# the required previous years + the current year + every year of the look-ahead
# window (so we can be sure dementia does not appear in it).
min_obvs_neg = require_previous + predict_ahead + 1

# Held-out test fraction and number of cross-validation folds.
test_frac = 0.2
cv_num = 5

In [8]:
# Split the dcfdx column by participant ONCE, instead of re-filtering the whole
# frame for every projid. Row order inside each group is the row order of clin_df.
dxs_by_pid = {pid: grp.values for pid, grp in clin_df.groupby("projid", sort=False)["dcfdx"]}

# Number of rows (visits) per participant, aligned with unique_projids.
num_visits_list = np.array([len(dxs_by_pid.get(pid, ())) for pid in unique_projids])
# Keep only participants with enough visits to possibly yield a positive example.
projids = unique_projids[np.where(num_visits_list>=min_obvs_pos)[0]]


# The lists below are only used for the summary printed at the end of this cell;
# `projids` is what the labeling cell iterates over.
no_eventual_dxs = []
eventual_dxs = []
excluded_already_dementia = []
excluded_no_dx = []
for projid in projids:
    dxs = dxs_by_pid[projid]
    known_dxs = dxs[~np.isnan(dxs)]

    # No diagnosis recorded on any visit: nothing to classify (np.max on an
    # empty array would raise), so track the participant separately.
    if known_dxs.size == 0:
        excluded_no_dx.append(projid)
        continue

    # exclude people whose first (require_previous + 1) recorded diagnoses
    # already include dementia (dcfdx > 3)
    if np.max(known_dxs[:require_previous+1]) > 3:
        excluded_already_dementia.append(projid)
    elif known_dxs.max() > 3:
        eventual_dxs.append(projid)
    elif len(dxs) >= min_obvs_neg:
        # never diagnosed with dementia AND enough visits to know definitively
        # that they won't be in the next "predict_ahead" years
        no_eventual_dxs.append(projid)
    # otherwise: dementia-free but too few visits -> counted in none of the lists


print("%i individuals never receive a dementia diagnosis."%len(no_eventual_dxs))
print("%i individuals are given a dementia diagnosis during their time in the study."%len(eventual_dxs))
print("%i are excluded due to already having dementia"%len(excluded_already_dementia))
if excluded_no_dx:
    print("%i are skipped because no diagnosis was ever recorded"%len(excluded_no_dx))


1174 individuals never receive a dementia diagnosis.
557 individuals are given a dementia diagnosis during their time in the study.
245 are excluded due to already having dementia


# Computing onset labels

In [9]:
def _onset_labels_for_visits(cur_fu_yrs, dxes, require_previous, predict_ahead):
    """Compute the onset label of every visit of a single participant.

    Parameters
    ----------
    cur_fu_yrs : 1-D array
        The participant's fu_year values, one per row.
    dxes : 1-D array
        dcfdx values aligned with ``cur_fu_yrs``; NaN means no diagnosis was
        recorded. A value > 3 is treated as a dementia diagnosis.
    require_previous : int
        Number of earlier yearly visits that must be present, each with a
        diagnosis, in addition to the current visit.
    predict_ahead : int
        Number of years after the current visit that are searched for a
        dementia diagnosis.

    Returns
    -------
    (times, binaries) : tuple of two lists, one entry per visit
        ``times[i]`` is the number of years from visit i to the first (in row
        order) dementia diagnosis inside the look-ahead window, 0 if all
        ``predict_ahead`` future years are observed and dementia-free, and NaN
        if the visit cannot be labeled. ``binaries[i]`` is 1, 0 or NaN
        accordingly.

    Notes
    -----
    The "already had dementia" check slices ``dxes`` by position, so it assumes
    the rows are in ascending fu_year order — TODO confirm for the raw data.
    """
    onset_label_times = []
    onset_label_binaries = []

    # look through all timepoints we have a row for
    for t in cur_fu_yrs:

        #####################################################################
        ##### look at prior observations ####################################
        #####################################################################

        # whether the input window is valid (defaults to invalid)
        valid_input = False

        # row indices of the current year and the require_previous years before it
        input_years = [t - x for x in range(require_previous + 1)]
        input_idxs = np.where(np.isin(cur_fu_yrs, input_years))[0]

        if len(input_idxs) == 0:
            # t itself could not be matched (e.g. fu_year is NaN): no usable
            # history, and np.max below would raise on an empty array
            pass
        # check if they've been labeled with dementia up to and including the current observation
        elif np.nanmax(dxes[:np.max(input_idxs) + 1]) > 3:
            # if yes, then not a valid observation, since they've already had dementia
            pass
        # every required year must be present AND carry a diagnosis;
        # fewer matching rows means we don't have enough observations
        elif len(input_idxs) == require_previous + 1:
            valid_input = not np.isnan(dxes[input_idxs]).any()

        #####################################################################
        ##### look at future observations ###################################
        #####################################################################

        # future label (defaults to "unknown")
        future_label = np.nan

        # years we want to see observations for
        future_years = [t + x for x in range(1, predict_ahead + 1)]
        # rows holding those yearly observations
        future_idxs = np.where(np.isin(cur_fu_yrs, future_years))[0]
        # ... restricted to rows whose cognitive diagnosis is not NaN
        known_future_idxs = future_idxs[~np.isnan(dxes[future_idxs])]

        if len(known_future_idxs) > 0:
            dementia_idxs = known_future_idxs[dxes[known_future_idxs] > 3]

            if len(dementia_idxs) > 0:
                # years until the FIRST dementia diagnosis inside the window
                future_label = cur_fu_yrs[dementia_idxs[0]] - t

            # with no dementia diagnosis in the observed future years, we need to
            # have seen "no dementia" for EVERY future year (no missing values)
            elif len(known_future_idxs) == predict_ahead:
                future_label = 0

        #####################################################################
        ##### Combine previous and future labels ############################
        #####################################################################

        # an onset label (positive or negative) requires a valid set of prior
        # observations AND a known future label
        if valid_input and not np.isnan(future_label):
            onset_label_time = future_label
            onset_label_binary = int(future_label > 0)
        else:
            onset_label_time = np.nan
            onset_label_binary = np.nan

        onset_label_times.append(onset_label_time)
        onset_label_binaries.append(onset_label_binary)

    return onset_label_times, onset_label_binaries


# Per-participant results, keyed by projid; each value has one entry per row of
# that participant in clin_df.
fu_years_new = {}
labels_time_to_dx = {}
labels_binary = {}
dcfdxs = {}

for pid_i, projid in enumerate(projids):
    # coarse progress indicator
    if pid_i % 500 == 0:
        print(pid_i)

    df = clin_df[clin_df["projid"]==projid]
    cur_fu_yrs = df["fu_year"].values
    dxes = df["dcfdx"].values

    onset_label_times, onset_label_binaries = _onset_labels_for_visits(
        cur_fu_yrs, dxes, require_previous, predict_ahead
    )

    fu_years_new[projid] = cur_fu_yrs
    labels_time_to_dx[projid] = onset_label_times
    labels_binary[projid] = onset_label_binaries
    dcfdxs[projid] = dxes

0
500
1000
1500
2000


In [10]:
# Flatten the per-participant dictionaries into one record per (projid, fu_year) row.
pid_y_label_list_new = [
    [projid, fu_year, label_time, label_binary]
    for projid in projids
    for fu_year, label_time, label_binary in zip(
        fu_years_new[projid], labels_time_to_dx[projid], labels_binary[projid]
    )
]
label_columns = ['projid', 'fu_year', 'onset_label_time', 'onset_label_time_binary']
valid_observations_new = pd.DataFrame(pid_y_label_list_new, columns=label_columns)

In [11]:
# Every clinical row, with the label columns attached where a label row exists
# for the same (projid, fu_year); unmatched rows get NaN labels.
all_obvs_new = pd.merge(clin_df, valid_observations_new, how="left", on=["projid", "fu_year"])
# Only the rows that ended up with a usable (non-NaN) binary onset label.
valid_obvs_new = all_obvs_new.loc[~np.isnan(all_obvs_new["onset_label_time_binary"])]

In [13]:
# Output directory for the processed tables (created on first use).
savedir = "../DATA/PROCESSED/"
os.makedirs(savedir, exist_ok=True)

# Rows with a valid label, together with the current year's features.
kept_path = os.path.join(savedir, "merged_kept_data_%s.csv"%file_suffix)
valid_obvs_new.to_csv(kept_path)

# Every clinical row with its label columns (NaN where there is no valid label);
# we'll need features from past years for the samples we end up using.
all_path = os.path.join(savedir, "merged_data_all_%s.csv"%file_suffix)
all_obvs_new.to_csv(all_path)

# Train & test splits
In order to avoid contamination, we pseudorandomly split samples into train/test sets:
- all samples from a given individual must be in the same split
- we separately split samples for people with (a) no eventual AD diagnosis, and (b) an eventual AD diagnosis, so that the fractions of each type of individual are balanced across CV splits

In [14]:
# Where the lists of projids for each split are written.
# (Rebinds `savedir`, which earlier pointed at the processed-data directory.)
savedir = "../DATA/PROCESSED/split_projids"
# Number of cross-validation folds and fraction of individuals held out for testing.
cv_num = 5
test_frac = 0.2


In [15]:
# Seeded, dedicated generator: without a fixed seed every re-run of this cell
# overwrites the saved train/test files with a different partition. A local
# RandomState is used so the global NumPy random state is left untouched.
split_seed = 0
split_rng = np.random.RandomState(split_seed)

# identify all individuals with any valid observations --> get list of projids associated with people
# who eventually get dementia (or dont)
valid_pids = valid_obvs_new["projid"].unique()
# Per individual: 1 if any of their valid samples has a positive onset label, else 0.
# NOTE: this rebinds `eventual_dxs`, which the diagnosis-count cell used for a list of projids.
eventual_dxs = [np.nanmax(valid_obvs_new[valid_obvs_new["projid"]==pid]["onset_label_time_binary"]) for pid in valid_pids]
no_eventual_dems = valid_pids[np.where(np.array(eventual_dxs)==0)]
eventual_dems = valid_pids[np.where(np.array(eventual_dxs)==1)]


## GENERATE SPLITS OF THE DATA: select test_frac individuals from the no dementia and dementia groups
# (despite the "_idx" suffix, these arrays hold projids, not positions)
test_dem_idx = split_rng.choice(eventual_dems, size=int(len(eventual_dems)*test_frac), replace=False)
test_normal_idx = split_rng.choice(no_eventual_dems, size=int(len(no_eventual_dems)*test_frac), replace=False)

# everyone not drawn for the test set goes to training
train_dem_idx = np.setdiff1d(eventual_dems, test_dem_idx)
train_normal_idx = np.setdiff1d(no_eventual_dems, test_normal_idx)


# save full training and test set (one projid per line)
os.makedirs(savedir, exist_ok=True)
np.savetxt(os.path.join(savedir, "train_%s.txt"%file_suffix), np.sort(np.union1d(train_dem_idx, train_normal_idx)), fmt="%s")
np.savetxt(os.path.join(savedir, "test_%s.txt"%file_suffix), np.sort(np.union1d(test_dem_idx, test_normal_idx)), fmt="%s")


In [16]:
## For CV splits, shuffle the training sets and then divide them into cv_num groups
# Seeded, dedicated generator so the saved folds are identical on every re-run;
# a local RandomState leaves the global NumPy random state untouched.
cv_seed = 1
cv_rng = np.random.RandomState(cv_seed)
rand_train_dem = cv_rng.permutation(train_dem_idx)
rand_train_normal = cv_rng.permutation(train_normal_idx)

# Never populated in this cell; kept in case a later cell references the name.
CVsplits = []
for i in range(cv_num):
    # Fold i takes every cv_num-th shuffled individual starting at position i.
    # Unlike fixed-size chunks of int(len / cv_num), this puts EVERY training
    # individual into exactly one validation fold (no remainder is dropped) and
    # fold sizes differ by at most one.
    cur_val_dem = rand_train_dem[i::cv_num]
    cur_val_normal = rand_train_normal[i::cv_num]

    fold_dir = os.path.join(savedir, "CV_splits", "%i"%i)
    os.makedirs(fold_dir, exist_ok=True)

    # validation = this fold's individuals; train = all remaining training individuals
    validation_to_save = np.sort(np.union1d(cur_val_dem, cur_val_normal))
    train_to_save = np.sort(np.union1d(np.setdiff1d(train_dem_idx, cur_val_dem), np.setdiff1d(train_normal_idx, cur_val_normal)))

    np.savetxt(os.path.join(fold_dir, "valid_%s.txt"%file_suffix), validation_to_save, fmt="%s")
    np.savetxt(os.path.join(fold_dir, "train_%s.txt"%file_suffix), train_to_save, fmt="%s")