### Note:
We explored several downsampling techniques due to our imbalanced labels. We ultimately found that performance (AUROC, AUPRC, and accuracy) was not improved with downsampling and left it out of our final pipeline.

In [1]:
import numpy as np
import pandas as pd
from scipy.spatial.distance import cdist as cdist

## Reading in Data

In [2]:
# Suffix interpolated into the processed-CSV file names read below.
file_suffix = "2yrprev_within3"

# Build both input paths first so the reads below stay short.
kept_csv_path = "../DATA/PROCESSED/standardized/merged_kept_data_%s.csv" % file_suffix
all_csv_path = "../DATA/PROCESSED/standardized/merged_data_all_%s.csv" % file_suffix

# DATA: the "kept" table; the first CSV column is used as the index.
DATA = pd.read_csv(kept_csv_path, index_col=0)

# DATA_all: the "all" table, loaded the same way, with exact duplicate rows
# removed. The index is NOT reset afterwards, so its labels may have gaps.
DATA_all = pd.read_csv(all_csv_path, index_col=0)
DATA_all = DATA_all.drop_duplicates()

In [4]:
# import prior years data
# For every row of DATA, look up the same participant (`projid`) in DATA_all at
# `fu_year - i` for i = 1..years and copy the values of `import_cols` into new
# columns named "<col><i>" (e.g. 'cogn_global1', 'cogn_global2').
import_cols = ['dcfdx__1.0','dcfdx__2.0', 'dcfdx__3.0','cogn_ep','cogn_po','cogn_ps','cogn_se','cogn_wo','cogn_global',
               'cts_animals','cts_bname','cts_catflu','cts_db','cts_delay','cts_df','cts_doperf','cts_ebdr',
               'cts_ebmt','cts_fruits','cts_idea','cts_lopair','cts_mmse30','cts_nccrtd','cts_pmat','cts_pmsub',
               'cts_read_nart','cts_sdmt','cts_story','cts_stroop_cname','cts_stroop_wread','cts_wli','cts_wlii',
               'cts_wliii','age_at_visit','hypertension_cum','cancer_cum','diabetes_sr_rx','dm_cum','headinjrloc_cum',
               'med_con_sum_cum','thyroid_cum','claudication_cum','heart_cum','stroke_cum',
               'vasc_3dis_sum','vasc_risks_sum']

years = 2 # how many years in the past to collect data from

# Create the lagged columns up front, filled with NaN, so that rows with no
# matching prior visit simply keep NaN.
for i in range(1,years+1):
    for col in import_cols:
        DATA[col+str(i)] = np.nan

# Iterate over index labels: `.loc` / `.at` are label-based, so this does not
# depend on the index being exactly 0..n-1 (labels are assumed to be unique).
for row in DATA.index:
    row_projid = DATA.loc[row,'projid']
    row_fu_year = DATA.loc[row,'fu_year']
    for i in range(1,years+1):
        curr_row = DATA_all.loc[((DATA_all['projid'] == row_projid) & (DATA_all['fu_year'] == row_fu_year-i))]
        if curr_row.empty:
            # No visit recorded i years earlier: leave the NaN placeholders.
            continue
        if len(curr_row)>1:
            print('more than one entry')
            print(curr_row.iloc[0])
        # Take the first match *positionally within the matches*. The previous
        # `DATA_all.iloc[curr_row.index[0]]` fed an index label to a positional
        # indexer, which can pick the wrong row once `drop_duplicates()` has
        # left gaps in DATA_all's index.
        prior_visit = curr_row.iloc[0]
        for col in import_cols:
            # Assign a scalar (not a one-element Series) into the single cell.
            DATA.at[row,(col+str(i))] = prior_visit[col]

## Balancing Methods

In [19]:
# random downsampling
# For each training set, keep every positive row (onset_label_time_binary == 1)
# plus an equally sized random sample of negative rows (== 0), then save the
# identifying columns of the balanced set next to the split's projid file.
# NOTE: `sample` is called without `random_state`, so the selected negatives
# differ from run to run.

# Cross-validation splits 0..4.
for split in range(5):
    for group in ['train']:
        # Rows whose projid is listed in this split's projid file.
        data_train = DATA[DATA["projid"].isin(np.loadtxt("../DATA/PROCESSED/split_projids/CV_splits/%i/%s_%s.txt"%(split,group,file_suffix)))]
        pos_label_train = data_train[data_train["onset_label_time_binary"]==1]
        neg_label_train = data_train[data_train["onset_label_time_binary"]==0].sample(n=pos_label_train.shape[0])
        # `DataFrame.append` was removed in pandas 2.0; `pd.concat` gives the
        # same result (negatives first, then positives, original index kept).
        train_randdownsample = pd.concat([neg_label_train, pos_label_train])
        save_randdownsample = train_randdownsample[["projid","fu_year","onset_label_time","onset_label_time_binary"]]
        save_randdownsample.to_csv("../DATA/PROCESSED/split_projids/CV_splits/%i/randdownsample_%s_%s.txt"%(split,group,file_suffix))

# Full (non-CV) split. Despite the `*_test` variable names, `group` is 'train'
# here, so this balances the full training set.
for group in ['train']:
    data_test = DATA[DATA["projid"].isin(np.loadtxt("../DATA/PROCESSED/split_projids/%s_%s.txt"%(group,file_suffix)))]
    pos_label_test = data_test[data_test["onset_label_time_binary"]==1]
    neg_label_test = data_test[data_test["onset_label_time_binary"]==0].sample(n=pos_label_test.shape[0])
    test_randdownsample = pd.concat([neg_label_test, pos_label_test])
    save_randdownsample = test_randdownsample[["projid","fu_year","onset_label_time","onset_label_time_binary"]]
    save_randdownsample.to_csv("../DATA/PROCESSED/split_projids/randdownsample_%s_%s.txt"%(group,file_suffix))

In [21]:
# matched pairs downsampling
def _matched_pairs_downsample(data, label_col="onset_label_time_binary",
                              match_cols=('age_at_visit','educ')):
    """Pair every positive row with its nearest still-unused negative row.

    Positive rows (``label_col == 1``) are processed in index order. For each
    one, the candidate controls are the unused rows with ``label_col == 0``
    that have the same ``msex`` and a different ``projid``. The candidate with
    the smallest Euclidean distance over ``match_cols`` is selected and then
    removed from the pool, so a control is never matched twice.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain ``label_col``, ``msex``, ``projid`` and ``match_cols``.
        Index labels are assumed to be unique.
    label_col : str
        Name of the binary label column.
    match_cols : sequence of str
        Numeric columns used for the distance computation.

    Returns
    -------
    pandas.DataFrame
        All positive rows followed by the selected controls in selection
        order. Every row keeps its original index label and dtypes.
    """
    match_cols = list(match_cols)
    cases = data[data[label_col] == 1]
    pool = data[data[label_col] == 0]
    chosen = []
    for row in cases.index:
        candidates = pool[(pool["msex"] == data.loc[row, "msex"])
                          & (pool["projid"] != data.loc[row, "projid"])]
        if candidates.empty:
            # np.argmin would raise on an empty distance matrix; report the
            # unmatched case and move on instead of aborting the whole run.
            print('no matched control available for row %s' % row)
            continue
        dist = cdist(data.loc[[row], match_cols], candidates[match_cols], metric='euclidean')
        # `dist` has shape (1, n_candidates), so the flat argmin is the
        # position of the closest candidate.
        best = candidates.index[np.argmin(dist)]
        chosen.append(best)
        # Non-inplace drop: never mutates a slice of the caller's frame, and
        # the pool keeps its original labels (no reset_index), so `best`
        # always refers to a row of `data`.
        pool = pool.drop(best)
    # Single concat instead of a per-row `DataFrame.append`, which was removed
    # in pandas 2.0 and copied the accumulated frame on every iteration.
    return pd.concat([cases, data.loc[chosen]])


# Cross-validation splits 0..4.
for split in range(5):
    for group in ['train']:
        # Rows whose projid is listed in this split's projid file.
        data_train = DATA[DATA["projid"].isin(np.loadtxt("../DATA/PROCESSED/split_projids/CV_splits/%i/%s_%s.txt"%(split,group,file_suffix)))]
        train_matcheddownsample = _matched_pairs_downsample(data_train)
        save_matcheddownsample = train_matcheddownsample[["projid","fu_year","onset_label_time","onset_label_time_binary"]]
        save_matcheddownsample.to_csv("../DATA/PROCESSED/split_projids/CV_splits/%i/matcheddownsample_%s_%s.txt"%(split,group,file_suffix))

# Full (non-CV) training set.
for group in ['train']:
    data_train = DATA[DATA["projid"].isin(np.loadtxt("../DATA/PROCESSED/split_projids/%s_%s.txt"%(group,file_suffix)))]
    train_matcheddownsample = _matched_pairs_downsample(data_train)
    save_matcheddownsample = train_matcheddownsample[["projid","fu_year","onset_label_time","onset_label_time_binary"]]
    save_matcheddownsample.to_csv("../DATA/PROCESSED/split_projids/matcheddownsample_%s_%s.txt"%(group,file_suffix))