In [1]:
import numpy as np
import pandas as pd
import os
from matplotlib import pyplot as plt
import copy




%matplotlib inline

# Load Datasets

In [2]:
# --- Paths -------------------------------------------------------------------
orig_folder = "../DATA/PROCESSED/"
new_folder = "../DATA/PROCESSED/standardized/"

# Create the output directory if it is not there yet.
os.makedirs(new_folder, exist_ok=True)

# Parameters that select which processed files are read (they appear in the file names).
y = 2
within = 3

# Current-year features: all standardization statistics are derived from this table.
cur_year_fname = "merged_kept_data_%iyrprev_within%i.csv" % (y, within)
CURRENT_YEAR = pd.read_csv(orig_folder + cur_year_fname)

# All-years features: used when training on multiple years of data.
all_years_fname = "merged_data_all_%iyrprev_within%i.csv" % (y, within)
ALL_YEARS = pd.read_csv(orig_folder + all_years_fname)

# Non-temporal demographic columns, re-read from the raw file.
demographic_cols = ["educ", "msex", "apoe_genotype", "race", "spanish"]
DEMOGRAPHICS = pd.read_csv("../DATA/raw/dataset_495_basic.csv")[["projid"] + demographic_cols].drop_duplicates()
# Recode `spanish` as (2 - value); presumably the raw coding is 1/2 and this
# maps it to 1/0 -- TODO confirm against the data dictionary.
DEMOGRAPHICS["spanish"] = 2 - DEMOGRAPHICS["spanish"]

# Replace the demographic columns already present in both tables with the
# versions from DEMOGRAPHICS, matched on projid.
ALL_YEARS = ALL_YEARS.drop(columns=demographic_cols).merge(DEMOGRAPHICS, how="left", on="projid")
CURRENT_YEAR = CURRENT_YEAR.drop(columns=demographic_cols).merge(DEMOGRAPHICS, how="left", on="projid")

In [3]:
# Sanity check: (rows, columns) of both tables after the demographic merge.
input_shapes = (ALL_YEARS.shape, CURRENT_YEAR.shape)
print(*input_shapes)

(24565, 60) (9103, 60)


In [5]:
# List every non-object column of CURRENT_YEAR whose fraction of NaN values
# exceeds 10%.
print("VARIABLES WITH MISSING VALUES: FRACTION MISSING")
for df in [CURRENT_YEAR]:
    for col in df.columns:
        column_values = df[col].values
        # object-dtype columns cannot be passed to np.isnan; skip them
        if column_values.dtype == 'O':
            continue
        frac_missing = np.mean(np.isnan(column_values))
        if frac_missing > .1:
            print("%s: %.2f"%(col, frac_missing))


VARIABLES WITH MISSING VALUES: FRACTION MISSING
r_cistrk: 0.18
r_stroke: 0.22
cts_stroop_cname: 0.65
cts_stroop_wread: 0.65
lostcons: 0.96
chf_cum: 0.70


# Standardize

In [6]:
# Labels and identifiers: copied to the output tables as-is, never standardized.
carry_over_vars = [
    "projid",
    "study",
    "fu_year",
    "scaled_to",
    "onset_label_time",
    "onset_label_time_binary",
]

# Variables left out of the standardized output altogether.
excluded_variables = [
    "r_cistrk",
    "r_stroke",
    "lostcons",
    "chf_cum",
]

In [7]:
# Output tables start out as independent copies of the carry-over columns;
# standardized features are appended to them in the cells below.
new_ALL_YEARS = ALL_YEARS[carry_over_vars].copy()
new_CURRENT_YEAR = CURRENT_YEAR[carry_over_vars].copy()

In [8]:
# --- Continuous variables: these get z-scored --------------------------------
cognitive_features = [
    'cts_animals', 'cts_bname', 'cts_catflu', 'cts_db',
    'cts_delay', 'cts_df', 'cts_doperf', 'cts_ebdr',
    'cts_ebmt', 'cts_fruits', 'cts_idea', 'cts_lopair',
    'cts_mmse30', 'cts_nccrtd', 'cts_pmat', 'cts_pmsub',
    'cts_read_nart', 'cts_sdmt', 'cts_story', 'cts_stroop_cname',
    'cts_stroop_wread', 'cts_wli', 'cts_wlii', 'cts_wliii',
]
medical_features_sums = ['med_con_sum_cum', 'vasc_3dis_sum', 'vasc_risks_sum']
continuous_demographics = ['age_at_visit', 'educ']

# --- Composite variables: mean of the z-scored cognitive tests ("cts_") ------
composite_vars = {
    "cogn_ep": ["cts_wli", "cts_wlii", "cts_wliii", "cts_ebmt", "cts_ebdr", "cts_story", "cts_delay"],
    "cogn_po": ["cts_lopair", "cts_pmat"],
    "cogn_ps": ["cts_sdmt", "cts_nccrtd", "cts_stroop_cname", "cts_stroop_wread"],
    "cogn_se": ["cts_bname", "cts_catflu", "cts_read_nart"],
    "cogn_wo": ["cts_db", "cts_df", "cts_doperf"],
}
# The global composite is every test of the five domains above, in domain order.
composite_vars["cogn_global"] = [
    test
    for domain in ("cogn_ep", "cogn_po", "cogn_ps", "cogn_se", "cogn_wo")
    for test in composite_vars[domain]
]

# --- Binary variables: carried over unchanged --------------------------------
binary = [
    'hypertension_cum', 'cancer_cum', 'diabetes_sr_rx', 'dm_cum',
    'headinjrloc_cum', 'lostcons', 'thyroid_cum', 'chf_cum',
    'claudication_cum', 'heart_cum', 'stroke_cum', "msex", "spanish",
]

# --- Categorical variables: one-hot encoded ----------------------------------
categorical = ['apoe_genotype', 'race', 'dcfdx']

In [9]:
# GET DUMMIES FOR CATEGORICAL VARS
# Both tables are stacked (ALL_YEARS rows first) so that one get_dummies call
# yields the same set of indicator columns for each of them.
temp = pd.concat((ALL_YEARS, CURRENT_YEAR))
n_all_years = len(ALL_YEARS)

for feat in categorical:
    # Float indicators: NaN is written into these columns below, which integer
    # or boolean columns cannot hold without being upcast.
    dummies = pd.get_dummies(temp[feat], dtype=float)
    prefix = feat + "__"

    for col in dummies.columns:
        all_years_part = dummies.iloc[:n_all_years][col]
        current_year_part = dummies.iloc[n_all_years:][col]
        # if the column doesn't actually have any 1s for one of the datasets, don't add it
        if np.nansum(current_year_part) > 0 and np.nansum(all_years_part) > 0:
            new_ALL_YEARS[prefix + str(col)] = all_years_part
            new_CURRENT_YEAR[prefix + str(col)] = current_year_part

    # get_dummies encodes a missing category as all zeros; restore NaN in this
    # feature's indicator columns for rows whose original value is missing.
    # `.loc` is the accessor for boolean-mask assignment (`.at` is scalar-only),
    # and matching on `prefix` avoids touching unrelated columns whose name
    # merely starts with the feature name.
    new_ALL_YEARS.loc[ALL_YEARS[feat].isnull(), new_ALL_YEARS.columns.str.startswith(prefix)] = np.nan
    new_CURRENT_YEAR.loc[CURRENT_YEAR[feat].isnull(), new_CURRENT_YEAR.columns.str.startswith(prefix)] = np.nan


In [10]:
# z-score the continuous variables.
# Mean and standard deviation (both NaN-ignoring) are taken from CURRENT_YEAR
# only and then applied to both tables.
continuous_vars = cognitive_features + medical_features_sums + continuous_demographics

zscore_transforms = {
    feat: (np.nanmean(CURRENT_YEAR[feat]), np.nanstd(CURRENT_YEAR[feat]))
    for feat in continuous_vars
    if feat not in excluded_variables
}

for feat, (feat_mean, feat_std) in zscore_transforms.items():
    new_CURRENT_YEAR[feat] = (CURRENT_YEAR[feat] - feat_mean) / feat_std
    new_ALL_YEARS[feat] = (ALL_YEARS[feat] - feat_mean) / feat_std

In [11]:
# Because the saved tables hold z-scores, also save the (mean, std) used for
# each variable so the raw values can be recovered:  x = (z * std) + mean
# The context manager guarantees the file is closed even if a write fails.
with open(new_folder+"%iyrprev_within%i_mean_std.csv"%(y,within),"w") as f:
    f.write("variable, mean, std\n")
    for key, val in zscore_transforms.items():
        f.write("%s, %f, %f\n"%(key, val[0], val[1]))

In [12]:
# Composite score = row-wise mean of the z-scored member tests, after removing
# any member listed in excluded_variables (pandas' mean skips NaN by default).
for comp, member_tests in composite_vars.items():
    vars_to_check = np.setdiff1d(member_tests, excluded_variables)
    for frame in (new_CURRENT_YEAR, new_ALL_YEARS):
        frame[comp] = frame[vars_to_check].mean(axis=1)

In [13]:
# Binary variables are carried over unchanged, except those in excluded_variables.
kept_binary = [name for name in binary if name not in excluded_variables]
for b in kept_binary:
    new_CURRENT_YEAR[b] = CURRENT_YEAR[b]
    new_ALL_YEARS[b] = ALL_YEARS[b]

In [14]:
# For each output table, print its row count followed by every non-object
# column whose fraction of NaN values exceeds 10%.
print("VARS WITH MISSING VALUES:")
for df in [new_ALL_YEARS, new_CURRENT_YEAR]:
    print(len(df))
    for col in df.columns:
        column_values = df[col].values
        # object-dtype columns cannot be passed to np.isnan; skip them
        if column_values.dtype == 'O':
            continue
        frac_missing = np.mean(np.isnan(column_values))
        if frac_missing > .1:
            print(col, frac_missing)

VARS WITH MISSING VALUES:
24565
onset_label_time 0.6294321188683085
onset_label_time_binary 0.6294321188683085
cts_pmat 0.11768776714838185
cts_stroop_cname 0.5438225117036434
cts_stroop_wread 0.5439853450030532
9103
cts_stroop_cname 0.6451719213446117
cts_stroop_wread 0.6457211908162145


In [15]:
# Display the final column set of the standardized current-year table.
new_CURRENT_YEAR.columns

Index(['projid', 'study', 'fu_year', 'scaled_to', 'onset_label_time',
       'onset_label_time_binary', 'apoe_genotype__22.0', 'apoe_genotype__23.0',
       'apoe_genotype__24.0', 'apoe_genotype__33.0', 'apoe_genotype__34.0',
       'apoe_genotype__44.0', 'race__1.0', 'race__2.0', 'race__3.0',
       'race__6.0', 'dcfdx__1.0', 'dcfdx__2.0', 'dcfdx__3.0', 'cts_animals',
       'cts_bname', 'cts_catflu', 'cts_db', 'cts_delay', 'cts_df',
       'cts_doperf', 'cts_ebdr', 'cts_ebmt', 'cts_fruits', 'cts_idea',
       'cts_lopair', 'cts_mmse30', 'cts_nccrtd', 'cts_pmat', 'cts_pmsub',
       'cts_read_nart', 'cts_sdmt', 'cts_story', 'cts_stroop_cname',
       'cts_stroop_wread', 'cts_wli', 'cts_wlii', 'cts_wliii',
       'med_con_sum_cum', 'vasc_3dis_sum', 'vasc_risks_sum', 'age_at_visit',
       'educ', 'cogn_ep', 'cogn_po', 'cogn_ps', 'cogn_se', 'cogn_wo',
       'cogn_global', 'hypertension_cum', 'cancer_cum', 'diabetes_sr_rx',
       'dm_cum', 'headinjrloc_cum', 'thyroid_cum', 'claudicatio

In [16]:
# Sanity check: (rows, columns) of both standardized tables.
output_shapes = (new_ALL_YEARS.shape, new_CURRENT_YEAR.shape)
print(*output_shapes)

(24565, 65) (9103, 65)


In [17]:
# Write both standardized tables to new_folder under their original file names.
all_years_out = new_folder + all_years_fname
cur_year_out = new_folder + cur_year_fname

for frame, out_path in ((new_ALL_YEARS, all_years_out), (new_CURRENT_YEAR, cur_year_out)):
    frame.to_csv(out_path)

print("saved standardized data sets to: \n%s\n%s"%(all_years_out, cur_year_out))

saved standardized data sets to: 
../DATA/PROCESSED/standardized/merged_data_all_2yrprev_within3.csv
../DATA/PROCESSED/standardized/merged_kept_data_2yrprev_within3.csv
