In [100]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
import numpy as np
from sklearn.metrics import average_precision_score
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score
from sklearn.metrics import precision_recall_curve
import h5py
import os
import pickle
from matplotlib import pyplot as plt
%matplotlib inline
import gc

### Read in data

In [101]:
orig_folder ="../DATA/PROCESSED/standardized/"
new_folder = "../DATA/PROCESSED/standardized_stacked_imputed/"

In [102]:
years = 2 # how many years in the past to collect data from
file_suffix = str(years)+"yrprev_within3"
DATA = pd.read_csv(os.path.join(orig_folder,"merged_kept_data_%s.csv"%file_suffix), index_col=0)
DATA_all = pd.read_csv(os.path.join(orig_folder,"merged_data_all_%s.csv"%file_suffix), index_col=0).drop_duplicates()

In [1]:
for col in DATA.columns[4:]:
    if np.mean(np.isnan(DATA[col])) > 0:
        print(col, np.mean(np.isnan(DATA[col])))

NameError: name 'DATA' is not defined

### Impute missing values (mean for continuous, mode for binary)

In [108]:
# IF VARIABLE IS MISSING, USE MEAN IMPUTATION FOR CONTINUOUS VALS, MODE IMPUTATION FOR BINARY VALS:
# these can stay as is
binary = DATA.columns[[len(np.unique(DATA[c].dropna())) == 2 for c in DATA.columns]]

impute_vals = {}


for col in DATA.columns[4:]:
    if col in binary:
        impute_vals[col] = DATA[col].mode().values[0]
    else:
        impute_vals[col] = DATA[col].mean()

    if np.mean(np.isnan(DATA[col])) > 0:
        if col in binary:
            # FILL WITH MOST COMMON:
            DATA[col].fillna(DATA[col].mode().values[0], inplace=True)
        else:
            DATA[col].fillna(DATA[col].mean(), inplace=True)


In [111]:
feature_names = DATA.columns[6:]

In [112]:
feature_names

Index(['apoe_genotype__22.0', 'apoe_genotype__23.0', 'apoe_genotype__24.0',
       'apoe_genotype__33.0', 'apoe_genotype__34.0', 'apoe_genotype__44.0',
       'race__1.0', 'race__2.0', 'race__3.0', 'race__6.0', 'dcfdx__1.0',
       'dcfdx__2.0', 'dcfdx__3.0', 'cts_animals', 'cts_bname', 'cts_catflu',
       'cts_db', 'cts_delay', 'cts_df', 'cts_doperf', 'cts_ebdr', 'cts_ebmt',
       'cts_fruits', 'cts_idea', 'cts_lopair', 'cts_mmse30', 'cts_nccrtd',
       'cts_pmat', 'cts_pmsub', 'cts_read_nart', 'cts_sdmt', 'cts_story',
       'cts_stroop_cname', 'cts_stroop_wread', 'cts_wli', 'cts_wlii',
       'cts_wliii', 'med_con_sum_cum', 'vasc_3dis_sum', 'vasc_risks_sum',
       'age_at_visit', 'educ', 'cogn_ep', 'cogn_po', 'cogn_ps', 'cogn_se',
       'cogn_wo', 'cogn_global', 'hypertension_cum', 'cancer_cum',
       'diabetes_sr_rx', 'dm_cum', 'headinjrloc_cum', 'thyroid_cum',
       'claudication_cum', 'heart_cum', 'stroke_cum', 'msex', 'spanish'],
      dtype='object')

### Stack matrices for each year (required for LSTM)

In [113]:
ALL_SAMPLES = DATA[["projid", "fu_year", "onset_label_time", "onset_label_time_binary"]].values

ALL_FEATURES_TIME = np.zeros([len(DATA), years+1, len(feature_names)])
for i,row in DATA.iterrows():
    if i%1000==0:
        print("%i of %i"%(i, len(DATA)))

    for j,t in enumerate(range(years, 0, -1)):
        pid_df = DATA_all[DATA_all["projid"]==row["projid"]]
        
        new_rows = pid_df[pid_df["fu_year"].isin(row["fu_year"]-np.arange(1,years+1))]

        for col in new_rows.columns[new_rows.isnull().any()].tolist():
            new_rows[col] = new_rows[col].fillna(impute_vals[col])
        ALL_FEATURES_TIME[i,:-1,:] = new_rows[feature_names].values
        
    ALL_FEATURES_TIME[i,-1,:]=(row[feature_names].values.flatten())

0 of 9103


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


1000 of 9103
2000 of 9103
3000 of 9103
4000 of 9103
5000 of 9103
6000 of 9103
7000 of 9103
8000 of 9103
9000 of 9103


In [114]:
ALL_FEATURES_TIME.shape

(9103, 3, 59)

In [116]:
import h5py

if not os.path.isdir(new_folder):
    os.makedirs(new_folder)

with h5py.File(os.path.join(new_folder,"%s.h5"%file_suffix), 'w') as hf:
    hf.create_dataset("features", data=ALL_FEATURES_TIME)
    hf.create_dataset("samples", data=ALL_SAMPLES)
