In [1]:
import os
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import h5py
import copy
from sklearn.linear_model import LinearRegression
from sklearn.decomposition import pca
%matplotlib inline

import sys
path_to_configs = "../"
sys.path.append(path_to_configs)
from configs import *
from models import *
from experiment_helpers import *

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [4]:
# Read the arrays needed by the rest of the notebook out of the full PCA
# dataset (an HDF5 file located from the names provided by `configs`).
path_to_final_data = "".join(
    [path_to_configs, path_to_MDAD_data_folders, full_pca_dataset, ".h5"]
)
with h5py.File(path_to_final_data, 'r') as hf:
    # Show which datasets the file contains.
    available_keys = list(hf.keys())
    print(available_keys)
    # Only the first 500 columns of `ge_transformed` are kept.
    ge = hf["ge_transformed"][:, :500]
    gene_symbols = hf["gene_symbols"][:].astype(str)
    labels = hf["labels"][:]
    labels_names = hf["labels_names"][:]


['PCA_components_', 'ge_transformed', 'gene_symbols', 'labels', 'labels_names']


In [5]:
# Phenotype columns that get a model each in the cells below.
phenotypes = ["CERAD", "BRAAK", "PLAQUES", "TANGLES", "ABETA_IHC", "TAU_IHC"]

# Label table: values and column names are both converted to str first.
labs = pd.DataFrame(labels.astype(str), columns=labels_names.astype(str))

# Every phenotype column is converted from str to float.
for p in phenotypes:
    labs[p] = labs[p].astype(float)

# BRAAK and CERAD are additionally divided by their own column maximum.
for scaled_col in ("BRAAK", "CERAD"):
    labs[scaled_col] = labs[scaled_col] / labs[scaled_col].max()

### Direct application method: train linear models on the original dataset

In [8]:
# Fit one LinearRegression per phenotype, using only the samples whose label
# for that phenotype is not NaN.
LINEAR_MODELS = {}
for p in phenotypes:
    has_label = ~np.isnan(labs[p])
    X = ge[np.where(has_label)]
    Y = labs[p][has_label].values
    # `fit` returns the estimator itself, so the fitted model is stored directly.
    LINEAR_MODELS[p] = LinearRegression().fit(X, Y)

In [9]:
# Root folders for the external-validation inputs and results; both are built
# by prefixing the configured folder names with `path_to_configs`.
path_to_processed, path_to_results = (
    path_to_configs + path_to_ext_val_data_folder,
    path_to_configs + path_to_ext_val_results,
)

In [1]:
# Direct application: run the per-phenotype models fitted above on every file
# in the ge_pca folder and write one predictions CSV per file.
ge_pca_dir = "%sprocessed/all_human/ge_pca/"%path_to_processed
for dset in os.listdir(ge_pca_dir):
    X_test = np.loadtxt("%s%s"%(ge_pca_dir, dset))

    # One column of predictions per phenotype, in the order of `phenotypes`.
    preds = pd.DataFrame(
        {p: LINEAR_MODELS[p].predict(X_test) for p in phenotypes},
        columns=phenotypes,
    )

    # The output folder is named after the input file name.
    out_dir = "%spredictions/%s/Linear"%(path_to_results,dset)
    if not os.path.isdir(out_dir):
        os.makedirs(out_dir)
    preds.to_csv("%s/final.csv"%out_dir)

# Intersection Method

In [11]:
# Intersection method: for each external dataset, fit the per-phenotype linear
# models on that dataset's training matrix and predict on its test matrix.
#
# Fix relative to the previous version of this cell: the matrix loaded from
# GE_PCA_train.txt used to be overwritten by `X = ge[...]` before it was ever
# used, so the models were fit on the original `ge` matrix instead of the
# intersection data. The loaded training matrix is now what gets fitted.
# The non-intersection branch also fits explicitly on the matrix it loads
# rather than relying on whatever LINEAR_MODELS happens to be in the kernel.
dpath = "_intersection"
for dset in ["Mouse", "Blood_GSE63060", "Blood_GSE63061"]:

    if dpath == "_intersection":
        X_train = np.loadtxt("%sprocessed_intersection/%s/GE_PCA_train.txt"%(path_to_processed, dset))
        X_test = np.loadtxt("%sprocessed%s/%s/GE_PCA_test.txt"%(path_to_processed,dpath,dset))
    else:
        with h5py.File(path_to_final_data, 'r') as hf:
            X_train = hf["ge_transformed"][:,:500]
        X_test = np.loadtxt("%sprocessed%s/%s/GE_PCA.txt"%(path_to_processed,dpath,dset))

    # NOTE(review): assumes the rows of X_train are the same samples, in the
    # same order, as the rows of `labs` -- TODO confirm against the script
    # that writes GE_PCA_train.txt. A row-count mismatch is reported up front
    # instead of surfacing as an opaque indexing error.
    if X_train.shape[0] != len(labs):
        raise ValueError(
            "Training matrix for %s has %d rows but labs has %d rows"
            % (dset, X_train.shape[0], len(labs))
        )

    # Refit one model per phenotype on the samples that have a label for it.
    LINEAR_MODELS = {}
    for p in phenotypes:
        has_label = (~np.isnan(labs[p])).values
        X = X_train[has_label]
        Y = labs[p][has_label].values
        LINEAR_MODELS[p] = LinearRegression().fit(X, Y)

    # Predictions are stacked so that each phenotype becomes one column.
    preds = [LINEAR_MODELS[p].predict(X_test) for p in phenotypes]

    path_to_preds = "%spredictions%s/%s/Linear/"%(path_to_results,dpath,dset)
    os.makedirs(path_to_preds, exist_ok=True)

    pred_df = pd.DataFrame(np.vstack(preds).T, columns=phenotypes)
    pred_df.to_csv("%sfinal.csv"%(path_to_preds))
    print("%sfinal.csv"%(path_to_preds))


../../../Pipeline_Outputs_Submitted/External_Validation/predictions_intersection/Mouse/Linear/final.csv
../../../Pipeline_Outputs_Submitted/External_Validation/predictions_intersection/Blood_GSE63060/Linear/final.csv
../../../Pipeline_Outputs_Submitted/External_Validation/predictions_intersection/Blood_GSE63061/Linear/final.csv
