In [1]:
import os
import sys
import time

import h5py
import numpy as np
import pandas as pd
import sklearn
import torch
import torch.nn as nn
import torch.nn.functional as F
from matplotlib import pyplot
from sklearn import preprocessing
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.multiclass import OneVsRestClassifier
from torch import optim
from torch.autograd import Variable
from torch.utils.data import DataLoader
from torch.utils.data.dataset import Dataset

# Module-level list; no visible cell appends to it or reads it.
# NOTE(review): presumably meant to collect experiment results - confirm or remove.
results = []

  from ._conv import register_converters as _register_converters


In [103]:
# Load the expression matrix and labels, restrict the genes to KEGG + L1000,
# build the target matrix and split it into train / test / validation sets.

#reading in files
X = pd.read_hdf("data/tcga_mutation_train.h5", "expression")
Y = pd.read_hdf('data/tcga_mutation_train.h5', 'labels')

X_holdout = pd.read_hdf("data/tcga_mutation_test_unlabeled.h5", "expression")

#L1000 subsetting (one gene symbol per line); the context manager closes the handle
with open("L1000_clueio_genelist.txt") as l1000_file:
    l1000 = [i.strip() for i in l1000_file]
#get l1000 genes that are in data
# L1000_= pd.Series(list(set(X.columns) & set(l1000)))
# X_L1000 = X[L1000_] #subset X data

# Prune expression to only KEGG pathway genes
# (.gmt rows are tab separated; fields from the third onwards are gene symbols)
with open("data/c2.cp.kegg.v6.1.symbols.gmt") as f:
    genes_subset = list(set().union(*[line.strip().split("\t")[2:] for line in f.readlines()]))
X_pruned = X.drop(labels=list(set(X.columns) - set(genes_subset)), axis=1, errors="ignore")

#using both kegg and L1000
subset_ = set(X_pruned.columns.tolist() + l1000)
# Sorted so the column order (and therefore the feature order seen by every
# model) is identical across kernel restarts; plain set iteration order is not.
subset = pd.Series(sorted(set(X.columns) & subset_))

print("Number of genes after subsetting:", len(subset))

X_sub = X[subset] #subset X data
X_sub_holdout =X_holdout[subset]


x_array = np.array(X.values, dtype=np.float32)

# extract sample id values
y_names = sorted(set(Y["detailed_category"].values))

m,n = X.shape
# One-hot tumour category followed by three extra target columns.
y_array = np.zeros(shape=(m, len(y_names)+3), dtype=np.float32)

# create a key for id's to indices
y_index_key = {name:i for i,name in enumerate(y_names)}

# generate one-hot vectors for all id's (vectorised; rows follow the order of Y)
# NOTE(review): assumes Y has one row per row of X, in the same order - confirm.
assert len(Y) == m, "labels and expression must have the same number of rows"
category_index = Y["detailed_category"].map(y_index_key).values
y_array[np.arange(m), category_index] = 1

# Columns 6, 7 and 8 of Y (by position) fill the last three target columns.
# NOTE(review): downstream code writes these out as TP53 / KRAS / BRAF mutation
# flags - confirm that this is the column order of the labels table.
y_array[:, -3:] = Y.iloc[:, 6:9].values


#split to train + test
xTrain, xTest, yTrain, yTest = sklearn.model_selection.train_test_split(X_sub, y_array, test_size=0.3, random_state=42)
#split test to test and validate
xTest, xValidate, yTest, yValidate = sklearn.model_selection.train_test_split(xTest, yTest, test_size=1/3, random_state=42)


print(yTrain)
# NOTE(review): xValidate is left as a DataFrame while xTrain / xTest become
# float32 arrays; convert it the same way before using it with the datasets.
xTrain = np.array(xTrain.values, dtype=np.float32)
xTest = np.array(xTest.values, dtype=np.float32)

Number of genes after subsetting: 5676
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 1. 0. 0.]
 [0. 0. 0. ... 0. 1. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [3]:
class AutoencoderDataset(Dataset):
    """Dataset for reconstruction training: every item is an (input, input) pair.

    `x` is a numpy array; it is converted once to a float tensor, and rows
    are counted along its first axis.
    """

    def __init__(self, x):
        self.x_data = torch.from_numpy(x).type(torch.FloatTensor)
        self.length = x.shape[0]

    def __len__(self):
        return self.length

    def __getitem__(self, index):
        # The target of an autoencoder is the input itself.
        features = self.x_data[index]
        target = self.x_data[index]
        return features, target
    
class PredictorDataset(Dataset):
    """Supervised dataset pairing each row of `x` with the same row of `y`.

    Both numpy arrays are converted once to float tensors; the length is the
    number of rows of `x`.
    """

    def __init__(self, x, y):
        float_type = torch.FloatTensor
        self.x_data = torch.from_numpy(x).type(float_type)
        self.y_data = torch.from_numpy(y).type(float_type)
        self.length = x.shape[0]

    def __len__(self):
        return self.length

    def __getitem__(self, index):
        features = self.x_data[index]
        target = self.y_data[index]
        return features, target

In [146]:
import random
def addNoise(x, rate=0.01):
    '''Return a 0/1 copy of `x` in which a small fraction of the 1s is set to 0.

    An entry becomes 0 when it is already 0 or when `entry * u <= rate` for a
    fresh uniform draw `u` in [0, 1); every other entry becomes 1.  For a
    binary input this zeroes each 1 with probability `rate` (1% by default).

    NOTE(review): the output is always binary.  A non-binary input (for
    example continuous expression values) is thresholded to 0/1, and entries
    <= 0 always map to 0 - confirm this is intended for such inputs.

    Parameters
    ----------
    x : array-like
        Input matrix.
    rate : float, optional
        Threshold compared against `entry * u`; defaults to 0.01.

    Returns
    -------
    numpy.ndarray
        Integer array of 0s and 1s with the same shape as `x`.
    '''
    values = np.asarray(x)
    # One uniform draw per entry, generated in a single vectorised call instead
    # of one Python-level random call per matrix element.
    draws = np.random.uniform(0, 1, size=values.shape)
    dropped = (values == 0) | (values * draws <= rate)
    return np.where(dropped, 0, 1)

In [147]:

def train_batch(model, x, y, optimizer, loss_fn, layer):
    """Run one optimisation step on a single mini-batch.

    Parameters
    ----------
    model : object exposing `forward(x, layer)`
    x, y : batch inputs and targets
    optimizer : optimizer bound to the model parameters
    loss_fn : callable `(prediction, target) -> loss`
    layer : forwarded to `model.forward` to select the sub-network

    Returns
    -------
    float
        The loss of this batch, measured before the parameter update.
    """
    # Run forward calculation
    y_predict = model.forward(x, layer)

    # Compute loss.
    loss = loss_fn(y_predict, y)

    # Clear gradients left over from the previous step; they accumulate otherwise.
    optimizer.zero_grad()

    # Backward pass: gradient of the loss with respect to the model parameters.
    loss.backward()

    # Apply the parameter update.
    optimizer.step()

    # The loss is a 0-dim tensor on PyTorch >= 0.4, where indexing it with [0]
    # fails; `.item()` is the supported accessor.  Older releases have no
    # `.item()`, so fall back to the legacy indexing there.
    if hasattr(loss, "item"):
        return loss.item()
    return loss.data[0]


def train(model, x_in, y_in, optimizer, loss_fn, layer, epochs=20, batch_size=16):
    """Train `model` for `epochs` passes and return the per-batch losses.

    Each epoch runs `addNoise` on `x_in` again, pairs the result with the
    untouched `y_in` in a shuffled DataLoader and calls `train_batch` on every
    mini-batch.  `layer` is passed through to `train_batch`.

    Returns a list with one loss value per processed batch, in training order.
    """
    history = []

    for _ in range(epochs):
        # A new noisy copy of the inputs for every epoch.
        noisy_inputs = addNoise(x_in)
        epoch_loader = DataLoader(
            dataset=PredictorDataset(x=noisy_inputs, y=y_in),
            batch_size=batch_size,
            shuffle=True,
        )

        for batch_x, batch_y in epoch_loader:
            batch_loss = train_batch(
                model=model,
                x=Variable(batch_x),
                y=Variable(batch_y),
                optimizer=optimizer,
                loss_fn=loss_fn,
                layer=layer,
            )
            history.append(batch_loss)

    return history



In [148]:
def test_batch(model, x, y):
    # run forward calculation
    y_predict = model.forward(x, 0)

    return y, y_predict


def test(model, loader):
    """Predict every batch of `loader` and return the stacked predictions.

    Parameters
    ----------
    model : model accepted by `test_batch`
    loader : iterable of `(x, y)` batches

    Returns
    -------
    numpy.ndarray
        Model outputs of all batches, concatenated along the first axis in
        loader order.  The targets are not returned.

    Raises
    ------
    ValueError
        If the loader yields no batches.
    """
    predictions = []

    for x, y in loader:
        _, y_predict = test_batch(model=model, x=Variable(x), y=Variable(y))
        # `.data` gives the tensor without autograd history, so it can be
        # converted to numpy.
        predictions.append(y_predict.data.numpy())

    if not predictions:
        # np.concatenate would raise the same exception type with a less
        # helpful message.
        raise ValueError("loader yielded no batches; nothing to predict")

    return np.concatenate(predictions)


In [149]:
class ShallowLinear(nn.Module):
    '''
    Fully connected network used both as a two-layer stacked autoencoder and
    as a predictor.

    Layers:
      linear1 / decode1 : first encoder layer and its decoder (D_in <-> H1)
      linear2 / decode2 : second encoder layer and its decoder (H1 <-> H2)
      linear3, linear4  : prediction head (H2 -> H3 -> D_out)
    '''
    def __init__(self, size1, size2, size3, trainData, n_outputs=None):
        '''
        size1, size2, size3 : widths of the three hidden layers.
        trainData : 2-D training inputs; its second dimension is the input
            width.  If None, the width of the module-level `xTest` is used.
        n_outputs : output width.  If None (the default), the width of the
            module-level `yTest` is used.
        '''
        # Perform initialization of the pytorch superclass
        super(ShallowLinear, self).__init__()

        # Define network layer dimensions
        if trainData is not None:
            D_in = int(np.shape(trainData)[1])
        else:
            D_in = len(xTest[0])
        D_out = len(yTest[0]) if n_outputs is None else int(n_outputs)
        H1, H2, H3 = size1, size2, size3

        self.linear1 = nn.Linear(D_in, H1)
        self.decode1 = nn.Linear(H1, D_in)
        self.linear2 = nn.Linear(H1, H2)
        self.decode2 = nn.Linear(H2, H1)
        self.linear3 = nn.Linear(H2, H3)
        self.linear4 = nn.Linear(H3, D_out)

    def forward(self, x, layer):
        '''
        Run one of three sub-networks, selected by `layer`:

          0 : full predictor, linear1 -> linear2 -> linear3 -> linear4 with
              ReLU after every layer except the last.
          1 : first autoencoder, linear1 -> decode1 (ReLU after both);
              expects inputs of width D_in.
          2 : second autoencoder, linear2 -> decode2 (ReLU after both);
              expects inputs of width H1.

        Raises ValueError for any other `layer`.
        '''
        if layer == 0:
            x = F.relu(self.linear1(x))
            x = F.relu(self.linear2(x))
            x = F.relu(self.linear3(x))
            return self.linear4(x)

        if layer == 1:
            return F.relu(self.decode1(F.relu(self.linear1(x))))

        if layer == 2:
            return F.relu(self.decode2(F.relu(self.linear2(x))))

        # Previously an unknown selector returned the input untouched.
        raise ValueError("layer must be 0, 1 or 2, got %r" % (layer,))

    def encode(self, dataset, layer):
        '''
        Return the encoder activations for every item of `dataset` as a
        numpy array.

        dataset : dataset whose items are (x, y) pairs; only x is used.
        layer   : 1 -> output of the first encoder layer (width H1);
                  0 -> output of both encoder layers (width H2).

        Raises ValueError for any other `layer`.
        '''
        if layer not in (0, 1):
            # Previously an unknown selector returned the raw inputs.
            raise ValueError("layer must be 0 or 1, got %r" % (layer,))

        # The whole dataset is pushed through as a single, unshuffled batch so
        # the output rows are in dataset order.
        loader = DataLoader(dataset=dataset, batch_size=len(dataset), shuffle=False)
        xs = []
        for x, _ in loader:
            x = Variable(x)
            x = F.relu(self.linear1(x))
            if layer == 0:
                x = F.relu(self.linear2(x))
            xs.append(x.data.numpy())
        return np.concatenate(xs)

In [150]:
def run(dataset_train, dataset_test, dataset_train_encode, size1, size2, size3, ae):
    """Train a predictor, optionally pretrained as a stacked autoencoder, and
    predict the test set.

    Parameters
    ----------
    dataset_train : dataset of training pairs; encoded to produce the inputs
        of the second autoencoder layer.
    dataset_test : dataset that is predicted after training.
    dataset_train_encode : accepted for interface compatibility; not used.
    size1, size2, size3 : hidden layer widths passed to `ShallowLinear`.
    ae : if true, pretrain both encoder layers and copy the weights into the
        predictor before supervised training.

    Training itself reads the module-level `xTrain` / `yTrain` arrays.

    Returns
    -------
    (loss, y_predict) : the per-batch losses of the supervised training and
        the predictions for `dataset_test`.
    """
    # The whole test set is predicted as one unshuffled batch.
    data_loader_test = DataLoader(dataset=dataset_test, batch_size=len(dataset_test), shuffle=False)

    # Define the hyperparameters
    # (mini-batch size is left at the default of train())
    learning_rate = 1e-2

    # Define the loss function
    loss_fn = nn.MSELoss()  # mean squared error

    # Create a new neural net for prediction
    predictor_model = ShallowLinear(size1, size2, size3, xTrain)
    if ae:
        encoder_model = ShallowLinear(size1, size2, size3, xTrain)

        # Initialize the optimizer with above parameters
        optimizer = optim.SGD(encoder_model.parameters(), lr=learning_rate)

        # Train the first autoencoder layer to reconstruct its input
        train(model=encoder_model, x_in=xTrain, y_in=xTrain, optimizer=optimizer, layer=1, loss_fn=loss_fn, epochs=10)

        # Train the second autoencoder layer on the output of the first
        x_encoded = np.array(encoder_model.encode(dataset_train, 1))
        train(model=encoder_model, x_in=x_encoded, y_in=x_encoded, optimizer=optimizer, layer=2, loss_fn=loss_fn, epochs=10)

        # Copy parameters from the autoencoding net INTO the predicting net.
        # The previous element-wise copy ran in the opposite direction and
        # overwrote the pretrained encoder with the predictor's untrained
        # weights, so pretraining had no effect on the predictor.
        predictor_model.load_state_dict(encoder_model.state_dict())

    # Train the predicting model
    optimizer1 = optim.SGD(predictor_model.parameters(), lr=learning_rate)
    loss = train(model=predictor_model, x_in=xTrain, y_in=yTrain, optimizer=optimizer1, layer=0, loss_fn=loss_fn)

    # Test and get the resulting predicted y values
    y_predict = test(model=predictor_model, loader=data_loader_test)

    return loss, y_predict


In [None]:

aeResults = []
nnResults = []
def testAE_NN():
    print("Testing stacked autoencoder plus neural net. Two autoencoder layers with relu activation + one layer with relu activation")
    dataset_train = PredictorDataset(x=xTrain, y=yTrain)
    dataset_test = PredictorDataset(x=xTest, y=yTest)
    dataset_train_encode = AutoencoderDataset(xTrain)
    sizes = [(1200, 1100), (int(len(xTest[0])*.75), int(len(xTest[0])/2)),
             (int(len(xTest[0])/5), int(len(xTest[0])/10)), (int(len(xTest[0])/10), int(len(xTest[0])/15))]
    for size in sizes:
        size1, size2 = size
        size3=size2

        ### Train and predict with autoencoder
        startTime = time.time()
        losses, yPredict = run(dataset_train=dataset_train, dataset_test=dataset_test, dataset_train_encode = dataset_train_encode, 
                               size1=size1, size2=size2, size3=size3, ae = True)
        elapsedTime = time.time() - startTime
        yPred = np.array([[y_names[np.argmax(vector[0:-3])], 1 if vector[-3]>=0.5 else 0, 1 if vector[-2]>=0.5 else 0, 1 if vector[-1]>=0.5 else 0] 
                          for vector in yPredict])

        yActual = np.array([[y_names[np.argmax(vector[0:-3])],1 if vector[-3]==1 else 0, 1 if vector[-2]==1 else 0, 1 if vector[-1]==1 else 0] for vector in yTest])

        isCorrect = [[yPred[i][j] == yActual[i][j] for j in range(len(yPred[0]))] for i in range(len(yPred))]

        
        numCorrect = np.sum(isCorrect, axis=0)
        
        pred0, act0 = [v[0] for v in yPred], [v[0] for v in yActual]
        pred1, act1 =  [int(v[1]) for v in yPred], [v[1] for v in yActual]
        pred2, act2 =  [int(v[2]) for v in yPred], [v[2] for v in yActual]
        pred3, act3 =  [int(v[3]) for v in yPred], [v[3] for v in yActual]
        
        ##### Write output file and get f1 score
        pd.DataFrame({
            "TumorTypePrediction": pred0,
            "TP53MutationPrediction": pred1,
            "KRASMutationPrediction": pred2,
            "BRAFMutationPrediction": pred3,
        }).to_csv("test_predictions.tsv", sep="\t")

        pd.DataFrame({
            "primary.disease.or.tissue": act0,
            "TP53_mutant": act1,
            "KRAS_mutant": act2,
            "BRAF_mutant": act3,
        }).to_csv("test_actuals.tsv", sep="\t")
        
        print("% Correct with autoencoder: ", size1, size2, size3, (numCorrect/len(isCorrect)))
        print("Elapsed time %fs" % elapsedTime)
        !Rscript class/BME230_F1score_V2.R test_predictions.tsv test_actuals.tsv
        
        ### Train and predict without autoencoder
        startTime = time.time()
        losses, yPredict = run(dataset_train=dataset_train, dataset_test=dataset_test, dataset_train_encode = dataset_train_encode, 
                               size1=size1, size2=size2, size3=size3, ae = False)
        elapsedTime = time.time() - startTime
        yPred = np.array([[y_names[np.argmax(vector[0:-3])], 1 if vector[-3]>=0.5 else 0, 1 if vector[-2]>=0.5 else 0, 1 if vector[-1]>=0.5 else 0] 
                          for vector in yPredict])

        yActual = np.array([[y_names[np.argmax(vector[0:-3])], 1 if vector[-3]==1 else 0, 1 if vector[-2]==1 else 0, 1 if vector[-1]==1 else 0] 
                            for vector in yTest])

        isCorrect = [[yPred[i][j] == yActual[i][j] for j in range(len(yPred[0]))] for i in range(len(yPred))]

        numCorrect = np.sum(isCorrect, axis=0)

        pred0, act0 = [v[0] for v in yPred], [v[0] for v in yActual]
        pred1, act1 =  [int(v[1]) for v in yPred], [v[1] for v in yActual]
        pred2, act2 =  [int(v[2]) for v in yPred], [v[2] for v in yActual]
        pred3, act3 =  [int(v[3]) for v in yPred], [v[3] for v in yActual]
        
        ##### Write output file and get f1 score
        pd.DataFrame({
            "TumorTypePrediction": pred0,
            "TP53MutationPrediction": pred1,
            "KRASMutationPrediction": pred2,
            "BRAFMutationPrediction": pred3,
        }).to_csv("test_predictions.tsv", sep="\t")

        pd.DataFrame({
            "primary.disease.or.tissue": act0,
            "TP53_mutant": act1,
            "KRAS_mutant": act2,
            "BRAF_mutant": act3,
        }).to_csv("test_actuals.tsv", sep="\t")
        
        print("% Correct without autoencoder: ", size1, size2, size3, (numCorrect/len(isCorrect)))
        print("Elapsed time %f s" % elapsedTime )
        !Rscript class/BME230_F1score_V2.R test_predictions.tsv test_actuals.tsv
        
        

testAE_NN()

Testing stacked autoencoder plus neural net. Two autoencoder layers with relu activation + one layer with relu activation




## Train the model

In [None]:
# Pretrain a two-layer stacked autoencoder on the training split, then fit a
# one-vs-rest logistic regression on the encoded training data.

# Batch size is the number of training examples used to calculate each iteration's gradient
# (defined before the loaders that use it)
batch_size_train = 16

dataset_train = PredictorDataset(x=xTrain, y=yTrain)
dataset_train_encode = AutoencoderDataset(xTrain)

data_loader_train = DataLoader(dataset=dataset_train, batch_size=batch_size_train, shuffle=True)
data_loader_train_encode = DataLoader(dataset=dataset_train_encode, batch_size=batch_size_train, shuffle=True)


# Define the hyperparameters
learning_rate = 1e-2
# Hidden layer widths: the configuration the logistic-regression experiment
# cell uses (1/5 and 1/10 of the number of input features).
size1, size2 = int(len(xTest[0])/5), int(len(xTest[0])/10)

# Define the loss function
loss_fn = nn.MSELoss()  # mean squared error

# Train and get the resulting loss per iteration
#Train first autoencoder layer
encoder_model = ShallowLinear(size1, size2, size2, xTrain)

# Initialize the optimizer with above parameters
optimizer = optim.SGD(encoder_model.parameters(), lr=learning_rate)
train(model=encoder_model, x_in=xTrain, y_in = xTrain, optimizer=optimizer, layer = 1, loss_fn=loss_fn, epochs = 30, batch_size=batch_size_train)
#Train second autoencoder layer on the output of the first
x_encoded = encoder_model.encode(dataset_train, 1)
dataset_train_encode = AutoencoderDataset(x_encoded)
data_loader_train_encode = DataLoader(dataset=dataset_train_encode, batch_size=batch_size_train, shuffle=True)
train(model=encoder_model, x_in=x_encoded, y_in= x_encoded, optimizer=optimizer, layer = 2, loss_fn=loss_fn, epochs = 30, batch_size=batch_size_train)

newXTrain = encoder_model.encode(dataset_train, 0) #Pass xTrain through both encoder layers
aeLogRegModel = OneVsRestClassifier(LogisticRegression()).fit(newXTrain, yTrain)
    

## Run the model

In [None]:

# Encode the samples to predict with the trained encoder, classify them with
# the fitted logistic regression and write the decoded predictions to predict.tsv.

# NOTE(review): `xtest` was not defined anywhere before this cell.  It is bound
# to the unlabeled holdout set because the other cell that writes predict.tsv
# predicts X_sub_holdout; switch to xTest if the test split was intended.
xtest = np.array(X_sub_holdout, dtype=np.float32)

dataset_test = AutoencoderDataset(x=xtest)
data_loader_test = DataLoader(dataset=dataset_test, batch_size=len(dataset_test), shuffle=False)

newXTest = encoder_model.encode(dataset_test, 0) #Pass xtest through both encoder layers
y_predict = aeLogRegModel.predict(newXTest)

# Decode the classifier output: tumour type from the arg-max of the one-hot
# part, then the last three flags thresholded at 0.5.  (This previously
# iterated over yPred itself instead of y_predict.)
yPred = np.array([[y_names[np.argmax(vector[0:-3])], 1 if vector[-3]>=0.5 else 0, 1 if vector[-2]>=0.5 else 0, 1 if vector[-1]>=0.5 else 0] for vector in y_predict])

pred0 = [v[0] for v in yPred]
pred1 =  [int(v[1]) for v in yPred]
pred2 =  [int(v[2]) for v in yPred]
pred3 =  [int(v[3]) for v in yPred]

##### Write output file
pd.DataFrame({
    "TumorTypePrediction": pred0,
    "TP53MutationPrediction": pred1,
    "KRASMutationPrediction": pred2,
    "BRAFMutationPrediction": pred3,
}).to_csv("predict.tsv", sep="\t")


In [106]:

from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier

def run_logreg(xtrain, ytrain, xtest, ytest, size1, size2, multi):
    """Pretrain a stacked autoencoder, then fit logistic regression on the
    encoded features and predict `xtest`.

    Parameters
    ----------
    xtrain, ytrain : training inputs and targets (numpy arrays).
    xtest : inputs to predict.
    ytest : only used to build the test dataset; its values are never read
        by the encoder or the classifier.
    size1, size2 : widths of the first and second encoder layer (the third
        hidden layer of `ShallowLinear` also gets `size2`).
    multi : if true, wrap the classifier in `OneVsRestClassifier`; otherwise
        fit a plain `LogisticRegression` on `ytrain`.
        NOTE(review): the plain path presumably needs a 1-D `ytrain` - confirm.

    Returns
    -------
    Predictions of the fitted classifier for the encoded `xtest`.
    """
    dataset_train = PredictorDataset(x=xtrain, y=ytrain)
    dataset_test = PredictorDataset(x=xtest, y=ytest)

    # Define the hyperparameters
    # (mini-batch size is left at the default of train())
    learning_rate = 1e-2

    # Define the loss function
    loss_fn = nn.MSELoss()  # mean squared error

    #Train first autoencoder layer
    encoder_model = ShallowLinear(size1, size2, size2, xtrain)

    # Initialize the optimizer with above parameters
    optimizer = optim.SGD(encoder_model.parameters(), lr=learning_rate)
    train(model=encoder_model, x_in=xtrain, y_in = xtrain, optimizer=optimizer, layer = 1, loss_fn=loss_fn, epochs = 30)

    #Train second autoencoder layer on the output of the first
    x_encoded = encoder_model.encode(dataset_train, 1)
    train(model=encoder_model, x_in=x_encoded, y_in= x_encoded, optimizer=optimizer, layer = 2, loss_fn=loss_fn, epochs = 30)

    # Pass both sets through the two trained encoder layers
    newXTrain = encoder_model.encode(dataset_train, 0)
    newXTest = encoder_model.encode(dataset_test, 0)

    # Fit the classifier on the encoded training data
    if multi:
        aeLogRegModel = OneVsRestClassifier(LogisticRegression()).fit(newXTrain, ytrain)
    else:
        aeLogRegModel = LogisticRegression().fit(newXTrain, ytrain)

    y_predict_ae = aeLogRegModel.predict(newXTest)
    return y_predict_ae


aeResults = []
yActual = np.array([[y_names[np.argmax(vector[0:-3])], 1 if vector[-3]==1 else 0, 1 if vector[-2]==1 else 0, 1 if vector[-1]==1 else 0] for vector in yTest])
sizes = [
         (int(len(xTest[0])/5), int(len(xTest[0])/10))]
'''
print("Results logreg without autoencoder:")
startTime = time.time()
aeLogRegModel = OneVsRestClassifier(LogisticRegression()).fit(xTrain, np.array(yTrain))
yPred = aeLogRegModel.predict(xTest)
elapsedTime = time.time() - startTime
yPred = np.array([[y_names[np.argmax(vector[0:-3])], 1 if vector[-3]>=0.5 else 0, 1 if vector[-2]>=0.5 else 0, 1 if vector[-1]>=0.5 else 0] for vector in yPred])

isCorrect = [[yPred[i][j] == yActual[i][j] for j in range(len(yPred[0]))] for i in range(len(yPred))]


numCorrect = np.sum(isCorrect, axis=0)

pred0, act0 = [v[0] for v in yPred], [v[0] for v in yActual]
pred1, act1 =  [int(v[1]) for v in yPred], [v[1] for v in yActual]
pred2, act2 =  [int(v[2]) for v in yPred], [v[2] for v in yActual]
pred3, act3 =  [int(v[3]) for v in yPred], [v[3] for v in yActual]

##### Write output file and get f1 score
pd.DataFrame({
    "TumorTypePrediction": pred0,
    "TP53MutationPrediction": pred1,
    "KRASMutationPrediction": pred2,
    "BRAFMutationPrediction": pred3,
}).to_csv("test_predictions.tsv", sep="\t")

pd.DataFrame({
    "primary.disease.or.tissue": act0,
    "TP53_mutant": act1,
    "KRAS_mutant": act2,
    "BRAF_mutant": act3,
}).to_csv("test_actuals.tsv", sep="\t")

print("% Correct with logreg : ", (numCorrect/len(isCorrect)))
print("Elapsed time: %f s" % elapsedTime)
!Rscript class/BME230_F1score_V2.R test_predictions.tsv test_actuals.tsv
'''
print()
print("results logreg with autoencoder")
for size in sizes:
    size1, size2 = size
    
    startTime = time.time()
    yPred = run_logreg(xTrain, np.array(yTrain), np.array(X_sub_holdout), np.array(X_sub_holdout), size1, size2, True)
    elapsedTime = time.time() - startTime
    
    yPred = np.array([[y_names[np.argmax(vector[0:-3])], 1 if vector[-3]>=0.5 else 0, 1 if vector[-2]>=0.5 else 0, 1 if vector[-1]>=0.5 else 0] for vector in yPred])
    
    isCorrect = [[yPred[i][j] == yActual[i][j] for j in range(len(yPred[0]))] for i in range(len(yPred))]


    numCorrect = np.sum(isCorrect, axis=0)
    
    pred0, act0 = [v[0] for v in yPred], [v[0] for v in yActual]
    pred1, act1 =  [int(v[1]) for v in yPred], [v[1] for v in yActual]
    pred2, act2 =  [int(v[2]) for v in yPred], [v[2] for v in yActual]
    pred3, act3 =  [int(v[3]) for v in yPred], [v[3] for v in yActual]

    ##### Write output file and get f1 score
    pd.DataFrame({
        "TumorTypePrediction": pred0,
        "TP53MutationPrediction": pred1,
        "KRASMutationPrediction": pred2,
        "BRAFMutationPrediction": pred3,
    }).to_csv("predict.tsv", sep="\t")
    
    pd.DataFrame({
        "primary.disease.or.tissue": act0,
        "TP53_mutant": act1,
        "KRAS_mutant": act2,
        "BRAF_mutant": act3,
    }).to_csv("test_actuals.tsv", sep="\t")

    print("% Correct with logreg and autoencoder: ", size1, size2, (numCorrect/len(isCorrect)))
    print("Elapsed time: %fs" % elapsedTime)
    !Rscript class/BME230_F1score_V2.R test_predictions.tsv test_actuals.tsv
    


results logreg with autoencoder


