## Shapley Values with MLP

## Load our RNA Sequencing data

In [4]:
import pickle
import pandas as pd

# load data
DATADIR = '/fdata/ohsu_data/'

X_rna_seq = pickle.load(open(DATADIR + 'X_rna_seq.p','rb'))
X_rna_seq = X_rna_seq.loc[:,~X_rna_seq.columns.duplicated()]
X_drug_labels = pickle.load(open(DATADIR + 'X_drug_labels.p','rb'))
final_frame = pickle.load(open(DATADIR + 'final_frame.p','rb'))

X = pd.concat([X_drug_labels,X_rna_seq],axis=1)
y = final_frame.IC50

## Load MERGE Data

In [None]:
import pandas as pd

merge_df = pd.read_csv("merge_features.csv")

In [None]:
merge_df = merge_df.transpose()
new_header = merge_df.iloc[0] #grab the first row for the header
merge_df = merge_df[1:] #take the data less the header row
merge_df.columns = new_header #set the header row as the df header
merge_df = merge_df.astype(float)

Some genes are present only in the MERGE dataset, while some are only present in the RNA-seq data.  Here we limit ourselves to only considering the genes present in both.

In [None]:
overlapping_genes = list(set(merge_df.columns).intersection(rna_seq.columns))
merge_df = merge_df[overlapping_genes]
rna_seq = pd.concat([rna_seq[['patient_id']], rna_seq[overlapping_genes]], axis=1)

## Load our Drug Response Data

In [43]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)

The drug response data (our $y$'s) are _very_ bimodal.  As such, here we rescale it to make model prediction less insane.

In [44]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
y_train = sc.fit_transform(y_train.values.reshape(-1,1))
y_test = sc.transform(y_test.values.reshape(-1,1))

In [45]:
X_train = X_train.values
X_test = X_test.values

y_train = y_train.reshape(y_train.shape[0])
y_test = y_test.reshape(y_test.shape[0])

Here we generate the set of model inputs that we will use (i.e., our $x$).  For each drug response row in $y$, we grab the RNA-seq data for that patient.

In [57]:
from torch.nn import MSELoss
from torch import nn
import torch.optim as optim

class MultiDrugMLP(nn.Module):
    def __init__(self, ngenes, ndrugs, dropout):
        super(MultiDrugMLP, self).__init__()
        self.layers = nn.Sequential(
            nn.Linear(ngenes, 500),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(500, 250),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(250, ndrugs)
        )
        
    def forward(self, x):
        return self.layers(x)

mlp = MultiDrugMLP(
    ngenes = X_train.shape[1],
    ndrugs = 1,
    dropout = 0.5
).cuda()


optimizer = optim.Adam(mlp.parameters(),
                       lr=1e-5, weight_decay=5e-3)
criterion = MSELoss()

In [60]:
from torch.nn import MSELoss
import torch.optim as optim
import torch
import datetime
import time
from IPython.core.debugger import set_trace
from models.models import GCN

def train(model, num_epochs, X_train, X_test, y_train, y_test, batch_size):
     
    X_test = torch.FloatTensor(X_test).cuda()
    y_test = torch.FloatTensor(y_test).cuda()
    
    print("Beginning model training at {}".format(datetime.datetime.now()))
    
    for epoch in range(num_epochs):
        start_time = time.time()
        for i in range(0, X_train.shape[0], batch_size):
            X_batch = torch.FloatTensor(X_train[i:i+batch_size]).cuda()
            y_batch = torch.FloatTensor(y_train[i:i+batch_size]).cuda()
            model.train()
            optimizer.zero_grad()
            
            # Convert batch_size x 1 tensor into 1d tensor of length batch_size
            output = model(X_batch)
            
            loss_train = criterion(output, y_batch) + 0.001*(torch.norm(model.layers[0].weight, p=1) + torch.norm(model.layers[3].weight, p=1) + torch.norm(model.layers[6].weight, p=1))
            
            #print("Epoch {} batch {}/{} train loss {}".format(epoch, i+batch_size, x_train.shape[0], loss_train))
            loss_train.backward()
            optimizer.step()
            
        model.eval()
        output = model(X_test).squeeze(1)
        loss_test = criterion(output, y_test)
        end_time = time.time()
        epoch_time = end_time - start_time
        
        print("Epoch {} completed in {} secs with test loss {:.4f}".format(epoch, epoch_time, loss_test.item()))

In [61]:
train(mlp, num_epochs = 1000, X_train = X_train, X_test = X_test, y_train = y_train, y_test = y_test, batch_size=128)

Beginning model training at 2019-10-15 15:37:48.415739
Epoch 0 completed in 4.801731586456299 secs with test loss 1.0273
Epoch 1 completed in 4.43738317489624 secs with test loss 1.0271
Epoch 2 completed in 4.649473428726196 secs with test loss 1.0264
Epoch 3 completed in 4.282020330429077 secs with test loss 1.0267
Epoch 4 completed in 4.561908960342407 secs with test loss 1.0269
Epoch 5 completed in 4.641335964202881 secs with test loss 1.0268
Epoch 6 completed in 4.37453818321228 secs with test loss 1.0268
Epoch 7 completed in 4.597288131713867 secs with test loss 1.0268
Epoch 8 completed in 4.561112642288208 secs with test loss 1.0268
Epoch 9 completed in 4.685428142547607 secs with test loss 1.0268
Epoch 10 completed in 4.867961406707764 secs with test loss 1.0268
Epoch 11 completed in 4.804675340652466 secs with test loss 1.0268
Epoch 12 completed in 4.784103631973267 secs with test loss 1.0268
Epoch 13 completed in 4.9442548751831055 secs with test loss 1.0268
Epoch 14 completed

KeyboardInterrupt: 

In [None]:
y_train