In [1]:
import pandas as pd
import sys
import numpy as np
from matplotlib import pyplot
import h5py
import os
import sklearn
from sklearn.model_selection import StratifiedShuffleSplit
import torch
import torch.nn as nn
from torch import optim
from torch.autograd import Variable
from torch.utils.data import DataLoader
from torch.utils.data.dataset import Dataset
import torch.nn.functional as F
from sklearn import preprocessing
from scipy.stats import randint as spRand


results = []  # NOTE(review): not referenced by any cell visible here -- confirm it is still needed

  from ._conv import register_converters as _register_converters


In [2]:
# Imports used by this cell, grouped at the top of the cell instead of
# being scattered between statements.
from sklearn import preprocessing
import sklearn.model_selection

# Read the expression matrix and the label table from the training HDF5 file.
X = pd.read_hdf("data/tcga_mutation_train.h5", "expression")
Y = pd.read_hdf('data/tcga_mutation_train.h5', 'labels')

# L1000 subsetting: the gene list holds one gene symbol per line.
# The context manager closes the handle; a bare open() would leave it open.
with open("L1000_clueio_genelist.txt") as l1000_file:
    l1000 = [i.strip() for i in l1000_file]
# L1000 genes that are present in the data, taken in X's own column order so
# the subset is laid out identically on every run (iterating a set of strings
# gives an order that can change between interpreter sessions).
L1000_ = pd.Series(X.columns[X.columns.isin(l1000)].unique())
X_L1000 = X[L1000_]  # subset X data

# Prune expression to the genes named in the gene-set (.gmt) file: each line is
# tab-separated and everything from the third field onwards is a gene symbol.
# NOTE(review): this step was labelled "KEGG pathway genes", but the file name
# (c4.cgn) looks like MSigDB's C4 cancer-gene-neighborhood collection -- confirm.
with open("c4.cgn.v6.1.symbols (1).gmt") as f:
    genes_subset = list(set().union(*[line.strip().split("\t")[2:] for line in f]))
# Keep only the columns whose name appears in the gene-set file.
X_pruned = X.loc[:, X.columns.isin(genes_subset)]

# Encode disease names as integer codes in a new column of Y.
disease_encoder = preprocessing.LabelEncoder()
Y["disease_encoding"] = disease_encoder.fit_transform(Y["primary.disease.or.tissue"])

# Divide up into train and test (80/20, fixed seed).
# NOTE(review): the split is not stratified by disease; passing `stratify=`
# would change the partition, so it is left as-is here.
X_train, X_test, Y_train, Y_test = sklearn.model_selection.train_test_split(
    X_pruned, Y, test_size=0.20, random_state=42)

In [3]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.multiclass import OneVsRestClassifier
from sklearn.model_selection import RandomizedSearchCV

In [5]:
# Use RandomizedSearchCV to randomly probe random-forest hyperparameters.
def randomCV(features, n_iter=4, random_state=None):
    """Build an unfitted randomized hyper-parameter search over a random forest.

    Parameters
    ----------
    features : sized collection
        Feature names (e.g. ``X_train.columns``). Only ``len(features)`` is
        used, as the inclusive upper bound for ``max_features``.
    n_iter : int, default 4
        Number of parameter settings the search samples.
    random_state : int, RandomState instance or None, default None
        Seed passed to both the forest and the search so a run can be
        repeated. ``None`` leaves both unseeded.

    Returns
    -------
    RandomizedSearchCV
        The unfitted search object; call ``.fit(X, y)`` on it.
    """
    # scipy's randint excludes the upper bound, hence the "+ 1" / 11 / 996.
    # `warm_start` is deliberately not part of the search space: the search
    # fits a fresh clone for every candidate, so it cannot change any score,
    # and a best estimator left with warm_start=True would not grow new trees
    # if it were fitted again later.
    param_dist = {
        'bootstrap': [False, True],
        'criterion': ['gini', 'entropy'],
        'max_depth': [3, None],
        'max_features': spRand(1, len(features) + 1),
        'min_samples_leaf': spRand(1, 11),
        'min_samples_split': spRand(2, 11),
        'n_estimators': spRand(500, 996),
    }
    return RandomizedSearchCV(
        RandomForestClassifier(random_state=random_state),
        param_distributions=param_dist,
        n_iter=n_iter,
        random_state=random_state,
    )

# The search itself is left disabled in this cell; only the marker is printed.
#tp53_model = randomCV(X_train.columns)
print('howdy')
#tp53_model.fit(X_train, Y_train.TP53_mutant)
#print("TP53 Score:", tp53_model.score(X_test, Y_test.TP53_mutant))


howdy


In [10]:
def _fit_mutation_forest(gene, n_estimators=1000, random_state=42):
    """Fit a random forest for one mutation label and print its test accuracy.

    Parameters
    ----------
    gene : str
        Gene prefix; the target column is ``gene + "_mutant"`` in
        ``Y_train`` / ``Y_test``.
    n_estimators : int, default 1000
        Number of trees in the forest.
    random_state : int, default 42
        Seed for the forest so the printed score can be reproduced.

    Returns
    -------
    RandomForestClassifier
        The forest fitted on ``X_train``.
    """
    target = gene + "_mutant"
    model = RandomForestClassifier(n_estimators=n_estimators, random_state=random_state)
    model.fit(X_train, Y_train[target])
    print(gene + " Score:", model.score(X_test, Y_test[target]))
    return model


# One baseline forest per mutation label; later cells use these three names.
tp53_model = _fit_mutation_forest("TP53")
kras_model = _fit_mutation_forest("KRAS")
braf_model = _fit_mutation_forest("BRAF")

TP53 Score: 0.8104460093896714
KRAS Score: 0.9465962441314554
BRAF Score: 0.9559859154929577


In [11]:
# Tumour-type model: one-vs-rest wrapper around a 500-tree random forest.
# The forest is seeded so the printed score can be reproduced on a re-run.
disease_forest = RandomForestClassifier(n_estimators=500, random_state=42)
disease_model = OneVsRestClassifier(disease_forest).fit(X_train, Y_train["primary.disease.or.tissue"])
print("Disease Score:", disease_model.score(X_test, Y_test["primary.disease.or.tissue"]))

Disease Score: 0.9178403755868545


In [8]:
pd.DataFrame({
    "TumorTypePrediction": disease_model.predict(X_test),
    "TP53MutationPrediction": tp53_model.predict(X_test),
    "KRASMutationPrediction": kras_model.predict(X_test),
    "BRAFMutationPrediction": braf_model.predict(X_test),
}).to_csv("test_predictions.tsv", sep="\t")

pd.DataFrame({
    "primary.disease.or.tissue": Y_test["primary.disease.or.tissue"],
    "TP53_mutant": Y_test.TP53_mutant,
    "KRAS_mutant": Y_test.KRAS_mutant,
    "BRAF_mutant": Y_test.BRAF_mutant,
}).to_csv("test_actuals.tsv", sep="\t")
!Rscript BME230_F1score_V2.R test_predictions.tsv test_actuals.tsv

[1] "Pheochromocytoma & Paraganglioma_F1_score: 0.967741935483871"
[1] "Cervical & Endocervical Cancer_F1_score: 0.703703703703704"
[1] "Breast Invasive Carcinoma_F1_score: 0.983050847457627"
[1] "Lung Adenocarcinoma_F1_score: 0.887573964497042"
[1] "Lung Squamous Cell Carcinoma_F1_score: 0.792207792207792"
[1] "Colon Adenocarcinoma_F1_score: 0.742857142857143"
[1] "Rectum Adenocarcinoma_F1_score: 0.222222222222222"
[1] "Thyroid Carcinoma_F1_score: 0.989247311827957"
[1] "Kidney Clear Cell Carcinoma_F1_score: 0.932038834951456"
[1] "Esophageal Carcinoma_F1_score: 0.646153846153846"
[1] "Mesothelioma_F1_score: 0.896551724137931"
[1] "Ovarian Serous Cystadenocarcinoma_F1_score: 0.976190476190476"
[1] "Prostate Adenocarcinoma_F1_score: 1"
[1] "Brain Lower Grade Glioma_F1_score: 0.929577464788732"
[1] "Cholangiocarcinoma_F1_score: 0.461538461538462"
[1] "Liver Hepatocellular Carcinoma_F1_score: 0.972222222222222"
[1] "Bladder Urothelial Carcinoma_F1_score: 0.8"
[1] "Uterine Carcinosarcoma_

In [12]:
# Use RandomizedSearchCV to randomly probe random-forest hyperparameters.
def randomCV(features, n_iter=4, random_state=None):
    """Build an unfitted randomized hyper-parameter search over a random forest.

    Parameters
    ----------
    features : sized collection
        Feature names (e.g. ``X_train.columns``). Only ``len(features)`` is
        used, as the inclusive upper bound for ``max_features``.
    n_iter : int, default 4
        Number of parameter settings the search samples.
    random_state : int, RandomState instance or None, default None
        Seed passed to both the forest and the search so a run can be
        repeated. ``None`` leaves both unseeded.

    Returns
    -------
    RandomizedSearchCV
        The unfitted search object; call ``.fit(X, y)`` on it.
    """
    # scipy's randint excludes the upper bound, hence the "+ 1" / 11 / 996.
    # `warm_start` is deliberately not part of the search space: the search
    # fits a fresh clone for every candidate, so it cannot change any score,
    # and a best estimator left with warm_start=True would not grow new trees
    # if it were fitted again later.
    param_dist = {
        'bootstrap': [False, True],
        'criterion': ['gini', 'entropy'],
        'max_depth': [3, None],
        'max_features': spRand(1, len(features) + 1),
        'min_samples_leaf': spRand(1, 11),
        'min_samples_split': spRand(2, 11),
        'n_estimators': spRand(500, 996),
    }
    return RandomizedSearchCV(
        RandomForestClassifier(random_state=random_state),
        param_distributions=param_dist,
        n_iter=n_iter,
        random_state=random_state,
    )

# Tune a TP53 classifier with the randomized search, then report test accuracy.
tp53_model = randomCV(X_train.columns).fit(X_train, Y_train["TP53_mutant"])
print("TP53 Score:", tp53_model.score(X_test, Y_test["TP53_mutant"]))

howdy
TP53 Score: 0.8309859154929577


In [None]:
# Every task gets its OWN search object. Binding kras_model / braf_model to
# tp53_model only creates another name for the same estimator, so each re-fit
# would overwrite the previous task's model and all three names would end up
# pointing at the classifier fitted last (BRAF).
kras_model = randomCV(X_train.columns)
kras_model.fit(X_train, Y_train.KRAS_mutant)
print("KRAS Score:", kras_model.score(X_test, Y_test.KRAS_mutant))

braf_model = randomCV(X_train.columns)
braf_model.fit(X_train, Y_train.BRAF_mutant)
print("BRAF Score:", braf_model.score(X_test, Y_test.BRAF_mutant))

# The one-vs-rest wrapper is given a fresh, unfitted search as its base
# estimator rather than the already-fitted TP53 search.
disease_model = OneVsRestClassifier(randomCV(X_train.columns)).fit(X_train, Y_train["primary.disease.or.tissue"])
print("Disease Score:", disease_model.score(X_test, Y_test["primary.disease.or.tissue"]))

In [None]:
pd.DataFrame({
    "TumorTypePrediction": disease_model.predict(X_test),
    "TP53MutationPrediction": tp53_model.predict(X_test),
    "KRASMutationPrediction": kras_model.predict(X_test),
    "BRAFMutationPrediction": braf_model.predict(X_test),
}).to_csv("test_predictions.tsv", sep="\t")

pd.DataFrame({
    "primary.disease.or.tissue": Y_test["primary.disease.or.tissue"],
    "TP53_mutant": Y_test.TP53_mutant,
    "KRAS_mutant": Y_test.KRAS_mutant,
    "BRAF_mutant": Y_test.BRAF_mutant,
}).to_csv("test_actuals.tsv", sep="\t")
!Rscript BME230_F1score_V2.R test_predictions.tsv test_actuals.tsv