In [None]:
import numpy as np
import random
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import preprocessing, metrics
from sklearn.svm import *
from sklearn.ensemble import *
from sklearn.metrics import *
from sklearn import tree
from sklearn.linear_model import *
from sklearn.model_selection import *
import pandas as pd

In [None]:
# One seed shared by Python's `random`, NumPy's legacy global RNG and every
# estimator / CV splitter below that is given `random_state=rs`.
# torch.manual_seed(42)
rs = 42
for seed_fn in (random.seed, np.random.seed):
    seed_fn(rs)

In [None]:
# Root of the Kaggle input dataset that holds every array loaded in this cell.
data_folder = '/kaggle/input/tcga-brca-final/'


def _load_npy(file_name, **load_kwargs):
    """Load one .npy file stored directly under `data_folder`."""
    return np.load(data_folder + file_name, **load_kwargs)


# Copy-number arrays, one per split.
train_cnv = _load_npy('train_cnv_int.npy')
# train_cnv_unlabeled = _load_npy('unlabeled_cnv_int.npy')
val_cnv = _load_npy('val_cnv_int.npy')
test_cnv = _load_npy('test_cnv_int.npy')

# mRNA arrays, one per split.
train_mrna = _load_npy('train_mrna.npy')
# train_mrna_unlabeled = _load_npy('unlabeled_mrna.npy')
val_mrna = _load_npy('val_mrna.npy')
test_mrna = _load_npy('test_mrna.npy')

# Label arrays are loaded with allow_pickle=True (object arrays);
# only do this for trusted files, since unpickling can execute code.
train_label = _load_npy('train_label.npy', allow_pickle=True)
val_label = _load_npy('val_label.npy', allow_pickle=True)
test_label = _load_npy('test_label.npy', allow_pickle=True)

In [None]:
def _rows_of(patients, sample_ids):
    """Return, in ascending order, every position in `patients` whose ID is in `sample_ids`.

    Unlike `list.index`, this reports all occurrences of a repeated ID.
    """
    return [row for row, patient in enumerate(patients) if patient in sample_ids]


def _drop_rows(array, rows, n_patients, name, axis=0):
    """Delete `rows` from `array` after checking it is still aligned with its patient list.

    `rows` are positions in the patient list, so they are only meaningful while
    `array` has exactly `n_patients` entries along `axis` (or in total when
    `axis` is None, where `np.delete` works on the flattened array).

    Raises:
        ValueError: if the sizes differ, e.g. because this cell was already run
            once and the array has been filtered.
    """
    n_rows = array.size if axis is None else array.shape[axis]
    if n_rows != n_patients:
        raise ValueError(
            f'{name} has {n_rows} entries but its patient list has {n_patients}; '
            'reload the arrays before re-running this cell.'
        )
    return np.delete(array, rows, axis=axis)


# Sample IDs per split. NOTE: allow_pickle=True unpickles objects; only load trusted files.
train_patients = np.load(data_folder + 'train_patients.npy', allow_pickle=True).tolist()
val_patients = np.load(data_folder + 'val_patients.npy', allow_pickle=True).tolist()
test_patients = np.load(data_folder + 'test_patients.npy', allow_pickle=True).tolist()

# Sample IDs whose clinical record has gender == 'MALE'.
clinical = pd.read_csv(data_folder + 'TCGA.BRCA.sampleMap_BRCA_clinicalMatrix', sep='\t')
male_sample = set(clinical.loc[clinical['gender'] == 'MALE', 'sampleID'].values)

# Row positions of male samples within each split.
train_male = _rows_of(train_patients, male_sample)
val_male = _rows_of(val_patients, male_sample)
test_male = _rows_of(test_patients, male_sample)

train_cnv = _drop_rows(train_cnv, train_male, len(train_patients), 'train_cnv')
val_cnv = _drop_rows(val_cnv, val_male, len(val_patients), 'val_cnv')
test_cnv = _drop_rows(test_cnv, test_male, len(test_patients), 'test_cnv')

train_mrna = _drop_rows(train_mrna, train_male, len(train_patients), 'train_mrna')
val_mrna = _drop_rows(val_mrna, val_male, len(val_patients), 'val_mrna')
test_mrna = _drop_rows(test_mrna, test_male, len(test_patients), 'test_mrna')

# Labels are deleted with axis=None (flattened), exactly as np.delete does by default.
train_label = _drop_rows(train_label, train_male, len(train_patients), 'train_label', axis=None)
val_label = _drop_rows(val_label, val_male, len(val_patients), 'val_label', axis=None)
test_label = _drop_rows(test_label, test_male, len(test_patients), 'test_label', axis=None)

In [None]:
# np.load returns an array wrapping the pickled object; .item() extracts it.
# It is used below as a mapping (via .keys() and .get()).
# NOTE: allow_pickle=True unpickles objects; only load trusted files.
g2id_array = np.load(data_folder + 'g2id.npy', allow_pickle=True)
g2id = g2id_array.item()
all_genes = g2id.keys()

In [None]:
# Gene lists read from CSV files, each kept as a NumPy array of one column.
pam50 = pd.read_csv(data_folder + 'PAM50_GENES.csv').loc[:, 'PAM50_GENES'].to_numpy()
cosmic = pd.read_csv(data_folder + 'COSMIC_brca_ground_truth.csv').loc[:, 'Gene Symbol'].to_numpy()

# Two further lists come from separate columns of the same file.
other = pd.read_csv(data_folder + 'brca_gene_ground_truth.csv')
nature17676, ncomms11479 = (
    other[column].to_numpy()
    for column in ('nature17676_of_somatic_mutations', 'ncomms11479')
)

In [None]:
# Fit the label -> integer mapping on the training labels only, then apply it
# to all three splits. OrdinalEncoder expects a 2-D column, hence reshape(-1, 1);
# squeeze() drops that extra axis from the encoded result.
ord_enc = preprocessing.OrdinalEncoder(dtype='int64')
ord_enc.fit(train_label.reshape(-1, 1))
y_train, y_val, y_test = (
    ord_enc.transform(labels.reshape(-1, 1)).squeeze()
    for labels in (train_label, val_label, test_label)
)

In [None]:
# Fold the validation split into the training split (rows appended along axis 0).
# NOTE: this cell is not idempotent; running it twice appends the validation
# rows a second time.
train_mrna = np.concatenate([train_mrna, val_mrna])
train_cnv = np.concatenate([train_cnv, val_cnv])
y_train = np.concatenate([y_train, y_val], axis=0)

In [None]:
# The per-subtype gene lists live in a different Kaggle dataset from the
# expression/CNV arrays. A dedicated variable is used so that `data_folder`
# keeps pointing at the TCGA dataset and earlier loading cells can be re-run.
model_folder = '/kaggle/input/trained-models/'

# Disabled alternative: read the first `max_row` genes from the *_top100_contrib.txt files.
# max_row = 20
# basal_genes = np.loadtxt(model_folder+'basal_top100_contrib.txt', dtype='str', max_rows=max_row)
# her2_genes = np.loadtxt(model_folder+'her2_top100_contrib.txt', dtype='str', max_rows=max_row)
# lumA_genes = np.loadtxt(model_folder+'lumA_top100_contrib.txt', dtype='str', max_rows=max_row)
# lumB_genes = np.loadtxt(model_folder+'lumB_top100_contrib.txt', dtype='str', max_rows=max_row)
# normal_genes = np.loadtxt(model_folder+'normal_top100_contrib.txt', dtype='str', max_rows=max_row)

# 'Gene' column of each subtype file, in file order (presumably ranked; the
# next cell keeps the first k entries of each).
basal_genes = pd.read_csv(model_folder + 'basal_gecna.csv')['Gene'].values
her2_genes = pd.read_csv(model_folder + 'her2_gecna.csv')['Gene'].values
lumA_genes = pd.read_csv(model_folder + 'lumA_gecna.csv')['Gene'].values
lumB_genes = pd.read_csv(model_folder + 'lumB_gecna.csv')['Gene'].values
normal_genes = pd.read_csv(model_folder + 'normal_gecna.csv')['Gene'].values

In [None]:
# Union of the first k genes from each subtype list.
# (To use every listed gene instead, drop the [:k] slices.)
#
# The union is sorted because iteration order of a set of strings depends on
# the interpreter's per-process hash seed. An unsorted list would change the
# feature column order between kernel restarts, and with it the results of the
# seeded estimator fitted below.
k = 100
gen_set = sorted(
    set(basal_genes[:k])
    | set(her2_genes[:k])
    | set(lumA_genes[:k])
    | set(lumB_genes[:k])
    | set(normal_genes[:k])
)

In [None]:
# Number of distinct genes in the union of the five per-subtype selections.
len(gen_set)

In [None]:
# Every selected gene must have an entry in g2id. A plain `.get` would return
# None for an unknown gene and fail later with an opaque unpacking TypeError,
# so check up front and name the offenders.
missing_genes = [gene for gene in gen_set if gene not in g2id]
if missing_genes:
    raise KeyError(
        f'{len(missing_genes)} selected gene(s) have no entry in g2id, '
        f'e.g. {missing_genes[:10]}'
    )

# Each g2id value unpacks into two iterables of column indices: the first is
# used to index the mRNA matrix, the second the CNV matrix.
gen_id = [g2id[gene] for gene in gen_set]
ge_gen_id = []
cnv_gen_id = []
for ge_cols, cnv_cols in gen_id:
    ge_gen_id.extend(ge_cols)
    cnv_gen_id.extend(cnv_cols)

In [None]:
# Feature matrix: selected mRNA columns followed by selected CNV columns,
# joined side by side (column-wise) for each split.
X_train = np.hstack((train_mrna[:, ge_gen_id], train_cnv[:, cnv_gen_id]))
X_test = np.hstack((test_mrna[:, ge_gen_id], test_cnv[:, cnv_gen_id]))

# Disabled alternatives:
#   mRNA only, selected genes
# X_train = train_mrna[:, ge_gen_id]
# X_test = test_mrna[:, ge_gen_id]
#   CNV only, selected genes
# X_train = train_cnv[:, cnv_gen_id]
# X_test = test_cnv[:, cnv_gen_id]
#   all columns of both matrices
# X_train = np.concatenate((train_mrna, train_cnv), axis=1)
# X_test = np.concatenate((test_mrna, test_cnv), axis=1)

In [None]:
# Base estimator. Despite its name, `logis` holds a RandomForestClassifier here.
logis = RandomForestClassifier(random_state=rs)
# Disabled alternatives:
# logis = DecisionTreeClassifier(random_state=rs)
# logis = LogisticRegression(max_iter=10000, tol=1e-4, random_state=rs)
# logis = SVC(max_iter=10000, tol=1e-3, random_state=rs)

# Hyper-parameter grid searched for the random forest.
params = dict(
    n_estimators=[1000],
    max_samples=[0.5, 0.8, 0.9],
    max_features=['sqrt', 'log2', None],
    class_weight=['balanced', 'balanced_subsample'],
)
# Disabled alternative grids over C (the class_weight options are shared):
# class_weight_options = ['balanced',
#                         {0:1, 1:1, 2:2, 3:5, 4:5},
#                         {0:1, 1:1, 2:1, 3:4, 4:4},
#                         {0:1, 1:1, 2:1, 3:2, 4:2},
#                         {0:1, 1:1, 2:1, 3:3, 4:3}]
# params = {'C': [0.001, 0.01, 0.1, 1], 'class_weight': class_weight_options}
# params = {'C': [1, 10, 100, 1000, 10000], 'class_weight': class_weight_options}

# Two metrics are tracked; the best candidate is chosen and refitted by 'Accuracy'.
scoring = dict(Accuracy='accuracy', F1='f1_macro')
cv_generator = RepeatedStratifiedKFold(random_state=rs)

clf = GridSearchCV(
    logis,
    params,
    scoring=scoring,
    refit='Accuracy',
    cv=cv_generator,
    n_jobs=-1,
    verbose=1,
)
clf.fit(X_train, y_train)

In [None]:
# clf = GridSearchCV(estimator=logis, cv=cv_generator, param_grid=params, scoring=scoring, refit='F1', verbose=2)
# clf = GridSearchCV(estimator=logis, param_grid=params, scoring=scoring, refit='Accuracy', verbose=3)
# clf.fit(X_train, y_train)

In [None]:
# acc_fold = [clf.cv_results_['split'+str(i)+'_test_Accuracy'][clf.best_index_] for i in range(5)]
# f1_fold = [clf.cv_results_['split'+str(i)+'_test_F1'][clf.best_index_] for i in range(5)]

In [None]:
# print('Best param: ', clf.best_params_)
# print('Best accuracy: ', np.mean(acc_fold))
# print('Best f1_macro: ', np.mean(f1_fold))

In [None]:
# Winning hyper-parameters and their mean cross-validated score for the refit metric.
for label, value in (('Best param: ', clf.best_params_),
                     ('Best score: ', clf.best_score_)):
    print(label, value)

In [None]:
# Full cross-validation table; display only the columns whose name contains 'rank'.
results_df = pd.DataFrame(clf.cv_results_)
rank_columns = [column for column in results_df.columns if 'rank' in column]
results_df[rank_columns]

In [None]:
# With multi-metric scoring, best_score_ is the mean CV score of the refit
# metric only ('Accuracy'). Also report the mean CV macro-F1 of that same
# candidate so this summary adds information beyond the earlier print.
print('Best param: ', clf.best_params_)
print('Best score: ', clf.best_score_)
print('Best-candidate macro-F1: ', clf.cv_results_['mean_test_F1'][clf.best_index_])

In [None]:
# clf = SVC(C=10, max_iter=10000, tol=1e-3, random_state=42, class_weight='balanced')
# clf.fit(X=X_train, y=y_train)


In [None]:
# Predict the hold-out split with the fitted search object (it was created
# with refit='Accuracy', so prediction uses the refitted best candidate).
preds = clf.predict(X=X_test)

In [None]:
# Pass the full encoded label set explicitly. Without `labels`, the report
# infers classes from y_test/preds, and a class absent from both would make
# `target_names` mismatch and raise. The encoder maps categories to 0..n-1.
class_names = ord_enc.categories_[0]
class_ids = np.arange(len(class_names))

print('HOLD-OUT REPORT:')
print(metrics.classification_report(
    y_test,
    preds,
    labels=class_ids,
    target_names=class_names,
    digits=4,
    zero_division=0,
))

In [None]:
# Build the matrix over every encoded class (0..n-1) so its shape always
# matches `display_labels`, even if a class is missing from y_test and preds.
class_names = ord_enc.categories_[0]
class_ids = np.arange(len(class_names))
hold_out_cm = metrics.confusion_matrix(y_test, preds, labels=class_ids)
metrics.ConfusionMatrixDisplay(hold_out_cm, display_labels=class_names).plot()