## Relationship between the hallmarks of cancer and liver cancer progression

### Load libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

### Set variables

In [2]:
data_dir=""
response_name="patient.race"
rnaseq_file=data_dir+"lihc_rnaseq.csv.gz"
clinical_file=data_dir+"lihc_clinical.csv.gz"

### Load data

#### RNASeq

In [4]:
rnaseq = (pd.
          read_csv(rnaseq_file,compression="gzip").
          set_index('bcr_patient_barcode').
          applymap(lambda x : int(np.ceil(x)))
         )
display(rnaseq.shape)
display(rnaseq.head())

(423, 20531)

Unnamed: 0_level_0,?|100130426,?|100133144,?|100134869,?|10357,?|10431,?|136542,?|155060,?|26823,?|280660,?|317712,...,ZXDA|7789,ZXDB|158586,ZXDC|79364,ZYG11A|440590,ZYG11B|79699,ZYX|7791,ZZEF1|23140,ZZZ3|26009,psiTPTE22|387590,tAKR|389932
bcr_patient_barcode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
TCGA-2V-A95S-01A-11R-A37K-07,0,2,4,91,1018,0,142,1,0,0,...,25,274,795,19,500,3173,891,511,4,7
TCGA-2Y-A9GS-01A-12R-A38B-07,0,27,3,72,640,0,123,2,0,0,...,69,633,1154,72,1001,5302,756,861,7,483
TCGA-2Y-A9GT-01A-11R-A38B-07,0,0,5,96,743,0,96,2,1,0,...,47,1220,1134,13,1290,3220,861,524,15,84
TCGA-2Y-A9GU-01A-11R-A38B-07,0,6,6,62,1187,0,281,1,0,0,...,19,286,1151,10,942,3093,1340,344,3,3
TCGA-2Y-A9GV-01A-11R-A38B-07,0,12,6,105,879,0,283,0,0,0,...,42,1000,1632,5,1381,2903,576,666,3,120


In [5]:
gene_name_logical = [len(x[0])>1 for x in rnaseq.columns.str.split('|')]
sub = rnaseq.loc[:,gene_name_logical]
sub.columns = [x[0] for x in sub.columns.str.split('|')]
rnaseq_sub = sub.copy()
rnaseq_sub.head()

Unnamed: 0_level_0,A1BG,A1CF,A2BP1,A2LD1,A2ML1,A2M,A4GALT,A4GNT,AAA1,AAAS,...,ZXDA,ZXDB,ZXDC,ZYG11A,ZYG11B,ZYX,ZZEF1,ZZZ3,psiTPTE22,tAKR
bcr_patient_barcode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
TCGA-2V-A95S-01A-11R-A37K-07,22283,584,0,375,0,286320,81,543,1,1032,...,25,274,795,19,500,3173,891,511,4,7
TCGA-2Y-A9GS-01A-12R-A38B-07,22642,1573,3,99,0,31169,163,2,1,903,...,69,633,1154,72,1001,5302,756,861,7,483
TCGA-2Y-A9GT-01A-11R-A38B-07,77670,1281,0,215,1,19515,119,1,3,773,...,47,1220,1134,13,1290,3220,861,524,15,84
TCGA-2Y-A9GU-01A-11R-A38B-07,9323,1253,0,2914,2,243941,72,0,0,722,...,19,286,1151,10,942,3093,1340,344,3,3
TCGA-2Y-A9GV-01A-11R-A38B-07,84243,1641,0,404,0,8756,83,0,9,828,...,42,1000,1632,5,1381,2903,576,666,3,120


In [6]:
rnaseq_sub.index = rnaseq_sub.index.map(lambda x: '-'.join(x.split('-')[:3]).lower())
rnaseq_sub.head()

Unnamed: 0_level_0,A1BG,A1CF,A2BP1,A2LD1,A2ML1,A2M,A4GALT,A4GNT,AAA1,AAAS,...,ZXDA,ZXDB,ZXDC,ZYG11A,ZYG11B,ZYX,ZZEF1,ZZZ3,psiTPTE22,tAKR
bcr_patient_barcode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
tcga-2v-a95s,22283,584,0,375,0,286320,81,543,1,1032,...,25,274,795,19,500,3173,891,511,4,7
tcga-2y-a9gs,22642,1573,3,99,0,31169,163,2,1,903,...,69,633,1154,72,1001,5302,756,861,7,483
tcga-2y-a9gt,77670,1281,0,215,1,19515,119,1,3,773,...,47,1220,1134,13,1290,3220,861,524,15,84
tcga-2y-a9gu,9323,1253,0,2914,2,243941,72,0,0,722,...,19,286,1151,10,942,3093,1340,344,3,3
tcga-2y-a9gv,84243,1641,0,404,0,8756,83,0,9,828,...,42,1000,1632,5,1381,2903,576,666,3,120


#### Clinical 

In [7]:
clinical = pd.read_csv('clinical.tsv', sep='\t')
clinical['submitter_id'] = clinical['submitter_id'].map(lambda x: x.lower())
clinical.head()

Unnamed: 0,case_id,submitter_id,project_id,gender,year_of_birth,race,days_to_birth,ethnicity,vital_status,days_to_death,...,treatment_effect,initial_disease_status,treatment_type,therapeutic_agents,regimen_or_line_of_therapy,treatment_intent_type,treatment_anatomic_site,treatment_outcome,days_to_treatment_end,treatment_or_therapy
0,bce25281-502e-4599-9679-32dc8462ffb1,tcga-dd-a4ne,TCGA-LIHC,female,1936,white,-27549,hispanic or latino,Dead,660,...,--,--,"Pharmaceutical Therapy, NOS",--,--,--,--,--,--,no
1,654af137-70d9-4ee4-9f69-793e352d30f8,tcga-dd-a1ee,TCGA-LIHC,male,1933,white,-26858,not hispanic or latino,Dead,349,...,--,--,"Pharmaceutical Therapy, NOS",--,--,--,--,--,--,no
2,45f00f00-d793-4dca-aeb9-7626d4575c90,tcga-gj-a9db,TCGA-LIHC,male,1945,white,-25020,not hispanic or latino,Dead,67,...,--,--,"Pharmaceutical Therapy, NOS",--,--,--,--,--,--,no
3,496beef9-80d9-4734-a23d-334ae1b6aaab,tcga-dd-aac8,TCGA-LIHC,male,1936,asian,-26322,not hispanic or latino,Dead,16,...,--,--,"Pharmaceutical Therapy, NOS",--,--,--,--,--,--,no
4,d1aa9c1a-d732-48c0-b669-8a14e0397344,tcga-2y-a9gv,TCGA-LIHC,female,1953,white,-20011,not hispanic or latino,Dead,2532,...,--,--,"Pharmaceutical Therapy, NOS",--,--,--,--,--,--,yes


#### Join RNASeq and Clinical tumor stage

In [8]:
full_df = pd.merge(rnaseq_sub.reset_index(), clinical[['submitter_id','tumor_stage']], left_on='bcr_patient_barcode', right_on='submitter_id', how='inner') \
    .set_index('bcr_patient_barcode') \
    .drop('submitter_id', axis=1)
#ensuring ID uniqueness
full_df.index = [x+'-'+str(i) for i,x in enumerate(full_df.index)]
full_df.head()

Unnamed: 0,A1BG,A1CF,A2BP1,A2LD1,A2ML1,A2M,A4GALT,A4GNT,AAA1,AAAS,...,ZXDB,ZXDC,ZYG11A,ZYG11B,ZYX,ZZEF1,ZZZ3,psiTPTE22,tAKR,tumor_stage
tcga-2v-a95s-0,22283,584,0,375,0,286320,81,543,1,1032,...,274,795,19,500,3173,891,511,4,7,8170/3
tcga-2y-a9gs-1,22642,1573,3,99,0,31169,163,2,1,903,...,633,1154,72,1001,5302,756,861,7,483,not reported
tcga-2y-a9gt-2,77670,1281,0,215,1,19515,119,1,3,773,...,1220,1134,13,1290,3220,861,524,15,84,stage i
tcga-2y-a9gu-3,9323,1253,0,2914,2,243941,72,0,0,722,...,286,1151,10,942,3093,1340,344,3,3,20187
tcga-2y-a9gv-4,84243,1641,0,404,0,8756,83,0,9,828,...,1000,1632,5,1381,2903,576,666,3,120,stage i


In [9]:
full_df.index.drop_duplicates().shape

(423,)

In [10]:
tumor_stages = clinical['tumor_stage'].value_counts()

In [11]:
tumor_stages

stage i         43
stage iiia      34
stage ii        25
not reported    14
stage iiic       5
                ..
23380            1
24377            1
11838            1
25254            1
16760            1
Name: tumor_stage, Length: 252, dtype: int64

In [12]:
# Subset out the recognizable stages
tumor_stages[tumor_stages.index.str.startswith('stage')]

stage i       43
stage iiia    34
stage ii      25
stage iiic     5
stage iiib     4
stage ivb      2
stage iii      2
stage iv       1
Name: tumor_stage, dtype: int64

In [13]:
# Subset full dataframe for patient samples that have a corresponding tumor stage
full_df = full_df.loc[full_df['tumor_stage'].str.startswith('stage')]

# Since there are substages (eg, stage iia and stage iib), we will conver them to the 4 main stages
full_df['tumor_stage'] = full_df['tumor_stage'].str.replace('stage ', '') \
                            .str.replace('a', '') \
                            .str.replace('b', '') \
                            .str.replace('c', '') \
                            .str.replace('v', '')
full_df

Unnamed: 0,A1BG,A1CF,A2BP1,A2LD1,A2ML1,A2M,A4GALT,A4GNT,AAA1,AAAS,...,ZXDB,ZXDC,ZYG11A,ZYG11B,ZYX,ZZEF1,ZZZ3,psiTPTE22,tAKR,tumor_stage
tcga-2y-a9gt-2,77670,1281,0,215,1,19515,119,1,3,773,...,1220,1134,13,1290,3220,861,524,15,84,i
tcga-2y-a9gv-4,84243,1641,0,404,0,8756,83,0,9,828,...,1000,1632,5,1381,2903,576,666,3,120,i
tcga-2y-a9gw-5,73056,1423,0,270,0,43813,299,0,1,622,...,1647,707,30,748,8467,993,457,7,103,i
tcga-2y-a9gy-7,23352,1559,0,104,2,11699,42,2,3,1179,...,166,996,129,486,3825,458,471,14,1,ii
tcga-2y-a9gz-8,43942,2033,1,963,0,264407,97,1,0,919,...,444,1318,128,1550,1394,712,976,5,271,ii
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
tcga-ub-a7mf-396,13195,641,0,493,0,110307,169,0,1,999,...,307,1222,30,638,3961,918,465,5,0,iii
tcga-wx-aa47-404,689,763,0,199,0,67703,830,0,0,1075,...,295,750,91,647,811,1198,442,2,26,iii
tcga-xr-a8tf-408,62514,3612,0,306,0,3302,62,0,0,1152,...,777,1293,184,1291,945,814,871,2,29,i
tcga-ya-a8s7-410,2675,441,0,62,1,75528,1053,3,0,812,...,427,707,56,639,10911,800,699,3,3,iii


## Load the hallmarks of cancer annotations

In [14]:
import pickle

descr_dict = pickle.load(open('../../hallmarks_of_cancer_description_dictionary.pkl','rb'))
geneset_dict = pickle.load(open('../../hallmarks_of_cancer_geneset_dictionary.pkl','rb'))

## X and y

In [15]:
X = full_df.drop('tumor_stage',axis=1)
display(X.shape)
display(X.head())
y = full_df['tumor_stage']
display(y.shape)
display(y.head())
display(y.value_counts())

(139, 20501)

Unnamed: 0,A1BG,A1CF,A2BP1,A2LD1,A2ML1,A2M,A4GALT,A4GNT,AAA1,AAAS,...,ZXDA,ZXDB,ZXDC,ZYG11A,ZYG11B,ZYX,ZZEF1,ZZZ3,psiTPTE22,tAKR
tcga-2y-a9gt-2,77670,1281,0,215,1,19515,119,1,3,773,...,47,1220,1134,13,1290,3220,861,524,15,84
tcga-2y-a9gv-4,84243,1641,0,404,0,8756,83,0,9,828,...,42,1000,1632,5,1381,2903,576,666,3,120
tcga-2y-a9gw-5,73056,1423,0,270,0,43813,299,0,1,622,...,20,1647,707,30,748,8467,993,457,7,103
tcga-2y-a9gy-7,23352,1559,0,104,2,11699,42,2,3,1179,...,17,166,996,129,486,3825,458,471,14,1
tcga-2y-a9gz-8,43942,2033,1,963,0,264407,97,1,0,919,...,49,444,1318,128,1550,1394,712,976,5,271


(139,)

tcga-2y-a9gt-2     i
tcga-2y-a9gv-4     i
tcga-2y-a9gw-5     i
tcga-2y-a9gy-7    ii
tcga-2y-a9gz-8    ii
Name: tumor_stage, dtype: object

i      54
iii    53
ii     32
Name: tumor_stage, dtype: int64

## X Feature Engineering

In [16]:
gs_dfs = []
for gs,set_ in geneset_dict.items():
    gs_df = pd.DataFrame(X.loc[:,set_].std(1))
    gs_df.columns = [gs]
    gs_dfs.append(gs_df)
X_fe = pd.concat(gs_dfs,1)
X_fe.head()

Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#deprecate-loc-reindex-listlike
  return self._getitem_tuple(key)


Unnamed: 0,HALLMARK_ADIPOGENESIS,HALLMARK_ALLOGRAFT_REJECTION,HALLMARK_ANDROGEN_RESPONSE,HALLMARK_ANGIOGENESIS,HALLMARK_APICAL_JUNCTION,HALLMARK_APICAL_SURFACE,HALLMARK_APOPTOSIS,HALLMARK_BILE_ACID_METABOLISM,HALLMARK_CHOLESTEROL_HOMEOSTASIS,HALLMARK_COAGULATION,...,HALLMARK_PROTEIN_SECRETION,HALLMARK_REACTIVE_OXYGEN_SPECIES_PATHWAY,HALLMARK_SPERMATOGENESIS,HALLMARK_TGF_BETA_SIGNALING,HALLMARK_TNFA_SIGNALING_VIA_NFKB,HALLMARK_UNFOLDED_PROTEIN_RESPONSE,HALLMARK_UV_RESPONSE_DN,HALLMARK_UV_RESPONSE_UP,HALLMARK_WNT_BETA_CATENIN_SIGNALING,HALLMARK_XENOBIOTIC_METABOLISM
tcga-2y-a9gt-2,59644.714679,20328.294564,23425.239059,47314.342902,5986.829113,2476.732985,16926.134574,26106.104684,24065.057892,107141.388803,...,3487.974653,16793.659132,4121.451073,3994.318354,5059.029797,8219.092332,5120.565685,8193.6538,1588.157841,37744.498034
tcga-2y-a9gv-4,54798.654488,11013.987675,10450.210287,36372.345489,4067.949359,3043.309331,13501.10928,27160.200359,19069.604428,88275.726833,...,2995.396715,25506.728791,3032.467312,3903.673227,4323.088264,10278.107744,1774.643495,7736.825851,1046.335806,40181.464356
tcga-2y-a9gw-5,47242.144793,10659.188754,12191.750685,41813.537891,9406.91316,4777.895965,14891.634222,11825.712189,18534.39095,118611.841372,...,3916.577684,114833.579664,2472.251034,2717.541611,6815.115784,8530.569988,3215.02339,10018.626937,951.77189,27973.385345
tcga-2y-a9gy-7,14492.691522,8049.821376,7357.992553,13184.301183,8044.544575,1962.136731,7781.710114,7751.487326,11504.826063,32202.282235,...,4012.192629,69559.662887,1701.187878,2084.033907,3178.856311,7412.89624,1419.053873,5455.628246,1097.06169,15360.003825
tcga-2y-a9gz-8,40887.912687,7821.448847,9815.592247,61869.963777,3474.128339,2505.767634,12756.163096,31345.900199,16833.646407,87661.806033,...,3265.633115,16177.321426,3127.661485,3047.265238,2611.565048,8770.213105,2460.331724,9472.776739,1060.182058,29066.823053


## Vanilla Multinomial Logit

In [17]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, StratifiedKFold
import sklearn.metrics as m

In [18]:
mod = LogisticRegression(
    solver='lbfgs',
    multi_class='multinomial',
    max_iter=1e5
)

In [19]:
test_size=0.15
seed = 1
cv_split = 10
y_map = y.map({'i' : 0,'ii' : 1, 'iii' : 2})
X_fe = X_fe.loc[y_map.index]
X_train, X_test, y_train, y_test = train_test_split(X_fe,y_map,
                                                    test_size=test_size,
                                                    random_state=seed,
                                                    shuffle=True)
cv = StratifiedKFold(n_splits=cv_split,random_state=seed,shuffle=True)
fits = []
llosses = []
for train_index, test_index in cv.split(X_train,y_train):
    fit = mod.fit(X_train.iloc[train_index],y_train.iloc[train_index])
    y_proba = fit.predict_proba(X_train.iloc[test_index])
    fits.append(fit)
    lloss = m.log_loss(y_train.iloc[test_index].values,y_proba,labels=fit.classes_)
    llosses.append(lloss)
best_fit = fits[np.argmin(llosses)].fit(X_train,y_train)

coef_de = (pd.DataFrame(
    best_fit.coef_,
    index=best_fit.classes_,
    columns=X_test.columns).
           rename_axis(y.name).
           reset_index().
           melt(
               id_vars=['tumor_stage'],
               var_name='Feature',
               value_name='Coefficient')
          )
coef_de['MCCV'] = seed
coef_de[y.name] = coef_de[y.name].map({0 : 'i',1 : 'ii',2 : 'iii'})

y_proba = best_fit.predict_proba(X_test)
y_pred = best_fit.predict(X_test)
mcm = m.multilabel_confusion_matrix(
    y_test.values.reshape(-1,1),
    y_pred,labels=[0,1,2]
)



In [20]:
tn = mcm[:, 0, 0]
tp = mcm[:, 1, 1]
fn = mcm[:, 1, 0]
fp = mcm[:, 0, 1]
sensitivity = tp / (tp + fn)
print(sensitivity)
specificity = tn / (fp + tn)
print(specificity)

[0.11111111 0.25       0.        ]
[0.25       0.58823529 0.76923077]


In [21]:
coef_de.head()

Unnamed: 0,tumor_stage,Feature,Coefficient,MCCV
0,i,HALLMARK_ADIPOGENESIS,-0.009266,1
1,ii,HALLMARK_ADIPOGENESIS,0.004689,1
2,iii,HALLMARK_ADIPOGENESIS,0.004577,1
3,i,HALLMARK_ALLOGRAFT_REJECTION,0.044297,1
4,ii,HALLMARK_ALLOGRAFT_REJECTION,-0.034808,1


## Monte Carlo Cross Validation Prediction Functions

[Monte Carlo Cross Validation](https://en.wikipedia.org/wiki/Cross-validation_(statistics)) (MCCV), also known as repeated random sub-sampling validation. MCCV creates multiple random splits of the dataset into training and validation data. For each such split, the model is fit to the training data, and predictive accuracy is assessed using the validation data.  The advantage of MCCV over traditional K-fold cross validation is that the proportion of the training/validation split is not dependent on the number of iterations (folds). The disadvantage of this method is that some observations may never be selected in the validation subsample, whereas others may be selected more than once. In other words, validation subsets may overlap. This method also exhibits Monte Carlo variation, meaning that the results will vary if the analysis is repeated with different random splits.

In [22]:
from sklearn.model_selection import cross_validate, train_test_split, StratifiedKFold
import sklearn.metrics as m
from sklearn.base import clone

def mccv(X,Y,models,metrics=['roc_auc'],cv_split=10,seed=42,test_size=0.15,return_train_score=True,n_jobs=1,retrained_models=False,patient_level_predictions=False,return_estimator=True):

    
    y_map = Y.map({'i' : 0,'ii' : 1, 'iii' : 2})
    X = X.loc[y_map.index]
    X_train, X_test, y_train, y_test = train_test_split(X,y_map,
                                                        test_size=test_size,
                                                        random_state=seed,
                                                        shuffle=True)
    cv = StratifiedKFold(n_splits=cv_split,random_state=seed,shuffle=True)
    fits = []
    llosses = []
    for train_index, test_index in cv.split(X_train,y_train):
        fit = mod.fit(X_train.iloc[train_index],y_train.iloc[train_index])
        y_proba = fit.predict_proba(X_train.iloc[test_index])
        fits.append(fit)
        lloss = m.log_loss(y_train.iloc[test_index].values,y_proba,labels=fit.classes_)
        llosses.append(lloss)
    best_fit = fits[np.argmin(llosses)].fit(X_train,y_train)

    coef_de = (pd.DataFrame(
        best_fit.coef_,
        index=best_fit.classes_,
        columns=X_test.columns).
               rename_axis(y_test.name).
               reset_index().
               melt(
                   id_vars=['tumor_stage'],
                   var_name='Feature',
                   value_name='Coefficient')
              )
    coef_de['MCCV'] = seed
    coef_de[y_test.name] = coef_de[y_test.name].map({0 : 'i',1 : 'ii',2 : 'iii'})

    y_proba = best_fit.predict_proba(X_test)
    y_pred = best_fit.predict(X_test)
    mcm = m.multilabel_confusion_matrix(
        y_test.values.reshape(-1,1),
        y_pred,labels=[0,1,2]
    )
    return mcm, coef_de

In [23]:
from joblib import Parallel, delayed

def bootstrap_of_fcn(func=None,params={},n_jobs=4,nboot=2):
    if func==None:
        return "Need fcn to bootstrap"
    parallel = Parallel(n_jobs=n_jobs)
    return parallel(
        delayed(func)(
            seed=k,**params)
        for k in range(nboot))


## Prediction

In [24]:
classification_metrics = ['roc_auc']
cv_split=2
test_size=0.15
seed=42
n_jobs=4
nboot=1000

mod = LogisticRegression(
    solver='lbfgs',
    multi_class='multinomial',
    max_iter=1e5,
    random_state=seed
)

params = {'X' : X_fe, 'Y' : y, 
          'models' : {'Logit' : mod},'cv_split' : cv_split,
          'metrics' : classification_metrics, 
          'n_jobs' : 1,'test_size' : test_size,
          'retrained_models' : True, 
          'patient_level_predictions' : True}

**NOTE:** The below code will take several minutes to execute:

In [25]:
nboot=nboot
lst = bootstrap_of_fcn(func=mccv,
                       params=params,
                       n_jobs=n_jobs,
                       nboot=nboot)

In [26]:
mcms = [lst[i][0] for i in range(len(lst))]
coef_dfs = [lst[i][1] for i in range(len(lst))]

In [None]:
def contigency_statistics(mcm):
    tn = mcm[:, 0, 0]
    tp = mcm[:, 1, 1]
    fn = mcm[:, 1, 0]
    fp = mcm[:, 0, 1]
    return [tp,fp,fn,tn]

dfs = []
for i,mcm in enumerate(mcms):
    df = (pd.DataFrame(
        contigency_statistics(mcm),
        index=['TP','FP','FN','TN'],
        columns=params['Y'].unique()).
          rename_axis('Statistic').
          reset_index().
          melt(id_vars=['Statistic'],var_name=params['Y'].name)
         )
    df['MCCV'] = i
    dfs.append(df.head())
mcm_df= pd.concat(dfs)
mcm_df.head()

Unnamed: 0,Statistic,tumor_stage,value,MCCV
0,TP,i,2,0
1,FP,i,5,0
2,FN,i,6,0
3,TN,i,8,0
4,TP,ii,1,0


In [None]:
feature_importances_df = pd.concat(coef_dfs)
feature_importances_df.head()

Unnamed: 0,tumor_stage,Feature,Coefficient,MCCV
0,i,HALLMARK_ADIPOGENESIS,-0.000584,0
1,ii,HALLMARK_ADIPOGENESIS,0.000273,0
2,iii,HALLMARK_ADIPOGENESIS,0.000312,0
3,i,HALLMARK_ALLOGRAFT_REJECTION,0.000214,0
4,ii,HALLMARK_ALLOGRAFT_REJECTION,-0.001342,0


In [None]:
mcm_df.to_csv('performance_statistics.csv')
feature_importances_df.to_csv('feature_importances.csv')

## Data visualization

The goal of this section is to visualize the feature importances that we found above in order to better understand the how features vary and across tumor stage.

We'll use a Sankey diagram to do this. A Sankey is a flow diagram, in which the width of arrows is proportional to the flow quantity. The open-sourced Plotly library has a [Sankey object](https://plot.ly/python/sankey-diagram/) that we will use to accomplish this. You can use the sampled code below to get started.

In [3]:
import plotly.graph_objects as go

In [81]:
feat_importances = pd.read_csv('feature_importances.csv')
mcm_df = pd.read_csv('performance_statistics.csv')

In [5]:
fig = go.Figure(data=[go.Sankey(
    node = dict(
      pad = 15,
      thickness = 20,
      line = dict(color = "black", width = 0.5),
      label = ["A1", "A2", "B1", "B2", "C1", "C2"],
      color = "blue"
    ),
    link = dict(
      source = [0, 1, 0, 2, 3, 3], # indices correspond to labels, eg A1, A2, A2, B1, ...
      target = [2, 3, 3, 4, 4, 5],
      value = [8, 4, 2, 8, 4, 2]
  ))])

fig.update_layout(title_text="Basic Sankey Diagram", font_size=10)
fig.show()

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.


In [83]:
feat_importances['Coefficient'] = feat_importances['Coefficient'].apply(lambda x: abs(x))

In [65]:
labels = []
for hallmark in ['HALLMARK_ADIPOGENESIS', 'HALLMARK_ALLOGRAFT_REJECTION', 'HALLMARK_ANDROGEN_RESPONSE']:
    for stage in ['i', 'ii','iii']:
        stage_label = '{} {}'.format(hallmark, stage)
        labels.append(stage_label)
labels

['HALLMARK_ADIPOGENESIS i',
 'HALLMARK_ADIPOGENESIS ii',
 'HALLMARK_ADIPOGENESIS iii',
 'HALLMARK_ALLOGRAFT_REJECTION i',
 'HALLMARK_ALLOGRAFT_REJECTION ii',
 'HALLMARK_ALLOGRAFT_REJECTION iii',
 'HALLMARK_ANDROGEN_RESPONSE i',
 'HALLMARK_ANDROGEN_RESPONSE ii',
 'HALLMARK_ANDROGEN_RESPONSE iii']

In [84]:

fig = go.Figure(data=[go.Sankey(
    node = dict(
      pad = 15,
      thickness = 20,
      line = dict(color = "black", width = 0.5),
      label = labels,
      color = "blue"
    ),
    link = dict(
      source = list(range(0, len(labels), 3)) + list(range(1, len(labels), 3)), # indices correspond to labels, eg A1, A2, A2, B1, ...
      target = list(range(1, len(labels), 3)) + list(range(2, len(labels), 3)),
      value = feat_importances['Coefficient'].values[:9]
  ))])

fig.update_layout(title_text="Basic Sankey Diagram", font_size=10)
fig.show()