# Evaluating the performance of the CENTRE.MI.MSI consensus LcL classifier across CT when using MI and MSI features from other cell types

In this case MI and MSI features are recomputed using IS and DI across CT

In [1]:
import pandas as pd
from sklearn.ensemble import GradientBoostingClassifier
import numpy as np # calculate the mean and standard deviation
import xgboost as xgb # XGBoost stuff
from xgboost import plot_importance
from sklearn.model_selection import train_test_split # split  data into training and testing sets
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV # cross validation
from sklearn.metrics import confusion_matrix # creates a confusion matrix
#from sklearn.metrics import plot_confusion_matrix # draws a confusion matrix
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import f1_score
from sklearn.metrics import auc
from matplotlib import pyplot
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.model_selection import KFold, cross_val_score
from numpy import sort

In [2]:
# Restore the saved CENTRE.MI.MSI consensusLcL booster from its model file.
# NOTE(review): this booster is not referenced by any later cell visible here,
# and it lives under /home/... while every other path uses /project/... - confirm.
CENTREMIMSI = xgb.Booster()
CENTREMIMSI.load_model(
    "/home/lopez_s/CRUP_scores/CENTRE_HiC/Training/CENTRE_HiC_classifiers/consensusLcL10kb_model.txt"
)

In [3]:
## General paths and suffixes
# A benchmark file name is built as <root> + <sample name> + <suffix>.

# Benchmark tables that include the MI / MSI feature columns
rootMIMSI = "/project/CRUP_scores/CENTRE_HiC/Training/BENGI_MSI_MI_datasets/10Kb/"
suffixMIMSI = "-Benchmark.MI.MSI.v38.csv"

# Processed BENGI benchmark tables
rootBENGI = "/project/CRUP_scores/toSara/BENGI_processed_datasets/"
suffixBENGI = "-Benchmark.v38.txt"

In [4]:
# Read the consensusLcL benchmark table (comma-separated, first row is the header).
sample = "consensusLcL"
consensusLcLMSIMI = pd.read_csv(
    f"{rootMIMSI}{sample}{suffixMIMSI}",
    sep=',',
    header=0,
)

In [5]:
# Show the column names of the loaded benchmark table.
print(consensusLcLMSIMI.columns.tolist())

['pair', 'gene_id1', 'gene_id', 'symbol38', 'symbol19', 'pair19', 'label', 'CV', 'EP_prob_enh.1', 'EP_prob_enh.2', 'EP_prob_enh.3', 'EP_prob_enh.4', 'EP_prob_enh.5', 'EP_prob_gene.1', 'EP_prob_gene.2', 'EP_prob_gene.3', 'EP_prob_gene.4', 'EP_prob_gene.5', 'PP_prob_enh.1', 'PP_prob_enh.2', 'PP_prob_enh.3', 'PP_prob_enh.4', 'PP_prob_enh.5', 'PP_prob_gene.1', 'PP_prob_gene.2', 'PP_prob_gene.3', 'PP_prob_gene.4', 'PP_prob_gene.5', 'distance', 'cor_CRUP', 'combined_tests', 'reg_dist_enh', 'norm_reg_dist_enh', 'reg_dist_prom', 'norm_reg_dist_prom', 'RNA_seq', 'min_insulation', 'mean_switch_intensity']


In [6]:
# Keep a single row per 'pair19' value, then replace every missing value with 0.
consensusLcLMSIMI = consensusLcLMSIMI.drop_duplicates(subset='pair19')
com = consensusLcLMSIMI.fillna(0)

# Hyper-parameters found to work best for the consensusLcL CENTRE.MI.MSI model
# in the analysis carried out for the master's thesis.
best_params = {
    'subsample': 0.9,
    'reg_lambda': 0,
    'n_estimators': 300,
    'max_depth': 8,
    'learning_rate': 0.1,
    'gamma': 1.0,
    'colsample_bytree': 0.9,
}


In [7]:
def CVF1results(com, best_params, test):
    """Train on the folds of `com` and score each held-out fold on `test`.

    For every fold label found in ``com["CV"]`` an XGBoost classifier is
    fitted on all *other* folds of `com` and then applied to the rows of
    `test` that carry the held-out fold label. Per-fold F1 scores are
    collected, and the predictions of all folds are pooled to compute an
    overall F1 score and the area under the precision-recall curve.
    A distance-only baseline (closer pairs rank higher) is also reported;
    note that it is computed on `com`, not on `test`.

    Parameters
    ----------
    com : pandas.DataFrame
        Training table. Must contain 'CV', 'label', 'distance' and the
        identifier columns listed in ``meta_cols`` below; every remaining
        column is used as a model feature.
    best_params : dict
        Extra keyword arguments forwarded to ``xgb.XGBClassifier``.
    test : pandas.DataFrame
        Evaluation table. Must contain 'CV', 'label' and every feature
        column of `com`. Additional columns are ignored.

    Returns
    -------
    tuple
        ``(pooled_f1, fold_f1)``: the F1 score over the pooled predictions
        and the list of per-fold F1 scores, in the order of
        ``com["CV"].unique()``.

    Raises
    ------
    KeyError
        If `test` has no rows for a fold present in `com`, or if it lacks
        one of the feature columns used for training.
    """
    # Identifier / target columns that must never be fed to the model.
    meta_cols = ['gene_id1', 'gene_id', 'symbol38', 'symbol19', 'pair', 'pair19', 'label', 'CV']

    cv_names = com["CV"].unique()
    d = dict(tuple(com.groupby('CV')))
    dtest = dict(tuple(test.groupby('CV')))
    result_all_xgboost = {}
    result_cv_xg = []
    for i in range(len(cv_names)):
        cv_test = cv_names[i]
        print(cv_test)
        cv_train = [x for k, x in enumerate(cv_names) if k != i]
        print(cv_train)
        print(dtest.keys())
        if cv_test not in dtest:
            raise KeyError(
                "fold %r is present in the training data but has no rows in the test data"
                % (cv_test,)
            )

        # Concatenate the training folds in `cv_train` order: the row order
        # feeds XGBoost's random sub-sampling, so it is kept as it was.
        train_xg = pd.concat({k: d[k] for k in cv_train})
        X_train_xg = train_xg.drop(meta_cols, axis=1).copy()
        y_train_xg = train_xg['label'].copy()

        # Select exactly the training features, in training order, so that
        # extra columns in the test table (e.g. 'chr.x' / 'chr.y') or a
        # different column order cannot cause a feature mismatch at predict time.
        X_test_xg = dtest[cv_test][list(X_train_xg.columns)].copy()
        y_test_xg = dtest[cv_test]['label'].copy()

        clf_xgb = xgb.XGBClassifier(objective="binary:logistic", scale_pos_weight=5, random_state=0, **best_params)
        clf_xgb.fit(X_train_xg, y_train_xg)
        pred_s = clf_xgb.predict_proba(X_test_xg)
        lr_probs = pred_s[:, 1]  # probability of the positive class
        yhat = clf_xgb.predict(X_test_xg)
        result_cv = pd.DataFrame({'pred_prob': lr_probs, 'pred_label': yhat, 'true_label': y_test_xg})
        result_cv_xg.append(f1_score(result_cv['true_label'], result_cv['pred_label']))
        result_all_xgboost[cv_test] = result_cv

    # Pool the predictions of all folds for the overall metrics.
    results_xg = pd.concat(result_all_xgboost)

    lr_precision_xg, lr_recall_xg, _ = precision_recall_curve(results_xg['true_label'], results_xg['pred_prob'])
    lr_f1_xg, lr_auc_xg = f1_score(results_xg['true_label'], results_xg['pred_label']), auc(lr_recall_xg, lr_precision_xg)
    print('xgboost HiC 12 fold CV:auc=%.3f' % lr_auc_xg)
    print(lr_f1_xg)
    print(result_cv_xg)

    # Distance baseline on `com`: a smaller |distance| means a higher score.
    # -|distance| ranks the pairs exactly like 1/|distance| (so the PR curve
    # is unchanged) but stays finite when a distance is 0, e.g. after fillna(0).
    dist_precision, dist_recall, _ = precision_recall_curve(com['label'], -com['distance'].abs())
    dist_auc = auc(dist_recall, dist_precision)
    print('Distance:auc=%.3f' % (dist_auc))
    return (lr_f1_xg, result_cv_xg)

In [12]:
# Metadata table of the across-cell-type comparison; its 'JoinedName' column
# is printed here to list the available entries.
pathtometadata = "/project/CRUP_scores/CENTRE_HiC/AcrossCTComparisonMSIMI/AcrossCTMetadata.csv"
metadata = pd.read_csv(pathtometadata, sep=',', header=0)
print(metadata['JoinedName'])


0                  Colon.GTEx-ovary
1               Colon.GTEx-pancreas
2                Colon.GTEx-stomach
3          Ovary.GTEx-sigmoid-colon
4                  Ovary.GTEx-ovary
5               Ovary.GTEx-pancreas
6                Ovary.GTEx-stomach
7       Pancreas.GTEx-sigmoid-colon
8               Pancreas.GTEx-ovary
9            Pancreas.GTEx-pancreas
10            Pancreas.GTEx-stomach
11       Stomach.GTEx-sigmoid-colon
12               Stomach.GTEx-ovary
13            Stomach.GTEx-pancreas
14             Stomach.GTEx-stomach
15              GM12878.CHiC-IMR-90
16                GM12878.CHiC-K562
17      GM12878.CTCF-ChIAPET-IMR-90
18        GM12878.CTCF-ChIAPET-K562
19          GM12878.GEUVADIS-IMR-90
20            GM12878.GEUVADIS-K562
21               GM12878.HiC-IMR-90
22                 GM12878.HiC-K562
23    GM12878.RNAPII-ChIAPET-IMR-90
24      GM12878.RNAPII-ChIAPET-K562
25                IMR90.HiC-GM12878
26                 IMR90.HiC-IMR-90
27                   IMR90.H

In [14]:
##calculate F1 for mixed CT benchmarks
pathBenchmark = "/project/CRUP_scores/CENTRE_HiC/AcrossCTComparisonMSIMI/BENGI_MI_MSI_acrossCT/"
f1data = pd.DataFrame(columns = ["SampleBENGI","SampleMSIMI", "F1_Score_MIMSI", "NumRows"])

for sample in metadata['JoinedName']:
    test = pd.read_csv(pathBenchmark+
                       sample +
                       '.MIMSI.csv', 
                         header=0, sep=',')
    print(list(test))
    test=test.fillna(0)
    test['distance'] =test['distance'].abs()
    lr_f1, resultcv  = CVF1results(com, best_params, test)
    numrows = len(test)
    #Dataframe for the current sample  
    f1data1 = pd.DataFrame()
    f1data1['SampleBENGI'] = [samplebengi]*12
    f1data1['SampleMSIMI'] = [samplehic]*12
    f1data1['F1_Score_MIMSI'] = resultcv
    f1data1['NumRows'] = [numrows]*12
    # Concatenate with big dataframe
    print(f1data1)
    f1data1.to_csv('/project/CRUP_scores/CENTRE_HiC/Training/CENTRE_final_training/F1scoresAcrossCT/' +
                   sample +
                   '_f1scores.csv')
    f1data = pd.concat([f1data, f1data1], ignore_index = True)
            

f1data.to_csv('/project/CRUP_scores/CENTRE_HiC/Training/CENTRE_final_training/F1scoresAcrossCT/F1scoresAllCT.csv')

['pair', 'gene_id1', 'gene_id', 'symbol38', 'symbol19', 'pair19', 'label', 'CV', 'EP_prob_enh.1', 'EP_prob_enh.2', 'EP_prob_enh.3', 'EP_prob_enh.4', 'EP_prob_enh.5', 'EP_prob_gene.1', 'EP_prob_gene.2', 'EP_prob_gene.3', 'EP_prob_gene.4', 'EP_prob_gene.5', 'PP_prob_enh.1', 'PP_prob_enh.2', 'PP_prob_enh.3', 'PP_prob_enh.4', 'PP_prob_enh.5', 'PP_prob_gene.1', 'PP_prob_gene.2', 'PP_prob_gene.3', 'PP_prob_gene.4', 'PP_prob_gene.5', 'distance', 'cor_CRUP', 'combined_tests', 'reg_dist_enh', 'norm_reg_dist_enh', 'reg_dist_prom', 'norm_reg_dist_prom', 'RNA_seq', 'min_insulation', 'mean_switch_intensity']
cv-11
['cv-0', 'cv-5', 'cv-10', 'cv-7', 'cv-2', 'cv-1', 'cv-9', 'cv-8', 'cv-3', 'cv-4', 'cv-6']
dict_keys(['cv-0', 'cv-1', 'cv-10', 'cv-11', 'cv-2', 'cv-3', 'cv-4', 'cv-5', 'cv-6', 'cv-7', 'cv-8', 'cv-9'])


KeyError: "['CVnew', 'chr.x', 'chr.y'] not found in axis"

In [9]:
# Write the collected F1 table to an additional location.
# NOTE(review): this cell's execution count (9) precedes the cell that builds
# `f1data` (14), so it relies on kernel state - confirm it is still needed.
f1data.to_csv(
    '/project/CRUP_scores/CENTRE_HiC/Training/CENTRE_final_training/f1_consensusLcL.csv'
)

In [57]:
# Ad-hoc look at one across-CT benchmark file (space-separated) and its columns.
# NOTE(review): directory and file-name pattern differ from the ones used in
# the scoring loop - presumably an older layout; confirm before relying on it.
test_path = ''.join([
    '/project/CRUP_scores/CENTRE_HiC/Training/BENGI_MSI_MI_acrossCT/',
    "Colon.GTEx",
    'BENGI_',
    "GM12878.HiC",
    'MIMSI.csv',
])
test = pd.read_csv(test_path, sep=' ', header=0)
print(list(test))

['gene_id1', 'gene_id', 'symbol38', 'symbol19', 'pair', 'pair19', 'label', 'CV', 'EP_prob_enh.1', 'EP_prob_enh.2', 'EP_prob_enh.3', 'EP_prob_enh.4', 'EP_prob_enh.5', 'EP_prob_gene.1', 'EP_prob_gene.2', 'EP_prob_gene.3', 'EP_prob_gene.4', 'EP_prob_gene.5', 'PP_prob_enh.1', 'PP_prob_enh.2', 'PP_prob_enh.3', 'PP_prob_enh.4', 'PP_prob_enh.5', 'PP_prob_gene.1', 'PP_prob_gene.2', 'PP_prob_gene.3', 'PP_prob_gene.4', 'PP_prob_gene.5', 'distance', 'cor_CRUP', 'combined_tests', 'reg_dist_enh', 'norm_reg_dist_enh', 'reg_dist_prom', 'norm_reg_dist_prom', 'RNA_seq', 'min_insulation', 'mean_switch_intensity', 'chr.x', 'chr.y']
