# Evaluating the performance of the CENTRE.MI.MSI consensus LcL classifier accross CT when using MI and MSI features from other cell types

In [1]:
import pandas as pd
from sklearn.ensemble import GradientBoostingClassifier
import numpy as np # calculate the mean and standard deviation
import xgboost as xgb # XGBoost stuff
from xgboost import plot_importance
from sklearn.model_selection import train_test_split # split  data into training and testing sets
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV # cross validation
from sklearn.metrics import confusion_matrix # creates a confusion matrix
#from sklearn.metrics import plot_confusion_matrix # draws a confusion matrix
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import f1_score
from sklearn.metrics import auc
from matplotlib import pyplot
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.model_selection import KFold, cross_val_score
from numpy import sort

In [3]:
#### Load the CENTRE.MI.MSI classifier
CENTREMIMSI = xgb.Booster()
CENTREMIMSI.load_model("/home/lopez_s/CRUP_scores/CENTRE_HiC/Training/CENTRE_HiC_classifiers/consensusLcL10kb_model.txt") 

In [4]:
##General paths and suffixes
suffixMIMSI = "-Benchmark.MI.MSI.v38.csv"
suffixBENGI = "-Benchmark.v38.txt"
rootMIMSI = "/project/CRUP_scores/CENTRE_HiC/Training/BENGI_MSI_MI_datasets/10Kb/"
rootBENGI = "/project/CRUP_scores/toSara/BENGI_processed_datasets/"

In [5]:
sample = "consensusLcL"
consensusLcLMSIMI = pd.read_csv(rootMIMSI+sample+suffixMIMSI, 
                 header=0, sep=',')

In [6]:
print(list(consensusLcLMSIMI))

['pair', 'gene_id1', 'gene_id', 'symbol38', 'symbol19', 'pair19', 'label', 'CV', 'EP_prob_enh.1', 'EP_prob_enh.2', 'EP_prob_enh.3', 'EP_prob_enh.4', 'EP_prob_enh.5', 'EP_prob_gene.1', 'EP_prob_gene.2', 'EP_prob_gene.3', 'EP_prob_gene.4', 'EP_prob_gene.5', 'PP_prob_enh.1', 'PP_prob_enh.2', 'PP_prob_enh.3', 'PP_prob_enh.4', 'PP_prob_enh.5', 'PP_prob_gene.1', 'PP_prob_gene.2', 'PP_prob_gene.3', 'PP_prob_gene.4', 'PP_prob_gene.5', 'distance', 'cor_CRUP', 'combined_tests', 'reg_dist_enh', 'norm_reg_dist_enh', 'reg_dist_prom', 'norm_reg_dist_prom', 'RNA_seq', 'min_insulation', 'mean_switch_intensity']


In [7]:
consensusLcLMSIMI = consensusLcLMSIMI.drop_duplicates('pair19')
com=consensusLcLMSIMI.fillna(0)
#From the analysis coming from the master Thesis we found these to be the best parameters for the consensusLcL CENTREMI.MSI model
best_params = {'subsample': 0.9, 'reg_lambda': 0, 'n_estimators': 300, 'max_depth': 8, 'learning_rate': 0.1, 'gamma': 1.0, 'colsample_bytree': 0.9}


In [8]:
#Lists containing the samples for which we have BENGI data sets (this means data set without Hic) and the samples for which we have hic

samplelisthic = ["Colon.GTEx", "GM12878.CHiC", "GM12878.CTCF-ChIAPET", "GM12878.GEUVADIS", "GM12878.HiC", "GM12878.RNAPII-ChIAPET",
                  "IMR90.HiC", "K562.CRISPR", "K562.HiC", "Ovary.GTEx", "Pancreas.GTEx", "Stomach.GTEx"]

samplelistbengi = ["Colon.GTEx", "GM12878.CHiC", "GM12878.CTCF-ChIAPET", "GM12878.GEUVADIS", "GM12878.HiC", "GM12878.RNAPII-ChIAPET",
                  "IMR90.HiC", "K562.CRISPR", "K562.HiC", "Ovary.GTEx", "Pancreas.GTEx", "Stomach.GTEx", "Liver.GTEx", "NHEK.HiC", 
                  "HeLa.CTCF-ChIAPET", "HeLa.HiC", "HeLa.RNAPII-ChIAPET"]

In [36]:
##WARNING: only run once
for samplebengi in samplelistbengi:
    for samplehic in samplelisthic:
        if samplebengi != samplehic :
            bengi = pd.read_csv(rootBENGI+samplebengi+suffixBENGI, 
                         header=0, sep='\t')
            bengi = bengi.drop_duplicates('pair19')
            print("Length of bengi is:", len(bengi))
            MSIMI = pd.read_csv(rootMIMSI+samplehic+suffixMIMSI, 
                         header=0, sep=',')
            MSIMI = MSIMI.drop_duplicates('pair19')
            MSIMI = MSIMI[['pair', 'min_insulation', 'mean_switch_intensity']]
            print("Length of hic is:", len(MSIMI))
            test = pd.merge(bengi, MSIMI, how="inner", on="pair")
            
            print("Length of test is:", len(test))
            test.to_csv('/project/CRUP_scores/CENTRE_HiC/Training/BENGI_MSI_MI_acrossCT/'+ samplebengi + 'BENGI_' + samplehic+ 'MIMSI.csv', index = False)

Length of bengi is: 26983
Length of hic is: 283632
Length of test is: 2869
Length of bengi is: 26983
Length of hic is: 80965
Length of test is: 1472
Length of bengi is: 26983
Length of hic is: 43288
Length of test is: 2416
Length of bengi is: 26983
Length of hic is: 35189
Length of test is: 524
Length of bengi is: 26983
Length of hic is: 116429
Length of test is: 1757
Length of bengi is: 26983
Length of hic is: 25564
Length of test is: 591
Length of bengi is: 26983
Length of hic is: 13961
Length of test is: 481
Length of bengi is: 26983
Length of hic is: 27079
Length of test is: 420
Length of bengi is: 26983
Length of hic is: 13789
Length of test is: 1343
Length of bengi is: 26983
Length of hic is: 43357
Length of test is: 5540
Length of bengi is: 26983
Length of hic is: 35857
Length of test is: 6801
Length of bengi is: 283632
Length of hic is: 26983
Length of test is: 2869
Length of bengi is: 283632
Length of hic is: 80965
Length of test is: 53723
Length of bengi is: 283632
Length of 

In [34]:
# since the sizes of the datasets have changed we need to reassign the cv folds
## Taken from BENGI github https://github.com/weng-lab/BENGI/blob/master/Scripts/Generate-Benchmark/assign.groups.py
## The code is adapted

##More general determination of groups. 
def Determine_Groups(chromDict):
    chromArray=[]
    groupDict={}
    for key, value in sorted(chromDict.items(), key=lambda x:x[1]):
        chromArray.append(key)
    print(chromArray)
    numtwo = len(chromArray) % 12
    res = 12 - numtwo
    for i in range(0,numtwo):
        groupDict[chromArray[i]]="cv-"+str(i)
        groupDict[chromArray[-res-i]]="cv-"+str(i)
    if numtwo != 11:
        for i in range(numtwo, 11):
            groupDict[chromArray[i]]="cv-"+str(i)
    groupDict[chromArray[-1]]="cv-11"
    return groupDict

def Create_Chrom_Dict(enhancers):
    chromDict={}
    enhancerDict={}
    for line in enhancers:
        line=line.rstrip().split(" ")
        #skip the header
        if line[38] == "chr.y":
            print(line)
            continue
        line[38]=line[38].rstrip()
        if line[38] not in chromDict:
            chromDict[line[38]]=1
        else:
            chromDict[line[38]]+=1
        if line[2].rstrip() not in enhancerDict:
            enhancerDict[line[2].rstrip()]=line[38]
    return chromDict, enhancerDict

def CVmaker(enhancerspath):
    enhancers=open(enhancerspath)
    chromDict, enhancerDict = Create_Chrom_Dict(enhancers)
    enhancers.close()
    groupDict=Determine_Groups(chromDict)
    sizecvdict = {}
    seenchroms = []
    for key, value in groupDict.items():
        if value not in sizecvdict:
            sizecvdict[value] = chromDict[key]
            seenchroms.append(key)
        if value in sizecvdict and key not in seenchroms : 
            sizecvdict[value] += chromDict[key]
    return chromDict, enhancerDict, groupDict, sizecvdict

In [38]:
def CVF1results(com, best_params, test):

    cv_names=com["CV"].unique()
    d = dict(tuple(com.groupby('CV')))
    dtest = dict(tuple(test.groupby('CV')))
    result_all_xgboost={}
    result_all_tf={}
    result_cv_xg=[]
    result_cv_tf=[]
    for i in range(len(cv_names)):
        cv_test=cv_names[i]
        print(cv_test)
        cv_train=[x for k,x in enumerate(cv_names) if k!=i]
        print(cv_train)
        print(dtest.keys())
        X_test_xg = dtest[cv_test].drop(['gene_id1', 'gene_id', 'symbol38', 'symbol19', 'pair', 'pair19', 'label', 'CV', 'CVnew', 'chr.x', 'chr.y'], axis=1).copy()
        y_test_xg = dtest[cv_test]['label'].copy()
        train_xg=pd.concat({k: d[k] for k in cv_train})
        X_train_xg= train_xg.drop(['gene_id1', 'gene_id', 'symbol38', 'symbol19', 'pair', 'pair19', 'label', 'CV'], axis=1).copy()
        y_train_xg = train_xg['label'].copy()
        clf_xgb = xgb.XGBClassifier(objective = "binary:logistic",scale_pos_weight=5,random_state=0,**best_params)
        clf_xgb.fit(X_train_xg, y_train_xg)
        pred_s = clf_xgb.predict_proba(X_test_xg)
        lr_probs =pred_s[:, 1]
        yhat = clf_xgb.predict(X_test_xg)
        result_cv = pd.DataFrame({'pred_prob':lr_probs,'pred_label': yhat,'true_label':y_test_xg})
        result_cv_xg.append(f1_score(result_cv['true_label'], result_cv['pred_label']))
        result_all_xgboost[cv_test]=result_cv
        



    results_xg=pd.concat(result_all_xgboost)
    
    lr_precision_xg, lr_recall_xg, _ = precision_recall_curve(results_xg['true_label'], results_xg['pred_prob'])
    lr_f1_xg, lr_auc_xg = f1_score(results_xg['true_label'], results_xg['pred_label']), auc(lr_recall_xg, lr_precision_xg)
    print('xgboost HiC 12 fold CV:auc=%.3f' % lr_auc_xg)
    print(lr_f1_xg)
    print(result_cv_xg)

    dist_precision, dist_recall, _=precision_recall_curve(com['label'],1/abs(com['distance']))
    dist_auc = auc(dist_recall, dist_precision)
    print('Distance:auc=%.3f' % (dist_auc))
    return(lr_f1_xg, result_cv_xg)

In [8]:
##Separate cell lines and tissues
## This is only tissues
samplelisthictissue = ["Colon.GTEx", "Ovary.GTEx", "Pancreas.GTEx", "Stomach.GTEx"]
samplelistbengitissue = ["Colon.GTEx", "Ovary.GTEx", "Pancreas.GTEx", "Stomach.GTEx", "Liver.GTEx"]

In [39]:

for samplebengi in samplelistbengitissue:
    for samplehic in samplelisthictissue:
        if samplebengi != samplehic :
            print(samplebengi, samplehic)
            test = pd.read_csv('/project/CRUP_scores/CENTRE_HiC/Training/BENGI_MSI_MI_acrossCT/'+
                               samplebengi +
                               'BENGI_'
                               + samplehic
                               + 'MIMSI.csv', 
                         header=0, sep=' ')
            print(len(set(test["chr.x"])))
            path = '/project/CRUP_scores/CENTRE_HiC/Training/BENGI_MSI_MI_acrossCT/'+samplebengi+'BENGI_'+ samplehic+ 'MIMSI.csv'
            chromDict, enhancerDict, groupDict, sizecvdict = CVmaker(path)
            #remove old CV column to fill it with new CV indexes
            print(sizecvdict)
            print(groupDict)
            print(chromDict)
            test = test.drop(['CV'], axis=1)    
            test['CV'] = test['chr.x'].map(groupDict)
    
            test.to_csv('/project/CRUP_scores/CENTRE_HiC/Training/BENGI_MSI_MI_acrossCT/'+ samplebengi + 'BENGI_' + samplehic+ 'MIMSI.csv', 
                            index = False,
                           sep = " ")
                
            

Colon.GTEx Ovary.GTEx
18
['gene_id1', 'gene_id', 'symbol38', 'symbol19', 'pair', 'pair19', 'label', 'EP_prob_enh.1', 'EP_prob_enh.2', 'EP_prob_enh.3', 'EP_prob_enh.4', 'EP_prob_enh.5', 'EP_prob_gene.1', 'EP_prob_gene.2', 'EP_prob_gene.3', 'EP_prob_gene.4', 'EP_prob_gene.5', 'PP_prob_enh.1', 'PP_prob_enh.2', 'PP_prob_enh.3', 'PP_prob_enh.4', 'PP_prob_enh.5', 'PP_prob_gene.1', 'PP_prob_gene.2', 'PP_prob_gene.3', 'PP_prob_gene.4', 'PP_prob_gene.5', 'distance', 'cor_CRUP', 'combined_tests', 'reg_dist_enh', 'norm_reg_dist_enh', 'reg_dist_prom', 'norm_reg_dist_prom', 'RNA_seq', 'min_insulation', 'mean_switch_intensity', 'chr.x', 'chr.y', 'CVnew', 'CV']
['chr8', 'chr12', 'chr18', 'chr2', 'chr4', 'chr5', 'chr22', 'chr1', 'chr9', 'chr6', 'chr7', 'chr15', 'chr19', 'chr10', 'chr3', 'chr16', 'chr11', 'chr17']
{'cv-0': 88, 'cv-1': 90, 'cv-2': 17, 'cv-10': 61, 'cv-3': 22, 'cv-9': 47, 'cv-4': 23, 'cv-8': 37, 'cv-5': 28, 'cv-7': 37, 'cv-6': 34, 'cv-11': 221}
{'chr8': 'cv-0', 'chr19': 'cv-0', 'chr12': 

In [41]:
##calculate F1 for tissues
f1data = pd.DataFrame(columns = ["SampleBENGI","SampleMSIMI", "F1_Score_MIMSI", "NumRows"])

for samplebengi in samplelistbengitissue:
    for samplehic in samplelisthictissue:
        if samplebengi != samplehic :
            test = pd.read_csv('/project/CRUP_scores/CENTRE_HiC/Training/BENGI_MSI_MI_acrossCT/'+
                               samplebengi +
                               'BENGI_'
                               + samplehic
                               + 'MIMSI.csv', 
                         header=0, sep=' ')
            print(list(test))
            test=test.fillna(0)
            test['distance'] =test['distance'].abs()
            lr_f1, resultcv  = CVF1results(com, best_params, test)
            numrows = len(test)
            #Dataframe for the current sample  
            f1data1 = pd.DataFrame()
            f1data1['SampleBENGI'] = [samplebengi]*12
            f1data1['SampleMSIMI'] = [samplehic]*12
            f1data1['F1_Score_MIMSI'] = resultcv
            f1data1['NumRows'] = [numrows]*12
            # Concatenate with big dataframe
            print(f1data1)
            f1data1.to_csv('/project/CRUP_scores/CENTRE_HiC/Training/CENTRE_final_training/F1scoresAcrossCT/'+
                           samplebengi +
                           '_' +
                           samplehic +
                           '_f1scores.csv')
            f1data = pd.concat([f1data, f1data1], ignore_index = True)
            

f1data.to_csv('/project/CRUP_scores/CENTRE_HiC/Training/CENTRE_final_training/F1scoresAcrossCT/F1scoresTissues.csv')

['gene_id1', 'gene_id', 'symbol38', 'symbol19', 'pair', 'pair19', 'label', 'EP_prob_enh.1', 'EP_prob_enh.2', 'EP_prob_enh.3', 'EP_prob_enh.4', 'EP_prob_enh.5', 'EP_prob_gene.1', 'EP_prob_gene.2', 'EP_prob_gene.3', 'EP_prob_gene.4', 'EP_prob_gene.5', 'PP_prob_enh.1', 'PP_prob_enh.2', 'PP_prob_enh.3', 'PP_prob_enh.4', 'PP_prob_enh.5', 'PP_prob_gene.1', 'PP_prob_gene.2', 'PP_prob_gene.3', 'PP_prob_gene.4', 'PP_prob_gene.5', 'distance', 'cor_CRUP', 'combined_tests', 'reg_dist_enh', 'norm_reg_dist_enh', 'reg_dist_prom', 'norm_reg_dist_prom', 'RNA_seq', 'min_insulation', 'mean_switch_intensity', 'chr.x', 'chr.y', 'CVnew', 'CV']
cv-11
['cv-0', 'cv-5', 'cv-10', 'cv-7', 'cv-2', 'cv-1', 'cv-9', 'cv-8', 'cv-3', 'cv-4', 'cv-6']
dict_keys([0, 'cv-0', 'cv-1', 'cv-10', 'cv-11', 'cv-2', 'cv-3', 'cv-4', 'cv-5', 'cv-6', 'cv-7', 'cv-8', 'cv-9'])
cv-0
['cv-11', 'cv-5', 'cv-10', 'cv-7', 'cv-2', 'cv-1', 'cv-9', 'cv-8', 'cv-3', 'cv-4', 'cv-6']
dict_keys([0, 'cv-0', 'cv-1', 'cv-10', 'cv-11', 'cv-2', 'cv-3', '

  f1data = pd.concat([f1data, f1data1], ignore_index = True)


['gene_id1', 'gene_id', 'symbol38', 'symbol19', 'pair', 'pair19', 'label', 'EP_prob_enh.1', 'EP_prob_enh.2', 'EP_prob_enh.3', 'EP_prob_enh.4', 'EP_prob_enh.5', 'EP_prob_gene.1', 'EP_prob_gene.2', 'EP_prob_gene.3', 'EP_prob_gene.4', 'EP_prob_gene.5', 'PP_prob_enh.1', 'PP_prob_enh.2', 'PP_prob_enh.3', 'PP_prob_enh.4', 'PP_prob_enh.5', 'PP_prob_gene.1', 'PP_prob_gene.2', 'PP_prob_gene.3', 'PP_prob_gene.4', 'PP_prob_gene.5', 'distance', 'cor_CRUP', 'combined_tests', 'reg_dist_enh', 'norm_reg_dist_enh', 'reg_dist_prom', 'norm_reg_dist_prom', 'RNA_seq', 'min_insulation', 'mean_switch_intensity', 'chr.x', 'chr.y', 'CVnew', 'CV']
cv-11
['cv-0', 'cv-5', 'cv-10', 'cv-7', 'cv-2', 'cv-1', 'cv-9', 'cv-8', 'cv-3', 'cv-4', 'cv-6']
dict_keys([0, 'cv-0', 'cv-1', 'cv-10', 'cv-11', 'cv-2', 'cv-3', 'cv-4', 'cv-5', 'cv-6', 'cv-7', 'cv-8', 'cv-9'])
cv-0
['cv-11', 'cv-5', 'cv-10', 'cv-7', 'cv-2', 'cv-1', 'cv-9', 'cv-8', 'cv-3', 'cv-4', 'cv-6']
dict_keys([0, 'cv-0', 'cv-1', 'cv-10', 'cv-11', 'cv-2', 'cv-3', '

  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))


cv-5
['cv-11', 'cv-0', 'cv-10', 'cv-7', 'cv-2', 'cv-1', 'cv-9', 'cv-8', 'cv-3', 'cv-4', 'cv-6']
dict_keys([0, 'cv-0', 'cv-1', 'cv-10', 'cv-11', 'cv-2', 'cv-3', 'cv-4', 'cv-5', 'cv-6', 'cv-7', 'cv-8', 'cv-9'])
cv-10
['cv-11', 'cv-0', 'cv-5', 'cv-7', 'cv-2', 'cv-1', 'cv-9', 'cv-8', 'cv-3', 'cv-4', 'cv-6']
dict_keys([0, 'cv-0', 'cv-1', 'cv-10', 'cv-11', 'cv-2', 'cv-3', 'cv-4', 'cv-5', 'cv-6', 'cv-7', 'cv-8', 'cv-9'])
cv-7
['cv-11', 'cv-0', 'cv-5', 'cv-10', 'cv-2', 'cv-1', 'cv-9', 'cv-8', 'cv-3', 'cv-4', 'cv-6']
dict_keys([0, 'cv-0', 'cv-1', 'cv-10', 'cv-11', 'cv-2', 'cv-3', 'cv-4', 'cv-5', 'cv-6', 'cv-7', 'cv-8', 'cv-9'])
cv-2
['cv-11', 'cv-0', 'cv-5', 'cv-10', 'cv-7', 'cv-1', 'cv-9', 'cv-8', 'cv-3', 'cv-4', 'cv-6']
dict_keys([0, 'cv-0', 'cv-1', 'cv-10', 'cv-11', 'cv-2', 'cv-3', 'cv-4', 'cv-5', 'cv-6', 'cv-7', 'cv-8', 'cv-9'])
cv-1
['cv-11', 'cv-0', 'cv-5', 'cv-10', 'cv-7', 'cv-2', 'cv-9', 'cv-8', 'cv-3', 'cv-4', 'cv-6']
dict_keys([0, 'cv-0', 'cv-1', 'cv-10', 'cv-11', 'cv-2', 'cv-3', 'cv-

  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))


cv-5
['cv-11', 'cv-0', 'cv-10', 'cv-7', 'cv-2', 'cv-1', 'cv-9', 'cv-8', 'cv-3', 'cv-4', 'cv-6']
dict_keys([0, 'cv-0', 'cv-1', 'cv-10', 'cv-11', 'cv-2', 'cv-3', 'cv-4', 'cv-5', 'cv-6', 'cv-7', 'cv-8', 'cv-9'])
cv-10
['cv-11', 'cv-0', 'cv-5', 'cv-7', 'cv-2', 'cv-1', 'cv-9', 'cv-8', 'cv-3', 'cv-4', 'cv-6']
dict_keys([0, 'cv-0', 'cv-1', 'cv-10', 'cv-11', 'cv-2', 'cv-3', 'cv-4', 'cv-5', 'cv-6', 'cv-7', 'cv-8', 'cv-9'])
cv-7
['cv-11', 'cv-0', 'cv-5', 'cv-10', 'cv-2', 'cv-1', 'cv-9', 'cv-8', 'cv-3', 'cv-4', 'cv-6']
dict_keys([0, 'cv-0', 'cv-1', 'cv-10', 'cv-11', 'cv-2', 'cv-3', 'cv-4', 'cv-5', 'cv-6', 'cv-7', 'cv-8', 'cv-9'])
cv-2
['cv-11', 'cv-0', 'cv-5', 'cv-10', 'cv-7', 'cv-1', 'cv-9', 'cv-8', 'cv-3', 'cv-4', 'cv-6']
dict_keys([0, 'cv-0', 'cv-1', 'cv-10', 'cv-11', 'cv-2', 'cv-3', 'cv-4', 'cv-5', 'cv-6', 'cv-7', 'cv-8', 'cv-9'])
cv-1
['cv-11', 'cv-0', 'cv-5', 'cv-10', 'cv-7', 'cv-2', 'cv-9', 'cv-8', 'cv-3', 'cv-4', 'cv-6']
dict_keys([0, 'cv-0', 'cv-1', 'cv-10', 'cv-11', 'cv-2', 'cv-3', 'cv-

  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))


cv-5
['cv-11', 'cv-0', 'cv-10', 'cv-7', 'cv-2', 'cv-1', 'cv-9', 'cv-8', 'cv-3', 'cv-4', 'cv-6']
dict_keys([0, 'cv-0', 'cv-1', 'cv-10', 'cv-11', 'cv-2', 'cv-3', 'cv-4', 'cv-5', 'cv-6', 'cv-7', 'cv-8', 'cv-9'])
cv-10
['cv-11', 'cv-0', 'cv-5', 'cv-7', 'cv-2', 'cv-1', 'cv-9', 'cv-8', 'cv-3', 'cv-4', 'cv-6']
dict_keys([0, 'cv-0', 'cv-1', 'cv-10', 'cv-11', 'cv-2', 'cv-3', 'cv-4', 'cv-5', 'cv-6', 'cv-7', 'cv-8', 'cv-9'])
cv-7
['cv-11', 'cv-0', 'cv-5', 'cv-10', 'cv-2', 'cv-1', 'cv-9', 'cv-8', 'cv-3', 'cv-4', 'cv-6']
dict_keys([0, 'cv-0', 'cv-1', 'cv-10', 'cv-11', 'cv-2', 'cv-3', 'cv-4', 'cv-5', 'cv-6', 'cv-7', 'cv-8', 'cv-9'])
cv-2
['cv-11', 'cv-0', 'cv-5', 'cv-10', 'cv-7', 'cv-1', 'cv-9', 'cv-8', 'cv-3', 'cv-4', 'cv-6']
dict_keys([0, 'cv-0', 'cv-1', 'cv-10', 'cv-11', 'cv-2', 'cv-3', 'cv-4', 'cv-5', 'cv-6', 'cv-7', 'cv-8', 'cv-9'])
cv-1
['cv-11', 'cv-0', 'cv-5', 'cv-10', 'cv-7', 'cv-2', 'cv-9', 'cv-8', 'cv-3', 'cv-4', 'cv-6']
dict_keys([0, 'cv-0', 'cv-1', 'cv-10', 'cv-11', 'cv-2', 'cv-3', 'cv-

  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))


cv-5
['cv-11', 'cv-0', 'cv-10', 'cv-7', 'cv-2', 'cv-1', 'cv-9', 'cv-8', 'cv-3', 'cv-4', 'cv-6']
dict_keys([0, 'cv-0', 'cv-1', 'cv-10', 'cv-11', 'cv-2', 'cv-3', 'cv-4', 'cv-5', 'cv-6', 'cv-7', 'cv-8', 'cv-9'])
cv-10
['cv-11', 'cv-0', 'cv-5', 'cv-7', 'cv-2', 'cv-1', 'cv-9', 'cv-8', 'cv-3', 'cv-4', 'cv-6']
dict_keys([0, 'cv-0', 'cv-1', 'cv-10', 'cv-11', 'cv-2', 'cv-3', 'cv-4', 'cv-5', 'cv-6', 'cv-7', 'cv-8', 'cv-9'])
cv-7
['cv-11', 'cv-0', 'cv-5', 'cv-10', 'cv-2', 'cv-1', 'cv-9', 'cv-8', 'cv-3', 'cv-4', 'cv-6']
dict_keys([0, 'cv-0', 'cv-1', 'cv-10', 'cv-11', 'cv-2', 'cv-3', 'cv-4', 'cv-5', 'cv-6', 'cv-7', 'cv-8', 'cv-9'])
cv-2
['cv-11', 'cv-0', 'cv-5', 'cv-10', 'cv-7', 'cv-1', 'cv-9', 'cv-8', 'cv-3', 'cv-4', 'cv-6']
dict_keys([0, 'cv-0', 'cv-1', 'cv-10', 'cv-11', 'cv-2', 'cv-3', 'cv-4', 'cv-5', 'cv-6', 'cv-7', 'cv-8', 'cv-9'])
cv-1
['cv-11', 'cv-0', 'cv-5', 'cv-10', 'cv-7', 'cv-2', 'cv-9', 'cv-8', 'cv-3', 'cv-4', 'cv-6']
dict_keys([0, 'cv-0', 'cv-1', 'cv-10', 'cv-11', 'cv-2', 'cv-3', 'cv-

In [9]:
f1data.to_csv('/project/CRUP_scores/CENTRE_HiC/Training/CENTRE_final_training/f1_consensusLcL.csv')

In [57]:
test = pd.read_csv('/project/CRUP_scores/CENTRE_HiC/Training/BENGI_MSI_MI_acrossCT/'
                   "Colon.GTEx"+
                   'BENGI_'+
                   "GM12878.HiC"+
                   'MIMSI.csv', 
                   header=0, sep=' ')
print((list(test)))

['gene_id1', 'gene_id', 'symbol38', 'symbol19', 'pair', 'pair19', 'label', 'CV', 'EP_prob_enh.1', 'EP_prob_enh.2', 'EP_prob_enh.3', 'EP_prob_enh.4', 'EP_prob_enh.5', 'EP_prob_gene.1', 'EP_prob_gene.2', 'EP_prob_gene.3', 'EP_prob_gene.4', 'EP_prob_gene.5', 'PP_prob_enh.1', 'PP_prob_enh.2', 'PP_prob_enh.3', 'PP_prob_enh.4', 'PP_prob_enh.5', 'PP_prob_gene.1', 'PP_prob_gene.2', 'PP_prob_gene.3', 'PP_prob_gene.4', 'PP_prob_gene.5', 'distance', 'cor_CRUP', 'combined_tests', 'reg_dist_enh', 'norm_reg_dist_enh', 'reg_dist_prom', 'norm_reg_dist_prom', 'RNA_seq', 'min_insulation', 'mean_switch_intensity', 'chr.x', 'chr.y']
