# CENTRE CT only training for Ekin's project

In [2]:
import pandas as pd
from sklearn.ensemble import GradientBoostingClassifier
import numpy as np # calculate the mean and standard deviation
import xgboost as xgb # XGBoost stuff
from xgboost import plot_importance
from sklearn.model_selection import train_test_split # split  data into training and testing sets
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV # cross validation
from sklearn.metrics import confusion_matrix # creates a confusion matrix
#from sklearn.metrics import plot_confusion_matrix # draws a confusion matrix
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import f1_score
from sklearn.metrics import auc
from matplotlib import pyplot
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import StratifiedKFold, cross_val_score
from numpy import sort

In [4]:
def optimalparamsearch(com):
    com['distance'] =com['distance'].abs()
    com=com.sort_values('pair19')
    com=com.reset_index(drop=True)
    ''' function to do parameter search '''
    cv_names=com["CV"].unique()
    myCViterator = []
    for i in range(len(cv_names)):
        trainIndices = com[ com['CV']!=cv_names[i] ].index.values.astype(int)
        testIndices =  com[ com['CV']==cv_names[i] ].index.values.astype(int)
        myCViterator.append( (trainIndices, testIndices) )


    #run randomized search for optimal parameters

    X_train= com.drop(['gene_id1','gene_id','symbol38','symbol19','pair','pair19','label','CV'], axis=1).copy()
    y_train = com['label'].copy()
    model = xgb.XGBClassifier(objective = "binary:logistic",scale_pos_weight=5,random_state=0)
    param_grid = {
            'max_depth': [4, 5, 6,8,10,12],
            'learning_rate': [0.1, 0.05, 0.01],
            'gamma': [0, 0.25, 1.0],
            'reg_lambda': [0, 1.0, 10.0],
            'n_estimators': [100,200,300,400,500],
            'colsample_bytree': [0.5,0.6,0.7,0.9],
            'subsample': [0.7, 0.9]
        }
    search = RandomizedSearchCV(estimator=model, param_distributions=param_grid,scoring='f1', cv=myCViterator, n_jobs=12, refit=True)
    result = search.fit(X_train, y_train)
    print('est=%.3f, cfg=%s' % (result.best_score_, result.best_params_))
    return(result.best_score_, result.best_params_)

In [5]:
def trainOptParams(com, best_params, pathtomodel):
    X_train_xg= com.drop(['gene_id1', 'gene_id', 'symbol38', 'symbol19', 'pair', 'pair19', 'label', 'CV'], axis=1).copy()
    y_train_xg = com['label'].copy()
    clf_xgb = xgb.XGBClassifier(objective = "binary:logistic",scale_pos_weight=5,random_state=0,**best_params)
    clf_xgb.fit(X_train_xg, y_train_xg)
    clf_xgb.save_model(pathtomodel)
    return(clf_xgb)

In [6]:
def CVF1results(com, best_params):

    cv_names=com["CV"].unique()
    d = dict(tuple(com.groupby('CV')))
    result_all_xgboost={}
    result_all_tf={}
    result_cv_xg=[]
    result_cv_tf=[]
    for i in range(len(cv_names)):
        cv_test=cv_names[i]
        cv_train=[x for k,x in enumerate(cv_names) if k!=i]
        X_test_xg = d[cv_test].drop(['gene_id1', 'gene_id', 'symbol38', 'symbol19', 'pair', 'pair19', 'label', 'CV'], axis=1).copy()
        y_test_xg = d[cv_test]['label'].copy()
        train_xg=pd.concat({k: d[k] for k in cv_train})
        X_train_xg= train_xg.drop(['gene_id1', 'gene_id', 'symbol38', 'symbol19', 'pair', 'pair19', 'label', 'CV'], axis=1).copy()
        y_train_xg = train_xg['label'].copy()
        clf_xgb = xgb.XGBClassifier(objective = "binary:logistic",scale_pos_weight=5,random_state=0,**best_params)
        clf_xgb.fit(X_train_xg, y_train_xg)
        pred_s = clf_xgb.predict_proba(X_test_xg)
        lr_probs =pred_s[:, 1]
        yhat = clf_xgb.predict(X_test_xg)
        result_cv = pd.DataFrame({'pred_prob':lr_probs,'pred_label': yhat,'true_label':y_test_xg})
        result_cv_xg.append(f1_score(result_cv['true_label'], result_cv['pred_label']))
        result_all_xgboost[cv_test]=result_cv
        



    results_xg=pd.concat(result_all_xgboost)
    
    lr_precision_xg, lr_recall_xg, _ = precision_recall_curve(results_xg['true_label'], results_xg['pred_prob'])
    lr_f1_xg, lr_auc_xg = f1_score(results_xg['true_label'], results_xg['pred_label']), auc(lr_recall_xg, lr_precision_xg)
    print('xgboost HiC 12 fold CV:auc=%.3f' % lr_auc_xg)
    print(lr_f1_xg)
    print(result_cv_xg)

    dist_precision, dist_recall, _=precision_recall_curve(com['label'],1/abs(com['distance']))
    dist_auc = auc(dist_recall, dist_precision)
    print('Distance:auc=%.3f' % (dist_auc))
    ##1st f1 scores of CENTRE.MSI.MI
    ##2nd f1 scores of CENTRE
    return(lr_f1_xg, result_cv_xg)

### Consensus LcL training without Wilcoxon tests and CRUP cor (w/o Generic features except genomic distance)


In [16]:
traindatafile = "Give path to train set file"
modelpath="Give path to where you want to save the model
##Load consensusLclfile
lcls = pd.read_csv(traindatafile, 
                     header=0, sep='\t')

## Drop generic features except distance
lcls.drop(['cor_CRUP', 'combined_tests'], axis=1)

       
##Parameter search using Grid Search with 12-fold CV (Moore et al. scheme)
com=lcls.fillna(0)
    
best_score, best_params = optimalparamsearch(com)

##Train with opt parameters and save the CENTRECT model
centreCTDT = trainOptParams(com, best_params, modelpath)

##This is only to get a measure of performance
f1_CENTRECTDT, f1_CENTRECTDT = CVF1results(com, best_params)

est=0.538, cfg={'subsample': 0.7, 'reg_lambda': 10.0, 'n_estimators': 500, 'max_depth': 8, 'learning_rate': 0.1, 'gamma': 0.25, 'colsample_bytree': 0.9}




KeyboardInterrupt: 

In [13]:
lcls.columns ## order ibn which features should be fed to classifier

Index(['gene_id1', 'gene_id', 'symbol38', 'symbol19', 'pair', 'pair19',
       'label', 'CV', 'EP_prob_enh.1', 'EP_prob_enh.2', 'EP_prob_enh.3',
       'EP_prob_enh.4', 'EP_prob_enh.5', 'EP_prob_gene.1', 'EP_prob_gene.2',
       'EP_prob_gene.3', 'EP_prob_gene.4', 'EP_prob_gene.5', 'PP_prob_enh.1',
       'PP_prob_enh.2', 'PP_prob_enh.3', 'PP_prob_enh.4', 'PP_prob_enh.5',
       'PP_prob_gene.1', 'PP_prob_gene.2', 'PP_prob_gene.3', 'PP_prob_gene.4',
       'PP_prob_gene.5', 'distance', 'cor_CRUP', 'combined_tests',
       'reg_dist_enh', 'norm_reg_dist_enh', 'reg_dist_prom',
       'norm_reg_dist_prom', 'RNA_seq'],
      dtype='object')