In [1]:
import numpy as np
import pandas as pd
import pickle
import os
from lib.read_data import dataset,Datasets,readDataWithRawScore
from math import copysign

from sklearn.pipeline import Pipeline

# feature extractors
from sklearn.decomposition import PCA
# classifiers
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import Lasso
from sklearn.ensemble import RandomForestClassifier,AdaBoostClassifier
from sklearn.linear_model import ElasticNet
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.tree import DecisionTreeClassifier
# finetuning
from sklearn.model_selection import GridSearchCV
# validation
from sklearn import metrics
from sklearn.metrics import confusion_matrix

# plot
import matplotlib.pyplot as plt
% matplotlib inline

In [2]:
def run_display_output(classifier,test,DRAW=False):
    '''
    get confusion matrix and auc score for test dataset
    (optional) draw roc curve
    '''
    pred = classifier.predict(test[0])
    tn, fp, fn, tp = confusion_matrix(test[1],pred).ravel()#confusion matrix
    print(tn,fp,fn,tp)
    sensitivity = tp/(fn+tp)
    specificity = tn/(fp+tn)
    prods = classifier.predict_proba(test[0])[:,1]
    fpr, tpr, _ = metrics.roc_curve(test[1], prods)
    score = metrics.auc(fpr,tpr) #auc score
    if DRAW:
        draw_roc_curve(fpr,tpr,score)

    return sensitivity, specificity, score

def display_res_gavin_and_best_model(param_grid,pipeline,mvid,filename=None):
    '''
    use model defined by pipeline to fit mvid Dataset
    gridsearchCV determine the parameters given in param_grid
    (optional) save the model in path given in filename
    '''
    classifier = GridSearchCV(estimator=pipeline,
                              param_grid=param_grid)

    print('Start training...')
    classifier.fit(mvid[0],mvid[1])
    print('Model Description:\n',classifier.best_estimator_)
    if filename:
        pickle.dump(classifier,open(filename,'wb'))
        print('Saved model to path:',filename)
    sensitivity,specificity,score = run_display_output(classifier,[mvid[2],mvid[3]])
    print('>>> best model results: sensitivity: {:.{prec}}\tspecificity: {:.{prec}f}\tauc:{}'.\
    format(sensitivity,specificity,score,prec=3))
    return classifier

In [24]:
if __name__=='__main__':
    
    path = 'data/bonder_withzscore.csv'
    data = pd.read_csv(path,sep=',',index_col=0)
    def binarize(row):
        if row > 0:
            return 1
        else:
            return 0
    data['direction'] = data['zscore'].apply(binarize)
    
    features = [col for col in data.columns if col not in ['cpgName','direction','zscore']]
    for i in range(len(features)):
        for j in range(len(features)):
            combinationName = '{0}_{1}'.format(features[i].split('_')[0],
                                               features[j].split('_')[0])
            data[combinationName] = data[features[i]].values*data[features[j]].values
    
    data.to_csv('data/bonder_withZscoreAndNonLinear.csv',index=False)
    
    print('Raw data loaded.')
    dataset = readDataWithRawScore(data,'direction')
    

Raw data loaded.
Data Normalized.


In [22]:
    # get the error cases and the correct cases
    eqtm_data = bonder
    feature_list = [col for col in eqtm_data.train.values.columns
                    if col not in ['zscore','cpgName']]
    train_data = eqtm_data.train.values[feature_list]
    test_data = eqtm_data.test.values[feature_list]
    
    # ============random forest================
    pipeline_ranfor = Pipeline(steps=[('ranfor',
                                       RandomForestClassifier())])
    n_estimators = [10,50,100]
    class_weight = ['balanced',{1:4,0:1},{1:2,0:1}]
    param_grid_ranfor = [{'ranfor__n_estimators':n_estimators,
                        'ranfor__class_weight':class_weight}]
    classifier_ranfor = display_res_gavin_and_best_model(
                        param_grid_ranfor,
                        pipeline_ranfor,
                        [train_data,
                         eqtm_data.train.labels,
                         test_data,
                         eqtm_data.test.labels])
    
    # ===========knn============================
    pipeline_kneighbor = Pipeline(steps=[('kneighbor',
                                          KNeighborsClassifier())])
    # print(pipeline_kneighbor.get_params().keys())
    n_neighbors = range(2,10)
    weights = ['uniform','distance']
    # algorithms = ['auto']
    # leaf_size = [20,30,40,50]
    # p = [1,2,3]
    param_grid_kneighbor = [{'kneighbor__n_neighbors':n_neighbors,
                            'kneighbor__weights':weights}]#,
                            # 'kneighbor__algorithms':algorithms}]#,
                            # 'kneighbor__leaf_size':leaf_size,
                            # 'kneighbor__p':p}]
    classifier_kneighbor = display_res_gavin_and_best_model(
                            param_grid_kneighbor,
                            pipeline_kneighbor,
                            [train_data,
                         eqtm_data.train.labels,
                         test_data,
                         eqtm_data.test.labels])
    # ==========================================
    
    
#     model = LogisticRegression()
#     model.fit(train_data,eqtm_data.train.labels)
    pred = classifier_kneighbor.predict(test_data)

Start training...
Model Description:
 Pipeline(memory=None,
     steps=[('ranfor', RandomForestClassifier(bootstrap=True, class_weight={1: 2, 0: 1},
            criterion='gini', max_depth=None, max_features='auto',
            max_leaf_nodes=None, min_impurity_decrease=0.0,
            min_impurity_split=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=100, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False))])
300 60 123 228
>>> best model results: sensitivity: 0.65	specificity: 0.833	auc:0.8025482747704971
Start training...
Model Description:
 Pipeline(memory=None,
     steps=[('kneighbor', KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=9, p=2,
           weights='distance'))])
288 72 96 255
>>> best model results: sensitivity: 0.726	specificity: 0.800	auc:0.8118194048749604


In [16]:
data[features[0]].values*data[features[1]].values

array([0.        , 0.        , 0.        , ..., 0.        , 0.32142857,
       0.        ])

In [20]:
data.head()

Unnamed: 0,cpgName,H3K27ac_ImputedGapped_Bonder_celltypes_MJ_final_eqtms_unique_,H3K9me3_ImputedGapped_Bonder_celltypes_MJ_final_eqtms_unique_,H3K27me3_ImputedGapped_Bonder_celltypes_MJ_final_eqtms_unique_,H3K36me3_ImputedGapped_Bonder_celltypes_MJ_final_eqtms_unique_,H3K4me1_ImputedGapped_Bonder_celltypes_MJ_final_eqtms_unique_,H3K4me3_ImputedGapped_Bonder_celltypes_MJ_final_eqtms_unique_,TSS_Distance,zscore,direction,...,H3K4me3_H3K4me1,H3K4me3_H3K4me3,H3K4me3_TSS,TSS_H3K27ac,TSS_H3K9me3,TSS_H3K27me3,TSS_H3K36me3,TSS_H3K4me1,TSS_H3K4me3,TSS_TSS
0,cg08128007_ENSG00000188976,0.392857,0.0,0.964286,0.0,1.0,0.928571,54191,6.721807,1,...,0.928571,0.862245,50320.214287,21289.321431,0.0,52255.607144,0.0,54191.0,50320.214287,2936664481
1,cg23733394_ENSG00000188976,0.428571,0.0,0.071429,0.0,1.0,1.0,53922,7.517455,1,...,1.0,1.0,53922.0,23109.428573,0.0,3851.571427,0.0,53922.0,53922.0,2907582084
2,cg13856810_ENSG00000188976,0.035714,0.0,1.0,0.0,0.892857,0.178571,31788,5.672401,1,...,0.159439,0.031888,5676.428572,1135.285714,0.0,31788.0,0.0,28382.142859,5676.428572,1010476944
3,cg06624358_ENSG00000188976,0.0,0.0,1.0,0.0,0.785714,0.0,22816,5.745242,1,...,0.0,0.0,0.0,0.0,0.0,22816.0,0.0,17926.857143,0.0,520569856
4,cg14156792_ENSG00000188976,0.0,0.0,1.0,0.0,0.785714,0.0,22716,7.106451,1,...,0.0,0.0,0.0,0.0,0.0,22716.0,0.0,17848.285714,0.0,516016656
