In [2]:
import pandas as pd
import os
from os.path import isfile, join
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler

In [7]:
def display_column_names(column_names,size = 4):
    '''
    when there are too many columns
    display every four column names in a line
    '''
    for i in range(1,len(column_names),size):
        print(column_names[i:i+4].values)

In [8]:
display_column_names(datatable_pd.columns)

['Pos' 'Ref' 'Anc' 'Alt']
['Type' 'Length' 'isTv' 'isDerived']
['AnnoType' 'Consequence' 'ConsScore' 'ConsDetail']
['GC' 'CpG' 'mapAbility20bp' 'mapAbility35bp']
['scoreSegDup' 'priPhCons' 'mamPhCons' 'verPhCons']
['priPhyloP' 'mamPhyloP' 'verPhyloP' 'GerpN']
['GerpS' 'GerpRS' 'GerpRSpval' 'bStatistic']
['mutIndex' 'dnaHelT' 'dnaMGW' 'dnaProT']
['dnaRoll' 'mirSVR-Score' 'mirSVR-E' 'mirSVR-Aln']
['targetScan' 'fitCons' 'cHmmTssA' 'cHmmTssAFlnk']
['cHmmTxFlnk' 'cHmmTx' 'cHmmTxWk' 'cHmmEnhG']
['cHmmEnh' 'cHmmZnfRpts' 'cHmmHet' 'cHmmTssBiv']
['cHmmBivFlnk' 'cHmmEnhBiv' 'cHmmReprPC' 'cHmmReprPCWk']
['cHmmQuies' 'EncExp' 'EncH3K27Ac' 'EncH3K4Me1']
['EncH3K4Me3' 'EncNucleo' 'EncOCC' 'EncOCCombPVal']
['EncOCDNasePVal' 'EncOCFairePVal' 'EncOCpolIIPVal' 'EncOCctcfPVal']
['EncOCmycPVal' 'EncOCDNaseSig' 'EncOCFaireSig' 'EncOCpolIISig']
['EncOCctcfSig' 'EncOCmycSig' 'Segway' 'tOverlapMotifs']
['motifDist' 'motifECount' 'motifEName' 'motifEHIPos']
['motifEScoreChng' 'TFBS' 'TFBSPeaks' 'TFBSPeaksMax'

In [17]:
if __name__=='__main__':
    # read data file
    datafile = os.path.join('data','mhy7.tsv')
    datatable_pd = pd.read_csv(datafile,sep='\t')
    datatable_pos = datatable_pd['Pos']
    
    # delete some columns that were not used in cadd paper
    del_cols = ['#Chrom','Pos','isDerived','AnnoType','ConsScore',
                'ConsDetail','mapAbility20bp','mapAbility35bp',
                'scoreSegDup','isKnownVariant','ESP_AF','ESP_AFR',
                'ESP_EUR','TG_AF','TG_ASN','TG_AMR','TG_AFR','TG_EUR',
                'GeneID','FeatureID','CCDS','GeneName','Exon',
                'Intron','RawScore']
    datatable_pd = datatable_pd.drop(columns=del_cols)

    # delete columns without a single value
    datatable_pd = datatable_pd.dropna(axis=1,how='all')

    # fill in values recommended by cadd paper
    values = {'GerpRS':0, 'GerpRSpval':1,'EncExp':0,'EncOCC':5,
              'EncOCCombPVal':0,'EncOCDNasePVal':0,'EncOCFairePVal':0,
              'EncOCpolIIPVal':0,'EncOCctcfPVal':0,'EncOCmycPVal':0,
              'EncOCDNaseSig':0,'EncOCFaireSig':0,'EncOCpolIISig':0,
              'EncOCctcfSig':0,'EncOCmycSig':0,'tOverlapMotifs':0,
              'motifDist':0,'TFBS':0,'TFBSPeaksMax':0,'PolyPhenVal':0,
              'SIFTval':0,'TFBSPeaks':0}
    datatable_pd = datatable_pd.fillna(values)
    
    # transform objects to dummies
    categorical_feature_names = \
    datatable_pd.select_dtypes(include=np.object).columns
    categories={} # contains all the levels in those feature columns
    for f in categorical_feature_names:
        datatable_pd[f] = datatable_pd[f].astype('category')
        categories[f] = datatable_pd[f].cat.categories

    dummy_data = pd.get_dummies(datatable_pd,columns=[col for col in
                                                      categorical_feature_names
                                                      if col not in ['INFO']])
    
    # change info column into scalar column
    dummy_data['INFO'] = datatable_pd['INFO'].astype('category').cat.codes
    
    # drop nan values -TODO
    dummy_data_del_all_nan = dummy_data.copy()
    print('Deleted columns that I do not know how to impute:')
    for col in dummy_data.columns:
        null = dummy_data[col].isnull().values.ravel().sum()
        if null > 0:
            print(null,col)
            dummy_data_del_all_nan = dummy_data_del_all_nan.drop(columns=col)
    
    # normalized the numerical values before any processing afterwards
    min_max_scaler = MinMaxScaler()
    dummy_data_scaled = min_max_scaler.fit_transform(dummy_data_del_all_nan)
    dummy_data_scaled = pd.DataFrame(dummy_data_scaled,
                                     columns=dummy_data_del_all_nan.columns)

    # save the preprocessed data as csv file
    dummy_data_scaled['POS'] = datatable_pos
    res_path = os.path.join('data','dummy_no_nan_data.csv')
    dummy_data_scaled.to_csv(res_path,sep='\t',index=False)
    print('Saved to %s'%res_path)

Deleted columns that I do not know how to impute:
3 EncH3K27Ac
78 EncNucleo
17 cDNApos
17 relcDNApos
16 CDSpos
16 relCDSpos
16 protPos
16 relProtPos
643 Dst2Splice
83 Grantham
Saved to data/dummy_no_nan_data.csv


In [18]:
datatable_pd.shape

(850, 83)

In [19]:
dummy_data.shape

(850, 1002)

In [20]:
dummy_data_del_all_nan.shape

(850, 992)

# try pca+LogReg

In [21]:
from lib.read_data import dataset,Datasets

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline

# feature extractors
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
# classifiers
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import ElasticNet
from sklearn.svm import SVC
# finetuning
from sklearn.model_selection import GridSearchCV
# validation
from sklearn import metrics
from sklearn.metrics import confusion_matrix

In [49]:
def read_data_set(data_table,test_size=0.25,BENCHMARK=False):
    '''
    convert a pandas dataframe data table into Datasets(dataset,dataset)
    '''
    train, test = train_test_split(data_table,test_size=0.25)
    train_x = train[[col for col in train.columns
    if col not in ['INFO','gavin_res']]]
    features = train_x.columns
    train_x = np.array(train_x)
    test_x = np.array(test[[col for col in train.columns
    if col not in ['INFO','gavin_res']]])
    train_y = np.array(train['INFO'],dtype=np.int8)
    test_y = np.array(test['INFO'],dtype=np.int8)

    # # check what columns are in the train Dataset
    # for i in range(0,len(train_x.columns),5):
    #     print(train_x.columns[i:i+5])

    if BENCHMARK:
        return Datasets(train=dataset(train_x,train_y,features),
                        test=dataset(test_x,test_y,features)),\
                        train['gavin_res'],\
                        test['gavin_res']
    return Datasets(train=dataset(train_x,train_y,features),
                    test=dataset(test_x,test_y,features))

def run_display_output(classifier,test,DRAW=False):
    '''
    get confusion matrix and auc score for test dataset
    (optional) draw roc curve
    '''
    pred = classifier.predict(test.values)
    tn, fp, fn, tp = confusion_matrix(test.labels,pred).ravel()#confusion matrix
    print(tn,fp,fn,tp)
    sensitivity = tp/(fn+tp)
    specificity = tn/(fp+tn)
    prods = classifier.predict_proba(test.values)[:,1]
    fpr, tpr, _ = metrics.roc_curve(test.labels, prods)
    score = metrics.auc(fpr,tpr) #auc score
    if DRAW:
        draw_roc_curve(fpr,tpr,score)

    return sensitivity, specificity, score

def display_res_gavin_and_best_model(param_grid,pipeline,mvid,filename=None):
    '''
    use model defined by pipeline to fit mvid Dataset
    gridsearchCV determine the parameters given in param_grid
    (optional) save the model in path given in filename
    '''
    classifier = GridSearchCV(estimator=pipeline,
                              param_grid=param_grid)

    print('Start training...')
    classifier.fit(mvid.train.values,mvid.train.labels)
    print('Model Description:\n',classifier.best_estimator_)
    if filename:
        pickle.dump(classifier,open(filename,'wb'))
        print('Saved model to path:',filename)
    sensitivity,specificity,score = run_display_output(classifier,mvid.test)
    print('>>> best model results: sensitivity: {:.{prec}}\tspecificity: {:.{prec}f}\tauc:{}'.\
    format(sensitivity,specificity,score,prec=3))
    return classifier

def read_gavin(gavin_res, labels):
    '''
    compare gavin results with labels for a certain subset of data
    '''
    gavin_res = gavin_res.replace('Pathogenic',1)
    gavin_res = gavin_res.replace('Benign',0)
    tn_g, fp_g, fn_g, tp_g = \
    confusion_matrix(labels, gavin_res.astype(np.int8)).ravel()
    sensitivity_g = tp_g/(fn_g+tp_g)
    specificity_g = tn_g/(fp_g+tn_g)
    return sensitivity_g, specificity_g

In [50]:
if __name__=='__main__':

    # read data
    data = pd.read_csv('data/dummy_no_nan_data_with_gavinres.tsv',sep='\t')
    data = data.drop('POS',axis=1) # drop pos
    mvid, train_gavin, test_gavin = read_data_set(data,BENCHMARK=True)
    # print(data.head())
    # raise NotImplementedError # check the dataset loaded
    print('Dataset loaded.',mvid.train.values.shape)

# ================model selection==========================================
    # # PCA + LogisticRegression
    # # Parameters
    n_components = [10]#np.arange(10,100,10)
    class_weight = ['balanced']#,{1:4,0:1},{1:2,0:1}]
    param_grid_logr = [{'pca__n_components':n_components,
                   'logr__penalty':['l1'],#'l2'],
                   'logr__C':[2],#,3,4,5],
                   'logr__class_weight':class_weight}]
    # pipeline
    pipeline_logr = Pipeline(steps=[('pca',PCA()),
                               ('logr',LogisticRegression())])
    # save model
    filename = os.path.join('model')#,'pca_logr_new.sav')
    # display results
    classifier_logr = display_res_gavin_and_best_model(param_grid_logr,
                                     pipeline_logr,
                                     mvid)#,
                                     #filename)
    # display gavin results
    sensitivity_g,specificity_g = read_gavin(test_gavin,mvid.test.labels)
    print('>>> gavin model results: sensitivity: {:.{prec}}\tspecificity: {:.{prec}f}'.\
    format(sensitivity_g,specificity_g,prec=3))

Dataset loaded. (637, 991)
Start training...
Model Description:
 Pipeline(memory=None,
     steps=[('pca', PCA(copy=True, iterated_power='auto', n_components=10, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)), ('logr', LogisticRegression(C=2, class_weight='balanced', dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=1, penalty='l1', random_state=None,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False))])
56 15 45 97
>>> best model results: sensitivity: 0.683	specificity: 0.789	auc:0.8090656615750844
>>> gavin model results: sensitivity: 0.739	specificity: 0.070


In [43]:
classifier_logr.best_estimator_.steps[0][1].explained_variance_ratio_

array([ 0.11853035,  0.0780476 ,  0.06277689,  0.04903768,  0.04624863,
        0.04406613,  0.03369834,  0.02918563,  0.02725033,  0.02369261])