In [54]:
import pandas as pd
import os

from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import Normalizer
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import PCA
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_curve
from sklearn.metrics import auc

import numpy as np

In [55]:
# load data
file_path = os.path.join('data','myo5b_variants_patho_benign_cadd1.3fullannot_v1.xlsx')
data = pd.ExcelFile(file_path)
data = data.parse(data.sheet_names[0])
data_no_null = data
data.head()

Unnamed: 0,CHROM,POS,ID,REF,ALT,INFO,Type,Length,isTv,isDerived,...,Intron,oAA,nAA,Grantham,PolyPhenCat,PolyPhenVal,SIFTcat,SIFTval,RawScore,PHRED
0,18,47352774,MYO5B:c.5616-2A>G,T,C,Pathogenic,SNV,0,False,True,...,,,,,,,,,0.334537,6.023
1,18,47361716,MYO5B:c.5392C>T,G,A,Pathogenic,SNV,0,False,True,...,,Q,*,,,,,,14.849415,48.0
2,18,47361725,MYO5B:c.5383C>T,G,A,Pathogenic,SNV,0,False,True,...,,R,*,,,,,,15.701726,52.0
3,18,47365526,MYO5B:c.4840C>T,G,A,Pathogenic,SNV,0,False,True,...,,Q,*,,,,,,14.577744,46.0
4,18,47365610,MYO5B:c.4755_4756dupT,C,CCA,Pathogenic,INS,2,,True,...,,,,,,,,,9.553096,35.0


In [56]:
# delete columns with less than 30% values
null_thres = data.shape[0] * 0.3
for col in data.columns:
    null = data[col].isnull().values.ravel().sum()
    if null > null_thres:
        data_no_null = data_no_null.drop([col],axis=1)
# delete 'CHROM', 'ID', 'GeneID','FeatureID','GeneName'
del_list = ['CHROM', 'ID', 'GeneID','FeatureID','GeneName','PHRED','RawScore']
data_no_null = data_no_null.drop(del_list,axis=1)

In [57]:
# labelencoding
le = preprocessing.LabelEncoder()
for col in data_no_null.select_dtypes([np.object]).columns:
    data_no_null[col] = data_no_null[col].fillna('not applicable')
    data_no_null[col] = le.fit_transform(data_no_null[col].astype(str))

In [58]:
data_no_null = data_no_null.fillna(0)
data_no_null.head()

Unnamed: 0,POS,REF,ALT,INFO,Type,Length,isTv,isDerived,AnnoType,Consequence,...,minDistTSE,CCDS,cDNApos,relcDNApos,CDSpos,relCDSpos,protPos,relProtPos,Domain,Exon
0,47352774,14,3,1,2,0,0,True,2,0,...,114,0,5914.0,0.62,0.0,0.0,0.0,0.0,2,29
1,47361716,11,0,1,2,0,0,True,0,8,...,9056,0,5692.0,0.6,5392.0,0.97,1798.0,0.97,1,27
2,47361725,11,0,1,2,0,0,True,0,8,...,9065,0,5683.0,0.6,5383.0,0.97,1795.0,0.97,1,27
3,47365526,11,0,1,2,0,0,True,0,8,...,11904,0,5140.0,0.54,4840.0,0.87,1614.0,0.87,1,25
4,47365610,6,5,1,1,2,2,True,0,2,...,11819,0,5055.0,0.53,4755.0,0.86,1585.0,0.86,1,25


In [59]:
# split train and test datasets
train, test = train_test_split(data_no_null,test_size=0.25)
print(train.shape, test.shape)
train_x = train[[col for col in train.columns if col not in ['INFO']]]
train_y = train['INFO']
test_x = test[[col for col in test.columns if col not in ['INFO']]]
test_y = test['INFO']
print(train_x.shape)

(186, 66) (62, 66)
(186, 65)


In [92]:
# no normalization and pca
pca = PCA(n_components=5)
pca.fit(train_x)
abs_coef = np.absolute(pca.components_)
print(abs_coef[0].argmax(),abs_coef[0].max())
print(abs_coef[1].argmax(),abs_coef[1].max())
print(abs_coef[2].argmax(),abs_coef[2].max())
print(abs_coef[3].argmax(),abs_coef[3].max())
print(abs_coef[4].argmax(),abs_coef[4].max())

print(data_no_null.columns[54],data_no_null.columns[0],data_no_null.columns[57],data_no_null.columns[59])

54 0.834101446253
0 0.621683441288
0 0.705833328933
57 0.763288687913
59 0.721412351903
isKnownVariant POS CCDS relcDNApos


In [79]:
# normalization and pca
nler = Normalizer(norm='l1')
train_x_norm = nler.fit_transform(train_x)
pca.fit(train_x_norm)
abs_coef = np.absolute(pca.components_)
print(abs_coef[0].argmax(),abs_coef[0].max())
print(abs_coef[1].argmax(),abs_coef[1].max())
print(abs_coef[2].argmax(),abs_coef[2].max())
print(data_no_null.columns[0],data_no_null.columns[55],data_no_null.columns[57])

0 0.799698093307
55 0.774487730813
57 0.567408559678
POS minDistTSS CCDS


In [45]:
# classifiers
C_range = [1, 2, 3, 4, 5]
penalty_range = ['l1', 'l2']
tolerance_range = [0.000001, 0.00001, 0.0001]
cw_range = ['balanced']
# pca_component_range = [20,60]

param_grid = [{'lr__C': C_range
               ,'lr__penalty': penalty_range
               ,'lr__tol': tolerance_range
               ,'lr__class_weight': cw_range
#                ,'pca__n_components': pca_component_range
               }]

In [53]:
# Make pipeline (logistic regression)
pipe_lr = Pipeline(steps=[
    #('norm', Normalizer(norm='l1')),
    ('pca', PCA()),
    ('lr', LogisticRegression())
])

classifier = GridSearchCV(estimator=pipe_lr,
                    param_grid=param_grid)

classifier.fit(train_x,train_y)
pred = classifier.predict(test_x)
confusion_matrix(test_y,pred)

array([[37, 12],
       [ 5,  8]])