In [1]:
import numpy as np
import pandas as pd

from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split

## Examine data

In [2]:
%%capture
# Load the gzipped, tab-separated annotated table; the first row is the header.
# error_bad_lines=False makes the parser skip malformed rows instead of raising.
# NOTE(review): `error_bad_lines` is deprecated in newer pandas releases in favour of
# `on_bad_lines='skip'` — confirm against the installed pandas version.
df = pd.read_csv('clinvar_20180225.ann.tsv.gz', compression='gzip', sep='\t', error_bad_lines=False, header=0)

In [3]:
# Number of rows that were read from the file.
len(df)

372159

In [4]:
# Preview the first rows of the raw table.
df.head()

Unnamed: 0,#id,chrom,inputPos,inputRef,inputAlt,unannotatedReason,gene,geneId,geneDesc,transcript,...,granthamDist,AGVGDclass,AGVGDgv,AGVGDgd,SIFTprediction,SIFTweight,SIFTmedian,MAPPprediction,MAPPpValue,MAPPpValueMedian
0,475283,1,949422,G,A,-,ISG15,4053,ISG15 ubiquitin-like modifier,NM_005101.3,...,46.0,C0,178.468,0.0,,,,,,
1,183381,1,949523,C,T,-,ISG15,4053,ISG15 ubiquitin-like modifier,NM_005101.3,...,,,,,,,,,,
2,475278,1,949597,C,T,-,ISG15,4053,ISG15 ubiquitin-like modifier,NM_005101.3,...,,,,,,,,,,
3,402986,1,949608,G,A,-,ISG15,4053,ISG15 ubiquitin-like modifier,NM_005101.3,...,46.0,C0,130.977,0.0,,,,,,
4,161455,1,949696,C,CG,-,ISG15,4053,ISG15 ubiquitin-like modifier,NM_005101.3,...,,,,,,,,,,


In [5]:
# Column dtypes as inferred by read_csv.
df.dtypes

#id                       int64
chrom                    object
inputPos                  int64
inputRef                 object
inputAlt                 object
unannotatedReason        object
gene                     object
geneId                    int64
geneDesc                 object
transcript               object
strand                    int64
transLen                  int64
cdsLen                    int64
protein                  object
Uniprot                  object
varType                  object
codingEffect             object
varLocation              object
assembly                 object
gDNAstart                 int64
gDNAend                   int64
gNomen                   object
cDNAstart                object
cDNAend                  object
cNomen                   object
pNomen                   object
alt_pNomen               object
exon                      int64
intron                  float64
omimId                  float64
                         ...   
wtAA_3  

In [6]:
# Missing-value count per column.
df.isnull().sum()

#id                          0
chrom                        0
inputPos                     0
inputRef                     0
inputAlt                     0
unannotatedReason            0
gene                         0
geneId                       0
geneDesc                     0
transcript                   0
strand                       0
transLen                     0
cdsLen                       0
protein                  16608
Uniprot                  88433
varType                      0
codingEffect            131716
varLocation                  0
assembly                     0
gDNAstart                    0
gDNAend                      0
gNomen                       0
cDNAstart                    0
cDNAend                      0
cNomen                       0
pNomen                    2380
alt_pNomen                2380
exon                         0
intron                  318115
omimId                   21182
                         ...  
wtAA_3                  158696
wtCodon 

## Preprocessing

In [7]:
# Columns kept from the annotated table; every other column is dropped before
# filtering and modelling. The list includes the columns that RULES inspects
# (`codingEffect`, `varLocation`, `1000g_AF`, `gnomadAltFreq_all`) and
# `clinVarClinSignifs`, from which pick_target derives the label by default.
# NOTE(review): `rsClinicalSignificance` stays in as a model feature — it looks like
# it could carry the same information as the label; confirm it is not target leakage.
RELEVANT_FEATURES = [
    'chrom',
    'inputPos',
    'inputRef',
    'inputAlt',
    'transcript',
    'codingEffect',
    'varLocation',
    'alt_pNomen',
    'wtSSFScore',
    'wtMaxEntScore',
    'varSSFScore',
    'varMaxEntScore',
    'rsId',
    'rsClinicalSignificance',
    'rsMAF',
    '1000g_AF',
    'gnomadAltFreq_all',
    'espAllMAF',
    'espAllAAF',
    'clinVarMethods',
    'clinVarClinSignifs',
    'nOrthos',
    'conservedOrthos'
]


# Row-level keep-masks, applied one after another by rules_filter.
# Each entry takes a DataFrame and returns a boolean Series (True = keep the row).
# The frequency rules are written as "not greater than 0.01" rather than
# "less than or equal", so rows whose frequency is missing are kept.
RULES = [
    lambda frame: frame['codingEffect'].ne('synonymous'),
    lambda frame: frame['varLocation'].ne('intron'),
    lambda frame: ~frame['1000g_AF'].gt(0.01),
    lambda frame: ~frame['gnomadAltFreq_all'].gt(0.01),
]


def rules_filter(df, rules=None):
    """Keep only the rows of ``df`` that satisfy every rule.

    Rules are applied sequentially, so each rule only sees the rows that
    survived the previous ones. The input frame is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to filter.
    rules : iterable of callables, optional
        Each callable receives a DataFrame and returns a boolean mask aligned
        with it (True = keep). Defaults to the module-level ``RULES``.

    Returns
    -------
    pandas.DataFrame
        The rows of ``df`` for which every rule returned True.
    """
    # Resolve the default lazily so an explicit rule set never touches the global.
    active_rules = RULES if rules is None else rules
    for rule in active_rules:
        df = df[rule(df)]
    return df


# Narrow the table to the modelling columns, then drop the rows rejected by RULES.
df = rules_filter(df[RELEVANT_FEATURES])

In [8]:
# Seed NumPy's global RNG so the shuffle below is repeatable.
# NOTE(review): the later train/test split does not set its own seed by default, so it
# presumably draws from this same global RNG state — re-running cells out of order
# would then change the split. Confirm before relying on exact reproducibility.
np.random.seed(42)
# Shuffle all rows (frac=1 samples the whole frame) and renumber the index from 0.
df = df.sample(frac=1).reset_index(drop=True)

## Building the target

In [9]:
def pick_target(df, column='clinVarClinSignifs'):
    """Return a copy of ``df`` where ``column`` is replaced by a binary label ``y``.

    Values of ``column`` are converted to text with ``astype(str)`` and
    lower-cased; ``y`` is 1 when that text contains the substring
    ``'pathogenic'`` and 0 otherwise. This is a plain substring test, so
    'Likely pathogenic' and any text containing 'pathogenicity' also yield 1.

    Parameters
    ----------
    df : pandas.DataFrame
        Source frame; left untouched.
    column : str, default 'clinVarClinSignifs'
        Column holding the clinical-significance text.

    Returns
    -------
    pandas.DataFrame
        New frame with ``y`` appended and ``column`` removed.
    """
    significance = df[column].astype(str).str.lower()
    is_pathogenic = significance.str.contains('pathogenic', regex=False)
    labelled = df.assign(y=is_pathogenic.astype('int64'))
    return labelled.drop(column, axis=1)

In [10]:
# Swap the clinical-significance column for the binary label `y`.
# NOTE: this cell is not re-runnable on its own — pick_target removes
# `clinVarClinSignifs`, so a second run on the reassigned `df` raises KeyError.
df = pick_target(df)
df.head()

Unnamed: 0,chrom,inputPos,inputRef,inputAlt,transcript,codingEffect,varLocation,alt_pNomen,wtSSFScore,wtMaxEntScore,...,rsClinicalSignificance,rsMAF,1000g_AF,gnomadAltFreq_all,espAllMAF,espAllAAF,clinVarMethods,nOrthos,conservedOrthos,y
0,9,98231113,C,T,NM_000264.3,missense,exon,p.Glu724Lys,81.522,7.45706,...,,0.0002,0.0002,2.2e-05,7.7e-05,7.7e-05,clinical testing,11.0,6.0,0
1,X,25023006,G,GGGGGCCATTGTGGAAAAGAGCCTGCAGGGAGAGCAAACAGCGCGG...,NM_139058.2,frameshift,exon,p.Leu491Alafs*19,89.7583,10.8994,...,,,,,,,clinical testing,,,1
2,3,52486140,C,T,NM_003280.2,missense,exon,p.Asp62Asn,80.6382,9.43568,...,,,,,,,clinical testing,19.0,15.0,0
3,11,72004597,G,GC,NM_030813.5,frameshift,exon,p.Cys647Leufs*26,78.7932,8.9546,...,,,,,,,literature only,,,1
4,12,6105290,C,A,NM_000552.4,stop gain,exon,p.Glu1981*,93.3547,9.9074,...,not_provided,0.0,,,,,not provided,,,0


## Train/test split

In [11]:
def tt_split(df, select_floats=False, test_size=0.2, random_state=None):
    """Split a labelled frame into train/test parts and locate categorical columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the label column ``y``. The frame is not modified.
    select_floats : bool, default False
        When True, keep only ``int64``/``float64`` columns and cast them all
        to float, so no feature column is reported as categorical.
    test_size : float, default 0.2
        Forwarded to ``train_test_split``.
    random_state : optional
        Forwarded to ``train_test_split``. ``None`` (the default) leaves the
        split unseeded, exactly as before this parameter existed.

    Returns
    -------
    tuple
        ``(splits, cat_features)`` where ``splits`` is the result of
        ``train_test_split`` (unpacked by callers as
        ``X_train, X_test, y_train, y_test``) and ``cat_features`` is an array
        of positional indices of the feature columns whose dtype is not
        float64.

    Raises
    ------
    KeyError
        If ``df`` has no ``y`` column.
    """
    if select_floats:
        df = df.select_dtypes(include=['int64', 'float64']).astype(float)

    # -999 is the missing-value sentinel for every column, categorical ones included.
    df = df.fillna(-999)

    y = df['y']
    X = df.drop('y', axis=1)

    # `np.float` was only an alias of the builtin `float` and has been removed from
    # NumPy; comparing against `float` gives the same mask on every NumPy version.
    # Integer columns are not float, so they count as categorical unless
    # select_floats=True cast them above.
    cat_features = np.where(X.dtypes != float)[0]

    splits = train_test_split(X, y, test_size=test_size, random_state=random_state)
    return splits, cat_features

In [12]:
# Split the full (non-float columns included) frame; `cat_featues` holds the positional
# indices of the non-float columns and is passed to the model's fit call later on.
(X_train, X_test, y_train, y_test), cat_featues = tt_split(df)
# Sizes of the two parts, as a quick sanity check.
len(X_train), len(X_test)

(189366, 47342)

## CatBoost

In [13]:
# Classifier with all-default settings; no random seed or iteration count is set here.
model = CatBoostClassifier()

In [14]:
# Fit on the training part, marking the non-float columns (by position) as categorical.
# NOTE(review): the saved output of this cell ends in KeyboardInterrupt, so the run was
# stopped before finishing — consider fewer iterations and a quieter log so that
# Restart & Run All can complete.
model.fit(X_train, y_train, cat_features=cat_featues)

0:	learn: 0.6468704	total: 646ms	remaining: 10m 45s
1:	learn: 0.6050881	total: 1.01s	remaining: 8m 23s
2:	learn: 0.5670939	total: 1.55s	remaining: 8m 36s
3:	learn: 0.5340578	total: 1.9s	remaining: 7m 54s
4:	learn: 0.5032156	total: 2.18s	remaining: 7m 13s
5:	learn: 0.4762412	total: 2.47s	remaining: 6m 49s
6:	learn: 0.4531838	total: 2.87s	remaining: 6m 47s
7:	learn: 0.4309555	total: 3.12s	remaining: 6m 26s
8:	learn: 0.4100795	total: 3.47s	remaining: 6m 21s
9:	learn: 0.3923362	total: 3.75s	remaining: 6m 11s
10:	learn: 0.3749436	total: 4.18s	remaining: 6m 15s
11:	learn: 0.3592332	total: 4.47s	remaining: 6m 8s
12:	learn: 0.3456863	total: 4.85s	remaining: 6m 8s
13:	learn: 0.3318061	total: 5.25s	remaining: 6m 9s
14:	learn: 0.3203023	total: 5.54s	remaining: 6m 3s
15:	learn: 0.3063561	total: 5.95s	remaining: 6m 6s
16:	learn: 0.2957660	total: 6.33s	remaining: 6m 6s
17:	learn: 0.2866003	total: 6.84s	remaining: 6m 13s
18:	learn: 0.2770786	total: 7.17s	remaining: 6m 10s
19:	learn: 0.2685198	total: 

158:	learn: 0.1143429	total: 1m 32s	remaining: 8m 9s
159:	learn: 0.1142613	total: 1m 33s	remaining: 8m 9s
160:	learn: 0.1140379	total: 1m 33s	remaining: 8m 9s
161:	learn: 0.1139303	total: 1m 34s	remaining: 8m 9s
162:	learn: 0.1138660	total: 1m 35s	remaining: 8m 9s
163:	learn: 0.1137393	total: 1m 36s	remaining: 8m 10s
164:	learn: 0.1136533	total: 1m 37s	remaining: 8m 10s
165:	learn: 0.1135585	total: 1m 37s	remaining: 8m 12s
166:	learn: 0.1133760	total: 1m 38s	remaining: 8m 12s
167:	learn: 0.1132772	total: 1m 39s	remaining: 8m 12s
168:	learn: 0.1131912	total: 1m 40s	remaining: 8m 14s
169:	learn: 0.1131140	total: 1m 41s	remaining: 8m 14s
170:	learn: 0.1130482	total: 1m 42s	remaining: 8m 15s
171:	learn: 0.1129805	total: 1m 43s	remaining: 8m 16s
172:	learn: 0.1129216	total: 1m 44s	remaining: 8m 17s
173:	learn: 0.1128365	total: 1m 44s	remaining: 8m 17s
174:	learn: 0.1127991	total: 1m 45s	remaining: 8m 18s
175:	learn: 0.1126941	total: 1m 46s	remaining: 8m 19s
176:	learn: 0.1126435	total: 1m 4

312:	learn: 0.1066173	total: 3m 18s	remaining: 7m 16s
313:	learn: 0.1065821	total: 3m 19s	remaining: 7m 15s
314:	learn: 0.1065602	total: 3m 20s	remaining: 7m 14s
315:	learn: 0.1065345	total: 3m 20s	remaining: 7m 14s
316:	learn: 0.1065323	total: 3m 21s	remaining: 7m 14s
317:	learn: 0.1065262	total: 3m 22s	remaining: 7m 14s
318:	learn: 0.1064926	total: 3m 23s	remaining: 7m 13s
319:	learn: 0.1064781	total: 3m 23s	remaining: 7m 13s
320:	learn: 0.1064538	total: 3m 24s	remaining: 7m 12s
321:	learn: 0.1063922	total: 3m 25s	remaining: 7m 12s
322:	learn: 0.1063865	total: 3m 25s	remaining: 7m 11s
323:	learn: 0.1063748	total: 3m 26s	remaining: 7m 11s
324:	learn: 0.1063455	total: 3m 27s	remaining: 7m 10s
325:	learn: 0.1063248	total: 3m 27s	remaining: 7m 9s
326:	learn: 0.1063011	total: 3m 28s	remaining: 7m 9s
327:	learn: 0.1062890	total: 3m 29s	remaining: 7m 8s
328:	learn: 0.1062704	total: 3m 29s	remaining: 7m 7s
329:	learn: 0.1062661	total: 3m 30s	remaining: 7m 6s
330:	learn: 0.1062308	total: 3m 3

KeyboardInterrupt: 