### count clinvar variants
* [count summary](https://docs.google.com/document/d/1XexRyu7gULSjbIMiUc2q98ZYlPfk8BOuYBQBRNSDwwI/edit)

In [1]:
import pandas

In [19]:
# Tab-separated ClinVar table. Every missing cell is replaced with 0, so
# later comparisons such as `mpc == 0` also match rows whose value was absent.
clin_file = '../data/interim/clinvar/clinvar.dat'
clinvar_df_pre = pandas.read_csv(clin_file, delimiter='\t').fillna(value=0)

In [20]:
# Rows in the table vs. distinct (chrom, pos) pairs.
n_variants = clinvar_df_pre.shape[0]
n_positions = clinvar_df_pre.drop_duplicates(subset=['chrom', 'pos']).shape[0]
print('# total clinvar variants', n_variants)
print('# total clinvar positions', n_positions)

# total clinvar variants 10806
# total clinvar positions 9234


In [21]:
def calc_final_sig(row):
    """Collapse the significance codes of one row into a single label.

    ``row['clinSig']`` is read as a list of codes separated by ``'|'``
    (for example ``'5|0'``).  Codes ``2``/``3`` count as benign and
    ``4``/``5`` as pathogenic (presumably the numeric ClinVar CLNSIG
    scheme -- confirm against the data source).

    Parameters
    ----------
    row : mapping or pandas.Series
        Anything indexable by ``'clinSig'``.  The value may be a string or
        a number; it is stringified before parsing.

    Returns
    -------
    int
        ``1``  if at least one pathogenic code and no benign code is present,
        ``0``  if at least one benign code and no pathogenic code is present,
        ``-1`` otherwise (both classes present, or neither).
    """
    # Stringify the cell value itself, not the list produced by split():
    # stringifying the list turns the set into single characters, so a code
    # like '255' would be counted as both '2' and '5'.  It also lets numeric
    # cells (e.g. the 0 that fillna(0) puts in empty cells) through.
    raw = str(row['clinSig'])
    # The previous character-level check matched codes regardless of the
    # separator in use; keep accepting ',' so such rows are not reclassified.
    sig_set = {code.strip() for code in raw.replace(',', '|').split('|')}
    has_benign = '2' in sig_set or '3' in sig_set
    has_path = '4' in sig_set or '5' in sig_set
    if has_path and not has_benign:
        return 1
    if not has_path and has_benign:
        return 0
    return -1

# Label every row with calc_final_sig: 1 / 0 / -1.
clinvar_df_pre.loc[:, "y"] = clinvar_df_pre.apply(calc_final_sig, axis=1)

path_rows = clinvar_df_pre.loc[clinvar_df_pre.y == 1]
benign_rows = clinvar_df_pre.loc[clinvar_df_pre.y == 0]

# Per-variant counts (one per table row).
p = path_rows.shape[0]
b = benign_rows.shape[0]
print('# clinvar variants with all pathogenic', p)
print('# clinvar variants with all benign', b)

# Per-position counts (distinct chrom/pos pairs).
p = path_rows.drop_duplicates(subset=['chrom', 'pos']).shape[0]
b = benign_rows.drop_duplicates(subset=['chrom', 'pos']).shape[0]
print('# clinvar positions with all pathogenic', p)
print('# clinvar positions with all benign', b)

# clinvar variants with all pathogenic 2517
# clinvar variants with all benign 2050
# clinvar positions with all pathogenic 2350
# clinvar positions with all benign 1997


In [26]:
# Restrict to rows whose effect is exactly 'missense_variant'.
is_missense = clinvar_df_pre.eff == 'missense_variant'

p = clinvar_df_pre.loc[(clinvar_df_pre.y == 1) & is_missense]
b = clinvar_df_pre.loc[(clinvar_df_pre.y == 0) & is_missense]
print('# clinvar variants with all pathogenic missense', p.shape[0])
print('# clinvar variants with all benign missense', b.shape[0])

# Collapse to distinct (chrom, pos) pairs; p and b become plain counts.
p = p.drop_duplicates(subset=['chrom', 'pos']).shape[0]
b = b.drop_duplicates(subset=['chrom', 'pos']).shape[0]
print('# clinvar positions with all pathogenic missesne', p)
print('# clinvar positions with all benign missense', b)

# clinvar variants with all pathogenic missense 1059
# clinvar variants with all benign missense 523
# clinvar positions with all pathogenic missesne 1023
# clinvar positions with all benign missense 518


In [30]:
# Missense rows that satisfy BOTH mpc > 0 and pfam != 'none'.
mpc_and_pfam_missense = (
    (clinvar_df_pre.mpc > 0)
    & (clinvar_df_pre.pfam != 'none')
    & (clinvar_df_pre.eff == 'missense_variant')
)

p = clinvar_df_pre.loc[(clinvar_df_pre.y == 1) & mpc_and_pfam_missense]
b = clinvar_df_pre.loc[(clinvar_df_pre.y == 0) & mpc_and_pfam_missense]
print('# clinvar variants with all pathogenic and mpc score in pfam domain', p.shape[0])
print('# clinvar variants with all benign and mpc score in pfam domain', b.shape[0])

# Collapse to distinct (chrom, pos) pairs; p and b become plain counts.
p = p.drop_duplicates(subset=['chrom', 'pos']).shape[0]
b = b.drop_duplicates(subset=['chrom', 'pos']).shape[0]
print('# clinvar positions with missense pathogenic and mpc score in pfam domain', p)
print('# clinvar positions with missense benign and mpc score in pfam domain', b)

# clinvar variants with all pathogenic and mpc score in pfam domain 652
# clinvar variants with all benign and mpc score in pfam domain 154
# clinvar positions with missense pathogenic and mpc score in pfam domain 627
# clinvar positions with missense benign and mpc score in pfam domain 154


In [31]:
# Missense rows that satisfy AT LEAST ONE of mpc > 0 or pfam != 'none'.
mpc_or_pfam_missense = (
    ((clinvar_df_pre.mpc > 0) | (clinvar_df_pre.pfam != 'none'))
    & (clinvar_df_pre.eff == 'missense_variant')
)

p = clinvar_df_pre.loc[(clinvar_df_pre.y == 1) & mpc_or_pfam_missense]
b = clinvar_df_pre.loc[(clinvar_df_pre.y == 0) & mpc_or_pfam_missense]
print('# clinvar variants with all pathogenic and mpc score or in pfam domain', p.shape[0])
print('# clinvar variants with all benign and mpc score or in pfam domain', b.shape[0])

# Collapse to distinct (chrom, pos) pairs; p and b become plain counts.
p = p.drop_duplicates(subset=['chrom', 'pos']).shape[0]
b = b.drop_duplicates(subset=['chrom', 'pos']).shape[0]
print('# clinvar positions with missense pathogenic and mpc score or in pfam domain', p)
print('# clinvar positions with missense benign and mpc score or in pfam domain', b)

# clinvar variants with all pathogenic and mpc score or in pfam domain 1029
# clinvar variants with all benign and mpc score or in pfam domain 396
# clinvar positions with missense pathogenic and mpc score or in pfam domain 994
# clinvar positions with missense benign and mpc score or in pfam domain 393


In [27]:
# Missense rows with mpc > 0 (no pfam condition in this cell).
mpc_missense = (clinvar_df_pre.mpc > 0) & (clinvar_df_pre.eff == 'missense_variant')

p = clinvar_df_pre.loc[(clinvar_df_pre.y == 1) & mpc_missense]
b = clinvar_df_pre.loc[(clinvar_df_pre.y == 0) & mpc_missense]
print('# clinvar variants with missesense pathogenic and mpc score', p.shape[0])
print('# clinvar variants with missense benign and mpc score', b.shape[0])

# Collapse to distinct (chrom, pos) pairs; p and b become plain counts.
p = p.drop_duplicates(subset=['chrom', 'pos']).shape[0]
b = b.drop_duplicates(subset=['chrom', 'pos']).shape[0]
print('# clinvar positions with missesnse pathogenic and mpc score', p)
print('# clinvar positions with missense benign and mpc score', b)

# clinvar variants with missesense pathogenic and mpc score 966
# clinvar variants with missense benign and mpc score 374
# clinvar positions with missesnse pathogenic and mpc score 934
# clinvar positions with missense benign and mpc score 371


In [23]:
# Effect-type breakdown of y == 1 rows with mpc == 0. Because the table was
# loaded with fillna(0), mpc == 0 also covers rows whose mpc was missing.
path_mpc_zero = (clinvar_df_pre.y == 1) & (clinvar_df_pre.mpc == 0)
clinvar_df_pre.loc[path_mpc_zero].groupby('eff').size()

eff
3_prime_UTR_variant                                           1
disruptive_inframe_deletion                                  11
disruptive_inframe_deletion+synonymous_variant                1
disruptive_inframe_insertion                                 10
downstream_gene_variant                                       5
frameshift_variant                                          694
frameshift_variant+missense_variant                          17
frameshift_variant+stop_gained                                9
frameshift_variant+stop_gained+missense_variant               1
frameshift_variant+synonymous_variant                         4
inframe_deletion                                             30
inframe_insertion                                             5
initiator_codon_variant                                       3
intragenic_variant                                            7
intron_variant                                                7
missense_variant                    

In [18]:
# Peek at the first 20 y == 1 missense rows whose mpc is 0.
path_mpc_zero_missense = (
    (clinvar_df_pre.y == 1)
    & (clinvar_df_pre.mpc == 0)
    & (clinvar_df_pre.eff == 'missense_variant')
)
clinvar_df_pre.loc[path_mpc_zero_missense].head(20)

Unnamed: 0,chrom,pos,ref,alt,pfam,eff,clinSig,af_1kg_all,gene,mpc,y
321,1,43395364,TT,AC,"MFS_1:47,Sugar_tr:32",missense_variant,5|0,0.0,SLC2A1,0.0,1
366,1,43396529,GA,AT,"MFS_1:49,Sugar_tr:34",missense_variant,5,0.0,SLC2A1,0.0,1
1804,2,166165903,T,G,Ion_trans:475,missense_variant,4,0.0,SCN2A,0.0,1
1805,2,166165903,T,G,Ion_trans:475,missense_variant,4,0.0,SCN2A,0.0,1
1806,2,166165903,T,G,Ion_trans:475,missense_variant,4,0.0,SCN2A,0.0,1
1807,2,166165924,G,A,Ion_trans:475,missense_variant,5,0.0,SCN2A,0.0,1
1808,2,166165924,G,A,Ion_trans:475,missense_variant,5,0.0,SCN2A,0.0,1
1809,2,166165924,G,A,Ion_trans:475,missense_variant,5,0.0,SCN2A,0.0,1
1813,2,166166853,GC,TT,Ion_trans:476,missense_variant,4,0.0,SCN2A,0.0,1
1814,2,166166853,GC,TT,Ion_trans:476,missense_variant,4,0.0,SCN2A,0.0,1


In [None]:
# Reload the raw table. NOTE(review): unlike the first load, missing values
# are NOT replaced with 0 here -- confirm that is intended.
clin_file = '../data/interim/clinvar/clinvar.dat'
clinvar_df_pre = pandas.read_csv(clin_file, delimiter='\t')

clinvar_df_pre.loc[:, "y"] = clinvar_df_pre.apply(calc_final_sig, axis=1)

# Keep labelled rows (y != -1) with pfam != 'none' and mpc > 0, then drop
# exact duplicate rows.
keep_rows = (
    (clinvar_df_pre.y != -1)
    & (clinvar_df_pre.pfam != 'none')
    & (clinvar_df_pre.mpc > 0)
)
clinvar_df = clinvar_df_pre[keep_rows].drop_duplicates()

# `match` and `domain_info` are not defined in the visible part of the
# notebook; elements 0, 1 and 2 of match()'s result fill the three columns.
for column, idx in (('path_frac_t', 0), ('size_t', 1), ('path_na_t', 2)):
    clinvar_df.loc[:, column] = clinvar_df.apply(
        lambda row, i=idx: match(row, domain_info)[i], axis=1
    )