# VEP API processing
##### Updated 11/03/2023
##### Selin Kubali

#### Goal:
This notebook takes VCF files generated by vcfByGene and uses VEP API to add annotations predicting variants' conservation and deleteriousness.

#### Required inputs
VCF files with chrom, pos, ref, and alt information. Found in *selected_genes/hcm/vcf_files/*

In [1]:
# download vcf files generated by vcfByGene
!dx cd "Cassa Lab Shared Project:selected_genes/hcm/vcf_files/"
!dx download *.vcf.gz

^C2K[===>                                                        ] Downloaded 100,663,296 of 1,662,258,548 bytes (6%) /opt/notebooks/MYH6_variants.vcf.gz




In [2]:
# unzip vcf files
!gzip -d *.vcf.gz

In [1]:
## cut out header and information related to patient genotypes from VCF files
# you may have to switch to a bash kernel for this and switch back to a python kernel
%%bash
for vcf_file in *.vcf; do
  # Remove lines starting with '##' and then use 'cut' to extract fields 1-9
  grep -v '^##' "$vcf_file" | cut -d$'\t' -f 1-9 > "${vcf_file%.vcf}_cleaned.vcf"
done




In [None]:
# upload unzipped and cut VCF files
!dx cd "Cassa Lab Shared Project:selected_genes/hcm/cleaned_vcf_files/"
!dx upload *_cleaned.vcf

In [1]:
# if the files are already uploaded in cleaned_vcf_files, skip the above steps and use this
!dx cd "Cassa Lab Shared Project:selected_genes/hcm/cleaned_vcf_files/"
!dx download *_variants_cleaned.vcf



In [36]:
import pandas as pd
import numpy as np
import requests
import re
import sys
from requests.exceptions import Timeout

In [None]:
def find_max_val(gnomad_dict):
    """ Loop through gnomad values and find highest gnomadg value """
    max_val = 0
    for key in gnomad_dict:
        if (gnomad_dict[key] > max_val) & ("gnomadg" in key):
            max_val = gnomad_dict[key]
    return max_val



In [37]:
def find_max_af(decoded, alt):
    """ Find gnomad values in JSON output """
    try:
        gnomadg_s = decoded[0]['colocated_variants'][0]['frequencies'][alt]
        gnomad_max_af = find_max_val(gnomadg_s)
    except KeyError:
        try:
            gnomadg_s = decoded[0]['colocated_variants'][1]['frequencies'][alt]
            gnomad_max_af = find_max_val(gnomadg_s)
        except IndexError:
            gnomad_max_af = None
        except KeyError:
            try:
                gnomadg_s = decoded[0]['colocated_variants'][2]['frequencies'][alt]
                gnomad_max_af = find_max_val(gnomadg_s)
            except:
                gnomad_max_af = None
            
    return gnomad_max_af


In [38]:
def find_trv(ref, alt):
    """ Find whether transition or transversion """
    # transition = 0
    # transversion = 1
    trv = None
    if (((ref == 'A') | (ref == 'G')) & ((alt == 'A') | (alt == 'G'))):
        trv = 0
    elif (((ref == 'C') | (ref == 'T')) & ((alt == 'C') | (alt == 'T'))):
        trv = 0
    else:
        trv = 1
    return trv

ERROR! Session/line number was not unique in database. History logging moved to new session 4


In [39]:
def add_annotations(row, gene, df, df_row):
    
    
    """ Takes a variant from .vcf files and returns information about variant from VEP API """
    # loads json with annotations for this variant
    # note: works whether chr is int or chr# (eg 19 vs chr19, X vs chrX)

    # parameters to request from API
    parameters = {
        "species": "human",
        "dbNSFP": 'ALL',
        "assembly": "GRCh38",
        "transcript_match":1,
        "max_af":1,
        "LoF":1,
        "pick":1,
        "pick_order":"canonical,appris,tsl,biotype,ccds,length,rank,mane_select,mane_plus_clinical",
        "SpliceAI":1
    }


    # send API request 
    try:
        req = "https://rest.ensembl.org/vep/human/hgvs/" + f"{row['#CHROM']}:g.{row['POS']}{row['REF']}>{row['ALT']}/?" + "&".join([f"{key}={value}" for key, value in parameters.items()])
        res = requests.get(req, headers={ "Content-Type" : "application/json"}, timeout=60)
        
        if not res.ok:
          res.raise_for_status()
          sys.exit()
            
        decoded = res.json()
        

    except requests.exceptions.HTTPError as e:
 
        try:
            len_ref = str(len(row['REF']) + row['POS'] - 1) # convert to as numeric
            req = "https://rest.ensembl.org/vep/human/hgvs/" + f"{row['#CHROM']}:g.{row['POS']}_{len_ref}del{row['REF']}ins{row['ALT']}/?" + "&".join([f"{key}={value}" for key, value in parameters.items()])
            res = requests.get(req, headers={ "Content-Type" : "application/json"}, timeout=60)

            if not res.ok:
              res.raise_for_status()
              sys.exit()
            decoded = res.json()
        except Timeout:
            print("Timeout error occurred. The request took too long to complete")
        
        

        

    except Timeout:
        try:
            len_ref = str(len(row['REF']) + row['POS'] - 1) # convert to as numeric
            req = "https://rest.ensembl.org/vep/human/hgvs/" + f"{row['#CHROM']}:g.{row['POS']}_{len_ref}del{row['REF']}ins{row['ALT']}/?" + "&".join([f"{key}={value}" for key, value in parameters.items()])
            res = requests.get(req, headers={ "Content-Type" : "application/json"}, timeout=60)

            if not res.ok:
              res.raise_for_status()
              sys.exit()
            decoded = res.json()
        except Timeout:
            print("Timeout error occurred. The request took too long to complete")
        
        

   
    ref = row['REF']
    alt = row['ALT']
    pos = row['POS']

  
    try:
        ds_ag = decoded[0]['transcript_consequences'][0]['spliceai']['DS_AG']
        ds_al = decoded[0]['transcript_consequences'][0]['spliceai']['DS_AL']
        ds_dg = decoded[0]['transcript_consequences'][0]['spliceai']['DS_DG']
        ds_dl = decoded[0]['transcript_consequences'][0]['spliceai']['DS_DL']
    except KeyError:
        ds_ag, ds_al, ds_dg, ds_dl = None, None, None, None



    # extract relevant parameters
    try:
        phylop100way_vertebrate = decoded[0]['transcript_consequences'][0]['phylop100way_vertebrate']
        gerp = decoded[0]['transcript_consequences'][0]['gerp++_rs']
        cadd_raw = decoded[0]['transcript_consequences'][0]['cadd_raw']
  
    except KeyError:
        phylop100way_vertebrate, gerp, cadd_raw  = None, None, None


    trv = find_trv(ref, alt)
    vep_consequence = decoded[0]['transcript_consequences'][0]['consequence_terms']
    gene_id = decoded[0]['input']
    chrom = re.search(r'^(.*?):', gene_id).group(1)
    vep_consequence = "&".join(str(consequence) for consequence in vep_consequence)


    # change formatting of ID
    gene_id = gene_id.replace(">", "-" ) 
    gene_id = gene_id.replace(":g.", "-" ) 
    gene_id = gene_id[3:-3] + "-" + gene_id[-3:len(gene_id)]


    # add parameters to df
    df.loc[df_row, 'Name'] = gene_id
    df.loc[df_row, 'Chrom'] = chrom
    df.loc[df_row, 'Pos'] = pos
    df.loc[df_row, 'Ref'] = ref
    df.loc[df_row, 'Alt'] = alt
    df.loc[df_row, 'vep_consequence'] = vep_consequence
    df.loc[df_row, 'CADD_raw'] = cadd_raw
    df.loc[df_row, 'phyloP100way_vertebrate'] = phylop100way_vertebrate
    df.loc[df_row, 'GERP++_RS'] = gerp
    df.loc[df_row, 'trv'] = trv
    df.loc[df_row, 'ds_ag'] = ds_ag
    df.loc[df_row, 'ds_al'] = ds_al
    df.loc[df_row, 'ds_dg'] = ds_dg
    df.loc[df_row, 'ds_dl'] = ds_dl
    
    
    
    
    

In [42]:
# loop through all rows in VCF; annotate with VEP; add to dataframe; and convert to CSV
#genes =  ["ACTN2", "ALPK3", "DES", "FLNC", "MYBPC3", "MYH6", "MYH7", "PLN", "PTPN11", "TNNI3", "TTR"]
genes =  ["ACTN2", "ALPK3", "DES", "FLNC", "MYBPC3", "MYH6", "MYH7", "PLN", "PTPN11", "TNNI3", "TTR"]

for g in genes:
    print(g)
    parsed_df = pd.DataFrame({'Name': [], 'Chrom': [], 'Pos': [], 'Ref': [], 'Alt': [], 'vep_consequence': [], 'CADD_raw': [], 'phyloP100way_vertebrate': [], 'GERP++_RS': [], 'trv':[], 'ds_ag':[], 'ds_al':[], 'ds_dg': [], 'ds_dl': []})
    gene_df = pd.read_csv(g + "_variants_cleaned.vcf", sep = "\t")
    
    
    for i in range(len(gene_df)):
        add_annotations(gene_df.iloc[i,:],g,parsed_df, i)
        print(i)
        

    parsed_df.to_csv(g+".csv")
    


TTR
0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175


In [43]:
# upload processed VCFs
!dx mkdir -p "Cassa Lab Shared Project:selected_genes/hcm/parsed_vep_files/"
!dx cd /selected_genes/hcm/parsed_vep_files/
!dx upload *.csv

ID                          file-GbggfZQJqBjB9J7739k3j1V9
Class                       file
Project                     project-GGy3Bb0JqBj7zfxY8v4by61X
Folder                      /selected_genes/hcm/parsed_vep_files
Name                        ACTN2.csv
State                       [33mclosing[0m
Visibility                  visible
Types                       -
Properties                  -
Tags                        -
Outgoing links              -
Created                     Tue Dec  5 19:34:42 2023
Created by                  skubali
 via the job                job-GbgX4XjJqBj3V7GyK8YGF1zj
Last modified               Tue Dec  5 19:34:43 2023
Media type                  
archivalState               "live"
cloudAccount                "cloudaccount-dnanexus"
ID                          file-Gbggfb0JqBjGq0yKypkvKgYV
Class                       file
Project                     project-GGy3Bb0JqBj7zfxY8v4by61X
Folder                      /selected_genes/hcm/parsed_vep_files
Name        