In [11]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import os


In [4]:
plt.rcParams['pdf.fonttype'] = 42
plt.rcParams['ps.fonttype'] = 42
plt.rcParams["font.family"] = "Helvetica" #somethings this one doesnt work
plt.rcParams['pdf.fonttype'] = 42
plt.rcParams['figure.dpi'] = 300

In [5]:
# Loading in info about the ADs
cc_names = pd.read_excel("../data/gene_names_with_location_and_pmvs.xlsx")
cc_names = cc_names.rename(columns = {"gene" : "Gene", "start" : "Start", "end" : "End"})
known_ADs = pd.read_csv("../output/known_ADs_considering_isoforms_and_canonical.csv")
cc_names = pd.merge(known_ADs[["uniprotID", "Gene", "Start", "End"]], cc_names)
ensts = pd.read_csv("../data/SFARI_TFs_with_ENST_corrected.csv", index_col = 0)
cc_names = pd.merge(cc_names, ensts)
cc_names["ENST"] = cc_names["ENST"].str.split(".").str[0]
cc_names["len"] = cc_names["End"] - cc_names["Start"] + 1
cc_names = cc_names.rename(columns = {"Start" : "start", "End" : "end"})
cc_names.at[9, "end"] = 93 # to make consistent with other table
cc_names

Unnamed: 0,uniprotID,Gene,start,end,pMVS #,Gene Name,ENST,len
0,O94983,CAMTA2,285,468,404,CAMTA2_AD1,ENST00000348066,184
1,O94983,CAMTA2,472,581,405,CAMTA2_AD2,ENST00000348066,110
2,P11308,ERG,433,479,400,ERG_AD1,ENST00000288319,47
3,P11308,ERG,118,261,406,ERG_AD2,ENST00000288319,144
4,Q13422,IKZF1,284,365,402,IKZF1_AD,ENST00000331340,82
5,O14770,MEIS2,340,477,377,MEIS2_AD,ENST00000561208,138
6,Q15788,NCOA1,1241,1385,379,NCOA1_AD2,ENST00000348332,145
7,Q15788,NCOA1,840,1011,380,NCOA1_AD3,ENST00000348332,172
8,O95096,NKX2-2,220,273,381,NKX2-2_AD,ENST00000377142,54
9,P43354,NR4A2,1,93,382,NR4A2_AD1,ENST00000339562,91


In [24]:
all_tested_vars = []

for ENST in set(cc_names["ENST"]):
    all_tested_vars.append(pd.read_csv("../soto_analysis/outputs/mutations/cds_expanded_iWES_v2_variants_snv_classified/" + ENST + ".bed", header = None, index_col = None, sep = "\t"))

all_tested_vars = pd.concat(all_tested_vars)
all_tested_vars = all_tested_vars.drop_duplicates()
all_tested_vars = all_tested_vars.reset_index(drop = True)
all_tested_vars = all_tested_vars[all_tested_vars[13] == "No-Syn"]
all_tested_vars

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13
4,17,4968793,4968794,ENST00000348066,-1,17,4968793,4968794,C,G,0.000431,E,Q,No-Syn
5,17,4968795,4968796,ENST00000348066,-1,17,4968795,4968796,T,C,0.000009,Q,R,No-Syn
7,17,4968798,4968799,ENST00000348066,-1,17,4968798,4968799,T,C,0.000009,N,S,No-Syn
8,17,4968807,4968808,ENST00000348066,-1,17,4968807,4968808,A,C,0.000084,L,R,No-Syn
10,17,4968815,4968816,ENST00000348066,-1,17,4968815,4968816,C,T,0.000023,M,I,No-Syn
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4105,15,37098182,37098183,ENST00000561208,-1,15,37098182,37098183,T,C,0.000005,H,R,No-Syn
4107,15,37098190,37098191,ENST00000561208,-1,15,37098190,37098191,C,G,0.000033,E,D,No-Syn
4108,15,37098193,37098194,ENST00000561208,-1,15,37098193,37098194,A,C,0.000019,D,E,No-Syn
4110,15,37098193,37098194,ENST00000561208,-1,15,37098193,37098194,A,T,0.000005,D,E,No-Syn


In [39]:
# #THIS APPROACH DID NOT WORK - spliceai didn't accept
# with open("../data/all_tested_vars.vcf", "w") as file:
#     file.write("##fileFormat=VCF\n")
#     file.write("#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tSAMPLE1\tSAMPLE2\n")
#     for i in all_tested_vars.index[:1]:
#         file.write(str(all_tested_vars[0].iloc[i]) + "\t") # CHROM
#         file.write(str(all_tested_vars[2].iloc[i]) + "\t") # POS
#         file.write(".\t") # ID
#         file.write(str(all_tested_vars[8].iloc[i]) + "\t") # REF
#         file.write(str(all_tested_vars[9].iloc[i]) + "\t") # ALT
#         file.write("\n")

In [40]:
# ! spliceai -I ../data/all_tested_vars.vcf -O ../output/spliceai_tested_vars_output.vcf -R ../data/hg38.fa -A grch38

In [37]:
import requests

def query_spliceai(variant, hg=38, distance=50, mask=0):
    # Choose the base URL for the GRCh38 version
    url = f"https://spliceai-38-xwkwwwxdwq-uc.a.run.app/spliceai/?hg={hg}&distance={distance}&variant={variant}&mask={mask}"
    
    # Make the GET request
    response = requests.get(url)
    
    # Check if the request was successful
    if response.status_code == 200:
        return response.json()  # Return the JSON response
    else:
        print(f"Error: {response.status_code}")
        return None


In [47]:
for i in all_tested_vars.index[:1]:
    variant = f"chr{all_tested_vars[0].iloc[i]}-{all_tested_vars[2].iloc[i]}-{all_tested_vars[8].iloc[i]}-{all_tested_vars[9].iloc[i]}"
    result = query_spliceai(variant)
    ENST = all_tested_vars[3].iloc[0]

    if result:
        result = result["scores"]
        for entry in result:
            if ENST in entry["t_id"]:
                result = entry


{'DS_AG': '0.02',
 'DS_AL': '0.00',
 'DS_DG': '0.00',
 'DS_DL': '0.00',
 'DP_AG': -3,
 'DP_AL': 3,
 'DP_DG': 3,
 'DP_DL': -21,
 'DS_AG_REF': '0.01',
 'DS_AL_REF': '1.00',
 'DS_DG_REF': '0.00',
 'DS_DL_REF': '0.00',
 'DS_AG_ALT': '0.02',
 'DS_AL_ALT': '0.99',
 'DS_DG_ALT': '0.00',
 'DS_DL_ALT': '0.00',
 'SCORES_FOR_INSERTED_BASES': [],
 'g_id': 'ENSG00000108509.21',
 'g_name': 'CAMTA2',
 't_id': 'ENST00000348066.8',
 't_priority': 'MS',
 't_refseq_ids': ['NM_015099.4'],
 't_strand': '-',
 't_type': 'protein_coding'}

In [None]:
# Try 2: save as bed, then convert to vcf

all_tested_vars["chr"] = "chr" + all_tested_vars[0]
all_tested_vars[["chr", 1, 2, .to_csv("../data/all_tested_vars.bed"