In [1]:
import pandas as pd

import protfasta
import requests
import re
import warnings
warnings.filterwarnings("ignore")
import glob


from Bio import AlignIO
import matplotlib.pyplot as plt

from Bio.Align import MultipleSeqAlignment
from Bio.SeqRecord import SeqRecord
from Bio.Seq import Seq

import rich
from rich_msa import RichAlignment

import logomaker

#Import modules
import gzip, logomaker, matplotlib.pyplot as plt, numpy as np, os, pandas as pd, protfasta


import seaborn as sns
import warnings
warnings.filterwarnings("ignore")


Matplotlib is building the font cache; this may take a moment.


# 1. Downloading codon alignments for all TFs with ADs

First, converting uniprotIDs to all possible ENSTs.

In [3]:
known_ADs = pd.read_csv("../output/known_ADs_considering_isoforms_and_canonical.csv")
uniprotIDs_with_ADs = set(known_ADs["uniprotID"])
print(str(len(uniprotIDs_with_ADs)) + " uniprotIDs")

579 uniprotIDs


In [3]:
# Slow to run

# Uniprot API to get canonical uniprot ID isoform for consistency
def get_canonical_uniprot_id(uniprot_id):
    url = f"https://rest.uniprot.org/uniprotkb/{uniprot_id}.json"
    response = requests.get(url)
    if response.status_code == 200:
        data = response.json()
        regex_matches = re.findall(r"'isoformIds': \['(.*?)'\]", str(data))
        if len(regex_matches) == 0:
            return uniprotID
        return(regex_matches[0])
    else:
        return f"Error: {response.status_code}"

i = 0
isoform_ids = []
for uniprotID in uniprotIDs_with_ADs:
    if "-" in uniprotID:
        isoform_ids.append(uniprotID) # Already an isoform
    else:
        isoform_ids.append(get_canonical_uniprot_id(uniprotID))
    
    if i %25 == 0:
        print(i)
    i += 1

0
25
50
75
100
125
150
175
200
225
250
275
300
325
350
375
400
425
450
475
500
525
550
575


In [5]:
len(isoform_ids) # Should be 579

579

In [60]:
# downloading ids to map to 1) ENSTs and 2) gene names
isoform_ids_df = pd.DataFrame(isoform_ids).drop_duplicates()
isoform_ids_df[0] = isoform_ids_df[0].str.split("'").str[0]
isoform_ids_df.to_csv("../data/AD_uniprotIDs.csv", header = None, index = None)
isoform_ids_df

Unnamed: 0,0
0,Q96JB3-1
1,Q8NHW3
2,Q9UKD1
3,P41743
4,P28324-1
...,...
574,P19793-1
575,Q9HCS4
576,P14651-1
577,Q9UBR4-1


In [None]:
# 564 matches to uniprot entries

In [5]:
# Loading in the table of ensembl matches
ensembl_matches = pd.read_csv("../data/AD_uniprotIDs_to_ENST_gene.tsv", sep = "\t")
ensembl_matches["gene"] = ensembl_matches["Gene Names"].str.split(" ").str[0]
ensembl_matches

Unnamed: 0,From,Entry,Gene Names,Ensembl,gene
0,A6NJG6,A6NJG6,ARGFX,ENST00000334384.5;ENST00000651603.1;,ARGFX
1,A8MTJ6,A8MTJ6,FOXI3,ENST00000428390.3;,FOXI3
2,A8MW92-1,A8MW92,PHF20L1 CGI-72,ENST00000337920.8 [A8MW92-2];ENST00000395386.7...,PHF20L1
3,A8MYZ6,A8MYZ6,FOXO6,ENST00000641094.2;,FOXO6
4,A8MZ59-2,A8MZ59,LEUTX,ENST00000396841.4 [A8MZ59-1];ENST00000629267.1...,LEUTX
...,...,...,...,...,...
559,Q9Y692-1,Q9Y692,GMEB1,ENST00000294409.2 [Q9Y692-1];ENST00000361872.8...,GMEB1
560,Q9Y6F1-1,Q9Y6F1,PARP3 ADPRT3 ADPRTL3,ENST00000398755.8 [Q9Y6F1-1];ENST00000417220.6...,PARP3
561,Q9Y6J9,Q9Y6J9,TAF6L PAF65A,ENST00000294168.8;,TAF6L
562,Q9Y6Q9-1,Q9Y6Q9,NCOA3 AIB1 BHLHE42 RAC3 TRAM1,ENST00000371997.3 [Q9Y6Q9-3];ENST00000371998.8...,NCOA3


In [12]:
# Looking at uniprotIDs that do not have multiple isoforms
non_isoforms = ensembl_matches[ensembl_matches["Ensembl"].str.contains("\[") == False]
non_isoforms['Ensembl'] = non_isoforms['Ensembl'].str.split(";").str[:-1]
non_isoforms = non_isoforms.explode('Ensembl')

# Arbitrarily choosing the first transcript to keep 
# since all of the transcripts correspond to the same protein
#non_isoforms['ENST'] = non_isoforms['Ensembl'].str[0]

# Formatting
non_isoforms = non_isoforms.rename(columns = {"From" : "uniprotID", "Ensembl" : "ENST"})
non_isoforms = non_isoforms[["uniprotID", "ENST"]]
non_isoforms["ENST"] = non_isoforms["ENST"].str.split(".").str[0]
non_isoforms

Unnamed: 0,uniprotID,ENST
0,A6NJG6,ENST00000334384
0,A6NJG6,ENST00000651603
1,A8MTJ6,ENST00000428390
3,A8MYZ6,ENST00000641094
12,O00570,ENST00000330949
...,...,...
553,Q9Y4B4,ENST00000684192
555,Q9Y4Z2,ENST00000242462
556,Q9Y5Q3,ENST00000373313
558,Q9Y5W3,ENST00000248071


In [13]:
# Looking at uniprotIDs that have multiple isoforms
multiple_isoforms = ensembl_matches[ensembl_matches["Ensembl"].str.contains("\[") == True]
multiple_isoforms["ENST"] = multiple_isoforms["Ensembl"].str.split(";")
multiple_isoforms = multiple_isoforms[["ENST"]].explode("ENST")
multiple_isoforms["uniprotID"] = multiple_isoforms["ENST"].str.extract(r'\[(.*)\]')
multiple_isoforms["ENST"] = multiple_isoforms["ENST"].str.split(".").str[0]
multiple_isoforms = multiple_isoforms.dropna()
multiple_isoforms = multiple_isoforms[["uniprotID", "ENST"]]
multiple_isoforms

Unnamed: 0,uniprotID,ENST
2,A8MW92-2,ENST00000337920
2,A8MW92-1,ENST00000395386
2,A8MW92-1,ENST00000622263
4,A8MZ59-1,ENST00000396841
4,A8MZ59-1,ENST00000629267
...,...,...
562,Q9Y6Q9-1,ENST00000371998
562,Q9Y6Q9-5,ENST00000372004
563,Q9Y6Y1-1,ENST00000303635
563,Q9Y6Y1-3,ENST00000473578


In [14]:
# Combining to get all mappings
all_mappings = pd.concat([non_isoforms, multiple_isoforms])
all_mappings

Unnamed: 0,uniprotID,ENST
0,A6NJG6,ENST00000334384
0,A6NJG6,ENST00000651603
1,A8MTJ6,ENST00000428390
3,A8MYZ6,ENST00000641094
12,O00570,ENST00000330949
...,...,...
562,Q9Y6Q9-1,ENST00000371998
562,Q9Y6Q9-5,ENST00000372004
563,Q9Y6Y1-1,ENST00000303635
563,Q9Y6Y1-3,ENST00000473578


In [15]:
all_mappings[all_mappings["uniprotID"].isin(isoform_ids)] # All mappings to try

NameError: name 'isoform_ids' is not defined

In [36]:
uniprot_to_gene_dict = dict(zip(ensembl_matches["From"], ensembl_matches["gene"]))

In [77]:
# Check every valid enst.gene mapping

with open("../output/toga_commands.txt", "w") as file:
    for uniprotID in uniprot_to_gene_dict.keys():
        uniprotID_ENSTs = all_mappings[all_mappings["uniprotID"] == uniprotID]["ENST"]
        #print(uniprotID)
        gene = uniprot_to_gene_dict[uniprotID]
        
        for ENST in uniprotID_ENSTs:
            command = "wget --no-check-certificate https://genome.senckenberg.de/download/TOGA/human_hg38_reference/MultipleCodonAlignments/" + ENST + "." + gene + ".fasta.gz -P /Users/sanjanakotha/Desktop/Staller_Lab/SFARI/data/zoonomia_toga_mca"
            file.write(command + "\n")


In [78]:
len(uniprot_to_gene_dict)

564

In [79]:
# Succesfully downloaded 416 of the 546
# Manually looking at a few below- ones not downloaded are cases where only non canonical isoform is included
# Next time - could download any ENSTs available for the genes, translate, then look for AD sequences

In [84]:
# Which TFs were not downloaded?
downloaded_names = glob.glob("../data/zoonomia_toga_mca/ENST*")
downloaded_names = [_.split(".")[-2] for _ in downloaded_names]

In [92]:
ensembl_matches[ensembl_matches["gene"] == "APBB3"]

Unnamed: 0,From,Entry,Gene Names,Ensembl,gene
68,O95704-1,O95704,APBB3 FE65L2,ENST00000354402.9 [O95704-4];ENST00000356738.6...,APBB3


In [93]:
all_mappings[all_mappings["uniprotID"].str.contains("O95704")]

Unnamed: 0,uniprotID,ENST
68,O95704-4,ENST00000354402
68,O95704-3,ENST00000356738
68,O95704-1,ENST00000357560
68,O95704-2,ENST00000412920
68,O95704-6,ENST00000467078
68,O95704-5,ENST00000509914


In [94]:
len(set(ensembl_matches["gene"]) - set(downloaded_names))

148

# 2. Convert codon alignments to protein alignments

In [98]:
files = glob.glob("../data/zoonomia_toga_mca/ENST*")

# Converting codon alignment to unaligned protein sequences
for file in files:
    # Read in as dataframe  
    df = protfasta.read_fasta(file, invalid_sequence_action = 'convert') #Convert - to ""
    df = pd.DataFrame({"id" : df.keys(), "nt_seq" : df.values()})

    # Translate sequences
    df["prot_seq"] = [str(Seq(_).translate())[:-1] for _ in df["nt_seq"]]

    ENST = file.split("/")[-1].split(".")[0]
    gene =  file.split("/")[-1].split(".")[1]

    # Save protein sequences in dictionary
    prot_dict = dict(zip(df["id"], df["prot_seq"]))
    protfasta.write_fasta(prot_dict, "../data/zoonomia_toga_mca/prot_fastas/" + gene)
 

# 3. Run MAFFT on all protein sequences

In [99]:
# Slow to run
genes = [_.split(".")[-2] for _ in files]
for gene in genes:
    os.system("mafft --auto --quiet ../data/zoonomia_toga_mca/prot_fastas/" + gene + " > ../data/zoonomia_toga_mca/prot_alignments/" + gene + ".fasta")

# 1B. Follow up on 1 - GETTING REST OF SEQUENCES
Which ADs did we miss?

In [24]:
ensembl_matches

Unnamed: 0,From,Entry,Gene Names,Ensembl,gene
0,A6NJG6,A6NJG6,ARGFX,ENST00000334384.5;ENST00000651603.1;,ARGFX
1,A8MTJ6,A8MTJ6,FOXI3,ENST00000428390.3;,FOXI3
2,A8MW92-1,A8MW92,PHF20L1 CGI-72,ENST00000337920.8 [A8MW92-2];ENST00000395386.7...,PHF20L1
3,A8MYZ6,A8MYZ6,FOXO6,ENST00000641094.2;,FOXO6
4,A8MZ59-2,A8MZ59,LEUTX,ENST00000396841.4 [A8MZ59-1];ENST00000629267.1...,LEUTX
...,...,...,...,...,...
559,Q9Y692-1,Q9Y692,GMEB1,ENST00000294409.2 [Q9Y692-1];ENST00000361872.8...,GMEB1
560,Q9Y6F1-1,Q9Y6F1,PARP3 ADPRT3 ADPRTL3,ENST00000398755.8 [Q9Y6F1-1];ENST00000417220.6...,PARP3
561,Q9Y6J9,Q9Y6J9,TAF6L PAF65A,ENST00000294168.8;,TAF6L
562,Q9Y6Q9-1,Q9Y6Q9,NCOA3 AIB1 BHLHE42 RAC3 TRAM1,ENST00000371997.3 [Q9Y6Q9-3];ENST00000371998.8...,NCOA3


In [25]:
downloaded_paths = glob.glob("../data/zoonomia_toga_mca/ENST*")
downloaded_paths[:3]

['../data/zoonomia_toga_mca/ENST00000359842.RXRG.fasta',
 '../data/zoonomia_toga_mca/ENST00000357992.ELK4.fasta',
 '../data/zoonomia_toga_mca/ENST00000595661.ZNF473.fasta']

In [26]:
downloaded_genes = [_.split(".")[-2] for _ in downloaded_paths]
downloaded_genes[:3]

['RXRG', 'ELK4', 'ZNF473']

In [27]:
missed_genes = ensembl_matches[~ensembl_matches["gene"].isin(downloaded_genes)]
missed_genes

Unnamed: 0,From,Entry,Gene Names,Ensembl,gene
7,O00255-2,O00255,MEN1 SCG2,ENST00000312049.11 [O00255-2];ENST00000315422....,MEN1
8,O00321-1,O00321,ETV2 ER71 ETSRP71,ENST00000379026.6 [O00321-2];ENST00000402764.6...,ETV2
9,O00327-2,O00327,BMAL1 ARNTL BHLHE5 MOP3 PASD3,ENST00000389707.8 [O00327-8];ENST00000401424.6...,BMAL1
10,O00470-1,O00470,MEIS1,ENST00000272369.14 [O00470-1];ENST00000398506....,MEIS1
17,O14686-1,O14686,KMT2D ALR MLL2 MLL4,ENST00000301067.12 [O14686-1];ENST00000685166....,KMT2D
...,...,...,...,...,...
522,Q9UGU0-1,Q9UGU0,TCF20 KIAA0292 SPBP,ENST00000335626.8 [Q9UGU0-2];ENST00000359486.8...,TCF20
529,Q9UKW6-1,Q9UKW6,ELF5 ESE2,ENST00000257832.7 [Q9UKW6-2];ENST00000312319.6...,ELF5
531,Q9UL68-1,Q9UL68,MYT1L KIAA1106,ENST00000399161.8 [Q9UL68-4];ENST00000407844.6...,MYT1L
543,Q9Y261-1,Q9Y261,FOXA2 HNF3B TCF3B,ENST00000377115.4 [Q9Y261-1];ENST00000419308.7...,FOXA2


In [29]:
# For these, will try every ENST option!

In [40]:
import os
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin


In [41]:
base_url = "https://genome.senckenberg.de/download/TOGA/human_hg38_reference/MultipleCodonAlignments/"


In [43]:
response = requests.get(base_url, verify = False)

In [52]:
names = []

# Check if the request was successful
if response.status_code == 200:
    # Parse the page content with BeautifulSoup
    soup = BeautifulSoup(response.text, "html.parser")
    
    # Find all links (anchor tags with href attributes)
    links = soup.find_all("a", href=True)

    for link in links:
        for gene in missed_genes["gene"]:
            if gene in link['href']:
                if str(link['href']).split(".")[-3] == gene:
                    names.append(link['href'])
            
    # # Filter the links that contain 'ELP1' in their name
    # elp1_files = [link['href'] for link in links if "ELP1" in link['href']]

In [54]:
len(names)

125

In [55]:
names[:3]

['ENST00000226574.NFKB1.fasta.gz',
 'ENST00000227163.SPI1.fasta.gz',
 'ENST00000229022.VDR.fasta.gz']

In [60]:
# Check every valid enst.gene mapping

with open("../output/toga_commands_retry.txt", "w") as file:
    for name in names:
        
        for ENST in uniprotID_ENSTs:
            command = "wget --no-check-certificate https://genome.senckenberg.de/download/TOGA/human_hg38_reference/MultipleCodonAlignments/" + name + " -P /Users/sanjanakotha/Desktop/Staller_Lab/SFARI/data/zoonomia_toga_mca/wrong_isoforms_TF_seqs"
            file.write(command + "\n")


Ran gunzip *.gz in directory to unzip everything

# 2A. Convert codon alignments to protein alignments

In [64]:
files = glob.glob("../data/zoonomia_toga_mca/wrong_isoforms_TF_seqs/ENST*")

# Converting codon alignment to unaligned protein sequences
for file in files:
    # Read in as dataframe  
    df = protfasta.read_fasta(file, invalid_sequence_action = 'convert') #Convert - to ""
    df = pd.DataFrame({"id" : df.keys(), "nt_seq" : df.values()})

    # Translate sequences
    df["prot_seq"] = [str(Seq(_).translate())[:-1] for _ in df["nt_seq"]]

    ENST = file.split("/")[-1].split(".")[0]
    gene =  file.split("/")[-1].split(".")[1]

    # Save protein sequences in dictionary
    prot_dict = dict(zip(df["id"], df["prot_seq"]))
    protfasta.write_fasta(prot_dict, "../data/zoonomia_toga_mca/prot_fastas_wrong_isoforms/" + gene)
 

# 3A. Run MAFFT

In [67]:
# Slow to run
genes = [_.split(".")[-2] for _ in files]
for gene in genes:
    os.system("mafft --auto --quiet ../data/zoonomia_toga_mca/prot_fastas_wrong_isoforms/" + gene + " > ../data/zoonomia_toga_mca/prot_alignments_wrong_isoforms/" + gene + ".fasta")