<a href="https://colab.research.google.com/github/sanjaynagi/AnoFold/blob/main/notebooks/AnoFold.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [12]:
import glob
import os
import re
import time

import numpy as np
import pandas as pd
import plotly.express as px
import requests

def vectorbase_to_uniprot(gene_id, poll_interval=1.0, max_wait=300.0, request_timeout=30.0):
    """Map a VectorBase gene ID to a UniProtKB accession.

    Submits an ID-mapping job to the UniProt REST service, polls the job
    status until it is no longer pending, then reads the mapping results.

    Args:
        gene_id: VectorBase gene identifier, e.g. "AGAP006227".
        poll_interval: Seconds to sleep between status polls.
        max_wait: Maximum number of seconds to wait for the job to leave
            the pending state.
        request_timeout: Timeout in seconds applied to each HTTP request.

    Returns:
        The "to" field of the first mapping result, or None when the
        service returned no results for this gene ID.

    Raises:
        requests.HTTPError: If any request returns an error status.
        TimeoutError: If the job is still pending after ``max_wait`` seconds.
    """
    base_url = "https://rest.uniprot.org/idmapping"
    payload = {
        "from": "VEuPathDB",
        "to": "UniProtKB",
        "ids": f"VectorBase:{gene_id}",
    }

    response = requests.post(f"{base_url}/run", data=payload, timeout=request_timeout)
    response.raise_for_status()
    job_id = response.json()["jobId"]

    status_url = f"{base_url}/status/{job_id}"
    deadline = time.monotonic() + max_wait
    while True:
        status_response = requests.get(status_url, timeout=request_timeout)
        status_response.raise_for_status()
        status = status_response.json()
        # Only "RUNNING"/"NEW" mean the job is still pending. Any other
        # payload (results, failed IDs, or a terminal status) ends the wait,
        # so an unexpected job state can no longer cause an endless loop.
        if status.get("jobStatus") not in ("RUNNING", "NEW"):
            break
        if time.monotonic() >= deadline:
            raise TimeoutError(
                f"UniProt ID mapping job {job_id} still pending after {max_wait} seconds"
            )
        # Pause between polls instead of hammering the service.
        time.sleep(poll_interval)

    results_response = requests.get(f"{base_url}/stream/{job_id}", timeout=request_timeout)
    results_response.raise_for_status()
    mappings = results_response.json().get("results")

    if mappings:
        return mappings[0]["to"]
    return None

def download_alphafold_pdb(gene_id, output_dir='.'):
    """Download the AlphaFold model for a VectorBase gene and save it as a PDB file.

    The gene ID is first converted to a UniProt accession with
    ``vectorbase_to_uniprot``; the ``F1-model_v4`` PDB file for that
    accession is then requested from alphafold.ebi.ac.uk.

    Args:
        gene_id: VectorBase gene identifier; also used as the output file stem.
        output_dir: Directory the file is written to (created if missing).

    Returns:
        Path of the saved ``<gene_id>.pdb`` file, or None when no accession
        was found or the download did not return HTTP 200.
    """
    accession = vectorbase_to_uniprot(gene_id)
    if accession is None:
        print(f"No UniProt accession found for GeneID: {gene_id}")
        return None

    model_url = f"https://alphafold.ebi.ac.uk/files/AF-{accession}-F1-model_v4.pdb"
    reply = requests.get(model_url)
    if reply.status_code != 200:
        print(f"Failed to download AlphaFold PDB for {gene_id} (UniProt: {accession}). Status code: {reply.status_code}")
        return None

    # Only touch the filesystem once the download has succeeded.
    os.makedirs(output_dir, exist_ok=True)
    destination = os.path.join(output_dir, f"{gene_id}.pdb")
    with open(destination, 'wb') as pdb_handle:
        pdb_handle.write(reply.content)

    print(f"Downloaded AlphaFold PDB for {gene_id} (UniProt: {accession}) to {destination}")
    return destination

# Three-letter residue names of the 20 standard amino acids mapped to their
# one-letter codes.
amino_acid_map = dict(
    ALA='A', ARG='R', ASN='N', ASP='D',
    CYS='C', GLN='Q', GLU='E', GLY='G',
    HIS='H', ILE='I', LEU='L', LYS='K',
    MET='M', PHE='F', PRO='P', SER='S',
    THR='T', TRP='W', TYR='Y', VAL='V',
)

def pdb_to_coords(pdb_path, motif, target_molecule, target_codon_in_motif, flanking_size=5):
    """Find a sequence motif in a PDB structure and return coordinates of a target atom.

    ATOM records are parsed by their fixed column positions rather than by
    splitting on whitespace, so the result does not depend on the length of
    the file header, on line padding, or on adjacent fields being separated
    by spaces (they are not for residue numbers >= 1000 or wide coordinates).
    Only the first model of a multi-model file is read.

    Args:
        pdb_path: Path to the PDB file.
        motif: Regular expression searched for in the one-letter amino-acid
            sequence built from the ATOM records.
        target_molecule: Atom name whose coordinates are wanted, e.g. "O".
        target_codon_in_motif: Zero-based offset, counted from the first
            residue of each match, of the residue holding the target atom.
        flanking_size: Number of residues shown on each side of the match
            in the reported flanking region.

    Returns:
        A list with one dict per match for which the target atom was found.
        Keys: 'start' and 'end' (1-based, inclusive sequence positions of
        the match), 'motif' (matched sequence), 'flanking' (match in
        brackets with its neighbours) and 'coordinates' (float numpy array
        ``[x, y, z]``). Matches without the target atom are omitted.
    """
    residue_codes = []   # one-letter code per residue, in file order
    residue_index = {}   # (chain, residue number, insertion code) -> sequence position
    atom_coords = {}     # (sequence position, atom name) -> raw (x, y, z) fields

    with open(pdb_path) as handle:
        for line in handle:
            record = line[:6].strip()
            if record == "ENDMDL":
                # Later models repeat the same residues; keep the first only.
                break
            # Coordinates end at column 54; shorter ATOM lines are unusable.
            if record != "ATOM" or len(line) < 54:
                continue

            residue_key = (line[21], line[22:26].strip(), line[26])
            position = residue_index.get(residue_key)
            if position is None:
                position = len(residue_codes)
                residue_index[residue_key] = position
                # Unknown residue names become 'X' so that every residue
                # contributes exactly one character and sequence positions
                # stay aligned with residues.
                residue_codes.append(amino_acid_map.get(line[17:20].strip(), 'X'))

            # Keep the first occurrence of each atom name within a residue.
            atom_key = (position, line[12:16].strip())
            if atom_key not in atom_coords:
                atom_coords[atom_key] = (line[30:38], line[38:46], line[46:54])

    aa_sequence = ''.join(residue_codes)

    results = []
    for match in re.finditer(motif, aa_sequence):
        start, end = match.start(), match.end()

        # Get flanking region
        flanking_start = max(0, start - flanking_size)
        flanking_end = min(len(aa_sequence), end + flanking_size)
        flanking = aa_sequence[flanking_start:start] + '[' + aa_sequence[start:end] + ']' + aa_sequence[end:flanking_end]

        print(f"Motif detected at codon {start+1}:{end} = {aa_sequence[start:end]}")
        print(f"Flanking region: {flanking}")

        # Look the residue up by its position in the sequence, so numbering
        # that does not start at 1 in the file still resolves correctly.
        raw_xyz = atom_coords.get((start + target_codon_in_motif, target_molecule))
        if raw_xyz is not None:
            results.append({
                'start': start + 1,
                'end': end,
                'motif': aa_sequence[start:end],
                'flanking': flanking,
                'coordinates': np.array([float(value) for value in raw_xyz], dtype=float),
            })

    if len(results) > 1:
        print(f"Warning, multiple {len(results)} matching motifs found")

    return results


# Function to write Vina configuration files
def write_vina_config(receptor, ligand, x, y, z, size=20, threads=4):
    """Write a Vina configuration file for one receptor/ligand pair.

    The file is saved as ``config_files/<receptor>_<ligand>.config``; its
    parent directory is created when missing. The ``receptors/``,
    ``ligands/`` and ``vina_results/`` paths named inside the file are not
    created or checked here.

    Args:
        receptor: Receptor name (stem of ``receptors/<receptor>.pdbqt``).
        ligand: Ligand name (stem of ``ligands/<ligand>.pdbqt``).
        x, y, z: Centre of the search box (for example the coordinates
            returned by ``pdb_to_coords``).
        size: Edge length used for all three box dimensions.
        threads: Value written to the ``cpu`` setting.
    """
    config_path = f"config_files/{receptor}_{ligand}.config"
    parent_dir = os.path.dirname(config_path)
    os.makedirs(parent_dir, exist_ok=True)

    with open(config_path, "w") as handle:
        # The continuation lines keep their leading indentation on purpose:
        # it is part of the text written to the file.
        config_text = f"""receptor = receptors/{receptor}.pdbqt
                        ligand = ligands/{ligand}.pdbqt

                        center_x = {x}
                        center_y = {y}
                        center_z = {z}

                        size_x = {size}
                        size_y = {size}
                        size_z = {size}

                        out = vina_results/{receptor}_{ligand}.log

                        cpu = {threads}
                        exhaustiveness = 8
                        """
        handle.write(config_text)


In [3]:
# Example usage: look up the UniProt accession for a VectorBase gene ID and
# save its AlphaFold model under alphafold_pdbs/ (needs network access).
# pdb_file is the saved path, or None if the lookup or download failed.
gene_id = "AGAP006227"
pdb_file = download_alphafold_pdb(gene_id, output_dir='alphafold_pdbs')

Downloaded AlphaFold PDB for AGAP006227 (UniProt: Q7PPA9) to alphafold_pdbs/AGAP006227.pdb


In [14]:
# Search the downloaded model for the motif and collect the coordinates of
# atom "O" in the residue at offset 5 from the first residue of each match.
res = pdb_to_coords(pdb_path="alphafold_pdbs/AGAP006227.pdb", motif='[LVG].G.S.G', target_molecule="O", target_codon_in_motif=5)

Motif detected at codon 188:194 = LFGESAG
Flanking region: PDNVT[LFGESAG]GCSVH
