# Imports & Dependencies

In [1]:
import pandas as pd
from collections import defaultdict
import sys
import os
import shutil as sh
import urllib
import tarfile
from pathlib import Path

import importlib


from ipywidgets import interact, interactive, fixed, interact_manual, IntProgress
import ipywidgets as widgets # type: ignore
from IPython.display import display
import gzip

#Pandarallel works only on linux and mac
try:
    from pandarallel import pandarallel
    pandarallel.initialize(nb_workers=8,progress_bar=True)
    PARRALEL = True
except:
    PARRALEL = False

from tqdm.notebook import tnrange, tqdm
tqdm.pandas() #activate tqdm progressbar for pandas apply

# Folder Creation

In [2]:
SETUP = {} #Dictionnary with ALL parameters

In [3]:
#Folder definition
from sys import platform
if platform == "linux" or platform == "linux2":
    PEPRMINT_FOLDER = "/home/user_stel/AISB/Project"
elif platform == "darwin":
    PEPRMINT_FOLDER = "/home/user_stel/AISB/Project"
else:
    raise ValueError("OS NOT FOUND")
WORKDIR = f"{PEPRMINT_FOLDER}/dataset/"
CATHFOLDER = f"{PEPRMINT_FOLDER}/databases/cath/"
ALFAFOLDFOLDER = f"{PEPRMINT_FOLDER}/databases/alfafold/"
PROSITEFOLDER = f"{PEPRMINT_FOLDER}/databases/prosite/"
UNIPROTFOLDER = f"{PEPRMINT_FOLDER}/databases/uniprot/"
FIGURESFOLDER = f"{PEPRMINT_FOLDER}/figures/"

SETUP["PEPRMINT_FOLDER"]=PEPRMINT_FOLDER
SETUP["WORKDIR"]=WORKDIR
SETUP["CATHFOLDER"]=CATHFOLDER
SETUP["PROSITEFOLDER"]=PROSITEFOLDER
SETUP["ALFAFOLDFOLDER"]=ALFAFOLDFOLDER
SETUP["UNIPROTFOLDER"]=UNIPROTFOLDER
SETUP["FIGURESFOLDER"]=FIGURESFOLDER

In [4]:
if not os.path.exists(PEPRMINT_FOLDER):
    os.makedirs(PEPRMINT_FOLDER)
if not os.path.exists(WORKDIR):
    os.makedirs(WORKDIR)
if not os.path.exists(FIGURESFOLDER):
    os.makedirs(FIGURESFOLDER)
if not os.path.exists(ALFAFOLDFOLDER):
    os.makedirs(ALFAFOLDFOLDER)
if not os.path.exists(UNIPROTFOLDER):
    os.makedirs(UNIPROTFOLDER)
if not os.path.exists(PROSITEFOLDER):
    os.makedirs(PROSITEFOLDER) #MSA will contains the alignments in "msa" format (FASTA). 
if not os.path.exists(CATHFOLDER):
    os.makedirs(CATHFOLDER)

In [5]:
for k in SETUP:
    exec(f"{k}2 = SETUP['{k}']")

In [6]:
DOMAIN_PROSITE = {
    "PH": "PS50003",
    "C2": ["PS50004","PS51547"],
    "C1": "PS50081",  # Note : no C1 prosite on SMART but 2 C1 ProSite on Interprot (PS50081,PS00479), I took PS50081 since the data in PS00479 are in PS50081.
    "PX": "PS50195",
    # "FYVE":"PS50178",
    "FYVE": ["PS50178",'PS50089', 'PS00518','PS50016','PS01359','PS50014','PS00633','PS50119'],  # FYVE CAN BE THIS ONE TOO....
    # "PPASE_MYOTUBULARIN":"PS51339",# no GRAM domain found on prosite. Has to do this manually. Go on http://smart.embl-heidelberg.de/smart/do_annotation.pl?DOMAIN=GRAM&BLAST=DUMMY
    "BAR": "PS51021",  # 1URU is missing on prosite
    # "GLA":"PS50963",
    "ENTH": "PS50942",
    "SH2": "PS50001",
    "SEC14": "PS50191",
    "START": "PS50848",
    "C2DIS":"PS50022",
    "GLA": "PS50998",
    "PLD":"PS50035",
    "PLA":"PS00118",
    "ANNEXIN":"PS00223",
}
# Invert keys and values to have PROSITEID ==> DOMAIN
PROSITE_DOMAIN = {}
for key, value in DOMAIN_PROSITE.items():
    if type(value) == type([]):
        for subvalues in value:
            PROSITE_DOMAIN[subvalues] = key
    else:
        PROSITE_DOMAIN[value] = key
# PROSITE_DOMAIN = {v: k for k, v in DOMAIN_PROSITE.items()}

DOMAIN_CATH = {
    "PH": "2.30.29.30",
    "C2": "2.60.40.150",
    "C1": "3.30.60.20",
    "PX": "3.30.1520.10",
    "FYVE": "3.30.40.10",
    "BAR": "1.20.1270.60",
    "ENTH": "1.25.40.90",
    "SH2": "3.30.505.10",
    "SEC14": "3.40.525.10",
    "START": "3.30.530.20",
    "C2DIS": "2.60.120.260",
    "GLA":"2.40.20.10",
    "PLD":"3.20.20.190",
    "PLA":"1.20.90.10",
    "ANNEXIN":"1.10.220.10",
}

DOMAIN_INTERPRO = {
    "PH": "SSF50729",
    "C2": "SSF49562",
    "C1": None,
    "PX": "SSF64268",
    "FYVE": "SSF57903", #badly classified it looks like...
    "BAR": "SSF103657",
    "ENTH": "SSF48464",
    "SH2": "SSF55550",
    "SEC14": ["SSF52087","SSF46938"], #the CRAL TRIO domain is truncated in SSF.
    "START": "SSF55961",
    "C2DIS": "SSF49785",
    "GLA":None,
    "PLD":"SSF51695",
    "PLA":"G3DSA:1.20.90.10",
    "ANNEXIN":"SSF47874",
}

DOMAIN_INTERPRO_REFINE = {
    "PH": True,
    "C2": False,
    "C1": False,
    "PX": True,
    "FYVE": False,
    "BAR": False,
    "ENTH": False,
    "SH2": False,
    "SEC14": False,
    "START": True,
    "C2DIS": False,
    "GLA":False,
    "PLD":False,
    "PLA":True,
    "ANNEXIN":False,
}

# Invert keys and values to have CATHID ==> DOMAIN
CATH_DOMAIN = {v: k for k, v in DOMAIN_CATH.items()}
SUPERFAMILY = CATH_DOMAIN
SETUP["DOMAIN_PROSITE"] = DOMAIN_PROSITE
SETUP["PROSITE_DOMAIN"] = PROSITE_DOMAIN
SETUP["DOMAIN_CATH"] = DOMAIN_CATH
SETUP["CATH_DOMAIN"] = CATH_DOMAIN
SETUP["SUPERFAMILY"] = SUPERFAMILY

In [7]:
PROSITEFOLDER

'/home/user_stel/AISB/Project/databases/prosite/'

# Methods 

In [8]:
def selectUniquePerCluster(df, cathCluster, Uniref, withAlignment = True):
    """
    Return a datasert with only 1 data per choosed clusters.
    """
    
    if cathCluster not in ["S35","S60","S95","S100"]:
        raise ValueError('CathCluster given not in ["S35","S60","S95","S100"]')
    
    if Uniref not in ["uniref50","uniref90","uniref100"]:
        raise ValueError('CathCluster given not in ["uniref50","uniref90","uniref100"]')
    
    if withAlignment:
        df = df[~df.alignment_position.isnull()]
    
    cathdf = df.query("data_type == 'cathpdb'")
    seqdf = df.query("data_type == 'prosite'")
    
    def selectUniqueCath(group):
        uniqueNames = group.cathpdb.unique()
        select = uniqueNames[0]
        
        #return group.query("cathpdb == @select")
        return select
    
    def selectUniqueUniref(group,exclusion):
        uniqueNames = group.uniprot_acc.unique()
        select = uniqueNames[0]
        #return group.query("uniprot_acc == @select")
        if select not in exclusion:
            return select
        

    dfReprCathNames = cathdf.groupby(["domain",cathCluster]).apply(selectUniqueCath).to_numpy()
    
    excludeUniref = df.query("cathpdb in @dfReprCathNames").uniprot_acc.unique() #Structures are prior to sequences.
    dfReprUnirefNames = seqdf.groupby(["domain",Uniref]).apply(selectUniqueUniref, exclusion=excludeUniref).to_numpy()
    dfReprCath = cathdf.query("cathpdb in @dfReprCathNames")
    dfReprUniref = seqdf.query("uniprot_acc in @dfReprUnirefNames")
    
    return (pd.concat([dfReprCath,dfReprUniref]))

# Download data 

### Download CATH-domain-list

In [9]:
# ─── 1) Settings ───────────────────────────────────────────────────────────────
UPDATE = False  # set to True when you actually want to re-download
CATHFOLDER = "/home/user_stel/AISB/Project/databases/cath/"
domfile    = "/home/user_stel/AISB/Project/databases/cath/cath-domain-list-v4_2_0.txt"
url        = "ftp://orengoftp.biochem.ucl.ac.uk/cath/releases/latest-release/cath-classification-data/cath-domain-list-<v4_2_0>.txt"

# ─── 2) Build the destination path ─────────────────────────────────────────────
destination = Path(CATHFOLDER) / domfile

# ─── 4) Load into pandas ────────────────────────────────────────────────────────
column_names = [
    'Domain','Class','Architecture','Topology','Homologous',
    'S35','S60','S95','S100','S100Count','DomSize','Resolution'
]

# CATH domain‐list files are space­delimited; lines starting with "#" are comments
df_cath = pd.read_csv(
    destination,
    sep=r'\s+',
    header=None,
    names=column_names,
    comment='#',
    engine='python'
)

print(df_cath)


         Domain  Class  Architecture  Topology  Homologous  S35  S60  S95  \
0       1oaiA00      1            10         8        10.0  1.0  1.0  1.0   
1       1go5A00      1            10         8        10.0  1.0  1.0  1.0   
2       3frhA01      1            10         8        10.0  2.0  1.0  1.0   
3       3friA01      1            10         8        10.0  2.0  1.0  1.0   
4       3b89A01      1            10         8        10.0  2.0  1.0  1.0   
...         ...    ...           ...       ...         ...  ...  ...  ...   
430036  4uldS02      3           100        10        10.0  3.0  1.0  1.0   
430037  4ulgN02      3           100        10        10.0  3.0  1.0  1.0   
430038  4uliS02      3           100        10        10.0  3.0  1.0  1.0   
430039  4ulmN02      3           100        10        10.0  3.0  1.0  1.0   
430040  4uloS02      3           100        10         NaN  NaN  NaN  NaN   

        S100  S100Count  DomSize  Resolution  
0        1.0        1.0     

### Download Correspondance between Uniprot and PDB code

In [10]:
import os, requests
from time import sleep

url = "ftp://ftp.ebi.ac.uk/pub/databases/msd/sifts/flatfiles/csv/pdb_chain_uniprot.csv.gz"
destination = os.path.join(CATHFOLDER, "pdb_chain_uniprot.csv.gz")

def download_with_retries(url, dest, max_tries=3, chunk_size=1024*1024):
    for attempt in range(1, max_tries+1):
        try:
            with requests.get(url, stream=True, timeout=60) as r:
                r.raise_for_status()
                with open(dest, "wb") as f:
                    for chunk in r.iter_content(chunk_size=chunk_size):
                        if chunk:
                            f.write(chunk)
            return
        except Exception as e:
            print(f"↻ Attempt {attempt} failed: {e}")
            if attempt == max_tries:
                raise
            sleep(5)

# download only if missing or forced
if not os.path.exists(destination) or UPDATE:
    download_with_retries(url, destination)
    print("✅ Download complete:", destination)
else:
    print("✔️  Already downloaded:", destination)


✔️  Already downloaded: /home/user_stel/AISB/Project/databases/cath/pdb_chain_uniprot.csv.gz


In [11]:
import gzip, shutil

gz_path  = "/home/user_stel/AISB/Project/databases/uniprot/pdb_chain_uniprot.csv.gz"
csv_path = gz_path[:-3]

try:
    with gzip.open(gz_path, "rb") as f_in, open(csv_path, "wb") as f_out:
        shutil.copyfileobj(f_in, f_out)
except EOFError:
    print("⚠️ Warning: EOFError raised, file may be slightly truncated but proceeding anyway.")

print("✅ Decompressed to:", csv_path)


✅ Decompressed to: /home/user_stel/AISB/Project/databases/uniprot/pdb_chain_uniprot.csv


### Download Prosite files 

In [12]:
import os, urllib.request, tarfile

PROSITE_URL      = "ftp://ftp.expasy.org/databases/prosite/prosite_alignments.tar.gz"
PROSITEFOLDER    = "/home/user_stel/AISB/Project/databases/prosite/"
archive_path     = os.path.join(PROSITEFOLDER, "prosite_alignments.tar.gz")
prosite_alignments       = os.path.join(PROSITEFOLDER, "msa")

# Only download & extract if the msa folder doesn’t already exist:
if not os.path.isdir(prosite_alignments):
    print(f"↓ Downloading PROSITE alignments to {archive_path}")
    urllib.request.urlretrieve(PROSITE_URL, archive_path)

    print("→ Extracting…")
    with tarfile.open(archive_path, "r:gz") as tf:
        tf.extractall(path=PROSITEFOLDER)

    # Rename the extracted folder to “msa”
    os.rename(
        os.path.join(PROSITEFOLDER, "prosite_alignments"),
        prosite_alignments
    )

    # Clean up
    os.remove(archive_path)
    print("✅ PROSITE data ready in", prosite_alignments)
else:
    print("✅ PROSITE data already present in", prosite_alignments)


✅ PROSITE data already present in /home/user_stel/AISB/Project/databases/prosite/msa


## Download CATH PDB files

In [13]:
# Reading Cath domain list
cathDomains = pd.read_csv(domfile,comment='#', sep=r"\s+", header=None)
cathDomains.columns = column_names
if PARRALEL:
    cathDomains['Superfamily'] = cathDomains.parallel_apply(lambda x: f"{x.Class}.{x.Architecture}.{x.Topology}.{x.Homologous}", axis=1)
else:
    cathDomains['Superfamily'] = cathDomains.progress_apply(lambda x: f"{x.Class}.{x.Architecture}.{x.Topology}.{x.Homologous}", axis=1)

  0%|          | 0/430041 [00:00<?, ?it/s]

In [14]:
# Creating the superfamily 
cathSuperFamily = pd.DataFrame()
cathSuperFamily['Superfamily'] = cathDomains.Superfamily
cathSuperFamily['Domain'] = cathDomains.Domain

In [15]:
# Creating a dictionary with the superfamily as key and list of cathdomain (pdb format) as value
cathDomainsPerSuperFamily = defaultdict(list)
#do not parralel this one
_ = cathSuperFamily.progress_apply(lambda x: cathDomainsPerSuperFamily[x.Superfamily].append(x.Domain), axis=1)

  0%|          | 0/430041 [00:00<?, ?it/s]

In [16]:
CATHVERSION = 'v4_2_0'

In [17]:
def download_dom(dom, folder):
    url = "http://www.cathdb.info/version/"+CATHVERSION+"/api/rest/id/"+dom+".pdb"
    destination = folder+dom+'.pdb'
    if not os.path.isfile(destination): 
        urllib.request.urlretrieve(url, destination)
    #progressbar.value += 1

def fetch_dom_for_superfamily(superfamily, cathDomainsPerSuperFamily, domName):
    print(f">Working with {domName} domain")
    global CATHFOLDER
    folder = CATHFOLDER+'domains/'+domName+'/raw/'
    if not os.path.exists(folder):
        os.makedirs(folder)
    if not os.path.exists(CATHFOLDER+'domains/'+domName+'/cleaned/'):
        os.makedirs(CATHFOLDER+'domains/'+domName+'/cleaned/')

    domlist = cathDomainsPerSuperFamily[superfamily]
    
    if PARRALEL:
        pd.Series(domlist).parallel_apply(lambda x: download_dom(x, folder))
    else:
        print(domlist)
        pd.Series(domlist).progress_apply(lambda x: download_dom(x, folder))


        
        
for superfamily,domain in SUPERFAMILY.items():
    fetch_dom_for_superfamily(superfamily, cathDomainsPerSuperFamily, domain)
    

>Working with PH domain
[]


0it [00:00, ?it/s]

>Working with C2 domain
[]


0it [00:00, ?it/s]

>Working with C1 domain
[]


0it [00:00, ?it/s]

>Working with PX domain
[]


0it [00:00, ?it/s]

>Working with FYVE domain
[]


0it [00:00, ?it/s]

>Working with BAR domain
[]


0it [00:00, ?it/s]

>Working with ENTH domain
[]


0it [00:00, ?it/s]

>Working with SH2 domain
[]


0it [00:00, ?it/s]

>Working with SEC14 domain
[]


0it [00:00, ?it/s]

>Working with START domain
[]


0it [00:00, ?it/s]

>Working with C2DIS domain
[]


0it [00:00, ?it/s]

>Working with GLA domain
[]


0it [00:00, ?it/s]

>Working with PLD domain
[]


0it [00:00, ?it/s]

>Working with PLA domain
[]


0it [00:00, ?it/s]

>Working with ANNEXIN domain
[]


0it [00:00, ?it/s]

# Generation Phase

In [18]:
RECALCULATION = False 
recalculation_widget = widgets.ToggleButton(
    value=RECALCULATION,
    description='Recalculation ?',
    disabled=False,
    button_style='', # 'success', 'info', 'warning', 'danger' or ''
    tooltip='Click for recalculation',
    icon='cogs' # (FontAwesome names without the `fa-` prefix)
)
display(recalculation_widget)

ToggleButton(value=False, description='Recalculation ?', icon='cogs', tooltip='Click for recalculation')

## Instanciating the builder object

In [19]:
import os
# this tells you where the notebook thinks “.” is
print("cwd:", os.getcwd())
# make sure this is /home/user_stel/AISB/Project (or wherever your src/ lives)


cwd: /home/user_stel/AISB/Project/notebooks


In [20]:
import sys
PROJECT_ROOT = "/home/user_stel/AISB/Project"   # <— adjust if your path is different
if PROJECT_ROOT not in sys.path:
    sys.path.insert(0, PROJECT_ROOT)

# verify
print("sys.path[0] =", sys.path[0])
print("src/ exists?", os.path.isdir(sys.path[0] + "/src"))


sys.path[0] = /home/user_stel/AISB/Project
src/ exists? True


In [21]:
import sys, os
# assume this notebook lives in Project/notebooks/
PROJECT_ROOT = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
SRC_DIR      = os.path.join(PROJECT_ROOT, "src")

# add src/ to sys.path so Python can find your packages
sys.path.insert(0, SRC_DIR)

# sanity check
print("Project root:", PROJECT_ROOT)
print("Found src at:", os.path.isdir(SRC_DIR))
print("Found builder at:", os.path.isdir(os.path.join(SRC_DIR, "builder")))

# now import!
from builder.Builder import Builder


Project root: /home/user_stel/AISB/Project
Found src at: True
Found builder at: True


In [22]:
from builder.Builder import Builder


# assuming SETUP is the dict you built earlier
builder = Builder(
    SETUP,
    recalculate=False,  # False if you only want to download / preprocess data once
    update=True,        # True if you want to re‐download or overwrite existing files
    notebook=False,     # Set True if you want progress bars inline in a Jupyter notebook
    core=4              # Number of parallel workers (match your CPU count / preferences)
)

notebook = False
notebook = False
notebook = False


In [23]:
import builder.Builder as builderEngine
importlib.reload(builderEngine)

import pepr2ds.builder.Builder as builderEngine
builder = builderEngine.Builder(SETUP, recalculate = recalculation_widget.value, update=False, notebook = True, core=1)


importlib.reload(builderEngine)
builder = builderEngine.Builder(SETUP, recalculate = recalculation_widget.value, update=False, notebook = True, core=1)