In [1]:
import os
import re
import requests
from bs4 import BeautifulSoup
import pandas as pd
from datetime import date, timedelta

# CASP round number; interpolated into the predictioncenter.org URLs built by
# get_target_info and get_target_sequence.
casp_round = 16

def get_target_info(entry_date=None, ignore_list=('rna', 'server')):
    """Fetch the CASP target list and return the targets released on one day.

    Args:
        entry_date: Release date to select, as a 'YYYY-MM-DD' string compared
            against the 'Entry Date' column. Defaults to yesterday's date.
        ignore_list: Keywords matched case-insensitively (as regular
            expressions) against the 'Type' column; a target whose type
            contains any of them is dropped. Pass None or an empty sequence
            to keep every type.

    Returns:
        pandas.DataFrame with the rows whose 'Entry Date' equals entry_date,
        minus ignored types and ids containing a lowercase 's', and with all
        non-alphanumeric characters removed from 'Tar-id'.

    Raises:
        requests.HTTPError: if the target list page answers with an error status.
        requests.Timeout: if the server does not answer within 30 seconds.
    """
    # Local import so this function does not depend on an edit to the import cell.
    from io import StringIO

    target_list_url = f'https://predictioncenter.org/casp{casp_round}/targetlist.cgi'
    # Without a timeout, requests can block forever on an unresponsive server.
    response = requests.get(target_list_url, timeout=30)
    response.raise_for_status()

    soup = BeautifulSoup(response.text, 'html.parser')
    table = soup.find('table')
    # pandas deprecated passing literal HTML to read_html, so wrap it in StringIO.
    # data is stored in 5-level nested table, hence index 4
    df = pd.read_html(StringIO(str(table)), match='Tar-id', header=0)[4]

    entry_date = entry_date if entry_date else (date.today() - timedelta(days=1)).strftime("%Y-%m-%d")
    targets = df[df['Entry Date'] == entry_date]

    # ignore all targets that belong to categories in the ignore list
    if ignore_list:
        for keyword in ignore_list:
            # na=False: a missing 'Type' counts as "no match" instead of
            # producing a NaN mask that cannot be negated.
            targets = targets[~targets['Type'].str.contains(keyword, case=False, na=False)]

    # drop all subunits (ids containing a lowercase 's'; case sensitive)
    targets = targets[~targets['Tar-id'].str.contains('s', case=True, na=False)]

    # Work on an independent frame so the column assignment below does not
    # trigger SettingWithCopyWarning on a filtered slice of df.
    targets = targets.copy()

    # remove CAPRI target annotations
    targets['Tar-id'] = targets['Tar-id'].str.replace('[^a-zA-Z0-9]', '', regex=True)

    return targets

def save_subunits(target_id, sequences, save_fn='subunits.fasta'):
    """Write every sequence as its own FASTA record inside the target directory.

    Records are headed '>subunit1', '>subunit2', ... in input order. The file
    is opened at <target_id>/<save_fn>; the directory is not created here.
    """
    save_path = os.path.join(target_id, save_fn)

    with open(save_path, 'w') as fasta_file:
        for number, residues in enumerate(sequences, start=1):
            # header line, sequence line, terminating newline
            fasta_file.writelines((f'>subunit{number}\n', residues, '\n'))

    print(f'Saved subunit sequences to\t{save_path}')
            
def save_target(target_id, sequences, stoichiometry_info, save_fn='target.fasta'):
    """Write the full complex as one FASTA record with ':'-separated chains.

    The stoichiometry string is read as letter/count pairs (e.g. 'A2B1'),
    where letter 'A' refers to sequences[0], 'B' to sequences[1], and so on.
    Chains are emitted round-robin: one copy of every subunit that still has
    copies left, then the next round, so 'A2B1' yields 'A:B:A'.

    Args:
        target_id: Used both as the output directory and as the FASTA header.
        sequences: Subunit sequences, indexed by stoichiometry letter.
        stoichiometry_info: String such as 'A1' or 'A2B1'.
        save_fn: File name inside the target directory.

    Raises:
        ValueError: if stoichiometry_info contains no letter/count pair.
        IndexError: if a letter refers to a subunit missing from sequences.
    """
    save_path = os.path.join(target_id, save_fn)

    matches = re.findall(r'([A-Z])(\d+)', stoichiometry_info)
    if not matches:
        raise ValueError(
            f'No subunit counts found in stoichiometry {stoichiometry_info!r}'
        )
    counts = [(ord(letter) - ord('A'), int(count)) for letter, count in matches]
    max_count = max(count for _, count in counts)

    # Assemble every chain before touching the disk, so a bad stoichiometry
    # cannot leave a truncated file behind.
    chains = []
    for i in range(max_count):
        for subunit_id, count in counts:
            if i < count:
                if subunit_id >= len(sequences):
                    raise IndexError(
                        f'Stoichiometry {stoichiometry_info!r} refers to subunit '
                        f'{chr(subunit_id + ord("A"))} but only {len(sequences)} '
                        f'sequence(s) are available'
                    )
                chains.append(sequences[subunit_id])

    with open(save_path, 'w') as f:
        f.write(f'>{target_id}\n')
        f.write(':'.join(chains))

    print(f'Saved target sequence to\t{save_path}')

def get_target_sequence(target_id, stoichiometry='A1'):
    """Download a target's sequence page and write its FASTA file(s).

    Every non-blank, non-header line of the response is taken as one subunit
    sequence, in page order. A directory named after the target is created
    if needed, and the output depends on the stoichiometry:

    * 'A1'  -> target.fasta only
    * 'UNK' -> subunits.fasta only
    * other -> subunits.fasta, then target.fasta

    Args:
        target_id: CASP target id; also used as the output directory name.
        stoichiometry: Stoichiometry string such as 'A1', 'A2B1' or 'UNK'.

    Raises:
        requests.HTTPError: if the sequence page answers with an error status.
        requests.Timeout: if the server does not answer within 30 seconds.
    """
    print('-'*50)
    print(f'TARGET\t\t{target_id}')
    print(f'STOICHIOMETRY\t{stoichiometry}')

    target_url = f'https://predictioncenter.org/casp{casp_round}/target.cgi?target={target_id}&view=sequence'
    # Without a timeout, requests can block forever on an unresponsive server.
    response = requests.get(target_url, timeout=30)
    response.raise_for_status()

    os.makedirs(target_id, exist_ok=True)

    # Filter out all headers and blank lines. Lines are stripped first so that
    # CRLF line endings or stray padding never end up inside a sequence.
    stripped_lines = (line.strip() for line in response.text.splitlines())
    sequences = [line for line in stripped_lines if line and not line.startswith('>')]

    if stoichiometry == 'A1':
        save_target(target_id, sequences, stoichiometry)
    elif stoichiometry == 'UNK':
        # No stoichiometry to assemble a complex from; keep the subunits only.
        save_subunits(target_id, sequences)
    else:
        save_subunits(target_id, sequences)
        save_target(target_id, sequences, stoichiometry)

In [2]:
# Fetch the targets released yesterday (the default entry date).
new_targets = get_target_info()
# To fetch a specific release day instead, pass the date explicitly, e.g.:
#new_targets = get_target_info('2024-05-07')
# Show the selected targets as the cell output.
new_targets

Unnamed: 0,#,Tar-id,Type,Res,Stoichiom.,Entry Date,Server Expiration,Ligand Deadline,Human Expiration,QA Prediction,Description
17,18.0,T1210,All groups,1770,A1,2024-05-09,2024-05-12,2024-05-30,2024-05-30,-,Q868N5


In [3]:
# Download the sequence(s) of every selected target. The stoichiometry cell is
# trimmed and has its spaces removed before being passed on.
for target_id, raw_stoichiometry in zip(new_targets['Tar-id'], new_targets['Stoichiom.']):
    stoichiometry = raw_stoichiometry.strip().replace(' ', '')
    get_target_sequence(target_id, stoichiometry)

--------------------------------------------------
TARGET		T1210
STOICHIOMETRY	A1
Saved target sequence to	T1210/target.fasta
