In [12]:
import copy
import json
import os
from pathlib import Path
import random

from Bio import SeqIO

# Make the repository root the working directory so the project-local import
# below resolves when the notebook is launched from its own folder.
cwd = os.getcwd()
# Compare path components rather than a raw string suffix: this is independent
# of the OS path separator and does not match e.g. ".../my_notebook/PG".
if Path(cwd).parts[-2:] == ('notebook', 'PG'):
    os.chdir('../..')
    cwd = os.getcwd()

from src.pg_modelling.ligand_utils import sanitize_protein_id

In [13]:
# Folder holding the input FASTA file and the generated AlphaFold 3 inputs.
# NOTE(review): user-specific location under the home directory; consider
# moving it to a configurable constant.
paulines_folder = Path(os.path.expanduser('~'), 'Documents', 'Pauline')
# Fail early if the folder is missing rather than part-way through the run.
assert paulines_folder.is_dir()

In [14]:
def gen_model_seeds(n, low=1, high=100):
    """Return ``n`` distinct random integer seeds drawn from ``[low, high)``.

    Seeds are sampled without replacement, so the returned list never contains
    the same seed twice (a repeated seed would only repeat the same run).

    Args:
        n: Number of seeds to generate. Values <= 0 yield an empty list.
        low: Smallest seed that may be returned (inclusive).
        high: Upper bound of the seed range (exclusive).

    Returns:
        A list of ``n`` unique ints in random order.

    Raises:
        ValueError: If ``n`` is larger than the number of integers available
            in ``[low, high)``.
    """
    if n <= 0:
        return []
    # random.sample draws without replacement, which guarantees uniqueness.
    return random.sample(range(low, high), n)

In [17]:
# Template shared by every generated AlphaFold 3 input file. The model seeds
# are drawn once here, so all proteins are written with the same seeds.
json_data = {
    'sequences': [],
    'modelSeeds': gen_model_seeds(3),
    'dialect': 'alphafold3',
    'version': 1,
}

# Create the output folder up front; otherwise the first write below raises
# FileNotFoundError when the folder does not exist yet.
output_folder = paulines_folder / 'af3_json_inputs'
output_folder.mkdir(parents=True, exist_ok=True)

fasta_path = paulines_folder / 'HNS_Hfp_JGLRI_DBProka_Misson_Fitzgerald_unique.faa'

# Write one JSON input file per FASTA record, each holding a single protein chain.
for record in SeqIO.parse(fasta_path, 'fasta'):
    # Deep copy so appending to 'sequences' never mutates the shared template.
    data = copy.deepcopy(json_data)

    name = sanitize_protein_id(record.id)
    data['name'] = name

    # Drop one trailing '*' (stop marker), then reject any '*' that is left.
    # The second check is deliberately not an `elif`: a sequence such as
    # 'MK*LL*' must still be rejected after its trailing '*' is removed.
    sequence = str(record.seq).upper()
    if sequence.endswith('*'):
        sequence = sequence[:-1]
    if '*' in sequence:
        raise ValueError('* in the middle of the sequence')

    sequence_entry = {
        'protein': {
            'id': 'A',
            'sequence': sequence,
        },
    }
    data['sequences'].append(sequence_entry)

    path = output_folder / f'{name}.json'

    with path.open('w') as f_out:
        # indent=1 produces the same one-space indentation as the former
        # `indent=True`, stated explicitly.
        json.dump(
            data,
            f_out,
            indent=1,
        )