In [1]:
import os
from pathlib import Path

import pandas as pd
from Bio import SeqIO

# When the kernel starts in a directory whose name ends with 'notebook',
# work from its parent directory instead.
if Path.cwd().name.endswith('notebook'):
    os.chdir('..')

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [None]:
# Species folder name under base_folder; read by the FASTA path cell and,
# as a notebook-level global, by compute_score.
species = 'P_furiosus'

In [2]:
# Root data folder: <home>/Documents/afpd. The trailing expression displays
# whether that folder exists.
base_folder = Path(os.path.expanduser('~')).joinpath('Documents', 'afpd')
base_folder.is_dir()

True

In [6]:
# FASTA file for the selected species: <base_folder>/<species>/<species>.fasta.
# The trailing expression displays whether the file exists.
fasta_path = base_folder.joinpath(f'{species}', f'{species}.fasta')
fasta_path.is_file()

True

In [7]:
# Load every record of the species FASTA into a dict (SeqIO.to_dict's default
# keying); compute_score looks records up in it by target UniProt ID.
records_dict = SeqIO.to_dict(
    SeqIO.parse(fasta_path, format='fasta')
)
len(records_dict)

2036

In [8]:
def compute_score(bait_id):
    """Annotate one bait's pulldown scores with target IDs and descriptions.

    Reads ``<bait_id>_pulldown_scores.csv`` from the ``outputs`` folder of the
    notebook-level ``species`` (under ``base_folder``), adds two columns and
    writes the result beside it as ``<bait_id>_pulldown_scores_annotated.csv``
    (without the index):

    * ``target_uniprot_id`` -- the part of ``id`` after the last ``'__'``.
    * ``target_description`` -- the ``description`` of the matching entry in
      ``records_dict`` with the ID text removed and whitespace stripped.

    Parameters
    ----------
    bait_id : str
        Bait identifier used as the prefix of the input and output file names.

    Returns
    -------
    None
        The annotated table is only written to disk.

    Raises
    ------
    KeyError
        If a target ID is not a key of ``records_dict``.
    """
    outputs_dir = base_folder / f'{species}' / 'outputs'

    def target_of(pair_id):
        # The target is the last '__'-separated component of the score id.
        return pair_id.split('__')[-1]

    def describe(uniprot_id):
        # Drop the ID text from the record description, keeping the remainder.
        return records_dict[uniprot_id].description.replace(uniprot_id, '').strip()

    annotated = pd.read_csv(outputs_dir / f'{bait_id}_pulldown_scores.csv')
    annotated['target_uniprot_id'] = annotated['id'].apply(target_of)
    annotated['target_description'] = annotated['target_uniprot_id'].apply(describe)

    annotated.to_csv(
        outputs_dir / f'{bait_id}_pulldown_scores_annotated.csv',
        index=False,
    )

In [9]:
# Baits to annotate for the current `species`: each single ID, then the
# '__'-joined pairs (each ID with itself, and the two IDs together).
bait_ids = [
    'I6V1B6', 'I6V3Z3',
    'I6V1B6__I6V1B6', 'I6V3Z3__I6V3Z3',
    'I6V1B6__I6V3Z3',
]

# Writes one <bait_id>_pulldown_scores_annotated.csv per bait.
for bait_id in bait_ids:
    compute_score(bait_id)

## Extract top hits

Hits with ipTM > 0.75

In [8]:
# Baits per species whose annotated pulldown scores are screened for top hits.
input_dict = {
    'T_kodakarensis': [
        'Q9Y8I1',
        'Q9Y8I2',
        'Q9Y8I1__Q9Y8I1',
        'Q9Y8I2__Q9Y8I2',
        'Q9Y8I1__Q9Y8I2',
    ],
    'P_furiosus': [
        'I6V1B6',
        'I6V3Z3',
        'I6V1B6__I6V1B6',
        'I6V3Z3__I6V3Z3',
        'I6V1B6__I6V3Z3',
    ],
}
# Exclusive ipTM cut-off: only rows with iptm strictly above it are kept.
threshold = 0.75

# The loop variables are deliberately NOT called `species` / `bait_ids`: those
# are notebook-level names that compute_score and the FASTA path cell read, and
# rebinding them here would silently change what those cells do when re-run.
for hit_species, hit_bait_ids in input_dict.items():
    top_hits = []
    for hit_bait_id in hit_bait_ids:
        scores = pd.read_csv(
            base_folder / hit_species / 'outputs' / f'{hit_bait_id}_pulldown_scores_annotated.csv'
        )

        # Hits are ordered by descending ipTM within each bait, and baits are
        # appended in the order listed above (no global re-sort).
        above_threshold = scores.loc[scores['iptm'] > threshold].sort_values('iptm', ascending=False)
        top_hits.extend(above_threshold['id'].tolist())

    # One id per line, no header and no index column.
    output = pd.DataFrame({'id': top_hits})
    output.to_csv(base_folder / hit_species / 'top_hit_ids.txt', header=False, index=False)
    print(f'Hits with ipTM > {threshold} for {hit_species}: {len(output):,}')

Hits with ipTM > 0.75 for T_kodakarensis: 31
Hits with ipTM > 0.75 for P_furiosus: 46
