In [59]:
import os
from pathlib import Path

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from Bio import SeqIO

sns.set_theme(palette='colorblind', font_scale=1.1)
palette = sns.color_palette().as_hex()

base_folder = Path(os.path.expanduser('~')) / 'workspace' / 'chahrazad'

## Extract relevant candidates from proteome

In [60]:
proteins = SeqIO.to_dict(SeqIO.parse(base_folder / 'mass_spec_output' / 'merged_contigs_S5a1.fa', 'fasta'))
len(proteins)

3898

In [66]:
mass_spec_df = pd.read_csv(base_folder / 'mass_spec_output' / '2023_03_11_quant_table_for_ID_fr_2.csv')
mass_spec_df['protein_id'] = mass_spec_df['PG.FastaHeaders'].apply(lambda v: v[1:].split(' ')[0].strip())
mass_spec_df['description'] = mass_spec_df['PG.FastaHeaders'].apply(lambda v: v.split(' ')[1].strip())
mass_spec_df.head()

Unnamed: 0.1,Unnamed: 0,SignalP,PG.FastaHeaders,PG.MolecularWeight,A_-6,A_-5,A_-4,A_-3,A_-2,A_-1,...,C_1,C_2,C_3,C_4,C_5,C_6,C_7,C_8,protein_id,description
0,,,>gnl|extdb|pgaptmp_000061_3 twin-arginine tran...,18418.65,0.498328,0.344635,0.163307,0.064468,0.028929,0.009794,...,0.007038,0.005093,0.002355,0.003894,0.002137,0.000726,0.001508,0.000953,gnl|extdb|pgaptmp_000061_3,twin-arginine
1,,,>gnl|extdb|pgaptmp_000109 50S ribosomal protei...,10599.02,0.0,0.0,0.0,0.0,0.0,0.0,...,0.310365,0.120477,0.0,0.0,0.0,0.0,0.0,0.0,gnl|extdb|pgaptmp_000109,50S
2,,,>gnl|extdb|pgaptmp_000146 multiprotein-bridgin...,18936.81,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,gnl|extdb|pgaptmp_000146,multiprotein-bridging
3,,,>gnl|extdb|pgaptmp_000165 CDC48 family AAA ATP...,82483.44,0.0,0.56985,1.0,0.596697,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,gnl|extdb|pgaptmp_000165,CDC48
4,,Sec/SPI,>gnl|extdb|pgaptmp_000187_2 hypothetical prote...,43409.58,0.7619,1.0,0.779876,0.312693,0.23329,0.119024,...,0.038653,0.022015,0.008348,0.0052,0.004847,0.003765,0.00309,0.00334,gnl|extdb|pgaptmp_000187_2,hypothetical


## Concatenate all-vs-all fasta files

`make_input_fasta.py` produces one file per bait. In this case it makes sense to pull them all into one file.

In [73]:
output_file = base_folder / 'S5a-1_candidate_analysis' / 'all_vs_all_pulldown.fasta'

all_records = []
for f in (base_folder / 'S5a-1_candidate_analysis').iterdir():
    if f.is_file() and f.name.endswith('.fasta') and f.name.startswith('gnl|extdb|'):
        for record in SeqIO.parse(f, 'fasta'):
            all_records.append(record)

with output_file.open('w') as f_out:
    SeqIO.write(all_records, f_out, 'fasta')