In [8]:
import subprocess
from pathlib import Path

import numpy as np
import pandas as pd
import polars as pl

In [5]:
# Directory holding the sprhea CSV files read below.
# Alternative location: "/home/stef/quest_data/hiec/data/sprhea"
dir = "/projects/p30041/spn1560/hiec/data/sprhea"

# Read the tab-separated table and append each sequence's character count
# as a "Seq_Len" column, in a single chained expression.
seq_len_expr = pl.col("Sequence").str.len_chars().alias("Seq_Len")
df = pl.read_csv(f"{dir}/v3_folded_pt_ns.csv", separator='\t').with_columns(seq_len_expr)
df.head()

Entry,Label,Sequence,Seq_Len
str,str,str,u32
"""P0A6W3""","""1123""","""MLVWLAEHLVKYYSGFNVFSYLTFRAIVSL…",360
"""P9WMW7""","""1123""","""MRQILIAVAVAVTVSILLTPVLIRLFTKQG…",359
"""O66465""","""1123""","""MLYQLALLLKDYWFAFNVLKYITFRSFTAV…",359
"""Q8MJ30""","""3097;7946""","""MAAAAAGEARRVLVYGGRGALGSRCVQAFR…",243
"""P38489""","""3097;7946""","""MDIISVALKRHSTKAFDASKKLTPEQAEQI…",217


In [9]:
# Read v3_folded_pt_ns.csv with pandas and add a "seq_len" column holding
# the string length of every "Sequence" value.
df = (
    pd.read_csv(f"{dir}/v3_folded_pt_ns.csv", sep='\t')
    .assign(seq_len=lambda frame: frame['Sequence'].str.len())
)
df.head()

Unnamed: 0,Entry,Label,Sequence,seq_len
0,P0A6W3,1123,MLVWLAEHLVKYYSGFNVFSYLTFRAIVSLLTALFISLWMGPRMIA...,360
1,P9WMW7,1123,MRQILIAVAVAVTVSILLTPVLIRLFTKQGFGHQIREDGPPSHHTK...,359
2,O66465,1123,MLYQLALLLKDYWFAFNVLKYITFRSFTAVLIAFFLTLVLSPSFIN...,359
3,Q8MJ30,3097;7946,MAAAAAGEARRVLVYGGRGALGSRCVQAFRARNWWVASIDVVENEE...,243
4,P38489,3097;7946,MDIISVALKRHSTKAFDASKKLTPEQAEQIKTLLQYSPSSTNSQPW...,217


In [12]:
# Load the tab-separated n_100 table into pandas and preview the first rows.
n_100_csv = f"{dir}/v3_folded_n_100.csv"
df = pd.read_csv(n_100_csv, sep='\t')
df.head()

Unnamed: 0,Entry,Label,Sequence
0,Q569C4,10602;4644;2726;3150,MGTALVYHEDMTATRLLWDDPECEIECPERLTAALDGLRQRGLEER...
1,P58466,337;245,MDSSAVITQISKEEARGPLRGKGDQKSAVSQKPRSRGILHSLFCCV...
2,G2IJ05,7504,MAKSLQDVLDNAGNAVDFLRNQQTGPNVYPGVPAEYSNWRNEQRAW...
3,O23732,1462,MVGGCSSLSYSSSSTFIATTTLSSSLKLNPQSFIFHLNLRKRPPLR...
4,Q9D291,2838,MGANQLVVLNVYDMYWMNEYTSSIGIGVFHSGIEVYGREFAYGGHP...


In [10]:
# List the "Entry" ids whose sequence is shorter than 650 characters.
# The length is computed directly from "Sequence" so the cell does not depend
# on a "seq_len" column that only one of the earlier load cells creates.
df.loc[df['Sequence'].str.len() < 650, 'Entry'].to_list()

['P0A6W3',
 'P9WMW7',
 'O66465',
 'Q8MJ30',
 'P38489',
 'P09417',
 'Q3T0Z7',
 'Q8BVI4',
 'Q86A17',
 'P11348',
 'Q8R1K4',
 'Q8IUZ5',
 'Q46222',
 'B0B9V8',
 'F0T4D1',
 'A0A7W3N5X5',
 'Q9V6G5',
 'P97812',
 'Q98938',
 'O43323',
 'Q62226',
 'Q98862',
 'Q02936',
 'Q91610',
 'Q91035',
 'Q92000',
 'Q91611',
 'Q90385',
 'Q92008',
 'Q91612',
 'Q61488',
 'Q15465',
 'Q14623',
 'Q63673',
 'P54857',
 'Q16842',
 'Q6KB59',
 'Q11201',
 'Q6KB58',
 'Q11204',
 'P54751',
 'Q11205',
 'Q8K274',
 'Q9HA64',
 'Q9ER35',
 'Q9H479',
 'Q9EQC4',
 'Q9GZR5',
 'Q95K73',
 'P31210',
 'Q9TV64',
 'Q8VCX1',
 'P51857',
 'Q8NLB6',
 'Q9F131',
 'Q0QFQ1',
 'Q3S4B7',
 'Q5EXK1',
 'W8JWW7',
 'A8E5V7',
 'Q9FLF7',
 'Q9VSY4',
 'Q8CIB9',
 'Q9QXS4',
 'Q9C666',
 'Q6MG11',
 'P39979',
 'Q5R816',
 'A3KPA3',
 'A7YY28',
 'Q5SPR8',
 'P09163',
 'Q07794',
 'E0CYR6',
 'Q5XI06',
 'Q9UUK2',
 'Q6DGG1',
 'D3G9N3',
 'Q00403',
 'Q9ZV06',
 'Q9Y7Y5',
 'Q810T5',
 'Q5DD96',
 'Q96IU4',
 'Q5SQI0',
 'A0A364LXP7',
 'Q8VCR7',
 'P40963',
 'P62915',
 'P37664',
 '

In [7]:
# Count the rows whose "Seq_Len" is strictly below the upper bound `ub`.
ub = 650
short_enough = pl.col("Seq_Len") < ub
n_valid = df.filter(short_enough).height
print(f"Number of proteins with length < {ub}: {n_valid}")

Number of proteins with length < 650: 20156


In [5]:
# Collect every value of the "Entry" column into a plain list and peek at it.
entry_column = df["Entry"]
upids = entry_column.to_list()
upids[:5]

['P0A6W3', 'P9WMW7', 'O66465', 'Q8MJ30', 'P38489']

In [8]:
# The file stem (name without ".pt") of every *.pt file under <dir>/esm2.
esm2_dir = Path(f"{dir}/esm2")
esm2_upids = [pt_file.stem for pt_file in esm2_dir.glob("*.pt")]
esm2_upids[:5]

['O59584', 'P11245', 'P19985', 'Q66LN0', 'P17443']

In [11]:
# Ids present in `upids` that have no matching entry in `esm2_upids`.
blacklist_adds = set(upids).difference(esm2_upids)
print(f"Blacklist adds: {len(blacklist_adds)}")

Blacklist adds: 0


In [4]:
# Build one gs:// object path per id in `upids`, then write them to a text
# file separated by newlines (no trailing newline after the last path).
file_paths = [
    f"gs://public-datasets-deepmind-alphafold-v4/AF-{upid}-F1-model_v4.cif"
    for upid in upids
]
output_file = 'uniprot_cif_paths.txt'
joined_paths = '\n'.join(file_paths)
with open(output_file, 'w') as file:
    file.write(joined_paths)

In [20]:
# Read the listing file line by line, stripping surrounding whitespace from
# each entry, then report how many entries were read and show the first few.
retrieved_path = "/home/stef/quest_data/hiec/data/sprhea/af2_embeds.txt"
with open(retrieved_path, 'r') as file:
    retrieved_af2 = [line.strip() for line in file]

print(f"Retrieved {len(retrieved_af2)} AF2 paths")
retrieved_af2[:5]

Retrieved 23940 AF2 paths


['AF-A0A009IHW8-F1-model_v4.cif',
 'AF-A0A017SPL2-F1-model_v4.cif',
 'AF-A0A023GS28-F1-model_v4.cif',
 'AF-A0A023W421-F1-model_v4.cif',
 'AF-A0A023YYV9-F1-model_v4.cif']

In [21]:
# Pull the second dash-separated field out of every retrieved entry
# (e.g. "AF-A0A009IHW8-F1-model_v4.cif" -> "A0A009IHW8").
tmp = []
for elt in retrieved_af2:
    split_elt = elt.split('-')
    try:
        tmp.append(split_elt[1])
    except IndexError:
        # The entry contains no dash, so there is no second field. Catch only
        # this case so unrelated errors and interrupts are not swallowed.
        print(f"Could not split {elt}")

Could not split af2_embeds.txt


In [22]:
# Ids from `upids` that are absent from the extracted ids in `tmp`.
# Note: `retrieved_af2` is rebound here from whatever it held before to a set.
retrieved_af2 = set(tmp)
full_upids = set(upids)
blacklist = full_upids.difference(retrieved_af2)
print(f"Found {len(blacklist)} missing AF2 embeddings")

Found 584 missing AF2 embeddings


In [23]:
# Write one blacklisted id per line. The ids are written in sorted order
# because iterating a set of strings yields a different order from one
# interpreter session to the next, which would make the file content unstable.
with open("/home/stef/quest_data/hiec/data/sprhea/af2_blacklist.txt", 'w') as file:
    for item in sorted(blacklist):
        file.write(f"{item}\n")

In [4]:
# Load the tab-separated n_100 table with polars and preview the first rows.
n_100_csv = f"{dir}/v3_folded_n_100.csv"
df = pl.read_csv(n_100_csv, separator='\t')
df.head()

Entry,Label,Sequence
str,str,str
"""Q569C4""","""10602;4644;2726;3150""","""MGTALVYHEDMTATRLLWDDPECEIECPER…"
"""P58466""","""337;245""","""MDSSAVITQISKEEARGPLRGKGDQKSAVS…"
"""G2IJ05""","""7504""","""MAKSLQDVLDNAGNAVDFLRNQQTGPNVYP…"
"""O23732""","""1462""","""MVGGCSSLSYSSSSTFIATTTLSSSLKLNP…"
"""Q9D291""","""2838""","""MGANQLVVLNVYDMYWMNEYTSSIGIGVFH…"


In [5]:
# Every value of the "Entry" column as a plain Python list.
entry_series = df['Entry']
sub_entries = entry_series.to_list()

In [6]:
# Write each id in `sub_entries` on its own newline-terminated line.
with open(f"{dir}/n_100_upids.txt", 'w') as file:
    file.writelines(f"{item}\n" for item in sub_entries)

In [7]:
# Point `dir` at the 3-fold split directory and load its test partition.
# NOTE: this rebinds `dir`, so cells above that build paths from `dir` will
# resolve against this directory if re-run afterwards.
# `pd` and `np` come from the import cell at the top of the notebook.
dir = "/home/stef/quest_data/hiec/scratch/sprhea_v3_folded_n_100_esm2/random_reaction_center_alternate_reaction_center/3fold"
df = pd.read_parquet(f"{dir}/test.parquet")
df.head()

Unnamed: 0,protein_idx,reaction_idx,pid,rid,protein_embedding,smarts,am_smarts,reaction_center,y
0,6,10,Q13946,2777,"[[0.11307454, -0.1326546, 0.046492204, 0.12865...",Nc1ncnc2c1ncn2C1OC(COP(=O)(O)O)C(O)C1O>>Nc1ncn...,[CH:1]1([OH:23])[CH:8]([CH2:7][O:4][P:3]([OH:2...,"[[[17, 19, 20]], [[19, 18], [0]]]",1
1,8,12,C0JB27,119,"[[0.11307454, -0.1326546, 0.046492204, 0.12865...",*C(=O)NC(COP(=O)(O)OCC[N+](C)(C)C)C(*)O>>C[N+]...,[O:1]([CH2:2][CH2:3][N+:4]([CH3:5])([CH3:6])[C...,"[[[19, 10, 7]], [[6], [10, 7]]]",1
2,8,13,C0JB27,53,"[[0.11307454, -0.1326546, 0.046492204, 0.12865...",NCCO.*C(=O)NC1COP(=O)(O)OC1*>>*C(=O)NC(COP(=O)...,[OH:2][CH2:4][CH2:5][NH2:6].[O:1]1[P:3](=[O:16...,"[[[3], [10, 7]], [[16, 10, 7]]]",1
3,8,14,C0JB27,632,"[[0.11307454, -0.1326546, 0.046492204, 0.12865...",C[N+](C)(C)CCO.*C(=O)OCC1COP(=O)(O)O1>>*C(=O)O...,[OH:2][CH2:4][CH2:5][N+:6]([CH3:7])([CH3:8])[C...,"[[[6], [11, 8]], [[6, 12, 9]]]",1
4,8,15,C0JB27,581,"[[0.11307454, -0.1326546, 0.046492204, 0.12865...",*C(=O)OCC(O)COP(=O)(O)OCCN>>NCCO.*C(=O)OCC1COP...,[O:1]([CH2:2][CH2:3][NH2:4])[P:6]([O:11][CH2:9...,"[[[6, 12, 9]], [[3], [11, 8]]]",1


In [9]:
# Stack the row-0 "protein_embedding" entries vertically and show the
# resulting array's shape.
first_embedding = df.loc[0, 'protein_embedding']
np.vstack(first_embedding).shape

(252, 1280)