In [2]:
import pandas as pd
import pickle as pkl
import os
from bioservices import UniProt
import requests


## Loading HotProtein Dataset Info

Opening the S_target pickle files

In [5]:
# Keys pulled out of each pickled object to build the per-split frames.
train_columns = ['train_names', 'train_labels']
test_columns = ['test_names', 'test_labels']

# NOTE: pickle.load can execute arbitrary code; only open files from a trusted source.
with open('S/S_target.pkl', 'rb') as f:
    S_target = pkl.load(f)
# One frame per split, with one column for each of the selected keys.
S_target_train = pd.DataFrame(dict((column, S_target[column]) for column in train_columns))
S_target_test = pd.DataFrame(dict((column, S_target[column]) for column in test_columns))

# Same layout, read from the classification pickle.
with open('S/S_target_classification.pkl', 'rb') as f:
    S_target_classification = pkl.load(f)
S_target_train_classification = pd.DataFrame(
    dict((column, S_target_classification[column]) for column in train_columns)
)
S_target_test_classification = pd.DataFrame(
    dict((column, S_target_classification[column]) for column in test_columns)
)

# Displayed as the cell output: how many training entries carry each label value.
S_target_train_classification['train_labels'].value_counts()


train_labels
3    63126
1    28002
4    25310
2    24286
0     5120
Name: count, dtype: int64

Keep only the S_target entries that have a downloaded AF2 structure.

In [6]:
# Rename the split-specific columns to shared names so train and test can be stacked.
train_renames = {'train_names': 'names', 'train_labels': 'labels'}
test_renames = {'test_names': 'names', 'test_labels': 'labels'}

S_target_concat = pd.concat([
    S_target_train.rename(columns=train_renames),
    S_target_test.rename(columns=test_renames),
])
S_target_classification_concat = pd.concat([
    S_target_train_classification.rename(columns=train_renames),
    S_target_test_classification.rename(columns=test_renames),
])

# Each file name in the CIF directory is split on '-' and the second token is kept.
# NOTE(review): presumably the files are named 'AF-<accession>-...' — confirm.
af2_cif_files = os.listdir("/stor/work/Ellington/ProteinMPNN/HotProtein/S/AF2/CIF")
S_target_AF2 = pd.Series(af2_cif_files).str.split('-').str[1]

# Names present in the stacked frame that have no matching structure file.
has_af2_structure = S_target_concat['names'].isin(S_target_AF2)
S_target_names_not_in_AF2 = S_target_concat[~has_af2_structure]['names']

# Drop those names from both the regression-style and the classification frames.
missing_in_target = S_target_concat['names'].isin(S_target_names_not_in_AF2)
missing_in_classification = S_target_classification_concat['names'].isin(S_target_names_not_in_AF2)
S_target_concat = S_target_concat[~missing_in_target]
S_target_classification_concat = S_target_classification_concat[~missing_in_classification]


## Loading ProteinMPNN Training Dataset Info

We want to ultimately remove HotProtein entries that are represented in the ProteinMPNN training dataset, to eliminate data leakage.

Step 1: Map ProteinMPNN PDB IDs to UniProt Accessions

In [7]:
# Map every PDB ID in the ProteinMPNN training list to a UniProt accession and
# write the accessions to a text file, one per line.
accessions_path = '/stor/work/Ellington/ProteinMPNN/HotProtein/ProteinMPNN_TrainingData_UniProtAccessions.txt'
checkpoint_every = 100  # flush to disk each time this many new accessions have been collected


def _write_accessions(path, items):
    """Overwrite ``path`` with one entry of ``items`` per line."""
    with open(path, 'w') as f:
        f.writelines(f"{item}\n" for item in items)


proteinmpnn_data = pd.read_csv('/stor/work/Ellington/ProteinMPNN/training/ProteinMPNN_training_data/pdb_2021aug02/list.csv')
# CHAINID is split on '_' and the first token is kept, deduplicated, then upper-cased.
proteinmpnn_data = proteinmpnn_data['CHAINID'].str.split('_').str[0].unique()
proteinmpnn_data = [x.upper() for x in proteinmpnn_data]
accessions = []

uniprot = UniProt()
try:
    for pdb in proteinmpnn_data:
        results = uniprot.mapping(fr="PDB", to="UniProtKB", query=pdb)  # ACC for UniProtKB AC (identifier)
        # Tolerate a response without a 'results' entry instead of raising KeyError.
        hits = results.get('results') if isinstance(results, dict) else None
        if not hits:
            continue
        # NOTE(review): only the first hit is recorded. If a PDB entry maps to
        # several accessions the others are dropped, which could hide overlap
        # with HotProtein — confirm this is intended.
        accessions.append(hits[0]['to']['primaryAccession'])
        # Checkpoint only when the list has just grown to a multiple of the
        # interval, rather than rewriting the file on every unmapped ID.
        if len(accessions) % checkpoint_every == 0:
            _write_accessions(accessions_path, accessions)
finally:
    # Runs on normal completion and on interruption/errors alike, so whatever
    # has been collected so far is always persisted.
    _write_accessions(accessions_path, accessions)

0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]


KeyboardInterrupt: 

## Does the ProteinMPNN training dataset overlap with HotProtein?

Given the downloaded UniProt IDs mapped to the ProteinMPNN training data, does it overlap with HotProtein in...
* ID?
* Sequence? (requires MMSeqs2 and FASTA files for ProteinMPNN training data)

In [11]:
# Count HotProtein entries whose name appears in the ProteinMPNN accession list.
# NOTE(review): this reads the '..._Unique_UniProtAccessions.txt' file, which is not
# written anywhere in the visible notebook (the mapping cell writes
# '..._UniProtAccessions.txt'); presumably it was deduplicated outside the
# notebook — confirm and record how it was produced.
mpnn_uniprot = pd.read_csv('/stor/work/Ellington/ProteinMPNN/HotProtein/ProteinMPNN_TrainingData_Unique_UniProtAccessions.txt', header=None)

# header=None means the single column of accessions is labelled 0.
overlaps = S_target_concat[S_target_concat['names'].isin(mpnn_uniprot[0])]['names']
print(len(overlaps))


526


526 structures directly overlap between the two sets. We'll mark these structures and move on to sequence fetching.

Now, we want to see if the structures from the MPNN set overlap in sequence content. This will be done via MMSeqs2. First, we need to acquire the FASTA sequences from the MPNN dataset (~40k structures).

In [None]:
# Download one FASTA file per accession in mpnn_uniprot[0] from the UniProt REST API.
mpnn_fasta_directory = '/stor/work/Ellington/ProteinMPNN/HotProtein/ProteinMPNN_TrainingData_Fasta'
# Create the target directory up front; otherwise a missing folder would surface
# as a separate write error for every single accession.
os.makedirs(mpnn_fasta_directory, exist_ok=True)

# Base URL for UniProt REST API
base_url = "https://rest.uniprot.org/uniprotkb/"

# Seconds to wait on a request; without a timeout a stalled connection would
# block the whole loop indefinitely.
request_timeout = 30

# Retrieve and save Fasta files, reusing one HTTP session across all requests.
with requests.Session() as session:
    for accession in mpnn_uniprot[0]:
        url = f"{base_url}{accession}.fasta"
        try:
            response = session.get(url, timeout=request_timeout)
            response.raise_for_status()  # Raise an exception for error status codes
            with open(f"{mpnn_fasta_directory}/{accession}.fasta", "w") as f:
                f.write(response.text)
            print(f"Saved Fasta file for {accession}")
        except requests.exceptions.RequestException as e:
            # Network problems, timeouts and HTTP error statuses.
            print(f"Error retrieving {accession}: {e}")
        except Exception as e:
            # Anything else (e.g. a failed file write): report it under a
            # distinct message and keep going with the next accession.
            print(f"Error processing {accession}: {e}")


Organizing the fasta files for each split into subdirectories

s2c2: temperature bins [<45, >45] map to class groups [[0,1,2], [3,4]]

s2c5: temperature bins [-20:5, 5:25, 25:45, 45:75, >75] map to classes [0, 1, 2, 3, 4]

In [4]:
# Scratch probe: map a single PDB ID from proteinmpnn_data to a UniProt accession.
# NOTE(review): this cell uses `proteinmpnn_data` and `accessions`, which are
# defined in the mapping cell, yet its execution count is lower than that
# cell's — it will not run on a fresh kernel in this position. The recorded
# run ended in an IndexError whose origin is not visible here; confirm whether
# it came from the list index or from inside the mapping call.
probe_index = 27
uniprot = UniProt()
if probe_index < len(proteinmpnn_data):
    results = uniprot.mapping(fr="PDB", to="UniProtKB", query=proteinmpnn_data[probe_index])  # ACC for UniProtKB AC (identifier)
    # Tolerate a response without a 'results' entry instead of raising KeyError.
    hits = results.get('results') if isinstance(results, dict) else None
    if hits:
        accession = hits[0]['to']['primaryAccession']
        accessions.append(accession)
else:
    # Report an out-of-range probe index instead of raising.
    print(f"proteinmpnn_data has {len(proteinmpnn_data)} entries; index {probe_index} is out of range")


0it [00:00, ?it/s]


IndexError: list index out of range