# FIRST OUTLINE THINGS TO DO
## Connect To API
## Pull all features
## Add feature to column in DF
## Write DF to an Excel-readable file (CSV)


In [1]:
import requests
import pandas as pd

# Query the RCSB PDB search API for the first page of entries that have a
# deposit date, and show their identifiers as a DataFrame.

# Define the API endpoint and query parameters
url = "https://search.rcsb.org/rcsbsearch/v2/query"
query = {
    "query": {
        "type": "terminal",
        "service": "text",
        "parameters": {
            "attribute": "rcsb_accession_info.deposit_date",
            "operator": "exists"
        }
    },
    "return_type": "entry",
    "request_options": {
        "paginate": {
            "start": 0,
            "rows": 100
        }
    }
}

# Send the request to the RCSB PDB API. The timeout keeps the cell from
# hanging indefinitely if the server never answers.
response = requests.post(url, json=query, timeout=30)

# Only parse the body when the request succeeded and actually carries one;
# calling .json() on an empty or error reply would raise instead of reaching
# the "result_set not found" branch below.
if response.ok and response.content:
    data = response.json()
else:
    print(f"Search request did not return results (HTTP {response.status_code}).")
    data = {}

# Check the structure of the response
print(data)

# Extract the list of PDB IDs if 'result_set' is present
if 'result_set' in data:
    pdb_ids = [entry['identifier'] for entry in data['result_set']]
    # Convert the list to a DataFrame
    df = pd.DataFrame(pdb_ids, columns=['PDB_ID'])
    # Filter out CSM PDBs (assuming CSM PDBs have a specific pattern in their IDs)
    # NOTE(review): this only drops IDs containing the literal text 'CSM' —
    # confirm that is how computed structure models are actually labelled.
    non_csm_df = df[~df['PDB_ID'].str.contains('CSM')]
    # Display the DataFrame (display() is provided by the notebook environment)
    display(non_csm_df)
else:
    print("The key 'result_set' was not found in the response.")


{'query_id': '222a8f80-e257-4782-b743-c7fc0f22d472', 'result_type': 'entry', 'total_count': 227111, 'result_set': [{'identifier': '100D', 'score': 1.0}, {'identifier': '101D', 'score': 1.0}, {'identifier': '101M', 'score': 1.0}, {'identifier': '102D', 'score': 1.0}, {'identifier': '102L', 'score': 1.0}, {'identifier': '102M', 'score': 1.0}, {'identifier': '103D', 'score': 1.0}, {'identifier': '103L', 'score': 1.0}, {'identifier': '103M', 'score': 1.0}, {'identifier': '104D', 'score': 1.0}, {'identifier': '104L', 'score': 1.0}, {'identifier': '104M', 'score': 1.0}, {'identifier': '105D', 'score': 1.0}, {'identifier': '105M', 'score': 1.0}, {'identifier': '106D', 'score': 1.0}, {'identifier': '106M', 'score': 1.0}, {'identifier': '107D', 'score': 1.0}, {'identifier': '107L', 'score': 1.0}, {'identifier': '107M', 'score': 1.0}, {'identifier': '108D', 'score': 1.0}, {'identifier': '108L', 'score': 1.0}, {'identifier': '108M', 'score': 1.0}, {'identifier': '109D', 'score': 1.0}, {'identifie

Unnamed: 0,PDB_ID
0,100D
1,101D
2,101M
3,102D
4,102L
...,...
95,140D
96,140L
97,141D
98,141L


In [2]:
import requests

def pdb_to_uniprot(pdb_id):
    """Look up the UniProt accessions mapped to a PDB entry via the PDBe API.

    Args:
        pdb_id: PDB entry identifier string (e.g. "6SJZ"). The response is
            looked up under the lower-cased form of this ID.

    Returns:
        A list of UniProt accession strings (empty when the response holds
        no UniProt mapping for this entry), or None when the request did not
        succeed (404 or any other non-200 status).
    """
    # Define the API endpoint for the specific PDB ID
    url = f"https://www.ebi.ac.uk/pdbe/api/mappings/uniprot/{pdb_id}"

    # Send the request to the PDBe API; the timeout stops one stalled
    # request from blocking a long batch run forever.
    response = requests.get(url, timeout=30)
    if response.status_code == 404:
        print(f"PDB ID {pdb_id} not found (HTTP Error 404).")
        return None
    if response.status_code != 200:
        # Any other failure (5xx, rate limiting, ...) has no usable mapping
        # payload, so report it instead of trying to parse the body.
        print(f"Request for PDB ID {pdb_id} failed (HTTP {response.status_code}).")
        return None

    content = response.json()

    # Extract UniProt IDs; tolerate a missing entry or missing 'UniProt'
    # section by falling back to an empty mapping.
    return list(content.get(pdb_id.lower(), {}).get('UniProt', {}))


In [3]:
import requests

def get_protein_info(uniprot_id):
    """Fetch the UniProtKB entry for an accession as parsed JSON.

    Args:
        uniprot_id: UniProt accession (e.g. "P30419").

    Returns:
        The decoded JSON entry (a dict), or None when the server does not
        answer with HTTP 200, so callers can tell a failed lookup apart from
        a real entry.
    """
    # Define the API endpoint for the specific UniProt ID
    url = f"https://rest.uniprot.org/uniprotkb/{uniprot_id}.json"

    # Send the request to the UniProt API
    response = requests.get(url, timeout=30)
    if response.status_code != 200:
        # An error reply would otherwise be handed back as if it were an
        # entry, and every field lookup on it would quietly yield 'N/A'.
        print(f"Failed to retrieve entry for UniProt ID {uniprot_id} (HTTP {response.status_code})")
        return None

    return response.json()

def get_protein_sequence(uniprot_id):
    """Return the amino-acid sequence string for a UniProt accession.

    Requests the entry's JSON from the UniProt REST API and reads
    ``sequence.value`` from it. Returns 'N/A' when the reply is not HTTP 200
    (after printing a message) or when the entry carries no sequence value.
    """
    reply = requests.get(f"https://rest.uniprot.org/uniprotkb/{uniprot_id}.json")

    if reply.status_code == 200:
        payload = reply.json()
        sequence_section = payload.get('sequence', {})
        return sequence_section.get('value', 'N/A')

    print(f"Failed to retrieve sequence for UniProt ID {uniprot_id}")
    return 'N/A'
        
def extract_ec_numbers(protein_info):
    """Collect EC numbers from the catalytic-activity comments of an entry.

    Args:
        protein_info: dict with an optional 'comments' list; each comment is
            a dict that may carry 'commentType' and a 'reaction' dict.

    Returns:
        A list with the truthy ``reaction['ecNumber']`` value of every
        'CATALYTIC ACTIVITY' comment, in comment order (duplicates kept).
    """
    catalytic_comments = (
        entry
        for entry in protein_info.get('comments', [])
        if entry.get('commentType') == 'CATALYTIC ACTIVITY'
    )
    candidates = (
        entry.get('reaction', {}).get('ecNumber') for entry in catalytic_comments
    )
    # Drop reactions that have no EC number assigned.
    return [ec for ec in candidates if ec]

def do_all(uniprot_id):
    """Fetch a UniProt entry and summarise the fields used in the output table.

    Args:
        uniprot_id: UniProt accession to look up.

    Returns:
        None when get_protein_info() yields nothing; otherwise a dict with
        the keys 'protein_name', 'organism', 'gene_name', 'ec_numbers'
        (a list) and 'sequence'. Missing text fields are reported as 'N/A'.
    """
    protein_info = get_protein_info(uniprot_id)
    if not protein_info:
        return None

    # Extract relevant information from the protein_info JSON
    description = protein_info.get('proteinDescription', {})
    protein_name = description.get('recommendedName', {}).get('fullName', {}).get('value', 'N/A')
    organism = protein_info.get('organism', {}).get('scientificName', 'N/A')

    # 'genes' can be absent or an empty list; fall back to a single empty
    # record so indexing [0] cannot raise IndexError.
    genes = protein_info.get('genes') or [{}]
    gene_name = genes[0].get('geneName', {}).get('value', 'N/A')

    ec_numbers = extract_ec_numbers(protein_info)

    # The entry already fetched above comes from the same URL that
    # get_protein_sequence() queries, so read the sequence from it rather
    # than issuing a second HTTP request per protein.
    sequence = protein_info.get('sequence', {}).get('value', 'N/A')

    return {
        'protein_name': protein_name,
        'organism': organism,
        'gene_name': gene_name,
        'ec_numbers': ec_numbers,
        'sequence': sequence
    }


In [4]:
!pip install kagglehub
import kagglehub

# Download latest version
path = kagglehub.dataset_download("samiraalipour/rcsb-pdb-macromolecular-structure-dataset")


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.2.1[0m[39;49m -> [0m[32;49m24.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [5]:
import pandas as pd

# Load the structure-data CSV and keep its "Entry ID" column for the
# later PDB -> UniProt lookups.
structure_csv = '/Users/carmenshero/Desktop/2/rcsb_pdb_Files/rcsb_pdb_Files/rcsb_pdb_StructureData.csv'
df = pd.read_csv(structure_csv)
df2 = df["Entry ID"]
print(df2)

0        6SJZ
1        5C37
2        6NNA
3        7M4C
4        7M4E
         ... 
11847    6T3M
11848    6WLX
11849    8D6J
11850    7C6B
11851    7D7U
Name: Entry ID, Length: 11852, dtype: object


In [10]:
import pandas as pd

# Trial run: build one output row per PDB entry for the first 30 entries.
output_data = []

# Many PDB entries map to the same UniProt accession, so remember each
# successful lookup and reuse it instead of querying UniProt again.
# Failed lookups are not cached and will be retried.
protein_info_cache = {}

for pdb_id in df2[:30]:
    uniprot_id = pdb_to_uniprot(pdb_id)
    if not uniprot_id:  # None (request failed) or empty list (no mapping)
        print(f"No UniProt ID found for PDB ID {pdb_id}")
        continue

    print(f"Processing PDB ID: {pdb_id}, UniProt IDs: {uniprot_id}")

    # Only the first mapped accession is used for the protein details;
    # all of them are still listed in the 'UniProt_IDs' column.
    primary_id = uniprot_id[0]
    protein_info = protein_info_cache.get(primary_id)
    if protein_info is None:
        protein_info = do_all(primary_id)
        if protein_info is not None:
            protein_info_cache[primary_id] = protein_info

    row = {
        'PDB_ID': pdb_id,
        'UniProt_IDs': ', '.join(uniprot_id),
    }
    if protein_info is None:
        print(f"No data returned for UniProt ID {primary_id}")
        row.update({
            'Protein_Name': 'N/A',
            'Organism': 'N/A',
            'Gene_Name': 'N/A',
            'EC_Numbers': 'N/A',
            'Sequence': 'N/A',
        })
    else:
        print()  # Blank line for readability
        ec_numbers = protein_info['ec_numbers']
        row.update({
            'Protein_Name': protein_info['protein_name'],
            'Organism': protein_info['organism'],
            'Gene_Name': protein_info['gene_name'],
            'EC_Numbers': ', '.join(ec_numbers) if ec_numbers else 'N/A',
            'Sequence': protein_info['sequence'],
        })
    output_data.append(row)


Processing PDB ID: 6SJZ, UniProt IDs: ['P30419', 'Q96NN9']

Processing PDB ID: 5C37, UniProt IDs: ['P49327']

Processing PDB ID: 6NNA, UniProt IDs: ['P49327']

Processing PDB ID: 7M4C, UniProt IDs: ['Q9UGP5']

Processing PDB ID: 7M4E, UniProt IDs: ['Q9UGP5']

Processing PDB ID: 7M4F, UniProt IDs: ['Q9UGP5']

Processing PDB ID: 7M4H, UniProt IDs: ['Q9UGP5']

Processing PDB ID: 7M4I, UniProt IDs: ['Q9UGP5']

Processing PDB ID: 7M4L, UniProt IDs: ['Q9UGP5']

Processing PDB ID: 5OOG, UniProt IDs: ['P30043']

Processing PDB ID: 6QRM, UniProt IDs: ['P30419', 'Q96NN9']

Processing PDB ID: 6SK2, UniProt IDs: ['P30419', 'Q96NN9']

Processing PDB ID: 6SKJ, UniProt IDs: ['P30419', 'Q96NN9']

Processing PDB ID: 7M44, UniProt IDs: ['Q9UGP5']

Processing PDB ID: 7M46, UniProt IDs: ['Q9UGP5']

Processing PDB ID: 7M47, UniProt IDs: ['Q9UGP5']

Processing PDB ID: 7M4A, UniProt IDs: ['Q9UGP5']

Processing PDB ID: 7M4B, UniProt IDs: ['Q9UGP5']

Processing PDB ID: 7M49, UniProt IDs: ['Q9UGP5']

Processing

In [11]:
import os

# Write the collected rows to RCSB_Datasets/RCSB_updated.csv.

# Create the RCSB_Datasets directory if it doesn't exist. exist_ok=True
# makes this a single call with no gap between checking and creating.
output_directory = 'RCSB_Datasets'
os.makedirs(output_directory, exist_ok=True)

# Convert the collected data to a DataFrame
print(output_data)
output_df = pd.DataFrame(output_data)

# Specify the output file path
output_file_path = f'{output_directory}/RCSB_updated.csv'

# Save the DataFrame to a CSV file (index=False leaves out the row index)
output_df.to_csv(output_file_path, index=False)

print(f"Data successfully saved to {output_file_path}")


[{'PDB_ID': '6SJZ', 'UniProt_IDs': 'P30419, Q96NN9', 'Protein_Name': 'Glycylpeptide N-tetradecanoyltransferase 1', 'Organism': 'Homo sapiens', 'Gene_Name': 'NMT1', 'EC_Numbers': '2.3.1.97', 'Sequence': 'MADESETAVKPPAPPLPQMMEGNGNGHEHCSDCENEEDNSYNRGGLSPANDTGAKKKKKKQKKKKEKGSETDSAQDQPVKMNSLPAERIQEIQKAIELFSVGQGPAKTMEEASKRSYQFWDTQPVPKLGEVVNTHGPVEPDKDNIRQEPYTLPQGFTWDALDLGDRGVLKELYTLLNENYVEDDDNMFRFDYSPEFLLWALRPPGWLPQWHCGVRVVSSRKLVGFISAIPANIHIYDTEKKMVEINFLCVHKKLRSKRVAPVLIREITRRVHLEGIFQAVYTAGVVLPKPVGTCRYWHRSLNPRKLIEVKFSHLSRNMTMQRTMKLYRLPETPKTAGLRPMETKDIPVVHQLLTRYLKQFHLTPVMSQEEVEHWFYPQENIIDTFVVENANGEVTDFLSFYTLPSTIMNHPTHKSLKAAYSFYNVHTQTPLLDLMSDALVLAKMKGFDVFNALDLMENKTFLEKLKFGIGDGNLQYYLYNWKCPSMGAEKVGLVLQ'}, {'PDB_ID': '5C37', 'UniProt_IDs': 'P49327', 'Protein_Name': 'Fatty acid synthase', 'Organism': 'Homo sapiens', 'Gene_Name': 'FASN', 'EC_Numbers': '2.3.1.85, 2.3.1.38, 2.3.1.39, 2.3.1.41, 1.1.1.100, 4.2.1.59, 1.3.1.39, 3.1.2.14, 3.1.2.14', 'Sequence': 'MEEVVIAGMSGKLPESENLQEFWDNLIGGVDMVTDDDRRWKAGLY

In [None]:

import pandas as pd

# Full run: build one output row per PDB entry for every ID in df2.
# NOTE(review): unlike the 30-entry trial loop, rows built here carry no
# 'Sequence' column — confirm that is intended before writing the CSV.
output_data = []

# Many PDB entries map to the same UniProt accession, so remember each
# successful lookup and reuse it instead of querying UniProt again.
# Failed lookups are not cached and will be retried.
protein_info_cache = {}

for pdb_id in df2:
    uniprot_id = pdb_to_uniprot(pdb_id)
    if not uniprot_id:  # None (request failed) or empty list (no mapping)
        print(f"No UniProt ID found for PDB ID {pdb_id}")
        continue

    print(f"Processing PDB ID: {pdb_id}, UniProt IDs: {uniprot_id}")

    # Only the first mapped accession is used for the protein details;
    # all of them are still listed in the 'UniProt_IDs' column.
    primary_id = uniprot_id[0]
    protein_info = protein_info_cache.get(primary_id)
    if protein_info is None:
        protein_info = do_all(primary_id)
        if protein_info is not None:
            protein_info_cache[primary_id] = protein_info

    row = {
        'PDB_ID': pdb_id,
        'UniProt_IDs': ', '.join(uniprot_id),
    }
    if protein_info is None:
        print(f"No data returned for UniProt ID {primary_id}")
        row.update({
            'Protein_Name': 'N/A',
            'Organism': 'N/A',
            'Gene_Name': 'N/A',
            'EC_Numbers': 'N/A',
        })
    else:
        # Extract data
        protein_name = protein_info['protein_name']
        organism = protein_info['organism']
        gene_name = protein_info['gene_name']
        ec_numbers = protein_info['ec_numbers']

        # Print the data for verification
        print(f"Protein Name: {protein_name}")
        print(f"Organism: {organism}")
        print(f"Gene Name: {gene_name}")
        print(f"The EC numbers for UniProt ID {primary_id} are: {ec_numbers if ec_numbers else 'N/A'}")
        print()  # Blank line for readability

        row.update({
            'Protein_Name': protein_name,
            'Organism': organism,
            'Gene_Name': gene_name,
            'EC_Numbers': ', '.join(ec_numbers) if ec_numbers else 'N/A',
        })
    output_data.append(row)
*\