In [46]:
!pip install requests biopython




In [47]:
from Bio import Entrez
import xml.etree.ElementTree as ET
import time
import os 

# Always tell NCBI who you are
Entrez.email = "riboseq@gmail.com"  # Replace with your email

In [51]:
def get_pubmed_id_from_bioproject(bioproject_id, max_retries=3, delay=1):
    for attempt in range(max_retries):
        try:
            # First, use esearch to get the BioProject ID
            handle = Entrez.esearch(db="bioproject", term=bioproject_id)
            record = Entrez.read(handle)
            handle.close()

            if record["IdList"]:
                project_id = record["IdList"][0]
                
                # Then use elink to find related PubMed entries
                handle = Entrez.elink(dbfrom="bioproject", db="pubmed", id=project_id)
                record = Entrez.read(handle)
                handle.close()

                # Check if there are any linked PubMed IDs
                if record[0]['LinkSetDb']:
                    pubmed_id = record[0]['LinkSetDb'][0]['Link'][0]['Id']
                    return pubmed_id
            
            return None  # No linked PubMed ID found

        except Exception as e:
            if attempt < max_retries - 1:
                print(f"Attempt {attempt + 1} failed. Retrying in {delay} seconds...")
                time.sleep(delay)
            else:
                print(f"Failed to retrieve data after {max_retries} attempts.")
                print(f"Error: {str(e)}")
                return None


def get_bioproject_metadata(bioproject_id, max_retries=3, delay=1, save_xml=False, xml_file_path=None):
    for attempt in range(max_retries):
            # First, use esearch to get the BioProject ID
        handle = Entrez.esearch(db="bioproject", term=bioproject_id)
        record = Entrez.read(handle)
        handle.close()

        if record["IdList"]:
            project_id = record["IdList"][0]
            
            # Then use efetch to get the full BioProject record
            handle = Entrez.efetch(db="bioproject", id=project_id, retmode="xml")
            xml_content = handle.read()
            handle.close()

            # Save XML to file if requested
            if save_xml:
                if xml_file_path is None:
                    xml_file_path = f"bioproject_{bioproject_id}.xml"
                
                            
                with open(xml_file_path, 'wb') as xml_file:
                    xml_file.write(xml_content)
                print(f"XML content saved to {xml_file_path}")

            root = ET.fromstring(xml_content)
            
            # Extract project-level metadata
            metadata = {}
            
            # Project ID
            project = root.find(".//Project")
            if project is not None:
                metadata['ProjectID'] = project.get('accession')
            
            # Project Type
            project_type = root.find(".//ProjectType")
            if project_type is not None:
                metadata['ProjectType'] = project_type.find('ProjectTypeSubmission').get('ProjectDataTypeSet')
            
            # Project Name
            name = root.find(".//Project/ProjectDescr/Name")
            if name is not None:
                metadata['ProjectName'] = name.text
            
            # Project Title
            title = root.find(".//Project/ProjectDescr/Title")
            if title is not None:
                metadata['ProjectTitle'] = title.text
            
            # Project Description
            description = root.find(".//Project/ProjectDescr/Description")
            if description is not None:
                metadata['ProjectDescription'] = description.text
            
            # Relevance
            relevance = root.find(".//Project/ProjectDescr/Relevance")
            if relevance is not None:
                metadata['Relevance'] = relevance.text
            
            # Submitter
            submitter = root.find(".//Submission/Description/Organization/Name")
            if submitter is not None:
                metadata['Submitter'] = submitter.text
            
            # Submission Date
            submitted = root.find(".//Submission")
            if submitted is not None:
                metadata['SubmissionDate'] = submitted.get('submitted')
            
            return metadata
        else:
            print(f"No BioProject found with ID: {bioproject_id}")
            return None



In [49]:
# Example usage
bioproject_id = "PRJDB10544"  # Replace with your BioProject ID

pubmed_id = get_pubmed_id_from_bioproject(bioproject_id)
if pubmed_id:
    print(f"The PubMed ID for BioProject {bioproject_id} is: {pubmed_id}")
else:
    print(f"No PubMed ID found for BioProject {bioproject_id} or an error occurred")


No PubMed ID found for BioProject PRJDB10544 or an error occurred


In [52]:
metadata = get_bioproject_metadata(bioproject_id, save_xml=True, xml_file_path=f"bioproject_{bioproject_id}.xml")
if metadata:
    print("\nBioProject Metadata:")
    for key, value in metadata.items():
        print(f"{key}: {value}")
else:
    print(f"Failed to retrieve metadata for BioProject {bioproject_id}")

XML content saved to bioproject_PRJDB10544.xml

BioProject Metadata:
ProjectID: None
ProjectType: None
ProjectName: Arabidopsis thaliana strain:T87 cultured cells
ProjectTitle: Ribosome profiling in Arabidopsis T87 cultured cell
ProjectDescription: In gene expression, DNA is transcribed to mRNA, and mRNA is translated to protein. In these steps, translation is one of the important steps to determine the protein abundance. To evaluate translation efficiencies, ribosome profiling is reported in previous study. This method is based on deep sequencing of ribosome-protected mRNA fragments, and provides a "snapshot" of all the ribosomes abundance at a specific time point. This information provide the ribosome position or ribosome occupancy on mRNAs. Thus, we conducted ribosome profiling using 3 day after inoculation of Arabidopsis T87 cultured cell to evaluate translation efficiencies and analyze the relationships between sequence feature and translation process.
Relevance: 
                

In [62]:
import gzip
import urllib.request
import io

In [63]:
def download_and_parse_geo_soft(gse_id):
    # Construct the URL for the SOFT file
    url = f"https://ftp.ncbi.nlm.nih.gov/geo/series/{gse_id[:-3]}nnn/{gse_id}/soft/{gse_id}_family.soft.gz"
    
    try:
        # Download the gzipped file
        with urllib.request.urlopen(url) as response:
            with gzip.GzipFile(fileobj=io.BytesIO(response.read())) as unzipped:
                content = unzipped.read().decode('utf-8')
    
        # Parse the SOFT file content
        series_info = {}
        in_series_section = False
        
        for line in content.split('\n'):
            if line.startswith('^SERIES'):
                in_series_section = True
                continue
            elif line.startswith('^'):
                in_series_section = False
            
            if in_series_section and line.startswith('!'):
                key, value = line.split(' = ', 1)
                key = key.lstrip('!')
                series_info[key] = value.strip()
        
        return series_info
    
    except Exception as e:
        print(f"An error occurred: {e}")
        return None

In [67]:
gse_id = "GSE241473"

# Get metadata and save XML
series_metadata = download_and_parse_geo_soft(gse_id)

if series_metadata:
    print(f"Series Information for {gse_id}:")
    for key, value in series_metadata.items():
        print(f"{key[7:]}: {value}")
else:
    print(f"Failed to retrieve series information for {gse_id}")

Series Information for GSE241473:
title: Yeast eIF2A plays a minimal role in translation initiation in vivo
geo_accession: GSE241473
status: Public on Jan 03 2024
submission_date: Aug 22 2023
last_update_date: Apr 02 2024
pubmed_id: 38266075
summary: Earlier investigations have associated mammalian eIF2A with Met-tRNAi binding to the 40S subunit and its recruitment to specialized mRNAs in a GTP-independent manner. Additionally, eIF2A has been implicated in non-AUG start codon initiation, particularly under conditions where eIF2 function is attenuated by phosphorylation of its α-subunit during stress or starvation. However, the precise role of eIF2A in vivo translation remains unclear. Moreover, it's uncertain if the conserved ortholog in budding yeast can functionally substitute for eIF2 during stress. To address these questions, we conducted ribosome profiling on a yeast deletion mutant lacking eIF2A, alongside isogenic wild-type (WT) cells, both in the presence or absence of eIF2α ph

In [68]:
import pandas as pd

In [69]:
rdp_metadata = pd.read_csv("https://rdp.ucc.ie/static2/RiboSeqOrg_Metadata_v2024.10.csv")

  rdp_metadata = pd.read_csv("https://rdp.ucc.ie/static2/RiboSeqOrg_Metadata_v2024.10.csv")


In [76]:
rdp_metadata.shape

(15465, 89)

In [79]:
bioproject_gses = rdp_metadata[['BioProject', 'GEO']].dropna().drop_duplicates().reset_index(drop=True)
bioproject_gses.shape

(748, 2)

In [83]:
geo_series_metadata = pd.DataFrame()

for i, row in bioproject_gses.iterrows():
    bioproject_id = row['BioProject']
    gse_id = row['GEO']
    
    # Retrieve GEO Series metadata
    series_metadata = download_and_parse_geo_soft(gse_id)
    print(i, bioproject_id, gse_id, series_metadata)
    
    if series_metadata:
        # Combine metadata into a single dictionary
        combined_metadata = {**series_metadata}
        
        # Convert combined_metadata to DataFrame and concatenate
        geo_series_metadata = pd.concat([geo_series_metadata, pd.DataFrame([combined_metadata])], ignore_index=True)

geo_series_metadata.shape

0 PRJNA1004241 GSE240563 {'Series_title': 'Regulation by the RNA-binding protein Unkempt at its effector interface  [Ribo]', 'Series_geo_accession': 'GSE240563', 'Series_status': 'Public on Mar 17 2024', 'Series_submission_date': 'Aug 10 2023', 'Series_last_update_date': 'May 01 2024', 'Series_pubmed_id': '38605040', 'Series_summary': 'How RNA-binding proteins (RBPs) convey regulatory instructions to the core effectors of RNA processing is unclear. Here we document the existence and functions of a multivalent RBP–effector interface. We show that the effector interface of a conserved RBP with an essential role in metazoan development, Unkempt, is mediated by a novel type of ‘dual-purpose’ peptide motifs that can contact two different surfaces of interacting proteins. Unexpectedly, we find that the multivalent contacts do not merely serve effector recruitment but are required for the accuracy of RNA recognition by Unkempt. Systems analyses reveal that multivalent RBP–effector contacts ca

(748, 31)

In [84]:
non_empty_pubmed_ids_count = geo_series_metadata['Series_pubmed_id'].dropna().shape[0]
print(f"Number of non-empty Series_pubmed_id entries: {non_empty_pubmed_ids_count}")

Number of non-empty Series_pubmed_id entries: 658
