## Add missing BioProjects

- Patch the 16,534 BioProjects referenced by SRA runs but missing from the `bioprojects` table of the Logan database
- Query NCBI (Entrez) to retrieve the missing title, description, and name for each BioProject


In [1]:
import sys
import xml.etree.ElementTree as ET
import time

if '../' not in sys.path:
    sys.path.append("../")
%reload_ext dotenv
%dotenv

from queries import serratus_queries, logan_queries
from datasources.neo4j import get_connection

from Bio import Entrez

In [24]:
def get_runs_with_missing_bioproject():
    """Return the distinct bioProject values of SRA nodes lacking a BioProject link.

    Selects ``SRA`` nodes that carry a non-null ``bioProject`` property but
    have no outgoing ``HAS_BIOPROJECT`` relationship, and returns whatever the
    connection's ``query`` method produces for the distinct property values.
    """
    cypher = '''
        MATCH (n:SRA)
        WHERE NOT (n)-[:HAS_BIOPROJECT]->() AND n.bioProject IS NOT NULL
        RETURN DISTINCT n.bioProject
    '''
    connection = get_connection()
    return connection.query(cypher)

# Distinct bioProject values on SRA nodes that still lack a HAS_BIOPROJECT
# relationship; the patch loop further down iterates over this result.
missing = get_runs_with_missing_bioproject()
print(len(missing))
# 16534 is the count quoted in the notebook header. The saved output below
# shows 63 -- presumably from a re-run after most projects had been patched.

63


In [23]:
# Contact address attached to Entrez requests; NCBI asks API clients to
# identify themselves with an email address.
Entrez.email = 'lj.pereira@mail.utoronto.ca'



# ASCII control characters U+0001..U+001F joined into one string. The range
# includes tab, line feed and carriage return.
escapes = ''.join(map(chr, range(1, 32)))
# Table for str.translate that deletes every character listed in `escapes`.
translator = str.maketrans('', '', escapes)

# Single-pass cleanup table for str.translate:
#   * tab / line feed / carriage return       -> one space (keeps words apart)
#   * other control chars U+0001..U+001F, '\' -> removed
#   * double quote                            -> single quote
# The result can be embedded in a double-quoted Cypher string literal.
_CLEAN_TEXT_TABLE = {code: None for code in range(1, 32)}
_CLEAN_TEXT_TABLE.update({
    ord('\t'): ' ',
    ord('\n'): ' ',
    ord('\r'): ' ',
    ord('\\'): None,
    ord('"'): "'",
})


def clean_text(text):
    """Return *text* made safe for a double-quoted Cypher string literal.

    Tabs, line feeds and carriage returns become single spaces; every other
    control character in U+0001..U+001F and every backslash is dropped; double
    quotes are turned into single quotes.

    Whitespace control characters are mapped to spaces *instead of* being
    deleted, so words separated only by a line break or tab are not glued
    together.
    """
    return text.translate(_CLEAN_TEXT_TABLE)


def get_bioproject_details(bioproject):
    """Look up a BioProject at NCBI and return its cleaned metadata.

    Args:
        bioproject: BioProject accession (e.g. ``'PRJEB40027'``), used
            verbatim as the Entrez search term.

    Returns:
        ``None`` when the search has no hits or the summary contains no
        document. Otherwise a dict with the keys ``bioproject``, ``title``,
        ``description`` and ``name``; a field stays ``None`` when the summary
        document does not provide it.
    """
    handle = Entrez.esearch(db='bioproject', term=bioproject)
    try:
        search_record = Entrez.read(handle)
    finally:
        # Release the connection even when parsing the response fails.
        handle.close()
    if search_record['Count'] == '0' or not search_record['IdList']:
        return None

    # NOTE(review): the first hit is used as-is. The search term is not
    # restricted to the accession field, so confirm that the first id always
    # belongs to the requested project.
    handle = Entrez.esummary(db='bioproject', id=search_record['IdList'][0])
    try:
        summary_record = Entrez.read(handle)
    finally:
        handle.close()

    documents = summary_record['DocumentSummarySet']['DocumentSummary']
    if not documents:
        # Treat an empty summary like "not found" instead of raising IndexError.
        return None
    document = documents[0]

    resp = {
        'bioproject': bioproject,
        'title': None,
        'description': None,
        'name': None,
    }
    field_sources = (
        ('title', 'Project_Title'),
        ('description', 'Project_Description'),
        ('name', 'Project_Name'),
    )
    for key, source_field in field_sources:
        if source_field in document:
            resp[key] = clean_text(document[source_field])

    return resp

def _cypher_value(value):
    """Render *value* as a Cypher literal: ``null`` for None, else a quoted string."""
    if value is None:
        return 'null'
    # Escape the two characters that could break out of a double-quoted literal.
    escaped = str(value).replace('\\', '\\\\').replace('"', '\\"')
    return f'"{escaped}"'


def create_bioproject_node(bioproject_details):
    """Create or update the BioProject node described by *bioproject_details*.

    Args:
        bioproject_details: dict with the keys ``bioproject``, ``title``,
            ``description`` and ``name``. ``title``, ``description`` and
            ``name`` may be ``None``.

    Returns:
        Whatever the connection's ``query`` method returns for the statement.

    A ``None`` field is written as Cypher ``null`` rather than as the literal
    string "None". With ``SET b += {...}`` a null entry leaves the property
    unset instead of storing placeholder text on the node.
    """
    neo4j = get_connection()
    key = _cypher_value(bioproject_details['bioproject'])
    title = _cypher_value(bioproject_details['title'])
    description = _cypher_value(bioproject_details['description'])
    name = _cypher_value(bioproject_details['name'])
    # NOTE(review): the statement is assembled by string formatting. If the
    # connection wrapper's `query` accepts parameters, passing the values as
    # query parameters would be preferable -- its signature is not visible here.
    query = f'''
        MERGE (b:BioProject {{bioProject: {key} }} )
        SET b += {{
            title: {title},
            description: {description},
            name: {name}
        }}
    '''
    return neo4j.query(query)


def create_bioproject_relationships(bioproject):
    """Link the SRA nodes of *bioproject* to the matching BioProject node.

    Matches ``SRA`` and ``BioProject`` nodes whose ``bioProject`` property
    equals *bioproject* and merges a ``HAS_BIOPROJECT`` relationship from each
    SRA node to the BioProject node. Returns the result of the connection's
    ``query`` call.
    """
    connection = get_connection()
    cypher = f'''
        MATCH (n:SRA {{bioProject: "{bioproject}"}})
        MATCH (b:BioProject {{bioProject: "{bioproject}"}})
        MERGE (n)-[:HAS_BIOPROJECT]->(b)
    '''
    return connection.query(cypher)

# Throttle the NCBI lookups (HTTP 429: Too Many Requests) by pausing before
# every record.
last_completed_time = time.time()  # timestamp taken at start; not read in this cell
for i, record in enumerate(missing):
    time.sleep(0.5)
    bioproject = record['n.bioProject']
    details = get_bioproject_details(bioproject)
    print(i, bioproject)

    if details:
        # Write the node first so the relationship query has a node to match.
        create_bioproject_node(details)
        create_bioproject_relationships(bioproject)
    else:
        print(f'No details found for {bioproject}')

0 PRJEB40027
1 PRJNA170677
No details found for PRJNA170677
2 PRJNA173491
No details found for PRJNA173491
3 PRJNA176872
No details found for PRJNA176872
4 PRJNA198686
No details found for PRJNA198686
5 PRJNA20965
No details found for PRJNA20965
6 PRJNA222417
No details found for PRJNA222417
7 PRJNA241409
No details found for PRJNA241409
8 PRJNA261660
No details found for PRJNA261660
9 PRJNA290844
No details found for PRJNA290844
10 PRJNA295687
No details found for PRJNA295687
11 PRJNA297265
No details found for PRJNA297265
12 PRJNA327834
No details found for PRJNA327834
13 PRJNA352040
No details found for PRJNA352040
14 PRJNA371783
No details found for PRJNA371783
15 PRJNA377138
No details found for PRJNA377138
16 PRJNA391175
No details found for PRJNA391175
17 PRJNA398653
No details found for PRJNA398653
18 PRJNA415907
No details found for PRJNA415907
19 PRJNA422288
No details found for PRJNA422288
20 PRJNA422297
No details found for PRJNA422297
21 PRJNA428835
No details found for PR

In [3]:
# Load the SRA, BioProject and BioSample node tables via the project's query
# helpers (presumably as DataFrames, per the get_*_df naming -- confirm).
# The saved output below shows each one being read from a local CSV cache.
df_sra = serratus_queries.get_sra_df()
df_bioproject = logan_queries.get_bioproject_df()
df_biosample = logan_queries.get_biosample_df()

Reading local cached file /mnt/graphdata/query_cache/sql/sql_sra_nodes.csv
Reading local cached file /mnt/graphdata/query_cache/sql/sql_bioproject_nodes.csv
Reading local cached file /mnt/graphdata/query_cache/sql/sql_biosample_nodes.csv
