In [1]:
import json
import time

import pandas as pd
import yaml
from Bio import Entrez

In [None]:
# Read the API keys from a local YAML file rather than hard-coding them in the notebook.
# NOTE(review): yaml.load with FullLoader can build more than plain data types; if
# apikeys.yaml only holds plain mappings/strings, yaml.safe_load is the usual choice — confirm.
with open("apikeys.yaml", mode="r") as yamlfile:
    keys = yaml.load(yamlfile, Loader=yaml.FullLoader)
# Only reached when the file was opened and parsed without raising.
print("Read Successful")

In [2]:
# Handles the eSearch endpoint for Entrez
def get_pmid(contact, key, term, **dates):
    ''' Query the Entrez eSearch endpoint and return the PMIDs matching a PubMed search term.

    Two requests are made: the first only reads the total hit count, the second
    asks for that many ids in one response. When the count is zero the second
    request is skipped and an empty list is returned.

    Parameters
    ----------
    contact : str
        Email address assigned to ``Entrez.email``.
    key : str
        API key assigned to ``Entrez.api_key``.
    term : str
        Entrez search term passed to eSearch.
    **dates
        Optional ``mindate`` / ``maxdate`` values forwarded to eSearch. Any other
        keyword is ignored.

    Returns
    -------
    list of list of str
        One single-item list per PMID, e.g. ``[["111"], ["222"]]``. This nested
        shape is what the function has always returned and is kept for callers.
    '''
    Entrez.email = contact
    Entrez.api_key = key

    def _search(retmax):
        # One eSearch round trip; the handle is closed even if parsing fails.
        handle = Entrez.esearch(db='pubmed', term=term, retmax=retmax,
                                mindate=dates.get("mindate"), maxdate=dates.get("maxdate"))
        try:
            return Entrez.read(handle)
        finally:
            handle.close()

    # Get total number of records
    count = int(_search(1)['Count'])
    if count == 0:
        return []

    # Get all pmids with updated retmax
    # NOTE(review): NCBI documents an upper bound on how many PubMed ids a single
    # eSearch request returns; confirm `count` stays below it for broad terms.
    record = _search(count)

    # Wrap every id in its own list (the historical return shape of this function).
    return [[str(pmid)] for pmid in record['IdList']]

# Handles the eFetch endpoint for Entrez
def get_data(pmid_list, contact, key, batch_size=600, pause_seconds=60):
    ''' Fetch the citation details for every entry of ``pmid_list`` from the Entrez eFetch endpoint.

    One eFetch request is sent per entry. After every ``batch_size`` requests the
    function reports progress and sleeps for ``pause_seconds`` before continuing;
    no pause is taken after the final entry.

    Parameters
    ----------
    pmid_list : sequence
        Entries passed one at a time as the ``id`` argument of ``Entrez.efetch``
        (for example the single-item lists returned by ``get_pmid``).
    contact : str
        Email address assigned to ``Entrez.email``.
    key : str
        API key assigned to ``Entrez.api_key``.
    batch_size : int, optional
        Number of requests between pauses (default 600). A falsy value disables pausing.
    pause_seconds : int or float, optional
        Length of each pause in seconds (default 60).

    Returns
    -------
    list
        The parsed eFetch result for each entry, in input order.
    '''
    # Credentials are module-level settings on Entrez, so set them once up front.
    Entrez.email = contact
    Entrez.api_key = key

    to_clean = []
    total = len(pmid_list)
    for position, pmid in enumerate(pmid_list, start=1):
        handle = Entrez.efetch(db='pubmed', id=pmid, retmode='xml')
        try:
            to_clean.append(Entrez.read(handle))
        finally:
            # Release the response even when parsing raises.
            handle.close()

        # Pause after each full batch, but never after the last record.
        if batch_size and position % batch_size == 0 and position < total:
            print(f"Number of records retrieved is {len(to_clean)}")
            time.sleep(pause_seconds)

    return to_clean

def _join_date_parts(date_field):
    ''' Join every value of a date mapping (or of each mapping in a list) with "-". '''
    entries = [date_field] if isinstance(date_field, dict) else date_field
    return "-".join(str(part) for entry in entries for part in entry.values() if part is not None)


def _format_authors(authors):
    ''' Turn author mappings into display names, choosing the best available name per author. '''
    names = []
    for author in authors:
        last = author.get("LastName")
        fore = author.get("ForeName")
        collective = author.get("CollectiveName")
        if last and fore:
            names.append(f"{last} {fore}")
        elif collective:
            names.append(collective)
        elif last or fore:
            # Only one half of a personal name is present; use whichever exists.
            names.append(last or fore)
        # Entries without any usable name are left out.
    return names


def _extract_article(entry):
    ''' Build the output row for one ``PubmedArticle`` entry. '''
    citation = entry["MedlineCitation"]
    article = citation["Article"]
    return {
        "pmid": citation["PMID"],
        "title": article["ArticleTitle"],
        "abstract": article["Abstract"]["AbstractText"] if "Abstract" in article else [],
        "date": _join_date_parts(article["ArticleDate"]) if "ArticleDate" in article else [],
        "author(s)": _format_authors(article["AuthorList"]) if "AuthorList" in article else [],
    }


def _extract_book(entry):
    ''' Build the output row for one ``PubmedBookArticle`` entry. '''
    document = entry["BookDocument"]
    book = document["Book"]
    # NOTE(review): the original gate tests for "AuthorList" on Book but reads the
    # list from BookDocument; both conditions are kept — confirm which was intended.
    author_groups = document.get("AuthorList") or []
    if "AuthorList" in book and author_groups:
        # BookDocument["AuthorList"] is a list of author lists; flatten it first.
        authors = _format_authors([author for group in author_groups for author in group])
    else:
        authors = []
    return {
        "pmid": document["PMID"],
        "title": book["BookTitle"],
        "abstract": document["Abstract"]["AbstractText"] if "Abstract" in document else [],
        "date": _join_date_parts(book["PubDate"]) if "PubDate" in book else [],
        "author(s)": authors,
    }


def clean_data(records):
    ''' Extract pmid, title, abstract, date and author(s) from a list of citation records.

    Each record is a dictionary with a "PubmedArticle" and/or "PubmedBookArticle"
    list. The first article entry is used when there is one, otherwise the first
    book entry. A record with neither is skipped.

    Missing pieces keep the placeholders used so far: ``[]`` for an absent
    abstract, date or author list, and ``""`` for a date field that is present
    but empty. Abstract and author(s) cells hold lists.

    Parameters
    ----------
    records : list of dict
        Citation records, e.g. the output of ``get_data``.

    Returns
    -------
    pandas.DataFrame
        One row per extracted record with the columns "pmid", "title",
        "abstract", "date" and "author(s)", indexed 0..n-1. An empty input
        gives an empty frame with those columns.
    '''
    columns = ["pmid", "title", "abstract", "date", "author(s)"]

    # Collect plain rows and build the frame once; concatenating inside the loop
    # is quadratic, and locating "the first record" by value dropped rows when a
    # later record compared equal to the first one.
    rows = []
    for record in records:
        articles = record.get("PubmedArticle") or []
        books = record.get("PubmedBookArticle") or []
        if articles:
            rows.append(_extract_article(articles[0]))
        elif books:
            rows.append(_extract_book(books[0]))

    return pd.DataFrame(rows, columns=columns)

In [None]:
# Search parameters: the contact address handed to Entrez and the query term.
email = "rachit.sabharwal@uth.tmc.edu"
search = "HIV"

# Collect the PMIDs matching the term inside the given date window.
hiv_pmids = get_pmid(
    contact=email,
    key=keys["apikeys"]["ncbikey"]["key"],
    term=search,
    mindate="2020/01/01",
    maxdate="2020/09/01",
)

In [3]:
# Load the records that clean_data will process from a local JSON file
# (presumably a saved get_data result — confirm).
# The path is a raw string: its backslashes are followed by letters that are not
# valid escape sequences, which a plain literal only tolerates with a warning.
# NOTE(review): absolute, machine-specific path — the notebook only runs where this file exists.
with open(r'D:\Dell_Desktop\Documents\Python Projects\ph_1975_capstone_project\webapp\hiv_records.json', 'r') as outfile:
    hiv_records = json.load(outfile)

In [4]:
# Flatten the loaded citation records into a table of pmid, title, abstract, date and author(s).
hiv_clean = clean_data(hiv_records)

In [5]:
# Renumber the rows 0..n-1, discarding the previous index.
hiv_clean = hiv_clean.reset_index(drop=True)
# Write the cleaned table to CSV without an index column.
hiv_clean.to_csv("hiv_records_clean.csv", index=False)