In [1]:
# Dec 2023
# Author: SBN
# This notebook shows a pathway and libraries needed to fetch data from PubMed.

In [None]:
## uncomment for installing:
# !pip install biopython
# !pip install pytz
# !pip install transformers datasets
# !pip install torch torchvision torchaudio
# !pip install accelerate
# !pip install --upgrade accelerate
# !pip install --upgrade transformers


In [2]:
import os
import time
import warnings

import pandas as pd
from Bio import Entrez
warnings.filterwarnings('ignore')



KeyboardInterrupt



In [None]:
# NCBI API settings
# NCBI asks for a contact e-mail on E-utilities requests. Set the NCBI_EMAIL
# environment variable to run the notebook under your own address; the
# notebook's original address is kept as the fallback so existing runs behave
# exactly as before.
Entrez.email = os.environ.get("NCBI_EMAIL", "mirtahmid@gmail.com")  # Required by NCBI to use the API

<a id="1"></a>
# <div style="text-align:center; border-radius:3px 50px; padding:7px; color:white; margin:0; font-size:100%; font-family:Pacifico; background-color:#a0b1b0; "><b>⭐ Fetching data⭐</b></div>

In [None]:
# Search PubMed for the query term and keep the list of matching PubMed IDs.
num_article = 1000  # upper bound on the number of IDs requested (passed as retmax)
handle = Entrez.esearch(db="pubmed", term="skin cancer", retmax=num_article)
try:
    record = Entrez.read(handle)
finally:
    # Always release the connection, even when parsing the response fails.
    handle.close()
ids = record['IdList']


In [None]:
# Peek at the last two IDs of the search result, last one first.
ids[-2:][::-1]

In [None]:
# Fetch articles with the specified PubMed IDs
handle = Entrez.efetch(db="pubmed", id=",".join(ids), rettype="xml", retmode="text")
try:
    records = Entrez.read(handle)
finally:
    # Always release the connection, even when parsing the response fails.
    handle.close()

In [None]:
# Top-level keys of the parsed efetch result ("PubmedArticle" is the one used below).
records.keys()

<a id="1"></a>
# <div style="text-align:center; border-radius:3px 50px; padding:7px; color:white; margin:0; font-size:100%; font-family:Pacifico; background-color:#a0b1b0; "><b>⭐ Observe the data parts⭐</b></div>

In [None]:
# Pick one fetched record by its position in records["PubmedArticle"] and
# display it in full. article_num is reused by the inspection cells that follow.
article_num = 0
records["PubmedArticle"][article_num]

In [None]:
# Top-level keys of the selected record.
records["PubmedArticle"][article_num].keys()

In [None]:
# Keys available under the selected record's 'MedlineCitation' entry.
records["PubmedArticle"][article_num]['MedlineCitation'].keys()

In [None]:
# Keys available under the selected record's 'PubmedData' entry.
records["PubmedArticle"][article_num]['PubmedData'].keys()

In [None]:
# Keys of the "Article" entry nested inside 'MedlineCitation'.
records["PubmedArticle"][article_num]['MedlineCitation']["Article"].keys()

In [None]:
# Keys of the article's "Abstract" entry.
# NOTE(review): presumably raises KeyError when the selected record has no
# "Abstract" entry — confirm, or pick another article_num.
records["PubmedArticle"][article_num]['MedlineCitation']["Article"]["Abstract"].keys()

In [None]:
# The "ArticleTitle" field of the selected record.
records["PubmedArticle"][article_num]['MedlineCitation']["Article"]["ArticleTitle"]

In [None]:
# The "AbstractText" field of the selected record. Later code takes len() of
# it and reads element 0, i.e. it is handled as a sequence of parts.
records["PubmedArticle"][article_num]['MedlineCitation']["Article"]["Abstract"]["AbstractText"]

<a id="1"></a>
# <div style="text-align:center; border-radius:3px 50px; padding:7px; color:white; margin:0; font-size:100%; font-family:Pacifico; background-color:#a0b1b0; "><b>⭐ Saving data⭐</b></div>

In [None]:
# Collect one abstract per fetched article into abstract_bank.
#   abstract_bank  - first (and only) part of every single-part abstract
#   failed_fetech  - number of records without a usable abstract
abstract_bank = []
failed_fetech = 0
articles = records["PubmedArticle"]
# Iterate over the records that were actually returned instead of
# range(num_article): the result can hold fewer entries than were requested,
# and indexing past the end used to be silently counted as a failure.
for article in articles:
    try:
        abstract_text = article['MedlineCitation']["Article"]["Abstract"]["AbstractText"]
    except (KeyError, TypeError):
        # Record has no abstract (or an unexpected layout). Count it and skip
        # it, so a stale abstract_text left over from an earlier iteration is
        # never appended again.
        failed_fetech += 1
        continue
    if len(abstract_text) > 1:  # get rid of those abstracts which has more than 1 part. They are not of standard order.
        continue
    if not abstract_text:
        # An empty part list has no element 0 to store.
        failed_fetech += 1
        continue
    abstract_bank.append(abstract_text[0])
# Report failures relative to what was fetched; guard against an empty result.
failed_percent = failed_fetech / len(articles) * 100 if articles else 0.0
print(f"{failed_percent}% of the data is corrput and failed!")

In [None]:
file_output = "pubmed_skin_cancer_articles.txt" # where to save the data.
# Write UTF-8 explicitly: abstracts may contain non-ASCII characters, and the
# platform default encoding is not guaranteed to be able to encode them.
with open(file_output, "w", encoding="utf-8") as f:
    for abstract in abstract_bank:
        # A blank line separates consecutive abstracts.
        f.write(abstract + "\n\n")
# Report the path that was actually written, so the message stays correct
# when file_output is changed.
print(f"Fetched articles saved to '{file_output}'")

In [None]:
# Number of article records actually present in the fetched result; compare
# with num_article, the count that was requested from the search.
len(records["PubmedArticle"])