In [9]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os

In [1]:
import json
import numpy as np
import torch

def read_question_answer_file(file_path):
    """Reads a JSONL file with question-answer data.

    Each non-blank line is parsed as one JSON document. Blank or
    whitespace-only lines (for example a trailing empty line at the end of
    the file) are skipped instead of raising ``json.JSONDecodeError``.

    Args:
        file_path: Path to a UTF-8 encoded JSONL file.

    Returns:
        A list with one parsed object per non-blank line, in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    data = []
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            line = line.strip()
            if not line:
                continue  # tolerate blank separator / trailing lines
            data.append(json.loads(line))  # Parse each line as JSON
    return data

# Location of the question-answer JSONL file (one JSON record per line);
# point this at your own copy of the dataset.
dataset_path = 'med_data/phrases_no_exclude_train.jsonl'
questions_data = read_question_answer_file(dataset_path)

# Peek at the first record: question text, answer key, then the options.
for field in ('question', 'answer_idx', 'options'):
    print(questions_data[0][field])

A 23-year-old pregnant woman at 22 weeks gestation presents with burning upon urination. She states it started 1 day ago and has been worsening despite drinking more water and taking cranberry extract. She otherwise feels well and is followed by a doctor for her pregnancy. Her temperature is 97.7°F (36.5°C), blood pressure is 122/77 mmHg, pulse is 80/min, respirations are 19/min, and oxygen saturation is 98% on room air. Physical exam is notable for an absence of costovertebral angle tenderness and a gravid uterus. Which of the following is the best treatment for this patient?
D
{'A': 'Ampicillin', 'B': 'Ceftriaxone', 'C': 'Doxycycline', 'D': 'Nitrofurantoin'}


In [10]:
import re

def clean_text(text):
  """Removes punctuation/symbols and extra whitespace from text.

  Every character that is neither a regex word character nor whitespace is
  deleted, then runs of whitespace are collapsed to single spaces and
  leading/trailing whitespace is dropped.

  Note that Unicode letters (e.g. Greek letters or the micro sign) count as
  word characters and are therefore kept, not removed.

  Args:
    text: The input text to be cleaned.

  Returns:
    The cleaned text.
  """
  # Drop everything except word characters and whitespace
  text = re.sub(r"[^\w\s]", "", text)

  # Collapse whitespace runs and trim both ends
  text = " ".join(text.split())

  return text

In [11]:
def save_string_to_file(data, filename):
  """Serializes ``data`` as indented JSON and writes it to a file.

  Despite the name, the value does not have to be a string: it is passed to
  ``json.dump`` with a 4-space indent, so any JSON-serializable object works.

  Args:
    data: The JSON-serializable object to save.
    filename: Path of the file to create or overwrite. Missing parent
      directories are created first.

  Raises:
    TypeError: If ``data`` is not JSON-serializable.
    OSError: If the directory or file cannot be created.
  """
  # Create the target directory up front so a long download run does not
  # fail at write time just because the folder is missing.
  parent = os.path.dirname(filename)
  if parent:  # a bare filename has no directory part; makedirs("") would fail
    os.makedirs(parent, exist_ok=True)

  with open(filename, 'w', encoding='utf-8') as f:
    json.dump(data, f, indent=4)

# # Example usage:
# my_string = "This is the text I want to save."
# save_string_to_file(my_string, "output.txt")

In [12]:
def sanitize_filename(filename):
  """Turns an arbitrary string into a filename-safe token.

  Args:
    filename: The raw name to sanitize.

  Returns:
    ``filename`` with every non-word character swapped for an underscore
    and any underscores at either end trimmed off.
  """
  underscored = re.sub(r'[^\w]', '_', filename)
  return underscored.strip('_')

In [13]:
# %pip installs into the running kernel's environment; the version is pinned
# to the one this notebook was last run with, for reproducibility.
%pip install metapub==0.5.12

Collecting metapub
  Downloading metapub-0.5.12-py2.py3-none-any.whl.metadata (16 kB)
Collecting lxml-html-clean (from metapub)
  Downloading lxml_html_clean-0.2.0-py3-none-any.whl.metadata (1.8 kB)
Collecting eutils (from metapub)
  Downloading eutils-0.6.0-py2.py3-none-any.whl.metadata (7.0 kB)
Collecting habanero (from metapub)
  Downloading habanero-1.2.6-py2.py3-none-any.whl.metadata (14 kB)
Collecting tabulate (from metapub)
  Downloading tabulate-0.9.0-py3-none-any.whl.metadata (34 kB)
Collecting cssselect (from metapub)
  Downloading cssselect-1.2.0-py2.py3-none-any.whl.metadata (2.2 kB)
Collecting unidecode (from metapub)
  Downloading Unidecode-1.3.8-py3-none-any.whl.metadata (13 kB)
Collecting docopt (from metapub)
  Downloading docopt-0.6.2.tar.gz (25 kB)
  Preparing metadata (setup.py) ... [?25ldone
Collecting coloredlogs (from metapub)
  Downloading coloredlogs-15.0.1-py2.py3-none-any.whl.metadata (12 kB)
Collecting python-Levenshtein (from metapub)
  Downloading python_

In [14]:
from metapub import PubMedFetcher
import time

# The NCBI API key is read from the NCBI_API_KEY environment variable so the
# secret is not stored in the notebook. When the variable is unset this is None.
# NOTE(review): the key that used to be hard-coded here has been exposed and
# should be rotated; also confirm metapub accepts api_key=None.
access_key_id = os.environ.get("NCBI_API_KEY")

def fetch_and_process_data(keyword, api_key, retmax=10, delay=0.2):
    """Fetches PubMed articles and returns a dictionary of PMIDs and abstracts.

    Args:
        keyword: The search keyword (PubMed query string).
        api_key: The NCBI API key, forwarded to ``PubMedFetcher``.
        retmax: Maximum number of PMIDs to request for the query. Defaults
            to 10, the value that used to be hard-coded.
        delay: Seconds to pause after each article fetch to throttle
            requests. Defaults to 0.2, the value that used to be hard-coded.

    Returns:
        A dictionary containing PMIDs as keys and each article's
        ``abstract`` attribute as values.
    """

    fetch = PubMedFetcher(api_key=api_key)
    pmids = fetch.pmids_for_query(keyword, retmax=retmax)

    abstracts = {}
    for pmid in pmids:
        article = fetch.article_by_pmid(pmid)
        abstracts[pmid] = article.abstract
        # Pause between requests so consecutive fetches are spaced out.
        time.sleep(delay)
    return abstracts

# Example usage:
# keyword = "Ceftriaxone"
# abstracts_dict = fetch_and_process_data(keyword, access_key_id)
# print(abstracts_dict)



In [15]:
from tqdm import tqdm

# Download PubMed abstracts for every answer option of questions 787-999.
# Each option's text is used as the search query and the result is written to
# "Pubmed _Abstract/<sanitized option text>.json".
# This is a long-running cell: the recorded run below took about two hours.
questionLoader = questions_data[787:1000]
for question in  tqdm(questionLoader):
    options = question['options']
    for key, value in options.items():
        query = value  # the option text itself is the PubMed query
        bioc_data = []  # NOTE(review): reset per option but not used in this cell
        abstracts_dict = fetch_and_process_data(query,access_key_id)
        # Options that sanitize to the same name share one output file; the
        # later write replaces the earlier one.
        save_string_to_file(abstracts_dict, "Pubmed _Abstract/" + sanitize_filename(query) + ".json")

100%|██████████| 213/213 [2:04:25<00:00, 35.05s/it]  


In [7]:
# Rebind questionLoader to questions 800-999 and display the first record.
# NOTE(review): this replaces the [787:1000] slice assigned in the download
# cell, so the value of questionLoader depends on which cell ran last.
questionLoader = questions_data[800:1000]
questionLoader[0]

{'question': 'A 41-year-old G3P3 woman presents with acute on chronic right upper quadrant abdominal pain. She says that her current symptoms acutely onset 8 hours ago after eating a large meal and have not improved. She describes the pain as severe, sharp and cramping in character, and localized to the right upper quadrant. She also describes feeling nauseous. The patient says she has had similar less severe episodes intermittently for the past 2 years, usually precipitated by the intake of fatty foods. She denies any history of fever or jaundice. Vital signs are stable. Physical examination is unremarkable, and laboratory findings show normal liver function tests and normal serum bilirubin and serum amylase levels. Ultrasonography of the abdomen reveals multiple stones in the gallbladder. The patient is managed symptomatically for this episode, and after a few months, undergoes elective cholecystectomy, which reveals multiple stones in her gallbladder as shown in the figure (see imag