In [3]:
!pip install requests beautifulsoup4

Defaulting to user installation because normal site-packages is not writeable


In [None]:
import os
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

# URL of the page containing the links
url = 'https://ftp.ncbi.nlm.nih.gov/pubmed/baseline/'  # Replace with the actual URL

# Destination folder to save the downloaded files
destination_folder = 'Downloads/project work'  # For Google Colab, use /content as the base directory

# Create the folder if it doesn't exist
if not os.path.exists(destination_folder):
    os.makedirs(destination_folder)

# Send a GET request to the URL
response = requests.get(url)
response.raise_for_status()  # Raise an error for bad status codes

# Parse the HTML content of the page
soup = BeautifulSoup(response.content, 'html.parser')

# Find all <a> tags
links = soup.find_all('a')

# Download each XML link
for link in links:
    href = link.get('href')
    if href and (href.endswith('.xml') or 'xml' in href):
        full_url = urljoin(url, href)  # Handle relative URLs
        file_name = os.path.join(destination_folder, os.path.basename(full_url))
        print(f'Downloading {full_url} to {file_name}')
        try:
            file_response = requests.get(full_url)
            file_response.raise_for_status()
            with open(file_name, 'wb') as file:
                file.write(file_response.content)
            print(f'Successfully downloaded {file_name}')
        except requests.exceptions.RequestException as e:
            print(f'Failed to download {full_url}: {e}')



Downloading https://ftp.ncbi.nlm.nih.gov/pubmed/baseline/pubmed24n0001.xml.gz to Downloads/project work\pubmed24n0001.xml.gz
Successfully downloaded Downloads/project work\pubmed24n0001.xml.gz
Downloading https://ftp.ncbi.nlm.nih.gov/pubmed/baseline/pubmed24n0001.xml.gz.md5 to Downloads/project work\pubmed24n0001.xml.gz.md5
Successfully downloaded Downloads/project work\pubmed24n0001.xml.gz.md5
Downloading https://ftp.ncbi.nlm.nih.gov/pubmed/baseline/pubmed24n0002.xml.gz to Downloads/project work\pubmed24n0002.xml.gz
Successfully downloaded Downloads/project work\pubmed24n0002.xml.gz
Downloading https://ftp.ncbi.nlm.nih.gov/pubmed/baseline/pubmed24n0002.xml.gz.md5 to Downloads/project work\pubmed24n0002.xml.gz.md5
Successfully downloaded Downloads/project work\pubmed24n0002.xml.gz.md5
Downloading https://ftp.ncbi.nlm.nih.gov/pubmed/baseline/pubmed24n0003.xml.gz to Downloads/project work\pubmed24n0003.xml.gz
Successfully downloaded Downloads/project work\pubmed24n0003.xml.gz
Downloading

In [None]:
!pip install xmltodict

Collecting xmltodict
  Downloading xmltodict-0.13.0-py2.py3-none-any.whl (10.0 kB)
Installing collected packages: xmltodict
Successfully installed xmltodict-0.13.0


only to extract abstract from a single zip to json converted file

In [None]:
import gzip
import xmltodict
import json

# Path to the .xml.gz file
gz_file_path = '/content/downloads/pubmed24n0012.xml.gz'  # Update this with the path to your .xml.gz file

# Function to extract abstract from JSON data
def extract_abstract(json_data):
    if isinstance(json_data, dict):
        for key, value in json_data.items():
            if key.lower() == 'abstract':
                if isinstance(value, dict) and 'AbstractText' in value:
                    return value['AbstractText']
                elif isinstance(value, list):
                    for item in value:
                        if isinstance(item, dict) and 'AbstractText' in item:
                            return item['AbstractText']
            else:
                result = extract_abstract(value)
                if result:
                    return result
    elif isinstance(json_data, list):
        for item in json_data:
            result = extract_abstract(item)
            if result:
                return result
    return None

# Read the .xml.gz file, convert to JSON, and extract abstract
try:
    with gzip.open(gz_file_path, 'rt', encoding='utf-8') as xml_file:
        xml_content = xml_file.read()
        xml_dict = xmltodict.parse(xml_content)
        json_content = json.dumps(xml_dict, indent=4)

        # Print the JSON content (optional)
        # print(json_content)

        # Extract abstract from JSON
        abstract = extract_abstract(xml_dict)
        if abstract:
            print(f'Abstract found in {gz_file_path}:')
            print(abstract)
        else:
            print(f'No abstract found in {gz_file_path}')

except Exception as e:
    print(f'Failed to process {gz_file_path}: {e}')


ModuleNotFoundError: No module named 'xmltodict'

code to extract abstract from all the zip files but only extract one abstract the first one not the all

In [None]:
import os
import gzip
import xmltodict

# Source directory containing .xml.gz files
source_dir = '/content/downloads'  # Update this with your source directory

# Destination directory to save extracted abstracts
destination_abstracts_dir = '/content/drive/MyDrive/abstract_text'  # Update this with your abstracts destination directory

# Create the destination directory if it doesn't exist
if not os.path.exists(destination_abstracts_dir):
    os.makedirs(destination_abstracts_dir)

# Function to extract abstract from JSON data
def extract_abstract(json_data):
    if isinstance(json_data, dict):
        for key, value in json_data.items():
            if key.lower() == 'abstract':
                if isinstance(value, dict) and 'AbstractText' in value:
                    return value['AbstractText']
                elif isinstance(value, list):
                    for item in value:
                        if isinstance(item, dict) and 'AbstractText' in item:
                            return item['AbstractText']
            else:
                result = extract_abstract(value)
                if result:
                    return result
    elif isinstance(json_data, list):
        for item in json_data:
            result = extract_abstract(item)
            if result:
                return result
    return None

# Function to process abstract text
def process_abstract_text(abstract_text):
    if isinstance(abstract_text, str):
        return abstract_text
    elif isinstance(abstract_text, list):
        processed_list = [process_abstract_text(item) for item in abstract_text]
        return ' '.join(filter(None, processed_list))
    elif isinstance(abstract_text, dict):
        return process_abstract_text(abstract_text.get('#text', ''))
    return ''

# Process each .xml.gz file in the source directory
for file_name in os.listdir(source_dir):
    if file_name.endswith('.xml.gz'):
        gz_file_path = os.path.join(source_dir, file_name)
        abstract_file_name = os.path.splitext(os.path.splitext(file_name)[0])[0] + '_abstract.txt'
        abstract_file_path = os.path.join(destination_abstracts_dir, abstract_file_name)

        try:
            with gzip.open(gz_file_path, 'rt', encoding='utf-8') as xml_file:
                xml_content = xml_file.read()
                xml_dict = xmltodict.parse(xml_content)

                # Extract abstract from JSON
                abstract = extract_abstract(xml_dict)
                if abstract:
                    abstract_text = process_abstract_text(abstract)
                    with open(abstract_file_path, 'w', encoding='utf-8') as abstract_file:
                        abstract_file.write(abstract_text)
                    print(f'Abstract extracted and saved for {gz_file_path}')
                else:
                    print(f'No abstract found in {gz_file_path}')

        except Exception as e:
            print(f'Failed to process {gz_file_path}: {e}')


Abstract extracted and saved for /content/downloads/pubmed24n0885.xml.gz
Abstract extracted and saved for /content/downloads/pubmed24n0397.xml.gz


KeyboardInterrupt: 

# To extract all the abstract from a file

In [None]:
import os
import gzip
import xmltodict

# Source directory containing .xml.gz files
source_dir = '/content/downloads'  # Update this with your source directory

# Destination directory to save extracted abstracts
destination_abstracts_dir = '/content/abstracts'  # Update this with your abstracts destination directory

# Create the destination directory if it doesn't exist
if not os.path.exists(destination_abstracts_dir):
    os.makedirs(destination_abstracts_dir)

# Function to extract abstracts from JSON data
def extract_abstracts(json_data):
    abstracts = []
    if isinstance(json_data, dict):
        for key, value in json_data.items():
            if key.lower() == 'abstract':
                if isinstance(value, dict) and 'AbstractText' in value:
                    abstracts.append(value['AbstractText'])
                elif isinstance(value, list):
                    for item in value:
                        if isinstance(item, dict) and 'AbstractText' in item:
                            abstracts.append(item['AbstractText'])
            else:
                abstracts.extend(extract_abstracts(value))
    elif isinstance(json_data, list):
        for item in json_data:
            abstracts.extend(extract_abstracts(item))
    return abstracts

# Function to process abstract text
def process_abstract_text(abstract_text):
    if isinstance(abstract_text, str):
        return abstract_text
    elif isinstance(abstract_text, list):
        processed_list = [process_abstract_text(item) for item in abstract_text]
        return ' '.join(filter(None, processed_list))
    elif isinstance(abstract_text, dict):
        return process_abstract_text(abstract_text.get('#text', ''))
    return ''

# Process each .xml.gz file in the source directory
for file_name in os.listdir(source_dir):
    if file_name.endswith('.xml.gz'):
        gz_file_path = os.path.join(source_dir, file_name)
        base_name = os.path.splitext(os.path.splitext(file_name)[0])[0]

        try:
            with gzip.open(gz_file_path, 'rt', encoding='utf-8') as xml_file:
                xml_content = xml_file.read()
                xml_dict = xmltodict.parse(xml_content)

                # Extract abstracts from JSON
                abstracts = extract_abstracts(xml_dict)
                for idx, abstract in enumerate(abstracts):
                    if abstract:
                        abstract_text = process_abstract_text(abstract)
                        abstract_file_name = f"{base_name}_abstract_{idx + 1}.txt"
                        abstract_file_path = os.path.join(destination_abstracts_dir, abstract_file_name)
                        with open(abstract_file_path, 'w', encoding='utf-8') as abstract_file:
                            abstract_file.write(abstract_text)
                        print(f'Abstract {idx + 1} extracted and saved for {gz_file_path}')
                    else:
                        print(f'No abstract found in {gz_file_path}')

        except Exception as e:
            print(f'Failed to process {gz_file_path}: {e}')


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Abstract 8416 extracted and saved for /content/downloads/pubmed24n0002.xml.gz
Abstract 8417 extracted and saved for /content/downloads/pubmed24n0002.xml.gz
Abstract 8418 extracted and saved for /content/downloads/pubmed24n0002.xml.gz
Abstract 8419 extracted and saved for /content/downloads/pubmed24n0002.xml.gz
Abstract 8420 extracted and saved for /content/downloads/pubmed24n0002.xml.gz
Abstract 8421 extracted and saved for /content/downloads/pubmed24n0002.xml.gz
Abstract 8422 extracted and saved for /content/downloads/pubmed24n0002.xml.gz
Abstract 8423 extracted and saved for /content/downloads/pubmed24n0002.xml.gz
Abstract 8424 extracted and saved for /content/downloads/pubmed24n0002.xml.gz
Abstract 8425 extracted and saved for /content/downloads/pubmed24n0002.xml.gz
Abstract 8426 extracted and saved for /content/downloads/pubmed24n0002.xml.gz
Abstract 8427 extracted and saved for /content/downloads/pubmed24n0002.xml.gz

To further extract the disseasse and symptoms

In [None]:
import spacy
import re

# Load spaCy model
nlp = spacy.load("en_core_web_sm")

# Define patterns for diseases and symptoms (adjust these as needed)
disease_patterns = [r"(disease|illness|condition)\s+([A-Za-z]+)",
                    r"([A-Za-z]+)\s+fever|([A-Za-z]+)\s+cancer|([A-Za-z]+)\s+syndrome"]
symptom_patterns = [r"([A-Za-z]+)\s+pain|headache|fever|nausea|cough|rash"]

def extract_entities(text):
  """
  This function extracts disease and symptom mentions from text using patterns and spaCy.

  Args:
      text: The text to process.

  Returns:
      A dictionary containing lists of disease and symptom mentions.
  """
  entities = {"diseases": [], "symptoms": []}
  for pattern in disease_patterns:
    matches = re.findall(pattern, text, re.IGNORECASE)
    for match in matches:
      entities["diseases"].append(match[0] if len(match) == 1 else match[1])

  for pattern in symptom_patterns:
    matches = re.findall(pattern, text, re.IGNORECASE)
    for match in matches:
      entities["symptoms"].append(match)

  # Optional: Use spaCy for named entity recognition (might improve results)
  doc = nlp(text)
  for ent in doc.ents:
    if ent.label_ in ("NOUN", "PROPN"):  # Match broader entity types
      entities["diseases"].append(ent.text)

  return entities

# Example usage
text = "I have a disease headache and a symptom fever, could it be the flu?"
entities = extract_entities(text)

print("Diseases:", entities["diseases"])
print("Symptoms:", entities["symptoms"])


Diseases: ['headache', '']
Symptoms: ['', '']
