In [2]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from concurrent.futures import ThreadPoolExecutor
import re
import random
import time

# Function to extract abstract from a research paper link
def extract_abstract(link):
    try:
        response = requests.get(link, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "html.parser")
        abstract = soup.find("div", class_="abstract-content selected")
        return abstract.get_text(strip=True) if abstract else None
    except requests.RequestException:
        return None

# Function to clean text by removing non-alphanumeric characters except years (19xx, 20xx)
def clean_text(text):
    text = re.sub(r'\b(?!19\d{2}\b)(?!20\d{2}\b)\d+\b', '', text)  # Remove numbers except for years
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)  # Remove non-alphanumeric characters
    return text

# Function to partition text into 150-word segments
def partition_text(text, words_per_partition=150):
    words = text.split()
    partitions = []
    
    if len(words) < words_per_partition:
        return []  # Ignore abstracts that are too short

    for i in range(0, len(words) - words_per_partition + 1, words_per_partition // 2):
        partition = " ".join(words[i:i + words_per_partition])
        if len(partition.split()) == words_per_partition:
            partitions.append(partition)

    return partitions

# Function to scrape PubMed abstracts for a given query
def scrape_pubmed(query, max_results=1500):
    base_url = "https://pubmed.ncbi.nlm.nih.gov"
    query_url = f"{base_url}/?term={query.replace(' ', '+')}&size=200"
    abstracts = []
    
    total_pages = (max_results // 200) + (1 if max_results % 200 else 0)

    for page_num in range(total_pages):
        url = f"{query_url}&page={page_num+1}"
        try:
            response = requests.get(url, timeout=10)
            response.raise_for_status()
            soup = BeautifulSoup(response.text, "html.parser")

            results = soup.find_all("article", class_="full-docsum")
            links = [base_url + article.find("a", class_="docsum-title")["href"] for article in results]

            if not links:  # If a page has no results, retry
                continue  

            with ThreadPoolExecutor(max_workers=10) as executor:
                for abstract in executor.map(extract_abstract, links):
                    if abstract:
                        abstracts.append(clean_text(abstract))
                    
                    if len(abstracts) >= max_results:
                        return abstracts  # Stop fetching once limit is reached

            time.sleep(2)  # Prevent overwhelming the server
        except requests.RequestException:
            continue  # Skip if request fails

    return abstracts

# Define categories and queries
categories = {
    "Clinical_Depression": "clinical depression",
    "Bipolar_Disorder": "bipolar disorder",
    "Anxiety_Disorder": "anxiety disorder",
    "PTSD": "post-traumatic stress disorder",
    "Schizophrenia": "schizophrenia"
}

scraped_data = []

for label, category in categories.items():
    print(f"Scraping abstracts for: {label}")

    abstracts = scrape_pubmed(category, max_results=1500)  # Fetch more abstracts
    
    partitions = []
    for abstract in abstracts:
        partitions.extend(partition_text(abstract, words_per_partition=150))
        if len(partitions) >= 200:
            break  

    # Ensure exactly 200 partitions per category
    while len(partitions) < 200:
        partitions.append(random.choice(partitions))  # Duplicate random samples if needed

    partitions = partitions[:200]

    for partition in partitions:
        scraped_data.append({"Label": label, "Abstract": partition})

# Save to CSV
df_scraped = pd.DataFrame(scraped_data, columns=["Label", "Abstract"])
df_scraped.to_csv("scraped_pubmed_abstract_1000.csv", index=False)

print("Scraped data saved to 'scraped_pubmed_abstract_1000.csv'.")
print(df_scraped['Label'].value_counts())  # Verify category balance
print(df_scraped.head(5))


Scraping abstracts for: Clinical_Depression
Scraping abstracts for: Bipolar_Disorder
Scraping abstracts for: Anxiety_Disorder
Scraping abstracts for: PTSD
Scraping abstracts for: Schizophrenia
Scraped data saved to 'scraped_pubmed_abstract_1000.csv'.
Label
Clinical_Depression    200
Bipolar_Disorder       200
Anxiety_Disorder       200
PTSD                   200
Schizophrenia          200
Name: count, dtype: int64
                 Label                                           Abstract
0  Clinical_Depression  ObjectiveAdolescence is a formative and turbul...
1  Clinical_Depression  Subgroup analysis were performed for year of p...
2  Clinical_Depression  Major depressive disorder MDD is considered a ...
3  Clinical_Depression  BackgroundBurnout and clinical depression have...
4  Clinical_Depression  The interaction of physical and mental vulnera...
