In [6]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import random
from concurrent.futures import ThreadPoolExecutor, as_completed
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.options import Options
from datetime import datetime

# To store data
data = []

# Function to scrape Healthline with pagination
def scrape_healthline():
    print("Scraping Healthline...")
    base_url = "https://www.healthline.com/health-news"
    options = Options()
    options.add_argument("--headless")
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)
    
    page = 1
    articles_count = 0
    while True:
        url = f"{base_url}?page={page}"
        driver.get(url)
        time.sleep(2)
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        articles = soup.find_all('a', class_='css-j5n0jw.css-1w777ku')
        
        if not articles:
            break
        
        for article in articles:
            title = article.get_text(strip=True)
            link = article['href']
            data.append({
                'Source': 'Healthline',
                'Title': title,
                'Link': f'https://www.healthline.com{link}',
                'Category': 'Health News'
            })
            articles_count += 1
        
        page += 1
    
    driver.quit()
    print(f"Healthline: Collected {articles_count} articles ✅")

# Function to scrape WebMD with pagination
def scrape_webmd():
    print("Scraping WebMD...")
    base_url = "https://www.webmd.com/news/default.htm"
    options = Options()
    options.add_argument("--headless")
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)
    
    driver.get(base_url)
    time.sleep(2)
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    articles = soup.find_all('a', class_='article-title')
    
    articles_count = 0
    for article in articles:
        title = article.get_text(strip=True)
        link = article['href']
        data.append({
            'Source': 'WebMD',
            'Title': title,
            'Link': link if link.startswith('http') else f'https://www.webmd.com{link}',
            'Category': 'Health News'
        })
        articles_count += 1
    
    driver.quit()
    print(f"WebMD: Collected {articles_count} articles ✅")

# Function to scrape Medical News Today with pagination
def scrape_medical_news_today():
    print("Scraping Medical News Today...")
    base_url = "https://www.medicalnewstoday.com"
    page = 1
    articles_count = 0
    
    while True:
        url = f"{base_url}/news?page={page}"
        response = requests.get(url)
        if response.status_code != 200:
            break
        
        soup = BeautifulSoup(response.text, 'html.parser')
        articles = soup.find_all('a', class_='css-1wnwphl')
        if not articles:
            break
        
        for article in articles:
            title = article.get_text(strip=True)
            link = article['href']
            data.append({
                'Source': 'Medical News Today',
                'Title': title,
                'Link': f"{base_url}{link}" if link.startswith('/') else link,
                'Category': 'Health News'
            })
            articles_count += 1
        
        page += 1
        time.sleep(random.uniform(1, 3))  # Avoid rate limiting
    
    print(f"Medical News Today: Collected {articles_count} articles ✅")

# Function to fetch PubMed articles with error handling
def fetch_pubmed_article(pmid):
    retries = 5
    backoff = 1
    
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36'
    }

    for attempt in range(retries):
        try:
            url = f"https://pubmed.ncbi.nlm.nih.gov/{pmid}/"
            response = requests.get(url, headers=headers)
            if response.status_code == 429:
                print(f"🔴 Rate limit hit for ID {pmid}. Retrying in {backoff} seconds...")
                time.sleep(backoff)
                backoff *= 2
                continue
            elif response.status_code != 200:
                raise Exception(f"Invalid Response Code: {response.status_code}")

            soup = BeautifulSoup(response.text, 'html.parser')
            title = soup.find('h1', class_='heading-title').get_text(strip=True) if soup.find('h1', class_='heading-title') else 'No Title'
            data.append({
                'Source': 'PubMed',
                'Title': title,
                'Link': url,
                'Category': 'Research'
            })
            break
        
        except Exception as e:
            if attempt == retries - 1:
                print(f"❌ Error fetching PubMed data for ID {pmid}: {e}")
            time.sleep(random.uniform(0.5, 2))

# Function to scrape PubMed using multithreading
def scrape_pubmed():
    print("Scraping PubMed...")
    start_id = 40090000
    end_id = start_id + 10000

    with ThreadPoolExecutor(max_workers=10) as executor:
        futures = [executor.submit(fetch_pubmed_article, pmid) for pmid in range(start_id, end_id)]
        for future in as_completed(futures):
            future.result()

# Save data to CSV with timestamp
def save_to_csv():
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    filename = f"healthcare_data_{timestamp}.csv"
    df = pd.DataFrame(data)
    
    try:
        df.to_csv(filename, index=False)
        print(f"\n✅ Data saved to '{filename}' ✅")
    except PermissionError:
        print("\n❌ File is open. Close the file and try again.")
    except Exception as e:
        print(f"\n❌ Failed to save data: {e}")

# Start scraping with parallel processing
def main():
    start_time = time.time()

    with ThreadPoolExecutor(max_workers=4) as executor:
        futures = [
            executor.submit(scrape_healthline),
            executor.submit(scrape_webmd),
            executor.submit(scrape_medical_news_today),
            executor.submit(scrape_pubmed)
        ]
        for future in as_completed(futures):
            future.result()

    save_to_csv()
    
    total_time = time.time() - start_time
    print(f"\n✅ Total records collected: {len(data)}")
    print(f"⏳ Total time taken: {total_time:.2f} seconds")

if __name__ == "__main__":
    main()


Scraping Healthline...
Scraping WebMD...
Scraping Medical News Today...
Scraping PubMed...
Medical News Today: Collected 0 articles ✅
🔴 Rate limit hit for ID 40090031. Retrying in 1 seconds...
🔴 Rate limit hit for ID 40090030. Retrying in 1 seconds...
🔴 Rate limit hit for ID 40090032. Retrying in 1 seconds...
🔴 Rate limit hit for ID 40090033. Retrying in 1 seconds...
🔴 Rate limit hit for ID 40090034. Retrying in 1 seconds...
🔴 Rate limit hit for ID 40090035. Retrying in 1 seconds...
🔴 Rate limit hit for ID 40090037. Retrying in 1 seconds...
🔴 Rate limit hit for ID 40090036. Retrying in 1 seconds...
🔴 Rate limit hit for ID 40090038. Retrying in 1 seconds...
🔴 Rate limit hit for ID 40090039. Retrying in 1 seconds...
🔴 Rate limit hit for ID 40090030. Retrying in 2 seconds...
🔴 Rate limit hit for ID 40090031. Retrying in 2 seconds...
🔴 Rate limit hit for ID 40090032. Retrying in 2 seconds...
🔴 Rate limit hit for ID 40090034. Retrying in 2 seconds...
🔴 Rate limit hit for ID 40090033. Retryi