In [3]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import sqlite3
from concurrent.futures import ThreadPoolExecutor

# Path of the SQLite file opened by save_to_sqlite() and perform_sql_analysis().
DATABASE = "drug_data.db"
# max_workers passed to each ThreadPoolExecutor in scrape_drugs_parallel().
NUM_THREADS = 10  # Adjust based on system capability

# Step 1: Scraping Drug Data
def fetch_page(letter, timeout=10):
    """Return up to 50 drug-page URLs listed on the index page for *letter*.

    Requests ``drug_information.html?letter=<letter>`` on drugs.com and
    collects the ``href`` of every link inside the first
    ``<ul class="ddc-list-column-4">`` element.

    Args:
        letter: Value sent as the ``letter`` query parameter.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block its worker thread
            indefinitely.

    Returns:
        A list of URL strings. The list is empty when the request fails,
        the server answers with an error status, or the page has no
        matching list element.
    """
    base_url = "https://www.drugs.com/drug_information.html"
    letter_url = f"{base_url}?letter={letter}"
    try:
        response = requests.get(letter_url, timeout=timeout)
        response.raise_for_status()
    except requests.RequestException as exc:
        # One unreachable index page should not abort the whole scrape.
        print(f"Skipping letter {letter!r}: {exc}")
        return []

    soup = BeautifulSoup(response.content, "html.parser")
    drug_list_section = soup.find("ul", {"class": "ddc-list-column-4"})
    if drug_list_section is None:
        return []

    # Only the first 50 links per letter are kept.
    anchors = drug_list_section.find_all("a", href=True)[:50]
    # NOTE(review): assumes every href is a site-relative path ("/...").
    return [f"https://www.drugs.com{anchor['href']}" for anchor in anchors]

def fetch_drug_data(drug_url, timeout=10):
    """Fetch one drug page and extract its name, uses and side effects.

    Args:
        drug_url: Absolute URL of the drug page.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A dict with the keys ``"Drug"``, ``"Uses"`` and ``"SideEffect"``.
        Any field that cannot be found on the page is ``"N/A"``. When the
        request itself fails, all three fields are ``"N/A"`` so that one bad
        page does not abort the surrounding parallel scrape.
    """
    try:
        response = requests.get(drug_url, timeout=timeout)
        response.raise_for_status()
    except requests.RequestException as exc:
        print(f"Skipping {drug_url}: {exc}")
        return {"Drug": "N/A", "Uses": "N/A", "SideEffect": "N/A"}

    soup = BeautifulSoup(response.content, "html.parser")

    # Drug name: text of the first <h1>.
    heading = soup.find("h1")
    drug_name = heading.text.strip() if heading is not None else "N/A"

    # Uses: first <p> following the <h2> whose text contains "What is".
    uses_tag = soup.find("h2", string=lambda text: text and "What is" in text)
    uses_paragraph = uses_tag.find_next("p") if uses_tag is not None else None
    uses = uses_paragraph.text.strip() if uses_paragraph is not None else "N/A"

    # Side effects: text of every sibling element after <h2 id="side-effects">
    # up to (not including) the next <h2>.
    fragments = []
    side_effects_tag = soup.find("h2", id="side-effects")
    if side_effects_tag is not None:
        for element in side_effects_tag.find_next_siblings():
            if element.name == "h2":
                break
            text = element.text.strip()
            # Skip elements without visible text; they would otherwise add
            # empty ", , " fragments to the joined string.
            if text:
                fragments.append(text)
    side_effects = ", ".join(fragments) if fragments else "N/A"

    return {"Drug": drug_name, "Uses": uses, "SideEffect": side_effects}

def scrape_drugs_parallel():
    """Scrape drug pages for every letter A-Z using thread pools.

    Runs in two phases, each on a pool of ``NUM_THREADS`` workers: first
    ``fetch_page`` collects the drug links for each letter, then
    ``fetch_drug_data`` downloads every collected link.

    Returns:
        A DataFrame with the columns ``Drug``, ``Uses`` and ``SideEffect``,
        one row per fetched link. The columns are present even when no
        links were found, so downstream column access does not fail.
    """
    letters = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"

    # Phase 1: fetch the per-letter link lists in parallel.
    with ThreadPoolExecutor(max_workers=NUM_THREADS) as executor:
        links_per_letter = list(executor.map(fetch_page, letters))

    # Flatten, dropping repeated links while keeping first-seen order so the
    # same page is not downloaded twice.
    all_drug_links = list(
        dict.fromkeys(link for links in links_per_letter for link in links)
    )

    # Phase 2: fetch the drug details in parallel.
    with ThreadPoolExecutor(max_workers=NUM_THREADS) as executor:
        drugs_data = list(executor.map(fetch_drug_data, all_drug_links))

    return pd.DataFrame(drugs_data, columns=["Drug", "Uses", "SideEffect"])

# Step 2: Cleaning Data
def clean_data(df):
    """Return a cleaned copy of the scraped drug table.

    The input frame is left untouched; all changes are made on a copy.

    Steps:
        * ``Drug``: remove every character that is not a letter, digit or
          whitespace, then trim surrounding whitespace.
        * ``Uses`` / ``SideEffect``: lower-case the text. Missing
          ``SideEffect`` values are treated as ``"N/A"``.
        * ``SideEffectCount``: number of non-empty ``", "``-separated
          fragments in ``SideEffect``; 0 when the value is ``"n/a"``.
        * Drop rows whose cleaned ``Drug`` repeats an earlier row.

    Args:
        df: DataFrame with the columns ``Drug``, ``Uses`` and ``SideEffect``.

    Returns:
        A new DataFrame with the cleaned columns plus ``SideEffectCount``.
    """
    cleaned = df.copy()
    cleaned["Drug"] = (
        cleaned["Drug"].str.replace(r"[^a-zA-Z0-9\s]", "", regex=True).str.strip()
    )
    cleaned["Uses"] = cleaned["Uses"].str.lower()
    # A missing value would make the fragment count below fail on a
    # non-string, so map it to the same sentinel the scraper uses.
    cleaned["SideEffect"] = cleaned["SideEffect"].fillna("N/A").str.lower()

    def count_fragments(text):
        if text == "n/a":
            return 0
        # Ignore empty fragments so "" or "a, , b" are not over-counted.
        return sum(1 for fragment in text.split(", ") if fragment.strip())

    cleaned["SideEffectCount"] = cleaned["SideEffect"].apply(count_fragments)
    return cleaned.drop_duplicates(subset=["Drug"])

# Step 3: SQL Analysis
def save_to_sqlite(df):
    """Write *df* to the ``drugs`` table of the SQLite file ``DATABASE``.

    Any existing ``drugs`` table is replaced and the DataFrame index is not
    stored.

    Args:
        df: DataFrame to persist.
    """
    conn = sqlite3.connect(DATABASE)
    try:
        df.to_sql("drugs", conn, if_exists="replace", index=False)
    finally:
        # Close the connection even when the write fails.
        conn.close()

def perform_sql_analysis():
    """Run two summary queries against the ``drugs`` table and print them.

    1. The five drugs whose ``SideEffect`` text contains ``severe`` most
       often. Rows are ranked by how many characters disappear when every
       ``severe`` is removed, which is proportional to the occurrence count.
    2. The average of ``SideEffectCount`` over all rows.

    The connection to ``DATABASE`` is closed even if a query raises.
    """
    conn = sqlite3.connect(DATABASE)
    try:
        # Top 5 Drugs with the Highest Mentions of "Severe" in Side Effects
        severe_query = """
    SELECT Drug, SideEffect
    FROM drugs
    WHERE SideEffect LIKE '%severe%'
    ORDER BY LENGTH(SideEffect) - LENGTH(REPLACE(SideEffect, 'severe', '')) DESC
    LIMIT 5;
    """
        top_severe = pd.read_sql_query(severe_query, conn)
        print("\nTop 5 Drugs with 'Severe' in Side Effects:")
        print(top_severe)

        # Average Number of Side Effects per Drug
        avg_side_effect_query = "SELECT AVG(SideEffectCount) AS AverageSideEffects FROM drugs;"
        avg_side_effects = pd.read_sql_query(avg_side_effect_query, conn)
        average = avg_side_effects["AverageSideEffects"].iloc[0]
        if pd.isna(average):
            # AVG() yields NULL for an empty table; formatting that with
            # ":.2f" would raise TypeError.
            print("\nAverage Number of Side Effects per Drug: N/A (no data)")
        else:
            print(f"\nAverage Number of Side Effects per Drug: {average:.2f}")
    finally:
        conn.close()

# Main Execution
# Run the full pipeline when executed as a script: scrape -> clean -> store -> analyse.
if __name__ == "__main__":
    print("Scraping drug data...")
    # Downloads the drug pages and returns them as a DataFrame.
    raw_data = scrape_drugs_parallel()
    print("Cleaning drug data...")
    # Normalises the text columns, adds SideEffectCount and drops duplicate drugs.
    cleaned_data = clean_data(raw_data)
    print("Saving data to SQLite database...")
    # Writes the cleaned table to the SQLite file named by DATABASE.
    save_to_sqlite(cleaned_data)
    print("Performing SQL analysis...")
    # Reads the stored table back and prints the summary queries.
    perform_sql_analysis()


Scraping drug data...
Cleaning drug data...
Saving data to SQLite database...
Performing SQL analysis...

Top 5 Drugs with 'Severe' in Side Effects:
            Drug                                         SideEffect
0       Keytruda  the most common side effects of keytruda when ...
1  Ciprofloxacin  get emergency medical help if you have signs o...
2    Doxycycline  common doxycycline side effects, the most comm...
3      Imbruvica  common side effects of imbruvica, common imbru...
4        Januvia  get emergency medical help if you have signs o...

Average Number of Side Effects per Drug: 42.06
