In [27]:
import sys
import os
import logging

# Log fayl manzili
log_path = r"C:\Users\Rasulbek907\Desktop\Project_MP\Log\data_loader.log"

# Log fayli mavjud papka borligini tekshirish
os.makedirs(os.path.dirname(log_path), exist_ok=True)

# Log sozlamalari
logging.basicConfig(
    filename=log_path,
    filemode='a',
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO
)


| Drug        | Brand name | Species | Use        | Notes       |
| ----------- | ---------- | ------- | ---------- | ----------- |
| Amoxicillin | Clamoxyl   | Dog     | Antibiotic | Common use  |
| Ketamine    | Ketavet    | Cat     | Anesthetic | Widely used |
| ...         | ...        | ...     | ...        | ...         |


In [None]:
#!/usr/bin/env python3
"""
Drugs.com — Veterinary section scraper (1000 ta sahifa limit)
"""

import time
import csv
import os
import random
import string
import sys
import logging
from urllib.parse import urljoin
import urllib.robotparser as robotparser

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service as ChromeService
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup

# --------------- Configuration ---------------
BASE = "https://www.drugs.com"
VET_BASE = "https://www.drugs.com/vet/"
OUTPUT_CSV = "drugs_com_vet_1000.csv"
CHECKPOINT = "drugs_com_vet_progress.txt"  # store visited URLs
USER_AGENTS = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 13_4) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.4 Safari/605.1.15",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
]
DELAY_MIN = 0.5
DELAY_MAX = 1.5
MAX_ITEMS = 1000  # limit to 1000 items
HEADLESS = True

SPECIES_PAGES = [
    ("dogs", "/vet/dogs-a.html"),
    ("cats", "/vet/cats-a.html"),
    ("horses", "/vet/horses-a.html"),
    ("dairy-cattle", "/vet/dairy-cattle-a.html"),
    ("beef-cattle", "/vet/beef-cattle-a.html"),
    ("poultry", "/vet/poultry-a.html"),
    ("swine", "/vet/swine-a.html"),
    ("goats", "/vet/goats-a.html"),
    ("sheep", "/vet/sheep-a.html"),
    ("exotic", "/vet/exotic-a.html"),
]

# --------------- Logging ---------------
logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s: %(message)s")
logger = logging.getLogger("drugscom_vet_scraper")

# The rest of the code remains the same, only MAX_ITEMS changed to 1000
# ... [rest of the scraping code stays identical to previous version]


# --------------- Helper functions ---------------

def obeys_robots(target_url, user_agent="*"):
    """Check robots.txt for allowed access to target_url."""
    rp = robotparser.RobotFileParser()
    rp.set_url(urljoin(BASE, "/robots.txt"))
    try:
        rp.read()
    except Exception as e:
        logger.warning("Could not read robots.txt: %s", e)
        return False
    return rp.can_fetch(user_agent, target_url)


def random_delay():
    time.sleep(random.uniform(DELAY_MIN, DELAY_MAX))


def ensure_output():
    header = ["Drug", "Brand name", "Species", "Use", "Notes", "Source URL"]
    if not os.path.exists(OUTPUT_CSV):
        with open(OUTPUT_CSV, "w", newline='', encoding="utf-8") as f:
            writer = csv.writer(f)
            writer.writerow(header)


def load_checkpoint():
    seen = set()
    if os.path.exists(CHECKPOINT):
        with open(CHECKPOINT, "r", encoding="utf-8") as f:
            for line in f:
                seen.add(line.strip())
    return seen


def save_checkpoint(url):
    with open(CHECKPOINT, "a", encoding="utf-8") as f:
        f.write(url + "\n")


# --------------- Selenium setup ---------------

def make_driver(headless=True, user_agent=None):
    opts = Options()
    if headless:
        opts.add_argument("--headless=new")
    opts.add_argument("--disable-gpu")
    opts.add_argument("--no-sandbox")
    opts.add_argument("--window-size=1200,900")
    if user_agent:
        opts.add_argument(f"--user-agent={user_agent}")
    # optional: reduce image loading to save bandwidth
    prefs = {"profile.managed_default_content_settings.images": 2}
    opts.add_experimental_option("prefs", prefs)

    driver = webdriver.Chrome(service=ChromeService(ChromeDriverManager().install()), options=opts)
    driver.set_page_load_timeout(60)
    return driver


# --------------- Parsing helpers ---------------

def parse_index_page(html, species_name):
    """Parse an AZ index page to extract product links.
    Drugs.com index pages list product names as <ul>/<li> or <div class="contentBox"> links.
    Selector logic may need adjustment if website changes.
    Returns list of absolute URLs.
    """
    soup = BeautifulSoup(html, "html.parser")
    links = []

    # Try common containers — this is resilient but may need tuning
    for a in soup.select("#content a, .contentBox a, .azindex a"):
        href = a.get('href')
        if not href:
            continue
        # skip anchors and javascript
        if href.startswith('#') or href.startswith('javascript:'):
            continue
        # product pages tend to contain '/vet/' and end with '.html' or product slug
        if '/vet/' in href or '/drug/' in href or href.startswith('/vet'):
            full = urljoin(BASE, href)
            links.append(full)

    # deduplicate
    return list(dict.fromkeys(links))


def parse_product_page(html, source_url, species_hint=None):
    """Extract Drug, Brand name, Species, Use, Notes from a product page HTML."""
    soup = BeautifulSoup(html, "html.parser")
    text = soup.get_text(separator=" ", strip=True)

    # Drug name: try <h1>
    drug = None
    h1 = soup.find('h1')
    if h1:
        drug = h1.get_text(strip=True)

    # Brand names: search for 'Brand names' or small descriptors
    brand = None
    brand_label = soup.find(string=lambda s: s and 'Brand name' in s or s and 'Brand names' in s)
    if brand_label:
        # parent paragraph likely contains the content
        parent = brand_label.parent
        brand = parent.get_text(separator=' ', strip=True).replace('Brand names:', '').replace('Brand name:', '').strip()

    # Species: some pages include a species header or list. Fall back to species_hint
    species = species_hint or ''
    species_candidates = []
    # try to locate an info box
    for strong in soup.select('strong'):
        txt = strong.get_text(strip=True)
        if 'Species' in txt:
            # grab next sibling text
            nxt = strong.next_sibling
            if nxt:
                species_candidates.append(str(nxt).strip())
    if not species_candidates:
        # heuristic: look for known species words
        for word in ['Dog', 'Cat', 'Horse', 'Cattle', 'Sheep', 'Goat', 'Pig', 'Poultry', 'Bird', 'Rabbit']:
            if word.lower() in text.lower():
                species_candidates.append(word)
    if species_candidates:
        species = ", ".join(dict.fromkeys([s for s in species_candidates if s]))

    # Use (indication) — try headings
    use = None
    for header_text in ['Uses', 'Indications', 'Indicated for', 'Indication']:
        h = soup.find(lambda tag: tag.name in ['h2', 'h3'] and tag.get_text(strip=True).startswith(header_text))
        if h:
            p = h.find_next('p')
            if p:
                use = p.get_text(separator=' ', strip=True)
                break
    if not use:
        # fallback: look for 'used to' pattern
        idx = text.lower().find('used to')
        if idx != -1:
            use = text[idx: idx+200]

    # Notes / side effects / comments
    notes = None
    for header_text in ['Side effects', 'Adverse reactions', 'Notes', 'Cautions', 'Warnings']:
        h = soup.find(lambda tag: tag.name in ['h2', 'h3'] and header_text.lower() in tag.get_text(strip=True).lower())
        if h:
            p = h.find_next('p')
            if p:
                notes = p.get_text(separator=' ', strip=True)
                break

    return {
        'Drug': drug,
        'Brand name': brand,
        'Species': species,
        'Use': use,
        'Notes': notes,
        'Source URL': source_url,
    }


# --------------- Main crawl logic ---------------

def main():
    logger.info('Starting Drugs.com veterinary scraper')

    # quick robots.txt check on the base section
    if not obeys_robots(VET_BASE, user_agent='*'):
        logger.error('robots.txt disallows scraping %s — aborting. Check site policy.', VET_BASE)
        sys.exit(1)

    ensure_output()
    seen = load_checkpoint()
    logger.info('Already-seen URLs loaded: %d', len(seen))

    driver = make_driver(headless=HEADLESS, user_agent=random.choice(USER_AGENTS))

    total_saved = 0
    try:
        for species_key, species_path in SPECIES_PAGES:
            # iterate A-Z index letters — drugs.com uses paths like /vet/dogs-a.html .. -z
            for letter in list(string.ascii_lowercase):
                idx_url = urljoin(BASE, species_path.replace('-a', f'-{letter}'))
                logger.info('Fetching index: %s (species: %s)', idx_url, species_key)

                if not obeys_robots(idx_url, user_agent='*'):
                    logger.warning('Blocked by robots.txt: %s - skipping', idx_url)
                    continue

                try:
                    driver.get(idx_url)
                except Exception as e:
                    logger.warning('Failed to load %s: %s', idx_url, e)
                    continue

                random_delay()

                links = parse_index_page(driver.page_source, species_key)
                logger.info('Found %d candidate links on index %s', len(links), idx_url)

                for link in links:
                    if total_saved >= MAX_ITEMS:
                        logger.info('Reached MAX_ITEMS=%d - finishing', MAX_ITEMS)
                        return
                    if link in seen:
                        continue

                    if not obeys_robots(link, user_agent='*'):
                        logger.debug('Link disallowed by robots: %s', link)
                        seen.add(link)
                        save_checkpoint(link)
                        continue

                    # fetch product page
                    try:
                        driver.get(link)
                    except Exception as e:
                        logger.warning('Failed to open product %s: %s', link, e)
                        seen.add(link)
                        save_checkpoint(link)
                        continue

                    random_delay()

                    item = parse_product_page(driver.page_source, link, species_hint=species_key.capitalize())

                    # write to CSV
                    with open(OUTPUT_CSV, 'a', newline='', encoding='utf-8') as f:
                        writer = csv.writer(f)
                        writer.writerow([item.get('Drug'), item.get('Brand name'), item.get('Species'), item.get('Use'), item.get('Notes'), item.get('Source URL')])

                    seen.add(link)
                    save_checkpoint(link)
                    total_saved += 1

                    logger.info('[%d] Saved: %s', total_saved, item.get('Drug'))

                    # small backoff
                    random_delay()

    finally:
        driver.quit()
        logger.info('Driver closed. Total saved: %d', total_saved)


if __name__ == '__main__':
    main()


2025-11-06 14:11:41,563 INFO: Starting Drugs.com veterinary scraper


2025-11-06 14:11:42,039 INFO: Already-seen URLs loaded: 0
2025-11-06 14:11:43,153 INFO: Get LATEST chromedriver version for google-chrome
2025-11-06 14:11:43,854 INFO: Get LATEST chromedriver version for google-chrome
2025-11-06 14:11:44,571 INFO: Driver [C:\Users\Rasulbek907\.wdm\drivers\chromedriver\win64\142.0.7444.61\chromedriver-win32/chromedriver.exe] found in cache
2025-11-06 14:11:45,725 INFO: Fetching index: https://www.drugs.com/vet/dogs-a.html (species: dogs)
2025-11-06 14:11:49,628 INFO: Found 327 candidate links on index https://www.drugs.com/vet/dogs-a.html
2025-11-06 14:11:54,620 INFO: [1] Saved: Veterinary Products
2025-11-06 14:12:00,919 INFO: [2] Saved: A O E All Purpose Deodorizing Pet Wipes
2025-11-06 14:12:08,433 INFO: [3] Saved: A O E All Purpose Deodorizing Pet Wipes (Canada)
2025-11-06 14:12:13,970 INFO: [4] Saved: A O E, Animal Odor Eliminator
2025-11-06 14:12:19,019 INFO: [5] Saved: A O E, Animal Odor Eliminator (Canada)
2025-11-06 14:24:14,284 INFO: [6] Saved