In [1]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Clutch.co crawler (reviews + social links) with anti-bot mitigation.

- curl_cffi Session + HTTP/2
- Rotate impersonate (chrome141 -> chrome120 -> safari17_0)
- Retry with exponential backoff + jitter on 403/429
- Random sleep between requests
- Graceful skip when blocked
- CLI: --start-url, --limit, --out, --checkpoint, --workers, --flush-every, --last_page

Usage:
    python crawl_clutch.py --limit 20 --out clutch_reviews.csv
"""

from __future__ import annotations

import argparse
import random
import sys
import time
from typing import Optional, List
from urllib.parse import urljoin
from sklearn.model_selection import train_test_split
import pandas as pd
from bs4 import BeautifulSoup
from pathlib import Path
from typing import Tuple, List, Dict, Any
import shutil, random, tempfile, json
try:
    from tqdm import tqdm
except Exception:  # pragma: no cover
    # Fallback when tqdm is not installed: a no-op stand-in that returns the
    # iterable unchanged and ignores every keyword argument (total, desc, ...).
    def tqdm(x, **kwargs):
        return x
from concurrent.futures import ThreadPoolExecutor, as_completed
from curl_cffi import requests as creq
import fcntl
import shutil, os


# =========================
# Default configuration
# =========================
DEFAULT_START_URL = "https://clutch.co/it-services"

# Do not set User-Agent manually when using impersonate
COMMON_HEADERS = {
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Accept-Language": "en-US,en;q=0.9",
    "Referer": "https://clutch.co/",
    "Upgrade-Insecure-Requests": "1",
}

# Order in which impersonate targets are tried when the site blocks us
IMP_CHOICES = ["chrome141", "chrome120", "safari17_0"]

# Random sleep range between requests (seconds)
# NOTE: both names are reassigned (to 0.5 and 1.5) further down in this file,
# so these initial values are not the ones in effect once the module has loaded.
SLEEP_MIN = 6.5
SLEEP_MAX = 12.0

# =========================
# Global session
# =========================
# One curl_cffi session shared by every request made through http_get.
SESS = creq.Session(
    headers=COMMON_HEADERS,
    timeout=30,
)


# =========================
# HTTP helper
# =========================
def http_get(url: str, max_retries: int = 4) -> Optional[creq.Response]:
    """
    GET ``url`` through the shared session with retry and backoff.

    The impersonate target is picked by attempt index from ``IMP_CHOICES``
    (the last entry is reused once the list is exhausted), so every retry
    after the first presents a different browser fingerprint.

    Args:
        url: Absolute URL to fetch.
        max_retries: Maximum number of attempts.

    Returns:
        The response when the status code is 200, otherwise ``None`` once all
        attempts have been used.
    """
    delay = 2.0
    final_attempt = max_retries - 1
    for i in range(max_retries):
        imp = IMP_CHOICES[min(i, len(IMP_CHOICES) - 1)]
        try:
            r = SESS.get(url, impersonate=imp, allow_redirects=True)
            code = r.status_code
        except Exception:
            # Deliberate best-effort: any transport failure is retried with
            # the same growing delay as a block, without jitter.
            wait = delay
            delay *= 1.8
        else:
            if code == 200:
                return r
            if code in (403, 429):
                # Blocked / rate limited -> backoff + jitter before retrying.
                wait = delay + random.uniform(0.5, 1.8)
                delay *= 1.8
            else:
                # Any other status: short fixed pause, then try again.
                wait = 1.0
        # Sleeping after the last attempt would only delay the None result.
        if i < final_attempt:
            time.sleep(wait)
    return None


# =========================
# Parsers
# =========================
def extract_review_data(soup: BeautifulSoup) -> dict:
    """
    Collect the project facts listed in a review's ``profile-review__data`` box.

    Each ``li.data--item`` contributes its visible text (fragments starting
    with "show more" are dropped). When the item carries a
    ``data-tooltip-content`` attribute, the tooltip's text becomes the key;
    items without a usable tooltip are gathered in a list under ``"unknown"``.

    Returns an empty dict when the data box is absent.
    """
    data_box = soup.find("div", class_="profile-review__data")
    if not data_box:
        return {}

    collected: dict = {}
    for item in data_box.select("ul.data--list > li.data--item"):
        raw_tooltip = item.get("data-tooltip-content", "")
        label = None
        if raw_tooltip:
            # The tooltip attribute holds HTML; keep only its text.
            label = BeautifulSoup(raw_tooltip, "html.parser").get_text(strip=True)

        fragments = []
        for chunk in item.stripped_strings:
            if chunk.lower().startswith("show more"):
                continue
            fragments.append(chunk.strip())
        if not fragments:
            continue

        value = " ".join(fragments)
        if label:
            collected[label] = value
        else:
            collected.setdefault("unknown", []).append(value)
    return collected


def extract_reviewer_info(soup: BeautifulSoup) -> dict:
    """
    Read the reviewer card (``profile-review__reviewer``) of a single review.

    Possible keys, in insertion order: ``reviewer_name``,
    ``reviewer_position_raw``, ``reviewer_role``, ``reviewer_company`` (only
    when the position text contains a comma), ``verified_status``, followed by
    one key per ``li.reviewer_list--item`` named after that item's tooltip text.

    Returns an empty dict when the reviewer card is absent.
    """
    card = soup.find("div", class_="profile-review__reviewer")
    if not card:
        return {}

    info: dict = {}

    # Reviewer name
    reviewer = card.find("div", class_="reviewer_card--name")
    if reviewer:
        info["reviewer_name"] = reviewer.get_text(strip=True)

    # Position, split at the first comma into role and company
    position = card.find("div", class_="reviewer_position")
    if position:
        raw = position.get_text(strip=True)
        info["reviewer_position_raw"] = raw
        role, comma, company = raw.partition(",")
        if comma:
            info["reviewer_role"] = role.strip()
            info["reviewer_company"] = company.strip()
        else:
            info["reviewer_role"] = raw

    # Verification badge
    badge = card.find(
        "span", class_="profile-review__reviewer-verification-badge-title"
    )
    if badge:
        info["verified_status"] = badge.get_text(strip=True)

    # Tooltip-labelled details (e.g. industry, location, client size, review type)
    for entry in card.select("ul.reviewer_list > li.reviewer_list--item"):
        tooltip = entry.get("data-tooltip-content", "")
        key = None
        if tooltip:
            key = BeautifulSoup(tooltip, "html.parser").get_text(strip=True)
        title = entry.find("span", class_="reviewer_list__details-title")
        val = title.get_text(strip=True) if title else None
        if key and val:
            info[key] = val

    return info


def extract_social_links(scope: BeautifulSoup) -> dict:
    """
    Gather the social-media links found inside ``scope``.

    ``scope`` is usually ``section#contact``; callers pass the whole page when
    that section is missing. Each link is stored under
    ``"<label> Company Outsource"``, where the lower-cased label comes from the
    anchor's ``data-type`` attribute or, failing that, its visible text.
    Anchors lacking a label or an ``href`` are skipped.
    """
    found: dict = {}
    anchors = scope.select(
        "div.profile-social-media__wrap a.profile-social-media__link"
    )
    for anchor in anchors:
        name = anchor.get("data-type") or anchor.get_text(strip=True)
        target = anchor.get("href")
        if not (name and target):
            continue
        found[f"{name.lower()} Company Outsource"] = target
    return found


# =========================
# Scrapers
# =========================
def get_base_url_company(start_url: str) -> List[str]:
    """
    Return the company profile URLs listed on one directory page.

    An empty list is returned (with a message on stderr) when the page cannot
    be fetched or when it contains no ``#providers__list`` element.
    """
    page = http_get(start_url)
    if not page:
        print("‚ùå Kh√¥ng l·∫•y ƒë∆∞·ª£c trang list (403/timeout).", file=sys.stderr)
        return []

    listing = BeautifulSoup(page.text, "html.parser").find(id="providers__list")
    if not listing:
        print("‚ùå Kh√¥ng t√¨m th·∫•y providers__list (c√≥ th·ªÉ b·ªã anti-bot).", file=sys.stderr)
        return []

    profile_urls: List[str] = []
    for entry in listing.find_all("li", class_="provider-list-item"):
        cta_box = entry.find("div", class_="provider__cta-container")
        if not cta_box:
            continue
        # The profile button is matched on its full class attribute string.
        link = cta_box.find(
            "a",
            class_="provider__cta-link sg-button-v2 sg-button-v2--secondary directory_profile",
        )
        if not link or not link.get("href"):
            continue
        # hrefs may be site-relative; resolve them against the site root.
        profile_urls.append(urljoin("https://clutch.co", link["href"]))
    return profile_urls


def get_detail_information(company_url: str) -> tuple[str,pd.DataFrame]:
    """
    Scrape the reviews and social links from one company profile page.

    Returns a ``(status, frame)`` pair:
      - ``"TIME_OUT"``    the page could not be fetched (empty frame)
      - ``"NO_REVIEW"``   no review wrapper or no review articles (empty frame)
      - ``"HAVE_REVIEW"`` one row per review, preferred columns first

    The company's social links are repeated on every review row.
    """
    page = http_get(company_url)
    if not page:
        print(f"‚ö†Ô∏è B·ªè qua (403/timeout): {company_url}", file=sys.stderr)
        return "TIME_OUT", pd.DataFrame()

    soup = BeautifulSoup(page.text, "html.parser")

    # Prefer the contact section; fall back to the whole document.
    social = extract_social_links(soup.find("section", id="contact") or soup)

    wrapper = soup.find("div", class_="profile-reviews--list__wrapper")
    if not wrapper:
        print(f"‚ÑπÔ∏è Kh√¥ng th·∫•y reviews: {company_url}")
        return "NO_REVIEW", pd.DataFrame()

    articles = wrapper.find_all("article", class_="profile-review")
    print(f"üîç {company_url} ‚Üí {len(articles)} reviews")

    records = []
    for article in articles:
        project = extract_review_data(article)
        summary = article.find("div", class_="profile-review__summary mobile_hide")
        extra = article.find("div", class_="profile-review__extra mobile_hide desktop-review-to-hide hidden")

        background = ""
        if extra:
            section = extra.find("div", class_="profile-review__text with-border profile-review__extra-section")
            if section:
                background = section.get_text(strip=True)

        reviewer = extract_reviewer_info(article)

        # Key order fixes the column order of the frame built below.
        records.append(
            {
                **(project or {}),
                **(reviewer or {}),
                "Project description": summary.get_text(strip=True) if summary else None,
                "background": background,
                **(social or {}),
            }
        )

    frame = pd.DataFrame(records)
    if frame.empty:
        return "NO_REVIEW", frame

    leading = [
        "reviewer_name",
        "reviewer_role",
        "reviewer_company",
        "verified_status",
        "Industry",
        "Location",
        "Client size",
        "Review type",
        "Services",
        "Project size",
        "Project length",
        "Project description",
        "background",
    ]
    # Known columns first (when present), then everything else in frame order.
    ordered = [name for name in leading if name in frame.columns]
    ordered += [name for name in frame.columns if name not in leading]
    return "HAVE_REVIEW", frame.loc[:, ordered]


# =========================
# Main
# =========================
def parse_args() -> argparse.Namespace:
    """
    Define the crawler's command-line options and parse the process arguments.

    Options: ``--start-url`` (directory URL to start from), ``--limit``
    (maximum number of companies), ``--out`` (output CSV path),
    ``--checkpoint`` (crawl-state file), ``--workers`` (thread count),
    ``--flush-every`` (flush results after every N companies) and
    ``--last_page`` (last page to crawl).
    """
    parser = argparse.ArgumentParser(description="Clutch.co crawler")

    # (flag, type, default, help) rows, in the order shown by --help.
    option_table = [
        (
            "--start-url",
            str,
            DEFAULT_START_URL,
            f"Directory URL b·∫Øt ƒë·∫ßu (default: {DEFAULT_START_URL})",
        ),
        ("--limit", int, 20, "S·ªë company t·ªëi ƒëa ƒë·ªÉ crawl (default: 20)"),
        (
            "--out",
            str,
            "clutch_reviews.csv",
            "ƒê∆∞·ªùng d·∫´n file CSV xu·∫•t ra (default: clutch_reviews.csv)",
        ),
        (
            "--checkpoint",
            str,
            "checkpoint.json",
            "File checkpoint ƒë·ªÉ l∆∞u tr·∫°ng th√°i crawl (default: checkpoint.json)",
        ),
        ("--workers", int, 8, "S·ªë worker threads ƒë·ªÉ crawl ƒë·ªìng th·ªùi (default: 8)"),
        (
            "--flush-every",
            int,
            20,
            "Flush k·∫øt qu·∫£ ra file sau m·ªói N company (default: 20)",
        ),
        ("--last_page", int, 1, "Trang cu·ªëi c√πng crawl (default: 1)"),
    ]
    for flag, kind, fallback, description in option_table:
        parser.add_argument(flag, type=kind, default=fallback, help=description)

    return parser.parse_args()

# Reassigns the sleep range (seconds) defined near the top of this file; these
# are the values process_company reads when pausing between review pages.
SLEEP_MIN, SLEEP_MAX = 0.5, 1.5  

# ==== Helper: atomic write CSV ====
def atomic_write_csv(df: pd.DataFrame, path: str, header: bool = True) -> None:
    """
    Append ``df`` as CSV rows to ``path`` while holding an exclusive file lock.

    Despite the name, the target file is never replaced: rows are only ever
    appended. The header row is written only when the file is empty and
    ``header`` is true. Missing parent directories are created.

    Args:
        df: Rows to append. A frame without any columns is ignored, because
            serialising it would write a bare newline and make the file look
            non-empty, suppressing the header for later writes.
        path: Destination CSV file.
        header: Allow writing the header row when the file is still empty.

    NOTE(review): rows are appended without checking that ``df``'s columns
    match the header already in the file; callers must pass frames with a
    consistent column set or the CSV will be misaligned.
    """
    if df.shape[1] == 0:
        return

    target = Path(path)
    target.parent.mkdir(parents=True, exist_ok=True)

    # Append-binary mode creates the file when it does not exist yet.
    with open(target, "ab") as fout:
        # Advisory exclusive lock; released in ``finally`` even on error.
        fcntl.flock(fout, fcntl.LOCK_EX)
        try:
            # Measure the size only after the lock is held (avoids a race on
            # the "is the file empty?" decision).
            fout.seek(0, os.SEEK_END)
            write_header = header and fout.tell() == 0

            # Serialise in memory instead of via a shared "<name>.tmp" file,
            # so nothing is left behind on failure and no other writer's temp
            # file can be deleted by accident.
            payload = df.to_csv(index=False, header=write_header)
            fout.write(payload.encode("utf-8"))

            # Push the buffered bytes to the file before the lock is dropped;
            # otherwise they would only be written on close, after unlock.
            fout.flush()
        finally:
            fcntl.flock(fout, fcntl.LOCK_UN)
# ==== Checkpoint ====
def load_checkpoint(ckpt_file: str) -> Dict[str, Any]:
    """
    Load the crawl state stored as JSON in ``ckpt_file``.

    Returns the stored mapping, or a fresh state
    ``{"done_urls": [], "last_page": 1}`` when the file is missing, cannot be
    read, is not valid JSON, or does not contain a JSON object. Every case
    except a missing file is reported on stderr, since it means the crawl
    restarts from the beginning.
    """
    fresh: Dict[str, Any] = {"done_urls": [], "last_page": 1}
    try:
        state = json.loads(Path(ckpt_file).read_text(encoding="utf-8"))
    except FileNotFoundError:
        # First run: nothing to resume from.
        return fresh
    except (OSError, ValueError) as exc:
        # ValueError covers both invalid JSON and undecodable bytes.
        print(f"Ignoring unreadable checkpoint {ckpt_file}: {exc!r}", file=sys.stderr)
        return fresh

    if not isinstance(state, dict):
        # Callers use state.get(...), so anything but an object is unusable.
        print(f"Ignoring checkpoint {ckpt_file}: not a JSON object", file=sys.stderr)
        return fresh
    return state

def save_checkpoint(ckpt_file: str, state: Dict[str, Any]) -> None:
    """
    Persist ``state`` as pretty-printed UTF-8 JSON in ``ckpt_file``.

    The JSON is written to a sibling ``<name>.tmp`` file first and then moved
    over the target, so a reader never sees a partially written checkpoint.
    Missing parent directories are created.

    Raises:
        TypeError: if ``state`` contains values JSON cannot serialise.
    """
    target = Path(ckpt_file)
    target.parent.mkdir(parents=True, exist_ok=True)
    staging = target.with_suffix(target.suffix + ".tmp")
    staging.write_text(
        json.dumps(state, ensure_ascii=False, indent=2), encoding="utf-8"
    )
    staging.replace(target)

# ==== Retry wrapper for get_detail_information ====
def get_detail_information_with_retry(url: str, max_retry: int = 3, backoff_base: float = 0.8) -> tuple[str, pd.DataFrame]:
    """
    Call ``get_detail_information(url)``, retrying when it raises.

    Args:
        url: Page URL to scrape.
        max_retry: Maximum number of attempts.
        backoff_base: Wait in seconds before the second attempt; the wait
            doubles for each later attempt, plus a small random jitter.

    Returns:
        The ``(status, frame)`` pair from ``get_detail_information`` (a
        ``None`` frame is replaced by an empty one); ``("ERROR", empty frame)``
        when the final attempt raised; ``("TIME_OUT", empty frame)`` when
        ``max_retry`` is below 1 and no attempt was made.
    """
    for attempt in range(1, max_retry + 1):
        try:
            status, df = get_detail_information(url)
        except Exception as exc:
            if attempt >= max_retry:
                # Leave a trace instead of failing silently.
                print(f"get_detail_information failed for {url}: {exc!r}", file=sys.stderr)
                return "ERROR", pd.DataFrame()
            # Growing backoff: base, 2*base, 4*base, ... The previous
            # ``base ** attempt`` shrank the wait for any base below 1.
            sleep_s = backoff_base * (2 ** (attempt - 1)) + random.uniform(0.05, 0.25)
            time.sleep(sleep_s)
        else:
            return status, (pd.DataFrame() if df is None else df)
    return "TIME_OUT", pd.DataFrame()

def process_company(url: str) -> Tuple[pd.DataFrame, pd.DataFrame, str]:
    """
    Crawl up to five review pages of one company and split them train/test.

    The first request uses ``url`` itself; later ones append
    ``?page=<i>#reviews``. Crawling stops at the first page whose status is
    ``TIME_OUT``, ``ERROR`` or ``NO_REVIEW``. Only pages with more than five
    reviews are split (70/30, fixed seed); smaller pages contribute nothing.

    Returns ``(train_df, test_df, url)``; either frame may be empty.
    """
    def _merge(chunks: List[pd.DataFrame]) -> pd.DataFrame:
        # Concatenate the per-page pieces, or give back an empty frame.
        if not chunks:
            return pd.DataFrame()
        return pd.concat(chunks, ignore_index=True)

    stop_statuses = ("TIME_OUT", "ERROR", "NO_REVIEW")
    train_chunks: List[pd.DataFrame] = []
    test_chunks: List[pd.DataFrame] = []

    for page_no in range(5):
        page_url = url if page_no == 0 else f"{url}?page={page_no}#reviews"

        status, page_df = get_detail_information_with_retry(page_url)
        if status in stop_statuses:
            break

        has_enough_rows = page_df is not None and not page_df.empty and len(page_df) > 5
        if has_enough_rows:
            train_piece, test_piece = train_test_split(
                page_df, test_size=0.3, shuffle=True, random_state=42
            )
            train_chunks.append(train_piece)
            test_chunks.append(test_piece)

        # Pause between consecutive page requests.
        time.sleep(random.uniform(SLEEP_MIN, SLEEP_MAX))

    return _merge(train_chunks), _merge(test_chunks), url

def main() -> None:
    """
    Crawl directory pages in order, scrape each company's reviews with a
    thread pool, and append the train/test rows to two CSV files.

    Progress (finished company URLs and the page to resume from) is stored in
    the checkpoint file after every flush and after every directory page, so
    an interrupted run can be resumed.

    The test file sits next to ``--out`` with ``_test`` inserted before the
    extension (``reviews.csv`` -> ``reviews_test.csv``).

    NOTE(review): ``--limit`` and ``--last_page`` are parsed but not read
    here; the page loop always runs up to page 99.
    """
    args = parse_args()
    out_path = args.out
    # Derive the test path from the file name parts. A plain
    # str.replace('.csv', ...) leaves any other extension untouched, which
    # made train and test rows land in the same file.
    out_file = Path(out_path)
    test_out_path = str(out_file.with_name(f"{out_file.stem}_test{out_file.suffix}"))
    ckpt_file = getattr(args, "checkpoint", "checkpoint.json")
    workers = max(1, getattr(args, "workers", 8))
    flush_every = max(1, getattr(args, "flush_every", 20))

    state = load_checkpoint(ckpt_file)
    done_urls = set(state.get("done_urls", []))
    last_page = int(state.get("last_page", 1))

    def _flush(train_batch: List[pd.DataFrame], test_batch: List[pd.DataFrame]) -> None:
        # Append whatever has accumulated to the two CSV files, then reset.
        if train_batch:
            atomic_write_csv(pd.concat(train_batch, ignore_index=True), out_path, header=True)
            train_batch.clear()
        if test_batch:
            atomic_write_csv(pd.concat(test_batch, ignore_index=True), test_out_path, header=True)
            test_batch.clear()

    def _checkpoint(resume_page: int) -> None:
        # Record finished URLs and the directory page a resumed run starts at.
        state["done_urls"] = list(done_urls)
        state["last_page"] = resume_page
        save_checkpoint(ckpt_file, state)

    for com in range(last_page, 100):
        start_url = args.start_url
        # The first page is the bare start URL; later pages add ?page=<n>.
        if com > 1:
            start_url = f"{start_url}?page={com}"

        company_urls = get_base_url_company(start_url)
        if not company_urls:
            print("‚ö†Ô∏è Kh√¥ng c√≥ URL c√¥ng ty n√†o ƒë∆∞·ª£c t√¨m th·∫•y. K·∫øt th√∫c.", file=sys.stderr)
            # An empty listing may mean "blocked" rather than "no more pages",
            # so keep the checkpoint on this page: a resumed run retries it
            # instead of silently skipping it.
            _checkpoint(com)
            return

        company_urls = [u for u in company_urls if u not in done_urls]
        print("=========================>")
        print(f"üîé Trang {com}: c√≤n {len(company_urls)} c√¥ng ty c·∫ßn crawl (t·ªïng trang c√≥ th·ªÉ l·ªõn h∆°n).")

        batch_trains: List[pd.DataFrame] = []
        batch_tests: List[pd.DataFrame] = []

        with ThreadPoolExecutor(max_workers=workers) as ex:
            futures = [ex.submit(process_company, url) for url in company_urls]

            # Collect results per company; flush once enough have accumulated.
            pbar = tqdm(as_completed(futures), total=len(futures), desc=f"Page {com}")
            for fut in pbar:
                tr_df, te_df, url = fut.result()
                if tr_df is not None and not tr_df.empty:
                    batch_trains.append(tr_df)
                if te_df is not None and not te_df.empty:
                    batch_tests.append(te_df)
                done_urls.add(url)

                if len(batch_trains) >= flush_every or len(batch_tests) >= flush_every:
                    _flush(batch_trains, batch_tests)
                    # Mid-page checkpoint: stay on this page when resuming.
                    _checkpoint(com)

        # Write the remainder of this page, then move the checkpoint forward.
        _flush(batch_trains, batch_tests)
        _checkpoint(com + 1)

    print(f"‚úÖ Ho√†n t·∫•t. K·∫øt qu·∫£ ·ªü: {out_path} v√† {test_out_path}")
# Notebook-cell check: fetch the company profile URLs from directory page 410
# as soon as the cell runs (main() is not called in this cell).
company_urls = get_base_url_company(f"{DEFAULT_START_URL}?page=410")


In [2]:
company_urls

['https://clutch.co/profile/yantra',
 'https://clutch.co/profile/hostedpci',
 'https://clutch.co/profile/synergy-technologies',
 'https://clutch.co/profile/emagined-security',
 'https://clutch.co/profile/eo-security-sro',
 'https://clutch.co/profile/soil-institute-management',
 'https://clutch.co/profile/appsecure-security',
 'https://clutch.co/profile/salesbabu-business-solutions-0',
 'https://clutch.co/profile/ctc-technologies',
 'https://clutch.co/profile/twintel-solutions',
 'https://clutch.co/profile/relevance-management',
 'https://clutch.co/profile/acecloud-rtds',
 'https://clutch.co/profile/threadgold-consulting',
 'https://clutch.co/profile/belue-creative',
 'https://clutch.co/profile/sprocket-security',
 'https://clutch.co/profile/gac-business-solutions',
 'https://clutch.co/profile/axxis-consulting',
 'https://clutch.co/profile/konsulko-group',
 'https://clutch.co/profile/isa-cybersecurity',
 'https://clutch.co/profile/powersoft-computer-solutions',
 'https://clutch.co/profi