In [None]:
import sys
# Rebind sys.argv to a one-element list holding only the program name, so any
# later argument parsing in this session sees no extra arguments.
sys.argv = [sys.argv[0]]  # ignore Colab's extra arguments


The code cell imports the sys module, which provides access to system-specific parameters and functions. The line sys.argv = [sys.argv[0]] resets sys.argv so that it contains only the name of the script being executed. This is often done in environments like Google Colab to remove the command-line arguments that the environment adds automatically; left in place, they would be rejected as unrecognized arguments by the argparse call in the next cell.

In [None]:
#!/usr/bin/env python3
"""
scrape_livingcost_india_inr_hardening.py
- Usage:
    python scrape_livingcost_india_inr_hardening.py           # full run
    python scrape_livingcost_india_inr_hardening.py --test 10 # run only first 10 page candidates
- Requirements:
    pip install requests beautifulsoup4 pandas openpyxl
- Output:
    - livingcost_india_all_inr.csv
    - debug_logs.txt
    - optional ./html_snippets/<city_slug>.html (for debugging)
"""
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
import time, re, csv, os, argparse, sys, traceback
import pandas as pd

# Site root; index paths and scraped hrefs are resolved against it with urljoin().
BASE = "https://livingcost.org"
# Headers sent with every page request made through get().
HEADERS = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117 Safari/537.36"}
# Path of the CSV file written by main().
OUT_CSV = "livingcost_india_all_inr.csv"
# Log file: truncated at the start of main(), appended to by log().
LOGFILE = "debug_logs.txt"
# Directory where parse_city_page() stores raw page HTML when snippet saving is on.
SNIPPET_DIR = "html_snippets"

# fallback rate (used if exchange API fails)
# Hard-coded value returned by fetch_usd_to_inr() when no live rate is obtained.
FALLBACK_USD_TO_INR = 87.8638

# small helper logging
def log(msg):
    """Print *msg* with a timestamp prefix and append the same line to LOGFILE."""
    stamp = time.strftime("%Y-%m-%d %H:%M:%S")
    line = f"[{stamp}] {msg}"
    print(line)
    with open(LOGFILE, mode="a", encoding="utf-8") as handle:
        handle.write(line + "\n")

def get(url, timeout=15):
    """Fetch *url* with the shared HEADERS, retrying transient failures.

    Up to three attempts are made, pausing 1 s and then 2 s between them.
    Request errors and non-200 responses are logged.

    Args:
        url: Absolute URL to fetch.
        timeout: Per-request timeout in seconds, passed to requests.get().

    Returns:
        The response object when the server answers with HTTP 200,
        otherwise None.
    """
    max_attempts = 3
    for attempt in range(max_attempts):
        try:
            r = requests.get(url, headers=HEADERS, timeout=timeout)
        except Exception as e:
            # Best-effort fetch: any request failure is logged and retried.
            log(f"Request error for {url}: {e} (attempt {attempt+1})")
        else:
            if r.status_code == 200:
                return r
            log(f"Non-200 for {url}: {r.status_code}")
            # A client error (e.g. 404 on a guessed index path) will not change
            # on retry; only 408 (timeout) and 429 (rate limit) are worth it.
            if 400 <= r.status_code < 500 and r.status_code not in (408, 429):
                return None
        # Back off only when another attempt follows; sleeping after the
        # final attempt would just delay the caller.
        if attempt < max_attempts - 1:
            time.sleep(1 + attempt)
    return None

def fetch_usd_to_inr():
    """Return the USD->INR rate to use for conversion.

    Requests the rate from the exchangerate.host "latest" endpoint and
    returns it as a float when the response carries a positive INR value.
    On any error, or when no usable value is present, logs that the fallback
    is being used and returns FALLBACK_USD_TO_INR.
    """
    endpoint = "https://api.exchangerate.host/latest?base=USD&symbols=INR"
    live_rate = None
    try:
        resp = requests.get(endpoint, timeout=8)
        resp.raise_for_status()
        payload = resp.json()
        quoted = payload.get("rates", {}).get("INR")
        if quoted and quoted > 0:
            log(f"Fetched live USD->INR rate: {quoted}")
            live_rate = float(quoted)
    except Exception as err:
        log(f"Could not fetch live rate: {err}")

    if live_rate is not None:
        return live_rate

    log(f"Using fallback USD->INR rate: {FALLBACK_USD_TO_INR}")
    return FALLBACK_USD_TO_INR

def find_india_city_links(limit=None):
    """Collect candidate India city-page URLs from the site's index pages.

    Each candidate index path is fetched with get(); every anchor whose
    resolved URL path contains "/cost/india/" followed by a non-empty slug is
    kept. Fragments are dropped and duplicates are removed.

    Args:
        limit: Optional maximum number of links to return. Scanning of further
            index pages stops once this many links have been collected.

    Returns:
        A sorted list of absolute URLs, truncated to *limit* when given.
    """
    # Attempt multiple places that might list India city links
    candidates = [
        "/cost/india",
        "/cost/country/india",   # sometimes sites use different index paths
        "/cost/asia/india"
    ]
    marker = "/cost/india/"
    links = set()
    for p in candidates:
        idx_url = urljoin(BASE, p)
        r = get(idx_url)
        if not r:
            log(f"Index page not reachable: {idx_url}")
            continue
        soup = BeautifulSoup(r.text, "html.parser")
        for a in soup.find_all("a", href=True):
            full_url = urljoin(BASE, a["href"].strip().split("#")[0])
            # Match on the parsed path only, so a share link that merely
            # carries a city URL in its query string is not mistaken for one.
            _, found, slug = urlparse(full_url).path.partition(marker)
            # Require something after the marker; otherwise a trailing-slash
            # link to the index page itself would count as a city.
            if found and slug.strip("/"):
                links.add(full_url)
        # quick stop if we have many links
        if limit and len(links) >= limit:
            break
        time.sleep(0.2)
    links = sorted(links)
    log(f"Found {len(links)} candidate links (sample 10): {links[:10]}")
    return links[:limit] if limit else links

# robust numeric extractor: picks first $-prefixed or bare number in given piece of text
def extract_first_number(text):
    """Pull the first usable number out of *text*.

    Non-breaking spaces become regular spaces and commas are removed before
    matching. A number that follows a dollar sign takes priority, wherever it
    appears; failing that, the first bare number is used.

    Returns:
        The number as a float, or None when *text* is empty/None or holds
        no digits.
    """
    if not text:
        return None
    cleaned = text.replace("\xa0", " ").replace(",", "")
    # Ordered by priority: $-prefixed amount first, then any number at all.
    patterns = (
        r"\$\s*([0-9]+(?:\.[0-9]+)?)",
        r"([0-9]+(?:\.[0-9]+)?)",
    )
    for pattern in patterns:
        hit = re.search(pattern, cleaned)
        if hit is not None:
            return float(hit.group(1))
    return None

def _first_pattern_value(patterns, text):
    """Return the float captured by the first pattern in *patterns* that yields one, else None."""
    for p in patterns:
        m = re.search(p, text, re.I | re.S)
        if not m:
            continue
        try:
            return float(m.group(1).replace(",", ""))
        except ValueError:
            # The capture can be commas only (e.g. ","); try the next pattern.
            continue
    return None


def _number_near_label(soup, label_re):
    """Return a number taken from the parent element of a text node matching *label_re*.

    Nodes are visited in document order and the first non-zero number wins.
    If none is found, the result of the last node examined is returned
    (None or 0.0), or None when no node matched.
    """
    value = None
    for node in soup.find_all(string=label_re):
        value = extract_first_number(node.parent.get_text(" ", strip=True))
        if value:
            break
    return value


def parse_city_page(url, save_snippet=False):
    """Fetch one city page and extract its cost-of-living figures.

    Values are first searched for with regular expressions over the page's
    flattened text; any field still missing is then looked up next to its
    label in the DOM.

    Args:
        url: Absolute URL of the city page.
        save_snippet: When true, the raw HTML is also written to
            SNIPPET_DIR/<slug>.html. Failure to save is logged, not raised.

    Returns:
        A dict with keys "city", "cost_one_person_usd", "rent_one_person_usd",
        "monthly_salary_after_tax_usd", "months_covered" and "source_url"
        (missing numbers are None), or None when the page could not be
        fetched or none of the numeric fields was found.
    """
    r = get(url)
    if not r:
        log(f"Failed to fetch page {url}")
        return None
    html = r.text
    if save_snippet:
        slug = urlparse(url).path.strip("/").replace("/", "_")
        try:
            # Directory creation is part of the best-effort save, so a
            # read-only working directory does not abort the parse.
            os.makedirs(SNIPPET_DIR, exist_ok=True)
            with open(os.path.join(SNIPPET_DIR, f"{slug}.html"), "w", encoding="utf-8") as f:
                f.write(html)
        except (OSError, UnicodeError) as e:
            log(f"Could not save snippet for {url}: {e}")
    soup = BeautifulSoup(html, "html.parser")
    # city name: prefer the page heading, else the last URL path segment
    # (trailing slash stripped so the fallback is never an empty string)
    h1 = soup.find("h1")
    city = h1.get_text(strip=True) if h1 else url.rstrip("/").split("/")[-1]

    text = soup.get_text(separator=" ", strip=True)
    # heuristics: look for phrases that site uses
    cost_patterns = [
        r"Cost of living(?: for)?(?: one person| - one person| \(one person\))?:?\s*\$?([0-9,]+(?:\.[0-9]+)?)",
        r"Cost of living(?:.*?)(?:\$\s*([0-9,]+(?:\.[0-9]+)?))"
    ]
    # Patterns are matched case-insensitively, so the bare "Rent" alternative is
    # anchored at a word start to avoid hits inside "current", "different", etc.
    rent_patterns = [
        r"Rent & Utilities(?:.*?)(?:\$\s*([0-9,]+(?:\.[0-9]+)?))",
        r"\bRent(?:.*?)(?:\$\s*([0-9,]+(?:\.[0-9]+)?))"
    ]
    salary_patterns = [
        r"Monthly salary after tax(?:.*?)(?:\$\s*([0-9,]+(?:\.[0-9]+)?))",
        r"Monthly salary(?:.*?)(?:\$\s*([0-9,]+(?:\.[0-9]+)?))"
    ]

    cost = _first_pattern_value(cost_patterns, text)
    rent = _first_pattern_value(rent_patterns, text)
    salary = _first_pattern_value(salary_patterns, text)

    # fallback: look near labels in the DOM
    if cost is None:
        cost = _number_near_label(soup, re.compile(r"cost of living", re.I))
    if rent is None:
        rent = _number_near_label(soup, re.compile(r"\brent", re.I))
    if salary is None:
        salary = _number_near_label(soup, re.compile(r"monthly salary", re.I))

    # months covered sentence
    months = None
    mmon = re.search(r"enough to cover.*?([0-9]*\.?[0-9]+)\s*months?", text, re.I)
    if mmon:
        months = float(mmon.group(1))

    # If none of primary numbers found, bail out (no data)
    if not any([cost, rent, salary, months]):
        log(f"No numeric fields found for {url} ({city}); skipping.")
        return None

    return {
        "city": city,
        "cost_one_person_usd": cost,
        "rent_one_person_usd": rent,
        "monthly_salary_after_tax_usd": salary,
        "months_covered": months,
        "source_url": url
    }

def main(limit=None, test_mode=False, save_snippets=False):
    """Run the scrape end to end and write the results to OUT_CSV.

    Steps: reset the log file, discover city links, parse each page (pausing
    0.8 s between pages), obtain a USD->INR rate, then write one CSV row per
    parsed city with both USD and INR columns.

    Args:
        limit: Optional cap on the number of city links to process.
        test_mode: When true and no *limit* is given, only 10 links are used.
        save_snippets: Passed to parse_city_page() to store raw page HTML.

    Returns:
        None. Returns early, after logging, when no links or no rows are found.
    """
    # Truncate the log so each run starts with an empty file.
    open(LOGFILE, "w").close()
    log("START SCRAPE run")
    links = find_india_city_links(limit=limit or (10 if test_mode else None))
    if not links:
        log("No city links found — possible reasons: site structure changed or index page different. See debug_logs.txt.")
        return

    rows = []
    for i, url in enumerate(links, 1):
        log(f"[{i}/{len(links)}] Processing: {url}")
        try:
            parsed = parse_city_page(url, save_snippet=save_snippets)
        except Exception as e:
            # One bad page must not abort the run; record the traceback in the
            # log file as well, not only on the console.
            log(f"Exception parsing {url}: {e}")
            log(traceback.format_exc())
        else:
            if parsed:
                rent = parsed.get("rent_one_person_usd")
                salary = parsed.get("monthly_salary_after_tax_usd")
                # Income after rent is only meaningful when both figures were
                # found; a missing rent must not be treated as zero rent.
                if salary is None or rent is None:
                    parsed["income_after_rent_usd"] = None
                else:
                    parsed["income_after_rent_usd"] = salary - rent
                rows.append(parsed)
                log(f"Parsed {parsed['city']}: cost={parsed.get('cost_one_person_usd')}, rent={rent}, salary={salary}, months={parsed.get('months_covered')}")
            else:
                log(f"Skipped (no relevant numeric data): {url}")
        time.sleep(0.8)  # politeness

    if not rows:
        log("No rows parsed. Exiting. Check debug_logs.txt and consider running with --test to get snippets.")
        return

    rate = fetch_usd_to_inr()

    def to_inr(usd):
        # Zero and None both mean "no value" here, matching the USD columns.
        return round(usd * rate, 2) if usd else None

    # Build final rows and write CSV
    fieldnames = ["city","cost_one_person_usd","rent_one_person_usd","monthly_salary_after_tax_usd","income_after_rent_usd","months_covered","cost_one_person_inr","rent_one_person_inr","monthly_salary_after_tax_inr","income_after_rent_inr","usd_to_inr_rate_used","source_url"]
    with open(OUT_CSV, "w", newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        writer.writeheader()
        for r in rows:
            cost = r.get("cost_one_person_usd")
            # A parsed value of 0 is written as an empty cell, like a missing one.
            rent = r.get("rent_one_person_usd") or None
            salary = r.get("monthly_salary_after_tax_usd") or None
            inc = r.get("income_after_rent_usd")
            writer.writerow({
                "city": r.get("city"),
                "cost_one_person_usd": cost,
                "rent_one_person_usd": rent,
                "monthly_salary_after_tax_usd": salary,
                "income_after_rent_usd": inc,
                "months_covered": r.get("months_covered"),
                "cost_one_person_inr": to_inr(cost),
                "rent_one_person_inr": to_inr(rent),
                "monthly_salary_after_tax_inr": to_inr(salary),
                # Income may legitimately be zero or negative, so test for None.
                "income_after_rent_inr": round(inc * rate, 2) if inc is not None else None,
                "usd_to_inr_rate_used": rate,
                "source_url": r.get("source_url")
            })
    log(f"Wrote {OUT_CSV} with {len(rows)} rows. Done.")

if __name__ == "__main__":
    # Command-line entry point: parse the options and run the scrape.
    ap = argparse.ArgumentParser()
    ap.add_argument("--limit", type=int, default=None, help="Limit number of candidate links to scan (for testing)")
    # nargs="?" accepts both the bare flag (const=10) and an explicit count,
    # so the `--test 10` form shown in the module docstring parses too.
    ap.add_argument("--test", nargs="?", type=int, const=10, default=None, metavar="N",
                    help="Test mode: only check a small set of candidates (N, default 10)")
    ap.add_argument("--save-snippets", action="store_true", help="Save HTML snippets for debugging")
    args = ap.parse_args()
    # An explicit --limit wins; otherwise the count given to --test is the cap.
    run_limit = args.limit if args.limit is not None else args.test
    main(limit=run_limit, test_mode=args.test is not None, save_snippets=args.save_snippets)


The internet = a giant library. Each webpage = a book.

The script = a helpful robot librarian that visits pages and copies specific facts into a spreadsheet.

How the robot works, in 6 tiny steps:

Gets its tools — imports requests (to fetch pages), BeautifulSoup (to read messy HTML), and csv (to write the spreadsheet); pandas is imported as well, although the script never actually uses it. It also sets some settings (like the site address and polite headers).

Knocks and retries — when it asks for a page, it tries a few times if the page doesn’t respond, and logs what happened.

Finds city pages — looks at index/catalog pages and grabs all links that look like /cost/india/<city>.

Reads each city page — opens the page, uses BeautifulSoup to turn messy HTML into something readable, finds the city name and the numbers for cost/rent/salary by searching for clues and nearby numbers.

Converts money — asks a currency service for the USD→INR rate (or falls back to a fixed rate hard-coded in the script if the service can't be reached) and converts dollar amounts to rupees.

Saves everything — calculates “income after rent,” collects all city records, and writes them into a CSV spreadsheet.