# Test Configuration Settings from config.py
This notebook is used to test and validate the configuration settings and URL templates defined in `config.py` before integrating them into the main project files.

In [None]:
# Import configuration settings from config.py
from config import CATEGORIES, CATEGORY_URL_TEMPLATE, CATEGORY_PAGINATION_URL_TEMPLATE, MAX_RETRIES, REQUEST_TIMEOUT

# Echo every imported setting as "NAME: value" for a quick visual check.
# Note: USER_AGENT is not imported above, so it is not displayed here.
for setting_name, setting_value in (
    ("CATEGORIES", CATEGORIES),
    ("CATEGORY_URL_TEMPLATE", CATEGORY_URL_TEMPLATE),
    ("CATEGORY_PAGINATION_URL_TEMPLATE", CATEGORY_PAGINATION_URL_TEMPLATE),
    ("MAX_RETRIES", MAX_RETRIES),
    ("REQUEST_TIMEOUT", REQUEST_TIMEOUT),
):
    print(f"{setting_name}:", setting_value)

CATEGORIES: ['aar', 'all-judgements', 'high-court', 'others', 'supreme-court', 'tribunal']
CATEGORY_URL_TEMPLATE: https://itatonline.org/archives/category/{category}/?judges&section&counsel&court&catchwords&genre
CATEGORY_PAGINATION_URL_TEMPLATE: https://itatonline.org/archives/category/{category}/page/{page}/?judges&section&counsel&court&catchwords&genre
MAX_RETRIES: 3
REQUEST_TIMEOUT: 10


: 

## Test Category URL Templates
Test the URL templates by formatting them with sample category and page values to ensure they generate the correct URLs.

In [3]:
# Test CATEGORY_URL_TEMPLATE and CATEGORY_PAGINATION_URL_TEMPLATE by filling
# them in for one sample category and one sample page number.
sample_category = CATEGORIES[0]  # e.g., 'aar'
sample_page = 3

first_page_url = CATEGORY_URL_TEMPLATE.format(category=sample_category)
paginated_url = CATEGORY_PAGINATION_URL_TEMPLATE.format(category=sample_category, page=sample_page)

# str.format() silently ignores unused keyword arguments, so a template that
# lost its {category} or {page} placeholder would still format without error.
# Check that the substituted values really ended up in the generated URLs.
assert sample_category in first_page_url, "CATEGORY_URL_TEMPLATE did not use {category}"
assert sample_category in paginated_url, "CATEGORY_PAGINATION_URL_TEMPLATE did not use {category}"
assert str(sample_page) in paginated_url, "CATEGORY_PAGINATION_URL_TEMPLATE did not use {page}"

print("First page URL:", first_page_url)
# The label is derived from sample_page so it cannot drift from the URL that was built.
print(f"Paginated page URL (page {sample_page}):", paginated_url)

First page URL: https://itatonline.org/archives/category/aar/?judges&section&counsel&court&catchwords&genre
Paginated page URL (page 3): https://itatonline.org/archives/category/aar/page/3/?judges&section&counsel&court&catchwords&genre


## Test Retry and Timeout Settings
Verify that MAX_RETRIES and REQUEST_TIMEOUT are positive integers so that they can be used in HTTP request logic.

In [4]:
# Sanity-check the retry count and the request timeout: both must be ints greater than zero.
retries_valid = isinstance(MAX_RETRIES, int) and MAX_RETRIES > 0
timeout_valid = isinstance(REQUEST_TIMEOUT, int) and REQUEST_TIMEOUT > 0

assert retries_valid, "MAX_RETRIES should be a positive integer"
assert timeout_valid, "REQUEST_TIMEOUT should be a positive integer"

# Report the validated values.
print("MAX_RETRIES is set to: {}".format(MAX_RETRIES))
print("REQUEST_TIMEOUT is set to: {} seconds".format(REQUEST_TIMEOUT))

MAX_RETRIES is set to: 3
REQUEST_TIMEOUT is set to: 10 seconds


## Test User-Agent String
Generate a random User-Agent string with `fake-useragent` (no `USER_AGENT` constant is imported from `config.py`) and test its usage in a sample HTTP request header.

In [5]:
# Test auto-generated User-Agent string using fake-useragent and usage in a sample HTTP request header
from fake_useragent import UserAgent
import requests

# Generate a random User-Agent string
ua = UserAgent()
user_agent = ua.random
print("Randomly generated USER_AGENT:", user_agent)

# Prepare a sample request header
headers = {"User-Agent": user_agent}
print("Sample request headers:", headers)

# (Optional) Test a real HTTP request to httpbin.org/user-agent and print the JSON it returns
try:
    response = requests.get("https://httpbin.org/user-agent", headers=headers, timeout=REQUEST_TIMEOUT)
    # Report HTTP error statuses as such; without this an error page would
    # only show up as a confusing JSON decoding failure on the next line.
    response.raise_for_status()
    print("Response from httpbin.org/user-agent:", response.json())
except (requests.RequestException, ValueError) as e:
    # RequestException covers connection, timeout and HTTP status errors;
    # ValueError covers a response body that is not valid JSON.
    print("HTTP request failed:", e)

Randomly generated USER_AGENT: Mozilla/5.0 (iPhone; CPU iPhone OS 18_3_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/18.3.1 Mobile/15E148 Safari/604.1
Sample request headers: {'User-Agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 18_3_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/18.3.1 Mobile/15E148 Safari/604.1'}
Response from httpbin.org/user-agent: {'user-agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 18_3_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/18.3.1 Mobile/15E148 Safari/604.1'}
Response from httpbin.org/user-agent: {'user-agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 18_3_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/18.3.1 Mobile/15E148 Safari/604.1'}


## Test Fetching and Saving a Category Page
Fetch a sample category page using requests and save the raw HTML response to a file for later processing.

In [6]:
import os
from fake_useragent import UserAgent
import requests

# Pick the first configured category and build its first-page URL
sample_category = CATEGORIES[0]  # e.g., 'aar'
category_url = CATEGORY_URL_TEMPLATE.format(category=sample_category)

# The request is sent with a randomly chosen User-Agent header
ua = UserAgent()
headers = {"User-Agent": ua.random}

# Name the output locations once instead of rebuilding them at each use
page_dir = f"{sample_category}/page_1"
saved_html_path = f"{sample_category}/page_1/category_response.html"

# Download the page, then write the raw HTML to disk; any failure is reported, not raised
try:
    response = requests.get(category_url, headers=headers, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()
    html_content = response.text
    print(f"Fetched {category_url} (length: {len(html_content)})")
    os.makedirs(page_dir, exist_ok=True)
    with open(saved_html_path, "w", encoding="utf-8") as html_file:
        html_file.write(html_content)
    print(f"Saved HTML to {saved_html_path}")
except Exception as err:
    print("Error fetching or saving category page:", err)

Fetched https://itatonline.org/archives/category/aar/?judges&section&counsel&court&catchwords&genre (length: 314964)
Saved HTML to aar/page_1/category_response.html


## Test Generating Unique ID and Saving Post HTML
Generate a unique ID for a post and save a sample HTML response using that ID as the filename.

In [7]:
import os
import uuid

# Generate a unique ID for a post
unique_id = uuid.uuid4().hex
print("Generated unique ID:", unique_id)

# Save a sample HTML response using the unique ID as the filename
sample_html = "<html><body><h1>Sample Post</h1><p>This is a test post.</p></body></html>"
post_path = f"{sample_category}/page_1/{unique_id}.html"
# Create the target directory here rather than relying on the category-fetch
# cell having created it: if that fetch failed, the directory does not exist
# and open() would raise FileNotFoundError.
os.makedirs(os.path.dirname(post_path), exist_ok=True)
with open(post_path, "w", encoding="utf-8") as f:
    f.write(sample_html)
print(f"Saved sample post HTML to {post_path}")

Generated unique ID: 46c02839e82c46b89403dc4080d8d769
Saved sample post HTML to aar/page_1/46c02839e82c46b89403dc4080d8d769.html


## Test Updating the Ledger CSV
Update the ledger CSV file with the unique ID, file path, and post URL for tracking.

In [8]:
import csv

ledger_path = "ledger.csv"
post_url = f"https://itatonline.org/archives/category/{sample_category}/post/{unique_id}/"  # Example post URL

# One ledger row per saved post: unique ID, local file path, post URL
ledger_row = [unique_id, post_path, post_url]

# Open in append mode so rows written earlier are kept
with open(ledger_path, "a", newline='', encoding="utf-8") as ledger_file:
    csv.writer(ledger_file).writerow(ledger_row)
print(f"Updated ledger.csv with: {unique_id}, {post_path}, {post_url}")

Updated ledger.csv with: 46c02839e82c46b89403dc4080d8d769, aar/page_1/46c02839e82c46b89403dc4080d8d769.html, https://itatonline.org/archives/category/aar/post/46c02839e82c46b89403dc4080d8d769/


## Test Extracting and Fetching 'Read More' Links from Page 1
Parse the saved category_response.html for page 1, extract all 'read more' links, then fetch each link and save its HTML under a newly generated unique ID.

In [9]:
from bs4 import BeautifulSoup
import requests
from fake_useragent import UserAgent
import os
import uuid  # needed for the post IDs below; imported here so this cell does not rely on an earlier cell's import

# Path to the saved category_response.html
category_html_path = f"{sample_category}/page_1/category_response.html"

# Read the HTML content
with open(category_html_path, "r", encoding="utf-8") as f:
    html_content = f.read()

# Parse with BeautifulSoup
soup = BeautifulSoup(html_content, "html.parser")

# Find all 'read more' links (adjust selector as needed).
# Anchors whose text matches but which have no (or an empty) href are skipped;
# indexing a['href'] on such an anchor would raise KeyError and abort the cell.
read_more_links = [
    a['href']
    for a in soup.find_all('a', string=lambda s: s and 'read more' in s.lower())
    if a.get('href')
]
print(f"Found {len(read_more_links)} 'read more' links:", read_more_links)

# Fetch and save each 'read more' response
ua = UserAgent()
for link in read_more_links:
    try:
        headers = {"User-Agent": ua.random}
        response = requests.get(link, headers=headers, timeout=REQUEST_TIMEOUT)
        response.raise_for_status()
        post_html = response.text
        # Generate a unique ID for each post
        post_id = uuid.uuid4().hex
        # Use a dedicated name instead of reusing `post_path`: the ledger cell
        # pairs `post_path` with `unique_id`, and overwriting it here would make
        # a re-run of that cell record a path that does not match the ID.
        readmore_post_path = f"{sample_category}/page_1/{post_id}.html"
        with open(readmore_post_path, "w", encoding="utf-8") as f:
            f.write(post_html)
        print(f"Saved 'read more' HTML to {readmore_post_path}")
        # Optionally, update ledger here as well
    except Exception as e:
        # Best effort: report the failing link and continue with the next one.
        print(f"Error fetching 'read more' link {link}: {e}")

Found 10 'read more' links: ['https://itatonline.org/archives/cit-vs-vodafone-essar-gujarat-ltd-gujarat-high-court-full-bench/', 'https://itatonline.org/archives/dow-agrosciences-agricultural-products-ltd-in-re-aar-transfer-of-shares-of-an-indian-co-by-a-mauritius-entity-to-a-singapore-entity-due-to-group-reorganization-is-not-a-scheme-for-avoidance-of-tax-th/', 'https://itatonline.org/archives/in-re-cummins-limited-aar-managerial-services-rendered-by-a-uk-co-to-an-indian-co-even-if-technical-in-nature-is-not-assessable-as-fees-for-technical-services-under-article-13-of-i/', 'https://itatonline.org/archives/in-re-aberdeen-claims-administration-inc-aar-amount-received-by-a-fii-under-a-settlement-for-giving-up-right-to-sue-is-not-assessable-as-either-capital-gains-or-as-business-profits-in-principle-a-fi/', 'https://itatonline.org/archives/in-re-tiong-woon-project-contracting-pte-limited-aar-an-installation-project-which-does-not-last-more-than-183-days-in-a-fiscal-year-is-not-a-permanen

## Modular Scraping Functions
Define and test modular functions for fetching, saving, and tracking category and post data.

In [10]:
import os
import uuid
import csv
import requests
from fake_useragent import UserAgent
from bs4 import BeautifulSoup

def get_headers():
    """Build request headers containing a randomly selected User-Agent.

    Returns:
        dict: A single-entry mapping with the key "User-Agent".
    """
    random_agent = UserAgent().random
    return {"User-Agent": random_agent}

def fetch_url(url, timeout):
    """Download a URL and hand back its body as text.

    Args:
        url: Address to request.
        timeout: Timeout value passed through to requests.get.

    Returns:
        The response text, or None if anything went wrong (the error is
        printed rather than raised).
    """
    body = None
    try:
        reply = requests.get(url, headers=get_headers(), timeout=timeout)
        # An HTTP error status is treated like any other failure.
        reply.raise_for_status()
        body = reply.text
    except Exception as err:
        print(f"Error fetching {url}: {err}")
        body = None
    return body

def save_html(content, path):
    """Save HTML content to a file, creating parent directories as needed.

    Args:
        content: Text to write; it is written with UTF-8 encoding.
        path: Destination file path. It may be a bare filename with no
            directory part, in which case no directory is created.

    Raises:
        OSError: If a directory cannot be created or the file cannot be written.
    """
    parent_dir = os.path.dirname(path)
    # dirname() is "" for a bare filename, and os.makedirs("") raises
    # FileNotFoundError, so only create directories when there is one to create.
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)
    with open(path, "w", encoding="utf-8") as f:
        f.write(content)
    print(f"Saved HTML to {path}")

def generate_unique_id():
    """Return the hex form of a freshly generated random (version 4) UUID."""
    fresh_uuid = uuid.uuid4()
    return fresh_uuid.hex

def update_ledger(ledger_path, unique_id, file_path, url):
    """Add one row to the ledger CSV and print what was recorded.

    Args:
        ledger_path: CSV file to append to.
        unique_id: First column of the row.
        file_path: Second column of the row.
        url: Third column of the row.
    """
    record = [unique_id, file_path, url]
    # Append mode keeps every row written by earlier calls.
    with open(ledger_path, "a", newline='', encoding="utf-8") as ledger_file:
        csv.writer(ledger_file).writerow(record)
    print(f"Updated ledger: {unique_id}, {file_path}, {url}")

def extract_readmore_links(category_html):
    """Extract all 'read more' links from a category page HTML.

    An anchor matches when its text contains "read more", compared
    case-insensitively.

    Args:
        category_html: HTML source of a category page.

    Returns:
        list: The href values of the matching anchors, in document order.
        Matching anchors with a missing or empty href are skipped.
    """
    soup = BeautifulSoup(category_html, "html.parser")
    matching_anchors = soup.find_all('a', string=lambda s: s and 'read more' in s.lower())
    # Use .get() rather than a['href']: an anchor without an href attribute
    # would otherwise raise KeyError and discard every link already found.
    return [a['href'] for a in matching_anchors if a.get('href')]

## Test Each Modular Function
Test the modular scraping functions one by one to ensure they work as expected.

In [None]:
# Exercise each modular function once. Deterministic results are asserted so
# that a regression fails the cell instead of only changing the printed output.

# Test get_headers()
print("Headers:", get_headers())

# Test generate_unique_id()
uid = generate_unique_id()
print("Unique ID:", uid)
assert len(uid) == 32, "generate_unique_id() should return a 32-character hex string"

# Test save_html()
test_html = "<html><body><h2>Test Save</h2></body></html>"
test_path = "test_dir/test_file.html"
save_html(test_html, test_path)
with open(test_path, encoding="utf-8") as saved_file:
    assert saved_file.read() == test_html, "save_html() did not write the expected content"

# Test fetch_url() (using a simple, fast URL).
# The configured REQUEST_TIMEOUT is used instead of a hard-coded number so this
# check exercises the same setting as the rest of the notebook.
test_url = "https://httpbin.org/html"
html = fetch_url(test_url, REQUEST_TIMEOUT)
print("Fetched HTML length:", len(html) if html else None)

# Test extract_readmore_links() (using a sample HTML)
sample_html = '''<html><body><a href="https://example.com/1">Read More</a><a href="https://example.com/2">read more</a></body></html>'''
links = extract_readmore_links(sample_html)
print("Extracted 'read more' links:", links)
assert links == ["https://example.com/1", "https://example.com/2"], "unexpected 'read more' links"

# Test update_ledger()
update_ledger("test_ledger.csv", uid, test_path, test_url)