In [4]:
import re
import time
import csv
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

ROOT_URL = "https://www.latimes.com"
MAX_PAGES = 20000 
CRAWL_DELAY = 1  

fetch_data = []
visit_data = []
url_data = []
visited_urls = set()

filters = re.compile(r'.*\.(css|js|json|xml|zip|gz|mp3|mp4|ico|png|jpg|jpeg|svg)$')

def fetch_page(url, retries=3):
    for i in range(retries):
        try:
            res = requests.get(url, timeout=5, headers={'User-Agent': 'Mozilla/5.0'})
            fetch_data.append([url, res.status_code])
            return res
        except requests.RequestException:
            time.sleep(1)  
    fetch_data.append([url, 'FAILED'])
    return None

def parse_page(response):
    content_type = response.headers.get('Content-Type', '').split(';')[0]
    size = len(response.content)
    out_links = set()

    if 'text/html' in content_type:
        soup = BeautifulSoup(response.text, 'html.parser')
        for link in soup.find_all('a', href=True):
            absolute_link = urljoin(ROOT_URL, link['href'])
            if absolute_link.startswith(ROOT_URL) and not filters.match(absolute_link):
                if absolute_link not in visited_urls:
                    out_links.add(absolute_link)
                url_data.append([absolute_link, 'OK' if absolute_link.startswith(ROOT_URL) else 'N_OK'])

    visit_data.append([response.url, size, len(out_links), content_type])
    return out_links

def crawl(url, max_pages):
    pages_visited = 0
    to_visit = [url]

    while to_visit and pages_visited < max_pages:
        current_url = to_visit.pop(0)
        if current_url in visited_urls:
            continue

        response = fetch_page(current_url)
        if response and response.status_code == 200:
            new_links = parse_page(response)
            to_visit.extend(new_links)

        visited_urls.add(current_url)
        pages_visited += 1

        if pages_visited % 1000 == 0:
            save_progress()

        time.sleep(CRAWL_DELAY) 

def save():
    write_csv('fetch_LATimes.csv', fetch_data, ['URL', 'Status'])
    write_csv('visit_LATimes.csv', visit_data, ['URL', 'Size(Bytes)', 'OutLinks', 'ContentType'])
    write_csv('urls_LATimes.csv', url_data, ['URL', 'Status'])

def write_csv(file_name, data, headers):
    with open(file_name, 'w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        writer.writerow(headers)
        writer.writerows(data)

crawl(ROOT_URL, MAX_PAGES)
save()

In [13]:
import pandas as pd

urls_df = pd.read_csv('urls_LATimes.csv')
unique_within = urls_df[urls_df['Status'].str.strip().str.upper() == 'OK']['URL'].nunique()
unique_outside = urls_df[urls_df['Status'].str.strip().str.upper() == 'N_OK']['URL'].nunique()
total_unique_urls = urls_df['URL'].nunique()
total_urls_extracted = len(urls_df)

print("\nOutgoing URLs:")
print("================")
print(f"Total URLs extracted: {total_urls_extracted}")
print(f"Total unique URLs extracted: {total_unique_urls}")
print(f"Unique URLs within News Site: {unique_within}")
print(f"Unique URLs outside News Site: {unique_outside}")

fetch_df = pd.read_csv('fetch_LATimes.csv')
total_fetches_attempted = len(fetch_df)
fetches_succeeded = len(fetch_df[fetch_df['Status'] == '200'])
fetches_failed_or_aborted = total_fetches_attempted - fetches_succeeded

print("\nFetch Statistics:")
print("================")
print(f"Fetches attempted: {total_fetches_attempted}")
print(f"Fetches succeeded: {fetches_succeeded}")
print(f"Fetches failed or aborted: {fetches_failed_or_aborted}")

status_counts = fetch_df['Status'].value_counts()
print("\nStatus Codes:")
print("=============")
status_messages = {
    '200': 'OK',
    '403': 'Forbidden',
    '404': 'Not Found',
    '500': 'Internal Server Error',
    '405': 'Method Not Allowed'
}

for status, count in status_counts.items():
    message = status_messages.get(status, 'Unknown Status')
    print(f"{status} {message}: {count}")

visit_df = pd.read_csv('visit_LATimes.csv')
file_size_buckets = {
    '< 1KB': len(visit_df[visit_df['Size(Bytes)'] < 1024]),
    '1KB ~ <10KB': len(visit_df[(visit_df['Size(Bytes)'] >= 1024) & (visit_df['Size(Bytes)'] < 10240)]),
    '10KB ~ <100KB': len(visit_df[(visit_df['Size(Bytes)'] >= 10240) & (visit_df['Size(Bytes)'] < 102400)]),
    '100KB ~ <1MB': len(visit_df[(visit_df['Size(Bytes)'] >= 102400) & (visit_df['Size(Bytes)'] < 1048576)]),
    '>= 1MB': len(visit_df[visit_df['Size(Bytes)'] >= 1048576])
}

print("\nFile Sizes:")
print("=============")
for size_range, count in file_size_buckets.items():
    print(f"{size_range}: {count}")

content_type_stats = visit_df['ContentType'].value_counts()
print("\nContent Types:")
print("=============")
print(content_type_stats)


Outgoing URLs:
Total URLs extracted: 3548799
Total unique URLs extracted: 164700
Unique URLs within News Site: 164700
Unique URLs outside News Site: 0

Fetch Statistics:
Fetches attempted: 20000
Fetches succeeded: 18841
Fetches failed or aborted: 1159

Status Codes:
200 OK: 18841
404 Not Found: 1143
FAILED Unknown Status: 8
429 Unknown Status: 4
403 Forbidden: 2
999 Unknown Status: 2

File Sizes:
< 1KB: 1
1KB ~ <10KB: 5
10KB ~ <100KB: 160
100KB ~ <1MB: 18240
>= 1MB: 435

Content Types:
text/html    18840
Name: ContentType, dtype: int64
