# HW0 — GaN CSV URL collection
Author: Soujanya Choppala  
Course: MCIS6273 (SAU Fall 2025)

This notebook scrapes the *Globe at Night* maps-data page for CSV links that contain "GaN" and a year, then writes full URLs to `data/gan_urls.txt`.


In [1]:
import sys, subprocess, pkgutil

def pip_install(pkg):
    """Install ``pkg`` with pip unless it is already present.

    ``pkg`` is a PyPI distribution name (for example ``"beautifulsoup4"``),
    which is not necessarily the importable module name (``bs4``). Presence
    is therefore checked against installed distribution metadata first;
    comparing only against top-level module names never matches such packages
    and would rerun ``pip install`` on every execution.

    Args:
        pkg: Distribution name to pass to ``pip install``.

    Raises:
        subprocess.CalledProcessError: If the ``pip install`` command exits
            with a non-zero status.
    """
    # Function-scope imports keep this helper self-contained in its cell.
    import importlib
    from importlib import metadata

    try:
        metadata.distribution(pkg)
        already_installed = True
    except metadata.PackageNotFoundError:
        # Fall back to the top-level module-name check so a caller passing a
        # module name (rather than a distribution name) is still recognised.
        already_installed = any(m.name == pkg for m in pkgutil.iter_modules())

    if already_installed:
        return

    # Use the kernel's own interpreter so the package lands in its environment.
    subprocess.check_call([sys.executable, "-m", "pip", "install", "--quiet", pkg])
    # Let the import system notice the freshly installed package.
    importlib.invalidate_caches()

# Make sure the scraper's third-party dependencies are available, then
# print a marker so the cell output shows that setup finished.
required_packages = ("requests", "beautifulsoup4")
for package_name in required_packages:
    pip_install(package_name)

print("ready")


ready


In [2]:
import re
from pathlib import Path
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

# Output locations. REPO_ROOT is an absolute path to the course repository
# checkout; edit it if your folder lives elsewhere or has a different name.
REPO_ROOT = Path("/home/jovyan/mcis6273-f25-datamining")
DATA_DIR = REPO_ROOT.joinpath("data")
# Create data/ on first run; does not raise if the directory already exists.
DATA_DIR.mkdir(exist_ok=True)
OUTFILE = DATA_DIR.joinpath("gan_urls.txt")

# Show the configured paths as the cell output for a quick sanity check.
(REPO_ROOT, DATA_DIR, OUTFILE)


(PosixPath('/home/jovyan/mcis6273-f25-datamining'),
 PosixPath('/home/jovyan/mcis6273-f25-datamining/data'),
 PosixPath('/home/jovyan/mcis6273-f25-datamining/data/gan_urls.txt'))

In [3]:
# Step 1: fetch the web page and create soup
URL = "https://globeatnight.org/maps-data/"
# Bounded wait so a stalled server cannot hang the kernel indefinitely.
resp = requests.get(URL, timeout=30)
# Stop here on an HTTP error status instead of parsing an error page.
resp.raise_for_status()

# Hand BeautifulSoup the raw response bytes and parse with the stdlib parser.
soup = BeautifulSoup(resp.content, "html.parser")

# The <title> element may contain nested markup and runs of whitespace
# (newlines/indentation). get_text() gathers all of its text and the
# split/join collapses the whitespace into a single readable line;
# `.string` would print the raw layout or None for nested markup.
page_title = " ".join(soup.title.get_text().split()) if soup.title else ""
print("Page title:", page_title or "(none)")

# Step 2: find the CSV links using a broader regex
# Match /documents/... with "GaN" (case-insensitive) and a year (20xx) in the name
pat = re.compile(r"/documents/.*gan.*20\d{2}.*\.csv", re.IGNORECASE)

# Resolve every matching href against the page URL. urljoin expands
# site-relative links ("/documents/...") to full URLs and leaves hrefs that
# are already absolute unchanged; blindly prefixing the host would turn an
# absolute href into "https://globeatnight.orghttps://...".
candidates = [
    urljoin(URL, href)
    for href in (a["href"].strip() for a in soup.select("a[href]"))
    if pat.search(href)
]

# Deduplicate while preserving order (dict keys keep first-seen order)
urls = list(dict.fromkeys(candidates))

print("Found", len(urls), "CSV URLs")
# Preview the first few results as the cell output.
urls[:10]


Page title: 
    
      Maps & Data
    
    
      
      - Globe At Night
    
  
Found 19 CSV URLs


['https://globeatnight.org/documents/926/GaN2024.csv',
 'https://globeatnight.org/documents/661/GaN2023.csv',
 'https://globeatnight.org/documents/662/GaN2022.csv',
 'https://globeatnight.org/documents/663/GaN2021.csv',
 'https://globeatnight.org/documents/679/GaN2020.csv',
 'https://globeatnight.org/documents/665/GaN2019.csv',
 'https://globeatnight.org/documents/666/GaN2018.csv',
 'https://globeatnight.org/documents/667/GaN2017.csv',
 'https://globeatnight.org/documents/668/GaN2016.csv',
 'https://globeatnight.org/documents/669/GaN2015.csv']

In [4]:
# Persist the URLs one per line. A trailing newline is appended only when
# there is at least one URL, so an empty result yields an empty file.
file_body = "\n".join(urls)
if urls:
    file_body += "\n"
OUTFILE.write_text(file_body)
print(f"Wrote {len(urls)} URLs to {OUTFILE.relative_to(REPO_ROOT)}")


Wrote 19 URLs to data/gan_urls.txt


In [5]:
# Read the file back and confirm every line is a full globeatnight.org URL
# before reporting the line count and a short sample.
lines = OUTFILE.read_text().splitlines()
non_absolute = [entry for entry in lines if not entry.startswith("https://globeatnight.org/")]
assert not non_absolute, "Non-absolute URL found"
print("Lines in file:", len(lines))
print("Sample:", lines[:5])


Lines in file: 19
Sample: ['https://globeatnight.org/documents/926/GaN2024.csv', 'https://globeatnight.org/documents/661/GaN2023.csv', 'https://globeatnight.org/documents/662/GaN2022.csv', 'https://globeatnight.org/documents/663/GaN2021.csv', 'https://globeatnight.org/documents/679/GaN2020.csv']
