In [1]:
import asyncio
import os

import aiohttp
import nest_asyncio
import pandas as pd
nest_asyncio.apply()

# Socrata app token.
# Prefer the SOCRATA_APP_TOKEN environment variable so the credential does not
# have to live in the notebook. The literal fallback only keeps existing runs
# working.
# NOTE(review): rotate this token and remove the fallback before sharing.
APP_TOKEN = os.environ.get("SOCRATA_APP_TOKEN", "ILyt2BcXrbu1P13T4ldf55GT8")

# Endpoint and paging configuration
BASE_URL = "https://data.cityofchicago.org/resource/ijzp-q8t2.json"
LIMIT = 10000  # rows requested per call (sent as $limit)
CONCURRENT_REQUESTS = 5  # batches requested per round; also the semaphore size

# Caps how many fetch_batch calls may be inside their request section at once.
semaphore = asyncio.Semaphore(CONCURRENT_REQUESTS)

In [2]:
async def fetch_batch(session, offset, retries=5):
    """Fetch one page of ``ward``/``date`` rows starting at ``offset``.

    Args:
        session: HTTP session whose ``get(url, headers=..., params=...)`` returns
            an async context manager exposing ``status`` and ``json()``.
        offset: Row offset sent as ``$offset``.
        retries: Maximum number of attempts before giving up.

    Returns:
        The decoded JSON body on HTTP 200. ``[]`` when the server answers with a
        non-retryable status or when every attempt fails.
    """
    headers = {"X-App-Token": APP_TOKEN}
    params = {
        "$select": "ward, date",
        "$limit": LIMIT,
        "$offset": offset,
        # Rows missing either field are filtered out server-side.
        "$where": "date IS NOT NULL AND ward IS NOT NULL",
        # $offset paging is only stable under an explicit ordering; without it
        # consecutive pages may overlap or skip rows.
        "$order": ":id",
    }
    retryable_statuses = (429, 500, 502, 503, 504)

    # The semaphore slot is held across retries, as before, so a struggling
    # offset does not free capacity for even more requests.
    async with semaphore:
        for attempt in range(retries):
            try:
                async with session.get(BASE_URL, headers=headers, params=params) as resp:
                    if resp.status == 200:
                        return await resp.json()
                    if resp.status not in retryable_statuses:
                        print(f"Non-retryable error at offset {offset}: HTTP {resp.status}")
                        return []
                    problem = f"Retryable error {resp.status} at offset {offset}."
            except Exception as e:
                # Deliberately broad: this is a best-effort fetch and any failure
                # of a single attempt is treated as retryable.
                problem = f"Exception at offset {offset}: {e}"

            # Back off outside the response context so the connection is not
            # held open while sleeping, and skip the pointless final sleep.
            if attempt == retries - 1:
                print(f"{problem} Giving up after {retries} attempts.")
                break
            wait = 2 ** attempt
            print(f"{problem} Retrying in {wait}s...")
            await asyncio.sleep(wait)
        return []

async def fetch_all_data():
    """Page through the endpoint and return every retrieved row as a DataFrame.

    Batches are requested in rounds of ``CONCURRENT_REQUESTS`` consecutive
    offsets spaced ``LIMIT`` rows apart. Paging stops once the last batch of a
    round comes back with fewer than ``LIMIT`` rows.

    ``fetch_batch`` reports a failed request as ``[]``, which is
    indistinguishable from "no more rows". A short batch that is followed by a
    non-empty batch in the same round cannot be the end of the data, so it is
    recorded and reported instead of ending the download early. A failure in
    the last slot of a round still ends paging, because it cannot be told apart
    from the true end of the data.

    Returns:
        DataFrame with a datetime ``date`` column and a numeric ``ward`` column
        (unparseable values are coerced to NaT/NaN). When nothing was
        retrieved, the frame is empty but still has both columns.
    """
    all_records = []
    missing_offsets = []  # offsets whose batch was short although later rows exist
    offset = 0

    async with aiohttp.ClientSession() as session:
        while True:
            offsets = [offset + i * LIMIT for i in range(CONCURRENT_REQUESTS)]
            results = await asyncio.gather(
                *(fetch_batch(session, batch_offset) for batch_offset in offsets)
            )

            for batch in results:
                if batch:
                    all_records.extend(batch)

            print(f"Retrieved: {len(all_records)} records...")

            # Any short batch positioned before a batch that returned rows must
            # be a failed or incomplete fetch rather than the end of the data.
            last_with_data = max(
                (i for i, batch in enumerate(results) if batch), default=-1
            )
            for i in range(last_with_data):
                if len(results[i]) < LIMIT:
                    missing_offsets.append(offsets[i])

            # With no failures, "some batch is short" and "the last batch is
            # short" are equivalent, so checking the tail preserves the normal
            # stop condition while ignoring holes in the middle of a round.
            if not results or len(results[-1]) < LIMIT:
                break

            offset += CONCURRENT_REQUESTS * LIMIT

    if missing_offsets:
        print(
            f"Warning: incomplete batches at offsets {missing_offsets}; "
            "the result is missing rows."
        )

    # Convert to DataFrame; guarantee both columns exist even with zero rows,
    # otherwise the conversions below raise KeyError.
    df = pd.DataFrame(all_records)
    for column in ("ward", "date"):
        if column not in df.columns:
            df[column] = pd.Series(dtype="object")

    # Parse 'date' as datetime and 'ward' as a number.
    df["date"] = pd.to_datetime(df["date"], errors="coerce")
    df["ward"] = pd.to_numeric(df["ward"], errors="coerce")

    return df

# Run the download. A bare top-level `await` needs an async-aware context
# (e.g., a Jupyter cell); it is not valid in a plain Python script.
df = await fetch_all_data()
print(f"Total records: {len(df)}")
df.head()



Retrieved: 50000 records...
Retrieved: 100000 records...
Retrieved: 150000 records...
Retrieved: 200000 records...
Retrieved: 250000 records...
Retrieved: 300000 records...
Retrieved: 350000 records...
Retrieved: 400000 records...
Retrieved: 450000 records...
Retrieved: 500000 records...
Retrieved: 550000 records...
Retrieved: 600000 records...
Retrieved: 650000 records...
Retrieved: 700000 records...
Retrieved: 750000 records...
Retrieved: 800000 records...
Retrieved: 850000 records...
Retrieved: 900000 records...
Retrieved: 950000 records...
Retrieved: 1000000 records...
Retrieved: 1050000 records...
Retrieved: 1100000 records...
Retrieved: 1150000 records...
Retrieved: 1200000 records...
Retrieved: 1250000 records...
Retrieved: 1300000 records...
Retrieved: 1350000 records...
Retrieved: 1400000 records...
Retrieved: 1450000 records...
Retrieved: 1500000 records...
Retrieved: 1550000 records...
Retrieved: 1600000 records...
Retrieved: 1650000 records...
Retrieved: 1700000 records...


Unnamed: 0,ward,date
0,25,2022-07-29 03:39:00
1,28,2023-01-03 16:44:00
2,1,2020-08-10 09:45:00
3,42,2017-08-26 10:00:00
4,42,2023-09-06 17:00:00


In [None]:
# Analysis window (calendar years, both ends inclusive) and output location.
START_YEAR = 2010
END_YEAR = 2019
OUTPUT_CSV = f"ward_record_counts_{START_YEAR}_{END_YEAR}.csv"

# Coerce both columns BEFORE dropping missing values, so anything that fails to
# parse (and becomes NaT/NaN) is removed by the same dropna. Coercing 'ward'
# after the dropna could leave NaN wards behind in df.
df['date'] = pd.to_datetime(df['date'], errors='coerce')
df['ward'] = pd.to_numeric(df['ward'], errors='coerce')

# Drop rows with missing date or ward
df = df.dropna(subset=['date', 'ward'])

# Keep only records whose year falls inside the analysis window
df_filtered = df[df['date'].dt.year.between(START_YEAR, END_YEAR)]

# Count records per ward, ordered by ward number
ward_counts = df_filtered['ward'].value_counts().sort_index().reset_index()
ward_counts.columns = ['ward', 'record_count']

# Display result
print(ward_counts)

# Persist the per-ward counts next to the notebook
ward_counts.to_csv(OUTPUT_CSV, index=False)
print(f"Exported to '{OUTPUT_CSV}'")