In [1]:
import asyncio
import aiohttp
import pandas as pd
import nest_asyncio
nest_asyncio.apply()
from dotenv import load_dotenv
load_dotenv()
import os

# Socrata application token, looked up in the process environment
# (None when CHICAGO_API_TOKEN is not set)
APP_TOKEN = os.environ.get("CHICAGO_API_TOKEN")

# Dataset endpoint plus paging / concurrency settings
BASE_URL = "https://data.cityofchicago.org/resource/ijzp-q8t2.json"
LIMIT = 10_000  # rows requested per page ($limit)
CONCURRENT_REQUESTS = 5  # pages requested together in each wave

# Shared gate that caps how many fetches may run at the same time
semaphore = asyncio.Semaphore(value=CONCURRENT_REQUESTS)

In [2]:
async def fetch_batch(session, offset, retries=5):
    """Fetch one page of `primary_type` values starting at `offset`.

    Args:
        session: Client session object whose `get(url, headers=..., params=...)`
            returns an async context manager yielding a response with
            `.status` and an awaitable `.json()`.
        offset: Value sent as the `$offset` query parameter.
        retries: Maximum number of attempts before giving up.

    Returns:
        The decoded JSON body on HTTP 200. An empty list when the server
        answers with a non-retryable status or when every attempt fails;
        callers cannot tell such a failure apart from a genuinely empty page.
    """
    # Omit the token header entirely when no token is configured rather than
    # sending a None value.
    headers = {"X-App-Token": APP_TOKEN} if APP_TOKEN else {}
    params = {
        "$select": "primary_type",
        "$limit": LIMIT,
        "$offset": offset,
        "$where": "date IS NOT NULL",
        # Offset paging needs an explicit order so consecutive pages neither
        # overlap nor skip rows; `:id` is Socrata's internal row identifier.
        "$order": ":id",
    }
    retryable_statuses = (429, 500, 502, 503, 504)

    async with semaphore:
        for attempt in range(retries):
            try:
                async with session.get(BASE_URL, headers=headers, params=params) as resp:
                    if resp.status == 200:
                        return await resp.json()
                    if resp.status not in retryable_statuses:
                        print(f"Non-retryable error at offset {offset}: HTTP {resp.status}")
                        return []
                    failure = f"Retryable error {resp.status} at offset {offset}."
            except Exception as e:
                # Best effort: any request/decoding error is treated as retryable.
                failure = f"Exception at offset {offset}: {e}"

            # The response context is closed at this point, so the backoff
            # does not keep the failed response open while sleeping.
            if attempt == retries - 1:
                print(f"{failure} Giving up after {retries} attempts.")
                break
            wait = 2 ** attempt  # exponential backoff: 1s, 2s, 4s, ...
            print(f"{failure} Retrying in {wait}s...")
            await asyncio.sleep(wait)
        return []

async def fetch_all_data():
    """Download every page of the dataset and return it as a DataFrame.

    Pages are requested in waves of CONCURRENT_REQUESTS consecutive offsets.
    Paging stops once the last page of a wave comes back with fewer than
    LIMIT rows. A short page that is followed by a non-empty page in the same
    wave cannot be the end of the data, so it is reported as a gap and the
    download continues instead of being cut off there.

    Returns:
        pandas.DataFrame built from all collected records.

    Note:
        fetch_batch returns an empty list on failure, so a failure on the
        last page of a wave is still indistinguishable from end-of-data.
    """
    all_records = []
    offset = 0

    async with aiohttp.ClientSession() as session:
        while True:
            wave_offsets = [offset + i * LIMIT for i in range(CONCURRENT_REQUESTS)]
            results = await asyncio.gather(
                *(fetch_batch(session, page_offset) for page_offset in wave_offsets)
            )

            for batch in results:
                if batch:
                    all_records.extend(batch)

            print(f"Retrieved: {len(all_records)} records...")

            # Any short page located before the last non-empty page of the
            # wave means rows are missing there (most likely a failed request).
            last_with_data = max(
                (i for i, batch in enumerate(results) if batch), default=-1
            )
            for i in range(last_with_data):
                if len(results[i]) < LIMIT:
                    print(
                        f"Warning: page at offset {wave_offsets[i]} returned "
                        f"{len(results[i])} rows but later pages have data; "
                        f"records are missing from the result."
                    )

            # Only a short final page signals that the dataset is exhausted.
            if not results or len(results[-1]) < LIMIT:
                break

            offset += CONCURRENT_REQUESTS * LIMIT

    # Convert to DataFrame with only primary_type column
    df = pd.DataFrame(all_records)
    return df

# Top-level `await` is only valid where the environment supplies a running
# event loop (e.g. a Jupyter/IPython cell); it is a syntax error in a plain
# Python script. Downloads the data, reports the row count, and previews it.
df = await fetch_all_data()
print(f"Total records: {len(df)}")
df.head()


Retrieved: 50000 records...
Retrieved: 100000 records...
Retrieved: 150000 records...
Retrieved: 200000 records...
Retrieved: 250000 records...
Retrieved: 300000 records...
Retrieved: 350000 records...
Retrieved: 400000 records...
Retrieved: 450000 records...
Retrieved: 500000 records...
Retrieved: 550000 records...
Retrieved: 600000 records...
Retrieved: 650000 records...
Retrieved: 700000 records...
Retrieved: 750000 records...
Retrieved: 800000 records...
Retrieved: 850000 records...
Retrieved: 900000 records...
Retrieved: 950000 records...
Retrieved: 1000000 records...
Retrieved: 1050000 records...
Retrieved: 1100000 records...
Retrieved: 1150000 records...
Retrieved: 1200000 records...
Retrieved: 1250000 records...
Retrieved: 1300000 records...
Retrieved: 1350000 records...
Retrieved: 1400000 records...
Retrieved: 1450000 records...
Retrieved: 1500000 records...
Retrieved: 1550000 records...
Retrieved: 1600000 records...
Retrieved: 1650000 records...
Retrieved: 1700000 records...


Unnamed: 0,primary_type
0,OFFENSE INVOLVING CHILDREN
1,NARCOTICS
2,ROBBERY
3,CRIM SEXUAL ASSAULT
4,CRIMINAL DAMAGE


In [3]:

# Tally occurrences of each primary_type, label the two resulting columns,
# and order the rows from most to least frequent.
primary_type_counts = (
    df['primary_type']
    .value_counts()
    .reset_index()
    .set_axis(['primary_type', 'count'], axis='columns')
    .sort_values(by='count', ascending=False)
)

# Show the frequency table
print(primary_type_counts)

                         primary_type    count
0                               THEFT  1773710
1                             BATTERY  1523512
2                     CRIMINAL DAMAGE   950729
3                           NARCOTICS   761946
4                             ASSAULT   558516
5                       OTHER OFFENSE   521161
6                            BURGLARY   442644
7                 MOTOR VEHICLE THEFT   426113
8                  DECEPTIVE PRACTICE   383773
9                             ROBBERY   313025
10                  CRIMINAL TRESPASS   225296
11                  WEAPONS VIOLATION   123552
12                       PROSTITUTION    70346
13         OFFENSE INVOLVING CHILDREN    59993
14             PUBLIC PEACE VIOLATION    54564
15                        SEX OFFENSE    33859
16                CRIM SEXUAL ASSAULT    27304
17   INTERFERENCE WITH PUBLIC OFFICER    19965
18               LIQUOR LAW VIOLATION    15334
19                           GAMBLING    14654
20           

In [None]:

# Step 1: Build the per-type frequency table, most frequent first
primary_type_counts = (
    df['primary_type']
    .value_counts()
    .reset_index()
    .set_axis(['primary_type', 'count'], axis='columns')
    .sort_values(by='count', ascending=False)
)

# Step 2: Keep the first 11 rows of the sorted table
top_11_primary_types = primary_type_counts.iloc[:11]

# Step 3: Write them to CSV without the index column
top_11_primary_types.to_csv("top_11_primary_types.csv", index=False)

# Confirm
print("Exported top 11 primary types to 'top_11_primary_types.csv'")