In [None]:
import requests
import zipfile
import io
import pandas as pd
from collections import Counter

# Find the three most frequent start stations in the 2023 Citi Bike trip
# archive and write every trip that starts at one of them to a CSV file.
#
# The outer ZIP at `url` contains one inner ZIP per entry ending in ".zip";
# each inner ZIP holds CSV file(s) that are streamed in `chunk_size`-row
# chunks so a whole CSV is never parsed into memory at once.
url = 'https://s3.amazonaws.com/tripdata/2023-citibike-tripdata.zip'
chunk_size = 1_000_000
station_counter = Counter()
station_column = 'start_station_name'


def _is_archive_member(name, suffix):
    """Return True when *name* is a real archive entry ending in *suffix*.

    Entries under ``__MACOSX/`` or whose basename starts with ``._`` are
    skipped: they can carry the same extension as the data files but are
    metadata added by some zip tools, not parseable content.
    """
    basename = name.rsplit('/', 1)[-1]
    if name.startswith('__MACOSX/') or basename.startswith('._'):
        return False
    return name.lower().endswith(suffix)


def _iter_trip_chunks(archive, inner_name, rows_per_chunk):
    """Yield DataFrame chunks from every CSV inside one inner ZIP.

    Args:
        archive: Open ``zipfile.ZipFile`` that contains *inner_name*.
        inner_name: Member name of the nested ZIP to read.
        rows_per_chunk: ``chunksize`` passed to ``pd.read_csv``.

    Yields:
        Chunks that contain the ``start_station_name`` column; chunks
        without it are skipped. An inner ZIP with no CSV yields nothing.
    """
    # The nested ZIP is read fully into memory because ZipFile needs a
    # seekable file object to locate its central directory.
    with archive.open(inner_name) as inner_file:
        inner_zip_data = inner_file.read()

    with zipfile.ZipFile(io.BytesIO(inner_zip_data)) as inner_zip:
        csv_names = [f for f in inner_zip.namelist()
                     if _is_archive_member(f, '.csv')]
        # Read every CSV, not just the first: an inner archive may hold
        # more than one, and skipping the rest silently loses trips.
        for csv_name in csv_names:
            with inner_zip.open(csv_name) as csv_file:
                reader = pd.read_csv(
                    csv_file, chunksize=rows_per_chunk, low_memory=False,
                    dtype={'start_station_id': str, 'end_station_id': str})
                for chunk in reader:
                    if station_column in chunk.columns:
                        yield chunk


# Step 1: Download and read outer ZIP
# The timeout bounds connection setup and each wait for data, so a stalled
# server raises instead of hanging the run indefinitely.
response = requests.get(url, timeout=60)
response.raise_for_status()
outer_zip = zipfile.ZipFile(io.BytesIO(response.content))

# Step 2: Loop through inner ZIPs
inner_zip_names = [f for f in outer_zip.namelist()
                   if _is_archive_member(f, '.zip')]

print(f"Found {len(inner_zip_names)} monthly zip files.")

# First pass — count top 3 stations
for inner_name in inner_zip_names:
    print(f"Processing inner ZIP: {inner_name}")
    for chunk in _iter_trip_chunks(outer_zip, inner_name, chunk_size):
        # value_counts() drops NaN by default and aggregates per chunk, so
        # the Counter merges one entry per station rather than one per row.
        station_counter.update(chunk[station_column].value_counts().to_dict())

# Get top 3
top3_stations = [station for station, _ in station_counter.most_common(3)]
print("\nTop 3 Start Stations:")
for station in top3_stations:
    print(f"- {station}")

# Second pass — filter and write matching rows
output_file = 'top3_stations_output.csv'
is_first_chunk = True

# Open the output once in write mode: this truncates any file left over from
# a previous run even when nothing matches, and avoids reopening the file for
# every chunk. newline='' leaves line-ending handling to pandas.
with open(output_file, 'w', newline='', encoding='utf-8') as out_handle:
    for inner_name in inner_zip_names:
        print(f"Filtering rows in: {inner_name}")
        for chunk in _iter_trip_chunks(outer_zip, inner_name, chunk_size):
            filtered = chunk[chunk[station_column].isin(top3_stations)]
            if filtered.empty:
                continue
            # The header is written only with the first batch of rows.
            filtered.to_csv(out_handle, index=False, header=is_first_chunk)
            is_first_chunk = False


Found 12 monthly zip files.
Processing inner ZIP: 2023-citibike-tripdata/202302-citibike-tripdata.zip
Processing inner ZIP: 2023-citibike-tripdata/202308-citibike-tripdata.zip
Processing inner ZIP: 2023-citibike-tripdata/202306-citibike-tripdata.zip
Processing inner ZIP: 2023-citibike-tripdata/202310-citibike-tripdata.zip
Processing inner ZIP: 2023-citibike-tripdata/202304-citibike-tripdata.zip
Processing inner ZIP: 2023-citibike-tripdata/202312-citibike-tripdata.zip
Processing inner ZIP: 2023-citibike-tripdata/202303-citibike-tripdata.zip
Processing inner ZIP: 2023-citibike-tripdata/202301-citibike-tripdata.zip
Processing inner ZIP: 2023-citibike-tripdata/202309-citibike-tripdata.zip
Processing inner ZIP: 2023-citibike-tripdata/202311-citibike-tripdata.zip
Processing inner ZIP: 2023-citibike-tripdata/202307-citibike-tripdata.zip
Processing inner ZIP: 2023-citibike-tripdata/202305-citibike-tripdata.zip

Top 3 Start Stations:
- W 21 St & 6 Ave
- W 31 St & 7 Ave
- University Pl & E 14 St