In [9]:
# Cell 1 — imports
from pathlib import Path
from zipfile import ZipFile
import pandas as pd
import json  # stdlib, no install needed

# Cell 2 — locate the monthly ZIP archives (the folder should hold your 12 monthly ZIPs)
data_dir = Path(
    "/Users/renatabatista/Other Docs/Germany/CareerFoundry/Data Specialization/JupyterLab/citibike_2022/extracted/2022-citibike-tripdata"
)
# "2022??" matches any two characters after the year (the month part of the
# file name); the matches are then sorted by path.
zips = list(data_dir.glob("2022??-citibike-tripdata.zip"))
zips.sort()
len(zips), zips[:3]  # quick check: how many archives matched, and the first three

# Cell 3 — column selection and dtypes, chosen to save memory (adjust if needed)

# Timestamp columns handed to read_csv's parse_dates.
parse_dates = ["started_at", "ended_at"]

# Only these columns are read from each CSV.
usecols = [
    "ride_id",
    "rideable_type",
    "started_at",
    "ended_at",
    "start_station_id",
    "start_station_name",
    "end_station_id",
    "end_station_name",
    "start_lat",
    "start_lng",
    "end_lat",
    "end_lng",
    "member_casual",
]

# Explicit dtypes for the text-like columns. The lat/lng columns are left out
# on purpose so they keep pandas' default (float64).
dtypes = dict(
    ride_id="string",
    rideable_type="category",
    start_station_id="string",
    start_station_name="string",
    end_station_id="string",
    end_station_name="string",
    member_casual="category",
)

# Cell 4 — function that reads all CSVs inside one monthly ZIP
def read_month_zip(zpath: Path, read_csv_kwargs=None) -> pd.DataFrame:
    """Read every CSV member of one ZIP archive into a single DataFrame.

    Each CSV is read straight from the ZIP stream (nothing is extracted to
    disk) and the per-file frames are stacked with a fresh 0..n-1 index.

    Parameters
    ----------
    zpath : Path
        Location of the ZIP archive.
    read_csv_kwargs : dict, optional
        Keyword arguments forwarded to ``pd.read_csv``. When omitted, the
        notebook-level ``usecols``, ``dtypes`` and ``parse_dates`` settings
        are used together with ``low_memory=False``.

    Returns
    -------
    pd.DataFrame
        Rows of all CSV members, in archive order.

    Raises
    ------
    ValueError
        If the archive contains no readable CSV member.
    """
    if read_csv_kwargs is None:
        # Fall back to the column/dtype configuration defined earlier in the notebook.
        read_csv_kwargs = {
            "usecols": usecols,
            "dtype": dtypes,
            "parse_dates": parse_dates,
            "low_memory": False,
        }

    frames = []
    with ZipFile(zpath) as zf:
        for name in zf.namelist():
            if not name.lower().endswith(".csv"):
                continue
            # Archives built on macOS carry AppleDouble metadata entries such
            # as "__MACOSX/._trips.csv": the name ends in .csv but the content
            # is not CSV data, so skip them instead of trying to parse them.
            basename = name.rsplit("/", 1)[-1]
            if name.startswith("__MACOSX/") or basename.startswith("._"):
                continue
            with zf.open(name) as f:
                frames.append(pd.read_csv(f, **read_csv_kwargs))

    if not frames:
        # pd.concat([]) would raise a ValueError that does not say which
        # archive was the problem; raise the same type with a useful message.
        raise ValueError(f"No CSV files found inside {zpath}")
    return pd.concat(frames, ignore_index=True)

# Cell 5 — read all months and vertically join (concatenate)
if not zips:
    # Without this guard pd.concat([]) fails with "No objects to concatenate",
    # which hides the real cause: the glob matched no ZIP files.
    raise FileNotFoundError(f"No monthly ZIP files found in {data_dir}")

monthly_frames = [read_month_zip(z) for z in zips]
rides = pd.concat(monthly_frames, ignore_index=True)

# pd.concat falls back to object dtype for a categorical column when the
# pieces do not all share the same categories (e.g. a value that only occurs
# in some files). Re-cast any column that was requested as "category" but
# lost that dtype, so the memory saving from Cell 3 is kept.
lost_category = [
    col
    for col, kind in dtypes.items()
    if kind == "category"
    and col in rides.columns
    and not isinstance(rides[col].dtype, pd.CategoricalDtype)
]
if lost_category:
    rides = rides.astype({col: "category" for col in lost_category})

# quick sanity checks
rides.shape, rides.head(), rides.dtypes.head(10)


((29838806, 13),
             ride_id  rideable_type              started_at  \
 0  63AF72AB3CD47753   classic_bike 2022-01-13 21:36:47.689   
 1  9C0DAD8C1E0EA571   classic_bike 2022-01-16 17:56:23.889   
 2  9576DDD8920974F5  electric_bike 2022-01-18 07:10:04.799   
 3  962A466CC3AC6781   classic_bike 2022-01-22 12:10:10.225   
 4  C2585407BA0FE3E9   classic_bike 2022-01-08 16:35:16.497   
 
                  ended_at                start_station_name start_station_id  \
 0 2022-01-13 21:46:02.024                   5 Ave & E 63 St          6904.06   
 1 2022-01-16 18:03:50.269  Grand Army Plaza & Plaza St West          4010.15   
 2 2022-01-18 07:20:54.450                  W 20 St & 10 Ave          6306.01   
 3 2022-01-22 12:20:06.899                   W 54 St & 9 Ave          6920.03   
 4 2022-01-08 16:45:33.279              Sharon St & Olive St          5323.05   
 
               end_station_name end_station_id  start_lat  start_lng  \
 0           Broadway & W 51 St        6779

## How the data-loading code works
- I point data_dir at the folder containing the 12 monthly ZIP files for 2022 and glob them with the pattern 2022??-citibike-tripdata.zip.
- To be memory-efficient, I define `usecols` (only the columns I need) and `dtypes` (categories/strings where appropriate), and parse the timestamp columns (`started_at`, `ended_at`) as datetimes.
- For each month, I open the ZIP with zipfile.ZipFile and iterate over the CSV(s) inside. I read each CSV directly from the ZIP stream using pd.read_csv(zf.open(name), ...), avoiding temporary extraction to disk.
- I collect the monthly DataFrames in a list and concatenate them into one big DataFrame rides using pd.concat(..., ignore_index=True).
- Finally, I do a quick shape/head/dtypes check and (optionally) save the merged dataset to Parquet (recommended for size and speed) or a compressed CSV.

In [11]:
# Pick ONE of these:
# Parquet — smaller on disk and faster to reload for future work
rides.to_parquet(
    "citibike_2022.parquet",
    index=False,
)
