In [2]:
import os
import requests
import zipfile
import pandas as pd
import shutil
from datetime import datetime
from dateutil.relativedelta import relativedelta
import pytz
from io import BytesIO

TEMP_FOLDER = "tripdata_temp"
OUTPUT_FILE = "top3_stations_output.csv"
CHUNK_SIZE = 500_000

def get_last_12_months_est():
    eastern = pytz.timezone("US/Eastern")
    now_est = datetime.now(eastern)
    return [(now_est - relativedelta(months=i)).strftime('%Y%m') for i in range(12)]

def download_zip_to_memory(ym):
    base_url = "https://s3.amazonaws.com/tripdata/"
    filenames = [
        f"{ym}-citibike-tripdata.zip",
        f"{ym}-citibike-tripdata.csv.zip"
    ]
    for fname in filenames:
        url = base_url + fname
        try:
            print(f"🌐 Trying: {url}")
            r = requests.get(url, timeout=20)
            if r.status_code == 200:
                print(f"✅ Downloaded: {fname}")
                return BytesIO(r.content)
            else:
                print(f"❌ Not found: {url}")
        except Exception as e:
            print(f"⚠️ Error downloading {url}: {e}")
    return None

def extract_all_csvs(zip_bytes_io, extract_to):
    """Extracts all .csv files from zip, even if nested zip files or folders."""
    try:
        with zipfile.ZipFile(zip_bytes_io) as zf:
            for member in zf.namelist():
                if member.endswith('.zip'):
                    # Nested zip file: extract, open, extract .csv from it
                    nested_zip_data = zf.read(member)
                    with zipfile.ZipFile(BytesIO(nested_zip_data)) as nested_zf:
                        for nested_member in nested_zf.namelist():
                            if nested_member.endswith('.csv'):
                                print(f"📦 Extracting nested CSV: {nested_member}")
                                nested_zf.extract(nested_member, extract_to)
                elif member.endswith('.csv'):
                    print(f"📁 Extracting CSV: {member}")
                    zf.extract(member, extract_to)
    except Exception as e:
        print(f"⚠️ Error extracting zip: {e}")

def flatten_csvs_folder(root_folder):
    flat_files = []
    for root, dirs, files in os.walk(root_folder):
        for fname in files:
            if fname.endswith(".csv"):
                full_path = os.path.join(root, fname)
                flat_files.append(full_path)
    return flat_files

def get_top3_station_names(filepaths):
    freq = {}
    for path in filepaths:
        try:
            for chunk in pd.read_csv(path, usecols=["start_station_name"], chunksize=CHUNK_SIZE):
                chunk = chunk.dropna(subset=["start_station_name"])
                for station in chunk["start_station_name"]:
                    freq[station] = freq.get(station, 0) + 1
        except Exception as e:
            print(f"⚠️ Skipping {path}: {e}")
    top3 = sorted(freq.items(), key=lambda x: x[1], reverse=True)[:3]
    return [x[0] for x in top3]

def write_top3_data(filepaths, top3, output=OUTPUT_FILE):
    first_write = True
    for path in filepaths:
        try:
            for chunk in pd.read_csv(path, chunksize=CHUNK_SIZE):
                if "start_station_name" not in chunk.columns:
                    continue
                filtered = chunk[chunk["start_station_name"].isin(top3)]
                if not filtered.empty:
                    filtered.to_csv(output, index=False, mode='a' if not first_write else 'w', header=first_write)
                    first_write = False
        except Exception as e:
            print(f"⚠️ Skipping {path}: {e}")

def run_super_pipeline():
    if os.path.exists(TEMP_FOLDER):
        shutil.rmtree(TEMP_FOLDER)
    os.makedirs(TEMP_FOLDER, exist_ok=True)

    months = get_last_12_months_est()
    print("🚀 Starting download + extraction...")

    for ym in months:
        zip_mem = download_zip_to_memory(ym)
        if zip_mem:
            extract_all_csvs(zip_mem, TEMP_FOLDER)

    all_csvs = flatten_csvs_folder(TEMP_FOLDER)
    if not all_csvs:
        print("❌ No CSV files found.")
        return

    print("\n🔍 Counting top 3 stations...")
    top3 = get_top3_station_names(all_csvs)
    print(f"🏆 Top 3 Stations: {top3}")

    print("\n📤 Writing filtered data...")
    write_top3_data(all_csvs, top3)
    print(f"✅ Output written to `{OUTPUT_FILE}`")

    print("\n🧹 Cleaning up temp files...")
    shutil.rmtree(TEMP_FOLDER)
    print("✅ Temp folder deleted.")

# Run the pipeline
run_super_pipeline()


🚀 Starting download + extraction...
🌐 Trying: https://s3.amazonaws.com/tripdata/202505-citibike-tripdata.zip
❌ Not found: https://s3.amazonaws.com/tripdata/202505-citibike-tripdata.zip
🌐 Trying: https://s3.amazonaws.com/tripdata/202505-citibike-tripdata.csv.zip
❌ Not found: https://s3.amazonaws.com/tripdata/202505-citibike-tripdata.csv.zip
🌐 Trying: https://s3.amazonaws.com/tripdata/202504-citibike-tripdata.zip
✅ Downloaded: 202504-citibike-tripdata.zip
📁 Extracting CSV: 202504-citibike-tripdata_3.csv
📁 Extracting CSV: 202504-citibike-tripdata_2.csv
📁 Extracting CSV: 202504-citibike-tripdata_1.csv
📁 Extracting CSV: 202504-citibike-tripdata_4.csv
🌐 Trying: https://s3.amazonaws.com/tripdata/202503-citibike-tripdata.zip
❌ Not found: https://s3.amazonaws.com/tripdata/202503-citibike-tripdata.zip
🌐 Trying: https://s3.amazonaws.com/tripdata/202503-citibike-tripdata.csv.zip
✅ Downloaded: 202503-citibike-tripdata.csv.zip
📁 Extracting CSV: 202503-citibike-tripdata.csv
🌐 Trying: https://s3.amazo

  for chunk in pd.read_csv(path, chunksize=CHUNK_SIZE):
  for chunk in pd.read_csv(path, chunksize=CHUNK_SIZE):
  for chunk in pd.read_csv(path, chunksize=CHUNK_SIZE):
  for chunk in pd.read_csv(path, chunksize=CHUNK_SIZE):
  for chunk in pd.read_csv(path, chunksize=CHUNK_SIZE):
  for chunk in pd.read_csv(path, chunksize=CHUNK_SIZE):
  for chunk in pd.read_csv(path, chunksize=CHUNK_SIZE):
  for chunk in pd.read_csv(path, chunksize=CHUNK_SIZE):
  for chunk in pd.read_csv(path, chunksize=CHUNK_SIZE):
  for chunk in pd.read_csv(path, chunksize=CHUNK_SIZE):
  for chunk in pd.read_csv(path, chunksize=CHUNK_SIZE):
  for chunk in pd.read_csv(path, chunksize=CHUNK_SIZE):
  for chunk in pd.read_csv(path, chunksize=CHUNK_SIZE):
  for chunk in pd.read_csv(path, chunksize=CHUNK_SIZE):
  for chunk in pd.read_csv(path, chunksize=CHUNK_SIZE):
  for chunk in pd.read_csv(path, chunksize=CHUNK_SIZE):
  for chunk in pd.read_csv(path, chunksize=CHUNK_SIZE):
  for chunk in pd.read_csv(path, chunksize=CHUNK

⚠️ Skipping tripdata_temp\__MACOSX\._202409-citibike-tripdata_1.csv: 'utf-8' codec can't decode byte 0xb0 in position 45: invalid start byte
⚠️ Skipping tripdata_temp\__MACOSX\._202409-citibike-tripdata_2.csv: 'utf-8' codec can't decode byte 0xb0 in position 45: invalid start byte
⚠️ Skipping tripdata_temp\__MACOSX\._202409-citibike-tripdata_3.csv: 'utf-8' codec can't decode byte 0xb0 in position 45: invalid start byte
⚠️ Skipping tripdata_temp\__MACOSX\._202409-citibike-tripdata_4.csv: 'utf-8' codec can't decode byte 0xb0 in position 45: invalid start byte
⚠️ Skipping tripdata_temp\__MACOSX\._202409-citibike-tripdata_5.csv: 'utf-8' codec can't decode byte 0xb0 in position 45: invalid start byte
✅ Output written to `top3_stations_output.csv`

🧹 Cleaning up temp files...
✅ Temp folder deleted.


In [5]:
import os
import requests
import zipfile
import pandas as pd
import shutil
from datetime import datetime
from dateutil.relativedelta import relativedelta
import pytz
from io import BytesIO

TEMP_FOLDER = "tripdata_temp"
OUTPUT_FILE = "top3_stations_output.csv"
CHUNK_SIZE = 500_000

TARGET_COLS = [
    "ride_id", "rideable_type", "started_at", "ended_at",
    "start_station_name", "start_station_id",
    "end_station_name", "end_station_id",
    "start_lat", "start_lng", "end_lat", "end_lng",
    "member_casual"
]

DTYPES = {
    "ride_id": str,
    "rideable_type": str,
    "started_at": str,
    "ended_at": str,
    "start_station_name": str,
    "start_station_id": str,
    "end_station_name": str,
    "end_station_id": str,
    "start_lat": float,
    "start_lng": float,
    "end_lat": float,
    "end_lng": float,
    "member_casual": str
}

def get_last_12_months_est():
    eastern = pytz.timezone("US/Eastern")
    now_est = datetime.now(eastern)
    # subtract 1 extra month to exclude current month
    return [(now_est - relativedelta(months=i + 1)).strftime('%Y%m') for i in range(12)]


def download_zip_to_memory(ym):
    base_url = "https://s3.amazonaws.com/tripdata/"
    filenames = [
        f"{ym}-citibike-tripdata.zip",
        f"{ym}-citibike-tripdata.csv.zip"
    ]
    for fname in filenames:
        url = base_url + fname
        try:
            print(f"🌐 Trying: {url}")
            r = requests.get(url, timeout=20)
            if r.status_code == 200:
                print(f"✅ Downloaded: {fname}")
                return BytesIO(r.content)
            else:
                print(f"❌ Not found: {url}")
        except Exception as e:
            print(f"⚠️ Error downloading {url}: {e}")
    return None

def extract_all_csvs(zip_bytes_io, extract_to):
    try:
        with zipfile.ZipFile(zip_bytes_io) as zf:
            for member in zf.namelist():
                if member.endswith('.zip'):
                    nested_zip_data = zf.read(member)
                    with zipfile.ZipFile(BytesIO(nested_zip_data)) as nested_zf:
                        for nested_member in nested_zf.namelist():
                            if nested_member.endswith('.csv'):
                                print(f"📦 Extracting nested CSV: {nested_member}")
                                nested_zf.extract(nested_member, extract_to)
                elif member.endswith('.csv'):
                    print(f"📁 Extracting CSV: {member}")
                    zf.extract(member, extract_to)
    except Exception as e:
        print(f"⚠️ Error extracting zip: {e}")

def flatten_csvs_folder(root_folder):
    flat_files = []
    for root, _, files in os.walk(root_folder):
        for fname in files:
            if fname.endswith(".csv"):
                full_path = os.path.join(root, fname)
                flat_files.append(full_path)
    return flat_files

def get_top3_station_names(filepaths):
    freq = {}
    for path in filepaths:
        try:
            for chunk in pd.read_csv(path, usecols=["start_station_name"], dtype={"start_station_name": str}, chunksize=CHUNK_SIZE):
                chunk = chunk.dropna(subset=["start_station_name"])
                for station in chunk["start_station_name"]:
                    freq[station] = freq.get(station, 0) + 1
        except Exception as e:
            print(f"⚠️ Skipping {path}: {e}")
    top3 = sorted(freq.items(), key=lambda x: x[1], reverse=True)[:3]
    return [x[0] for x in top3]

def write_top3_data(filepaths, top3, output=OUTPUT_FILE):
    first_write = True
    for path in filepaths:
        try:
            for chunk in pd.read_csv(path, usecols=TARGET_COLS, dtype=DTYPES, chunksize=CHUNK_SIZE, low_memory=False):
                chunk = chunk.dropna(subset=["start_station_name"])
                filtered = chunk[chunk["start_station_name"].isin(top3)]
                if not filtered.empty:
                    filtered.to_csv(output, index=False, mode='a' if not first_write else 'w', header=first_write)
                    first_write = False
        except Exception as e:
            print(f"⚠️ Skipping {path}: {e}")

def run_super_pipeline():
    if os.path.exists(TEMP_FOLDER):
        shutil.rmtree(TEMP_FOLDER)
    os.makedirs(TEMP_FOLDER, exist_ok=True)

    months = get_last_12_months_est()
    print("🚀 Starting download + extraction...")

    for ym in months:
        zip_mem = download_zip_to_memory(ym)
        if zip_mem:
            extract_all_csvs(zip_mem, TEMP_FOLDER)

    all_csvs = flatten_csvs_folder(TEMP_FOLDER)
    if not all_csvs:
        print("❌ No CSV files found.")
        return

    print("\n🔍 Counting top 3 stations...")
    top3 = get_top3_station_names(all_csvs)
    print(f"🏆 Top 3 Stations: {top3}")

    print("\n📤 Writing filtered data...")
    write_top3_data(all_csvs, top3)
    print(f"✅ Output written to `{OUTPUT_FILE}`")

    print("\n🧹 Cleaning up temp files...")
    shutil.rmtree(TEMP_FOLDER)
    print("✅ Temp folder deleted.")

# Run the pipeline
run_super_pipeline()


🚀 Starting download + extraction...
🌐 Trying: https://s3.amazonaws.com/tripdata/202504-citibike-tripdata.zip
✅ Downloaded: 202504-citibike-tripdata.zip
📁 Extracting CSV: 202504-citibike-tripdata_3.csv
📁 Extracting CSV: 202504-citibike-tripdata_2.csv
📁 Extracting CSV: 202504-citibike-tripdata_1.csv
📁 Extracting CSV: 202504-citibike-tripdata_4.csv
🌐 Trying: https://s3.amazonaws.com/tripdata/202503-citibike-tripdata.zip
❌ Not found: https://s3.amazonaws.com/tripdata/202503-citibike-tripdata.zip
🌐 Trying: https://s3.amazonaws.com/tripdata/202503-citibike-tripdata.csv.zip
✅ Downloaded: 202503-citibike-tripdata.csv.zip
📁 Extracting CSV: 202503-citibike-tripdata.csv
🌐 Trying: https://s3.amazonaws.com/tripdata/202502-citibike-tripdata.zip
✅ Downloaded: 202502-citibike-tripdata.zip
📁 Extracting CSV: 202502-citibike-tripdata_3.csv
📁 Extracting CSV: 202502-citibike-tripdata_2.csv
📁 Extracting CSV: 202502-citibike-tripdata_1.csv
🌐 Trying: https://s3.amazonaws.com/tripdata/202501-citibike-tripdata.

In [6]:
import hopsworks
import pandas as pd
import os
from dotenv import load_dotenv

load_dotenv()
project = hopsworks.login(
    api_key_value=os.getenv("HOPSWORKS_API_KEY"),
    project=os.getenv("HOPSWORKS_PROJECT")
)
fs = project.get_feature_store()

pred_fg = fs.get_feature_group("citi_bike_predictions", version=1)
pred_df = pred_fg.read()

print("🔢 Total predicted hours:", pred_df['target_hour'].nunique())
print("🗓️ Time range:", pred_df['target_hour'].min(), "→", pred_df['target_hour'].max())


2025-05-10 21:58:06,996 INFO: Initializing external client
2025-05-10 21:58:06,996 INFO: Base URL: https://c.app.hopsworks.ai:443
2025-05-10 21:58:09,369 INFO: Python Engine initialized.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/1228957
Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (0.85s) 
🔢 Total predicted hours: 1
🗓️ Time range: 2025-05-01 00:00:00+00:00 → 2025-05-01 00:00:00+00:00
