In [1]:
import pandas as pd
import numpy as np
import random
from datetime import datetime, timedelta

def generate_data():
    print("Generating MetroCycle Data... This may take a moment.")

    # --- 1. CONFIGURATION ---
    NUM_STATIONS = 50
    NUM_BIKES = 800
    START_DATE = datetime(2023, 1, 1)
    END_DATE = datetime(2023, 12, 31)
    DAILY_TRIPS_AVG = 600

    # --- 2. GENERATE STATIONS ---
    station_ids = range(101, 101 + NUM_STATIONS)
    districts = ['Financial District', 'Uptown', 'Waterfront', 'University', 'Suburbs']

    stations = []
    for sid in station_ids:
        district = np.random.choice(districts, p=[0.3, 0.2, 0.2, 0.2, 0.1])
        # Assign capacity based on district density
        capacity = np.random.choice([15, 20, 30, 45], p=[0.1, 0.3, 0.4, 0.2])
        if district == 'Financial District': capacity += 10

        stations.append({
            'station_id': sid,
            'station_name': f"{district} Station {sid}",
            'district': district,
            'latitude': 40.7 + random.uniform(-0.05, 0.05),
            'longitude': -74.0 + random.uniform(-0.05, 0.05),
            'capacity': capacity,
            'deployment_date': '2022-01-01'
        })

    df_stations = pd.DataFrame(stations)
    df_stations.to_csv('stations.csv', index=False)
    print(f"✅ Generated {len(df_stations)} stations.")

    # --- 3. GENERATE WEATHER ---
    # Weather impacts ridership
    date_range = pd.date_range(START_DATE, END_DATE)
    weather_data = []

    for date in date_range:
        month = date.month
        # Simulating seasonality
        temp_base = 30 + 50 * np.sin((month - 1) * np.pi / 6) # Faux temp curve
        temp = temp_base + random.uniform(-10, 10)

        condition = np.random.choice(['Clear', 'Cloudy', 'Rain', 'Snow'],
                                     p=[0.6, 0.25, 0.1, 0.05])

        # Less riders if raining/snowing
        ridership_factor = 1.0
        if condition in ['Rain', 'Snow']: ridership_factor = 0.4
        elif temp < 40 or temp > 90: ridership_factor = 0.7

        weather_data.append({
            'date': date.strftime('%Y-%m-%d'),
            'temperature_f': round(temp, 1),
            'condition': condition,
            'ridership_factor': ridership_factor
        })

    df_weather = pd.DataFrame(weather_data)
    df_weather.to_csv('weather.csv', index=False)
    print(f"✅ Generated {len(df_weather)} days of weather.")

    # --- 4. GENERATE TRIPS ---
    trips = []
    trip_id_counter = 100000

    for _, day in df_weather.iterrows():
        curr_date = datetime.strptime(day['date'], '%Y-%m-%d')
        is_weekend = curr_date.weekday() >= 5

        # Calculate daily trip count based on weather and weekend
        daily_vol = int(np.random.normal(DAILY_TRIPS_AVG, 100) * day['ridership_factor'])
        if is_weekend: daily_vol = int(daily_vol * 0.8) # Slightly less usage on weekends overall

        for _ in range(daily_vol):
            trip_id_counter += 1

            # Weighted Hour Selection (Commuter Patterns)
            if is_weekend:
                # Bell curve around noon for weekends
                hour = int(np.random.normal(14, 4)) % 24
            else:
                # Bimodal distribution (8am and 6pm peaks)
                if random.random() < 0.5:
                    hour = int(np.random.normal(8, 1.5)) % 24
                else:
                    hour = int(np.random.normal(18, 1.5)) % 24

            start_time = curr_date + timedelta(hours=hour, minutes=random.randint(0, 59))

            # Trip Duration (log-normal distribution, skew towards short trips)
            duration_min = int(np.random.lognormal(2.5, 0.6))
            if duration_min < 2: duration_min = 2
            if duration_min > 120: duration_min = 120

            end_time = start_time + timedelta(minutes=duration_min)

            # Route Logic
            start_station = df_stations.sample(1).iloc[0]
            # Users tend to go from Suburbs -> Financial (AM) and vice versa (PM)
            if not is_weekend and 6 <= hour <= 10 and start_station['district'] == 'Suburbs':
                end_station = df_stations[df_stations['district'] == 'Financial District'].sample(1).iloc[0]
            elif not is_weekend and 16 <= hour <= 20 and start_station['district'] == 'Financial District':
                end_station = df_stations[df_stations['district'] == 'Suburbs'].sample(1).iloc[0]
            else:
                end_station = df_stations.sample(1).iloc[0]

            user_type = np.random.choice(['Subscriber', 'Customer'], p=[0.8, 0.2])

            trips.append({
                'trip_id': trip_id_counter,
                'bike_id': random.randint(1, NUM_BIKES),
                'start_time': start_time,
                'end_time': end_time,
                'start_station_id': start_station['station_id'],
                'end_station_id': end_station['station_id'],
                'user_type': user_type,
                'duration_minutes': duration_min
            })

    df_trips = pd.DataFrame(trips)
    df_trips.to_csv('trips.csv', index=False)
    print(f"✅ Generated {len(df_trips)} trips.")
    print("Files created: stations.csv, weather.csv, trips.csv")

if __name__ == "__main__":
    generate_data()

Generating MetroCycle Data... This may take a moment.
✅ Generated 50 stations.
✅ Generated 365 days of weather.
✅ Generated 157913 trips.
Files created: stations.csv, weather.csv, trips.csv
