In [3]:
# Training data preparation
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import random

# Generate random taxi data
def generate_taxi_data(start_date, end_date, n=500, seed=None):
    """Generate a DataFrame of synthetic taxi trips, including fares.

    Every row is drawn independently: the pickup time is uniform over
    ``[start_date, end_date]`` at whole-second resolution, the trip lasts
    5-60 whole minutes, and coordinates, passenger count and distance are
    drawn from fixed ranges. The fare is a 2.5 base plus the distance times
    a random per-unit rate between 2 and 4.

    Args:
        start_date: ``datetime`` marking the earliest possible pickup.
        end_date: ``datetime`` marking the latest possible pickup.
        n: Number of rows to generate.
        seed: Optional seed. When given, a private ``random.Random(seed)``
            is used, so the result is reproducible and the global RNG state
            is left alone. When ``None`` the module-level ``random``
            functions are used, as before.

    Returns:
        A ``pandas.DataFrame`` with columns ``pickup_datetime``,
        ``dropoff_datetime``, ``pickup_longitude``, ``pickup_latitude``,
        ``dropoff_longitude``, ``dropoff_latitude``, ``passenger_count``,
        ``trip_distance`` and ``fare_amount``.

    Raises:
        ValueError: If ``end_date`` is a whole second or more before
            ``start_date``.
    """
    # Loop-invariant: width of the pickup window in whole seconds.
    window_seconds = int((end_date - start_date).total_seconds())
    if window_seconds < 0:
        raise ValueError("end_date must not be earlier than start_date")

    # Both the module and a Random instance expose randint/uniform.
    rng = random if seed is None else random.Random(seed)

    rows = []
    # NOTE: the order of the draws below is kept stable so that runs seeded
    # through random.seed() keep producing the same rows.
    for _ in range(n):
        pickup_time = start_date + timedelta(
            seconds=rng.randint(0, window_seconds)
        )
        # Trip duration 5-60 min
        dropoff_time = pickup_time + timedelta(minutes=rng.randint(5, 60))

        # Random locations inside a fixed latitude/longitude box
        pickup_lat = round(rng.uniform(40.6, 40.9), 6)
        pickup_long = round(rng.uniform(-74.05, -73.75), 6)
        dropoff_lat = round(rng.uniform(40.6, 40.9), 6)
        dropoff_long = round(rng.uniform(-74.05, -73.75), 6)

        passenger_count = rng.randint(1, 4)
        trip_distance = round(rng.uniform(1, 20), 2)
        # Base charge plus a random per-unit-distance rate.
        fare_amount = round(2.5 + trip_distance * rng.uniform(2, 4), 2)

        rows.append([
            pickup_time, dropoff_time, pickup_long, pickup_lat,
            dropoff_long, dropoff_lat, passenger_count,
            trip_distance, fare_amount,
        ])

    return pd.DataFrame(rows, columns=[
        "pickup_datetime", "dropoff_datetime", "pickup_longitude", "pickup_latitude",
        "dropoff_longitude", "dropoff_latitude", "passenger_count",
        "trip_distance", "fare_amount",
    ])

# January and February 2025: 500 synthetic trips per month, each window
# running from the first second of the month to the last.
jan_data = generate_taxi_data(datetime(2025, 1, 1), datetime(2025, 1, 31, 23, 59, 59), n=500)
feb_data = generate_taxi_data(datetime(2025, 2, 1), datetime(2025, 2, 28, 23, 59, 59), n=500)

# Combine and save. ignore_index=True renumbers the rows 0..N-1; without it
# both months keep their own 0..499 labels and the combined frame carries
# duplicate index values. The CSV is unaffected because index=False.
taxi_data = pd.concat([jan_data, feb_data], ignore_index=True)
taxi_data.to_csv("us_taxi_jan_feb.csv", index=False)

print("CSV file 'us_taxi_jan_feb.csv' created successfully with", len(taxi_data), "records.")


CSV file 'us_taxi_jan_feb.csv' created successfully with 1000 records.


In [4]:
# March data for prediction (no fare_amount, since that's what we want to predict)
def generate_march_data(start_date, end_date, n=200, seed=None):
    """Generate a DataFrame of synthetic taxi trips without a fare column.

    Every row is drawn independently: the pickup time is uniform over
    ``[start_date, end_date]`` at whole-second resolution, the trip lasts
    5-60 whole minutes, and coordinates, passenger count and distance are
    drawn from fixed ranges. No ``fare_amount`` is produced; it is the
    value to be predicted for these rows.

    Args:
        start_date: ``datetime`` marking the earliest possible pickup.
        end_date: ``datetime`` marking the latest possible pickup.
        n: Number of rows to generate.
        seed: Optional seed. When given, a private ``random.Random(seed)``
            is used, so the result is reproducible and the global RNG state
            is left alone. When ``None`` the module-level ``random``
            functions are used, as before.

    Returns:
        A ``pandas.DataFrame`` with columns ``pickup_datetime``,
        ``dropoff_datetime``, ``pickup_longitude``, ``pickup_latitude``,
        ``dropoff_longitude``, ``dropoff_latitude``, ``passenger_count``
        and ``trip_distance``.

    Raises:
        ValueError: If ``end_date`` is a whole second or more before
            ``start_date``.
    """
    # Loop-invariant: width of the pickup window in whole seconds.
    window_seconds = int((end_date - start_date).total_seconds())
    if window_seconds < 0:
        raise ValueError("end_date must not be earlier than start_date")

    # Both the module and a Random instance expose randint/uniform.
    rng = random if seed is None else random.Random(seed)

    rows = []
    # NOTE: the order of the draws below is kept stable so that runs seeded
    # through random.seed() keep producing the same rows.
    for _ in range(n):
        pickup_time = start_date + timedelta(
            seconds=rng.randint(0, window_seconds)
        )
        # Trip duration 5-60 min
        dropoff_time = pickup_time + timedelta(minutes=rng.randint(5, 60))

        # Random locations inside a fixed latitude/longitude box
        pickup_lat = round(rng.uniform(40.6, 40.9), 6)
        pickup_long = round(rng.uniform(-74.05, -73.75), 6)
        dropoff_lat = round(rng.uniform(40.6, 40.9), 6)
        dropoff_long = round(rng.uniform(-74.05, -73.75), 6)

        passenger_count = rng.randint(1, 4)
        trip_distance = round(rng.uniform(1, 20), 2)

        # No fare_amount column (target variable to be predicted later)
        rows.append([
            pickup_time, dropoff_time, pickup_long, pickup_lat,
            dropoff_long, dropoff_lat, passenger_count,
            trip_distance,
        ])

    return pd.DataFrame(rows, columns=[
        "pickup_datetime", "dropoff_datetime", "pickup_longitude", "pickup_latitude",
        "dropoff_longitude", "dropoff_latitude", "passenger_count", "trip_distance",
    ])

# Build the March 2025 prediction set: 300 synthetic trips with pickups
# anywhere from the first to the last second of the month.
march_data = generate_march_data(
    start_date=datetime(year=2025, month=3, day=1),
    end_date=datetime(year=2025, month=3, day=31, hour=23, minute=59, second=59),
    n=300,
)

# Written to its own file, apart from the Jan/Feb data; the row index is
# not stored.
march_data.to_csv(
    path_or_buf="/workspaces/mlops-trip-duration-prediction/DataSets/us_taxi_march.csv",
    index=False,
)
