In [28]:
import pandas as pd
import random
from datetime import datetime, timedelta
from geopy.geocoders import Nominatim
from geopy.distance import geodesic

from tqdm import tqdm

In [37]:
class TaxiRideGenerator:
    """Generate synthetic taxi rides between randomly sampled addresses.

    On construction a pool of ``n_addresses`` addresses is built by jittering
    the coordinates Nominatim returns for ``CITY_QUERY`` and reverse-geocoding
    each jittered point. ``fake_taxi_between_addresses`` then draws rides
    between random pairs of pool entries.

    All randomness comes from the global ``random`` module, so call
    ``random.seed(...)`` beforehand for repeatable draws.
    """

    # Free-text query geocoded once to find the centre of the sampling area.
    CITY_QUERY = "London, United Kingdom"

    def __init__(self, n_addresses):
        """Store the pool size and build the address pool immediately.

        Args:
            n_addresses: Number of addresses to place in the pool.
        """
        self.n_addresses = n_addresses
        self.address_pool = self.generate_address_pool()

    def generate_address_pool(self):
        """Build the list of candidate pick-up / drop-off addresses.

        Returns:
            A list of dicts with keys ``lat``, ``lon``, ``address`` and
            ``postal_code``. ``address`` and ``postal_code`` are ``None`` when
            reverse geocoding yields nothing for a point.

        Raises:
            RuntimeError: If the geocoder returns no result for ``CITY_QUERY``.
        """
        if self.n_addresses <= 0:
            # Nothing to generate, so skip the network round-trip entirely.
            return []

        geolocator = Nominatim(user_agent="random_taxi_generator")

        # The city centre does not change between iterations: look it up once
        # instead of issuing the same geocoding request for every address.
        city = geolocator.geocode(self.CITY_QUERY, exactly_one=True)
        if city is None:
            raise RuntimeError(
                f"Geocoder returned no result for {self.CITY_QUERY!r}; "
                "cannot build the address pool."
            )

        address_pool = []
        for _ in tqdm(range(self.n_addresses), desc="Generating address pool"):
            # Offset each coordinate by a uniform value in [-0.05, 0.05) degrees.
            lat = city.latitude + (random.random() - 0.5) / 10.0
            lon = city.longitude + (random.random() - 0.5) / 10.0
            address = geolocator.reverse((lat, lon), language="en")

            address_pool.append(
                {
                    "lat": lat,
                    "lon": lon,
                    "address": address.address if address else None,
                    "postal_code": self.get_postal_code(address),
                }
            )

        return address_pool

    def get_postal_code(self, location):
        """Return the ``postcode`` entry of a geocoding result, if present.

        Args:
            location: Object exposing a ``raw`` dict (as returned by the
                geocoder), or ``None``.

        Returns:
            The value of ``location.raw["address"]["postcode"]``, or ``None``
            when ``location`` is falsy, has no ``address`` entry, or the
            address has no ``postcode``.
        """
        if location and location.raw.get("address"):
            return location.raw["address"].get("postcode")
        return None

    def generate_random_datetime(self):
        """Return a random datetime at one-second resolution.

        The result lies between 2023-01-01 00:00:00 and 2023-12-31 00:00:00,
        both inclusive.
        """
        start_date = datetime(2023, 1, 1)
        # NOTE(review): this is midnight at the *start* of Dec 31, so rides on
        # Dec 31 are effectively never generated -- confirm whether the whole
        # year was intended before changing it.
        end_date = datetime(2023, 12, 31)
        span_seconds = int((end_date - start_date).total_seconds())
        return start_date + timedelta(seconds=random.randint(0, span_seconds))

    @staticmethod
    def _cost_multiplier(hour):
        """Return the price multiplier applied to a ride starting at ``hour``."""
        if 17 <= hour < 20:
            return 1.2
        if hour >= 22 or hour < 6:
            return 1.5
        return 1.0

    def fake_taxi_between_addresses(self, n_rides):
        """Generate ``n_rides`` synthetic rides between pool addresses.

        Each ride gets random driver/user ids, two different pool addresses,
        a random start time, a duration of 10-120 minutes, the geodesic
        distance in miles between the two points, a cost equal to that
        distance times an hour-of-day multiplier, and two 1-5 feedback scores.

        Args:
            n_rides: Number of rides to generate.

        Returns:
            A ``pandas.DataFrame`` with one row per ride.

        Raises:
            ValueError: If rides are requested but the pool holds fewer than
                two distinct addresses. Without this check the search for a
                differing drop-off address would never terminate.
        """
        pool = self.address_pool
        if n_rides > 0 and (
            len(pool) < 2 or all(entry == pool[0] for entry in pool[1:])
        ):
            raise ValueError(
                "address_pool must contain at least two distinct addresses "
                "to generate rides."
            )

        rides = []
        for _ in tqdm(range(n_rides), desc="Generating taxi rides"):
            driver_id = random.randint(0, 2000)
            user_id = random.randint(0, 5000)

            start_address = random.choice(pool)
            end_address = start_address
            # Redraw until the drop-off differs from the pick-up; the guard
            # above guarantees this loop can finish.
            while end_address == start_address:
                end_address = random.choice(pool)

            start_datetime = self.generate_random_datetime()
            end_datetime = start_datetime + timedelta(minutes=random.randint(10, 120))

            distance_miles = geodesic(
                (start_address["lat"], start_address["lon"]),
                (end_address["lat"], end_address["lon"]),
            ).miles

            trip_cost = distance_miles * self._cost_multiplier(start_datetime.hour)

            driver_feedback = random.randint(1, 5)
            user_feedback = random.randint(1, 5)

            rides.append(
                {
                    "driver_id": driver_id,
                    "user_id": user_id,
                    "start_lat": start_address["lat"],
                    "start_lon": start_address["lon"],
                    "start_address": start_address["address"],
                    "start_postal_code": start_address["postal_code"],
                    "end_lat": end_address["lat"],
                    "end_lon": end_address["lon"],
                    "end_address": end_address["address"],
                    "end_postal_code": end_address["postal_code"],
                    "start_datetime": start_datetime,
                    "end_datetime": end_datetime,
                    "distance_miles": distance_miles,
                    "trip_cost": trip_cost,
                    "driver_feedback": driver_feedback,
                    "user_feedback": user_feedback,
                }
            )

        return pd.DataFrame(rides)

In [44]:
# Seed the global RNG so the random draws (coordinate jitter, ids, times,
# feedback) are the same on every Restart & Run All.
SEED = 42
random.seed(SEED)

# Building the pool calls the Nominatim geocoder for each address; the recorded
# run took about 17 minutes for 1000 addresses.
n_addresses = 1000
taxi_generator = TaxiRideGenerator(n_addresses)

# 25 million rides; the recorded run took about 1h10m. Lower this while iterating.
n_rides = 25000000
taxi_rides_df = taxi_generator.fake_taxi_between_addresses(n_rides)
taxi_rides_df.head()

Generating address pool: 100%|████████████████████████████████████████████████████████████████| 1000/1000 [16:49<00:00,  1.01s/it]
Generating taxi rides: 100%|██████████████████████████████████████████████████████| 25000000/25000000 [1:09:19<00:00, 6010.56it/s]


Unnamed: 0,driver_id,user_id,start_lat,start_lon,start_address,start_postal_code,end_lat,end_lon,end_address,end_postal_code,start_datetime,end_datetime,distance_miles,trip_cost,driver_feedback,user_feedback
0,442,4255,51.544221,-0.160938,"78, Adelaide Road, Primrose Hill, Chalk Farm, ...",NW3 3PX,51.495512,-0.111796,"Wedgwood House, Kennington Road, Lambeth, Lond...",SE11 6LS,2023-01-01 23:03:07,2023-01-01 23:25:07,3.978858,5.968287,4,5
1,1731,1853,51.478503,-0.082093,"52, Peckham Grove, Old Kent Road, London Borou...",SE15 6ET,51.542777,-0.162441,"Primrose Hill Slow Tunnel, Adelaide Road, Swis...",NW8 6NH,2023-11-14 21:58:46,2023-11-14 23:18:46,5.635383,5.635383,5,4
2,692,2515,51.51526,-0.171238,"6, Radnor Mews, Paddington, London, Greater Lo...",W2 2SX,51.549472,-0.152828,"sandy Pharmacy, Queen's Crescent, Maitland Par...",NW5 4EB,2023-09-24 10:14:21,2023-09-24 10:43:21,2.494814,2.494814,4,4
3,380,2202,51.464472,-0.176526,"Thomas Baines Road, Battersea, London Borough ...",SW11 2HL,51.556018,-0.095301,"Kelross Road, Taverner Square, Highbury, Londo...",N5 2QN,2023-12-30 00:13:06,2023-12-30 01:18:06,7.234012,10.851019,5,2
4,1263,3285,51.510967,-0.125803,"20, Bedfordbury, Seven Dials, Covent Garden, L...",WC2N 4BJ,51.458309,-0.090028,"Sunray Avenue, Denmark Hill, Dulwich Village, ...",SE24 9PX,2023-01-24 21:20:53,2023-01-24 23:06:53,3.954278,3.954278,4,4


In [45]:
# index=False: the default RangeIndex carries no information and would otherwise
# be written as an extra unnamed first column in the CSV.
taxi_rides_df.to_csv("rides.csv", index=False)