In [1]:
import pandas as pd
from faker import Faker
import numpy as np
import random
from datetime import datetime, timedelta
from tqdm import tqdm

import warnings
warnings.simplefilter('ignore')

In [4]:
class FakeRidesGenerator:
    """Generate a synthetic table of taxi rides between sampled addresses.

    Each ride gets a random driver/client id, start and end coordinates sampled
    (with replacement) from an address file, a great-circle distance in miles,
    start/end timestamps, a fare, and driver/user ratings with optional notes.

    Parameters
    ----------
    num_samples : int, default 1000
        Number of rides produced by one call to ``generate_fake_rides``.
    seed : int or None, default None
        When given, NumPy's global random state and the Faker instance are
        re-seeded at the start of every ``generate_fake_rides`` call, so repeated
        calls return identical data. ``None`` leaves the random state untouched.
    """

    # Ids are drawn uniformly from 1..NUM_DRIVERS and 1..NUM_CLIENTS (inclusive).
    NUM_DRIVERS = 4925
    NUM_CLIENTS = 29891
    AVG_SPEED_MPH = 15  # mph, avg between downtown and suburbs
    BASE_FEE = 3  # based on https://www.taxi-calculator.com/taxi-rate-london/296
    # Relative weight of each start hour (index 0 = 00:00-00:59); sums to 1.
    HOUR_PROBABILITIES = [0.03, 0.02, 0.01, 0.01, 0.005, 0.005, 0.01, 0.03, 0.1,
                          0.07, 0.05, 0.02, 0.02, 0.03, 0.03, 0.05, 0.08, 0.12,
                          0.1, 0.09, 0.06, 0.04, 0.01, 0.01]

    def __init__(self, num_samples=1000, seed=None):
        self.num_samples = num_samples
        self.seed = seed
        self.fake = Faker()

    def load_address_data(self, filename='london.csv'):
        """Read the address CSV; it must provide 'Latitude' and 'Longitude' columns."""
        return pd.read_csv(filename)

    def haversine_distance(self, coord1, coord2):
        """Return the great-circle distance in miles between two coordinates.

        ``coord1`` and ``coord2`` are ``(latitude, longitude)`` pairs in degrees.
        Each component may be a scalar or an array; arrays are processed
        element-wise, so a whole batch of rides can be measured in one call.
        """
        lat1, lon1 = coord1
        lat2, lon2 = coord2

        R = 6371  # mean Earth radius in km
        dlat = np.radians(lat2 - lat1)
        dlon = np.radians(lon2 - lon1)
        a = np.sin(dlat / 2) ** 2 + np.cos(np.radians(lat1)) * \
            np.cos(np.radians(lat2)) * np.sin(dlon / 2) ** 2
        # Rounding can push `a` marginally outside [0, 1], which would make the
        # square roots below return NaN; clamp it first.
        a = np.clip(a, 0.0, 1.0)
        c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1 - a))
        distance = R * c

        return distance * 0.621371  # km -> miles

    def _notes_for_scores(self, scores, low_score_notes):
        """Pick a short note matching each score; about 20% are left as NaN.

        Scores <= 2 draw from ``low_score_notes``, a score of 3 draws from
        'ok'/'good', and higher scores draw from 'polite'/'fast'.
        """
        n = len(scores)
        low = np.random.choice(low_score_notes, n)
        mid = np.random.choice(['ok', 'good'], n)
        high = np.random.choice(['polite', 'fast'], n)
        # Object dtype so that strings and NaN can share one array.
        notes = np.where(scores <= 2, low,
                         np.where(scores == 3, mid, high)).astype(object)
        notes[np.random.random(n) <= 0.2] = np.nan
        return notes

    def generate_fake_rides(self):
        """Build and return a DataFrame with ``self.num_samples`` synthetic rides.

        Columns: driver_id, client_id, start_lat, start_long, end_lat, end_long,
        distance (miles, 2 decimals), start_time (datetime), end_time (string
        formatted as '%Y-%m-%d %H:%M:%S'), trip_cost (2 decimals), driver_score,
        driver_notes, driver_feedback, user_score, user_notes.
        """
        n = self.num_samples
        if self.seed is not None:
            # pandas' .sample() and np.random.* both draw from NumPy's global state.
            np.random.seed(self.seed)
            self.fake.seed_instance(self.seed)

        addresses = self.load_address_data()
        coordinates = addresses[['Latitude', 'Longitude']]
        start_coords = coordinates.sample(n=n, replace=True).values
        end_coords = coordinates.sample(n=n, replace=True).values

        # One vectorised call instead of one Python-level call per ride.
        distances = self.haversine_distance(
            (start_coords[:, 0], start_coords[:, 1]),
            (end_coords[:, 0], end_coords[:, 1]))

        # Start time = uniformly random day + weighted hour + uniform minute/second.
        days = pd.to_datetime(np.random.choice(pd.date_range(
            start="2021-01-01", end="2023-12-31", freq='D'), size=n))
        start_times = (
            days
            + pd.to_timedelta(np.random.choice(
                24, size=n, p=self.HOUR_PROBABILITIES), unit='h')
            + pd.to_timedelta(np.random.randint(0, 60, size=n), unit='m')
            + pd.to_timedelta(np.random.randint(0, 60, size=n), unit='s')
        )
        end_times = start_times + \
            pd.to_timedelta(distances / self.AVG_SPEED_MPH, unit='h')

        # Fare multiplier by start hour: 05-07 -> 1.3, 00-04 and 20-23 -> 1.55,
        # every other hour -> 1.1.
        hours = np.asarray(start_times.hour)
        cost_multipliers = np.where(
            (hours >= 5) & (hours < 8), 1.3,
            np.where((hours < 5) | (hours >= 20), 1.55, 1.1))
        trip_costs = distances * cost_multipliers + self.BASE_FEE

        driver_scores = np.random.choice([1, 2, 3, 4, 5], n, p=[
                                         0.05, 0.1, 0.1, 0.2, 0.55])
        driver_notes = self._notes_for_scores(
            driver_scores, ['rude', 'too slow', 'too fast'])
        # Roughly 40% of rides carry a free-text feedback sentence.
        has_feedback = np.random.random(n) > 0.6
        driver_feedback = [self.fake.sentence() if flag else np.nan
                           for flag in has_feedback]
        user_scores = np.random.choice([1, 2, 3, 4, 5], n, p=[
                                       0.05, 0.1, 0.2, 0.3, 0.35])
        user_notes = self._notes_for_scores(
            user_scores, ['rude', 'no tip', 'long wait'])

        fake_rides_df = pd.DataFrame({
            'driver_id': np.random.randint(1, self.NUM_DRIVERS + 1, size=n),
            'client_id': np.random.randint(1, self.NUM_CLIENTS + 1, size=n),
            'start_lat': start_coords[:, 0],
            'start_long': start_coords[:, 1],
            'end_lat': end_coords[:, 0],
            'end_long': end_coords[:, 1],
            'distance': distances,
            'start_time': start_times,
            'end_time': end_times,
            'trip_cost': trip_costs,
            'driver_score': driver_scores,
            'driver_notes': driver_notes,
            'driver_feedback': driver_feedback,
            'user_score': user_scores,
            'user_notes': user_notes,
        })

        # end_time has fractional seconds (distance / speed); store it as text
        # at whole-second resolution.
        fake_rides_df['end_time'] = fake_rides_df['end_time'].dt.strftime(
            '%Y-%m-%d %H:%M:%S')
        fake_rides_df['trip_cost'] = fake_rides_df['trip_cost'].round(2)
        fake_rides_df['distance'] = fake_rides_df['distance'].round(2)

        return fake_rides_df

In [7]:
if __name__ == "__main__":
    # Building all rows in one call keeps every intermediate list/array and the
    # full DataFrame in memory at once. Generate and append fixed-size chunks
    # instead so peak memory is bounded by CHUNK_ROWS rather than TOTAL_ROWS.
    TOTAL_ROWS = 100000000  # approx 13 GB of data
    CHUNK_ROWS = 1000000
    OUTPUT_PATH = 'fake_rides.csv'

    for offset in tqdm(range(0, TOTAL_ROWS, CHUNK_ROWS), desc='generating rides'):
        generator = FakeRidesGenerator(
            num_samples=min(CHUNK_ROWS, TOTAL_ROWS - offset))
        fake_rides_df = generator.generate_fake_rides()
        first_chunk = offset == 0
        # The first chunk truncates any previous file and writes the header;
        # later chunks append rows only.
        fake_rides_df.to_csv(OUTPUT_PATH, index=False,
                             mode='w' if first_chunk else 'a',
                             header=first_chunk)
    # NOTE: after the loop `fake_rides_df` holds only the last chunk; read
    # OUTPUT_PATH back (ideally in chunks) if the full dataset is needed.