# Fake Data

### Library

In [78]:
import json

import random
random.seed(42)

from faker import Faker
fake = Faker("vi_VN")

import pandas as pd

### User

In [82]:
def get_list_name():
    with open("uit_member.json", encoding="utf-8") as f:
        list_name = json.load(f)
    return [i['full_name'] for i in list_name]

def gen_phone_number():
    phone = f"0"

    suffix = [2, 9]
    phone += str(random.choice(suffix))

    for _ in range(8):
        phone += str(random.randint(1, 9))
    return phone

def gen_gender():
    male_percentage = 0.75
    
    if random.random() < male_percentage:
        return "Nam"
    else:   
        return "Nữ"

def gen_birth_date():
    age_groups = [
        (18, 25, 0.45),  # 45% rất trẻ
        (26, 35, 0.35),  # 35% trẻ
        (36, 45, 0.15),  # 15% trung niên
        (46, 60, 0.05),  # 5% lớn tuổi
    ]

    ages, weights = zip(*[((a, b), w) for a, b, w in age_groups])
    min_age, max_age = random.choices(ages, weights=weights)[0]

    return fake.date_of_birth(minimum_age=min_age, maximum_age=max_age).strftime("%Y-%m-%d")

def generate_user():
    list_name = get_list_name()
    n = len(list_name)

    users = []
    for i in range(1, n + 1):
        user = {}
        user["id"] = f"user_{i:05d}"
        user["name"] = list_name[i - 1]
        user["phone"] = gen_phone_number()
        user["gender"] = gen_gender()
        user["birth_date"] = gen_birth_date()
        user["avatar_url"] = f"{user['id']}.jpg"
 
        users.append(user)
    return users



users = generate_user()
print(users)

[{'id': 'user_00001', 'name': 'Nguyễn Thị Như Quỳnh', 'phone': '0934733993', 'gender': 'Nữ', 'birth_date': '1992-09-22', 'avatar_url': 'user_00001.jpg'}, {'id': 'user_00002', 'name': 'Lê Hoàng Quân', 'phone': '0282468775', 'gender': 'Nam', 'birth_date': '2000-11-05', 'avatar_url': 'user_00002.jpg'}, {'id': 'user_00003', 'name': 'Đinh Văn Phượng', 'phone': '0924586545', 'gender': 'Nam', 'birth_date': '1972-10-12', 'avatar_url': 'user_00003.jpg'}, {'id': 'user_00004', 'name': 'Nguyễn Echam Samuel', 'phone': '0998482638', 'gender': 'Nam', 'birth_date': '1977-10-21', 'avatar_url': 'user_00004.jpg'}, {'id': 'user_00005', 'name': 'Nguyễn Xuân Sang', 'phone': '0273883822', 'gender': 'Nam', 'birth_date': '2007-03-18', 'avatar_url': 'user_00005.jpg'}, {'id': 'user_00006', 'name': 'Lê Phú Quý', 'phone': '0958237847', 'gender': 'Nam', 'birth_date': '1999-08-26', 'avatar_url': 'user_00006.jpg'}, {'id': 'user_00007', 'name': 'Lý Quốc Quyền', 'phone': '0242518511', 'gender': 'Nam', 'birth_date': '20

In [83]:
df = pd.DataFrame(users)

In [84]:
df

Unnamed: 0,id,name,phone,gender,birth_date,avatar_url
0,user_00001,Nguyễn Thị Như Quỳnh,0934733993,Nữ,1992-09-22,user_00001.jpg
1,user_00002,Lê Hoàng Quân,0282468775,Nam,2000-11-05,user_00002.jpg
2,user_00003,Đinh Văn Phượng,0924586545,Nam,1972-10-12,user_00003.jpg
3,user_00004,Nguyễn Echam Samuel,0998482638,Nam,1977-10-21,user_00004.jpg
4,user_00005,Nguyễn Xuân Sang,0273883822,Nam,2007-03-18,user_00005.jpg
...,...,...,...,...,...,...
8752,user_08753,Lê Thiên Thi,0974745495,Nam,2002-10-14,user_08753.jpg
8753,user_08754,Phan Tấn Thành,0942885138,Nam,2001-08-05,user_08754.jpg
8754,user_08755,Nguyễn Thế Thành,0277253871,Nam,1985-04-24,user_08755.jpg
8755,user_08756,Trần Thành Vi Thanh,0222814426,Nam,1990-03-04,user_08756.jpg


In [85]:
df.to_csv("users.csv", index=False)

### User_credentials

In [136]:
import hashlib

def hash_password(pw):
    return hashlib.sha256(pw.encode()).hexdigest()

def last_login_at():
    return fake.date_time_between(start_date='-3M', end_date='now').strftime("%Y-%m-%d %H:%M:%S")

def generate_user():
    # Lấy id từ bảng user
    user_df = pd.read_csv("users.csv")
    user_credentials_df = user_df[["id"]].copy()

    # Tạo mật khẩu mặc định 123456
    user_credentials_df["raw_password"] = "123456"

    # Băm mật khẩu
    user_credentials_df["password"] = user_credentials_df["raw_password"].apply(hash_password)

    # Xóa cột mật khẩu thô
    user_credentials_df = user_credentials_df.drop(columns=["raw_password"])

    # Tạo cột last_login_at
    user_credentials_df["last_login_at"] = user_credentials_df["id"].apply(lambda x: last_login_at())

    return user_credentials_df

user_credentials_df = generate_user()

In [137]:
user_credentials_df

Unnamed: 0,id,password,last_login_at
0,user_00001,8d969eef6ecad3c29a3a629280e686cf0c3f5d5a86aff3...,2026-01-04 20:48:10
1,user_00002,8d969eef6ecad3c29a3a629280e686cf0c3f5d5a86aff3...,2025-12-28 07:32:16
2,user_00003,8d969eef6ecad3c29a3a629280e686cf0c3f5d5a86aff3...,2025-12-06 05:27:31
3,user_00004,8d969eef6ecad3c29a3a629280e686cf0c3f5d5a86aff3...,2025-12-29 21:12:26
4,user_00005,8d969eef6ecad3c29a3a629280e686cf0c3f5d5a86aff3...,2025-11-30 17:33:12
...,...,...,...
8752,user_08753,8d969eef6ecad3c29a3a629280e686cf0c3f5d5a86aff3...,2025-12-03 05:58:26
8753,user_08754,8d969eef6ecad3c29a3a629280e686cf0c3f5d5a86aff3...,2026-01-12 12:28:11
8754,user_08755,8d969eef6ecad3c29a3a629280e686cf0c3f5d5a86aff3...,2025-11-08 11:46:54
8755,user_08756,8d969eef6ecad3c29a3a629280e686cf0c3f5d5a86aff3...,2026-01-04 13:15:03


In [138]:
user_credentials_df.to_csv("user_credentials.csv", index=False) 

### PASSENGER & DRIVER

In [220]:
import random

def gen_role():
    r = random.random()
    if r < 0.65:
        return "passenger"
    elif r < 0.80:
        return "driver"
    else:
        return "both"

def devide_users():
    # Lấy id từ bảng user
    df = pd.read_csv("users.csv")
    df = df[["id"]]

    # Tạo cột role với 90% là khách hàng
    df["role"] = df["id"].apply(lambda x: gen_role())

    # Tách thành 2 bảng passengers và drivers
    passengers_df = df[(df["role"] == "passenger") | (df['role'] =="both")].drop(columns=["role"]).reset_index(drop=True)
    drivers_df = df[(df["role"] == "driver") | (df['role'] =="both")].drop(columns=["role"]).reset_index(drop=True)

    return passengers_df, drivers_df

passengers_df, drivers_df = devide_users()

def gen_drivers(df):
    df["rating"] = None
    df["total_trips"] = None
    df["status"] = "active"
    df["created_at"] = df['id'].apply(lambda x: fake.date_time_between(start_date='-1y', end_date='-6M').strftime("%Y-%m-%d %H:%M:%S"))
    df["deleted_at"] = None

    return df

drivers_df = gen_drivers(drivers_df)

def gen_passengers(df):
    df["rating"] = None
    df["ride_count"] = None
    df["status"] = "active"
    df["created_at"] = df['id'].apply(lambda x: fake.date_time_between(start_date='-9M', end_date='now').strftime("%Y-%m-%d %H:%M:%S"))
    df["deleted_at"] = None

    return df

passengers_df = gen_passengers(passengers_df)

In [221]:
drivers_df

Unnamed: 0,id,rating,total_trips,status,created_at,deleted_at
0,user_00007,,,active,2025-04-14 19:57:57,
1,user_00008,,,active,2025-06-10 00:35:45,
2,user_00015,,,active,2025-07-03 20:22:29,
3,user_00016,,,active,2025-05-17 19:26:29,
4,user_00019,,,active,2025-01-26 12:11:32,
...,...,...,...,...,...,...
3107,user_08738,,,active,2025-07-13 10:33:02,
3108,user_08740,,,active,2025-06-19 05:02:57,
3109,user_08741,,,active,2025-05-23 01:30:42,
3110,user_08745,,,active,2025-06-19 19:05:29,


In [222]:
passengers_df

Unnamed: 0,id,rating,ride_count,status,created_at,deleted_at
0,user_00001,,,active,2025-05-06 21:45:20,
1,user_00002,,,active,2025-05-13 12:55:08,
2,user_00003,,,active,2025-09-11 06:02:09,
3,user_00004,,,active,2025-10-25 01:47:44,
4,user_00005,,,active,2025-06-10 14:58:27,
...,...,...,...,...,...,...
7390,user_08753,,,active,2025-05-17 12:51:02,
7391,user_08754,,,active,2025-07-24 14:01:31,
7392,user_08755,,,active,2025-10-09 23:37:40,
7393,user_08756,,,active,2025-10-31 06:03:39,


In [223]:
passengers_df.to_csv("passengers.csv", index=False)
drivers_df.to_csv("drivers.csv", index=False)

### VEHICLES

In [284]:
def gen_type():
    type = ["motorbike", "car"]
    weights = [0.85, 0.15]
    return random.choices(type, weights=weights)[0]

def gen_plate_number():
    prefix = ["59", "63", "54", "60", "61", "62", "15"]
    letter = chr(random.randint(65, 90))  # A-Z
    plate_number = f"{random.choice(prefix)}-{random.choice(letter)}{random.randint(000000, 999999):06d}"

    return plate_number

def gen_brand_model(type):
    if type == "motorbike":
        brands_models = {
            "Honda": ["Wave Alpha", "Future Neo", "Air Blade", "Winner X", "SH Mode", "Vision"],
            "Yamaha": ["Exciter", "Sirius", "Nouvo", "Janus", "Grande"],
            "SYM": ["Attila", "Elizabeth", "Galaxy", "Angel"],
        }
    else:  
        brands_models = {
            "Toyota": ["Vios", "Altis", "Innova", "Fortuner"],
            "Honda": ["City", "Civic", "CR-V", "Accord"],
            "Hyundai": ["Elantra", "Santa Fe", "Tucson", "Accent"],
            "Kia": ["Cerato", "Sportage", "Sorento", "Morning"],
            "Mazda": ["Mazda3", "Mazda6", "CX-5", "CX-8"],
        }

    brand = random.choice(list(brands_models.keys()))
    model = random.choice(brands_models[brand])

    return brand, model

def gen_vehicles():
    # Lấy id của drivers
    drivers_df = pd.read_csv("drivers.csv")

    df = drivers_df[["id"]].copy()
    df = df.rename(columns={"id": "driver_id"})

    df['id'] = df.index + 1

    df = df[["id", "driver_id"]]

    df["type"] = df["driver_id"].apply(lambda x: gen_type())
    df["plate_number"] = df["driver_id"].apply(lambda x: gen_plate_number())

    df[["brand", "model"]] = df["type"].apply(lambda x: pd.Series(gen_brand_model(x)))

    df['status'] = "active"

    return df

vehicles_df = gen_vehicles()

In [285]:
vehicles_df

Unnamed: 0,id,driver_id,type,plate_number,brand,model,status
0,1,user_00007,motorbike,60-M160245,SYM,Galaxy,active
1,2,user_00008,car,54-K876847,Mazda,Mazda3,active
2,3,user_00015,motorbike,60-G772286,Yamaha,Nouvo,active
3,4,user_00016,motorbike,15-I818666,SYM,Attila,active
4,5,user_00019,motorbike,61-X460236,SYM,Angel,active
...,...,...,...,...,...,...,...
3107,3108,user_08738,motorbike,59-B586199,Honda,Air Blade,active
3108,3109,user_08740,car,62-J989152,Honda,City,active
3109,3110,user_08741,motorbike,62-G473554,Yamaha,Sirius,active
3110,3111,user_08745,motorbike,60-M628788,SYM,Attila,active


In [286]:
vehicles_df['type'].value_counts()

type
motorbike    2626
car           486
Name: count, dtype: int64

In [287]:
vehicles_df.to_csv("vehicles.csv", index=False)

### DRIVER_DOCUMENTS

In [315]:
def gen_driver_documents():
    types = ["CCCD", "GPLX"]

    # Lấy id của drivers
    drivers_df = pd.read_csv("drivers.csv")

    all_docs = []
    for type in types:
        
        df = drivers_df[["id"]].copy()
        df = df.rename(columns={"id": "driver_id"})

        df["type"] = type
        df["img_url"] = df["driver_id"].apply(lambda x: f"{x}_{type}.jpg")
        df["status"] = "verified"

        df["expired_at"] = df["driver_id"].apply(lambda x: fake.date_between(start_date='+4y', end_date='+10y').strftime("%Y-%m-%d"))
        all_docs.append(df)

    df = pd.concat(all_docs, ignore_index=True)

    df["id"] = df.index + 1

    df = df[["id", "driver_id", "type", "img_url", "expired_at", "status"]]

    return df

driver_documents_df = gen_driver_documents()

In [302]:
driver_documents_df

Unnamed: 0,id,driver_id,type,img_url,expired_at,status
0,1,user_00007,CCCD,user_00007_CCCD.jpg,2035-07-11,verified
1,2,user_00008,CCCD,user_00008_CCCD.jpg,2029-03-03,verified
2,3,user_00015,CCCD,user_00015_CCCD.jpg,2031-08-30,verified
3,4,user_00016,CCCD,user_00016_CCCD.jpg,2032-08-06,verified
4,5,user_00019,CCCD,user_00019_CCCD.jpg,2035-06-03,verified
...,...,...,...,...,...,...
6219,6220,user_08738,GPLX,user_08738_GPLX.jpg,2034-02-20,verified
6220,6221,user_08740,GPLX,user_08740_GPLX.jpg,2031-11-21,verified
6221,6222,user_08741,GPLX,user_08741_GPLX.jpg,2028-09-14,verified
6222,6223,user_08745,GPLX,user_08745_GPLX.jpg,2035-05-12,verified


In [316]:
driver_documents_df.to_csv("driver_documents.csv", index=False)

### Rides

In [340]:
import random
import math
import pandas as pd
import numpy as np
from datetime import datetime, timedelta

HOTSPOTS = [
    # lat, lng, radius_m, weight
    (10.776889, 106.700806, 300, 0.22),   # Q1
    (10.772104, 106.688514, 350, 0.15),   # Q3
    (10.789345, 106.713478, 400, 0.12),   # Binh Thanh
    (10.803012, 106.680374, 400, 0.10),   # Phu Nhuan
    (10.762910, 106.666987, 500, 0.08),   # Q10
    (10.762622, 106.682287, 500, 0.07),   # Q5
    (10.760179, 106.702820, 450, 0.06),   # Q4
    (10.789923, 106.651528, 600, 0.06),   # Tan Binh
    (10.847014, 106.666564, 700, 0.05),   # Go Vap
    (10.732223, 106.700423, 700, 0.04),   # Q7
    (10.776143, 106.713081, 700, 0.03),   # Q2
    (10.755432, 106.643219, 900, 0.01),   # Q11
]

def random_point(lat, lng, radius_m):
    R = 6371000
    d = radius_m * math.sqrt(random.random())
    theta = random.uniform(0, 2 * math.pi)

    dlat = d * math.cos(theta)
    dlng = d * math.sin(theta)

    lat_new = lat + (dlat / R) * (180 / math.pi)
    lng_new = lng + (dlng / (R * math.cos(math.radians(lat)))) * (180 / math.pi)

    return lat_new, lng_new


def gen_location():
    hotspot = random.choices(
        HOTSPOTS,
        weights=[h[3] for h in HOTSPOTS],
        k=1
    )[0]

    lat, lng, radius, _ = hotspot
    lat, lng = random_point(lat, lng, radius)
    lat, lng = random_point(lat, lng, 10)  # GPS noise ±10m

    return round(lat, 6), round(lng, 6)

def calculate_distance(lat1, lng1, lat2, lng2):
    R = 6371
    dlat = math.radians(lat2 - lat1)
    dlng = math.radians(lng2 - lng1)

    a = math.sin(dlat / 2)**2 + \
        math.cos(math.radians(lat1)) * \
        math.cos(math.radians(lat2)) * \
        math.sin(dlng / 2)**2

    c = 2 * math.asin(math.sqrt(a))
    return round(R * c, 2)

def gen_time(distance_km, base_time):
    requested_at = base_time

    accepted_at = requested_at + timedelta(
        seconds=random.randint(5, 60)
    )

    started_at = accepted_at + timedelta(
        minutes=random.randint(2, 10)
    )

    speed_kmh = random.uniform(15, 30)
    travel_minutes = max(3, distance_km / speed_kmh * 60)

    completed_at = started_at + timedelta(
        minutes=travel_minutes
    )

    return requested_at, accepted_at, started_at, completed_at

def gen_rides(n=1_000_000):
    rows = []

    for i in range(1, n + 1):
        # ====== chọn driver & passenger trước ======
        driver_id = np.random.choice(driver_ids)
        passenger_id = np.random.choice(passenger_ids)

        driver_ct = driver_created[driver_id]
        passenger_ct = passenger_created[passenger_id]

        base_start = max(driver_ct, passenger_ct)

        # ride chỉ xảy ra SAU khi cả 2 đã tồn tại
        base_time = base_start + timedelta(
            seconds=random.randint(0, 30 * 24 * 3600)
        )

        # ====== location ======
        p_lat, p_lng = gen_location()
        d_lat, d_lng = gen_location()

        distance = calculate_distance(p_lat, p_lng, d_lat, d_lng)

        # ====== time ======
        requested_at, accepted_at, started_at, completed_at = gen_time(
            distance, base_time
        )

        rows.append((
            driver_id,
            passenger_id,
            p_lat, p_lng,
            d_lat, d_lng,
            distance,
            requested_at,
            accepted_at,
            started_at,
            completed_at
        ))

    return pd.DataFrame(
        rows,
        columns=[
            "driver_id",
            "passenger_id",
            "pickup_lat", "pickup_lng",
            "dropoff_lat", "dropoff_lng",
            "distance_km",
            "requested_at",
            "accepted_at",
            "started_at",
            "completed_at"
        ]
    )


drivers_df = pd.read_csv("drivers.csv")[["id", "created_at"]]
passengers_df = pd.read_csv("passengers.csv")[["id", "created_at"]]

driver_ids = drivers_df["id"].values
passenger_ids = passengers_df["id"].values

driver_created = dict(
    zip(drivers_df["id"], pd.to_datetime(drivers_df["created_at"]))
)

passenger_created = dict(
    zip(passengers_df["id"], pd.to_datetime(passengers_df["created_at"]))
)

rides_df = gen_rides()

In [343]:
rides_df = rides_df.sort_values(by="requested_at").reset_index(drop=True)

rides_df['id'] = rides_df.index + 1

rides_df = rides_df[[
            "id", "driver_id", "passenger_id",
            "pickup_lat", "pickup_lng",
            "dropoff_lat", "dropoff_lng",
            "distance_km",
            "requested_at", "accepted_at", "started_at", "completed_at"
        ]]

In [344]:
rides_df

Unnamed: 0,id,driver_id,passenger_id,pickup_lat,pickup_lng,dropoff_lat,dropoff_lng,distance_km,requested_at,accepted_at,started_at,completed_at
0,1,user_04220,user_05244,10.791907,106.654706,10.760293,106.665221,3.70,2025-04-17 05:14:41,2025-04-17 05:15:28,2025-04-17 05:19:28,2025-04-17 05:27:27.452726
1,2,user_07632,user_04162,10.759643,106.684679,10.760195,106.678699,0.66,2025-04-17 07:45:51,2025-04-17 07:46:05,2025-04-17 07:52:05,2025-04-17 07:55:05.000000
2,3,user_08706,user_00798,10.775184,106.700794,10.753703,106.641235,6.93,2025-04-17 08:54:39,2025-04-17 08:55:26,2025-04-17 09:03:26,2025-04-17 09:25:53.207599
3,4,user_02664,user_03049,10.763720,106.670221,10.772276,106.688294,2.19,2025-04-17 09:32:22,2025-04-17 09:32:41,2025-04-17 09:35:41,2025-04-17 09:44:09.854124
4,5,user_08217,user_01034,10.776316,106.700784,10.779166,106.701405,0.32,2025-04-17 10:06:59,2025-04-17 10:07:18,2025-04-17 10:13:18,2025-04-17 10:16:18.000000
...,...,...,...,...,...,...,...,...,...,...,...,...
999995,999996,user_05651,user_00481,10.803429,106.682392,10.789014,106.715900,4.00,2026-02-14 13:00:53,2026-02-14 13:01:51,2026-02-14 13:04:51,2026-02-14 13:19:47.118909
999996,999997,user_07856,user_04599,10.775791,106.699235,10.788780,106.715387,2.28,2026-02-14 13:34:28,2026-02-14 13:35:05,2026-02-14 13:39:05,2026-02-14 13:45:45.883933
999997,999998,user_06505,user_05671,10.775140,106.701630,10.790138,106.650531,5.83,2026-02-14 13:50:25,2026-02-14 13:51:08,2026-02-14 13:53:08,2026-02-14 14:09:59.295955
999998,999999,user_05615,user_04599,10.761534,106.679316,10.775010,106.688652,1.81,2026-02-14 14:10:15,2026-02-14 14:10:52,2026-02-14 14:12:52,2026-02-14 14:17:43.426977


In [345]:
rides_df.to_csv("rides.csv", index=False)

### FEEDBACKS

In [394]:
rides_df = pd.read_csv("rides.csv")[["id", "driver_id", "passenger_id"]]

rides_df = rides_df.rename(columns={"id": "ride_id", "driver_id": "from_user_id", "passenger_id": "to_user_id"})


In [396]:
ride_ids = rides_df["ride_id"].values

uber_df = pd.read_csv("uber_reviews.csv")[["content", "score"]]

In [405]:
import uuid

def gen_feedbacks():
    ids = np.random.choice(ride_ids, size=len(uber_df))

    df = pd.DataFrame({
        "ride_id": ids,
        "score": uber_df["score"],
        "comment": uber_df["content"]
    })

    df = df.merge(
        rides_df, 
        how="left",
        on="ride_id"
    )

    df["id"] = [uuid.uuid4() for _ in range(len(df))]

    df = df[["id", "ride_id", "from_user_id", "to_user_id", "score", "comment"]]

    return df

feedbacks_df = gen_feedbacks()

In [406]:
feedbacks_df

Unnamed: 0,id,ride_id,from_user_id,to_user_id,score,comment
0,5abd036f-72d7-4cc2-9091-64bb61feb4f0,761147,user_06341,user_06218,5,Good
1,dfc48579-3953-4a31-9d62-711243024c7a,437158,user_06911,user_01787,5,Nice
2,7a960719-9b98-49ba-b837-15c9a09fcc00,812746,user_02966,user_06713,5,Very convenient
3,d2a14075-3aaa-4648-8515-13168a3bcc9f,110686,user_01924,user_01775,4,Good
4,e597f21c-142f-4c44-a035-36604b61f1f2,161238,user_08409,user_07718,5,exllence
...,...,...,...,...,...,...
11995,c27c162c-f528-4897-870a-76020925765a,413311,user_00413,user_08230,5,Excellent!!!
11996,317befc9-903e-4a98-a17e-4d5a585b572c,830149,user_00831,user_06825,5,Worst experience after 10pm in Hyde cityno aut...
11997,22125b2f-a7bc-4f19-9082-f0cef6f8af8a,435294,user_06425,user_00532,5,Exceptional
11998,d98ce116-905d-4f5a-afa0-e45a4711b6f2,569196,user_07713,user_06553,5,Good Service.


In [407]:
feedbacks_df.to_csv("feedbacks.csv", index=False)