In [3]:
import pandas as pd
import numpy as np

from faker import Faker

faker = Faker()

n_users = 10

In [7]:
users_df = pd.DataFrame({
    "user_id": np.arange(1, n_users + 1),
    "gender": np.random.choice(["M", "F"], size=n_users),
    "created_at": pd.to_datetime("2019-01-01") + pd.to_timedelta(np.random.randint(0, 2000, size=n_users), unit='D'),
    "num_orders": np.random.choice([0,1,2,3,4], p=[0.2, 0.5, 0.2, 0.05, 0.05], size=n_users),
})

In [15]:
orders_df = users_df.loc[users_df.index.repeat(users_df['num_orders'])].copy()
orders_df["order_id"] = np.arange(1, len(orders_df) + 1)
orders_df["order_created_at"] = orders_df["created_at"] + pd.to_timedelta(np.random.randint(1, 180, size=len(orders_df)), unit="D")
orders_df["status"] = np.random.choice(["Complete", "Cancelled", "Returned", "Processing", "Shipped"], 
                                       p=[0.25, 0.15, 0.1, 0.2, 0.3], size=len(orders_df))


In [48]:
# list_of_names = [[faker.first_name(), faker.last_name_nonbinary()] for _ in range(1_000_000)]

first_names = np.array([faker.uuid4() for _ in range(1_000_000)])


In [37]:
from concurrent.futures import ProcessPoolExecutor
import pandas as pd

def generate_chunk(seed_offset, n):
    from faker import Faker
    faker = Faker()
    Faker.seed(seed_offset)
    data = [(faker.first_name(), faker.last_name()) for _ in range(n)]
    return data

def parallel_generate(total=1_000_000, workers=6):
    chunk_size = total // workers
    with ProcessPoolExecutor(max_workers=workers) as ex:
        chunks = list(ex.map(generate_chunk, range(workers), [chunk_size]*workers))
    all_data = [item for sublist in chunks for item in sublist]
    return pd.DataFrame(all_data, columns=["first_name", "last_name"])

df = parallel_generate()

In [38]:
dupes = df.duplicated(subset=["first_name", "last_name"], keep=False)
dupes.sum()

np.int64(858869)

In [40]:
first_names.__len__()

1000000

In [49]:
first_names

array(['711b4562-c32a-4c0f-a433-51f0c4a36dea',
       'f63f9ca4-71c9-46e7-90ea-d457e997010f',
       '3b2ce5a5-d91d-40ab-8122-92d6ad19ebfc', ...,
       '12887df5-36c2-4794-835d-8a2c9a26f43d',
       '6374a4f7-5067-462c-b893-bc86c60418e4',
       '5ebae0ba-710b-418e-baf3-c093d0baef5c'],
      shape=(1000000,), dtype='<U36')

In [50]:
has_duplicates = np.unique(first_names).size != first_names.size
print(has_duplicates)

False


In [54]:
import pandas as pd
import numpy as np
from faker import Faker
import re
import os

class DataGenerator:
    def __init__(self, n_users:int, seed:int = 42):
        self.n_users = n_users
        self.faker = Faker()
        Faker.seed(seed)
        np.random.seed(seed)

    @staticmethod
    def sanitize(arr: np.ndarray) -> np.ndarray:
        return np.vectorize(lambda x: re.sub(r"[^a-zA-Z]", "", x).lower())(arr)
    
    def generate_addresses(self, LOC_DATA:pd.DataFrame) -> pd.DataFrame:
        weights = LOC_DATA['population'] / LOC_DATA['population'].sum()
        # sample n_users based on population weights
        df_sampled = LOC_DATA.sample(n=self.n_users, weights=weights, replace=True).reset_index(drop=True)
        # generate street addresses
        df_sampled['street'] = [self.faker.street_address() for _ in range(self.n_users)]

        # return n_users sampled addresses based on population weights
        return df_sampled
    
    def generate_users(self, addresses: pd.DataFrame) -> pd.DataFrame:
        gender_list = np.random.choice(["M", "F"], size=self.n_users)

        # generate names based on gender vectorized
        first_names = np.where(gender_list == "M",
                               np.array([self.faker.first_name_male() for _ in range(self.n_users)]),
                               np.array([self.faker.first_name_female() for _ in range(self.n_users)]))
        last_names = [self.faker.last_name_nonbinary() for _ in range(self.n_users)]

        uuids = [self.faker.uuid4() for _ in range(self.n_users)]

        emails = np.char.lower(self.sanitize(
            np.array(first_names)) + "." + self.sanitize(np.array(last_names)) + "@example.com"
            )
        
        users_df = pd.DataFrame({
            "id": np.arange(1, self.n_users + 1),
            "uuid": uuids,
            "first_name": first_names,
            "last_name": last_names,
            "email": emails,
            "age": np.random.randint(18, 70, size=self.n_users),
            "gender": gender_list,
            **addresses.to_dict(orient='list'),
            "num_of_orders": np.random.choice([0,1,2,3,4], p=[0.2, 0.5, 0.2, 0.05, 0.05], size=self.n_users),
            "created_at": pd.to_datetime("2019-01-01") + pd.to_timedelta((np.random.rand(self.n_users) ** 2) * 365 * 5, unit='d'), # Bias towards older dates
        })
        self.users_df = users_df
        return users_df
    
if __name__ == "__main__":
    # file path in ./data_generation/data
    file_path = os.path.dirname(os.path.abspath(__file__))

    world_pop_df = pd.read_csv(os.path.join(file_path, "data", "world_pop.csv"))
    distribution_centers_df = pd.read_csv(os.path.join(file_path, "data", "distribution_centers.csv"))

    n_users = 2
    data_generator = DataGenerator(n_users=n_users, seed=42)
    addresses = data_generator.generate_addresses(LOC_DATA=world_pop_df)
    users_df = data_generator.generate_users(addresses=addresses)
    print(users_df.head())

NameError: name '__file__' is not defined