In [24]:
import csv
import os
import random

from faker import Faker
from faker.providers import BaseProvider, date_time
from tqdm.notebook import trange

In [25]:
%%html
<style>
/* Make the background behind ipywidget output transparent. */
.cell-output-ipywidget-background {
    background-color: transparent !important;
}
/* Point the Jupyter widget colour and font-size variables at the VS Code
   editor variables (presumably so the tqdm progress bar follows the editor
   theme; confirm in the target front end). */
:root {
    --jp-widgets-color: var(--vscode-editor-foreground);
    --jp-widgets-font-size: var(--vscode-editor-font-size);
}  
</style>

In [26]:
# Shared Faker instance used by every data-generating cell below.
fake = Faker()

# Seed both random sources so a fresh "Restart & Run All" reproduces the same
# files: Faker.seed covers the values produced through `fake`, while the
# stdlib `random` module (used later for shuffle/randint when building the
# order list) keeps separate state and has to be seeded on its own.
Faker.seed(100)
random.seed(100)

# NOTE(review): Faker() appears to register its standard providers already,
# so these explicit registrations are probably redundant. They are kept to
# preserve the existing behaviour; confirm before removing.
fake.add_provider(BaseProvider)
fake.add_provider(date_time)

In [27]:
# Number of customer rows to generate; also embedded in the output CSV file names.
filesize = 100_000

In [28]:
# Column order of the customers CSV.
fieldnames = [
    'Index', 'Customer Id', 'First Name', 'Last Name', 'Company', 'City', 'Country',
    'Phone 1', 'Phone 2', 'Email', 'Subscription Date', 'Birthday', 'Website',
]

# Every generated customer id is collected here so that the orders file
# written further down can reference existing customers.
customerids = []

# Create the output directory up front; otherwise open() raises
# FileNotFoundError on a fresh checkout that has no data/ folder yet.
os.makedirs('data', exist_ok=True)

# newline='' is what the csv module expects for files it writes; the explicit
# encoding keeps the output bytes identical across platforms.
with open(f'data/customers-{filesize}.csv', mode='w', newline='', encoding='utf-8') as csvfile:
    dictwriter = csv.DictWriter(csvfile, fieldnames=fieldnames)
    dictwriter.writeheader()
    # Index is 1-based: rows are numbered 1..filesize.
    for i in trange(1, filesize + 1):
        # The fake.* calls stay in this order so the seeded output is stable.
        rowdata = {
            'Index': str(i),
            # Each '^' is replaced by a hexadecimal digit (upper-case here).
            'Customer Id': fake.hexify("^^^^^^^^^^^^", upper=True),
            'First Name': fake.first_name(),
            'Last Name': fake.last_name(),
            'Company': fake.company(),
            'City': fake.city(),
            'Country': fake.country(),
            'Phone 1': fake.phone_number(),
            'Phone 2': fake.phone_number(),
            'Email': fake.email(),
            # Subscribed at some point in the last ten years.
            'Subscription Date': fake.date_between('-10y', end_date='today'),
            # Born between 60 and 20 years ago.
            'Birthday': fake.date_between('-60y', '-20y'),
            'Website': fake.url(),
        }
        customerids.append(rowdata['Customer Id'])
        dictwriter.writerow(rowdata)

  0%|          | 0/100000 [00:00<?, ?it/s]

In [29]:
# Turn the list of customer ids into a list with one entry per future order.
# Shuffle first so that the "repeating" half is a random selection.
random.shuffle(customerids)
repeating_customers = customerids[: len(customerids) // 2]

# Every customer in that half receives between 0 and 20 additional entries;
# each occurrence of an id becomes one row when the orders file is written.
for repeat_customer in repeating_customers:
    customerids += [repeat_customer] * random.randint(0, 20)

# Shuffle again so repeated ids are not grouped together at the end.
random.shuffle(customerids)

In [30]:
def get_fake_money(min = 10, max=100):
    """Return a random monetary amount formatted with two decimal places.

    The amount is drawn as a whole number of cents via ``fake.random_int`` and
    then formatted, so the result always has exactly two fractional digits.

    Args:
        min: Lower bound of the amount in whole currency units. Fractional
            values are rounded to the nearest cent.
        max: Upper bound of the amount in whole currency units. Fractional
            values are rounded to the nearest cent.

    Returns:
        The amount as a string such as ``'12.34'`` or ``'0.05'``.
    """
    # random_int needs integer bounds; round() leaves int inputs untouched.
    cents = fake.random_int(min=round(min * 100), max=round(max * 100))
    # Format arithmetically instead of slicing the digit string: slicing
    # produced '.5' for 5 cents and '.50' for 50 cents because values below
    # 100 have no digits left of the decimal point.
    sign = '-' if cents < 0 else ''
    units, fraction = divmod(abs(cents), 100)
    return f'{sign}{units}.{fraction:02d}'


In [31]:
# Column order of the orders CSV.
fieldnames = [
    'Index', 'Order Id', 'Customer Id', 'Amount', 'Purchase Date'
]

# Create the output directory up front; otherwise open() raises
# FileNotFoundError when data/ does not exist yet.
os.makedirs('data', exist_ok=True)

# newline='' is what the csv module expects for files it writes; the explicit
# encoding keeps the output bytes identical across platforms.
with open(f'data/orders-from-customers-{filesize}.csv', mode='w', newline='', encoding='utf-8') as csvfile:
    dictwriter = csv.DictWriter(csvfile, fieldnames=fieldnames)
    dictwriter.writeheader()
    # One order row per entry in customerids.
    # NOTE(review): Index is 0-based here while the customers file numbers its
    # rows from 1; left unchanged in case consumers rely on it. Confirm
    # whether the two files should agree.
    for i in trange(0, len(customerids)):
        rowdata = {
            'Index': str(i),
            # Each '^' is replaced by a hexadecimal digit (upper-case here).
            'Order Id': fake.hexify("^^^^^^^^^^^^", upper=True),
            'Customer Id': customerids[i],
            'Amount': get_fake_money(),
            # Purchased at some point in the last two years.
            'Purchase Date': fake.date_between('-2y', end_date='today'),
        }
        dictwriter.writerow(rowdata)

  0%|          | 0/603065 [00:00<?, ?it/s]