# In [4]:
import pandas as pd
from faker import Faker
import random
from datetime import datetime, timedelta
import csv

# Shared Faker instance; the code below calls it for UUIDs (uuid4) and
# person names (name). It is constructed with no arguments, so Faker's
# default settings apply.
fake = Faker()

# Province names, in alphabetical order, used as the pool for the random
# 'Province' column of the Customer and Salesman tables.
provinces = [
    "Alberta",
    "British Columbia",
    "Manitoba",
    "New Brunswick",
    "Newfoundland and Labrador",
    "Nova Scotia",
    "Ontario",
    "Prince Edward Island",
    "Quebec",
    "Saskatchewan",
]

# Function to generate a random date within the past one year
def random_date_within_past_year():
    """Return a random naive datetime from the last 365 days.

    The current local time is shifted back by a whole number of days drawn
    uniformly from 0 to 365 (both inclusive), so the time-of-day component
    of the result always equals the moment of the call.

    Returns:
        datetime: a value between ``now - 365 days`` and ``now`` inclusive.
    """
    now = datetime.now()
    # randint is inclusive on both ends: 0 keeps the oldest date
    # (now - 365 days), 365 yields "now" itself.
    forward_days = random.randint(0, 365)
    return now - timedelta(days=365 - forward_days)

# Function to generate data for a large table and save it in chunks
def _build_row(num_columns, valid_ids=None):
    """Build one CSV row: a fresh UUID followed by ``num_columns - 1`` values.

    When ``valid_ids`` is a non-empty sequence the remaining values are drawn
    from it without replacement; otherwise they are random names produced by
    ``generate_random_data``.
    """
    extra = num_columns - 1
    row = [fake.uuid4()]
    if valid_ids:
        # random.sample raises ValueError when valid_ids holds fewer than
        # `extra` items (it never repeats an element within one row).
        row.extend(random.sample(valid_ids, extra))
    else:
        row.extend(generate_random_data(extra))
    return row


def generate_large_table(file_path, num_rows, header, valid_ids=None):
    """Write a CSV file of ``num_rows`` generated rows, one chunk at a time.

    The file starts with ``header``; every following row has a UUID in the
    first column and ``len(header) - 1`` further values (see ``_build_row``).
    Rows are built and written in chunks so that at most ``chunk_size`` rows
    are held in memory at once.

    Args:
        file_path: Destination path; an existing file is overwritten.
        num_rows: Number of data rows to write. Zero or a negative value
            writes only the header.
        header: Sequence of column names, written as the first CSV line.
        valid_ids: Optional sequence to sample the non-key columns from.
            ``None`` or an empty sequence falls back to random names.

    Raises:
        ValueError: if ``valid_ids`` is non-empty but has fewer elements
            than ``len(header) - 1``.
    """
    chunk_size = 10000  # You can adjust this value based on your system's memory capacity
    num_columns = len(header)

    # Explicit UTF-8 keeps the output independent of the platform's default
    # encoding; newline='' is what the csv module expects for writers.
    with open(file_path, mode='w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        writer.writerow(header)

        # A single loop covers both the full chunks and the final partial one.
        for start in range(0, num_rows, chunk_size):
            batch_size = min(chunk_size, num_rows - start)
            writer.writerows(
                [_build_row(num_columns, valid_ids) for _ in range(batch_size)]
            )

def generate_random_data(num_columns):
    """Return a list of ``num_columns`` random person names from Faker.

    A value of zero or less yields an empty list.
    """
    names = []
    for _ in range(num_columns):
        names.append(fake.name())
    return names

# Number of rows generated for each of the four tables built below.
# NOTE(review): an earlier comment here spoke of "100 MB of data"; 20000 rows
# is presumably far less than that, so raise this value if a dataset of that
# size is really needed.
num_rows = 20000  # Change this value to adjust the number of rows

# Order table: one row per order, with a UUID key, a random timestamp from
# the past 365 days and a random float amount between 10 and 1000.
table1_header = ['orderId', 'Date', 'Amount']
# Columns are listed in header order and paired with their names by zip().
table1_data = dict(zip(table1_header, (
    [fake.uuid4() for _ in range(num_rows)],
    [random_date_within_past_year() for _ in range(num_rows)],
    [random.uniform(10, 1000) for _ in range(num_rows)],
)))
table1_df = pd.DataFrame(data=table1_data)

# Orders table: ties an order ID from the Order table to a freshly generated
# customer ID and sales ID.
# NOTE(review): random.choices samples with replacement, so a given orderId
# may appear several times here or not at all -- confirm this is intended.
valid_order_ids = table1_df['orderId'].tolist()
table2_header = ['orderId', 'CustomerId', 'SalesId']
# Columns are listed in header order and paired with their names by zip().
table2_data = dict(zip(table2_header, (
    random.choices(valid_order_ids, k=num_rows),
    [fake.uuid4() for _ in range(num_rows)],
    [fake.uuid4() for _ in range(num_rows)],
)))
table2_df = pd.DataFrame(data=table2_data)

# Customer table: one row per CustomerId of the Orders table, with a random
# name, a random province and a random age from 18 to 80 inclusive.
valid_customer_ids = table2_df['CustomerId'].tolist()
table3_header = ['CustomerId', 'Name', 'Province', 'Age']
# Columns are listed in header order and paired with their names by zip().
table3_data = dict(zip(table3_header, (
    valid_customer_ids,
    [fake.name() for _ in range(num_rows)],
    [random.choice(provinces) for _ in range(num_rows)],
    [random.randint(18, 80) for _ in range(num_rows)],
)))
table3_df = pd.DataFrame(data=table3_data)

# Salesman table: one row per SalesId of the Orders table, with a random
# name, a random province and a random age from 25 to 65 inclusive.
valid_sales_ids = table2_df['SalesId'].tolist()
table4_header = ['SalesId', 'Name', 'Province', 'Age']
# Columns are listed in header order and paired with their names by zip().
table4_data = dict(zip(table4_header, (
    valid_sales_ids,
    [fake.name() for _ in range(num_rows)],
    [random.choice(provinces) for _ in range(num_rows)],
    [random.randint(25, 65) for _ in range(num_rows)],
)))
table4_df = pd.DataFrame(data=table4_data)

# Write each table to its own CSV file in the current working directory,
# leaving out the DataFrame index column.
for _csv_name, _frame in (
    ('Order.csv', table1_df),
    ('Orders.csv', table2_df),
    ('Customer.csv', table3_df),
    ('Salesman.csv', table4_df),
):
    _frame.to_csv(_csv_name, index=False)
