In [None]:
import pandas as pd
import numpy as np
from faker import Faker
import random
from datetime import datetime, timedelta

fake = Faker('en_US')  # Use the US locale for names and addresses

# Seed both random sources used in this notebook so that a fresh
# "Restart & Run All" regenerates the same names, ids, prices and quantities.
# Dates anchored to "now" still move with the wall clock.
RANDOM_SEED = 42
random.seed(RANDOM_SEED)
Faker.seed(RANDOM_SEED)

# --- 1. Master Data Generation ---

# 1.1 Warehouses (10)
num_warehouses = 10
# Ids count up from 1; each warehouse name carries a letter starting at 'A'
# followed by a generated city name.
warehouses_data = [
    {
        'warehouse_id': offset + 1,
        'name': f"Warehouse {chr(ord('A') + offset)} - {fake.city()}",
        'location': fake.address(),
    }
    for offset in range(num_warehouses)
]
df_warehouses = pd.DataFrame(warehouses_data)
print("Generated Warehouses:", len(df_warehouses))

# 1.2 Categories (50)
num_categories = 50
# Base words are drawn through fake.unique so category names are not repeated.
categories_data = [
    {
        'category_id': category_number,
        'name': fake.unique.word().capitalize() + ' Category',
    }
    for category_number in range(1, num_categories + 1)
]
df_categories = pd.DataFrame(categories_data)
print("Generated Categories:", len(df_categories))

# 1.3 Suppliers (200)
num_suppliers = 200
# One record per supplier id (1..num_suppliers) with generated contact details.
suppliers_data = [
    {
        'supplier_id': supplier_number,
        'name': fake.company(),
        'contact_person': fake.name(),
        'phone_number': fake.phone_number(),
        'address': fake.address(),
    }
    for supplier_number in range(1, num_suppliers + 1)
]
df_suppliers = pd.DataFrame(suppliers_data)
print("Generated Suppliers:", len(df_suppliers))

# 1.4 Products (5,000 unique)
num_products = 5000

# Draw foreign keys from plain Python lists rather than pandas Series.
# random.choice() indexes its argument with an integer, which on a Series is a
# label lookup through the index (correct only while the index is the default
# 0..n-1 range), much slower than list indexing, and an error on
# Python 3.11.0/3.11.1 where choice() tests the truthiness of its argument.
category_ids = df_categories['category_id'].tolist()
supplier_ids = df_suppliers['supplier_id'].tolist()

products_data = []
for i in range(num_products):
    products_data.append({
        'product_id': i + 1,
        # fake.unique keeps the catch phrases (and so the product names) distinct.
        'name': fake.unique.catch_phrase().capitalize() + ' Widget',
        'category_id': random.choice(category_ids),
        'supplier_id': random.choice(supplier_ids),
        'description': fake.paragraph(nb_sentences=2),
        'unit_price': round(random.uniform(1.0, 500.0), 2)
    })
df_products = pd.DataFrame(products_data)
print("Generated Products:", len(df_products))

# --- 2. Transactional Data Generation ---

# 2.1 Stock Records (Current Stock)
# Each stock record is for one product in one specific warehouse.
#
# num_stock_records is the number of random (product, warehouse) draws, not the
# number of rows produced: duplicates are dropped so that every pair appears at
# most once, and only len(df_products) * len(df_warehouses) distinct pairs exist
# (5,000 * 10 = 50,000 with the sizes above). The resulting table is therefore
# always smaller than num_stock_records.
num_stock_records = 100000

# Plain lists: random.choice() on a pandas Series is a slow, index-label-based
# lookup and fails outright on Python 3.11.0/3.11.1.
product_ids = df_products['product_id'].tolist()
warehouse_ids = df_warehouses['warehouse_id'].tolist()

product_warehouse_pairs = [
    {
        'product_id': random.choice(product_ids),
        'warehouse_id': random.choice(warehouse_ids),
    }
    for _ in range(num_stock_records)
]

# Keep the first occurrence of every pair. The index is rebuilt so it no longer
# carries the positions of the dropped duplicates.
df_stock_pairs = (
    pd.DataFrame(product_warehouse_pairs)
    .drop_duplicates()
    .reset_index(drop=True)
)
actual_num_stock_records = len(df_stock_pairs)

# stock_id is numbered 1..actual_num_stock_records without gaps (previously it
# was derived from the pre-deduplication row position, leaving holes).
stocks_data = []
for stock_id, pair in enumerate(df_stock_pairs.itertuples(index=False), start=1):
    stocks_data.append({
        'stock_id': stock_id,
        'product_id': pair.product_id,
        'warehouse_id': pair.warehouse_id,
        'quantity': random.randint(0, 1000), # Current quantity
        'last_updated': fake.date_time_between(start_date='-1y', end_date='now')
    })
df_stocks = pd.DataFrame(stocks_data)
print("Generated Stock Records (current):", len(df_stocks))

# 2.2 Stock Movements (500,000 movements in last 2 years)
num_stock_movements = 500000
stock_movements_data = []
start_date_movements = datetime.now() - timedelta(days=2*365) # Last 2 years

movement_types = ['IN', 'OUT', 'ADJUSTMENT', 'TRANSFER']

# Plain lists: this loop draws a product and a warehouse for every movement, and
# random.choice() on a pandas Series is a slow, index-label-based lookup that
# also fails on Python 3.11.0/3.11.1.
product_ids = df_products['product_id'].tolist()
warehouse_ids = df_warehouses['warehouse_id'].tolist()

for i in range(num_stock_movements):
    movement_type = random.choice(movement_types)
    qty = random.randint(1, 500)

    # Pick a random product and warehouse
    random_product_id = random.choice(product_ids)
    random_warehouse_id = random.choice(warehouse_ids)

    stock_movements_data.append({
        'movement_id': i + 1,
        'product_id': random_product_id,
        'warehouse_id': random_warehouse_id,
        'movement_type': movement_type,
        # Only OUT is stored as a negative quantity; IN, ADJUSTMENT and TRANSFER
        # are all stored as positive.
        'quantity': -qty if movement_type == 'OUT' else qty,
        'movement_date': fake.date_time_between(start_date=start_date_movements, end_date='now'),
        'description': fake.sentence(nb_words=6)
    })
df_stock_movements = pd.DataFrame(stock_movements_data)
print("Generated Stock Movements:", len(df_stock_movements))


# 2.3 Purchase Orders (100,000 orders with ~300,000 detail lines)
num_purchase_orders = 100000
purchase_orders_data = []
purchase_order_details_data = []
start_date_po = datetime.now() - timedelta(days=2*365)

# Plain lists for random.choice(): drawing from a pandas Series is a slow,
# index-label-based lookup and fails on Python 3.11.0/3.11.1.
supplier_ids = df_suppliers['supplier_id'].tolist()
product_ids = df_products['product_id'].tolist()

# product_id -> unit_price, built once. Looking the price up with a boolean
# mask over df_products for every detail line scans the whole product table
# each time, which dominates the runtime of this section.
unit_price_by_product = df_products.set_index('product_id')['unit_price'].to_dict()

po_detail_id = 1
for i in range(num_purchase_orders):
    po_id = i + 1
    num_items_in_po = random.randint(1, 5) # Each PO has 1 to 5 items

    purchase_orders_data.append({
        'purchase_order_id': po_id,
        'supplier_id': random.choice(supplier_ids),
        'order_date': fake.date_time_between(start_date=start_date_po, end_date='now'),
        'status': random.choice(['PENDING', 'CONFIRMED', 'DELIVERED', 'CANCELLED']),
        'total_amount': 0.0 # Filled in below once the detail lines are known
    })

    current_po_total = 0.0
    for _ in range(num_items_in_po):
        product_chosen = random.choice(product_ids)
        qty = random.randint(10, 500)
        unit_price = unit_price_by_product[product_chosen]  # bought at list price
        line_total = round(qty * unit_price, 2)
        current_po_total += line_total

        purchase_order_details_data.append({
            'po_detail_id': po_detail_id,
            'purchase_order_id': po_id,
            'product_id': product_chosen,
            'quantity': qty,
            'unit_price': unit_price,
            'line_total': line_total
        })
        po_detail_id += 1

    # The header row was appended before its lines existed; write the sum back.
    purchase_orders_data[-1]['total_amount'] = round(current_po_total, 2)

df_purchase_orders = pd.DataFrame(purchase_orders_data)
df_purchase_order_details = pd.DataFrame(purchase_order_details_data)
print("Generated Purchase Orders:", len(df_purchase_orders))
print("Generated Purchase Order Details:", len(df_purchase_order_details))


# 2.4 Sales Orders (200,000 orders with ~600,000 detail lines)
num_sales_orders = 200000
sales_orders_data = []
sales_order_details_data = []
# Sales orders are dated within the two years (2 * 365 days) before now.
start_date_so = datetime.now() - timedelta(days=2*365)

# Generate some fake customers for sales orders
num_customers = 50000 # Let's say we have 50k customers
customers_data = [
    {
        'customer_id': customer_number,
        'name': fake.name(),
        'address': fake.address(),
        'email': fake.email(),
    }
    for customer_number in range(1, num_customers + 1)
]
df_customers = pd.DataFrame(customers_data)
print("Generated Customers:", len(df_customers))


# Plain lists for random.choice(): drawing from a pandas Series is a slow,
# index-label-based lookup and fails on Python 3.11.0/3.11.1.
customer_ids = df_customers['customer_id'].tolist()
product_ids = df_products['product_id'].tolist()

# product_id -> unit_price, built once instead of scanning df_products with a
# boolean mask for every one of the ~600,000 detail lines.
unit_price_by_product = df_products.set_index('product_id')['unit_price'].to_dict()

so_detail_id = 1
for i in range(num_sales_orders):
    so_id = i + 1
    num_items_in_so = random.randint(1, 5) # Each SO has 1 to 5 items, on average 3

    sales_orders_data.append({
        'sales_order_id': so_id,
        'customer_id': random.choice(customer_ids),
        'order_date': fake.date_time_between(start_date=start_date_so, end_date='now'),
        'status': random.choice(['PENDING', 'SHIPPED', 'DELIVERED', 'CANCELLED']),
        'total_amount': 0.0 # Filled in below once the detail lines are known
    })

    current_so_total = 0.0
    for _ in range(num_items_in_so):
        product_chosen = random.choice(product_ids)
        qty = random.randint(1, 100) # Smaller quantities for sales
        # Sell at a 10-50% markup over the product's list price. The price is
        # rounded to cents *before* it is multiplied, so the stored columns
        # satisfy line_total == round(quantity * unit_price, 2).
        unit_price = round(unit_price_by_product[product_chosen] * random.uniform(1.1, 1.5), 2)
        line_total = round(qty * unit_price, 2)
        current_so_total += line_total

        sales_order_details_data.append({
            'so_detail_id': so_detail_id,
            'sales_order_id': so_id,
            'product_id': product_chosen,
            'quantity': qty,
            'unit_price': unit_price,
            'line_total': line_total
        })
        so_detail_id += 1

    # The header row was appended before its lines existed; write the sum back.
    sales_orders_data[-1]['total_amount'] = round(current_so_total, 2)

df_sales_orders = pd.DataFrame(sales_orders_data)
df_sales_order_details = pd.DataFrame(sales_order_details_data)
print("Generated Sales Orders:", len(df_sales_orders))
print("Generated Sales Order Details:", len(df_sales_order_details))

# --- Optional: Save to CSV ---
import os

output_dir = "fake_warehouse_data"
# exist_ok=True creates the directory when missing and is a no-op otherwise,
# without a separate existence check that could race with another process.
os.makedirs(output_dir, exist_ok=True)

# File name -> frame, master data first and transactional data after it.
# Every table is written without the pandas index column.
frames_by_filename = {
    'warehouses.csv': df_warehouses,
    'categories.csv': df_categories,
    'suppliers.csv': df_suppliers,
    'products.csv': df_products,
    'customers.csv': df_customers,
    'stocks.csv': df_stocks,
    'stock_movements.csv': df_stock_movements,
    'purchase_orders.csv': df_purchase_orders,
    'purchase_order_details.csv': df_purchase_order_details,
    'sales_orders.csv': df_sales_orders,
    'sales_order_details.csv': df_sales_order_details,
}
for filename, frame in frames_by_filename.items():
    frame.to_csv(os.path.join(output_dir, filename), index=False)

print(f"\nAll data saved to '{output_dir}' directory.")

Generated Warehouses: 10
Generated Categories: 50
Generated Suppliers: 200
Generated Products: 5000
Generated Stock Records (current): 43281
Generated Stock Movements: 500000


In [2]:
pip install faker pandas numpy


Collecting faker
  Downloading faker-37.12.0-py3-none-any.whl.metadata (15 kB)
Downloading faker-37.12.0-py3-none-any.whl (2.0 MB)
   ---------------------------------------- 0.0/2.0 MB ? eta -:--:--
   ---------- ----------------------------- 0.5/2.0 MB 4.4 MB/s eta 0:00:01
   -------------------------- ------------- 1.3/2.0 MB 4.0 MB/s eta 0:00:01
   ---------------------------------------- 2.0/2.0 MB 3.6 MB/s  0:00:00
Installing collected packages: faker
Successfully installed faker-37.12.0
Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 25.2 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip
