In [3]:
pip install faker

Collecting faker
  Downloading faker-40.1.2-py3-none-any.whl.metadata (16 kB)
Downloading faker-40.1.2-py3-none-any.whl (2.0 MB)
   ---------------------------------------- 0.0/2.0 MB ? eta -:--:--
   ---------------------------------------- 0.0/2.0 MB ? eta -:--:--
   ----- ---------------------------------- 0.3/2.0 MB ? eta -:--:--
   ---------- ----------------------------- 0.5/2.0 MB 1.1 MB/s eta 0:00:02
   --------------- ------------------------ 0.8/2.0 MB 931.2 kB/s eta 0:00:02
   --------------- ------------------------ 0.8/2.0 MB 931.2 kB/s eta 0:00:02
   --------------------- ------------------ 1.0/2.0 MB 798.4 kB/s eta 0:00:02
   --------------------- ------------------ 1.0/2.0 MB 798.4 kB/s eta 0:00:02
   -------------------------- ------------- 1.3/2.0 MB 828.3 kB/s eta 0:00:01
   ------------------------------- -------- 1.6/2.0 MB 847.3 kB/s eta 0:00:01
   ------------------------------- -------- 1.6/2.0 MB 847.3 kB/s eta 0:00:01
   ------------------------------------ --


[notice] A new release of pip is available: 25.1 -> 25.3
[notice] To update, run: C:\Users\routh\AppData\Local\Programs\Python\Python312\python.exe -m pip install --upgrade pip


In [4]:
import pandas as pd
import random
from faker import Faker
from datetime import datetime, timedelta

fake = Faker()
random.seed(42)

# ---------------- CONFIG ----------------
NUM_CUSTOMERS = 10000
NUM_STORES = 50
NUM_PRODUCTS = 500
NUM_SALES = 120000
# ----------------------------------------

# ---------- Customers ----------
customers = []
for i in range(NUM_CUSTOMERS):
    customers.append({
        "customer_id": f"C{i+1:05}",
        "first_name": fake.first_name(),
        "last_name": fake.last_name(),
        "email": fake.email(),
        "join_date": fake.date_between(start_date="-1y", end_date="today")
    })

customers_df = pd.DataFrame(customers)

# ---------- Stores ----------
stores = []
cities = ["Bengaluru", "Mumbai", "Delhi", "Chennai", "Hyderabad"]
for i in range(NUM_STORES):
    stores.append({
        "store_id": f"S{i+1:03}",
        "store_name": f"Store_{i+1}",
        "city": random.choice(cities)
    })

stores_df = pd.DataFrame(stores)

# ---------- Products ----------
categories = ["Grocery", "Dairy", "Snacks", "Personal Care", "Beverages"]
products = []
for i in range(NUM_PRODUCTS):
    products.append({
        "product_id": f"P{i+1:04}",
        "product_name": fake.word().capitalize(),
        "category": random.choice(categories),
        "price": random.randint(20, 500)
    })

products_df = pd.DataFrame(products)

# ---------- Sales ----------
sales = []
for i in range(NUM_SALES):
    sale_date = fake.date_between(start_date="-90d", end_date="today")
    customer = random.choice(customers_df["customer_id"])
    store = random.choice(stores_df["store_id"])
    amount = random.randint(50, 3000)

    sales.append({
        "sale_id": f"T{i+1:06}",
        "sale_date": sale_date,
        "customer_id": customer,
        "store_id": store,
        "total_amount": amount
    })

sales_df = pd.DataFrame(sales)

# ---------- Sale Items ----------
sale_items = []
item_id = 1
for _, sale in sales_df.iterrows():
    for _ in range(random.randint(1, 4)):
        product = products_df.sample(1).iloc[0]
        qty = random.randint(1, 5)
        sale_items.append({
            "sale_item_id": f"SI{item_id:07}",
            "sale_id": sale["sale_id"],
            "product_id": product["product_id"],
            "quantity": qty,
            "unit_price": product["price"]
        })
        item_id += 1

sale_items_df = pd.DataFrame(sale_items)

# ---------- Loyalty Rules ----------
loyalty_rules_df = pd.DataFrame([
    {"rule_id": "L001", "rule_name": "Base", "points_per_currency": 1, "min_spend": 0, "bonus_points": 0},
    {"rule_id": "L002", "rule_name": "Mid Spend Bonus", "points_per_currency": 1, "min_spend": 500, "bonus_points": 100},
    {"rule_id": "L003", "rule_name": "High Spend Bonus", "points_per_currency": 1, "min_spend": 1000, "bonus_points": 300},
])

# ---------- Save CSVs ----------
customers_df.to_csv("customers.csv", index=False)
stores_df.to_csv("stores.csv", index=False)
products_df.to_csv("products.csv", index=False)
sales_df.to_csv("sales.csv", index=False)
sale_items_df.to_csv("sale_items.csv", index=False)
loyalty_rules_df.to_csv("loyalty_rules.csv", index=False)

print("Large-scale retail datasets generated successfully.")


Large-scale retail datasets generated successfully.


In [15]:
cust = pd.read_csv("customers.csv")
cust

Unnamed: 0,customer_id,first_name,last_name,email,join_date
0,C00001,Andrea,Roberts,jamiewaters@example.org,2025-07-03
1,C00002,Robert,Perry,jessica17@example.org,2025-04-16
2,C00003,Stacie,Sanchez,meganjimenez@example.net,2025-02-24
3,C00004,Marissa,King,pamelaryan@example.com,2025-09-20
4,C00005,Luis,Smith,richardjanet@example.net,2025-01-28
...,...,...,...,...,...
9995,C09996,Stephen,Crawford,jmiller@example.org,2025-06-04
9996,C09997,Rachel,Morrow,emily23@example.org,2025-04-13
9997,C09998,Steven,Woods,gregorysmith@example.com,2025-05-14
9998,C09999,Jacqueline,Diaz,eolson@example.org,2025-12-23


In [17]:
store = pd.read_csv("stores.csv")
store

Unnamed: 0,store_id,store_name,city
0,S001,Store_1,Bengaluru
1,S002,Store_2,Bengaluru
2,S003,Store_3,Delhi
3,S004,Store_4,Mumbai
4,S005,Store_5,Mumbai
5,S006,Store_6,Mumbai
6,S007,Store_7,Bengaluru
7,S008,Store_8,Hyderabad
8,S009,Store_9,Bengaluru
9,S010,Store_10,Hyderabad


In [16]:
prod = pd.read_csv("products.csv")
prod

Unnamed: 0,product_id,product_name,category,price
0,P0001,Either,Snacks,444
1,P0002,Figure,Beverages,473
2,P0003,Until,Snacks,315
3,P0004,Exactly,Dairy,380
4,P0005,Under,Grocery,43
...,...,...,...,...
495,P0496,Real,Dairy,237
496,P0497,Fall,Grocery,298
497,P0498,Factor,Dairy,351
498,P0499,Field,Dairy,485


In [19]:
sale = pd.read_csv("sales.csv")
sale

Unnamed: 0,sale_id,sale_date,customer_id,store_id,total_amount
0,T000001,2025-12-24,C02331,S005,294
1,T000002,2026-01-17,C02719,S020,2487
2,T000003,2025-10-23,C09326,S019,1848
3,T000004,2025-11-26,C02038,S030,2870
4,T000005,2025-12-02,C04983,S045,1698
...,...,...,...,...,...
119995,T119996,2025-10-27,C02492,S045,1896
119996,T119997,2025-11-20,C08494,S046,2512
119997,T119998,2025-12-23,C06133,S007,1991
119998,T119999,2025-12-07,C05478,S007,332


In [18]:
items = pd.read_csv("sale_items.csv")
items

Unnamed: 0,sale_item_id,sale_id,product_id,quantity,unit_price
0,SI0000001,T000001,P0314,5,477
1,SI0000002,T000001,P0470,5,330
2,SI0000003,T000001,P0395,1,353
3,SI0000004,T000002,P0376,1,223
4,SI0000005,T000002,P0249,1,55
...,...,...,...,...,...
298961,SI0298962,T119999,P0259,3,360
298962,SI0298963,T119999,P0246,2,493
298963,SI0298964,T120000,P0433,4,379
298964,SI0298965,T120000,P0436,2,81


In [12]:
cust = customers_df.to_csv("customers.csv", index=False)
store = stores_df.to_csv("stores.csv", index=False)
prod = products_df.to_csv("products.csv", index=False)
sale = sales_df.to_csv("sales.csv", index=False)
items = sale_items_df.to_csv("sale_items.csv", index=False)
loyalty = loyalty_rules_df.to_csv("loyalty_rules.csv", index=False)

In [20]:
loyalty = pd.read_csv("loyalty_rules.csv")
loyalty

Unnamed: 0,rule_id,rule_name,points_per_currency,min_spend,bonus_points
0,L001,Base,1,0,0
1,L002,Mid Spend Bonus,1,500,100
2,L003,High Spend Bonus,1,1000,300


In [21]:
loyalty.isnull()

Unnamed: 0,rule_id,rule_name,points_per_currency,min_spend,bonus_points
0,False,False,False,False,False
1,False,False,False,False,False
2,False,False,False,False,False


In [22]:
sale.isnull()

Unnamed: 0,sale_id,sale_date,customer_id,store_id,total_amount
0,False,False,False,False,False
1,False,False,False,False,False
2,False,False,False,False,False
3,False,False,False,False,False
4,False,False,False,False,False
...,...,...,...,...,...
119995,False,False,False,False,False
119996,False,False,False,False,False
119997,False,False,False,False,False
119998,False,False,False,False,False


In [24]:
prod.isnull()

Unnamed: 0,product_id,product_name,category,price
0,False,False,False,False
1,False,False,False,False
2,False,False,False,False
3,False,False,False,False
4,False,False,False,False
...,...,...,...,...
495,False,False,False,False
496,False,False,False,False
497,False,False,False,False
498,False,False,False,False
