In [1]:
import sqlite3
import pandas as pd
import random
from faker import Faker
from datetime import datetime, timedelta

random.seed(42)

In [None]:
NUM_RETAILERS = 240
NUM_PRODUCTS = 40
NUM_VISITS = 10000
VISIT_DAYS_RANGE = 365
CITY_NAME = "Bengaluru"
NUM_AGENTS = 2
NUM_BEATS = 12
CHANNELS = ["Pharmacy", "Grocery", "General Trade", "Medical Store"]
CATEGORIES = ["Health", "Personal Care", "Beverages", "Nutrition"]
PACK_SIZES = ["100ml", "200ml", "Box of 10", "Strip of 15", "500ml"]


fake = Faker()

DB_PATH = "sales_agent_co_pilot.db"

In [3]:
#DB connection
conn = sqlite3.connect(DB_PATH)
cursor = conn.cursor()

In [4]:
# DB tables creation
cursor.executescript("""
DROP TABLE IF EXISTS sales;
DROP TABLE IF EXISTS visits;
DROP TABLE IF EXISTS visit_stock;
DROP TABLE IF EXISTS retailer_beat_map;
DROP TABLE IF EXISTS beats;
DROP TABLE IF EXISTS sales_agents;
DROP TABLE IF EXISTS products;
DROP TABLE IF EXISTS retailers;

CREATE TABLE retailers (
    Retailer_ID TEXT PRIMARY KEY,
    Name TEXT,
    City TEXT,
    Channel TEXT,
    Latitude REAL,
    Longitude REAL
);

CREATE TABLE products (
    Product_ID TEXT PRIMARY KEY,
    Product_Name TEXT,
    Category TEXT,
    Price REAL,
    Pack_Size TEXT
);

CREATE TABLE sales_agents (
    Agent_ID TEXT PRIMARY KEY,
    Name TEXT,
    Mobile TEXT,
    Email TEXT
);

CREATE TABLE beats (
    Beat_ID TEXT PRIMARY KEY,
    Beat_Name TEXT,
    City TEXT,
    Assigned_Agent TEXT,
    FOREIGN KEY (Assigned_Agent) REFERENCES sales_agents(Agent_ID)
);

CREATE TABLE retailer_beat_map (
    Retailer_ID TEXT,
    Beat_ID TEXT,
    FOREIGN KEY (Retailer_ID) REFERENCES retailers(Retailer_ID),
    FOREIGN KEY (Beat_ID) REFERENCES beats(Beat_ID)
);

CREATE TABLE retailer_beat_map_optimized (
    Retailer_ID TEXT,
    Beat_ID TEXT,
    FOREIGN KEY (Retailer_ID) REFERENCES retailers(Retailer_ID),
    FOREIGN KEY (Beat_ID) REFERENCES beats(Beat_ID)
);

CREATE TABLE visits (
    Visit_ID TEXT PRIMARY KEY,
    Retailer_ID TEXT,
    Date TEXT,
    Products_Suggested TEXT,
    Feedback TEXT,
    Order_Placed INTEGER,
    Agent_ID TEXT,
    FOREIGN KEY (Retailer_ID) REFERENCES retailers(Retailer_ID),
    FOREIGN KEY (Agent_ID) REFERENCES sales_agents(Agent_ID)
);

CREATE TABLE visit_stock (
    Visit_ID TEXT,
    Product_ID TEXT,
    Retailer_ID TEXT,
    Available_Stock INTEGER,
    PRIMARY KEY (Visit_ID, Product_ID),
    FOREIGN KEY (Visit_ID) REFERENCES visits(Visit_ID),
    FOREIGN KEY (Product_ID) REFERENCES products(Product_ID),
    FOREIGN KEY (Retailer_ID) REFERENCES retailers(Retailer_ID)
);

CREATE TABLE sales (
    Invoice_ID TEXT,
    Visit_ID TEXT,
    Retailer_ID TEXT,
    Product_ID TEXT,
    Quantity INTEGER,
    Date TEXT,
    Total_Amount REAL,
    PRIMARY KEY (Invoice_ID, Product_ID),
    FOREIGN KEY (Visit_ID) REFERENCES visits(Visit_ID),
    FOREIGN KEY (Retailer_ID) REFERENCES retailers(Retailer_ID),
    FOREIGN KEY (Product_ID) REFERENCES products(Product_ID)
);
""")

<sqlite3.Cursor at 0x17c32662540>

In [5]:
# Retailers
retailers = [{
    "Retailer_ID": f"R{i+1:04d}",
    "Name": fake.company(),
    "City": CITY_NAME,
    "Channel": random.choice(CHANNELS),
    "Latitude": round(12.9 + random.uniform(-0.05, 0.05), 6),
    "Longitude": round(77.6 + random.uniform(-0.05, 0.05), 6)
} for i in range(NUM_RETAILERS)]
retailers_df = pd.DataFrame(retailers)

In [6]:
retailers_df.head()

Unnamed: 0,Retailer_ID,Name,City,Channel,Latitude,Longitude
0,R0001,"Hansen, Nguyen and Williams",Bengaluru,Pharmacy,12.852501,77.577503
1,R0002,"Schmidt, Moyer and Rivas",Bengaluru,Grocery,12.863954,77.56025
2,R0003,Moss and Sons,Bengaluru,Pharmacy,12.909049,77.553178
3,R0004,Lewis-Kim,Bengaluru,Pharmacy,12.871864,77.600536
4,R0005,Butler Group,Bengaluru,Pharmacy,12.906125,77.621602


In [7]:
retailers_df['Channel'].value_counts(), retailers_df['Retailer_ID'].nunique()

(Channel
 Pharmacy         68
 General Trade    66
 Grocery          56
 Medical Store    50
 Name: count, dtype: int64,
 240)

In [8]:
# Products
products = [{
    "Product_ID": f"P{i+1:03d}",
    "Product_Name": f"{random.choice(CATEGORIES)} Product {i+1}",
    "Category": random.choice(CATEGORIES),
    "Price": round(random.uniform(20, 500), 2),
    "Pack_Size": random.choice(PACK_SIZES)
} for i in range(NUM_PRODUCTS)]
products_df = pd.DataFrame(products)

product_ids = products_df["Product_ID"].tolist()

In [9]:
products_df.head()

Unnamed: 0,Product_ID,Product_Name,Category,Price,Pack_Size
0,P001,Beverages Product 1,Personal Care,498.53,Strip of 15
1,P002,Nutrition Product 2,Health,245.6,Strip of 15
2,P003,Beverages Product 3,Beverages,341.66,200ml
3,P004,Beverages Product 4,Nutrition,352.96,Box of 10
4,P005,Nutrition Product 5,Health,238.32,Box of 10


In [10]:
products_df['Product_Name'].nunique()

40

In [11]:
products_df['Category'].value_counts()

Category
Nutrition        14
Beverages        10
Health            9
Personal Care     7
Name: count, dtype: int64

In [12]:
# Sales Agents
agents = [{
    "Agent_ID": f"A{i+1:03d}",
    "Name": fake.name(),
    "Mobile": fake.phone_number(),
    "Email": fake.email()
} for i in range(NUM_AGENTS)]
agents_df = pd.DataFrame(agents)

In [13]:
agents_df.head()

Unnamed: 0,Agent_ID,Name,Mobile,Email
0,A001,Kevin Baxter,260-309-7254,millerjoseph@example.net
1,A002,Cheryl Snyder,+1-960-906-3779,nicholssara@example.org


In [14]:
agent_ids = agents_df["Agent_ID"].tolist()
assigned_agents = agent_ids * (NUM_BEATS // len(agent_ids)) + agent_ids[:NUM_BEATS % len(agent_ids)]
random.shuffle(assigned_agents)

In [15]:
# Beats
beats = [{
    "Beat_ID": f"B{i+1:03d}",
    "Beat_Name": f"Beat Zone {i+1}",
    "City": CITY_NAME,
    "Assigned_Agent": assigned_agents[i]
} for i in range(NUM_BEATS)]
beats_df = pd.DataFrame(beats)

In [16]:
beats_df.head()

Unnamed: 0,Beat_ID,Beat_Name,City,Assigned_Agent
0,B001,Beat Zone 1,Bengaluru,A002
1,B002,Beat Zone 2,Bengaluru,A001
2,B003,Beat Zone 3,Bengaluru,A001
3,B004,Beat Zone 4,Bengaluru,A002
4,B005,Beat Zone 5,Bengaluru,A002


In [17]:
beats_df['Assigned_Agent'].value_counts()

Assigned_Agent
A002    6
A001    6
Name: count, dtype: int64

In [18]:
# Retailer to Beat Map
retailer_beat_map_df = retailers_df[["Retailer_ID"]].copy()
retailer_beat_map_df["Beat_ID"] = [random.choice(beats_df["Beat_ID"].tolist()) for _ in range(len(retailers_df))]


In [19]:
# Visit Table
beat_agent_map = beats_df.set_index("Beat_ID")["Assigned_Agent"].to_dict()
retailer_beat_map_df["Agent_ID"] = retailer_beat_map_df["Beat_ID"].map(beat_agent_map)
retailer_agent_map = retailer_beat_map_df.set_index("Retailer_ID")["Agent_ID"].to_dict()

In [20]:
retailer_beat_map_df["Retailer_ID"].nunique(), retailer_beat_map_df["Beat_ID"].nunique()

(240, 12)

In [21]:
retailer_beat_map_df['Beat_ID'].value_counts()

Beat_ID
B011    29
B002    23
B006    21
B001    21
B008    20
B010    20
B004    20
B005    20
B012    19
B003    17
B007    15
B009    15
Name: count, dtype: int64

In [22]:

#Generate Visits and Sales
visits = []
sales = []
visit_stock = []
invoice_counter = 1

for i in range(NUM_VISITS):
    visit_id = f"V{i+1:05d}"
    retailer = random.choice(retailers)
    date = datetime.now().date() - timedelta(days=random.randint(0, VISIT_DAYS_RANGE))
    suggested = random.sample(product_ids, random.randint(1, 5))
    feedback = fake.sentence(nb_words=10)
    order_placed = random.random() < 0.6    #60% conversion rate
    agent_id = retailer_agent_map[retailer["Retailer_ID"]]


    visits.append({
        "Visit_ID": visit_id,
        "Retailer_ID": retailer["Retailer_ID"],
        "Date": date.isoformat(),
        "Products_Suggested": ", ".join(suggested),
        "Feedback": feedback,
        "Order_Placed": int(order_placed),
        "Agent_ID": agent_id
    })

    for sku in random.sample(product_ids, random.randint(2, 6)):
        visit_stock.append({
            "Visit_ID": visit_id,
            "Product_ID": sku,
            "Retailer_ID": retailer["Retailer_ID"],
            "Available_Stock": random.randint(0, 10)
        })

    if order_placed:
        chosen_products = random.sample(suggested, min(3, len(suggested)))
        for pid in chosen_products:
            quantity = random.randint(1, 10)
            price = float(products_df.loc[products_df["Product_ID"] == pid, "Price"].values[0])
            sales.append({
                "Invoice_ID": f"INV{invoice_counter:06d}",
                "Visit_ID": visit_id,
                "Retailer_ID": retailer["Retailer_ID"],
                "Product_ID": pid,
                "Quantity": quantity,
                "Date": date.isoformat(),
                "Total_Amount": round(quantity * price, 2)
            })
        invoice_counter += 1



visits_df = pd.DataFrame(visits)
visit_stock_df = pd.DataFrame(visit_stock)
sales_df = pd.DataFrame(sales)

In [23]:
visits_df.head()

Unnamed: 0,Visit_ID,Retailer_ID,Date,Products_Suggested,Feedback,Order_Placed,Agent_ID
0,V00001,R0038,2025-06-09,"P018, P040",Policy fear part factor bag with window finish...,0,A001
1,V00002,R0225,2024-10-03,"P028, P029",Moment new near dream traditional to kind allo...,0,A002
2,V00003,R0096,2025-04-25,"P005, P006, P028, P007",Group president Mrs data actually fast control.,0,A002
3,V00004,R0032,2024-12-14,"P028, P004, P019",Moment other level arrive already fund perform...,0,A001
4,V00005,R0217,2025-05-18,"P036, P024, P008",Economic by mission scene determine back groun...,0,A001


In [24]:
visits_df.shape

(10000, 7)

In [25]:
visits_df['Retailer_ID'].nunique()

240

In [26]:
visits_df['Order_Placed'].value_counts(normalize=True)*100

Order_Placed
1    59.99
0    40.01
Name: proportion, dtype: float64

In [27]:
visits_df[visits_df["Order_Placed"] == 1][['Retailer_ID']].nunique()

Retailer_ID    240
dtype: int64

In [28]:
sales_df.head()

Unnamed: 0,Invoice_ID,Visit_ID,Retailer_ID,Product_ID,Quantity,Date,Total_Amount
0,INV000001,V00006,R0002,P026,6,2025-04-11,164.22
1,INV000001,V00006,R0002,P037,3,2025-04-11,1374.48
2,INV000002,V00008,R0114,P032,5,2024-08-08,499.3
3,INV000002,V00008,R0114,P027,4,2024-08-08,223.68
4,INV000002,V00008,R0114,P029,3,2024-08-08,962.58


In [29]:
sales_df.shape

(14527, 7)

In [30]:
sales_df['Date'].min(), sales_df['Date'].max()

('2024-07-12', '2025-07-12')

In [31]:
sales_df['Retailer_ID'].nunique(), sales_df['Product_ID'].nunique()

(240, 40)

In [32]:
#Drop invalid sales
valid_visits = visits_df[visits_df["Order_Placed"] == 1][["Visit_ID", "Retailer_ID"]]
sales_df = sales_df.merge(valid_visits, on=["Visit_ID", "Retailer_ID"], how="inner")

In [33]:
sales_df['Retailer_ID'].nunique()

240

In [34]:
sales_df['Visit_ID'].nunique()

5999

In [35]:
#Insert Data into DB
retailers_df.to_sql("retailers", conn, if_exists="append", index=False)
products_df.to_sql("products", conn, if_exists="append", index=False)
agents_df.to_sql("sales_agents", conn, if_exists="append", index=False)
beats_df.to_sql("beats", conn, if_exists="append", index=False)
retailer_beat_map_df[["Retailer_ID", "Beat_ID"]].to_sql("retailer_beat_map", conn, if_exists="append", index=False)
visits_df.to_sql("visits", conn, if_exists="append", index=False)
sales_df.to_sql("sales", conn, if_exists="append", index=False)
visit_stock_df.to_sql("visit_stock", conn, if_exists="append", index=False)

conn.commit()

print(f"DB created and populated successfully: {DB_PATH}")

DB created and populated successfully: sales_agent_co_pilot.db


In [36]:
queries = {
    "Retailer count": "SELECT COUNT(*) FROM retailers;",
    "Product count": "SELECT COUNT(*) FROM products;",
    "Visit count": "SELECT COUNT(*) FROM visits;",
    "Sales count": "SELECT COUNT(*) FROM sales;",
    "Sample 5 visits with order placed": "SELECT * FROM visits WHERE Order_Placed = 1 LIMIT 5;",
    "Sample 5 sales entries": "SELECT * FROM sales LIMIT 5;",
    "Sales joined with products": """
        SELECT s.Invoice_ID, s.Date, s.Quantity, s.Total_Amount, p.Product_Name, p.Category
        FROM sales s
        JOIN products p ON s.Product_ID = p.Product_ID
        LIMIT 5;
    """,
    "Invalid sales check (should be 0 rows)": """
        SELECT s.*
        FROM sales s
        LEFT JOIN visits v ON s.Visit_ID = v.Visit_ID AND s.Retailer_ID = v.Retailer_ID
        WHERE v.Order_Placed != 1 OR v.Order_Placed IS NULL;
    """
}

results = {}
for label, query in queries.items():
    cursor.execute(query)
    results[label] = cursor.fetchall()


results


{'Retailer count': [(240,)],
 'Product count': [(40,)],
 'Visit count': [(10000,)],
 'Sales count': [(14527,)],
 'Sample 5 visits with order placed': [('V00006',
   'R0002',
   '2025-04-11',
   'P037, P026',
   'Property pressure science nature including season station else detail staff can.',
   1,
   'A002'),
  ('V00008',
   'R0114',
   '2024-08-08',
   'P032, P039, P029, P027',
   'Tree want behind discover glass particular.',
   1,
   'A002'),
  ('V00009',
   'R0196',
   '2025-03-04',
   'P021, P008, P001',
   'Television include dark fish candidate front heavy.',
   1,
   'A002'),
  ('V00010',
   'R0221',
   '2025-02-02',
   'P028, P037, P026, P027, P019',
   'Test night daughter themselves fall painting measure place memory show sport.',
   1,
   'A002'),
  ('V00011',
   'R0151',
   '2024-12-19',
   'P006, P026, P020, P022, P015',
   'The special yeah evening already structure.',
   1,
   'A001')],
 'Sample 5 sales entries': [('INV000001',
   'V00006',
   'R0002',
   'P026',
   6

In [37]:
conn.close()