In [1]:
import pandas as pd
import numpy as np
import random
from faker import Faker

fake = Faker()
np.random.seed(42)

In [6]:
# Define Constants
products = ['Milk', 'Panner', 'Curd', 'Ghee', 'Lassi']
suppliers = ['Dairy Fresh', 'Farm Roots', 'Milko Pure', 'Creamy Way', 'Green Valley']
labs = ['NABL Lab A', 'NABL Lab B', 'NABL Lab C']
regions = ['Pune', 'Mumbai', 'Bangalore', 'Chennai', 'Delhi']
start_date = pd.to_datetime('2024-01-01')
num_rows = 100000

In [23]:
# generate datasets
data = []

for i in range(num_rows):
    date = start_date + pd.to_timedelta(random.randint(0, 365), unit='D')
    product = random.choice(products)
    batch_id = f"BTCH{i+1:06d}"
    region = random.choice(regions)
    unit_checked = random.randint(80, 150)
    defect = random.randint(0, 5)
    adulteration_fail = round(random.uniform(0, 3), 2) # % of failed units
    hygiene_fail = round(random.uniform(0, 2), 2)
    supplier_name = random.choice(suppliers)
    delivery_hours = round(random.uniform(4, 24), 1)
    cold_chain_breach = random.choice(['Yes', 'No'])
    fat_percentage = round(random.uniform(3.2, 4.5), 2)
    snf_percentage = round(random.uniform(8.2, 9.5), 2)
    batch_temp = round(random.uniform(3, 7), 1)
    test_lab_name = random.choice(labs)
    
    data.append([
        batch_id, date, product, region, unit_checked, defect,
        adulteration_fail, hygiene_fail, supplier_name,
        delivery_hours, cold_chain_breach, fat_percentage,
        snf_percentage, batch_temp, test_lab_name
    ])

In [24]:
# Create DataFrame
columns = [
    'batch_id', 'date', 'product', 'region', 'units_checked', 'defect',
    'adulteration_fail', 'hygiene_fail', 'supplier_name',
    'delivery_hours', 'cold_chain_breach', 'fat_percentage',
    'snf_percentage', 'batch_temp', 'test_lab_name'
]

df_quality = pd.DataFrame(data, columns=columns)

In [25]:
# save to CSV
df_quality.to_csv("quality_data.csv", index=False)
print(" quality_data.csv created with", len(df_quality), "rows.")

 quality_data.csv created with 30000 rows.


In [26]:
df_quality.head()

Unnamed: 0,batch_id,date,product,region,units_checked,defect,adulteration_fail,hygiene_fail,supplier_name,delivery_hours,cold_chain_breach,fat_percentage,snf_percentage,batch_temp,test_lab_name
0,BTCH000001,2024-04-13,Curd,Chennai,121,3,2.6,1.83,Milko Pure,16.0,Yes,3.91,9.49,4.9,NABL Lab A
1,BTCH000002,2024-01-20,Curd,Chennai,81,1,1.14,1.96,Creamy Way,10.3,Yes,4.19,8.35,6.7,NABL Lab A
2,BTCH000003,2024-07-23,Paneer,Delhi,105,1,0.89,0.35,Farm Roots,7.9,No,3.39,9.39,3.9,NABL Lab A
3,BTCH000004,2024-05-08,Lassi,Delhi,94,3,0.61,1.22,Green Valley,18.8,No,4.38,8.44,4.5,NABL Lab A
4,BTCH000005,2024-07-27,Curd,Mumbai,84,3,0.21,1.11,Creamy Way,22.6,Yes,3.68,8.28,3.0,NABL Lab C


In [12]:
#DATASET 2 COMPLAINTS DATA
# Import libraries
import pandas as pd
import random
from faker import Faker

# setup
fake = Faker()
random.seed(42)

# Configuaration
num_rows = 50000
products = ['Milk', 'Paneer', 'Curd', 'Ghee', 'Lassi']
complaint_types = ['Late Delivery', 'Spoiled Products', 'Bad Smell', 'Packaging Issue', 'Taste Issue']
channels = ['App', 'Call', 'walk-in']
severities = ['Low', 'Medium', 'High']

In [14]:
#generate data
data = []

for i in range(num_rows):
    complaint_id = f"CMP{i+1:06d}"
    date = fake.date_between(start_date='-365d', end_date='today')
    product = random.choice(products)
    complaint_type = random.choice(complaint_types)
    taste_score = round(random.uniform(1.0, 5.0), 1)
    resolved = random.choice(['Yes', 'No'])
    resolution_days = random.randint(0, 7) if resolved == 'Yes' else None
    delivery_time = round(random.uniform(4, 24), 1)
    source_channel = random.choices(severities, weights=[60, 30, 10])[0]
    complaint_severity = random.choices(severities, weights=[60, 30, 10])[0]
    customer_rating_after_resolution = round(random.uniform(3.0, 5.0), 1) if resolved == 'Yes' else None
    
    data.append([
        complaint_id, date, product, complaint_type, taste_score, resolved,
        resolution_days, delivery_time, source_channel,
        complaint_severity, customer_rating_after_resolution
    ])

In [18]:
# Create DataFrame
columns = [
    'complaint_id', 'date', 'product', 'complaint_type', 'taste_score', 'resolved',
    'resolution_days', 'delivery_time', 'source_channel',
    'complaint_severity', 'customer_rating_after_resolution'
]

df_complaints = pd.DataFrame(data, columns=columns)

# Export to Csv
df_complaints.to_csv("complaint_data.csv", index=False)

#preview
print("complaints_data.csv created with", len(df_complaints), "rows.")
df_complaints.head()

complaints_data.csv created with 50000 rows.


Unnamed: 0,complaint_id,date,product,complaint_type,taste_score,resolved,resolution_days,delivery_time,source_channel,complaint_severity,customer_rating_after_resolution
0,CMP000001,2025-02-09,Milk,Late Delivery,4.0,Yes,3.0,6.8,Low,Medium,4.1
1,CMP000002,2024-12-11,Lassi,Packaging Issue,1.1,Yes,3.0,8.7,Medium,Low,4.4
2,CMP000003,2025-01-07,Lassi,Packaging Issue,1.9,No,,20.2,Low,Medium,
3,CMP000004,2025-02-01,Ghee,Bad Smell,2.1,Yes,5.0,6.0,Low,Low,3.7
4,CMP000005,2025-07-10,Curd,Late Delivery,3.9,Yes,6.0,5.6,Low,Medium,4.8


In [19]:
# 3rd Dashboard returns
# Step 1: Import
import pandas as pd
import random
from faker import Faker

# Step 2: Setup
fake = Faker()
random.seed(42)

# Config
num_rows = 30000
products = ['Milk', 'Paneer', 'Curd', 'Ghee', 'Lassi']
reasons = ['Spoiled', 'Late Delivery', 'Damaged Packaging', 'Wrong Product', 'Taste Issue']
reason_categories = {
    'Spoiled': 'Quality',
    'Late Delivery': 'Delivery',
    'Damaged Packaging': 'Packaging',
    'Wrong Product': 'Operations',
    'Taste Issue': 'Quality'
}
teams = ['Logistics', 'Customer Care', 'Quality Control']

# Step 3: Generate data
data = []

for i in range(num_rows):
    return_id = f"RTN{i+1:06d}"
    return_date = fake.date_between(start_date='-365d', end_date='today')
    product = random.choice(products)
    return_units = random.randint(1, 10)
    reason = random.choice(reasons)
    reason_category = reason_categories[reason]
    delivery_delay_hours = round(random.uniform(0, 10), 1)
    product_batch_id = f"BTCH{random.randint(1, 100000):06d}"
    resolved_by_team = random.choice(teams)

    data.append([
        return_id, return_date, product, return_units, reason,
        reason_category, delivery_delay_hours, product_batch_id,
        resolved_by_team
    ])

# Step 4: Create DataFrame
columns = [
    'return_id', 'return_date', 'product', 'return_units', 'return_reason',
    'reason_category', 'delivery_delay_hours', 'product_batch_id',
    'resolved_by_team'
]

df_returns = pd.DataFrame(data, columns=columns)

# Step 5: Export
df_returns.to_csv("returns_data.csv", index=False)

# Preview
print("✅ returns_data.csv generated with", len(df_returns), "rows.")
df_returns.head()


✅ returns_data.csv generated with 30000 rows.


Unnamed: 0,return_id,return_date,product,return_units,return_reason,reason_category,delivery_delay_hours,product_batch_id,resolved_by_team
0,RTN000001,2024-10-10,Milk,1,Damaged Packaging,Packaging,2.4,BTCH018290,Quality Control
1,RTN000002,2025-02-13,Milk,9,Spoiled,Quality,5.9,BTCH004166,Logistics
2,RTN000003,2024-09-15,Milk,4,Late Delivery,Delivery,5.1,BTCH003479,Quality Control
3,RTN000004,2025-03-31,Paneer,9,Wrong Product,Operations,2.2,BTCH077237,Customer Care
4,RTN000005,2024-11-13,Milk,3,Wrong Product,Operations,3.4,BTCH020380,Logistics
