## Below are the requirements of Dataset Generation

Day of Billing Date: 100 days
Brand: Aiba, Biba, Ciba
Article: 10 digits, may not be unique
Site: ST01, ST02 upto ST40
Site Area : A, B, C
MRP: (999, 9999)
Promotion Name: P1, P2, P3...P40
Region: R1, R2, R3, R4
POS Billing Quantity: (-1 :2%, 0:8%, 1:90%)
POS Cost: POS Billin Quantity * rand(45% to 50%)
POS Gross Sales: POS Billing Quantity x MRP
POS Total Discount: POS Gross Sales * rand (0% to 30%)
POS Tax amount: (POS GRoss Sales - POS Total Discount)*5%
POS Net Sales WOT: POS Gross Sales - POS Total Discount - POS Tax amount
POS RGM : POS Net Sales WOT - POS Cost
Vendor : V1, V2, V3...V10
Vendor Code: CV1, CV2...corresponding to vendor V1, V2...
Fiber: Cotton, Silk, Cotton/Silk, Linen, Modal 
Fabric:Plian: 80%, Georgette: 2%, ChiffoN: 2% ,Organza: 2%, Crepe: 4%, Satin: 7%, Other:3%
Brand Tag: Concat of Brand + Fabric + Saree
Wash Care Instructions: Silk, Cotton Silk: Dry Clean only; cotton, Linen, Modal: Handwash Separately
C Technique: C1....C10
Broad_craft: B1: for C1 to C3, B2: C4 to C6, B3: C7 to C9, B4: C10
Handloom Mark: yes,no
Silk Mark: For fiber of Silk : yes, other: no
Type: Woven, Printed, Tie-Dyed, Embroidered
Style Code: Corresponding to Article: 7 charaters
craft Code: 3 Codes for every C Technique, eg. for C1 the craft codes are C11, C12, C13
Site Name: Concat of Site and Site Area
Bin: [1000,2000,3000,4000,5000,6000,7000,8000,9000,10000]


In [19]:
import pandas as pd
import random

# Set the number of rows for the dataset
num_rows = 1000

# Initialize lists to store data for each field
billing_date = [pd.to_datetime("2023-01-01") + pd.Timedelta(days=random.randint(0, 100)) for _ in range(num_rows)]
brand = random.choices(["Aiba", "Biba", "Ciba"], k=num_rows)
article = [str(random.randint(1000000000, 9999999999)) for _ in range(num_rows)]
site = [f"ST{random.randint(1, 40):02d}" for _ in range(num_rows)]
site_area = random.choices(["A", "B", "C"], k=num_rows)
mrp = [random.randint(999, 9999) for _ in range(num_rows)]
promotion_name = random.choices([f"P{i}" for i in range(1, 41)], k=num_rows)  # Corrected parentheses position
region = random.choices(["R1", "R2", "R3", "R4"], k=num_rows)
pos_billing_quantity = random.choices([-1, 0, 1], weights=[0.02, 0.08, 0.9], k=num_rows)
pos_cost = [qty * random.uniform(0.45, 0.5) for qty in pos_billing_quantity]
pos_gross_sales = [qty * price for qty, price in zip(pos_billing_quantity, mrp)]
pos_total_discount = [sales * random.uniform(0, 0.3) for sales in pos_gross_sales]
pos_tax_amount = [(sales - discount) * 0.05 for sales, discount in zip(pos_gross_sales, pos_total_discount)]
pos_net_sales_wot = [sales - discount - tax for sales, discount, tax in zip(pos_gross_sales, pos_total_discount, pos_tax_amount)]
pos_rgm = [net_sales - cost for net_sales, cost in zip(pos_net_sales_wot, pos_cost)]
vendor = [f"V{i}" for i in range(1, 11)] * ((num_rows + 9) // 10)  # Repeating vendors to match the number of rows
fiber = random.choices(["Cotton", "Silk", "Cotton/Silk", "Linen", "Modal"], weights=[0.8, 0.02, 0.02, 0.02, 0.04], k=num_rows)
fabric = random.choices(["Plain", "Georgette", "Chiffon", "Organza", "Crepe", "Satin", "Other"],
                        weights=[0.8, 0.02, 0.02, 0.02, 0.04, 0.07, 0.03], k=num_rows)
type = random.choices(["Woven", "Printed", "Tie-Dyed", "Embroidered"], k=num_rows)
style_code = [a[:7] for a in article]
craft_code = [f"{c}{i}" for c in ["C1", "C2", "C3", "C4", "C5", "C6", "C7", "C8", "C9", "C10"] for i in range(1, (num_rows // 10) + 1)]
site_name = [f"{s}{a}" for s, a in zip(site, site_area)]
bin_edges = [1000, 2000, 3000, 4000, 5000, 6000, 7000, 8000, 9000, 10000]
bin_labels = [f"{bin_edges[i]}-{bin_edges[i+1]-1}" for i in range(len(bin_edges)-1)]
bin = pd.cut(mrp, bins=bin_edges, labels=bin_labels, right=False)

# Create the DataFrame
data = {
    "Billing Date": billing_date,
    "Brand": brand,
    "Article": article,
    "Site": site,
    "Site Area": site_area,
    "MRP": mrp,
    "Promotion Name": promotion_name,
    "Region": region,
    "POS Billing Quantity": pos_billing_quantity,
    "POS Cost": pos_cost,
    "POS Gross Sales": pos_gross_sales,
    "POS Total Discount": pos_total_discount,
    "POS Tax Amount": pos_tax_amount,
    "POS Net Sales WOT": pos_net_sales_wot,
    "POS RGM": pos_rgm,
    "Vendor": vendor,
    "Fiber": fiber,
    "Fabric": fabric,
    "Brand Tag": [f"{b} {f} Saree" for b, f in zip(brand, fabric)],
    "Wash Care Instructions": ["Dry Clean only" if f in ["Silk", "Cotton/Silk"] else "Handwash Separately" for f in fiber],
    "C Technique": craft_code,
    "Broad Craft": ["B1", "B1", "B1", "B2", "B2", "B2", "B3", "B3", "B3", "B4"] * (num_rows // 10),
    "Handloom Mark": random.choices(["yes", "no"], k=num_rows),
    "Silk Mark": ["yes" if f == "Silk" else "no" for f in fiber],
    "Type": type,
    "Style Code": style_code,
    "Craft Code": craft_code,
    "Site Name": site_name,
    "Bin": bin
}

df = pd.DataFrame(data)

# Save the DataFrame to a CSV file
df.to_csv("sample_dataset.csv", index=False)
df

Unnamed: 0,Billing Date,Brand,Article,Site,Site Area,MRP,Promotion Name,Region,POS Billing Quantity,POS Cost,...,Wash Care Instructions,C Technique,Broad Craft,Handloom Mark,Silk Mark,Type,Style Code,Craft Code,Site Name,Bin
0,2023-03-27,Ciba,1065034573,ST13,B,8280,P36,R4,1,0.493027,...,Handwash Separately,C11,B1,yes,no,Printed,1065034,C11,ST13B,8000-8999
1,2023-03-29,Biba,9598533596,ST23,B,5277,P16,R2,-1,-0.468308,...,Handwash Separately,C12,B1,yes,no,Woven,9598533,C12,ST23B,5000-5999
2,2023-03-24,Biba,7129657173,ST16,A,9764,P1,R3,0,0.000000,...,Handwash Separately,C13,B1,yes,no,Woven,7129657,C13,ST16A,9000-9999
3,2023-02-21,Aiba,7327684597,ST04,A,4566,P20,R4,0,0.000000,...,Handwash Separately,C14,B2,no,no,Embroidered,7327684,C14,ST04A,4000-4999
4,2023-01-01,Aiba,3965215473,ST27,A,8286,P11,R1,1,0.491225,...,Handwash Separately,C15,B2,no,no,Embroidered,3965215,C15,ST27A,8000-8999
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,2023-01-15,Ciba,9770837027,ST22,A,9851,P29,R3,1,0.493591,...,Handwash Separately,C1096,B2,no,no,Printed,9770837,C1096,ST22A,9000-9999
996,2023-02-16,Ciba,4841567228,ST03,B,7362,P20,R4,0,0.000000,...,Handwash Separately,C1097,B3,yes,no,Printed,4841567,C1097,ST03B,7000-7999
997,2023-02-25,Aiba,7923484470,ST35,C,2684,P19,R2,1,0.472107,...,Dry Clean only,C1098,B3,yes,no,Embroidered,7923484,C1098,ST35C,2000-2999
998,2023-03-08,Ciba,9654156889,ST04,B,5325,P28,R2,1,0.465520,...,Handwash Separately,C1099,B3,no,no,Tie-Dyed,9654156,C1099,ST04B,5000-5999
