In [1]:
!pip install scikit-learn==1.1.1

Collecting scikit-learn==1.1.1
  Downloading scikit_learn-1.1.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (31.2 MB)
     |████████████████████████████████| 31.2 MB 21.0 MB/s            ████████████▎  | 28.5 MB 21.0 MB/s eta 0:00:01
Installing collected packages: scikit-learn
  Attempting uninstall: scikit-learn
    Found existing installation: scikit-learn 0.24.1
    Uninstalling scikit-learn-0.24.1:
      Successfully uninstalled scikit-learn-0.24.1
Successfully installed scikit-learn-1.1.1
You should consider upgrading via the '/opt/app-root/bin/python3.8 -m pip install --upgrade pip' command.[0m


In [2]:
import os
import random

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Fixed seed for Python's `random` module, which drives the stock sampling in
# generate_data_single_cat, so a fresh top-to-bottom run is repeatable.
SEED = 0
random.seed(SEED)

In [3]:
# Kept for reference only: the names of the product categories.
category_names = [
    'Bottle',
    'Pen',
    'Clothing',
    'Drink',
    'Footwear',
]

In [4]:
class Category:
    """
    Product category used for synthetic data generation.

    name         = category name
    max_stock    = upper bound used when sampling a stock level
    max_discount = float discount level of the category; taken from the
                   `discount_level` constructor argument
    """
    def __init__(self, name, max_stock, discount_level):
        # Note the rename: `discount_level` is exposed as `max_discount`.
        self.name, self.max_stock, self.max_discount = (
            name,
            max_stock,
            discount_level,
        )

In [20]:
def choose_discount(sample, max_, max_disc, offset=.5):
    """
    Choose a discount from the fraction of maximum stock on hand.

    sample   = current stock level
    max_     = maximum stock level (must be non-zero, it is the divisor)
    max_disc = discount scale, multiplied by the stock fraction
    offset   = constant added on top of the scaled discount; defaults to .5,
               the value that used to be hard-coded

    Returns (sample / max_) * max_disc + offset, rounded to 2 decimals.
    With the default offset and non-negative inputs the result is at least
    .5, so it can exceed max_disc. Pass offset=0 to stay within max_disc.
    """
    pct_stock = sample / max_
    return round(pct_stock * max_disc + offset, 2)

In [11]:
def generate_data_single_cat(cat, num_samples):
    """
    Generate synthetic data points for a single category.

    cat         = Category object; its name, max_stock and max_discount are read
    num_samples = number of data points to generate

    Returns a list of [name, stock, discount] rows.
    """
    category_data = []
    for _ in range(num_samples):
        # randint includes both ends, so stock ranges over 0..max_stock.
        sample_stock = random.randint(0, cat.max_stock)
        # choose_discount derives the stock fraction itself, so it is not
        # computed here as well.
        discount = choose_discount(sample_stock, cat.max_stock, cat.max_discount)
        category_data.append([cat.name, sample_stock, discount])
    return category_data

In [12]:
# Quick check: 10 data points for a throwaway 'hat' category.
z = generate_data_single_cat(
    cat=Category(name='hat', max_stock=100, discount_level=.20),
    num_samples=10,
)

In [13]:
# Convert the list of rows to a NumPy array so a single column can be sliced.
# The rows mix str/int/float, so NumPy stores every element as a string.
z = np.asarray(z)

In [15]:
# Column index 2 holds the discount of each generated row.
print(z[:,2])

['0.62' '0.59' '0.65' '0.55' '0.63' '0.53' '0.57' '0.53' '0.69' '0.52']


In [16]:
def generate_data_all_cats(categories, samples_per):
    """
    Build one flat list of data points covering every given category,
    with 'samples_per' points generated per category, in input order.
    """
    return [
        row
        for category in categories
        for row in generate_data_single_cat(category, samples_per)
    ]

In [17]:
# One Category per product line. `discount_level` is the factor that
# choose_discount multiplies by the stock fraction.
cats = [
    Category(name='Bottle', max_stock=100, discount_level=.2),
    Category(name='Pen', max_stock=1000, discount_level=.5),
    Category(name='Clothing', max_stock=500, discount_level=.4),
    Category(name='Drink', max_stock=100, discount_level=.2),
    Category(name='Footwear', max_stock=50, discount_level=.3),
]

In [18]:
# 400 generated rows for each category in `cats`.
X = generate_data_all_cats(categories=cats, samples_per=400)

In [19]:
# Total number of rows, followed by one sample row from each category's
# block of 400 consecutive rows.
print(len(X))
print(*(X[i] for i in (0, 401, 801, 1201, 1601)))

2000
['Bottle', 79, 0.66] ['Pen', 296, 0.65] ['Clothing', 218, 0.67] ['Drink', 56, 0.61] ['Footwear', 16, 0.6]


In [21]:
# Assemble the generated rows into a DataFrame and persist them as CSV.
columns = ['Name', 'Stock', 'Discount']
df = pd.DataFrame(X, columns=columns)

# Create the output directory first so the export also works on a fresh
# checkout; to_csv does not create missing parent directories.
os.makedirs("discount_data", exist_ok=True)
# The DataFrame index is written as the first, unnamed CSV column (to_csv default).
df.to_csv("discount_data/arg-synth-data_1.csv")