In [1]:
# FILE:    marbles.py
# PURPOSE: Simulate discrete sampling from a bowl of marbles
# AUTHOR:  Mick de Neeve <mick@live.nl>
# DATE:    March 29, 2025


import random


# variables:  module-level state shared by the functions below.
#             bowl - the literal marbles, one list entry per marble (explicit case);
#             stat - the distribution, mapping each colour to its marble count;
#             samp - the recorded sample, mapping each colour to its draw count.
bowl = list()
stat = dict()
samp = dict()


# init(S):
#   Set the dictionaries for the basic marble statistics (e.g. S = {"red":1000,
#   "white":2000, "blue":3000}) and for the samples (in this case samp becomes
#   {"red":0, "white":0, "blue":0}).  Whatever an earlier call left behind is
#   discarded first, so re-initialising with other colours leaves no stale
#   entries in stat or samp.  Both dictionaries are changed in place; nothing is
#   returned.
def init(S):

    # Copy before clearing, so that passing stat itself as S still works.
    fresh = dict(S.items())

    # Empty in place rather than rebinding, so that references held elsewhere to
    # these module-level dictionaries keep seeing the current data.
    stat.clear()
    stat.update(fresh)

    samp.clear()
    samp.update(dict.fromkeys(stat, 0))

# fill()
#   Fill the bowl of marbles according to the statistics in global: stat.  The bowl
#   is emptied first, so that calling fill() again (for instance after a new init)
#   leaves exactly stat[colour] marbles of every colour instead of piling a second
#   load on top of the first.  The bowl is changed in place; nothing is returned.
def fill():

    # Empty in place rather than rebinding, so that references held elsewhere to
    # the module-level bowl stay valid.
    bowl.clear()

    for colour, count in stat.items():
        bowl.extend([colour] * count)


# rands(seed, nr)
#   Generate a random list of nr indices into the bowl, to pick elements
#   explicitly.  The random generator is seeded with seed first, so the same seed,
#   nr and bowl size always give the same list.  If nr exceeds the total marble
#   count recorded in stat, -1 is returned instead of a list.
def rands(seed, nr):

    random.seed(seed)

    capacity = sum(stat.values())
    if nr > capacity:
        return -1

    size = len(bowl)
    return [random.randrange(size) for _ in range(nr)]


# cprob()
#   Compute cumulative probability thresholds from the bowl statistics in stat.
#   Returns a dictionary that maps each colour, in stat's order, to the share of
#   all marbles taken up by that colour and every colour before it; for
#   {"red":1000, "white":2000, "blue":3000} that is red=1/6, white=0.5, blue=1.0.
#   An empty stat gives an empty dictionary; colours whose counts add up to zero
#   raise ZeroDivisionError.
def cprob():

    total = sum(stat.values())      # Total element count
    running = 0                     # Cumulative element count so far
    bounds = {}                     # Dictionary to store cumulative probabilities

    for key, count in stat.items():
        # Accumulate the counts and divide once per colour, rather than adding
        # up the fractions: summed fractions pick up rounding error (ten equal
        # colours end at 0.9999999999999999), whereas this way the last bound
        # is total/total, i.e. exactly 1.0.
        running += count
        bounds[key] = running / total

    return bounds


# probs(seed, nr)
#   Generate a random list of nr probabilities, to pick elements implicitly.  The
#   random generator is seeded with seed first, so the same seed and nr always
#   give the same list.  If nr exceeds the total marble count recorded in stat,
#   -1 is returned instead of a list.
def probs(seed, nr):

    random.seed(seed)

    capacity = sum(stat.values())
    if nr > capacity:
        return -1

    values = []
    for _ in range(nr):
        values.append(random.uniform(0, 1))
    return values


# pick(ind)
#   Explicit sampling version; elements are picked from a literally filled bowl.
#   For every index in ind, the colour of the marble at that position in the bowl
#   has its count in samp raised by one.  Nothing is returned.
def pick(ind):

    for position in ind:
        colour = bowl[position]
        samp[colour] = samp[colour] + 1


# draw(probs, bounds)
#   Implicit sampling version; elements are picked according to probability bounds.
#   Each value in probs counts towards the first colour in bounds whose cumulative
#   bound is not below it, and that colour's count in samp is raised by one.  A
#   value above every bound counts towards the last colour: rounding can leave the
#   final bound a hair under 1.0, and without this such a draw would silently
#   vanish from the sample.  With empty bounds nothing is counted.  Nothing is
#   returned.
def draw(probs, bounds):

    if not bounds:
        return

    # Colour that receives any value lying above every bound.
    fallback = list(bounds)[-1]

    for p in probs:
        colour = next((c for c, bound in bounds.items() if p <= bound), fallback)
        samp[colour] += 1


In [None]:
We test what we have (explicit method):

    Marble probabilities: red=0.167, white=0.333, blue=0.500
    
    Initialise:           marbles.init({"red":1000, "white":2000, "blue":3000})
    Fill the bowl:        marbles.fill(); marbles.bowl
    Generate randoms:     R = marbles.rands(42, 200)
    Pick marbles:         marbles.pick(R)
    Inspect data:         marbles.samp == {'red': 37, 'white': 69, 'blue': 94}
    
    Sample probabilities: red=0.185, white=0.345, blue=0.470 (tot=1.000)
    
We test what we added (implicit method):

    Initialise:           marbles.init({"red":1000, "white":2000, "blue":3000})
    Generate cumulatives: C = marbles.cprob()
    Inspect cumulatives:  C == {'red': 0.16666666666666666, 'white': 0.5, 'blue': 1.0}
    Generate randoms:     P = marbles.probs(42, 200)
    Draw marbles:         marbles.draw(P, C)
    Inspect data:         marbles.samp == {'red': 33, 'white': 70, 'blue': 97}
    
    Sample probabilities: red=0.165, white=0.350, blue=0.485 (tot=1)

We try to increase the data:

    Rainbow colours: red, orange, yellow, green, blue, indigo, violet.
    Pick numbers: 1000, 2000, 3000, 4000, 5000, 6000, 7000 = 28000 marbles.

    Marble probabilities: red=0.0357, orange=0.0714, yellow=0.1071, green=0.1429,
                          blue=0.1786, indigo=0.2143, violet=0.2500

    Initialise: marbles.init({"red":1000, "orange":2000, "yellow":3000, "green":4000,
                              "blue":5000, "indigo":6000, "violet":7000})
    Size: 1000

    Explicit: {'red': 33, 'orange': 72, 'yellow': 103, 'green': 154, 'blue': 173,
               'indigo': 206, 'violet': 259}
    Probs: red=0.033, orange=0.072, yellow=0.103, green=0.154, blue=0.173, indigo=0.206,
           violet=0.259
    
    Implicit: C == {'red': 0.03571428571428571, 'orange': 0.10714285714285714,
                    'yellow': 0.21428571428571427, 'green': 0.3571428571428571,
                    'blue': 0.5357142857142857, 'indigo': 0.75, 'violet': 1.0}
    Samp == {'red': 32, 'orange': 65, 'yellow': 98, 'green': 151, 'blue': 161,
             'indigo': 236, 'violet': 257}
    Probs: red=0.032, orange=0.065, yellow=0.098, green=0.151, blue=0.161, indigo=0.236,
           violet=0.257
