Fast computation of expected value for bags.  Uses numpy vectorization.

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

from subprocess import check_output
# Show which data files are available by listing the "../input" directory.
input_listing = check_output(["ls", "../input"])  # raw bytes written by `ls`
print(input_listing.decode("utf8"))

# Any results you write to the current directory are saved as output.

First, some definitions

In [None]:
# Names of the gift types.  A gift's position in this list is its integer id.
gift_types = [
    'horse', 'ball', 'bike', 'train', 'coal',
    'book', 'doll', 'block', 'gloves',
]
ngift_types = len(gift_types)

# One integer constant per gift type (horse == 0, ..., gloves == 8), in the
# same order as `gift_types`.
(horse, ball, bike, train, coal,
 book, doll, block, gloves) = range(ngift_types)


Let's look at bags composed of a single gift type.  We use a vectorized version of the original numpy distributions.  

In [None]:
def gift_escore(gift, ngift, n=1000):
    """Estimate the expected score of a bag holding `ngift` gifts of one type.

    Simulates `n` bags, sums the `ngift` sampled gift weights of each bag,
    scores every bag heavier than 50 as 0, and returns the mean score.

    Parameters
    ----------
    gift : int
        Gift type id; compared against the module-level constants
        `horse`, `ball`, `bike`, `train`, `coal`, `book`, `doll`, `block`
        and `gloves`.
    ngift : int
        Number of toys in the bag.
    n : int, optional
        Number of simulated bags (samples).

    Returns
    -------
    float or numpy.ndarray
        Mean score over the `n` simulated bags.  When `ngift == 0` the
        value `np.array([0.0])` is returned instead of a scalar.

    Raises
    ------
    ValueError
        If `gift` does not match any known gift type.

    Notes
    -----
    The global numpy random state is reseeded with 2016 on every call that
    samples, so results are reproducible but the global state is reset as a
    side effect.
    """
    if ngift == 0:
        return np.array([0.0])
    np.random.seed(2016)
    shape = (n, ngift)
    if gift == horse:
        dist = np.maximum(0, np.random.normal(5, 2, shape)).sum(axis=1)
    elif gift == ball:
        dist = np.maximum(0, 1 + np.random.normal(1, 0.3, shape)).sum(axis=1)
    elif gift == bike:
        dist = np.maximum(0, np.random.normal(20, 10, shape)).sum(axis=1)
    elif gift == train:
        dist = np.maximum(0, np.random.normal(10, 5, shape)).sum(axis=1)
    elif gift == coal:
        dist = 47 * np.random.beta(0.5, 0.5, shape).sum(axis=1)
    elif gift == book:
        dist = np.random.chisquare(2, shape).sum(axis=1)
    elif gift == doll:
        dist = np.random.gamma(5, 1, shape).sum(axis=1)
    elif gift == block:
        dist = np.random.triangular(5, 10, 20, shape).sum(axis=1)
    elif gift == gloves:
        # The three draws must stay in this order so the seeded random
        # stream yields the same samples as before.
        gloves1 = 3.0 + np.random.rand(n, ngift)
        gloves2 = np.random.rand(n, ngift)
        gloves3 = np.random.rand(n, ngift)
        dist = np.where(gloves2 < 0.3, gloves1, gloves3).sum(axis=1)
    else:
        # Previously an unknown id fell through every test and failed later
        # with an UnboundLocalError on `dist`.
        raise ValueError('unknown gift type: {!r}'.format(gift))
    # Bags with weight above 50 score nothing.
    dist = np.where(dist <= 50.0, dist, 0.0)
    return dist.mean()

Let's find a reasonable upper bound on the number of gifts in a bag.  For this we compute the expected score for bags with an increasing number of toys until the score decreases (by more than a small tolerance, `epsilon`).  The number of toys at which this happens is taken as the upper bound for that gift type.  This is fine when optimizing the expected value, as adding more toys beyond that point uses up toys without improving the objective function.

In [None]:
# Allowed drop in the estimated score from one toy count to the next before
# the search for a gift type stops.
epsilon = 1

# Per gift type: the toy count at which the search stopped, and the last
# score that was accepted before stopping.
max_type = np.zeros(ngift_types, dtype='int')
max_value = np.zeros(ngift_types)

for gift, gift_name in enumerate(gift_types):
    print(gift_name, end=': ')
    best_value = 0.0
    for j in range(1, 100):
        value = gift_escore(gift, j)
        if value < best_value - epsilon:
            # The score fell by more than the tolerance: stop at this count.
            break
        best_value = value
    # j is the count that triggered the break (or 99 if none did).
    max_type[gift] = j
    max_value[gift] = best_value
    print(j)
    

We can now look at the escore of more general bag types.  First we precompute the weights of bags with a single gift type; the code is similar to the one above.

For each gift type, we create a 2D array with nsample rows and max_type[gift] columns.  Column j contains the weights of a bag made of j+1 toys of the given gift type.

In [None]:
# Number of simulated bags (rows) in every precomputed weight table.
nsample = 1000000


def weight_distributions_init(gift, ngift, n=nsample):
    """Sample cumulative bag weights for 1..ngift toys of a single gift type.

    Draws an `(n, ngift)` array of individual gift weights and turns each
    row into its running total, so that column `j` holds the weight of a
    bag made of the first `j + 1` toys of that row.

    Parameters
    ----------
    gift : int
        Gift type id; compared against the module-level constants
        `horse`, `ball`, `bike`, `train`, `coal`, `book`, `doll`, `block`
        and `gloves`.
    ngift : int
        Largest number of toys to tabulate (number of columns).
    n : int, optional
        Number of simulated bags (rows); defaults to `nsample`.

    Returns
    -------
    numpy.ndarray
        Array of shape `(n, ngift)` of cumulative weights.  When
        `ngift == 0` the value `np.array([0.0])` is returned instead.

    Raises
    ------
    ValueError
        If `gift` does not match any known gift type.

    Notes
    -----
    The global numpy random state is reseeded with 2016 on every call that
    samples, so results are reproducible but the global state is reset as a
    side effect.
    """
    if ngift == 0:
        return np.array([0.0])
    np.random.seed(2016)
    shape = (n, ngift)
    if gift == horse:
        dist = np.maximum(0, np.random.normal(5, 2, shape))
    elif gift == ball:
        dist = np.maximum(0, 1 + np.random.normal(1, 0.3, shape))
    elif gift == bike:
        dist = np.maximum(0, np.random.normal(20, 10, shape))
    elif gift == train:
        dist = np.maximum(0, np.random.normal(10, 5, shape))
    elif gift == coal:
        dist = 47 * np.random.beta(0.5, 0.5, shape)
    elif gift == book:
        dist = np.random.chisquare(2, shape)
    elif gift == doll:
        dist = np.random.gamma(5, 1, shape)
    elif gift == block:
        dist = np.random.triangular(5, 10, 20, shape)
    elif gift == gloves:
        # The three draws must stay in this order so the seeded random
        # stream yields the same samples as before.
        gloves1 = 3.0 + np.random.rand(n, ngift)
        gloves2 = np.random.rand(n, ngift)
        gloves3 = np.random.rand(n, ngift)
        dist = np.where(gloves2 < 0.3, gloves1, gloves3)
    else:
        # Previously an unknown id fell through every test and failed later
        # with an UnboundLocalError on `dist`.
        raise ValueError('unknown gift type: {!r}'.format(gift))
    # Running total along each row, written back into `dist`; this performs
    # the same left-to-right additions as an explicit column loop.
    np.cumsum(dist, axis=1, out=dist)
    return dist

# Precomputed cumulative weight table for every gift type, keyed by gift id.
# Each table has max_type[gift] columns.
all_weight_distributions = {}

for gift, gift_name in enumerate(gift_types):
    print(gift_name)
    all_weight_distributions[gift] = weight_distributions_init(gift, max_type[gift])

We can now compute the expected value of complex bags with lookups of the precomputed weight distributions.  With a slight change to the code it is easy to compute additional statistics like the variance of the weight.

In [None]:
def weight_distributions(gift, ngift):
    """Look up the sampled weights contributed by `ngift` toys of type `gift`.

    Returns 0 when `ngift` is zero or negative, and the constant 51 when
    `ngift` is at or beyond `max_type[gift]` (presumably so that the bag is
    pushed over the 50 weight limit applied by the caller).  Otherwise
    returns column `ngift - 1` of the precomputed table
    `all_weight_distributions[gift]`.
    """
    if ngift <= 0:
        # No toy of this type: contributes nothing.
        return 0
    beyond_bound = ngift >= max_type[gift]
    return 51 if beyond_bound else all_weight_distributions[gift][:, ngift - 1]

def bagtoy_score(nballs=0, nbikes=0, nblocks=0, nbooks=0, ncoal=0,
                 ndolls=0, ngloves=0, nhorses=0, ntrains=0):
    """Return the mean and standard deviation of the score of a mixed bag.

    Each argument is the number of toys of that type in the bag.  The
    per-type sampled weights are looked up with `weight_distributions` and
    added up sample by sample; samples whose total weight exceeds 50 are
    scored as 0.

    Returns
    -------
    tuple
        `(mean, std)` of the scored samples.
    """
    # Counts arranged by gift id: horse, ball, bike, train, coal, book,
    # doll, block, gloves.
    counts = (nhorses, nballs, nbikes, ntrains, ncoal,
              nbooks, ndolls, nblocks, ngloves)
    total = np.zeros(nsample)
    for gift_id in range(ngift_types):
        total += weight_distributions(gift_id, counts[gift_id])
    # Bags heavier than 50 score nothing.
    scored = np.where(total <= 50.0, total, 0.0)
    return scored.mean(), scored.std()

Let's try some examples

In [None]:
# Bag of 10 balls; displays the (mean, std) pair returned by bagtoy_score.
bagtoy_score(nballs=10)

In [None]:
# Bag of 28 gloves; displays the (mean, std) pair returned by bagtoy_score.
bagtoy_score(ngloves=28)

In [None]:
# Mixed bag: 1 bike, 1 doll and 3 gloves.
bagtoy_score(nbikes=1, ndolls=1, ngloves=3)

In [None]:
# Mixed bag: 6 balls plus one each of train, book, doll and block.
bagtoy_score(nballs=6, ntrains=1, nbooks=1, ndolls=1, nblocks=1)

It is fast

In [None]:
# Time a single mixed-bag evaluation with the IPython %timeit magic.
%timeit bagtoy_score(nballs=6, ntrains=1, nbooks=1, ndolls=1, nblocks=1)