In [39]:
import os
import pandas as pd
import numpy as np
from scipy.optimize import linear_sum_assignment
import datetime as dt
from collections import defaultdict, Counter
from tqdm import tqdm
import matplotlib.pyplot as plt
import datetime as dt
plt.rcParams['figure.figsize'] = [16, 10]
plt.rcParams['font.size'] = 16
import seaborn as sns

In [3]:
import pandas as pd
from keras.layers import Dense
from keras.models import Sequential

child_wishes = pd.read_csv('/Users/suzukishinji/kaggle/christmas/child_wishlist_v2.csv', header=None)
targets = pd.read_csv('/Users/suzukishinji/kaggle/christmas/sample_submission_random_v2.csv')['GiftId']
child_wishes['target'] = targets

In [4]:
train_data = pd.DataFrame()
valid_data = pd.DataFrame()
for gift_id in range(1000):
    train_split = child_wishes.loc[child_wishes['target'] == gift_id].iloc[:80]
    valid_split = child_wishes.loc[child_wishes['target'] == gift_id].iloc[80:100]
    train_data = train_data.append(train_split)
    valid_data = valid_data.append(valid_split)

# Shuffle the training data
train_data = train_data.sample(frac=1)

In [5]:
y_train = pd.get_dummies(train_data['target']).values
X_train = train_data.drop('target', axis=1).values
y_valid = pd.get_dummies(valid_data['target']).values
X_valid = valid_data.drop('target', axis=1).values

print('Shapes: X_train: %s, y_train: %s, X_valid: %s, y_valid: %s' % 
      (X_train.shape, y_train.shape, X_valid.shape, y_valid.shape))

Shapes: X_train: (80000, 101), y_train: (80000, 1000), X_valid: (20000, 101), y_valid: (20000, 1000)


In [8]:
model = Sequential()
model.add(Dense(200, activation='relu', input_shape=(101,)))
model.add(Dense(200, activation='relu'))
model.add(Dense(1000, activation='softmax'))
model.compile(optimizer='rmsprop',
              loss='categorical_crossentropy',
              metrics=['accuracy'])
model.fit(X_train, y_train, validation_data=(X_valid, y_valid), epochs=2)

Train on 80000 samples, validate on 20000 samples
Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x1218e0cc0>

In [9]:
X_all = child_wishes.drop('target', axis=1).values
predicted_probs = model.predict_proba(X_all)



In [11]:
child_to_gift = {}
# Keep track of how many gifts there are left to give
gift_counts = dict((gift_id, 1000) for gift_id in range(1000))
available_gifts = gift_counts.keys()

for i, gift_probs in enumerate(predicted_probs):
    child_id = child_wishes.iloc[i, 0]

    # Ignore children (twins) that already have a gift
    if child_id in child_to_gift:
        continue

    candidate_gifts = available_gifts
    # If this is a twin we need the gift two times
    if child_id < 4000:
        candidate_gifts = [g for g in available_gifts if gift_counts[g] >= 2]

    # Get the candidate gift with the highest probability
    gift_id = max(candidate_gifts, key=lambda gift_id: gift_probs[gift_id])

    child_to_gift[child_id] = gift_id
    gift_counts[gift_id] -= 1

    # If this is a twin, assign the gift to his other sibling as well
    if child_id < 4000:
        sibling_id = child_id + 1 if child_id % 2 == 0 else child_id - 1
        child_to_gift[sibling_id] = gift_id
        gift_counts[gift_id] -= 1

    # Recalculate the available gifts
    available_gifts = [g for g in available_gifts if gift_counts[g] > 0]
    
pred = sorted(child_to_gift.items(), key= lambda t: t[0])

In [17]:
type(pred)

list

In [26]:
y_pred = pd.DataFrame(pred)

In [41]:
y_pred.shape

(1000000, 2)

In [38]:
df_result = pd.DataFrame()
df_result["ChildId"] = y_pred[0]
df_result["GiftId"] = y_pred[1]
df_result.head()
df_result.to_csv("submit_1225_3.csv",index = False)

In [42]:
submit_1 = pd.read_csv('/Users/suzukishinji/kaggle/christmas/submit_1225_3.csv')
submit_1

Unnamed: 0,ChildId,GiftId
0,0,749
1,1,749
2,2,749
3,3,749
4,4,40
5,5,40
6,6,40
7,7,40
8,8,985
9,9,985


In [13]:
import numpy as np
from collections import Counter

n_children = 1000000  # n children to give
n_gift_type = 1000  # n types of gifts available
n_gift_quantity = 1000  # each type of gifts are limited to this quantity
n_gift_pref = 10  # number of gifts a child ranks
n_child_pref = 1000  # number of children a gift ranks
twins = int(0.04 * n_children)  # 0.4% of all population, rounded to the closest even number
ratio_gift_happiness = 2
ratio_child_happiness = 2


def avg_normalized_happiness(pred, child_pref, gift_pref):
    # check if number of each gift exceeds n_gift_quantity
    gift_counts = Counter(elem[1] for elem in pred)
    for count in gift_counts.values():
        assert count <= n_gift_quantity

    # check if twins have the same gift
    for t1 in range(0, twins, 2):
        twin1 = pred[t1]
        twin2 = pred[t1 + 1]
        assert twin1[1] == twin2[1]

    max_child_happiness = n_gift_pref * ratio_child_happiness
    max_gift_happiness = n_child_pref * ratio_gift_happiness
    total_child_happiness = 0
    total_gift_happiness = np.zeros(n_gift_type)

    for row in pred:
        child_id = row[0]
        gift_id = row[1]

        # check if child_id and gift_id exist
        assert child_id < n_children
        assert gift_id < n_gift_type
        assert child_id >= 0
        assert gift_id >= 0
        child_happiness = (n_gift_pref - np.where(gift_pref[child_id] == gift_id)[0]) * ratio_child_happiness
        if not child_happiness:
            child_happiness = -1

        gift_happiness = (n_child_pref - np.where(child_pref[gift_id] == child_id)[0]) * ratio_gift_happiness
        if not gift_happiness:
            gift_happiness = -1

        total_child_happiness += child_happiness
        total_gift_happiness[gift_id] += gift_happiness

    # print(max_child_happiness, max_gift_happiness
    print('normalized child happiness=',
          float(total_child_happiness) / (float(n_children) * float(max_child_happiness)), \
          ', normalized gift happiness', np.mean(total_gift_happiness) / float(max_gift_happiness * n_gift_quantity))
    return float(total_child_happiness) / (float(n_children) * float(max_child_happiness)) + np.mean(
        total_gift_happiness) / float(max_gift_happiness * n_gift_quantity)

In [18]:
gift_pref = pd.read_csv('/Users/suzukishinji/kaggle/christmas/gift_goodkids_v2.csv', header=None).drop(0, 1).values
child_pref = pd.read_csv('/Users/suzukishinji/kaggle/christmas/child_wishlist_v2.csv', header=None).drop(0, 1).values

In [25]:
gift_pref.shape

(1000, 1000)

In [20]:
score = avg_normalized_happiness(pred, child_pref, gift_pref)

IndexError: index 1000 is out of bounds for axis 0 with size 1000

In [None]:
print(score)

In [None]:
df_result = pd.DataFrame()
df_result["ChildId"] = test_air_store_id
df_result["GiftId"] = predicted_probs
df_result.head()
df_result.to_csv("submit_1225_3.csv",index = False)

In [69]:
def my_avg_normalized_happiness(pred):
    total_child_happiness = 0
    total_gift_happiness = np.zeros(1000)
    for c, g in pred:
        total_child_happiness +=  -CHILD_HAPPINESS[c][g]
        total_gift_happiness[g] += -GIFT_HAPPINESS[g][c]
    nch = total_child_happiness / N_CHILDREN
    ngh = np.mean(total_gift_happiness) / 1000
    print('normalized child happiness', nch)
    print('normalized gift happiness', ngh)
    return nch + ngh

In [70]:
def optimize_block(child_block, current_gift_ids):
    gift_block = current_gift_ids[child_block]
    C = np.zeros((BLOCK_SIZE, BLOCK_SIZE))
    for i in range(BLOCK_SIZE):
        c = child_block[i]
        for j in range(BLOCK_SIZE):
            g = GIFT_IDS[gift_block[j]]
            C[i, j] = CHILD_HAPPINESS[c][g] + GIFT_HAPPINESS[g][c]
    row_ind, col_ind = linear_sum_assignment(C)
    return (child_block[row_ind], gift_block[col_ind])

In [71]:
BLOCK_SIZE = 400
INITIAL_SUBMISSION = '/Users/suzukishinji/kaggle/christmas/sample_submission_random_v2.csv'
N_BLOCKS = (N_CHILDREN - TWINS) / BLOCK_SIZE
print('Block size: {}, n_blocks {}'.format(BLOCK_SIZE, N_BLOCKS))

Block size: 400, n_blocks 2400.0


In [72]:
subm = pd.read_csv(INITIAL_SUBMISSION)
initial_anh = my_avg_normalized_happiness(subm[['ChildId', 'GiftId']].values.tolist())
print(initial_anh)
subm['gift_rank'] = subm.groupby('GiftId').rank() - 1
subm['gift_id'] = subm['GiftId'] * 1000 + subm['gift_rank']
subm['gift_id'] = subm['gift_id'].astype(np.int64)
current_gift_ids = subm['gift_id'].values

normalized child happiness 0.04621203000047112
normalized gift happiness -4.59355e-05
0.0461660945005


In [None]:
start_time = dt.datetime.now()
for i in range(1):
    child_blocks = np.split(np.random.permutation(range(TWINS, N_CHILDREN)), N_BLOCKS)
    for child_block in tqdm(child_blocks[:500]):
        cids, gids = optimize_block(child_block, current_gift_ids=current_gift_ids)
        current_gift_ids[cids] = gids
    subm['GiftId'] = GIFT_IDS[current_gift_ids]
    anh = my_avg_normalized_happiness(subm[['ChildId', 'GiftId']].values.tolist())
    end_time = dt.datetime.now()
    print(i, anh, (end_time-start_time).total_seconds())
subm[['ChildId', 'GiftId']].to_csv("submit_1225_2.csv",index = False)


  0%|          | 0/500 [00:00<?, ?it/s][A
Exception in thread Thread-4:
Traceback (most recent call last):
  File "/Users/suzukishinji/anaconda/lib/python3.6/threading.py", line 916, in _bootstrap_inner
    self.run()
  File "/Users/suzukishinji/anaconda/lib/python3.6/site-packages/tqdm/_tqdm.py", line 144, in run
    for instance in self.tqdm_cls._instances:
  File "/Users/suzukishinji/anaconda/lib/python3.6/_weakrefset.py", line 60, in __iter__
    for itemref in self.data:
RuntimeError: Set changed size during iteration

100%|██████████| 500/500 [26:39<00:00,  3.20s/it]


In [None]:
submit_1 = pd.read_csv('/Users/suzukishinji/kaggle/recluit/submit_1225_2.csv')
submit_1.head()