- Starter submission from https://www.kaggle.com/vipito/santa-ip
- KT: https://www.kaggle.com/c/santa-workshop-tour-2019/discussion/119858
- Cost function: https://www.kaggle.com/nickel/santa-s-2019-fast-pythonic-cost-23-s

In [1]:
import numpy as np
import pandas as pd
from numba import njit, prange
from tqdm import notebook,tqdm
from tqdm import trange

In [2]:
# Family preference table, one row per family, indexed by family_id.
data = pd.read_csv('/kaggle/input/santa-workshop-tour-2019/family_data.csv', index_col='family_id')
family_size = data["n_people"].values.astype(np.int8)

# penalties[n, k] is the preference cost for a family of n people that is
# assigned its k-th choice (k = 0..9); column 10 is the cost used for a day
# that is not among the listed choices.  Each entry is a fixed part plus a
# per-person part, kept in two parallel coefficient lists.
_fixed_part = [0, 50, 50, 100, 200, 200, 300, 300, 400, 500, 500]
_per_person_part = [0, 0, 9, 9, 9, 18, 18, 36, 36, 36 + 199, 36 + 398]
penalties = np.asarray([
    [fixed + per_person * n for fixed, per_person in zip(_fixed_part, _per_person_part)]
    for n in range(family_size.max() + 1)
])

# family_cost_matrix[f, d] is the preference cost of sending family f to day
# d + 1.  Start every day at the "not one of the choices" cost (column 10 of
# the penalty table) ...
family_cost_matrix = np.repeat(
    penalties[data["n_people"].values, 10].reshape(-1, 1), 100, axis=1
)

# ... then overwrite the listed days, one choice rank at a time.  Ranks are
# processed in ascending order, so the write order per family is unchanged.
_choice_table = data.drop(columns="n_people")
for choice_order, column_name in enumerate(_choice_table.columns):
    chosen_days = _choice_table[column_name].values
    family_cost_matrix[data.index.values, chosen_days - 1] = penalties[
        data["n_people"].values, choice_order
    ]


# accounting_cost_matrix[n, diff] is the accounting cost of a day with
# occupancy n whose neighbouring day differs by diff people; negative values
# are clipped to zero.  The table covers n and diff in 0..499.
accounting_cost_matrix = np.array(
    [
        [
            max(0, (occupancy - 125.0) / 400.0 * occupancy ** (0.5 + gap / 50.0))
            for gap in range(500)
        ]
        for occupancy in range(500)
    ],
    dtype=float,
)


@njit(fastmath=True)
def cost_function(prediction, family_size=family_size, family_cost_matrix=family_cost_matrix, accounting_cost_matrix=accounting_cost_matrix):
    """Evaluate an assignment of families to days.

    Parameters
    ----------
    prediction : sequence of 1-based day numbers, one per family.
    family_size : number of people in each family (same order as prediction).
    family_cost_matrix : table indexed as [family, day - 1] giving the
        preference cost of that assignment.
    accounting_cost_matrix : table indexed as [occupancy, abs(difference to
        the next day's occupancy)].

    Returns
    -------
    Array of four values: [preference cost, accounting cost, number of days
    below MIN_OCCUPANCY, number of days above MAX_OCCUPANCY].

    NOTE(review): the accounting_cost_matrix lookup is not guarded here; the
    table built in this file is 500 x 500, so an occupancy or day-to-day
    difference of 500 or more would index outside it - confirm whether numba
    bounds checking is enabled before relying on this for arbitrary input.
    """
    # The number of days is taken from the width of the preference-cost table.
    N_DAYS = family_cost_matrix.shape[1]
    MAX_OCCUPANCY = 300
    MIN_OCCUPANCY = 125
    penalty = 0
    # One extra trailing slot so that "day + 1" is a valid index on the last day.
    daily_occupancy = np.zeros(N_DAYS + 1, dtype=np.int16)
    for i, (pred, n) in enumerate(zip(prediction, family_size)):
        # Day numbers are 1-based, array positions are 0-based.
        daily_occupancy[pred - 1] += n
        penalty += family_cost_matrix[i, pred - 1]

    accounting_cost = 0
    n_low = 0
    n_high = 0
    # Copy the last real day into the extra slot, so the final day is compared
    # with itself and its difference is 0.
    daily_occupancy[-1] = daily_occupancy[-2]
    for day in range(N_DAYS):
        n_next = daily_occupancy[day + 1]
        n = daily_occupancy[day]
        # Count occupancy-limit violations; the caller decides how to weigh them.
        n_high += (n > MAX_OCCUPANCY)
        n_low += (n < MIN_OCCUPANCY)
        diff = abs(n - n_next)
        accounting_cost += accounting_cost_matrix[n, diff]

    return np.asarray([penalty, accounting_cost, n_low, n_high])


def score(prediction):
    """Return the total cost of an assignment as a single number.

    The preference and accounting costs from ``cost_function`` are summed,
    and every day that falls outside the occupancy limits adds 1000000.
    """
    preference_cost, accounting_cost, days_below, days_above = cost_function(
        prediction, family_size, family_cost_matrix, accounting_cost_matrix
    )
    violating_days = days_below + days_above
    return (preference_cost + accounting_cost) + violating_days * 1000000



In [3]:
# Warm start: load a previously produced submission and score it.
SELECTED = 'submission69564.04.csv'
pred = pd.read_csv(
    f'/kaggle/input/santa-output/{SELECTED}',
    index_col='family_id',
)['assigned_day'].values
init_score = score(pred)
print(init_score)

# Raw family table (no index column) and the submission template.
fam = pd.read_csv("/kaggle/input/santa-workshop-tour-2019/family_data.csv")
sub = pd.read_csv('/kaggle/input/santa-workshop-tour-2019/sample_submission.csv')

# Save the starting point under its score so it can be found again later.
sub['assigned_day'] = pred
sub.to_csv(f'submission_{init_score}.csv', index=False)

# Keep only the columns between the first and the last one - presumably the
# per-family day choices, with family_id and n_people dropped.
pref = fam.to_numpy()[:, 1:-1]

69564.04945628237


In [4]:
def _annealing_temperature(sweep, cur_score, best_score):
    """Return the acceptance temperature (KT) for one proposed move.

    The first five sweeps use 1.5 and the next five use 4.5.  After that the
    temperature depends on how much worse the proposal is than the current
    accepted score: the worse the proposal, the higher the temperature.
    """
    if sweep < 5:
        return 1.5
    if sweep < 10:
        return 4.5
    if cur_score > best_score + 100:
        return 3
    if cur_score > best_score + 50:
        return 2.75
    if cur_score > best_score + 20:
        return 2.5
    if cur_score > best_score + 10:
        return 2
    if cur_score > best_score:
        return 1.5
    return 1


def seed_finding(seed, prediction_input, n_sweeps=100):
    """Run a seeded randomised local search starting from ``prediction_input``.

    Each sweep tries, for every family, every day listed for it in the
    module-level ``pref`` table.  A proposal is accepted with probability
    ``exp(-(new - current) / KT)``, so improvements are always accepted and
    worse moves are accepted sometimes.  After each full sweep, if the
    current score is below the starting score, the assignment is written to
    ``submission_<score>.csv`` (via the module-level ``sub`` frame) and the
    search stops.

    Parameters
    ----------
    seed : seed passed to ``np.random.seed`` (global NumPy RNG).
    prediction_input : array of assigned days; it is copied, not modified.
    n_sweeps : maximum number of full sweeps (defaults to 100).

    Returns
    -------
    (prediction, best_score) : the final assignment and its score.  Note that
    ``best_score`` tracks the currently accepted state, which may be worse
    than the starting score when no improving sweep was found.
    """
    prediction = prediction_input.copy()
    np.random.seed(seed)
    best_score = score(prediction)
    original_score = best_score
    print(f"SEED: {seed}   ORIGINAL SCORE: {original_score}")

    # Take the problem size from the data instead of hard-coding it.
    n_families = len(prediction)
    n_choices = pref.shape[1]

    for sweep in range(n_sweeps):
        for i in range(n_families):
            for j in range(n_choices):
                previous_day = prediction[i]
                prediction[i] = pref[i, j]
                cur_score = score(prediction)

                kt = _annealing_temperature(sweep, cur_score, best_score)
                delta = cur_score - best_score

                # Draw on every proposal, even when the move is accepted
                # outright, so the random stream for a given seed does not
                # depend on which branch is taken.
                draw = np.random.rand()
                # Skipping exp() for non-worsening moves avoids overflow
                # warnings; exp(-delta / kt) >= 1 there, so the outcome is the
                # same as comparing the draw against it.
                if delta <= 0 or draw < np.exp(-delta / kt):
                    best_score = cur_score
                else:
                    prediction[i] = previous_day

        if best_score < original_score:
            print(f"NEW BEST SCORE on seed {seed}: {best_score}")
            sub.assigned_day = prediction
            sub.to_csv(f'submission_{best_score}.csv', index=False)
            break

    if best_score >= original_score:
        print(f"UNLUCKY on seed {seed} for {n_sweeps} runs, no impovement.")

    return prediction, best_score

# Greedy Search

In [5]:
# best = pred
# previous_score = score(best)
# best_score = previous_score
# early_stop = 0
# epochs = 200
# for i in notebook.tqdm(range(epochs)):
#     for j in notebook.tqdm(range(5000),leave=False):
#         ch = data.iloc[j,:-1].values
#         for k in range(10):
#             ori,best[j] = best[j],ch[k]
#             current_score = score(best)
#             if current_score<best_score:
#                 best_score = current_score
#             else:
#                 best[j] = ori
#     print(f"epoch {i}/{epochs} -- best_score: {best_score:,.0f}")
#     if round(best_score) >= round(previous_score):
#         print("not improving")
#         break
#     else:
#         previous_score = best_score
                
# score(best)

In [6]:
# from tqdm import trange
# best_score = score(best)
# previous_score = best_score
# epochs = 1
# t = trange(5000,leave=True)
# for i in range(epochs):
#     for j in t:
#         for k in range(j,5000):
#             best[j],best[k] = best[k],best[j]
#             current_score = score(best)
#             if current_score<best_score:
#                 best_score = current_score
#             else:
#                 best[j],best[k] = best[k],best[j]
#         t.set_description(f"epoch {i}/{epochs} -- best_score: {best_score:,.0f}")
#         t.refresh()
#     if round(best_score) >= round(previous_score):
#         print("not improving")
#         break
#     else:
#         previous_score = best_score
                
# score(best)

In [7]:

# Try a range of RNG seeds, always restarting from the best assignment found
# so far.  `pred` / `init_score` hold that best assignment and its score.
best_score = init_score

for seed in notebook.tqdm(range(300, 400)):
    # seed_finding works on a copy of its input, so `pred` is left untouched
    # when the run fails to improve; the best assignment stays in memory and
    # does not need to be read back from a CSV file.
    candidate, candidate_score = seed_finding(seed, pred)
    if candidate_score < init_score:
        pred = candidate
        init_score = candidate_score
    best_score = init_score

HBox(children=(IntProgress(value=0), HTML(value='')))

SEED: 300   ORIGINAL SCORE: 69564.04945628237
NEW BEST SCORE on seed 300: 69561.59910055793
SEED: 301   ORIGINAL SCORE: 69561.59910055793
UNLUCKY on seed 301 for 100 runs, no impovement.
SEED: 302   ORIGINAL SCORE: 69561.59910055793
UNLUCKY on seed 302 for 100 runs, no impovement.
SEED: 303   ORIGINAL SCORE: 69561.59910055793
UNLUCKY on seed 303 for 100 runs, no impovement.
SEED: 304   ORIGINAL SCORE: 69561.59910055793
UNLUCKY on seed 304 for 100 runs, no impovement.
SEED: 305   ORIGINAL SCORE: 69561.59910055793
UNLUCKY on seed 305 for 100 runs, no impovement.
SEED: 306   ORIGINAL SCORE: 69561.59910055793
UNLUCKY on seed 306 for 100 runs, no impovement.
SEED: 307   ORIGINAL SCORE: 69561.59910055793
UNLUCKY on seed 307 for 100 runs, no impovement.
SEED: 308   ORIGINAL SCORE: 69561.59910055793
UNLUCKY on seed 308 for 100 runs, no impovement.
SEED: 309   ORIGINAL SCORE: 69561.59910055793
UNLUCKY on seed 309 for 100 runs, no impovement.
SEED: 310   ORIGINAL SCORE: 69561.59910055793
UNLUCKY

In [8]:
!ls

__notebook__.ipynb  submission_69561.59910055793.csv
__output__.json     submission_69564.04945628237.csv
