This notebook is based on great notebooks by Vlad Golubev and nagadomi - special thanks to both of 
them for sharing their code!

Initial value (best preference cost solution):
https://www.kaggle.com/nagadomi/mipcl-example-only-preference

In [None]:
%%bash
# Install Python 2.7 because the shared library of MIPCL for Python3 is broken (link error).
# GitHub no longer serves the unauthenticated git:// protocol, so clone over HTTPS from the
# canonical pyenv repository. Skip the clone when the cell is re-run.
[ -d ~/.pyenv ] || git clone https://github.com/pyenv/pyenv.git ~/.pyenv
echo 'export PYENV_ROOT="$HOME/.pyenv"' >> ~/.bash_profile
echo 'export PATH="$PYENV_ROOT/bin:$PATH"' >> ~/.bash_profile
echo 'eval "$(pyenv init -)"' >> ~/.bash_profile
source ~/.bash_profile
apt-get install -y libssl-dev libreadline-dev
# -s: do not fail if this interpreter was already built by a previous run.
pyenv install -s 2.7.17
pyenv local 2.7.17
# Pin the last releases that still support Python 2.7 so the install stays reproducible.
pip install pandas==0.24.2 numpy==1.16.6

# Install MIPCL(mipcl_py module)
wget https://mipcl-cpp.appspot.com/static/download/mipcl-py-2.6.1.linux-x86_64.tar.gz
tar --exclude='*docs' -xzvf mipcl-py-2.6.1.linux-x86_64.tar.gz # exclude docs directory due to `too many nested subdirectories error` in kaggle kernel
rm -f ./mipcl_py/mipshell/mipcl.so
ln -sf mipcl-py2.so ./mipcl_py/mipshell/mipcl.so # Use mipcl-py2 (-f keeps re-runs idempotent)

In [None]:
%%bash
# Load the pyenv setup written by the install cell so that "python" is the 2.7 interpreter.
source ~/.bash_profile
# The heredoc delimiter is quoted so that bash passes the Python source through verbatim:
# no parameter expansion, command substitution or backslash processing inside the script.
python <<'__EOF__'
from __future__ import print_function
import time
import numpy as np
import pandas as pd
import mipcl_py.mipshell.mipshell as mipshell

def get_days(assigned_days, n_people):
    """Return the number of people scheduled on each day.

    Args:
        assigned_days: numpy array with the 1-based day assigned to each family.
        n_people: per-family head counts, indexed like assigned_days.

    Returns:
        Integer numpy array whose length is the largest assigned day; element
        d-1 holds the total number of people assigned to day d.
    """
    totals = np.zeros(assigned_days.max(), int)
    for family in range(len(assigned_days)):
        slot = assigned_days[family] - 1  # days are 1-based, the array is 0-based
        totals[slot] += n_people[family]
    return totals


def example_mipcl(desired, n_people):
    """Assign every family to one of its preferred days at minimum preference cost.

    Builds and solves a mixed-integer program with MIPCL (mipshell). Only the
    preference cost is minimised; the accounting penalty is not part of this
    model. Each family gets exactly one of its ten listed days and every day
    must host between 125 and 300 people.

    Args:
        desired: 2-D integer array with one row per family; the first ten
            columns are the 1-based preferred days, best choice first.
        n_people: per-family head counts, indexed like the rows of desired.

    Returns:
        Integer numpy array with the 1-based day assigned to each family, or
        None when the solver reports no solution.
    """
    # Preference cost of the k-th choice: FAMILY_COST[k] + members * MEMBER_COST[k].
    FAMILY_COST = np.asarray([0,50,50,100,200,200,300,300,400,500])
    MEMBER_COST = np.asarray([0, 0, 9,  9,  9, 18, 18, 36, 36,235])
    num_days = desired.max()
    num_families = desired.shape[0]
    solver = mipshell.Problem(name='Santa2019 only preference')
    # B: binary "family goes to day" variables, C: the matching preference costs,
    # both keyed by (family, 0-based day). I: integer occupancy variable per day.
    C, B, I = {}, {}, {}

    for fid, choices in enumerate(desired):
        for cid in range(10):
            B[fid, choices[cid]-1] = mipshell.Var(type=mipshell.BIN, lb=0.0, ub=1.0)
            C[fid, choices[cid]-1] = FAMILY_COST[cid] + n_people[fid] * MEMBER_COST[cid]

    # NOTE(review): the bare comparisons below are never assigned; mipshell
    # presumably registers them as constraints of the current problem - confirm.
    for day in range(num_days):
        # The variable bounds enforce the 125..300 occupancy limits.
        I[day] = mipshell.Var(type=mipshell.INT, lb=125, ub=300)
        mipshell.sum_(n_people[fid]*B[fid, day] for fid in range(num_families) if (fid,day) in B) == I[day]

    for fid in range(num_families):
        # Exactly one day per family.
        mipshell.sum_(B[fid, day] for day in range(num_days) if (fid,day) in B) == 1

    objective = mipshell.sum_(C[fid, day]*B[fid, day] for fid, day in B)

    solver.minimize(objective)
    solver.optimize(silent=False, gap=0.0)
    if solver.is_solution:
        print("Result: ", solver.getObjVal())
        assigned_days = np.zeros(num_families, int)
        for fid, day in B:
            if B[fid, day].val > 0.5:
                assigned_days[fid] = day + 1  # back to 1-based days
        return assigned_days
    else:
        print("Failed", solver.is_solution, solver.is_infeasible, solver.isPureLP)
        return None


def save(assigned_days):
    """Write the assignment to submission_init.csv in the working directory.

    Args:
        assigned_days: iterable of assigned days; the position of each entry
            is written as the family id.
    """
    out = open("submission_init.csv", "w")
    try:
        out.write("family_id,assigned_day\n")
        family_id = 0
        for day in assigned_days:
            out.write("{},{}\n".format(family_id, day))
            family_id += 1
    finally:
        out.close()


if __name__ == "__main__":
    # Load the family table, solve the preference-only model and store the
    # result as the starting point for the later optimisation stages.
    family_data = pd.read_csv('../input/santa-workshop-tour-2019/family_data.csv')
    started = time.time()
    table = family_data.values
    # Columns 1..10 hold the ten preferred days, column 11 the family size.
    assignment = example_mipcl(table[:, 1:11], table[:, 11])
    if assignment is not None:
        save(assignment)
    print("Elapsed time", time.time() - started)
__EOF__

In [None]:
Stochastic optimization: https://www.kaggle.com/golubev/c-stochastic-product-search-65ns

In [None]:
%%writefile main.cpp
#include <algorithm>
#include <array>
#include <cassert>
#include <chrono>
#include <cmath>
#include <cstdint>
#include <fstream>
#include <iostream>
#include <numeric>
#include <random>
#include <stdexcept>
#include <string>
#include <thread>
#include <vector>
using namespace std;
using namespace std::chrono;

// Search neighbourhood definition. The number of entries is the number of
// randomly picked families changed together in one move; entry i is how many
// of its top choices are brute-forced for the i-th picked family.
constexpr array<uint8_t, 14> DISTRIBUTION{2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 5};
// Example: {2, 5} means the first random family tries its choices 1-2 and the
// second random family tries its choices 1-5.

// Allowed number of people per day (inclusive bounds).
constexpr int MAX_OCCUPANCY = 300;
constexpr int MIN_OCCUPANCY = 125;
// Number of random family draws evaluated before the best move found is applied.
constexpr int BEST_N = 1000;
// Family sizes, indexed by family position in family_data.csv.
array<uint8_t, 5000> n_people;
// choices[family][k] is the 0-based day of the family's k-th preference.
array<array<uint8_t, 10>, 5000> choices;
// PCOSTM[family][k] is the preference cost of giving the family its k-th choice.
array<array<uint16_t, 10>, 5000> PCOSTM;
// ACOSTM[a][b] is the accounting penalty of a day with occupancy a+125 whose
// neighbouring day has occupancy b+125.
array<array<double, 176>, 176> ACOSTM;

// Reads family_data.csv and fills the global lookup tables:
// choices (0-based preferred days), n_people, PCOSTM and ACOSTM.
void init_data() {
    ifstream csv("../input/santa-workshop-tour-2019/family_data.csv");

    assert(csv && "family_data.csv");
    string header_line;
    getline(csv, header_line);

    int members, value;
    char sep;
    for (size_t family = 0; family < choices.size(); ++family) {
        csv >> value >> sep;              // family_id column, not stored
        for (auto& preferred_day : choices[family]) {
            csv >> value >> sep;
            preferred_day = value - 1;    // days are stored 0-based
        }
        csv >> members;
        n_people[family] = members;
    }

    // Preference cost of the k-th choice: fixed part plus a per-member part.
    const array<int, 10> fixed_part{0, 50, 50, 100, 200, 200, 300, 300, 400, 500};
    const array<int, 10> member_part{0,  0,  9,   9,   9,  18,  18,  36,  36, 235};
    for (size_t family = 0; family < PCOSTM.size(); ++family) {
        for (size_t rank = 0; rank < 10; ++rank)
            PCOSTM[family][rank] = fixed_part[rank] + member_part[rank] * n_people[family];
    }

    // Accounting penalty table, indexed by (occupancy - 125) of a day and of its neighbour.
    for (int today = 0; today < 176; ++today) {
        for (int other = 0; other < 176; ++other)
            ACOSTM[today][other] = today * pow(today+125, 0.5 + abs(today-other) / 50.0) / 400.0;
    }
}
// Reads a submission CSV (one header line, then "family_id,assigned_day" rows
// in family order) and converts every 1-based assigned day into the index of
// that day inside the family's preference list. The rest of the program stores
// assignments as such choice indices and uses them to index 10-element arrays.
//
// Throws std::runtime_error when the file cannot be opened, when a row is
// missing or malformed, or when a family is assigned to a day that is not one
// of its ten choices (such an assignment cannot be represented as a choice
// index and would otherwise cause out-of-bounds accesses later on).
array<uint8_t, 5000> read_submission(string filename) {
    ifstream in(filename);
    if (!in)
        throw runtime_error("cannot open submission file: " + filename);
    array<uint8_t, 5000> assigned_day{};
    string header;
    getline(in, header);
    for (size_t j = 0; j < choices.size(); ++j) {
        int id, day;
        char comma;
        if (!(in >> id >> comma >> day))
            throw runtime_error("missing or malformed row for family " + to_string(j) + " in " + filename);
        // Compare as int so that out-of-range days can never alias a valid one.
        const auto it = find(begin(choices[j]), end(choices[j]), day - 1);
        if (it == end(choices[j]))
            throw runtime_error("family " + to_string(j) + " is assigned to day " + to_string(day) +
                                ", which is not one of its choices");
        assigned_day[j] = static_cast<uint8_t>(distance(begin(choices[j]), it));
    }
    return assigned_day;
}
// Current solution plus cached aggregates, so that a candidate move touching
// only DISTRIBUTION.size() families can be scored without rescanning all families.
struct Index {
    // Takes the per-family choice indices and builds the cached aggregates.
    Index(array<uint8_t, 5000> assigned_days_) : assigned_days(assigned_days_)  {
        setup();
    }
    // Despite the name, each entry is an index into choices[family], not a day.
    array<uint8_t, 5000> assigned_days;
    // Number of people per 0-based day for the current solution.
    array<uint16_t, 100> daily_occupancy_{};
    // Sum of PCOSTM over all families for the current solution.
    int preference_cost_ = 0;
    // Recomputes daily_occupancy_ and preference_cost_ from assigned_days.
    void setup() {
        preference_cost_ = 0;
        daily_occupancy_.fill(0);
        for (int j = 0; j < assigned_days.size(); ++j) {
            daily_occupancy_[choices[j][assigned_days[j]]] += n_people[j];
            preference_cost_ += PCOSTM[j][assigned_days[j]];
        }
    }
    // Scores the solution obtained by giving family indices[i] its choice
    // change[i], for the first DISTRIBUTION.size() entries of indices. The
    // stored solution is not modified. Returns preference cost plus accounting
    // penalty, or a large value proportional to the violation for the first
    // day found outside [MIN_OCCUPANCY, MAX_OCCUPANCY].
    double calc(const array<uint16_t, 5000>& indices, const array<uint8_t, DISTRIBUTION.size()>& change) {
        double accounting_penalty = 0.0;
        // Work on copies of the cached aggregates and apply the move as a delta.
        auto daily_occupancy = daily_occupancy_;
        int preference_cost = preference_cost_;
        for (int i = 0; i < DISTRIBUTION.size(); ++i) {
            int j = indices[i];
            daily_occupancy[choices[j][assigned_days[j]]] -= n_people[j];
            daily_occupancy[choices[j][       change[i]]] += n_people[j];
            
            preference_cost += PCOSTM[j][change[i]] - PCOSTM[j][assigned_days[j]];
        }

        // Infeasible occupancies are rejected before the ACOSTM lookups below,
        // whose indices (occupancy - 125) are only valid for feasible days.
        for (auto occupancy : daily_occupancy)
            if (occupancy < MIN_OCCUPANCY)
                return 1e12*(MIN_OCCUPANCY-occupancy);
            else if (occupancy > MAX_OCCUPANCY)
                return 1e12*(occupancy - MAX_OCCUPANCY);

        // Each day is paired with the following day (index day+1).
        for (int day = 0; day < 99; ++day)
            accounting_penalty += ACOSTM[daily_occupancy[day]-125][daily_occupancy[day+1]-125];

        // The last day has no following day and is paired with itself.
        accounting_penalty += ACOSTM[daily_occupancy[99]-125][daily_occupancy[99]-125];
        return preference_cost + accounting_penalty;
    }
    // Applies a move: family indices[i] gets choice change[i]; then rebuilds
    // the cached aggregates from scratch.
    void reindex(const array<uint16_t, DISTRIBUTION.size()>& indices, const array<uint8_t, DISTRIBUTION.size()>& change) {
        for (int i = 0; i < DISTRIBUTION.size(); ++i) {
            assigned_days[indices[i]] = change[i];
        }
        setup();
    }
};

// Scores a complete solution given as per-family choice indices.
// Returns preference cost plus accounting penalty, or a large value
// proportional to the violation for the first day whose occupancy is outside
// [MIN_OCCUPANCY, MAX_OCCUPANCY]. With print=true the three totals are written
// to stdout (only for feasible solutions).
double calc(const array<uint8_t, 5000>& assigned_days, bool print=false) {
    array<uint16_t, 100> people_per_day{};
    int pref_total = 0;
    for (size_t family = 0; family < assigned_days.size(); ++family) {
        const auto pick = assigned_days[family];
        pref_total += PCOSTM[family][pick];
        people_per_day[choices[family][pick]] += n_people[family];
    }

    // Reject infeasible occupancies before they are used as table indices.
    for (size_t day = 0; day < people_per_day.size(); ++day) {
        const auto people = people_per_day[day];
        if (people < MIN_OCCUPANCY)
            return 1e12*(MIN_OCCUPANCY-people);
        if (people > MAX_OCCUPANCY)
            return 1e12*(people - MAX_OCCUPANCY);
    }

    // Each day is paired with the following one; the last day with itself.
    double acc_total = 0.0;
    for (int day = 0; day < 99; ++day)
        acc_total += ACOSTM[people_per_day[day]-125][people_per_day[day+1]-125];
    acc_total += ACOSTM[people_per_day[99]-125][people_per_day[99]-125];

    if (print)
        cout << pref_total << " " << acc_total << " " << pref_total+acc_total << endl;
    return pref_total + acc_total;
}

// Writes the solution to submission_init2.csv, converting each family's choice
// index back into a 1-based day.
void save_sub(const array<uint8_t, 5000>& assigned_day) {
    ofstream csv("submission_init2.csv");
    csv << "family_id,assigned_day" << endl;
    size_t family = 0;
    for (const auto pick : assigned_day) {
        csv << family << "," << choices[family][pick]+1 << endl;
        ++family;
    }
}
        
const vector<array<uint8_t, DISTRIBUTION.size()>> changes = []() {
    vector<array<uint8_t, DISTRIBUTION.size()>> arr;
    array<uint8_t, DISTRIBUTION.size()> tmp{};
    for (int i = 0; true; ++i) {
        arr.push_back(tmp);
        tmp[0] += 1;
        for (int j = 0; j < DISTRIBUTION.size(); ++j)
            if (tmp[j] >= DISTRIBUTION[j]) {
                if (j >= DISTRIBUTION.size()-1)
                    return arr;
                tmp[j] = 0;
                ++tmp[j+1];
            }
    }
    return arr;
}();

// Stochastic product search. While fn() returns true: draw BEST_N random groups
// of DISTRIBUTION.size() families, score every choice combination in `changes`
// for each group, and apply the best improving move found (if any). The final
// solution is written with save_sub().
template<class ExitFunction>
void stochastic_product_search(Index index, ExitFunction fn) { // 15'360'000it/s  65ns/it  0.065µs/it
    constexpr size_t group_size = DISTRIBUTION.size();
    double best_local_score = calc(index.assigned_days);

    // Fixed seed: runs are reproducible.
    thread_local std::mt19937 gen(std::random_device{}());
    gen.seed(1);
    uniform_int_distribution<> dis(0, 4999);

    // Permutation of all family ids; its first group_size entries form the current group.
    array<uint16_t, 5000> indices;
    iota(begin(indices), end(indices), 0);

    array<uint16_t, group_size> best_indices{};
    array<uint8_t, group_size> best_change{};
    while (fn()) {
        bool improved = false;
        for (int draw = 0; draw < BEST_N; ++draw) {
            // Random swaps bring a fresh group of families to the front.
            for (size_t slot = 0; slot < group_size; ++slot)
                swap(indices[slot], indices[dis(gen)]);
            for (const auto& change : changes) {
                const auto score = index.calc(indices, change);
                if (score < best_local_score) {
                    improved = true;
                    best_local_score = score;
                    best_change = change;
                    copy_n(begin(indices), group_size, begin(best_indices));
                }
            }
        }
        if (!improved)
            continue;
        // Apply the best of the BEST_N draws and report the new score.
        index.reindex(best_indices, best_change);
        calc(index.assigned_days, true);
    }
    save_sub(index.assigned_days);
}

// Loads the data and the initial submission, prints its score and runs the
// stochastic search for two hours.
int main() {
    init_data();

    Index index(read_submission("/kaggle/working/submission_init.csv"));
    calc(index.assigned_days, true);

    // Alternative stop conditions: run forever, or stop after a fixed number
    // of outer iterations. The time-based one below is used.
    const auto started = high_resolution_clock::now();
    auto time_exit = [started]() {
        const auto elapsed = duration_cast<minutes>(high_resolution_clock::now()-started);
        return elapsed.count() < 120; //2h
    };

    stochastic_product_search(index, time_exit);
    return 0;
}

In [None]:
!g++ -pthread -lpthread -O3 -std=c++17 -o main main.cpp

In [None]:
!./main

My own code, based on https://www.kaggle.com/golubev/optimization-preference-cost-mincostflow
and on the idea from https://www.kaggle.com/golubev/manual-to-improve-submissions

In [None]:
%%time
from ortools.linear_solver import pywraplp
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
#%matplotlib inline

# Problem size.
NUMBER_DAYS = 100
NUMBER_FAMILIES = 5000
# Number of top choices a re-assigned family may be moved to in the MIP below.
MAX_BEST_CHOICE = 5
# Family preferences and sizes.
data = pd.read_csv('/kaggle/input/santa-workshop-tour-2019/family_data.csv')
# Starting solution; this file is written by the C++ search (save_sub).
submission = pd.read_csv('/kaggle/working/submission_init2.csv')
# Current best assignment: 1-based day per family, in submission row order.
assigned_days = submission['assigned_day'].values
# Columns 1..10 of the family table are the ten preferred days.
columns = data.columns[1:11]
# DESIRED[family][k] is the 1-based day of the family's k-th choice.
DESIRED = data[columns].values
# Preference cost of the k-th choice:
# COST_PER_FAMILY[k] + n_people * COST_PER_FAMILY_MEMBER[k].
COST_PER_FAMILY        = [0,50,50,100,200,200,300,300,400,500]
COST_PER_FAMILY_MEMBER = [0, 0, 9,  9,  9, 18, 18, 36, 36,235]
# Family sizes, in family table row order.
N_PEOPLE = data['n_people'].values

def get_daily_occupancy(assigned_days):
    """Return the number of people scheduled on each of the 100 days.

    Args:
        assigned_days: sequence with the 1-based day assigned to each family,
            indexed like the module-level N_PEOPLE.

    Returns:
        Integer numpy array of length 100; element d-1 is the occupancy of day d.
    """
    occupancy = np.zeros(100, int)
    family_id = 0
    for day in assigned_days:
        occupancy[day - 1] += N_PEOPLE[family_id]
        family_id += 1
    return occupancy


    
def cost_function(prediction):
    """Score an assignment of families to days.

    The family preferences and sizes are taken from the module-level DESIRED
    and N_PEOPLE arrays (loaded once from family_data.csv) instead of
    re-reading the CSV on every call.
    NOTE(review): this assumes row f of those arrays belongs to family_id f,
    i.e. that the CSV lists family ids 0..N-1 in order - confirm.

    Args:
        prediction: sequence with the 1-based day (1..100) assigned to each
            family, indexed by family.

    Returns:
        Tuple (penalty, accounting_cost, total), where penalty is the
        preference cost plus 100000000 for every day whose occupancy is
        outside [125, 300], and total is the sum of the first two entries.

    Raises:
        KeyError: if an assigned day is outside 1..100.
    """
    N_DAYS = 100
    MAX_OCCUPANCY = 300
    MIN_OCCUPANCY = 125
    OCCUPANCY_VIOLATION = 100000000
    # Preference cost of the k-th choice: FIXED[k] + members * PER_MEMBER[k].
    FIXED = (0, 50, 50, 100, 200, 200, 300, 300, 400, 500)
    PER_MEMBER = (0, 0, 9, 9, 9, 18, 18, 36, 36, 36 + 199)
    # Cost of a day that is not among the ten choices.
    OTHER_FIXED, OTHER_PER_MEMBER = 500, 36 + 398

    days = list(range(N_DAYS, 0, -1))
    # Number of people scheduled on each day.
    daily_occupancy = {day: 0 for day in days}

    penalty = 0
    for family, day in enumerate(prediction):
        members = int(N_PEOPLE[family])  # plain int keeps the arithmetic in native Python
        daily_occupancy[day] += members

        choices = list(DESIRED[family])
        if day in choices:
            rank = choices.index(day)  # first match, i.e. the best matching choice
            penalty += FIXED[rank] + PER_MEMBER[rank] * members
        else:
            penalty += OTHER_FIXED + OTHER_PER_MEMBER * members

    # Occupancy limits are soft constraints: every violating day costs a large constant.
    for occupancy in daily_occupancy.values():
        if occupancy > MAX_OCCUPANCY or occupancy < MIN_OCCUPANCY:
            penalty += OCCUPANCY_VIOLATION

    # Accounting cost, walking from day 100 down to day 1. Day 100 has no
    # previous day and is scored with a difference of zero. max() clamps the
    # negative values that occur when the occupancy is below 125.
    yesterday_count = daily_occupancy[days[0]]
    accounting_costs = [max(0, (yesterday_count - 125.0) / 400.0 * yesterday_count ** 0.5)]
    for day in days[1:]:
        today_count = daily_occupancy[day]
        diff = abs(today_count - yesterday_count)
        accounting_costs.append(max(0, (today_count - 125.0) / 400.0 * today_count ** (0.5 + diff / 50.0)))
        yesterday_count = today_count

    accounting_cost = sum(accounting_costs)
    return penalty, accounting_cost, penalty + accounting_cost

from random import sample
from random import seed
seed(2)

# Large-neighbourhood improvement of the current submission.
# Each iteration picks a random set of days, lets every family currently
# scheduled on one of them move to any of its top MAX_BEST_CHOICE choices, and
# solves a MIP that minimises the preference cost plus an approximation of the
# accounting cost (neighbouring days are taken at their current occupancy).
# A result is kept only if the exact cost_function() value improves.
N_ITERATIONS = 100
N_DAYS_TO_FIX = 50      # number of days re-optimised per iteration
OCCUPANCY_DELTA = 8     # allowed change of a re-optimised day's occupancy

best_score = cost_function(assigned_days)[2]

for _ in range(N_ITERATIONS):
    ad = assigned_days.copy()

    days_for_fix = np.array(sample(range(1,101),N_DAYS_TO_FIX))

    daily_occupancy = get_daily_occupancy(ad)
    fids = np.where(np.isin(ad, days_for_fix))[0] # Ids of family for move

    solver = pywraplp.Solver('Setup occupation of days', pywraplp.Solver.CBC_MIXED_INTEGER_PROGRAMMING)
    PCOSTM, B = {}, {} # cost matrix, boolean vars matrix
    for fid in fids:
        for i in range(MAX_BEST_CHOICE):
            B[fid, DESIRED[fid][i]-1] = solver.BoolVar(f'b{fid, i}') # B[family, choice_day] = boolean variable
            PCOSTM[fid, DESIRED[fid][i]-1] = COST_PER_FAMILY[i] + N_PEOPLE[fid] * COST_PER_FAMILY_MEMBER[i]  # PCOSTM[family, choice_day] = cost

    # Days that are not re-optimised keep the bounds [0, 0], so no movable
    # family can be placed on them.
    lower_bounds = np.zeros(100)
    upper_bounds = np.zeros(100)
    for fi in days_for_fix:
        lower_bounds[fi-1] = max(daily_occupancy[fi-1]-OCCUPANCY_DELTA,125)
        upper_bounds[fi-1] = min(daily_occupancy[fi-1]+OCCUPANCY_DELTA,300)

    # D[day, n] = 1 when the occupancy of a re-optimised day equals n; this
    # turns the non-linear accounting cost into a linear expression.
    D = {}

    for j in range(NUMBER_DAYS):
        I = solver.IntVar(lower_bounds[j], upper_bounds[j], f'I{j}')
        # Only families in fids have variables, so there is no need to scan all families.
        solver.Add(solver.Sum([N_PEOPLE[i] * B[i, j] for i in fids if (i,j) in B]) == I) # sum families over choices days
        if upper_bounds[j]>124:
            rj = range(int(lower_bounds[j]),int(upper_bounds[j])+1)
            for i in rj:
                D[j, i] = solver.BoolVar(f'd{j, i}')
            solver.Add(solver.Sum([D[j, i]*i for i in rj]) == I)

    for i in fids:
        solver.Add(solver.Sum(B[i, j] for j in range(NUMBER_DAYS) if (i,j) in B) == 1) # exactly one day assigned to each family

    sM = solver.Sum(PCOSTM[i, j] * B[i, j] for i, j in B)
    for i in range(NUMBER_DAYS):
        if np.isin(i,days_for_fix-1):
            ri = range(int(lower_bounds[i]),int(upper_bounds[i])+1)
            if i<99:
                # Own accounting cost of day i, with the following day at its current occupancy.
                sM +=  solver.Sum(D[i,j]*(j-125)/400*j**(0.5+abs(j-daily_occupancy[i+1])/50) for j in ri)
            else:
                # Day 100 has no following day: its own cost uses a difference of zero.
                sM +=  solver.Sum(D[i,j]*(j-125)/400*j**0.5 for j in ri)

            if i>0:
                # Effect of day i's occupancy on the accounting cost of the preceding day.
                sM +=  solver.Sum(D[i,j]*(daily_occupancy[i-1]-125)/400*daily_occupancy[i-1]**(0.5+abs(j-daily_occupancy[i-1])/50) for j in ri)

    solver.Minimize( sM)
    sol = solver.Solve()

    if sol != pywraplp.Solver.OPTIMAL:
        continue  # nothing usable from this neighbourhood

    for i, j in B:
        if B[i, j].solution_value() > 0.5:
            ad[i] = j+1

    # The MIP objective is only an approximation; accept on the exact cost.
    candidate_cost = cost_function(ad)
    if candidate_cost[2] < best_score:
        submission['assigned_day'] = ad
        assigned_days = ad
        best_score = candidate_cost[2]
        print(candidate_cost)

score = best_score
submission.to_csv(f'submission_{score}.csv', index=False)
