In [None]:
import numpy as np

import seaborn as sns
from matplotlib import pyplot as plt

import pandas as pd

from ortools.linear_solver import pywraplp

In [None]:
# Load the family table, keyed by family_id, and preview the first rows.
data = pd.read_csv(
    "../input/santa-workshop-tour-2019/family_data.csv",
    index_col='family_id',
)
data.head()

In [None]:
# Bar chart of how many households there are for each family size.
ax = sns.countplot(x='n_people', data=data)
ax.set(title="Count the number of households with different family size")
plt.show()

In [None]:
# n is the number of family members; choice is the rank of the preference that was met.
def get_penalty(n, choice):
    """Return the preference penalty for a family of ``n`` people.

    ``choice`` values 0-9 select a row of the schedule below; any other
    value falls through to the most expensive "none of the above" penalty.
    The penalty is a fixed part plus a per-person part multiplied by ``n``.
    """
    # (fixed part, per-person part) for choice 0, 1, ..., 9 in that order.
    schedule = (
        (0, 0),
        (50, 0),
        (50, 9),
        (100, 9),
        (200, 9),
        (200, 18),
        (300, 18),
        (300, 36),
        (400, 36),
        (500, 36 + 199),
    )
    for rank, (fixed, per_person) in enumerate(schedule):
        if choice == rank:
            # Choices without a per-person part return the bare constant,
            # independent of n.
            return fixed + per_person * n if per_person else fixed
    # Any choice outside 0-9.
    return 500 + (36 + 398) * n

In [None]:
N_DAYS = 100 # number of days to schedule
N_FAMILY = 5000 # number of family ids
MIN_OCCUPANCY = 125 # minimum daily occupancy
MAX_OCCUPANCY = 300 # maximum daily occupancy

In [None]:
# Preference-cost matrix: pcost_mat[f, d] is the penalty for family f visiting
# on day d + 1 (days in the data are 1-based, matrix columns are 0-based).
pcost_mat = np.full(shape=(N_FAMILY, N_DAYS), fill_value=99999)

for f in range(N_FAMILY):
    family_row = data.loc[f]
    # Number of people in the family.
    f_num = family_row['n_people']
    # Start every day at the "none of the listed choices" penalty.
    pcost_mat[f, :] = get_penalty(f_num, 10)
    # Overwrite the days the family actually listed as choice 0-9.
    # Assumes the first ten columns of `data` are the ranked choices -- TODO confirm.
    for choice in range(10):
        # Explicit positional access: a bare integer key on a label-indexed
        # Series (`row[choice]`) is deprecated in recent pandas releases.
        chosen_day = family_row.iloc[choice]
        pcost_mat[f, chosen_day - 1] = get_penalty(f_num, choice)

pcost_mat

In [None]:
# Accounting-penalty lookup table, indexed as
# acost_mat[people today, people on the previous day].
acost_mat = np.zeros((500, 500), dtype=np.float64)

for today, previous in np.ndindex(*acost_mat.shape):
    # The exponent grows with the absolute change in head-count between the two days.
    exponent = 0.5 + abs(today - previous) / 50
    raw_cost = (today - 125) / 400 * today**exponent
    # Negative costs (fewer than 125 people) are clamped to zero.
    acost_mat[today, previous] = raw_cost if raw_cost > 0 else 0

acost_mat

In [None]:
# Head-count of every family as a NumPy array, in the row order of `data`.
FAMILY_SIZE = data['n_people'].to_numpy()
FAMILY_SIZE

In [None]:
# DESIRED holds every family's chosen days shifted down by one:
# days in the data run 1-100, while pcost_mat columns run 0-99.
# The last column of `data` is dropped; all remaining columns are kept.
DESIRED = data.to_numpy()[:, :-1] - 1
DESIRED

In [None]:
# Linear-programming model of the family-to-day assignment.
def solveLP(max_daily_diff=25):
    """Build and solve the assignment model with the GLOP linear solver.

    Reads the module-level ``N_DAYS``, ``N_FAMILY``, ``DESIRED``,
    ``FAMILY_SIZE``, ``pcost_mat``, ``MIN_OCCUPANCY`` and ``MAX_OCCUPANCY``.
    Only the days a family listed in ``DESIRED`` get a decision variable.

    NOTE(review): GLOP is a pure LP solver, so the ``BoolVar`` variables are
    presumably relaxed to the continuous range [0, 1]; the returned ``result``
    values may therefore be fractional.
    NOTE(review): assumes each row of ``DESIRED`` holds distinct days; a
    repeated day would recreate its variable and count the family twice --
    TODO confirm against the data.

    Parameters
    ----------
    max_daily_diff : int, default 25
        Largest allowed absolute difference in occupancy between two
        consecutive days (an extra, hand-added constraint).

    Returns
    -------
    pandas.DataFrame or None
        Columns ``family_id``, ``day`` (0-based day index taken from
        ``DESIRED``) and ``result`` (solution value), one row per variable
        with a strictly positive value. ``None`` when the solver does not
        report an optimal solution.
    """
    solver = pywraplp.Solver('AssigmentProblem',
                             pywraplp.Solver.GLOP_LINEAR_PROGRAMMING)
    x = {}  # x[i, j]: does family i visit on day j
    # candidates[j]: the families that listed day j among their choices
    candidates = [[] for _ in range(N_DAYS)]

    for i in range(N_FAMILY):  # family id
        for j in DESIRED[i, :]:  # each day chosen by family i
            candidates[j].append(i)
            # Decision variable for "family i visits on day j".
            x[i, j] = solver.BoolVar('x[%i, %i]' % (i, j))

    # Number of people visiting on each day (one expression per day).
    daily_occupancy = [
        solver.Sum([x[i, j] * FAMILY_SIZE[i] for i in candidates[j]])
        for j in range(N_DAYS)
    ]

    # For each family, the sum of its variables over its chosen days.
    family_presence = [
        solver.Sum(x[i, j] for j in DESIRED[i, :]) for i in range(N_FAMILY)
    ]

    # Objective: total preference cost of the assignment.
    preference_cost = solver.Sum([
        pcost_mat[i, j] * x[i, j] for i in range(N_FAMILY)
        for j in DESIRED[i, :]
    ])

    solver.Minimize(preference_cost)

    # Hand-added constraint: occupancy of consecutive days may differ by at
    # most max_daily_diff in either direction.
    for j in range(N_DAYS - 1):
        solver.Add(daily_occupancy[j] - daily_occupancy[j + 1] <= max_daily_diff)
        solver.Add(daily_occupancy[j + 1] - daily_occupancy[j] <= max_daily_diff)

    # Every family is placed exactly once among its chosen days.
    for i in range(N_FAMILY):
        solver.Add(family_presence[i] == 1)

    # Daily occupancy bounds.
    for j in range(N_DAYS):
        solver.Add(daily_occupancy[j] >= MIN_OCCUPANCY)
        solver.Add(daily_occupancy[j] <= MAX_OCCUPANCY)

    res_status = solver.Solve()
    # Compare against the named status constant rather than the bare value 0.
    if res_status == pywraplp.Solver.OPTIMAL:
        print(solver.Objective().Value())
        print('耗时：', solver.wall_time())
        print('统计单纯形迭代的次数: ', solver.iterations())
        # Read each solution value once and keep only the positive ones.
        rows = []
        for i in range(N_FAMILY):
            for j in DESIRED[i, :]:
                value = x[i, j].solution_value()
                if value > 0:
                    rows.append((i, j, value))
        df = pd.DataFrame(rows, columns=['family_id', 'day', 'result'])
    else:
        print("无解")
        df = None
    return df

In [None]:
%%time
# Solve the LP and keep the resulting frame (solveLP returns None when it prints "无解").
lp_df = solveLP()
lp_df

In [None]:
# Tally how often each distinct solution value occurs.
lp_df['result'].value_counts()

In [None]:
# Rows whose solution value is above 0.999 are treated as assigned.
assigned_df = lp_df.loc[lp_df['result'].gt(0.999)]

In [None]:
# Show the (family, day) rows whose solution value exceeded 0.999.
assigned_df

In [None]:
# Not assigned: the solution value lies strictly between the 0 and 1 tolerances.
unassigned_df = lp_df.loc[
    lp_df['result'].lt(0.999) & lp_df['result'].gt(1 - 0.999)
]

In [None]:
# Show the (family, day) rows whose solution value is fractional.
unassigned_df

In [None]:
# Number of distinct families with a fractional value, plus the first ten of their ids.
unassigned_ids = unassigned_df['family_id']
unassigned_ids.nunique(), unassigned_ids.unique()[:10]

In [None]:
# Attach the size of every assigned family. assigned_df is a boolean-filtered
# selection of lp_df, so writing a column into it in place triggers pandas'
# SettingWithCopyWarning; assign() builds a new frame instead.
assigned_df = assigned_df.assign(
    family_size=FAMILY_SIZE[assigned_df['family_id'].to_numpy()]
)

In [None]:
# Total number of assigned people per day.
assigned_daily_occupancy = (
    assigned_df['family_size'].groupby(assigned_df['day']).sum()
)
assigned_daily_occupancy

In [None]:
# Visualise the daily head-count of the assigned families, with a dashed
# reference line drawn at 125 people.
sns.set_context("notebook")
ax = assigned_daily_occupancy.plot.bar(figsize=(20, 4), rot=90)
ax.plot([0, 100], [125, 125], linestyle='--', color='r', label='Min Line')
# Labels are given in the order matplotlib collects the handles (line first, then bars).
ax.legend(['Min Line', 'People Quantity'])
plt.show()