In [None]:
import torch
import torch.nn as nn

import math
import random
import pickle
import json
import copy
import os
import numpy as np
import bisect
import time
# Inference is pinned to the CPU.  To use a GPU when one is present, swap in:
#   device = torch.device("cuda:2" if torch.cuda.is_available() else "cpu")
device = 'cpu'

# Length of one per-machine event sequence (size of the first position
# embedding in EncModel).
MaxSeq = 120
# Number of machine sequences scored per forward pass.
MaxNum = 16

data_path = '../data/'
cache_path = '../cache/'
# Directory the checkpoint is loaded from; use './' for a local run.
model_path = '/kaggle_simulations/agent/'

# Starting estimate assigned to every machine before the model has scored it.
init_prob = 0.5

def get_mask(seq_len):
    """Return a (seq_len, seq_len) boolean tensor on ``device``.

    Entries strictly above the main diagonal are True, everything else is
    False.
    """
    upper = np.triu(np.ones((seq_len, seq_len)), k=1)
    mask = torch.from_numpy(upper.astype('bool'))
    return mask.to(device)

def get_pos(seq_len):
    """Return the indices 0..seq_len-1 as a (1, seq_len) tensor on ``device``."""
    positions = torch.arange(seq_len)
    # Leading axis of size 1 so the result broadcasts over a batch.
    return torch.unsqueeze(positions, 0).to(device)


def cnt_cut(x):
    """Clamp ``x`` from below: return 0 when ``x <= 0``, otherwise ``x``."""
    return 0 if x <= 0 else x
    
class EncModel(nn.Module):
    """Two-stage Transformer encoder that scores a window of machine sequences.

    Stage 1 embeds every token of every sequence (token embedding plus
    position embedding), runs ``transformer_encoder1`` along the sequence
    axis and keeps the output at the last position as that sequence's
    summary vector.  Stage 2 adds a slot-position embedding and a type
    embedding to each summary, runs ``transformer_encoder2`` across the
    slots and projects every slot to a single scalar.

    Args:
        MaxNum: number of sequence slots scored per sample.
        embed_dim: embedding width; also used as the feed-forward width of
            both encoders.
        nlayers1: number of layers in the per-sequence encoder.
        nlayers2: number of layers in the cross-slot encoder.
        nheads: attention heads in both encoders.
        dropout: dropout rate in both encoders.

    Notes:
        * ``cnt_embd`` takes no part in ``forward``.  The layer is still
          created so the parameter set (and therefore the ``state_dict``
          keys) stays unchanged.
        * ``pos1`` / ``pos2`` are plain tensor attributes built by
          ``get_pos``; they are not registered buffers, so ``Module.to()``
          does not move them.
    """

    def __init__(self, MaxNum, embed_dim=128, nlayers1=2, nlayers2=2, nheads=8, dropout=0):
        super().__init__()
        self.MaxNum = MaxNum
        self.embed_dim = embed_dim

        # Embedding tables.  Creation order is kept stable so that seeded
        # initialisation is reproducible.
        self.pos_embd1 = nn.Embedding(MaxSeq, embed_dim)
        self.pos_embd2 = nn.Embedding(self.MaxNum, embed_dim)
        self.seq_embd = nn.Embedding(9, embed_dim)
        self.type_embd = nn.Embedding(self.MaxNum+2, embed_dim)
        self.cnt_embd = nn.Embedding(2002, embed_dim)

        # Stage 1: encoder over the positions of a single sequence.
        self.layer_normal1 = nn.LayerNorm(embed_dim)
        encoder_layers1 = nn.TransformerEncoderLayer(embed_dim, nheads, embed_dim, dropout)
        self.transformer_encoder1 = nn.TransformerEncoder(encoder_layers1, nlayers1)

        # Stage 2: encoder over the slots of one sample.
        self.layer_normal2 = nn.LayerNorm(embed_dim)
        encoder_layers2 = nn.TransformerEncoderLayer(embed_dim, nheads, embed_dim, dropout)
        self.transformer_encoder2 = nn.TransformerEncoder(encoder_layers2, nlayers2)

        # Constant index tensors fed to the two position embeddings.
        self.pos1 = get_pos(MaxSeq)
        self.pos2 = get_pos(self.MaxNum)
        self.fc1 = nn.Linear(embed_dim, 1)

    def forward(self,x,x_type,x_cnt):
        """Score every slot of every sample.

        Args:
            x: integer tensor of shape (B, N, MaxSeq) holding token ids in
                [0, 9).  N is expected to equal ``MaxNum`` so that the slot
                position embedding lines up with the slots.
            x_type: integer tensor of shape (B, N) with ids in
                [0, MaxNum + 2).
            x_cnt: accepted for interface compatibility and ignored.

        Returns:
            Float tensor of shape (B, N): one score per slot.
        """
        sizeB, sizeN = x.shape[0], x.shape[1]

        # (B, N, S) -> (B, N, S, D): token embedding plus position embedding.
        tokens = self.seq_embd(x) + self.pos_embd1(self.pos1)

        # Fold the slots into the batch axis and go sequence-first, which is
        # the layout the encoder is fed: (S, B*N, D).
        tokens = tokens.reshape(sizeB*sizeN, MaxSeq, -1).permute(1, 0, 2)
        tokens = self.transformer_encoder1(self.layer_normal1(tokens))

        # Output at the last position summarises each sequence: (B, N, D).
        slots = tokens[-1, :, :].reshape(sizeB, sizeN, -1)
        slots = slots + self.pos_embd2(self.pos2) + self.type_embd(x_type)

        slots = slots.permute(1, 0, 2)  # (N, B, D)
        slots = self.transformer_encoder2(self.layer_normal2(slots))
        slots = slots.permute(1, 0, 2)  # (B, N, D)

        return self.fc1(slots).squeeze(-1)  # (B, N)
    
# Build the network with the hyper-parameters the checkpoint is loaded into.
# MaxNum comes from the module constant so the architecture and the
# checkpoint file name (which embeds MaxNum) cannot drift apart.
model = EncModel(MaxNum=MaxNum,
                 embed_dim=128,
                 nlayers1=2,
                 nlayers2=3,
                 nheads=4,
            )
model.load_state_dict(torch.load(model_path+'santa1_16000_{}.pt'.format(MaxNum),map_location=device))
model.to(device)
# The model is only used for inference: put it in evaluation mode.
model.eval()

# decay[k] is the factor applied to a machine's estimate after it has been
# pulled k times in total: 0.97**k for the first 200 pulls, 0 afterwards.
decay = {}
for pos in range(4000):
    decay[pos] = 0.97**pos if pos < 200 else 0


# Event codes appended to a machine's sequence (-1 marks a slot not yet used):
#   0      the machine was pulled on the previous step but by nobody now
#   1      pulled by the opponent only
#   2 / 3  pulled by me only, without / with a reward
#   4 / 5  pulled by both players, my pull without / with a reward
def update(sample,m,rtn,rtn_action,i):
    """Fold step ``i`` of ``sample`` into the tracking state.

    Args:
        sample: indexable whose items 1, 2 and 3 are the per-step lists of my
            actions, the opponent's actions and my rewards.
        m: per-machine event sequences (one 1-D array per machine).
        rtn: 2-D array; each row is the sequence of the machine the opponent
            pulled on one of the most recent steps, oldest first.
        rtn_action: 1-D array of the opponent's most recent actions, oldest
            first.
        i: index of the step to apply.

    Returns:
        ``(m, rtn, rtn_action)`` as fresh copies; the arguments themselves
        are left untouched.  A sequence whose first slot is no longer -1 is
        considered full and is not extended any further.
    """
    machines = m.copy()
    history = rtn.copy()
    action_history = rtn_action.copy()

    def push(idx, code):
        # Drop the oldest slot and append `code`, unless the sequence is full.
        seq = machines[idx]
        if seq[0] == -1:
            machines[idx] = np.append(seq[1:], code)

    my_action = sample[1][i]
    opp_action = sample[2][i]
    my_reward = sample[3][i]

    if i > 0:
        # A machine pulled on the previous step that nobody pulls now gets a 0.
        prev_my = sample[1][i-1]
        prev_opp = sample[2][i-1]
        current = (my_action, opp_action)
        if prev_my not in current:
            push(prev_my, 0)
        # Skip prev_opp when it is the machine that was just handled above.
        if prev_my != prev_opp and prev_opp not in current:
            push(prev_opp, 0)

    if my_action == opp_action:
        push(my_action, 5 if my_reward else 4)
    else:
        push(opp_action, 1)
        push(my_action, 3 if my_reward else 2)

    # Slide both windows by one step and store the opponent's newest pull.
    history[:-1, :] = rtn[1:, :]
    history[-1, :] = machines[opp_action]

    action_history[:-1] = rtn_action[1:]
    action_history[-1] = opp_action

    return machines, history, action_history


def get_action(step):
    """Choose the machine to pull on this step.

    Reads the module-level state ``m_prob``, ``m_cnt``, ``m_cnt_my``,
    ``decay``, ``init_prob``, ``my_action_list`` and ``op_action_list``.

    Policy:
      1. Discount every machine's estimate by ``decay`` of its total pull
         count.  A machine pulled exactly once, and only by the opponent, is
         additionally capped at the discounted ``init_prob``.
      2. On the first 20 steps and after step 1950, play the machine with the
         highest discounted estimate (random tie-break).
      3. Otherwise follow the opponent's last machine when its estimate,
         boosted by up to 5%, beats the 90th percentile of all estimates.
      4. Otherwise explore: among machines at or above that percentile which
         I did not pull on my last five steps, take the least-pulled one.
         Ties go to the higher estimate, then to the higher index.  If no
         machine qualifies, take the best estimate.

    Args:
        step: current step number of the episode.

    Returns:
        Index of the machine to pull.
    """
    n_machines = len(m_prob)

    t_prob = []
    for i in range(n_machines):
        weight = decay[m_cnt[i]]
        estimate = m_prob[i] * weight
        if m_cnt[i] == 1 and m_cnt_my[i] == 0:
            estimate = min(estimate, init_prob * weight)
        t_prob.append(estimate)

    if step > 1950 or step < 20:
        # Always take the maximum; the tiny noise only breaks ties.
        return np.argmax(t_prob + np.random.random(n_machines)/1e7).item()

    thre = np.percentile(t_prob, 90)

    # Bonus for the opponent's last machine: larger the less of its pulls
    # were mine, capped at 5% and clamped at 0.
    op_last = op_action_list[-1]
    rate = (1 - m_cnt_my[op_last]/m_cnt[op_last])*(1 - m_cnt_my[op_last]/20)
    rate = min(0.05, max(0, rate/5))
    if m_cnt[op_last] == 1 and m_cnt_my[op_last] == 0:
        rate = 0
    if t_prob[op_last]*(1 + rate) > thre:
        return op_last

    recent = my_action_list[-5:]
    m_score = []
    for action in range(n_machines):
        score = t_prob[action]
        if score >= thre and action not in recent:
            # Exploration candidate: the +10000 puts it above every
            # non-candidate, and subtracting the pull count ranks candidates
            # by fewest pulls first, then by estimate.
            score = score + 10000 - m_cnt[action]
        m_score.append(score)

    return np.argmax(m_score + np.array(range(n_machines))/1e7).item()
    

# ---- Mutable episode state shared by agent() and get_action() ----

# Wall-clock seconds spent in each agent() call.
time_list = []

# Cumulative reward reported by the environment on the previous step.
total_reward = 0
label = []
# Per-step history: my pulls, the opponent's pulls and my per-step rewards.
my_action_list = []
op_action_list = []
my_reward_list = []
# Every machine the opponent has pulled so far.
op_machine = set()

# One event sequence per machine; -1 marks a slot not yet used.
machine = [np.full(MaxSeq, -1, dtype='int') for _ in range(100)]
# Sliding windows over the opponent's recent pulls: the pulled machines'
# sequences and the pulled machines' indices.
rtn = np.full([MaxNum, MaxSeq], -1, dtype='int8')
rtn_action = np.full(MaxNum, -1, dtype='int8')

# Pull counters per machine: total, opponent only, me only.
m_cnt = [0] * 100
m_cnt_op = [0] * 100
m_cnt_my = [0] * 100

# Current model estimate per machine.
m_prob = [init_prob] * 100

# Per-step snapshots of m_prob and m_cnt.
m_prob_list = []
m_cnt_list = []
def agent(observation, configuration):
    """Entry point called once per step; returns the machine index to pull.

    On step 0 a random machine is pulled.  On every later step the function
    records both players' previous pulls, rebuilds the model input from the
    tracked machine sequences, refreshes ``m_prob`` for the machines inside
    the input window and lets ``get_action`` pick the next machine.

    Args:
        observation: mapping providing ``'step'``, ``'reward'`` (cumulative),
            ``'agentIndex'`` and ``'lastActions'``.
        configuration: unused.

    Returns:
        Index of the machine to pull.

    Note:
        Only ``total_reward`` is reset on step 0; the remaining module-level
        history is never cleared, so one episode per process is assumed.
    """
    global total_reward, machine, rtn, rtn_action
    s_time = time.time()

    if observation['step'] == 0:
        my_pull = random.randrange(100)
        total_reward = 0
    else:
        # The environment reports the cumulative reward; recover the last one.
        last_reward = observation['reward'] - total_reward
        total_reward = observation['reward']

        my_idx = observation['agentIndex']
        my_last_action = observation['lastActions'][my_idx]
        op_last_action = observation['lastActions'][1-my_idx]
        m_cnt[my_last_action] += 1
        m_cnt[op_last_action] += 1
        m_cnt_op[op_last_action] += 1
        m_cnt_my[my_last_action] += 1
        op_machine.add(op_last_action)

        my_action_list.append(my_last_action)
        op_action_list.append(op_last_action)
        my_reward_list.append(last_reward)

        # Same tuple layout as the samples used at training time.
        sample = (label, my_action_list, op_action_list, my_reward_list)

        machine, rtn, rtn_action = update(sample, machine, rtn, rtn_action, len(sample[1])-1)

        # Machine sequences: the opponent's most recent MaxNum-1 pulls followed
        # by the machine I pulled last.  Shifted by +1 so the empty marker -1
        # becomes token 0.
        x = np.zeros([MaxNum, MaxSeq], dtype='int8') - 1
        x[:-1, :] = rtn[1:, :]
        x[-1, :] = machine[my_last_action]
        x = torch.tensor(x + 1).to(device).long().unsqueeze(0)

        # Machine index behind every slot of x (-1 for an empty slot).
        x_rtn_action = np.zeros(MaxNum, dtype='int8') - 1
        x_rtn_action[:-1] = rtn_action[1:]
        x_rtn_action[-1] = my_last_action

        # Machine type: walking from the newest slot backwards, each distinct
        # machine gets the next id 1, 2, ...; empty slots keep 0.
        dic_tmp = {}
        x_rtn_type = np.zeros(MaxNum, dtype='int8')
        for i, action in enumerate(x_rtn_action[::-1]):
            if action == -1:
                break
            if action not in dic_tmp:
                dic_tmp[action] = len(dic_tmp) + 1
            x_rtn_type[MaxNum-1-i] = dic_tmp[action]
        x_rtn_type = torch.tensor(x_rtn_type).to(device).long().unsqueeze(0)

        # Step counter attached to every slot, clamped at 0.
        pos = len(my_action_list) - 1
        x_rtn_cnt = np.array([
            cnt_cut(pos+1) if i == MaxNum - 1 else cnt_cut(pos-MaxNum+i+3)
            for i in range(MaxNum)
        ])
        x_rtn_cnt = torch.tensor(x_rtn_cnt).to(device).long().unsqueeze(0)

        # Pure inference: skip autograd bookkeeping for the forward pass.
        with torch.no_grad():
            output = model(x, x_rtn_type, x_rtn_cnt)

        # Later slots overwrite earlier ones when a machine appears twice.
        slot_scores = output[0].tolist()
        for i, action in enumerate(x_rtn_action):
            if action == -1:
                continue
            m_prob[action] = slot_scores[i]/100

        m_prob_list.append(m_prob.copy())
        m_cnt_list.append(m_cnt.copy())
        my_pull = get_action(observation['step'])

        print('op_last_action:',m_prob[op_last_action]*decay[m_cnt[op_last_action]])
    time_list.append(time.time()-s_time)

    return my_pull
