# ensemble

In [None]:
import json
from collections import defaultdict
import torch
from transformers import AutoModel, AutoTokenizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import re
import copy
from re import findall

In [25]:
def _load_predictions(path):
    """Parse one model's prediction file as JSON and return the result.

    The files carry a ``.csv`` suffix but are read with ``json.load``.
    """
    # The context manager closes the handle deterministically, and the
    # explicit encoding keeps decoding independent of the machine's locale.
    with open(path, encoding='utf-8') as fp:
        return json.load(fp)


# One parsed prediction object per model, bound to single-letter names.
a = _load_predictions('/opt/ml/ensemble/outputs_/sub_star1621500901.3.csv')
b = _load_predictions('/opt/ml/ensemble/outputs_/predictions_0.7101.csv')
c = _load_predictions('/opt/ml/ensemble/outputs_/prediction_sumbt_06295.csv')
d = _load_predictions('/opt/ml/ensemble/outputs_/output_tradebest_7383.csv')
e = _load_predictions('/opt/ml/ensemble/outputs_/output_trade_07345_.csv')
# NOTE(review): `f` reads the same file as `c`. Confirm this duplication is
# intentional and not a copy-paste slip.
f = _load_predictions('/opt/ml/ensemble/outputs_/prediction_sumbt_06295.csv')
g = _load_predictions('/opt/ml/ensemble/outputs_/output_tradepseudo_7679.csv')

In [27]:
# Normalise spacing inside the value part of every "<slot>-<value>" entry,
# rewriting each state list in place.
# Original author's note: include open-vocab models only.
# NOTE(review): the value is taken as the text after the LAST '-', whereas
# other cells split on the first two '-'; confirm values never contain '-'.
_SPACING_FIXES = (
    (" = ", "="), (" & ", "&"),
    (" =", "="), (" &", "&"),
    ("= ", "="), ("& ", "&"),
)
for model_output in [b, d, e, g]:
    for entries in model_output.values():
        for pos, entry in enumerate(entries):
            slot_part, value = entry.rsplit('-', 1)
            if value.count('(') == 1:
                # Exactly one '(': strip whitespace before it, then re-join
                # with a single space only when the value ends with ')'.
                joiner = " (" if value[-1] == ")" else "("
                head, tail = value.split('(')
                value = f"{head.rstrip()}{joiner}{tail}"
            # Applied in this exact order: padded on both sides first, then
            # left-padded, then right-padded.
            for old, new in _SPACING_FIXES:
                value = value.replace(old, new)
            entries[pos] = f"{slot_part}-{value}"

In [28]:
def recover_state(pred_slot):
    """Turn a per-slot value list back into a list of "<slot>-<value>" strings.

    ``pred_slot`` is paired positionally with the module-level ``slot_meta``;
    positions whose value is the string ``'none'`` are left out. Pairing stops
    at the shorter of the two sequences.
    """
    return [
        f'{slot}-{value}'
        for slot, value in zip(slot_meta, pred_slot)
        if value != 'none'
    ]

def make_every_slot(datas, weights):
    """Combine several models' states for one dialogue by weighted voting.

    Args:
        datas: one list per model of "<domain>-<slot>-<value>" strings.
        weights: one vote weight per model, paired positionally with ``datas``.

    Returns:
        The winning state as produced by ``recover_state``: one
        "<domain>-<slot>-<value>" string per slot in ``slot_meta`` whose
        winning value is not ``'none'``.

    Raises:
        IndexError: if an entry has fewer than three '-'-separated parts.
        ValueError: if ``slot_meta`` is non-empty and no (data, weight) pair
            is supplied.
    """
    # Parse each model's state once, not once per slot. Splitting at most
    # twice keeps any '-' inside the value intact.
    parsed = []
    for data in datas:
        slot_to_value = {}
        for entry in data:
            parts = entry.split('-', 2)
            slot_to_value[f'{parts[0]}-{parts[1]}'] = parts[2]
        parsed.append(slot_to_value)

    acc = []
    for slot in slot_meta:
        slot_cnt = defaultdict(float)
        for slot_to_value, weight in zip(parsed, weights):
            # A model that says nothing about this slot votes for 'none'.
            slot_cnt[slot_to_value.get(slot, 'none')] += weight
        # max() returns the first maximal key in insertion order, so ties go
        # to the value proposed by the earliest model.
        acc.append(max(slot_cnt, key=slot_cnt.get))

    return recover_state(acc)

In [30]:
def get_cls_token(sent_A):
    """Encode ``sent_A`` with the module-level tokenizer and model.

    Returns the second element of the model output (``outputs[1]``) as a NumPy
    array on the CPU.
    NOTE(review): for a BERT-style ``AutoModel`` this is presumably the pooled
    [CLS] representation, not the raw [CLS] hidden state; confirm.
    """
    model.eval()
    encoded = tokenizer(
        sent_A,
        return_tensors="pt",
        truncation=True,
        add_special_tokens=True,
        max_length=512,
    )
    # Only these three tensors are forwarded to the model.
    model_inputs = {
        'input_ids': encoded['input_ids'],
        'attention_mask': encoded['attention_mask'],
        'token_type_ids': encoded['token_type_ids'],
    }
    with torch.no_grad():  # disable gradient computation
        outputs = model(**model_inputs)
    return outputs[1].detach().cpu().numpy()

# Slot list and ontology from the training data. Context managers close the
# handles promptly; the explicit encoding avoids locale-dependent decoding.
with open("/opt/ml/input/data/train/slot_meta.json", encoding="utf-8") as fp:
    slot_meta = json.load(fp)
with open("/opt/ml/input/data/train/ontology.json", encoding="utf-8") as fp:
    ontology = json.load(fp)


MODEL_NAME = "kykim/bert-kor-base"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME)

# Pool the ontology values of every key whose name contains one of these
# Korean substrings into one flat candidate list.
# NOTE(review): presumably the name / departure / destination slots; confirm.
ontology_list = []
for key in ontology:
    if "이름" in key or "출발지" in key or "도착지" in key:
        ontology_list.extend(ontology[key])

# One embedding per candidate; the singleton axis 1 of the stacked
# get_cls_token results is squeezed away.
dataset_cls_hidden = np.array(
    [get_cls_token(q) for q in ontology_list]
).squeeze(axis=1)

In [31]:
# Weighted vote across all seven prediction sets, one dialogue id at a time.
# Every model is indexed with the SAME id, so differing key order between
# files cannot mix up dialogues. A model missing an id raises KeyError
# instead of silently truncating or misaligning the ensemble.
model_outputs = [a, b, c, d, e, f, g]
model_weights = [0.6344, 0.7101, 0.6295, 0.7383, 0.7345, 0.6295, 0.7679]

predicts = {}
for dlg_id in a:
    predicts[dlg_id] = make_every_slot(
        [output[dlg_id] for output in model_outputs],
        model_weights,
    )

In [24]:
# Snap predicted values of the selected slots to the closest ontology value
# when the embeddings are nearly identical (cosine similarity >= 0.98).
# The lookup depends only on the value text, so each distinct value is
# embedded once and its outcome reused.
corrected_values = {}
for i, predict in enumerate(predicts):
    print(f'\r{i/len(predicts)}', end='')
    data = predicts[predict]
    for j, v in enumerate(data):
        # Split at most twice so a '-' inside the value is preserved.
        parts = v.split('-', 2)
        sl = parts[:2]
        if "이름" in sl or "출발지" in sl or "도착지" in sl:
            val = parts[2]
            if val not in corrected_values:
                query_cls_hidden = get_cls_token(val)
                cos_sim = cosine_similarity(query_cls_hidden, dataset_cls_hidden)
                top_question = np.argmax(cos_sim)
                if ontology_list[top_question] != val and max(cos_sim[0]) >= 0.98:
                    corrected_values[val] = ontology_list[top_question]
                else:
                    corrected_values[val] = val
            # Write back by index: rebinding the loop variable would leave
            # `predicts` unchanged. Rebuilding from the parts touches only
            # the value, never the domain or slot text.
            data[j] = f"{parts[0]}-{parts[1]}-{corrected_values[val]}"

In [32]:
# Write the ensembled predictions as JSON. The context manager flushes and
# closes the file; ensure_ascii=False emits non-ASCII characters verbatim, so
# the file is opened as UTF-8 explicitly and not with the locale default.
with open('/opt/ml/ensemble/predictions002_cos.csv', 'w', encoding='utf-8') as fp:
    json.dump(predicts, fp, indent=2, ensure_ascii=False)