In [114]:
# Load the riddle train / validation splits from the Kaggle input directory.
# Later cells read a 'riddle' column, a 'label' column and 'choice0'..'choice4'
# columns from these frames.
import pandas as pd
train_df = pd.read_csv('../input/littlesoldierproject-data/train_riddle.csv')
val_df = pd.read_csv('../input/littlesoldierproject-data/val_riddle.csv')

In [115]:
pip install sentence_transformers

In [116]:
from transformers import AutoTokenizer, AutoModelForMaskedLM
import torch
from transformers import TrainingArguments
from transformers import DataCollatorWithPadding
from transformers import Trainer
import numpy as np
from datasets import load_metric
from datasets import load_dataset
from datasets import Dataset
checkpoint = "nghuyong/ernie-1.0"

In [117]:
# Load the checkpoint's tokenizer and show how each special token is encoded
# when no extra special tokens are added around it.
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
for special_token in ('[UNK]', '[PAD]', '[MASK]', '[CLS]', '[SEP]'):
    print(tokenizer(special_token, add_special_tokens=False))

In [118]:
# Load the "accuracy" metric and print the loaded metric object.
metrics = load_metric("accuracy")
print(metrics)

In [119]:
# then reformat the data
# every riddle has the form XXXX（打一YY）; rewrite it as the prompt "XXXX这个YY是____"

def clean_riddle(riddle):
    """Turn a raw riddle string into a fill-in-the-blank prompt.

    The riddle is split on the full-width '（'. The text before it is the
    question; the text after it is the answer-category hint, e.g. '打一字）'.
    The hint is reduced to a category key and the prompt is built as
    ``question + '这个' + key + '是'``.

    Args:
        riddle: Raw riddle text containing a '（...）' hint.

    Returns:
        A ``(prompt, question)`` tuple, where ``question`` is the text before
        the first '（'.

    Raises:
        IndexError: If ``riddle`` contains no '（'.
    """
    riddle_clean = riddle.split('（')
    question, hint = riddle_clean[0], riddle_clean[1]

    if '打一' in hint:
        # Drop the two-character '打一' prefix and the closing bracket.
        raw_key = hint[2:-1]
    elif '打' in hint[:1]:
        # Hint starts with a bare '打': drop it and the closing bracket ...
        raw_key = hint[1:-1]
        if '一' in hint[-2:]:
            # ... and also a trailing '一' that sits just before the bracket.
            raw_key = hint[1:-2]
    else:
        # No '打' prefix at all: only the closing bracket is removed.
        raw_key = hint[:-1]

    key = raw_key

    if raw_key == '物':
        key = '东西'
    # endswith() instead of raw_key[-1]: an empty key (e.g. hint '打一）')
    # must not raise IndexError.
    if raw_key.endswith('名'):
        # '地名' -> '地', '人名' -> '人', ...
        key = raw_key[:-1]

    prompt = question + '这个' + key + '是'
    return prompt, question

In [120]:
# NOTE(review): pdb is not referenced in any visible cell — likely leftover debugging.
import pdb
# Re-creates the tokenizer from the same checkpoint; process_data below reads this global.
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

def process_data(df, usage:str):
    """Convert raw riddle rows into masked-LM input / target pairs.

    For every row the riddle is rewritten by ``clean_riddle`` into a prompt,
    the correct choice is appended to form the target sentence, and the prompt
    is extended with one '[MASK]' per answer token to form the model input.

    Args:
        df: Frame with a 'riddle' column, a 'label' column holding the index
            (0-4) of the correct choice, and 'choice0'..'choice4' columns.
        usage: 'eval' adds the distractor and question-type columns; any other
            value produces only 'text' and 'label'.

    Returns:
        DataFrame with 'text' (masked prompt) and 'label' (prompt + answer).
        For ``usage == 'eval'`` it also has 'label_index' (the four wrong
        choices joined by '、') and 'question_type' (0: one-character riddle,
        2: riddle containing '，', 1: everything else).

    Raises:
        AssertionError: If the masked prompt and the target sentence tokenize
            to a different number of tokens.
    """
    final_data = []
    for _, row in df.iterrows():
        riddle_clean, raw_question = clean_riddle(row['riddle'])
        text_label = row['choice' + str(row['label'])]
        # Compare by value: `is not` on integers only works through CPython's
        # small-int caching and is always True for NumPy integer labels.
        wrong_answers = ['choice' + str(i) for i in range(5) if i != row['label']]
        concat_wa = '、'.join(row[name] for name in wrong_answers[:4])

        # One [MASK] per token of the answer, so input and target line up.
        label_length = len(tokenizer(text_label, add_special_tokens=False)['input_ids'])
        text = riddle_clean + '[MASK]' * label_length
        true_label = riddle_clean + text_label

        if len(raw_question) == 1:
            types = 0
        elif '，' in row['riddle']:
            types = 2
        else:
            types = 1

        masked_len = len(tokenizer(text, add_special_tokens=False)['input_ids'])
        target_len = len(tokenizer(true_label, add_special_tokens=False)['input_ids'])
        assert masked_len == target_len
        final_data.append((text, true_label, row['label'], concat_wa, types))

    final_df = pd.DataFrame()
    final_df['text'] = [data[0] for data in final_data]
    final_df['label'] = [data[1] for data in final_data]
    # `==`, not `is`: string identity depends on interning.
    if usage == 'eval':
        final_df['label_index'] = [data[3] for data in final_data]
        final_df['question_type'] = [data[4] for data in final_data]
    return final_df

# Build the masked-LM frames; only the 'eval' frame carries the distractor
# ('label_index') and 'question_type' columns.
final_train_df = process_data(train_df, 'train')
final_val_df = process_data(val_df, 'eval')

In [121]:
print(final_val_df)

In [122]:
# done with data processing, start training
def _encode_fixed_length(texts, **kwargs):
    """Tokenize ``texts``, truncating / padding every sequence to 64 tokens."""
    return tokenizer(texts, truncation=True, padding='max_length', max_length=64, **kwargs)


def tokenize_function_train(example):
    """Tokenize a training batch for masked-LM fine-tuning.

    Args:
        example: Batch mapping with 'text' (masked prompt) and 'label'
            (full target sentence) columns.

    Returns:
        The tokenizer output for 'text' plus a 'label' entry holding the
        token ids of the target sentence.
    """
    # NOTE(review): the label ids cover every position, including [PAD]; the
    # training loop passes them to the model as `labels`, so unmasked / padded
    # positions presumably contribute to the loss — confirm whether they
    # should be set to -100 instead.
    label_ids = _encode_fixed_length(example['label'])['input_ids']
    # Kept so code that inspects the batch after the call still sees the ids.
    example['label'] = label_ids
    encoded = _encode_fixed_length(example["text"])
    # Return the new column explicitly instead of relying on Dataset.map
    # picking up the in-place edit of its input batch.
    encoded['label'] = label_ids
    return encoded

def tokenize_function_eval(example):
    """Tokenize an evaluation batch.

    Same as ``tokenize_function_train`` and additionally tokenizes the
    'label_index' column (the joined wrong choices) without special tokens.

    Args:
        example: Batch mapping with 'text', 'label' and 'label_index' columns.

    Returns:
        The tokenizer output for 'text' plus 'label' and 'label_index'
        entries holding token ids.
    """
    label_ids = _encode_fixed_length(example['label'])['input_ids']
    distractor_ids = _encode_fixed_length(example['label_index'], add_special_tokens=False)['input_ids']
    # Kept so code that inspects the batch after the call still sees the ids.
    example['label'] = label_ids
    example['label_index'] = distractor_ids
    encoded = _encode_fixed_length(example["text"])
    encoded['label'] = label_ids
    encoded['label_index'] = distractor_ids
    return encoded

# Wrap the processed frames as Hugging Face datasets.
train_dataset = Dataset.from_pandas(final_train_df)
val_dataset = Dataset.from_pandas(final_val_df)

# Tokenize in batches; the eval variant also tokenizes the 'label_index' column.
tokenized_train_datasets = train_dataset.map(tokenize_function_train, batched=True)
tokenized_val_datasets = val_dataset.map(tokenize_function_eval, batched=True)

print(tokenized_train_datasets)
print(tokenized_val_datasets)

In [123]:
# Drop the raw text, rename 'label' to 'labels' (the name under which the
# training loop forwards it to the model via **batch) and switch to torch format.
# NOTE(review): this cell reassigns its own inputs, so running it a second time
# presumably fails because 'text' / 'label' no longer exist — confirm.
tokenized_train_datasets = tokenized_train_datasets.remove_columns(["text"])
tokenized_train_datasets = tokenized_train_datasets.rename_column('label', 'labels')
tokenized_train_datasets.set_format("torch")
tokenized_val_datasets = tokenized_val_datasets.remove_columns(["text"])
tokenized_val_datasets = tokenized_val_datasets.rename_column('label', 'labels')
tokenized_val_datasets.set_format("torch")

In [124]:
print(tokenized_train_datasets)

In [125]:
print(tokenized_train_datasets['labels'])

In [126]:
# DataCollatorWithPadding is already imported in the import cell above.
from transformers import DataCollatorWithPadding
# Collator used by both DataLoaders to assemble batches.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [127]:
from torch.utils.data import DataLoader

# Training batches are shuffled; evaluation keeps the dataset order.
# Both use batches of 8 assembled by data_collator.
train_dataloader = DataLoader(
    tokenized_train_datasets, shuffle=True, batch_size=8, collate_fn=data_collator
)
eval_dataloader = DataLoader(
    tokenized_val_datasets, batch_size=8, collate_fn=data_collator
)

In [128]:
print(train_dataloader)

In [129]:
# Peek at the first evaluation batch, hiding the auxiliary scoring columns.
for batch in eval_dataloader:
    break
# Exact key match: `k not in 'label_index'` was a substring test on the string.
{k: v for k, v in batch.items() if k not in ('label_index', 'question_type')}

In [130]:
# Peek at the first training batch.
for batch in train_dataloader:
    break
# Exact key match: `k not in 'label_index'` was a substring test on the string.
{k: v for k, v in batch.items() if k != 'label_index'}


In [131]:
print(tokenized_train_datasets)

In [132]:
def compute_metrics(eval_preds):
    """Compute accuracy and macro-averaged recall / F1 / precision from logits.

    Args:
        eval_preds: ``(logits, labels)`` pair. The class axis of ``logits`` is
            the last one; ``labels`` holds the reference class ids.

    Returns:
        Dict with keys 'accuracy', 'recall', 'f1' and 'precision'.
    """
    logits, labels = eval_preds
    # Flatten both sides so token-level (batch, seq) outputs are scored the
    # same way as plain (batch,) classification outputs.
    predictions = np.argmax(logits, axis=-1).reshape(-1)
    references = np.asarray(labels).reshape(-1)

    # (metric name, extra compute() arguments); accuracy takes no averaging mode.
    metric_specs = (
        ('accuracy', {}),
        ('recall', {'average': 'macro'}),
        ('f1', {'average': 'macro'}),
        ('precision', {'average': 'macro'}),
    )
    metrics = {}
    for name, extra_kwargs in metric_specs:
        scorer = load_metric(name)
        metrics[name] = scorer.compute(predictions=predictions, references=references, **extra_kwargs)[name]
    return metrics

In [133]:
# try training
model = AutoModelForMaskedLM.from_pretrained(checkpoint)

In [134]:
# NOTE(review): the AdamW shipped with transformers has been deprecated upstream
# in favour of torch.optim.AdamW — confirm against the installed version.
from transformers import AdamW

# Optimizer over all model parameters with learning rate 5e-5.
optimizer = AdamW(model.parameters(), lr=5e-5)

In [135]:
from transformers import get_scheduler

# One scheduler step is taken per training batch, so the schedule length is
# epochs x batches-per-epoch.
num_epochs = 10
num_training_steps = num_epochs * len(train_dataloader)
# "linear" schedule with no warmup steps.
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)
print(num_training_steps)

In [136]:
import torch

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)
print(device)

In [137]:
from tqdm.auto import tqdm

progress_bar = tqdm(range(num_training_steps))

# Hand-written fine-tuning loop: one optimizer and scheduler step per batch.
model.train()
for epoch in range(num_epochs):
    for batch in train_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        # Only the loss is read, so the per-layer hidden states are not requested.
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)

In [138]:
# Peek at the first evaluation batch, hiding the auxiliary scoring columns.
for batch in eval_dataloader:
    break
# Exact key match: `k not in 'label_index'` was a substring test on the string.
{k: v for k, v in batch.items() if k not in ('label_index', 'question_type')}


In [139]:
print(tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens([6])))

In [140]:
# Peek at the first training batch, hiding the auxiliary scoring columns.
for batch in train_dataloader:
    break
# Exact key match: `k not in 'label_index'` was a substring test on the string.
{k: v for k, v in batch.items() if k not in ('label_index', 'question_type')}

In [152]:
pred_result = []

# Columns carried in each batch for scoring only; they are not model inputs.
aux_keys = ('label_index', 'question_type')
# Look the [MASK] id up from the tokenizer instead of hard-coding it.
mask_token_id = tokenizer.mask_token_id

# Run the fine-tuned model over the validation set and collect, per riddle,
# (predicted text, gold answer text, distractor texts, question type).
model.eval()
for batch in eval_dataloader:
    label_index = batch['label_index']
    question_type = batch['question_type']
    batch_size = batch['labels'].shape[0]
    # Exact key match; `k not in 'label_index'` was a substring test on the string.
    batch = {k: v.to(device) for k, v in batch.items() if k not in aux_keys}
    with torch.no_grad():
        # Only the logits are read below, so hidden states are not requested.
        outputs = model(**batch)

    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1)

    input_numpy = batch['input_ids'].detach().cpu().numpy()
    labels_numpy = batch['labels'].detach().cpu().numpy()
    predictions_numpy = predictions.detach().cpu().numpy()
    seq_len = input_numpy.shape[1]

    # Positions of the [MASK] tokens in each input row.
    masked_ans = [
        [i for i in range(seq_len) if input_numpy[j][i] == mask_token_id]
        for j in range(batch_size)
    ]
    for i in range(batch_size):
        if len(masked_ans[i]) < 1:
            # Show the offending row; the span lookup below will fail on it.
            print(tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(input_numpy[i])))
            print(tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(labels_numpy[i])))
    # The answer span runs from the first to the last [MASK] (end exclusive).
    start_pos = [masked[0] for masked in masked_ans]
    end_pos = [masked[-1] + 1 for masked in masked_ans]
    print(start_pos, end_pos)
    mlm_pred = [predictions_numpy[i][start_pos[i]:end_pos[i]] for i in range(batch_size)]
    mlm_label = [labels_numpy[i][start_pos[i]:end_pos[i]] for i in range(batch_size)]

    other_labels_numpy = label_index.detach().cpu().numpy()
    question_types_numpy = question_type.detach().cpu().numpy().tolist()

    # Decode ids back to text and strip the spaces from the decoded strings.
    pred_texts = [tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(mlm_pred[i])) for i in range(batch_size)]
    pred_text = [''.join(pred_texts[j].split(' ')) for j in range(batch_size)]
    pred_lbls = [tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(mlm_label[i])) for i in range(batch_size)]
    pred_lbl = [''.join(pred_lbls[j].split(' ')) for j in range(batch_size)]
    # Distractors were stored as one '、'-joined string; remove padding and split.
    other_mlm_lbl = [tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(other_labels_numpy[i])) for i in range(batch_size)]
    other_lbls = [other_mlm_lbl[i].replace("[PAD]", '').strip(' ').split('、') for i in range(batch_size)]
    # Slice instead of range(4) so a row with fewer than four parts does not raise.
    other_lbl = [[''.join(part.split(' ')) for part in other_lbls[i][:4]] for i in range(batch_size)]
    print(pred_text)
    for i in range(batch_size):
        pred_result.append((pred_text[i], pred_lbl[i], other_lbl[i], question_types_numpy[i]))

In [153]:
print(pred_result)

In [154]:
pip install sentence_transformers

In [155]:
def cosine_similarity(x,y):
    """Cosine similarity between two vectors.

    Args:
        x: NumPy vector.
        y: NumPy vector of the same length as ``x``.

    Returns:
        ``x . y / (|x| * |y|)``, or 0 when either vector has zero norm
        (instead of NaN plus a runtime warning).
    """
    num = x.dot(y.T)
    denom = np.linalg.norm(x) * np.linalg.norm(y)
    if denom == 0:
        # `num * 0.0` keeps the same shape / float type the division would give.
        return num * 0.0
    return num / denom

In [156]:
# then use something to match the definition-output-true label
# train a model to pair words and their definition


from sentence_transformers import SentenceTransformer

# NOTE(review): this rebinds the global `model`, which until now held the
# fine-tuned masked-LM; re-running the training / evaluation cells after this
# cell would pick up the sentence encoder instead.
model = SentenceTransformer('sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2')

In [157]:
def best_suit(pred, label, other):
    """Score one prediction by embedding similarity against the candidates.

    ``pred``, ``label`` and every entry of ``other`` are encoded with the
    global ``model`` and compared to ``pred`` by cosine similarity.

    Args:
        pred: Predicted answer text.
        label: Correct answer text.
        other: Iterable of wrong-answer texts.

    Returns:
        1 if no wrong answer is strictly more similar to ``pred`` than
        ``label`` is, otherwise 0. The value is also printed.
    """
    pred_embed = model.encode(pred)
    true_sim = cosine_similarity(pred_embed, model.encode(label))
    # any() instead of max([...]): same result for non-empty `other`, and an
    # empty `other` counts as a win instead of raising ValueError.
    beaten = any(
        true_sim < cosine_similarity(pred_embed, model.encode(other_label))
        for other_label in other
    )
    result = int(not beaten)
    print(result)
    return result

In [180]:
def best_suit_hard(pred, label, other):
    """Score one prediction by character overlap against the candidates.

    The overlap with a candidate is the number of characters of ``pred`` that
    occur in that candidate. Both overlap figures are printed.

    Args:
        pred: Predicted answer text.
        label: Correct answer text.
        other: Iterable of wrong-answer texts.

    Returns:
        1 if ``label`` has strictly the highest overlap, 0 if some wrong
        answer has a higher one, and ``1 / (1 + k)`` when ``label`` ties with
        ``k`` wrong answers for the highest overlap.
    """
    true_mutual = sum(1 for chara in pred if chara in label)
    print(true_mutual)
    other_mutual = [sum(1 for chara in pred if chara in other_label) for other_label in other]
    print(other_mutual)

    # The comparison is computed once; the former 500-iteration loop repeated
    # the identical calculation. default=-1 lets an empty `other` count as a win.
    other_max = max(other_mutual, default=-1)
    if true_mutual > other_max:
        return 1
    if true_mutual < other_max:
        return 0
    # Tie: split the credit evenly among the label and the tied wrong answers.
    total_other = other_mutual.count(other_max)
    return 1 / (1 + total_other)

In [159]:
# Score every prediction with the embedding-based matcher and report the mean.
preds_ans = [
    best_suit(pred, label, other_label)
    for pred, label, other_label, question_type in pred_result
]
total = len(preds_ans)
acc = sum(preds_ans)

print(acc / total)

In [160]:
ls=[]
lssmall = []
lssent = []
typess = []

# Strip the "（...）" hint from every riddle (column 0 of val_df) and bucket the
# remaining text: 0 = at most one character, 1 = no '，', 2 = contains '，'.
for i in range(val_df.shape[0]):
    string = val_df.iloc[i,0]
    index = string.find(' ')
    if index == -1 or index == 0:
        index = string.find('（')
    # No separator found (e.g. this cell already ran once): keep the string as
    # is. Slicing with -1 would silently drop its last character.
    if index != -1:
        string = string[0:index]
    val_df.iloc[i,0] = string

    if len(string) <= 1:
        typess.append(0)
    elif '，' not in string:
        typess.append(1)
    else:
        typess.append(2)

In [162]:
# Sum the per-question scores and the question counts for each riddle type (0, 1, 2).
preci = [0] * 3
totalss = [0] * 3
for i, score in enumerate(preds_ans):
    bucket = typess[i]
    preci[bucket] += score
    totalss[bucket] += 1

print(preci)
print(totalss)

In [181]:
# Score every prediction with the character-overlap matcher and report the mean.
preds_ans = [
    best_suit_hard(pred, label, other_label)
    for pred, label, other_label, question_type in pred_result
]
total = len(preds_ans)
acc = sum(preds_ans)

print(acc / total)

In [183]:
# Sum the per-question scores and the question counts for each riddle type (0, 1, 2).
preci = [0] * 3
totalss = [0] * 3
for i, score in enumerate(preds_ans):
    bucket = typess[i]
    preci[bucket] += score
    totalss[bucket] += 1

print(preci)
print(totalss)

In [None]:
# Sum the per-question scores and the question counts for each riddle type (0, 1, 2).
preci = [0 for _ in range(3)]
totalss = [0 for _ in range(3)]
for i in range(len(preds_ans)):
    preci[typess[i]] += preds_ans[i]
    totalss[typess[i]] += 1

# The list is named `preci`; `precis` was never defined and raised NameError.
print(preci)
print(totalss)

In [185]:
# Split the collected result tuples into columns and save them together with
# the per-question scores from the most recent scoring run.
preds = [item[0] for item in pred_result]
labels = [item[1] for item in pred_result]
other_labels = [item[2] for item in pred_result]

result_df = pd.DataFrame()
for column, values in (
    ('pred', preds),
    ('label', labels),
    ('other_label', other_labels),
    ('pred_ans', preds_ans),
):
    result_df[column] = values
result_df.to_csv('bert_prompt_result_ernie.csv')