In [None]:
# Competition page: https://www.biendata.xyz/competition/haihua_2021/
# Reference: https://www.biendata.xyz/models/category/6353/

import os
import sys
import time
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import json
import random

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.cuda.amp import autocast, GradScaler
from tqdm import tqdm
from sklearn.model_selection import StratifiedKFold
from transformers import BertForMultipleChoice, BertTokenizer, AdamW, get_cosine_schedule_with_warmup

# Debug switch: when True, a later cell truncates the train/test frames and lowers the epoch count.
DEBUG = False
# Wall-clock timestamp taken when this cell runs.
global_start_t = time.time()

# Marker that the cell ran to completion.
print('ok')

In [None]:
# --- Training split: one passage with N questions becomes N rows ---
with open('../input/haihua-mrc-data/train.json', 'r', encoding='utf-8') as f:
    train_data = json.load(f)

train_rows = []
for passage in train_data:
    passage_text = passage['Content']
    for question in passage['Questions']:
        # The question dict is extended in place with the text of its passage.
        question['Content'] = passage_text
        train_rows.append(question)
train_df = pd.DataFrame(train_rows)

# --- Evaluation split (validation.json): same flattening, plus the
# passage-level 'Type' and 'Diff' fields copied onto every question ---
with open('../input/haihua-mrc-data/validation.json', 'r', encoding='utf-8') as f:
    test_data = json.load(f)

test_rows = []
for passage in test_data:
    passage_text = passage['Content']
    passage_questions = passage['Questions']
    passage_type = passage['Type']
    passage_diff = passage['Diff']
    for question in passage_questions:
        question.update({
            'Content': passage_text,
            'Type': passage_type,
            'Diff': passage_diff,
        })
        test_rows.append(question)
test_df = pd.DataFrame(test_rows)

# Persist both flattened tables; a later cell reloads them from these CSV files.
train_df.to_csv('./train.csv', index=False)
test_df.to_csv('./test.csv', index=False)
print('ok')

In [None]:
# Index of the single cross-validation fold trained by this run; the fold loop skips all others.
FOLD_IDX = 3

# Hyper-parameters and run settings shared by the cells below.
CFG = {
    'fold_num': 5,  # number of StratifiedKFold splits
    'seed': 42,  # seed passed to seed_all() and to the fold splitter
    'model': 'hfl/chinese-macbert-large',  # pretrained checkpoint name for tokenizer and model
    'max_len': 312,  # tokenizer max_length (inputs are padded/truncated to this)
    'epochs': 3,  # training epochs per fold (set to 2 when DEBUG is on)
    'eval_per_step_num': 500,  # run validation every this many optimizer steps
    'patient_epoch': 1.5,  # early-stopping patience, expressed in epochs
    'train_bs': 1,  # training DataLoader batch size
    'valid_bs': 1,  # validation DataLoader batch size
    'lr': 2e-5,  # AdamW learning rate
    'num_workers': 4,  # DataLoader worker processes
    'accum_iter': 2,  # batches accumulated per optimizer step
    'weight_decay': 0.001,  # AdamW weight decay
}

def seed_all(random_seed=42):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random``, NumPy and PyTorch (CPU and all CUDA devices)
    and configures cuDNN for deterministic behaviour.

    Args:
        random_seed: Integer seed applied to all generators.
    """
    # NOTE: PYTHONHASHSEED is read at interpreter start-up, so this only
    # affects processes launched after this call, not the running kernel.
    os.environ['PYTHONHASHSEED'] = str(random_seed)
    random.seed(random_seed)
    np.random.seed(random_seed)
    torch.manual_seed(random_seed)
    # Seed every visible GPU, not only the current one (no-op without CUDA).
    torch.cuda.manual_seed_all(random_seed)
    torch.backends.cudnn.deterministic = True
    # Autotuning may select different kernels between runs; turn it off so
    # the deterministic flag above is actually effective.
    torch.backends.cudnn.benchmark = False
    
# Seed all RNGs before any model or data loader is created.
seed_all(CFG['seed'])
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('device: ', device)

print('ok')

In [None]:
# Reload the flattened tables from the CSV files in the working directory.
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

# Encode the answer letter as a class index (A -> 0 ... D -> 3); the test
# frame gets a constant 0 so both frames expose a 'label' column.
ANSWER_LETTERS = ['A', 'B', 'C', 'D']
train_df['label'] = train_df['Answer'].apply(lambda letter: ANSWER_LETTERS.index(letter))
test_df['label'] = 0

print('before train_df.shape: ', train_df.shape, 'test.shape: ', test_df.shape)

# In debug mode keep only the leading rows of each frame and shorten training.
SAMPLED_TRAIN_NUM = 613
SAMPLED_TEST_NUM = 507
if DEBUG:
    train_df = train_df.head(SAMPLED_TRAIN_NUM)
    test_df = test_df.head(SAMPLED_TEST_NUM)
    CFG['epochs'] = 2

print('after train_df.shape: ', train_df.shape, 'test.shape: ', test_df.shape)

# Tokenizer matching the pretrained checkpoint; used by collate_fn.
tokenizer = BertTokenizer.from_pretrained(CFG['model'])

print('ok')

In [None]:
class MyDataset(Dataset):
    """Multiple-choice dataset backed by a DataFrame.

    Each row must provide the columns ``label``, ``Question``, ``Content``
    and ``Choices``. ``Choices`` may be a real list or the string form of a
    list (which is what a CSV round-trip of a list column produces).
    """

    def __init__(self, dataframe):
        self.df = dataframe

    def __len__(self):
        return len(self.df)

    @staticmethod
    def _parse_choices(raw):
        """Return the options stored in one ``Choices`` cell as a new list."""
        import ast

        if isinstance(raw, (list, tuple)):
            # Copy so that padding below never mutates the DataFrame cell.
            return list(raw)
        try:
            parsed = ast.literal_eval(raw)
        except (ValueError, SyntaxError):
            parsed = None
        if isinstance(parsed, (list, tuple)):
            return [str(option) for option in parsed]
        # Fallback: the original quote-based split, for cells that are not a
        # valid Python list literal.
        return raw[2:-2].split('\', \'')

    def __getitem__(self, idx):
        """Expand one (passage, question, options) row into per-option pairs.

        Returns:
            A tuple ``(content, pair, label)`` where ``content`` repeats the
            passage once per option, ``pair`` holds ``question + ' ' + option``
            with the first two characters of each option removed, and
            ``label`` is the row's label value.
        """
        label = self.df.label.values[idx]
        question = self.df.Question.values[idx]
        content = self.df.Content.values[idx]
        choice = self._parse_choices(self.df.Choices.values[idx])
        # Rows with fewer than four options are padded with a dummy option.
        choice.extend(['D. 不知道'] * (4 - len(choice)))
        content = [content] * len(choice)
        pair = [question + ' ' + option[2:] for option in choice]
        return content, pair, label
    
def collate_fn(data):
    """Tokenize a batch of dataset items into model-ready tensors.

    Each item is ``(contents, pairs, label)``. Every question+option string is
    encoded together with its passage, padded/truncated to ``CFG['max_len']``.

    Args:
        data: List of items produced by the dataset's ``__getitem__``.

    Returns:
        ``(input_ids, attention_mask, token_type_ids, label)``; the first
        three have shape (batch, n_choices, max_len).
    """
    input_ids, attention_mask, token_type_ids = [], [], []
    for sample in data:
        encoded = tokenizer(sample[1], text_pair=sample[0], padding='max_length', truncation=True,
                            max_length=CFG['max_len'], return_tensors='pt')
        # Keep the tensors as returned; stacking them below avoids converting
        # to nested Python lists and back.
        input_ids.append(encoded['input_ids'])
        attention_mask.append(encoded['attention_mask'])
        token_type_ids.append(encoded['token_type_ids'])
    input_ids = torch.stack(input_ids)
    attention_mask = torch.stack(attention_mask)
    token_type_ids = torch.stack(token_type_ids)
    label = torch.tensor([sample[-1] for sample in data])
    return input_ids, attention_mask, token_type_ids, label

class AverageMeter:
    """Tracks the latest value and the weighted running mean of a metric."""

    def __init__(self):
        self.reset()

    def reset(self):
        """Set the last value, mean, weighted sum and count back to zero."""
        self.val = self.avg = self.sum = self.count = 0

    def update(self, val, n=1):
        """Record ``val`` with weight ``n`` and refresh the running mean."""
        weighted = val * n
        self.val = val
        self.sum += weighted
        self.count += n
        self.avg = self.sum / self.count
        
def train_model(model, train_loader, val_loader):
    """Train ``model`` for one epoch with gradient accumulation.

    Relies on the module-level ``optimizer``, ``scheduler``, ``criterion``,
    ``CFG`` and ``device``, and on the per-fold counters declared ``global``
    below. Every ``CFG['eval_per_step_num']`` optimizer steps the model is
    validated and, if accuracy improved, its weights are saved. The epoch is
    cut short once more than ``patient_step_num`` optimizer steps have passed
    since the best validation result.

    Args:
        model: Network to train; called as ``model(ids, mask, type_ids).logits``.
        train_loader: Loader yielding ``(input_ids, attention_mask, token_type_ids, y)``.
        val_loader: Loader used for the periodic validation.

    Returns:
        ``(mean training loss, mean training accuracy)`` over processed batches.
    """
    global fold_step_num, fold_best_step_num, fold_best_acc, patient_step_num

    model.train()
    losses = AverageMeter()
    accs = AverageMeter()

    optimizer.zero_grad()
    tk = tqdm(train_loader, total=len(train_loader), position=0, leave=True)
    for step, (input_ids, attention_mask, token_type_ids, y) in enumerate(tk):
        input_ids, attention_mask, token_type_ids, y = (input_ids.to(device), attention_mask.to(device),
                                                        token_type_ids.to(device), y.to(device).long())
        output = model(input_ids, attention_mask, token_type_ids).logits

        # Scale the loss so accumulated gradients average over the accumulation window.
        loss = criterion(output, y) / CFG['accum_iter']
        loss.backward()

        # Step after every accum_iter batches, and on the last batch of the epoch.
        if ((step + 1) % CFG['accum_iter'] == 0) or ((step + 1) == len(train_loader)):
            optimizer.step()
            optimizer.zero_grad()
            scheduler.step()

            fold_step_num += 1
            if fold_step_num % CFG['eval_per_step_num'] == 0:
                val_loss, val_acc = test_model(model, val_loader)
                # test_model() leaves the network in eval mode; switch back so
                # the remaining batches of this epoch train with dropout enabled.
                model.train()
                if val_acc > fold_best_acc:
                    fold_best_acc = val_acc
                    fold_best_step_num = fold_step_num
                    torch.save(model.state_dict(), '{}_fold_best.pt'.format(CFG['model'].split('/')[-1]))
                print(f'in train() after test_model() mid step, fold_step_num: {fold_step_num} '
                      f'fold_best_acc:{fold_best_acc:.3f} val_acc: {val_acc:.3f} val_loss: {val_loss:.3f}')

            # Early exit: too many optimizer steps without a new best accuracy.
            if (fold_step_num - fold_best_step_num) > patient_step_num:
                break

        acc = (output.argmax(1) == y).sum().item() / y.size(0)
        # Undo the accumulation scaling so the reported loss is per batch.
        losses.update(loss.item() * CFG['accum_iter'], y.size(0))
        accs.update(acc, y.size(0))

        tk.set_postfix(loss=losses.avg, acc=accs.avg)

    return losses.avg, accs.avg
        
def test_model(model, val_loader):
    """Evaluate ``model`` on ``val_loader`` without computing gradients.

    Uses the module-level ``criterion`` and ``device``. The model is put into
    eval mode and is NOT switched back; callers that continue training must
    call ``model.train()`` themselves.

    Args:
        model: Network to evaluate; called as ``model(ids, mask, type_ids).logits``.
        val_loader: Loader yielding ``(input_ids, attention_mask, token_type_ids, y)``.

    Returns:
        ``(mean loss, mean accuracy)`` weighted by batch size.
    """
    model.eval()

    losses = AverageMeter()
    accs = AverageMeter()

    with torch.no_grad():
        tk = tqdm(val_loader, total=len(val_loader), position=0, leave=True)
        for input_ids, attention_mask, token_type_ids, y in tk:
            input_ids, attention_mask, token_type_ids, y = (input_ids.to(device), attention_mask.to(device),
                                                            token_type_ids.to(device), y.to(device).long())
            output = model(input_ids, attention_mask, token_type_ids).logits

            loss = criterion(output, y)
            batch_size = y.size(0)
            acc = (output.argmax(1) == y).sum().item() / batch_size
            losses.update(loss.item(), batch_size)
            accs.update(acc, batch_size)

            tk.set_postfix(loss=losses.avg, acc=accs.avg)

    return losses.avg, accs.avg
        
# Marker that the cell ran to completion.
print('ok')

In [None]:
# Stratified K-fold split on the answer label; only fold FOLD_IDX is trained.
folds = StratifiedKFold(n_splits=CFG['fold_num'], shuffle=True,
                        random_state=CFG['seed']).split(
                        np.arange(train_df.shape[0]), train_df.label.values)
cv = []
train_start_t = time.time()
for fold, (trn_idx, val_idx) in enumerate(folds):
    if fold != FOLD_IDX:
        print(f'fold: {fold} is not FOLD_IDX: {FOLD_IDX}, pass!')
        continue

    print('fold: ', fold)
    fold_start_t = time.time()

    # The splitter yields row positions, so select by position rather than by
    # index label (the two only coincide for a default RangeIndex).
    train = train_df.iloc[trn_idx]
    val = train_df.iloc[val_idx]
    train_set = MyDataset(train)
    val_set = MyDataset(val)

    train_loader = DataLoader(train_set, batch_size=CFG['train_bs'], collate_fn=collate_fn,
                              shuffle=True, num_workers=CFG['num_workers'])
    val_loader = DataLoader(val_set, batch_size=CFG['valid_bs'], collate_fn=collate_fn,
                            shuffle=False, num_workers=CFG['num_workers'])

    # Per-fold state shared with train_model() through module-level globals.
    fold_best_acc = 0
    fold_step_num, fold_best_step_num = 0, 0
    # Patience in optimizer steps. len(train_loader) already counts batches,
    # so only the accumulation factor is divided out (not the batch size again).
    patient_step_num = int(CFG['patient_epoch'] * len(train_loader) / CFG['accum_iter'])
    model = BertForMultipleChoice.from_pretrained(CFG['model']).to(device)

    # Not referenced by train_model()/test_model() in this notebook; kept so
    # the name stays defined for any code that expects it.
    scaler = GradScaler()
    optimizer = AdamW(model.parameters(), lr=CFG['lr'], weight_decay=CFG['weight_decay'])
    criterion = nn.CrossEntropyLoss()
    # Cosine schedule: warm-up of about one epoch of optimizer steps, decaying
    # over all epochs.
    scheduler = get_cosine_schedule_with_warmup(optimizer, len(train_loader) // CFG['accum_iter'],
                                                CFG['epochs'] * len(train_loader) // CFG['accum_iter'])

    for epoch in range(CFG['epochs']):
        print('epoch: ', epoch)
        epoch_start_t = time.time()
        time.sleep(0.2)  # presumably lets pending output flush before tqdm draws — TODO confirm

        train_loss, train_acc = train_model(model, train_loader, val_loader)
        val_loss, val_acc = test_model(model, val_loader)
        if val_acc > fold_best_acc:
            fold_best_acc = val_acc
            # Record the step of this new best so patience is measured from here.
            fold_best_step_num = fold_step_num
            torch.save(model.state_dict(), '{}_fold_best.pt'.format(CFG['model'].split('/')[-1]))
        print(f'epoch {epoch} finished, cost time: {time.time() - epoch_start_t:.2f} sec')

        # train_model() only breaks out of its own batch loop; stop the epoch
        # loop too once patience is exhausted.
        if (fold_step_num - fold_best_step_num) > patient_step_num:
            print(f'early stop after epoch {epoch}: '
                  f'{fold_step_num - fold_best_step_num} steps without improvement')
            break

    print(f'fold {fold} finished, cost time: {time.time() - fold_start_t:.2f} sec')
    cv.append(fold_best_acc)

print('cv is ', cv, 'cv mean is ', np.mean(cv))
print(f'Train finished here, total cost time: {time.time() - train_start_t:.2f} sec')

In [None]:
# List the working directory (e.g. to check the files written by the cells above).
!ls ./