In [25]:
import sys
import re
import numpy as np
import pandas as pd
import json
from collections import Counter, OrderedDict
from math import sqrt as msqrt
import random
import subprocess

import torch
from torch import nn
from torch.utils.data import DataLoader, Dataset
import torch.optim as optim
import torch.nn.functional as F

from sklearn.model_selection import train_test_split

import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

import csv

In [40]:
def setup_seed(seed):
     torch.manual_seed(seed)
     torch.cuda.manual_seed_all(seed)
     np.random.seed(seed)
     random.seed(seed)
     torch.backends.cudnn.deterministic = True

setup_seed(1212)

pretrain_model_path = "data/pretrain.model"
evidence_selection_model_path = "data/selection.model"
evidence_validation_model_path = "data/validation.model"
preprocessing_result_path='data/preprocessing_word2idx.csv'
preprocessing_evidence_path='data/preprocessing_evidences.csv'

pad_idx = 0
max_len = 100
max_pred = 2
min_freq = 5

d_model = 256  # n_heads * d_k
n_heads = 4
n_layers = 4
p_dropout = .1
# BERT propability defined
p_mask = .8
p_replace = .1
p_do_nothing = 1 - p_mask - p_replace

p_dev = 0.1
batch_size = 128
epoch = 80
lr = 1e-4
weight_decay = 0

device = "cpu"
device = torch.device(device)

In [27]:
class BERT(nn.Module):
    def __init__(self, pretrain_model, max_vocab, max_len, d_out):
        super(BERT, self).__init__()
        self.max_vocab = max_vocab
        self.max_len = max_len
        self.d_out= d_out
        self.embedding = pretrain_model.embedding
        self.encoders = pretrain_model.encoders
        self.pooler = pretrain_model.pooler
        self.embedding.word_emb.weight.requires_grad = False
        self.embedding.seg_emb.weight.requires_grad = True
        self.embedding.pos_emb.weight.requires_grad = False
        for p in self.encoders.parameters():
            p.requires_grad = True
        self.pooler.fc.weight.requires_grad = True
        self.classifier_head = nn.Sequential(
            nn.Linear(d_model, d_out),
            nn.Softmax(dim=1) ,
        )

    def forward(self, tokens, segments):
        output = self.embedding(tokens, segments)
        for layer in self.encoders:
            output = layer(output, src_key_padding_mask=tokens.data.eq(pad_idx))
        output = self.classifier_head(torch.mean(output, dim=1))
        return output

    def gelu(self, x):
        '''
        Two way to implements GELU:
        0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3))))
        or
        0.5 * x * (1. + torch.erf(torch.sqrt(x, 2))) 
        '''
        return .5 * x * (1. + torch.erf(x / msqrt(2.)))
class Embeddings(nn.Module):
    def __init__(self, max_vocab, max_len):
        super(Embeddings, self).__init__()
        self.seg_emb = nn.Embedding(2, d_model)
        self.word_emb = nn.Embedding(max_vocab, d_model)
        self.pos_emb = nn.Embedding(max_len, d_model)
        self.norm = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(p_dropout)

    def forward(self, x, seg):
        '''
        x: [batch, seq_len]
        '''
        word_enc = self.word_emb(x)

        # positional embedding
        pos = torch.arange(x.shape[1], dtype=torch.long, device=device)
        pos = pos.unsqueeze(0).expand_as(x)
        pos_enc = self.pos_emb(pos)

        seg_enc = self.seg_emb(seg)
        x = self.norm(word_enc + pos_enc + seg_enc)
        return self.dropout(x)
        # return: [batch, seq_len, d_model]

class Pooler(nn.Module):
    def __init__(self):
        super(Pooler, self).__init__()
        self.fc = nn.Linear(d_model, d_model)
        self.tanh = nn.Tanh()

    def forward(self, x):
        '''
        x: [batch, d_model] (first place output)
        '''
        x = self.fc(x)
        x = self.tanh(x)
        return x

In [28]:
class TrainDataPreprocessor():
    def __init__(self, pretrain_result_path=preprocessing_result_path):
        print(f"Loading preprocessing result word2idx from {pretrain_result_path}")
        with open(pretrain_result_path, 'r') as file:
            reader = csv.reader(file)
            self.word2idx = {row[0]: int(row[1]) for row in list(reader)}
        self.vocab_size = len(self.word2idx)
        self.special_token_offset = self.word2idx[next(word for word in self.word2idx if not word.startswith("<"))]
    
    def preprocess(self, X, filter_empty=True):
        X = self.tokenize(X)
        X = self.lemmatize(X)
        X = self.filterWord(X, filter_empty)
        return X
    
    def get_ids(self, X):
        if isinstance(X[0], tuple):
            return [([self.word2idx.get(word, self.word2idx['<UNK>']) for word in evidence],
                  [self.word2idx.get(word, self.word2idx['<UNK>']) for word in claim]) for evidence, claim in X]
        elif isinstance(X[0], list):
            return [self.word2idx.get(word, self.word2idx['<UNK>']) for evidence in X for word in evidence]
        else:
            return [self.word2idx.get(word, self.word2idx['<UNK>']) for word in X]

    def tokenize(self, X):
        if isinstance(X[0], tuple):
            return [(word_tokenize(evidence), word_tokenize(claim)) for evidence, claim in X]
        elif isinstance(X, list):
            return [word_tokenize(evidence) for evidence in X]
        else:
            return word_tokenize(X)
    
     
    def lemmatize(self, X):
        lemmatizer = WordNetLemmatizer()
        if isinstance(X[0], tuple):
            return [([lemmatizer.lemmatize(word) for word in evidence], [lemmatizer.lemmatize(word) for word in claim]) for evidence, claim in X]
        elif isinstance(X[0], list):
            return [lemmatizer.lemmatize(word) for claim in X for word in claim]
        else:
            return [lemmatizer.lemmatize(word) for word in X]


    def filterWord(self, X, filter_empty=True):
        reg = re.compile(r'^[A-Za-z\-]+$')
        stop_words = set(stopwords.words('english'))
        if isinstance(X[0], tuple):
            return list(filter(lambda x: not filter_empty or (len(x[0]) > 0 and len(x[1]) > 0), 
                      [(list(filter(lambda word: (reg.match(word) and word not in stop_words) or word.isdigit(), evidence)),
                        list(filter(lambda word: (reg.match(word) and word not in stop_words) or word.isdigit(), claim)))
                         for evidence, claim in X]))
        elif isinstance(X[0], list):
            return list(filter(lambda x: not filter_empty or len(x) > 0, 
                      [list(filter(lambda word: (reg.match(word) and word not in stop_words) or word.isdigit(), evidence))
                         for evidence in X]))
        else:
            return list(filter(lambda word: (reg.match(word) and word not in stop_words) or word.isdigit(), X))

    def make_data(self, X, y):
        return [self.__get_one_case(evidence, claim, label) for (evidence, claim), label in zip(X, y)]

    def __get_one_case(self, evidence, claim, label):
        input_ids = [self.word2idx['<CLS>']] + evidence[:max_len-len(claim)-3] + [self.word2idx['<SEP>']] + claim + [self.word2idx['<SEP>']]
        segment_ids = [0] * (1 + min(len(evidence), max_len-len(claim)-3) + 1) + [1] * (1 + len(claim))

        # zero padding for tokens
        self.__padding(input_ids, max_len - len(input_ids))
        self.__padding(segment_ids, max_len - len(segment_ids))

        return [input_ids, segment_ids, label]

    def __padding(self, ids, n_pads):
        return ids.extend([pad_idx for _ in range(n_pads)])

class TrainDataset(Dataset):
    def __init__(self, input_ids, segment_ids, labels):
        super(TrainDataset, self).__init__()
        self.input_ids = input_ids
        self.segment_ids = segment_ids
        self.labels = labels

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, index):
        return self.input_ids[index], self.segment_ids[index], self.labels[index]
    
class TrainEvaluator():
    def __init__(self):
        self.batch_num = 0
        self.cls_total = 0
        self.cls_correct = torch.LongTensor([0]).to(device)
        self.loss = torch.FloatTensor([0]).to(device)
    def eval(self, predicts, labels, loss):
        self.cls_total += labels.size(0)
        self.batch_num += 1
        self.cls_correct += torch.sum(torch.argmax(predicts, dim=1).view(-1) == labels.view(-1))
        self.loss += loss
    def get_cls_acc(self):
        return self.cls_correct.item() / self.cls_total
    def get_loss(self):
        return self.loss.item() / self.batch_num 


In [29]:
def train(model, optimizer, traindataloader, devdataloader, epoch=epoch, path=evidence_selection_model_path):

    print('\nstart training, parameter total:{}, trainable:{}\n'.format(sum(p.numel() for p in model.parameters()), 
                                                                        sum(p.numel() for p in model.parameters() if p.requires_grad)))#看看模型的参数
    
    best_acc = 0
    criterion = nn.CrossEntropyLoss()

    for n in range(epoch):
        print(f'-------------------- epoch {n+1} --------------------')
        evaluator = TrainEvaluator()
        t_batch = len(traindataloader)
        for i, one_batch in enumerate(traindataloader):
            input_ids, segment_ids, labels = [ele.to(device) for ele in one_batch]
            predict = model(input_ids, segment_ids)
            loss = criterion(predict, labels)
            loss = (loss.float()).mean()
            evaluator.eval(predict, labels, loss)
            print('[ Epoch{}: {}/{} ] '.format(n+1, i+1, t_batch), end='\r')    
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        print(f'Epoch:{n + 1} [Train] loss: {evaluator.get_loss():.6f} \t cls_acc: {evaluator.get_cls_acc():.3f}')
        model.eval()
        evaluator = TrainEvaluator()
        t_batch = len(devdataloader)
        with torch.no_grad():
            for i, one_batch in enumerate(devdataloader):
                input_ids, segment_ids, labels = [ele.to(device) for ele in one_batch]
                predict = model(input_ids, segment_ids)
                loss = criterion(predict, labels)
                loss = (loss.float()).mean()
                evaluator.eval(predict, labels, loss)
        model.train()
        
        cls_acc, loss = evaluator.get_cls_acc(), evaluator.get_loss()
        print(f'Epoch:{n + 1} [Dev]   loss: {loss:.6f} \t cls_acc: {cls_acc:.3f}')
        if cls_acc > best_acc:
                # 如果validation的結果好于之前所有的結果，就把当下的模型存下來以便后续的预测使用
                best_acc = cls_acc
                torch.save(model, path)
                print(f'saved model with \t cls_acc: {cls_acc:.3f}')


In [34]:
preprocessor = TrainDataPreprocessor(preprocessing_result_path)

Loading preprocessing result word2idx from data/preprocessing_word2idx.csv


In [35]:
def preprocess_and_save_evidences(preprocessor, preprocessing_evidence_path=preprocessing_evidence_path):
    with open('../data/evidence.json', 'r') as file:
        evidences = list(json.load(file).values())
    evidences = preprocessor.preprocess(evidences, filter_empty=False)
    evidences = preprocessor.get_ids(evidences)
    with open(preprocessing_evidence_path, 'w') as file:
        writer = csv.writer(file)
        for evidence in evidences:
            writer.writerow(evidence)

def load_preprocessing_evidences(preprocessing_evidence_path=preprocessing_evidence_path):
    print(f"Loading preprocessing evidences from {preprocessing_evidence_path}")
    with open(preprocessing_evidence_path, 'r') as file:
        reader = csv.reader(file)
        return [[int(id) for id in row] for row in reader]

def load_evidence_claim_pairs_for_selection(claim_file_path):
    evidence_claim_pairs, labels = [], []
    with open(claim_file_path, 'r') as claims:
        claims = json.load(claims)
    preprocessing_evidences = load_preprocessing_evidences()
    for claim in claims.values():
        claim_text = claim['claim_text']
        claim_text = preprocessor.preprocess(claim_text)
        claim_ids = preprocessor.get_ids(claim_text)
        for i, evidence in enumerate(claim['evidences']):
            evidence_claim_pairs.append((preprocessing_evidences[int(evidence[9:])], claim_ids))
            labels.append(1)
        for _ in range(i):
            while True:
                random_evidence = random.choice(preprocessing_evidences)
                if random_evidence not in claim['evidences']:
                    break
            evidence_claim_pairs.append((random_evidence, claim_ids))
            labels.append(0)
    return evidence_claim_pairs, labels

def load_evidence_claim_pairs_for_validation(claim_file_path):
    evidence_claim_pairs, labels = [], []
    with open(claim_file_path, 'r') as claims:
        claims = json.load(claims)
    preprocessing_evidences = load_preprocessing_evidences()
    for claim in claims.values():
        if claim['claim_label'] == 'DISPUTED' or claim['claim_label'] == 'NOT_ENOUGH_INFO':
            # need optimize !!!!!!!!!!!!!!!!!
            continue 
        claim_text = claim['claim_text']
        claim_text = preprocessor.preprocess(claim_text)
        claim_ids = preprocessor.get_ids(claim_text)
        for i, evidence in enumerate(claim['evidences']):
            evidence_claim_pairs.append((preprocessing_evidences[int(evidence[9:])], claim_ids))
            if claim['claim_label'] == 'SUPPORTS':
                labels.append(1)
            elif claim['claim_label'] == 'REFUTES':
                labels.append(2)
            else:
                break
        # use evidence selected by the previous step
        # potential_evidences = select_evidence(claim_text)
        for _ in range(i):
            while True:
                random_evidence = random.choice(preprocessing_evidences)
                if random_evidence not in claim['evidences']:
                    break
            evidence_claim_pairs.append((random_evidence, claim_ids))
            labels.append(0)
            
    return evidence_claim_pairs, labels

def get_data_loader(preprocessor, evidence_claim_pairs, labels):
    data = preprocessor.make_data(evidence_claim_pairs, labels)
    data = [torch.LongTensor(ele) for ele in zip(*data)]
    return DataLoader(TrainDataset(*data), batch_size=batch_size, shuffle=True, num_workers = 8)

In [41]:
nltk.download('punkt')
# preprocess_and_save_evidences(preprocessor)
train_dataloader = get_data_loader(preprocessor, *load_evidence_claim_pairs_for_selection('data/train-claims.json'))
dev_dataloader = get_data_loader(preprocessor, *load_evidence_claim_pairs_for_selection('data/dev-claims.json'))

model = BERT(torch.load(pretrain_model_path), preprocessor.vocab_size, max_len, 2)
optimizer = optim.Adam(model.parameters(), lr=lr, weight_decay=weight_decay)
model.to(device)
train(model, optimizer, train_dataloader, dev_dataloader)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\rishe\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Loading preprocessing evidences from data/preprocessing_evidences.csv
Loading preprocessing evidences from data/preprocessing_evidences.csv


RuntimeError: Attempting to deserialize object on a CUDA device but torch.cuda.is_available() is False. If you are running on a CPU-only machine, please use torch.load with map_location=torch.device('cpu') to map your storages to the CPU.

In [None]:
def select_evidence(selection_model, claim_text, preprocessing_evidences, limit=5):
    selection_model.eval()
    claim_text = preprocessor.preprocess(claim_text)
    claim_ids = preprocessor.get_ids(claim_text)
    evidence_claim_pairs = [(evidence_ids, claim_ids) for evidence_ids in preprocessing_evidences]
    data = preprocessor.make_data(evidence_claim_pairs, [0]*len(evidence_claim_pairs))
    data = [torch.LongTensor(ele) for ele in zip(*data)]
    data_loader = DataLoader(TrainDataset(*data), batch_size=1024, shuffle=False, num_workers = 8)
    result_idx, result_labels, result_prob = [], [], []
    with torch.no_grad():
        t_batch = len(data_loader)
        idx = 0 
        for i, one_batch in enumerate(data_loader):
            input_ids, segment_ids, _ = [ele.to(device) for ele in one_batch]
            predict = selection_model(input_ids, segment_ids)
            result_idx.extend(range(idx, idx + len(predict)))
            result_labels.extend(torch.argmax(predict, dim=1).tolist())
            result_prob.extend(torch.max(predict, dim=1).tolist())
            idx += len(predict)
            print(f'[ {i+1}/{t_batch} ] ', end='\r')
    final_result_set = sorted([(idx, label, prob) for idx, label, prob in zip(result_idx, result_labels, result_prob)], key=lambda x: x[2], reverse=True)[:limit]
    return [(idx, preprocessing_evidences[idx]) for idx, _, _ in final_result_set]

def find_evidence_text(ids):
    with open('/kaggle/input/train-dataset/evidence.json', 'r') as file:
        evidences = json.load(file)
        return [evidences["evidence-"+str(id)] for id in ids]

In [None]:
selection_model = torch.load(evidence_selection_model_path).to(device)
preprocessing_evidences = load_preprocessing_evidences(preprocessing_evidence_path)
potential_evidence = select_evidence(selection_model, "[South Australia] has the most expensive electricity in the world.", preprocessing_evidences)
for text in find_evidence_text([id for id, _ in potential_evidence]):
    print(text)

In [None]:
train_dataloader = get_data_loader(preprocessor, *load_evidence_claim_pairs_for_validation('../data/train-claims.json'))
dev_dataloader = get_data_loader(preprocessor, *load_evidence_claim_pairs_for_validation('../data/dev-claims.json'))

model = BERT(torch.load(pretrain_model_path), preprocessor.vocab_size, max_len, 3)
optimizer = optim.Adam(model.parameters(), lr=lr, weight_decay=weight_decay)
model.to(device)
train(model, optimizer, train_dataloader, dev_dataloader, path=evidence_validation_model_path)

In [None]:
def predict_single_claim(selection_model, validation_model, claim_text, preprocessing_evidences, limit=5):
    validation_model.eval()
    potential_evidences = select_evidence(selection_model, claim_text, preprocessing_evidences, limit)
    if not potential_evidences:
        return 'NOT_ENOUGH_INFO', []

    claim_text = preprocessor.preprocess(claim_text)
    claim_ids= preprocessor.get_ids(claim_text)
    evidence_claim_pairs = [(evidence_ids, claim_ids) for evidence_ids in potential_evidences]
    data = preprocessor.make_data(evidence_claim_pairs, [0]*len(evidence_claim_pairs))
    data = [torch.LongTensor(ele) for ele in zip(*data)]
    data_loader = DataLoader(TrainDataset(*data), batch_size=1024, shuffle=False, num_workers = 8)
    result_idx, result_labels, result_prob = [], [], []
    with torch.no_grad():
        idx = 0
        for one_batch in data_loader:
            input_ids, segment_ids, _ = [ele.to(device) for ele in one_batch]
            predict = validation_model(input_ids, segment_ids)
            result_idx.extend(range(idx, idx + len(predict)))
            result_labels.extend(torch.argmax(predict, dim=1).tolist())
            result_prob.extend(torch.max(predict, dim=1).tolist())
            idx += len(predict)
    
    final_result_set = sorted([(idx, label, prob) for idx, label, prob in zip(result_idx, result_labels, result_prob)], key=lambda x: x[2], reverse=True)[:limit]
    if all(label == 0 for _, label, _ in final_result_set):
        return 'NOT_ENOUGH_INFO', [potential_evidences[idx] for idx, _, _ in final_result_set]
    if any(label == 1 for _, label, _ in final_result_set) and not any(label == 2 for _, label, _ in final_result_set):
        return 'SUPPORTS', [potential_evidences[idx] for idx, label, _ in final_result_set if label == 1]
    if any(label == 2 for _, label, _ in final_result_set) and not any(label == 1 for _, label, _ in final_result_set):
        return 'REFUTES', [potential_evidences[idx] for idx, label, _ in final_result_set if label == 2]
    if any(label == 1 for _, label, _ in final_result_set) and any(label == 2 for _, label, _ in final_result_set):
        return 'DISPUTED', [potential_evidences[idx] for idx, label, _ in final_result_set if label == 2 or label == 1]
    return 'NOT_ENOUGH_INFO', [potential_evidences[idx] for idx, _, _ in final_result_set]

In [None]:
selection_model, validation_model = torch.load(evidence_selection_model_path).to(device), torch.load(evidence_validation_model_path).to(device)
preprocessing_evidences = load_preprocessing_evidences(preprocessing_evidence_path)
label, evidence = predict_single_claim(selection_model, validation_model,"[South Australia] has the most expensive electricity in the world.", preprocessing_evidences)
print(label)
print(find_evidence_text(evidence))

In [None]:
def predict_claims_and_save(claim_file_path='data/dev-claims.json', output_file_path='data/predict-output.json'):
    selection_model, validation_model = torch.load(evidence_selection_model_path), torch.load(evidence_validation_model_path)
    preprocessing_evidences = load_preprocessing_evidences(preprocessing_evidence_path, preprocessing_evidences)
    with open(claim_file_path, 'r') as claims:
        claims = json.load(claims)

    result = OrderedDict()   
    for claim_id, claim in claims.items():
        claim_text = claim['claim_text']
        label, evidence = predict_single_claim(selection_model, validation_model, claim_text)
        result[claim_id] = {'claim_text': claim_text, 'claim_label': label, 'evidences': evidence}
    with open(output_file_path, 'w') as output:
            output.write(json.dumps(result))

predict_claims_and_save()