notebook1
## PART 1. Document retrieval

In [None]:
from pathlib import Path
from functools import partial
import re
import numpy as np
import pandas as pd
import jieba
import scipy

import json
import pickle
import re
from dataclasses import dataclass
from pathlib import Path
from typing import Dict, List, Set, Tuple, Union
from pathlib import Path
from functools import partial

jieba.set_dictionary("dict.txt.big")
# Download "dict.txt.big" from https://github.com/fxsjy/jieba

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity, linear_kernel, paired_cosine_distances

from pandarallel import pandarallel

# Adjust the number of workers if you want
pandarallel.initialize(progress_bar=True, verbose=0, nb_workers=4)

from tqdm import tqdm

tqdm.pandas()  # for progress_apply

from utils import (
    generate_evidence_to_wiki_pages_mapping,
    jsonl_dir_to_df,
    load_json,
    load_model,
    save_checkpoint,
    set_lr_scheduler,
)

In [None]:

@dataclass
class Claim:
    """Type-hint wrapper for the text of a claim."""
    # Raw claim text.
    data: str


@dataclass
class AnnotationID:
    """Type-hint wrapper for the first field of an evidence tuple (see `Evidence`)."""
    # Integer identifier; presumably the annotation id from the dataset -- TODO confirm.
    id: int


@dataclass
class EvidenceID:
    """Type-hint wrapper for the second field of an evidence tuple (see `Evidence`)."""
    # Integer identifier; presumably the evidence id from the dataset -- TODO confirm.
    id: int


@dataclass
class PageTitle:
    """Type-hint wrapper for the third field of an evidence tuple: the wiki page title."""
    # Title of the wikipedia page (read as `evidence[2]` throughout the notebook).
    title: str


@dataclass
class SentenceID:
    """Type-hint wrapper for the fourth field of an evidence tuple: the sentence index."""
    # Index of the sentence inside its page (read as `evidence[3]` and used,
    # after `str()`, as a key into the page's sentence mapping).
    id: int


@dataclass
class Evidence:
    """Type-hint wrapper for a claim's evidence: a list of evidence sets.

    Each evidence set is a list of 4-field entries ordered as
    (annotation id, evidence id, page title, sentence id).
    """
    # Outer list: alternative evidence sets; inner list: the sentences of one set.
    data: List[List[Tuple[AnnotationID, EvidenceID, PageTitle, SentenceID]]]


In [None]:

def save_doc(
        data: List[Dict[str, Union[int, Claim, Evidence]]],
        predictions: pd.Series,
        mode: str = "train",
        num_pred_doc: int = 5,
) -> None:
    """Write every record, enriched with its predicted pages, to a JSONL file.

    The output goes to ``data/{mode}_doc{num_pred_doc}.jsonl`` (one JSON object
    per line, non-ASCII characters kept as-is).

    Args:
        data: Records to dump. Each record is modified in place: a
            ``"predicted_pages"`` key holding a list is added to it.
        predictions: Series aligned positionally with ``data``; element ``i`` is
            the iterable of page titles predicted for ``data[i]``.
        mode: Split name used in the output file name.
        num_pred_doc: Number used only as a tag in the output file name.
    """
    output_path = f"data/{mode}_doc{num_pred_doc}.jsonl"
    with open(output_path, "w", encoding="utf8") as out_file:
        for position, record in enumerate(data):
            # Predictions may be sets, which json cannot serialise; store a list.
            record["predicted_pages"] = list(predictions.iloc[position])
            serialised = json.dumps(record, ensure_ascii=False)
            out_file.write(serialised + "\n")


In [None]:
def calculate_precision(
        data: List[Dict[str, Union[int, Claim, Evidence]]],
        predictions: pd.Series,
) -> float:
    """Print and return the macro-averaged page precision of the retrieval.

    Claims labelled "NOT ENOUGH INFO" are ignored. For every other claim the
    precision is ``|predicted ∩ gold| / |predicted|``; a claim with no predicted
    page contributes 0 but is still counted in the average.

    Args:
        data: Claims with ``"label"`` and ``"evidence"`` keys.
        predictions: Series aligned positionally with ``data``; element ``i`` is
            the collection of predicted page titles for ``data[i]``.

    Returns:
        The macro precision, or 0.0 when no claim was evaluated (the previous
        version raised ZeroDivisionError in that case and returned None
        otherwise, although callers assign the result).
    """
    precision_sum = 0.0
    count = 0

    for i, d in enumerate(data):
        if d["label"] == "NOT ENOUGH INFO":
            continue

        # Ground-truth page titles; evidence[2] is the title of the wiki page.
        gt_pages = {
            evidence[2]
            for evidence_set in d["evidence"]
            for evidence in evidence_set
        }

        # Coerce to a set so list predictions (e.g. reloaded from JSON) work too.
        predicted_pages = set(predictions.iloc[i])
        if predicted_pages:
            precision_sum += len(predicted_pages & gt_pages) / len(predicted_pages)

        count += 1

    # Macro precision; guard against an evaluation set with no verifiable claim.
    macro_precision = precision_sum / count if count else 0.0
    print(f"Precision: {macro_precision}")
    return macro_precision


def calculate_recall(
        data: List[Dict[str, Union[int, Claim, Evidence]]],
        predictions: pd.Series,
) -> float:
    """Print and return the macro-averaged page recall of the retrieval.

    Claims labelled "NOT ENOUGH INFO" are ignored. For every other claim the
    recall is ``|predicted ∩ gold| / |gold|``; a claim without any gold page
    contributes 0 (instead of raising ZeroDivisionError) and is still counted.

    Args:
        data: Claims with ``"label"`` and ``"evidence"`` keys.
        predictions: Series aligned positionally with ``data``; element ``i`` is
            the collection of predicted page titles for ``data[i]``.

    Returns:
        The macro recall, or 0.0 when no claim was evaluated (the previous
        version raised ZeroDivisionError in that case and returned None
        otherwise, although callers assign the result).
    """
    recall_sum = 0.0
    count = 0

    for i, d in enumerate(data):
        if d["label"] == "NOT ENOUGH INFO":
            continue

        # Ground-truth page titles; evidence[2] is the title of the wiki page.
        gt_pages = {
            evidence[2]
            for evidence_set in d["evidence"]
            for evidence in evidence_set
        }
        # Coerce to a set so list predictions (e.g. reloaded from JSON) work too.
        predicted_pages = set(predictions.iloc[i])
        if gt_pages:
            recall_sum += len(predicted_pages & gt_pages) / len(gt_pages)
        count += 1

    # Guard against an evaluation set with no verifiable claim.
    macro_recall = recall_sum / count if count else 0.0
    print(f"Recall: {macro_recall}")
    return macro_recall

In [None]:

# Post-filter parameters, read as module-level globals by get_pred_docs_sklearn.
# Threshold compared against the cosine *distance* between a candidate page and
# the claim: candidates after the first `save_at_least` are cut off at the first
# one whose distance exceeds this value.
least_similarity = 0.85
# Number of leading matched pages that are always kept without a distance check.
save_at_least = 2

In [None]:

# Load the Traditional Chinese stopword list from the TCSP package:
# https://github.com/bryanchw/Traditional-Chinese-Stopwords-and-Punctuations-Library
from TCSP import read_stopwords_list

# Passed to `tokenize` (via functools.partial) to drop stopwords after segmentation.
stopwords = read_stopwords_list()


def tokenize(text: str, stopwords: list) -> str:
    """Segment Chinese text with jieba and drop the given stopwords.

    Args:
        text (str): claim or wikipedia article
        stopwords (list): words to remove from the segmentation result

    Returns:
        str: the remaining word segments joined by single spaces
            (e.g. "我 喜歡 吃 蘋果")
    """
    # Function-level import kept from the original, which notes it is needed
    # on Windows because of pandarallel.
    import jieba

    kept_words = []
    for word in jieba.cut(text):
        if word in stopwords:
            continue
        kept_words.append(word)

    return " ".join(kept_words)


In [None]:

def get_pred_docs_sklearn(
        claim: str,
        tokenizing_method: callable,
        vectorizer: TfidfVectorizer,
        tf_idf_matrix: scipy.sparse.csr_matrix,
        wiki_pages: pd.DataFrame,
        topk: int,
) -> set:
    """Retrieve candidate wiki page titles for a claim with TF-IDF scoring.

    Steps:
      1. Score every page with the dot product between its TF-IDF row and the
         claim vector and keep the ``topk`` best pages.
      2. Keep a page when its title (or a variant of it) literally appears in
         the claim; the best-scoring page is always kept.
      3. Beyond the first ``save_at_least`` kept pages, cut the list at the first
         page whose cosine distance to the claim exceeds ``least_similarity``.

    ``save_at_least`` and ``least_similarity`` are module-level globals.

    Fixes over the previous version:
      * Step 3 now measures the distance of the page actually being examined.
        It used to index ``topk_sorted_indices`` with the position inside the
        filtered match list, i.e. it scored an unrelated page.
      * The best-scoring page is added once (it could be appended once per
        name part of a "·"-separated title, shifting the positions in step 3).
      * "Is this the top hit" is decided by rank instead of ``is`` identity
        on strings.

    Args:
        claim: Raw claim text.
        tokenizing_method: Callable turning text into a space-separated token string.
        vectorizer: Fitted TfidfVectorizer.
        tf_idf_matrix: TF-IDF matrix whose rows align positionally with ``wiki_pages``.
        wiki_pages: DataFrame with an ``"id"`` column holding page titles.
        topk: Number of best-scoring pages to consider.

    Returns:
        Set of retained page titles.
    """
    tokens = tokenizing_method(claim)
    claim_vector = vectorizer.transform([tokens])
    similarity_scores = linear_kernel(tf_idf_matrix, claim_vector).flatten()

    # Row positions of the `topk` highest-scoring pages, best first.
    topk_sorted_indices = np.argsort(similarity_scores)[::-1][:topk]

    # Page titles of those rows, in the same order.
    results = wiki_pages.iloc[topk_sorted_indices]["id"]

    claim_without_spaces = claim.replace(" ", "")

    # (rank within the top-k, title) for every retained page; the rank is kept
    # so the distance filter below can address the right TF-IDF row.
    matched = []
    for rank, result in enumerate(results):
        is_top_hit = rank == 0
        if (
                (result in claim)
                or (result in claim_without_spaces)  # E.g., MS DOS -> MSDOS
                or (result.replace("·", "") in claim)  # E.g., 湯姆·克魯斯 -> 湯姆克魯斯
                or (result.replace("-", "") in claim)  # E.g., X-SAMPA -> XSAMPA
        ):
            matched.append((rank, result))
        elif "·" in result:
            # E.g., 阿爾伯特·愛因斯坦 -> 愛因斯坦: any name part found in the claim counts.
            if is_top_hit or any(part in claim for part in result.split("·")):
                matched.append((rank, result))
        elif is_top_hit:
            matched.append((rank, result))

    # Exclude documents with "low" similarity, but always keep the first
    # `save_at_least` matches.
    if len(matched) > save_at_least:
        for i in range(save_at_least, len(matched)):
            row = topk_sorted_indices[matched[i][0]]
            distance = paired_cosine_distances(tf_idf_matrix[row], claim_vector)[0]
            if distance > least_similarity:
                matched = matched[:i]
                break

    return {title for _, title in matched}

# Helper function (you don't need to modify this)


In [None]:

def get_title_from_evidence(evidence):
    """Return the distinct page titles referenced by an evidence list.

    Every entry of an evidence set is read at index 2 (the page title). If an
    evidence set has exactly four items and its third item is ``None``, the
    function returns ``[None]`` immediately.

    Args:
        evidence: Iterable of evidence sets.

    Returns:
        List of unique titles, in no particular order, or ``[None]``.
    """
    unique_titles = set()
    for group in evidence:
        if len(group) == 4 and group[2] is None:
            return [None]
        unique_titles.update(entry[2] for entry in group)
    return list(unique_titles)



In [None]:

def save_results_to_markdown(results: dict, output_file="grid_search_results.md"):
    """Append one experiment's scores as a row of a markdown table.

    The title and table header are written only when ``output_file`` does not
    exist yet; later calls just append rows.

    Args:
        results: Mapping with the keys ``"exp_name"``, ``"f1_score"``,
            ``"precision"`` and ``"recall"`` (the scores are formatted with
            four decimals).
        output_file: Path of the markdown file to create or extend.

    Raises:
        KeyError: If one of the expected keys is missing from ``results``.
    """
    file_exists = Path(output_file).exists()

    # Explicit encoding (as save_doc does) so the file does not depend on the
    # platform's default locale encoding.
    with open(output_file, "a", encoding="utf8") as f:
        if not file_exists:
            f.write("# Grid Search Results\n\n")
            f.write("| Experiment  | F1 Score | Precision | Recall |\n")
            f.write("| ----------- | -------- | --------- | ------ | \n")

        exp_name = results["exp_name"]
        f1 = results["f1_score"]
        prec = results["precision"]
        recall = results["recall"]
        f.write(f"| {exp_name} | {f1:.4f} | {prec:.4f} | {recall:.4f} |\n")
    print(f"Results saved to {output_file}")



In [None]:
# Hyperparameters for the TF-IDF document retrieval

# Directory holding the wiki dump, read with jsonl_dir_to_df.
wiki_path = "data/wiki-pages"
# Pages whose processed text is not longer than this are dropped before fitting.
min_wiki_length = 10
# In the code visible here this value only appears in the experiment name.
num_of_samples = 500
# Number of best-scoring pages considered per claim.
topk = 50
# The following five values are forwarded to TfidfVectorizer.
min_df = 1
max_df = 0.8
use_idf = True
sublinear_tf = True
ngram_range = (1, 2)

# Set up the experiment name for logging; it encodes the settings above plus the
# `least_similarity` / `save_at_least` globals defined in an earlier cell.
exp_name = (
        f"len{min_wiki_length}__top{topk}__min_df={min_df}_"
        + f"max_df={max_df}__{num_of_samples}s__" + f"least_sim={least_similarity}__"
        + f"ngram_r={ngram_range}__" + f"save_at_least={save_at_least}"
)
# Prefix the name with the non-default TF-IDF switches that are active.
if sublinear_tf:
    exp_name = "sublinearTF_" + exp_name
if not use_idf:
    exp_name = "no_idf_" + exp_name


In [None]:
# The first run of this cell takes about 34 minutes on Google Colab; later runs
# load the tokenized pages from the pickle cache instead.

# Base name of the cache file and the wiki column that gets tokenized.
wiki_cache = "wiki"
target_column = "text"

wiki_cache_path = Path(f"data/{wiki_cache}.pkl")
if wiki_cache_path.exists():
    # NOTE: unpickling executes code from the file; only load caches you created yourself.
    wiki_pages = pd.read_pickle(wiki_cache_path)
else:
    # You need to download `wiki-pages.zip` from the AICUP website
    wiki_pages = jsonl_dir_to_df(wiki_path)
    # wiki_pages are combined into one dataframe, so we need to reset the index
    wiki_pages = wiki_pages.reset_index(drop=True)

    # tokenize the text (in parallel via pandarallel) and keep the result in a
    # new column `processed_text`
    wiki_pages["processed_text"] = wiki_pages[target_column].parallel_apply(
        partial(tokenize, stopwords=stopwords)
    )
    # save the result to a pickle file
    wiki_pages.to_pickle(wiki_cache_path, protocol=4)
# Build the TfidfVectorizer


In [None]:

# Build the TF-IDF vectorizer from the hyperparameters defined above.
vectorizer = TfidfVectorizer(
    max_df=max_df,
    min_df=min_df,
    use_idf=use_idf,
    ngram_range=ngram_range,
    sublinear_tf=sublinear_tf,
    # Accept single-character tokens as well (the pattern requires only one \w).
    token_pattern=r"(?u)\b\w+\b",
    # No row normalisation: downstream dot products are raw, unnormalised scores.
    norm=None,
)


In [None]:

# Drop pages whose processed text is too short. `.str.len()` measures the
# length of the `processed_text` string, not a token count.
# The index is not reset afterwards; later lookups use positional `.iloc`.
wiki_pages = wiki_pages[
    wiki_pages['processed_text'].str.len() > min_wiki_length
    ]
# One document string per remaining page, in DataFrame order.
corpus = wiki_pages["processed_text"].tolist()


In [None]:

# Start to encode the corpus with TF-IDF.
# Row i of X corresponds to the i-th row (by position) of `wiki_pages`, because
# `corpus` was built from that DataFrame in order.
X = vectorizer.fit_transform(corpus)

# fit_transform will do the following two steps:
# 1. fit: learn the vocabulary and idf from the corpus
# 2. transform: transform the corpus into a vector space
# Note the result is a sparse matrix, which contains lots of zeros for each row.


In [None]:

# Load the claim files with the project's `load_json` helper. The results are
# later passed to pd.DataFrame and iterated record by record.
TRAIN_DATA = load_json("data/public_train.jsonl")
TEST_DATA = load_json("data/all_test.jsonl")


In [None]:

train_df = pd.DataFrame(TRAIN_DATA)

# Perform the prediction for document retrieval: every claim is mapped to a set
# of page titles (with a tqdm progress bar via `progress_apply`).
train_df["predictions"] = train_df["claim"].progress_apply(
    partial(
        get_pred_docs_sklearn,
        tokenizing_method=partial(tokenize, stopwords=stopwords),
        vectorizer=vectorizer,
        tf_idf_matrix=X,
        wiki_pages=wiki_pages,
        topk=topk,
    )
)
# Report macro precision / recall of the retrieved pages on the training claims.
precision = calculate_precision(TRAIN_DATA, train_df["predictions"])
recall = calculate_recall(TRAIN_DATA, train_df["predictions"])


In [None]:

test_df = pd.DataFrame(TEST_DATA)

# Perform the prediction for document retrieval on the test claims, with the
# same settings as for the training claims. No metrics are computed here.
test_df["predictions"] = test_df["claim"].progress_apply(
    partial(
        get_pred_docs_sklearn,
        tokenizing_method=partial(tokenize, stopwords=stopwords),
        vectorizer=vectorizer,
        tf_idf_matrix=X,
        wiki_pages=wiki_pages,
        topk=topk,
    )
)

In [None]:
# Reuse the saved training predictions when the file already exists; otherwise
# write the freshly computed ones. Note that an existing file takes precedence
# over the predictions computed above.
doc_path = f"data/train_doc5.jsonl"
if Path(doc_path).exists():
    with open(doc_path, "r", encoding="utf8") as f:
        # One set of predicted page titles per line of the file.
        predicted_results = pd.Series([
            set(json.loads(line)["predicted_pages"])
            for line in f
        ])
else:
    # `num_pred_doc=5` only determines the "doc5" part of the file name.
    save_doc(TRAIN_DATA, train_df["predictions"], mode="train", num_pred_doc=5)


In [None]:
# Same load-or-save logic for the test predictions. `predicted_results` is
# overwritten here, replacing the value loaded for the training file.
doc_path = f"data/test_doc5.jsonl"
if Path(doc_path).exists():
    with open(doc_path, "r", encoding="utf8") as f:
        # One set of predicted page titles per line of the file.
        predicted_results = pd.Series([
            set(json.loads(line)["predicted_pages"])
            for line in f
        ])
else:
    # `num_pred_doc=5` only determines the "doc5" part of the file name.
    save_doc(TEST_DATA, test_df["predictions"], mode="test", num_pred_doc=5)

# Part 2

In [None]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset
from transformers import BertModel
from sklearn.model_selection import train_test_split
from utils import *
from transformers import BertTokenizer, BertTokenizerFast
from sklearn.metrics import precision_score, recall_score, f1_score
from tqdm.notebook import tqdm
import numpy as np
from sklearn.metrics import classification_report
import os
import warnings

warnings.filterwarnings('always')

In [None]:
def train_data_preprocessing(mapping, df, negative_sample_rate=0.1):
    """Build the (claim, sentence, label) training table for sentence retrieval.

    Three kinds of rows are produced, in this order:

    1. Positives (label 1): every gold evidence sentence of a claim that is not
       labelled "NOT ENOUGH INFO".
    2. Balanced negatives (label 0): other sentences of the claim's gold pages,
       at most as many as the positives just added for that claim.
    3. Sampled negatives (label 0): sentences of the claim's predicted pages
       (all claims, including "NOT ENOUGH INFO"), each kept with probability
       ``negative_sample_rate`` using the global NumPy random state.

    Empty sentences, the page "臺灣海峽危機#第二次臺灣海峽危機（1958）" and any
    (claim, page, sentence text) triple already emitted are skipped.

    Args:
        mapping: ``mapping[page][sentence_index_as_str] -> sentence text``.
        df: DataFrame with "label", "claim", "evidence" and "predicted_pages" columns.
        negative_sample_rate: Keep probability for stage 3 (default 0.1, the
            value that used to be hard-coded).

    Returns:
        DataFrame with the columns "claim", "per_sentences" (previous sentence
        or ""), "sentence", "pos_sentences" (next sentence or ""), "page", "label".
    """
    skipped_page = "臺灣海峽危機#第二次臺灣海峽危機（1958）"
    # Column name -> values; key order defines the column order of the result.
    columns = {
        "claim": [],
        "per_sentences": [],
        "sentence": [],
        "pos_sentences": [],
        "page": [],
        "label": [],
    }
    owned_evidence_set = set()

    def add_sample(claim, page, sent_idx, sentence, label):
        """Append one row with its neighbouring sentences and mark the triple as used."""
        page_sentences = mapping[page]
        prev_key = str(int(sent_idx) - 1)
        next_key = str(int(sent_idx) + 1)
        columns["claim"].append(claim)
        columns["page"].append(page)
        columns["per_sentences"].append(page_sentences[prev_key] if prev_key in page_sentences else "")
        columns["sentence"].append(sentence)
        columns["pos_sentences"].append(page_sentences[next_key] if next_key in page_sentences else "")
        columns["label"].append(label)
        owned_evidence_set.add((claim, page, sentence))

    for i in range(len(df)):
        if df["label"].iloc[i] == "NOT ENOUGH INFO":
            continue

        claim = df["claim"].iloc[i]
        # Gold pages that contributed at least one new positive, in first-seen order.
        searched_page = []
        related_count = 0

        # Stage 1: gold evidence sentences.
        for evidence_set in df["evidence"].iloc[i]:
            for evidence in evidence_set:
                page = evidence[2]
                if page == skipped_page:
                    continue
                page_sent_idx = str(evidence[3])
                sentence = mapping[page][page_sent_idx]
                if sentence == "" or (claim, page, sentence) in owned_evidence_set:
                    continue
                add_sample(claim, page, page_sent_idx, sentence, 1)
                related_count += 1
                if page not in searched_page:
                    searched_page.append(page)

        # Stage 2: as many non-evidence sentences from the gold pages as positives.
        not_related_count = 0
        for page in searched_page:
            if not_related_count >= related_count:
                break
            for page_contents_idx in mapping[page]:
                if not_related_count >= related_count:
                    break
                sentence = mapping[page][page_contents_idx]
                if sentence == "" or (claim, page, sentence) in owned_evidence_set:
                    continue
                add_sample(claim, page, page_contents_idx, sentence, 0)
                not_related_count += 1

    # Stage 3: randomly sampled sentences from the predicted pages.
    for i in range(len(df)):
        claim = df["claim"].iloc[i]
        for page in df["predicted_pages"].iloc[i]:
            if page == skipped_page:
                continue
            for page_contents_idx in mapping[page]:
                sentence = mapping[page][page_contents_idx]
                if sentence == "" or (claim, page, sentence) in owned_evidence_set:
                    continue
                # The random draw happens only for sentences that are still eligible.
                if np.random.rand(1) <= negative_sample_rate:
                    add_sample(claim, page, page_contents_idx, sentence, 0)

    return pd.DataFrame(columns)



In [None]:
def eval_data_preprocessing(mapping, df):
    """Build the full (claim, sentence, label) evaluation table.

    Rows are produced in this order:

    1. Label 1: every gold evidence sentence of a claim that is not labelled
       "NOT ENOUGH INFO".
    2. Label 0: every other sentence of the pages that supplied those positives.
    3. Label 0: every remaining sentence of each claim's predicted pages
       (all claims, including "NOT ENOUGH INFO").

    Empty sentences, the page "臺灣海峽危機#第二次臺灣海峽危機（1958）" and any
    (claim, page, sentence text) triple already emitted are skipped.

    Args:
        mapping: ``mapping[page][sentence_index_as_str] -> sentence text``.
        df: DataFrame with "label", "claim", "evidence" and "predicted_pages" columns.

    Returns:
        DataFrame with the columns "claim", "per_sentences" (previous sentence
        or ""), "sentence", "pos_sentences" (next sentence or ""), "page", "label".
    """
    excluded_page = "臺灣海峽危機#第二次臺灣海峽危機（1958）"
    out_claims, out_pages, out_labels = [], [], []
    out_prev, out_current, out_next = [], [], []
    emitted = set()

    def emit(claim_text, title, idx, text, target):
        """Record one row together with its neighbouring sentences."""
        lines = mapping[title]
        before = str(int(idx) - 1)
        after = str(int(idx) + 1)
        out_claims.append(claim_text)
        out_pages.append(title)
        out_prev.append(lines[before] if before in lines else "")
        out_current.append(text)
        out_next.append(lines[after] if after in lines else "")
        out_labels.append(target)
        emitted.add((claim_text, title, text))

    for row in range(len(df)):
        if df["label"].iloc[row] == "NOT ENOUGH INFO":
            continue

        claim_text = df["claim"].iloc[row]
        gold_pages = []

        # Gold evidence sentences.
        for group in df["evidence"].iloc[row]:
            for entry in group:
                title = entry[2]
                if title == excluded_page:
                    continue
                idx = str(entry[3])
                text = mapping[title][idx]
                if text == "":
                    continue
                if (claim_text, title, text) in emitted:
                    continue
                emit(claim_text, title, idx, text, 1)
                if title not in gold_pages:
                    gold_pages.append(title)

        # Remaining sentences of the gold pages.
        for title in gold_pages:
            for idx in mapping[title]:
                text = mapping[title][idx]
                if text == "":
                    continue
                if (claim_text, title, text) in emitted:
                    continue
                emit(claim_text, title, idx, text, 0)

    # Remaining sentences of the predicted pages.
    for row in range(len(df)):
        claim_text = df["claim"].iloc[row]
        for title in df["predicted_pages"].iloc[row]:
            if title == excluded_page:
                continue
            for idx in mapping[title]:
                text = mapping[title][idx]
                if text == "":
                    continue
                if (claim_text, title, text) in emitted:
                    continue
                emit(claim_text, title, idx, text, 0)

    return pd.DataFrame(
        {"claim": out_claims, "per_sentences": out_prev, "sentence": out_current,
         "pos_sentences": out_next, "page": out_pages, "label": out_labels})



In [None]:
# Hugging Face model id shared by the encoder, the tokenizer and the checkpoint path.
model_version = 'hfl/chinese-lert-large'


class BERTCustom(nn.Module):
    """BERT encoder with a dropout + linear head producing one logit per input.

    The output is a raw logit (no sigmoid); it is meant to be used with
    ``BCEWithLogitsLoss``.
    """

    def __init__(self):
        super(BERTCustom, self).__init__()
        self.bert_model = BertModel.from_pretrained(model_version, return_dict=True)
        self.dropout1 = nn.Dropout(0.3)
        # Take the head's input width from the encoder config instead of
        # hard-coding 1024, so changing `model_version` to a different model
        # size keeps the shapes consistent.
        self.fc = nn.Linear(self.bert_model.config.hidden_size, 1)

    def forward(self, input_ids, attention_mask, token_type_ids):
        """Return logits of shape (batch, 1) computed from the pooled encoder output."""
        encoded = self.bert_model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
        )
        pooled = self.dropout1(encoded.pooler_output)
        return self.fc(pooled)

In [None]:
class WikiDataset(Dataset):
    """Torch dataset turning preprocessed (claim, sentence) rows into model inputs.

    Args:
        df: DataFrame with the columns "claim", "per_sentences", "sentence",
            "pos_sentences", "page" and "label".
        tokenizer: Object exposing a Hugging Face style ``encode_plus`` method.
        max_length: Length every sequence is padded / truncated to
            (default 256, the value that used to be hard-coded).
    """

    def __init__(self, df, tokenizer, max_length=256):
        self.df = df
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.df)

    def __getitem__(self, index):
        """Return the tokenized input tensors and float target of row ``index``.

        Rows are addressed by position (``.iloc``), so the dataset also works
        when ``df`` has a filtered or shuffled index; the previous label-based
        lookup raised KeyError or returned the wrong row in that case.
        """
        claim = self.df['claim'].iloc[index]
        per_sentence = self.df['per_sentences'].iloc[index]
        sentence = self.df['sentence'].iloc[index]
        pos_sentence = self.df['pos_sentences'].iloc[index]
        page = self.df['page'].iloc[index]
        label = int(self.df['label'].iloc[index])

        # Single text: claim, page title, previous / current / next sentence,
        # separated by the literal "[SEP]" marker.
        full_input = claim + "[SEP]" + page + "[SEP]" + per_sentence + "[SEP]" + sentence + "[SEP]" + pos_sentence

        encoded = self.tokenizer.encode_plus(
            full_input,
            None,
            add_special_tokens=True,
            padding="max_length",
            max_length=self.max_length,
            truncation=True,
            return_token_type_ids=True,
            return_attention_mask=True,
            return_tensors="pt"
        )

        # encode_plus returns tensors with a leading batch axis; flatten to 1-D
        # so the DataLoader can stack the items.
        return {
            "input_ids": encoded["input_ids"].flatten(),
            "attention_mask": encoded["attention_mask"].flatten(),
            "token_type_ids": encoded["token_type_ids"].flatten(),
            "targets": torch.tensor(label, dtype=torch.float)
        }

In [None]:
# Reload the raw wiki dump and build the lookup used throughout Part 2:
# mapping[page][sentence_index_as_str] -> sentence text.
wiki_pages = jsonl_dir_to_df("data/wiki-pages")
mapping = generate_evidence_to_wiki_pages_mapping(wiki_pages)
# The DataFrame is no longer needed once the mapping exists; drop the reference.
del wiki_pages

In [None]:
# Seed for the train/dev split below.
SEED = 42

TRAIN_DATA = load_json("data/public_train.jsonl")
TEST_DATA = load_json("data/all_test.jsonl")
# Training claims enriched with "predicted_pages" (written by Part 1).
DOC_DATA = load_json("data/train_doc5.jsonl")

LABEL2ID: Dict[str, int] = {
    "supports": 0,
    "refutes": 1,
    "NOT ENOUGH INFO": 2,
}
ID2LABEL: Dict[int, str] = {v: k for k, v in LABEL2ID.items()}

# Stratification labels.
# NOTE(review): these come from TRAIN_DATA while DOC_DATA is what gets split;
# this assumes both files list the same claims in the same order -- confirm.
_y = [LABEL2ID[data["label"]] for data in TRAIN_DATA]
# GT means Ground Truth
# 90% / 10% stratified split of the claims with predicted pages.
TRAIN_GT, DEV_GT = train_test_split(
    DOC_DATA,
    test_size=0.1,
    random_state=SEED,
    shuffle=True,
    stratify=_y,
)

In [None]:
# Create the output directory for the Part 2 splits. `exist_ok=True` replaces
# the separate existence check, so re-running the cell is a no-op.
os.makedirs("data/Part2", exist_ok=True)

In [None]:
# Persist both splits as JSON Lines (one record per line, non-ASCII text kept as-is).
pd.DataFrame(TRAIN_GT).to_json("data/Part2/train_doc5.jsonl", orient='records', lines=True, force_ascii=False)
pd.DataFrame(DEV_GT).to_json("data/Part2/dev_doc5.jsonl", orient='records', lines=True, force_ascii=False)

In [None]:
# Build the sentence-level training table (includes random negative sampling).
train_preprocessed = train_data_preprocessing(mapping, pd.DataFrame(TRAIN_GT))

total_training_samples = len(train_preprocessed)
print(f"train_preprocessed length: {total_training_samples}")
# value_counts() is indexed by label value: 0 = not evidence, 1 = evidence.
# A KeyError here means one of the two classes is absent from the table.
class_0_samples = train_preprocessed["label"].value_counts()[0]
class_1_samples = train_preprocessed["label"].value_counts()[1]

# Inverse-frequency weights: total number of samples divided by the class count.
weight_for_class_0 = total_training_samples / class_0_samples
weight_for_class_1 = total_training_samples / class_1_samples

print(weight_for_class_0)
print(weight_for_class_1)

print(train_preprocessed["label"].value_counts())

In [None]:
# Build the sentence-level dev table; unlike the training table it keeps every
# candidate sentence (no balancing, no random sampling).
eval_val_preprocessed = eval_data_preprocessing(mapping, pd.DataFrame(DEV_GT))

print(f"eval_val_preprocessed length: {len(eval_val_preprocessed)}")
print(eval_val_preprocessed["label"].value_counts())

In [None]:
# Tokenizer matching the encoder checkpoint named by `model_version`.
tokenizer = BertTokenizerFast.from_pretrained(model_version)

In [None]:
# Sanity check: inspect one training row (index label 100, so the table must
# contain that label) and the tokenizer output for it.
claim = train_preprocessed['claim'][100]
sentence = train_preprocessed['sentence'][100]
per_sentence = train_preprocessed['per_sentences'][100]
pos_sentence = train_preprocessed['pos_sentences'][100]
page = train_preprocessed['page'][100]
label = train_preprocessed['label'][100]
label = int(label)

print(claim)
print(page)
print(sentence)
print(label)
print(torch.tensor(label, dtype=torch.float))

# NOTE(review): this preview omits the previous / next sentences that
# WikiDataset puts into the model input, so it does not show the exact
# training input -- confirm whether that is intended.
full_input = claim + "[SEP]" + page + "[SEP]" + sentence

# NOTE: `input` shadows the Python builtin for the rest of the notebook session.
input = tokenizer.encode_plus(
    full_input,
    None,
    add_special_tokens=True,
    padding="max_length",
    max_length=256,
    truncation=True,
    return_token_type_ids=True,
    return_attention_mask=True,
    return_tensors="pt"
)

print(input)

In [None]:
# Wrap the preprocessed tables as torch datasets.
train_dataset = WikiDataset(train_preprocessed, tokenizer)
full_eval_dataset = WikiDataset(eval_val_preprocessed, tokenizer)

# Training batches are reshuffled every epoch.
train_dataloader = torch.utils.data.DataLoader(
    train_dataset,
    shuffle=True,
    batch_size=32
)

# Evaluation keeps the table order and uses a larger batch size.
eval_val_dataloader = torch.utils.data.DataLoader(
    full_eval_dataset,
    batch_size=256
)

In [None]:
# Use the GPU when available. `device` is read as a global by the training loop.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

model = BERTCustom()
model.to(device)

# Weight applied to the positive class in the loss.
# NOTE(review): this is total / positives; the ratio usually suggested for
# `pos_weight` is negatives / positives -- confirm the choice is deliberate.
pos_weight = torch.tensor([weight_for_class_1]).to(device)
loss_fn = nn.BCEWithLogitsLoss(pos_weight=pos_weight)

optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5, weight_decay=5e-4)
# Divides the learning rate by 10 when the monitored value (the mean evaluation
# loss passed in by the training loop) has not improved for more than 2 epochs.
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.1, patience=2)

In [None]:
def train_model(n_epochs, training_loader, eval_val_dataloader, model, optimizer, loss_fn, scheduler,
                threshold=0.7):
    """Train ``model`` and keep the checkpoint with the best evaluation F1.

    Every epoch runs one pass over ``training_loader`` (with optimisation) and
    one pass over ``eval_val_dataloader`` (without gradients), prints a
    classification report plus loss / precision / recall / F1 for both, saves
    ``val_best.pt`` when the evaluation F1 improves, and steps ``scheduler``
    with the mean evaluation loss.

    Relies on the module-level globals ``device`` and ``model_version``.

    Changes over the previous version: the progress bars now wrap the loaders
    that are actually iterated (they used to wrap a separate, never consumed
    ``enumerate`` iterator and were advanced by hand), the duplicated batch and
    reporting code is shared, the decision threshold is a parameter, and the
    checkpoint directory is created with ``exist_ok=True``.

    Args:
        n_epochs: Number of epochs.
        training_loader: DataLoader yielding dicts with "input_ids",
            "attention_mask", "token_type_ids" and "targets".
        eval_val_dataloader: DataLoader with the same batch layout.
        model: Module returning one logit per sample.
        optimizer: Optimizer over the model parameters.
        loss_fn: Loss taking (logits, float targets of shape (batch, 1)).
        scheduler: Scheduler whose ``step`` takes the mean evaluation loss.
        threshold: Probability above which a sample is predicted positive
            (default 0.7, the value that used to be hard-coded).

    Returns:
        The model in its state after the last epoch (not necessarily the best
        checkpoint).
    """
    checkpoint_dir = f"checkpoints/sent_retrieval/{model_version.replace('/', '_')}"

    def run_batch(batch):
        """Move a batch to ``device``; return (logits, loss, targets of shape (batch, 1))."""
        input_ids = batch['input_ids'].to(device, dtype=torch.long)
        attention_mask = batch['attention_mask'].to(device, dtype=torch.long)
        token_type_ids = batch['token_type_ids'].to(device, dtype=torch.long)
        targets = batch['targets'].to(device, dtype=torch.long).reshape(-1, 1).float()
        outputs = model(input_ids, attention_mask, token_type_ids)
        return outputs, loss_fn(outputs, targets), targets

    def binarize(outputs):
        """Turn logits into a flat numpy array of 0. / 1. predictions."""
        probs = torch.sigmoid(outputs.detach())
        return (probs > threshold).float().view(-1).cpu().numpy()

    def report(phase, epoch, losses, all_targets, all_preds):
        """Print the metrics of one pass and return (mean loss, f1)."""
        print(classification_report(all_targets, all_preds, zero_division=0))
        precision = precision_score(all_targets, all_preds, zero_division=0)
        recall = recall_score(all_targets, all_preds, zero_division=0)
        f1 = f1_score(all_targets, all_preds, zero_division=0)
        mean_loss = sum(losses) / len(losses)
        print(
            f"Epoch {epoch} {phase} loss {mean_loss:.4f} precision {precision:.4f} recall {recall:.4f} f1 {f1:.4f}")
        return mean_loss, f1

    f1_record = 0
    for epoch in range(n_epochs):
        train_loss, all_train_preds, all_train_targets = [], [], []
        eval_loss, all_eval_preds, all_eval_targets = [], [], []

        model.train()
        for batch in tqdm(training_loader, total=len(training_loader), desc=f'Epoch Training {epoch}'):
            outputs, loss, targets = run_batch(batch)
            optimizer.zero_grad()
            train_loss.append(loss.detach().cpu().item())
            loss.backward()
            optimizer.step()

            all_train_preds.extend(binarize(outputs))
            all_train_targets.extend(targets.view(-1).cpu().numpy())

        report("Training", epoch, train_loss, all_train_targets, all_train_preds)

        model.eval()
        with torch.no_grad():
            for batch in tqdm(eval_val_dataloader, total=len(eval_val_dataloader), desc=f'Epoch Evaling {epoch}'):
                outputs, loss, targets = run_batch(batch)
                eval_loss.append(loss.detach().cpu().item())

                all_eval_preds.extend(binarize(outputs))
                all_eval_targets.extend(targets.view(-1).cpu().numpy())

        mean_eval_loss, f1 = report("Evaling", epoch, eval_loss, all_eval_targets, all_eval_preds)

        # Keep only the weights with the best evaluation F1 seen so far.
        if f1 > f1_record:
            print(f"Record: {f1_record}")
            f1_record = f1
            print(f"Best f1 now: {f1_record}")
            os.makedirs(checkpoint_dir, exist_ok=True)
            torch.save(model.state_dict(), f"{checkpoint_dir}/val_best.pt")

        scheduler.step(mean_eval_loss)

    return model

In [None]:
# Train for 10 epochs. `trained` is the model after the last epoch; the weights
# with the best evaluation F1 are saved separately by train_model.
trained = train_model(10, train_dataloader, eval_val_dataloader, model, optimizer, loss_fn, scheduler)

In [None]:
def largest_five_values(lst):
    """Return the five largest values of ``lst`` together with their positions.

    Args:
        lst: Iterable of mutually comparable values.

    Returns:
        List of at most five ``(index, value)`` tuples ordered from the largest
        value down; equal values keep their original relative order.
    """
    ranked = [(position, value) for position, value in enumerate(lst)]
    # In-place stable sort on the value, largest first.
    ranked.sort(key=lambda pair: pair[1], reverse=True)
    return ranked[:5]

In [None]:
def predict_all(json_file, model_file, mapping, tokenizer, device):
    """Pick the five most probable evidence sentences for every claim in ``json_file``.

    For each claim, every non-empty sentence of each predicted page is scored
    by the sentence-retrieval model, using the previous and next sentence of
    the page as context. The five highest-scoring ``[page, sentence_index]``
    pairs become that claim's ``predicted_evidence``. A claim without any
    candidate sentence gets an empty list.

    Args:
        json_file: Path of a JSONL file whose records contain ``claim`` and
            ``predicted_pages``.
        model_file: Path of the ``BERTCustom`` state dict to load.
        mapping: Nested dict used as ``mapping[page][str(sentence_index)]``
            to obtain the sentence text.
        tokenizer: Tokenizer providing ``encode_plus``.
        device: Device used for the model and the input tensors.

    Returns:
        pd.DataFrame: The loaded records with a ``predicted_evidence`` column
        added. The same frame is also written as JSON lines to ``json_file``
        with ``sent5`` inserted before the file extension.
    """
    data = pd.DataFrame(load_json(json_file))
    data_length = len(data)
    predicted_evidences = []

    part2_model = BERTCustom()
    part2_model.to(device)
    # map_location lets a checkpoint saved on one device be restored on another
    # (e.g. trained on GPU, predicted on CPU).
    part2_model.load_state_dict(torch.load(model_file, map_location=device))
    part2_model.eval()

    progress_bar = tqdm(total=data_length, desc='Predicting')
    # Inference only: no gradients are needed anywhere in the loop.
    with torch.no_grad():
        for i in range(data_length):
            holding_evidence = []
            holding_probs = []

            predicted_pages = data["predicted_pages"][i]
            claim = data["claim"][i]

            for page in predicted_pages:
                # A predicted page missing from the wiki mapping contributes no candidates
                # instead of aborting the whole run with a KeyError.
                page_sentences = mapping.get(page, {})
                for page_idx, sentence in page_sentences.items():
                    if sentence == "":
                        continue

                    # Neighbouring sentences (empty when absent) give the model local context.
                    per_sentence = page_sentences.get(str(int(page_idx) - 1), "")
                    pos_sentence = page_sentences.get(str(int(page_idx) + 1), "")

                    full_input = claim + "[SEP]" + page + "[SEP]" + per_sentence + "[SEP]" + sentence + "[SEP]" + pos_sentence

                    encoded = tokenizer.encode_plus(
                        full_input,
                        None,
                        add_special_tokens=True,
                        padding="max_length",
                        max_length=256,
                        truncation=True,
                        return_token_type_ids=True,
                        return_attention_mask=True,
                        return_tensors="pt"
                    )

                    input_ids = encoded['input_ids'].to(device, dtype=torch.long)
                    attention_mask = encoded['attention_mask'].to(device, dtype=torch.long)
                    token_type_ids = encoded['token_type_ids'].to(device, dtype=torch.long)
                    outputs = part2_model(input_ids, attention_mask, token_type_ids)

                    holding_evidence.append([page, int(page_idx)])
                    holding_probs.append(torch.sigmoid(outputs).item())

            # largest_five_values yields (position, probability) pairs; with no candidates
            # it yields nothing, so the claim simply gets an empty evidence list.
            predicted_evidences.append(
                [holding_evidence[position] for position, _ in largest_five_values(holding_probs)]
            )

            progress_bar.update()

    progress_bar.close()

    data["predicted_evidence"] = predicted_evidences

    # Insert "sent5" before the extension only, so dots elsewhere in the path
    # (e.g. "./data/x.jsonl") do not corrupt the output name.
    root, extension = os.path.splitext(json_file)
    file_name = f"{root}sent5{extension}"

    data.to_json(file_name, orient='records', lines=True, force_ascii=False)
    return data



In [None]:
# Select evidence sentences for the training split; predict_all writes the result to
# data/Part2/train_doc5sent5.jsonl.
predict_all("data/Part2/train_doc5.jsonl", "checkpoints/sent_retrieval/hfl_chinese-lert-large/val_best.pt", mapping,
            tokenizer,
            device)

In [None]:
# Select evidence sentences for the dev split; predict_all writes the result to
# data/Part2/dev_doc5sent5.jsonl.
predict_all("data/Part2/dev_doc5.jsonl", "checkpoints/sent_retrieval/hfl_chinese-lert-large/val_best.pt", mapping,
            tokenizer,
            device)

In [None]:
# Select evidence sentences for the test split; predict_all writes the result to
# data/test_doc5sent5.jsonl.
predict_all("data/test_doc5.jsonl", "checkpoints/sent_retrieval/hfl_chinese-lert-large/val_best.pt", mapping,
            tokenizer,
            device)

notebook3
## PART 3. Claim verification

Import the required libraries.

In [None]:
import pickle
from pathlib import Path
from typing import Dict, Tuple

import numpy as np
import pandas as pd
from pandarallel import pandarallel
from tqdm.auto import tqdm

import torch
from sklearn.metrics import accuracy_score
from torch.optim import AdamW
from torch.utils.data import DataLoader
from torch.utils.tensorboard import SummaryWriter
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    get_scheduler,
)

from dataset import BERTDataset

from utils import (
    generate_evidence_to_wiki_pages_mapping,
    jsonl_dir_to_df,
    load_json,
    load_model,
    save_checkpoint,
    set_lr_scheduler,
)

pandarallel.initialize(progress_bar=True, verbose=0, nb_workers=4)

Global variables

In [None]:
# Verdict label string -> integer class id used as the classification target.
LABEL2ID: Dict[str, int] = {
    "supports": 0,
    "refutes": 1,
    "NOT ENOUGH INFO": 2,
}
# Inverse lookup: integer class id -> verdict label string.
ID2LABEL: Dict[int, str] = {v: k for k, v in LABEL2ID.items()}

# Records read from the "doc5sent5" JSONL files (the file names predict_all writes in Part 2).
TRAIN_DATA = load_json("data/Part2/train_doc5sent5.jsonl")
DEV_DATA = load_json("data/Part2/dev_doc5sent5.jsonl")

# Pickle caches for the joined claim/evidence DataFrames built in Step 2.
TRAIN_PKL_FILE = Path("data/Part2/train_doc5sent5.pkl")
DEV_PKL_FILE = Path("data/Part2/dev_doc5sent5.pkl")

Preload wiki database (same as part 2.)

In [None]:
# Load the wiki pages and build the lookup that later cells use as
# mapping[page_title][str(sentence_index)] to get a sentence's text.
wiki_pages = jsonl_dir_to_df("data/wiki-pages")
mapping = generate_evidence_to_wiki_pages_mapping(wiki_pages,)
# Only the mapping is used from here on, so drop the DataFrame reference.
del wiki_pages

### Helper function

AICUP dataset with top-k evidence sentences.

In [None]:
class AicupTopkEvidenceBERTDataset(BERTDataset):
    """AICUP dataset with top-k evidence sentences.

    Each item is the claim followed by its evidence sentences, joined with
    `` [SEP] `` and tokenized to ``self.max_length`` tokens.

    NOTE(review): ``self.data``, ``self.tokenizer``, ``self.max_length`` and
    ``self.topk`` are read here but not set in this class; presumably
    ``BERTDataset`` provides them — confirm.

    NOTE: the claim is now fed as a single segment. Earlier code unpacked the
    claim string character by character, so checkpoints trained on that input
    format should be retrained.
    """

    def __getitem__(
        self,
        idx: int,
        **kwargs,
    ) -> Dict[str, torch.Tensor]:
        """Return the tokenized claim/evidence pair for row ``idx``.

        Returns:
            Dict[str, torch.Tensor]: One tensor per tokenizer output field,
            plus ``labels`` when the row has a ``label`` entry.
        """
        item = self.data.iloc[idx]
        claim = item["claim"]
        # Copy first: padding must not extend the list stored inside the DataFrame.
        evidence = list(item["evidence_list"])

        # In case there are less than topk evidence sentences
        evidence += ["[PAD]"] * (self.topk - len(evidence))
        # The claim is one segment; unpacking it with ``*claim`` would split the
        # string into single characters separated by [SEP].
        concat_claim_evidence = " [SEP] ".join([claim, *evidence])

        concat = self.tokenizer(
            concat_claim_evidence,
            padding="max_length",
            max_length=self.max_length,
            truncation=True,
        )

        concat_ten = {k: torch.tensor(v) for k, v in concat.items()}

        # Rows without a label (e.g. the test split) yield inputs only.
        if "label" in item:
            concat_ten["labels"] = torch.tensor(LABEL2ID[item["label"]])

        return concat_ten

Evaluation function

In [None]:
def run_evaluation(model: torch.nn.Module, dataloader: DataLoader, device):
    """Evaluate ``model`` on ``dataloader`` and report mean loss and accuracy.

    Args:
        model: Classifier whose forward pass returns an object exposing
            ``loss`` and ``logits``.
        dataloader: Yields dict batches of tensors that include ``"labels"``.
        device: Device every batch tensor is moved to before the forward pass.

    Returns:
        dict: ``{"val_loss": summed batch loss / number of batches,
        "val_acc": accuracy over all examples}``.

    Note:
        The model is switched to eval mode and is not switched back.
    """
    model.eval()

    total_loss = 0
    gold_labels, predicted_labels = [], []
    with torch.no_grad():
        for raw_batch in tqdm(dataloader):
            # Collect the targets before the tensors are moved to the device.
            gold_labels += raw_batch["labels"].tolist()

            on_device = {name: tensor.to(device) for name, tensor in raw_batch.items()}
            result = model(**on_device)
            total_loss += result.loss.item()
            predicted_labels += result.logits.argmax(dim=1).tolist()

    accuracy = accuracy_score(gold_labels, predicted_labels)

    return {"val_loss": total_loss / len(dataloader), "val_acc": accuracy}

Prediction

In [None]:
def run_predict(model: torch.nn.Module, test_dl: DataLoader, device) -> list:
    """Predict a class id for every example in ``test_dl``.

    Args:
        model: Classifier whose forward pass returns an object exposing ``logits``.
        test_dl: Yields dict batches of tensors accepted by ``model(**batch)``.
        device: Device every batch tensor is moved to before the forward pass.

    Returns:
        list: Predicted class ids (argmax over the logits), in dataloader order.
    """
    model.eval()

    preds = []
    # Inference only: disable autograd so no graph is built and kept in memory,
    # matching what run_evaluation does.
    with torch.no_grad():
        for batch in tqdm(test_dl,
                          total=len(test_dl),
                          leave=False,
                          desc="Predicting"):
            batch = {k: v.to(device) for k, v in batch.items()}
            pred = model(**batch).logits
            pred = torch.argmax(pred, dim=1)
            preds.extend(pred.tolist())
    return preds

### Main function

In [None]:
def join_with_topk_evidence(
    df: pd.DataFrame,
    mapping: dict,
    mode: str = "train",
    topk: int = 5,
) -> pd.DataFrame:
    """Attach the text of the top-k predicted evidence sentences to ``df``.

    ``df`` is modified in place and also returned:

    * ``evidence`` (when present) is normalised to three levels of nesting,
      i.e. a list of evidence sets, each a list of annotation lists.
    * ``evidence_list`` is added: for every ``(page, sentence_index)`` pair in
      ``predicted_evidence`` the sentence text is looked up in ``mapping``
      (``""`` when unknown) and the first ``topk`` texts are kept. Rows whose
      ``predicted_evidence`` is not a list get an empty list.

    Args:
        df (pd.DataFrame): Dataset with a ``predicted_evidence`` column and,
            optionally, an ``evidence`` column.
        mapping (dict): Used as ``mapping[page][str(sentence_index)]`` to get
            the sentence text.
        mode (str, optional): Only used in the progress message.
            Defaults to "train".
        topk (int, optional): Number of evidence sentences kept per row.
            Defaults to 5.

    Returns:
        pd.DataFrame: The same ``df`` with ``evidence_list`` (List[str]) added.
    """

    def _nest_evidence(raw):
        # Promote a single annotation or a single evidence set to a list of sets.
        if not isinstance(raw[0], list):
            return [[raw]]
        if not isinstance(raw[0][0], list):
            return [raw]
        return raw

    def _lookup_sentences(pairs):
        # Non-list cells (e.g. missing predictions) yield no evidence.
        if not isinstance(pairs, list):
            return []
        sentences = [
            mapping.get(page, {}).get(str(sentence_idx), "")
            for page, sentence_idx in pairs
        ]
        return sentences[:topk]

    if "evidence" in df.columns:
        df["evidence"] = df["evidence"].map(_nest_evidence)

    print(f"Extracting evidence_list for the {mode} mode ...")
    df["evidence_list"] = df["predicted_evidence"].map(_lookup_sentences)

    return df

### Step 1. Setup training environment

Hyperparams

In [None]:
#@title  { display-mode: "form" }

# Hyperparameters for the claim-verification model.
#   MODEL_NAME       - pretrained checkpoint used for both tokenizer and classifier
#   TRAIN/TEST_BATCH_SIZE - batch sizes of the training and evaluation/test dataloaders
#   LR               - learning rate passed to AdamW
#   NUM_EPOCHS       - number of passes over the training dataloader
#   MAX_SEQ_LEN      - max_length handed to the datasets for tokenization
#   EVIDENCE_TOPK    - number of evidence sentences kept per claim when joining
#   VALIDATION_STEP  - validate and save a checkpoint every this many training steps

# Alternative checkpoints that can be substituted for MODEL_NAME:
# MODEL_NAME = "hfl/chinese-roberta-wwm-ext"
# MODEL_NAME = "hfl/chinese-xlnet-mid"
# MODEL_NAME = "hfl/chinese-lert-base"
# MODEL_NAME = "hfl/chinese-lert-large"
# MODEL_NAME = "hfl/chinese-pert-base"
# MODEL_NAME = "hfl/chinese-macbert-base"
# MODEL_NAME = "uer/sbert-base-chinese-nli"
# MODEL_NAME = "bert-base-chinese"

MODEL_NAME = "hfl/chinese-lert-large"

TRAIN_BATCH_SIZE = 32  #@param {type:"integer"}
TEST_BATCH_SIZE = 32  #@param {type:"integer"}
SEED = 42  #@param {type:"integer"}
LR = 2e-5  #@param {type:"number"}
NUM_EPOCHS = 20  #@param {type:"integer"}
MAX_SEQ_LEN = 256  #@param {type:"integer"}
EVIDENCE_TOPK = 3  #@param {type:"integer"}
VALIDATION_STEP = 50  #@param {type:"integer"}


Experiment Directory

In [None]:
# File that receives the predictions written in Step 4.
OUTPUT_FILENAME = "submission.jsonl"

# One experiment folder per hyperparameter combination, mirrored under logs/ and checkpoints/.
EXP_DIR = f"claim_verification/e{NUM_EPOCHS}_bs{TRAIN_BATCH_SIZE}_" + f"{LR}_top{EVIDENCE_TOPK}"
LOG_DIR = "logs/" + EXP_DIR
CKPT_DIR = "checkpoints/" + EXP_DIR

# Create whichever of the two folders does not exist yet (parents included).
for exp_path in map(Path, (LOG_DIR, CKPT_DIR)):
    if not exp_path.exists():
        exp_path.mkdir(parents=True)

### Step 2. Concat claim and evidences
Join each claim with the text of its top-k predicted evidence sentences.

In [None]:
def _load_or_build_topk_df(pkl_file, raw_data, **join_kwargs):
    """Return the DataFrame cached at ``pkl_file``; build and cache it when absent.

    NOTE(review): the cache file name does not depend on EVIDENCE_TOPK, so a
    cache written with a different top-k is reused as-is — delete the pickle
    after changing that hyperparameter.
    """
    if pkl_file.exists():
        # pickle runs arbitrary code on load; only cache files written by this
        # notebook should ever be placed at this path.
        with open(pkl_file, "rb") as f:
            return pickle.load(f)

    joined_df = join_with_topk_evidence(
        pd.DataFrame(raw_data),
        mapping,
        topk=EVIDENCE_TOPK,
        **join_kwargs,
    )
    joined_df.to_pickle(pkl_file, protocol=4)
    return joined_df


train_df = _load_or_build_topk_df(TRAIN_PKL_FILE, TRAIN_DATA)
dev_df = _load_or_build_topk_df(DEV_PKL_FILE, DEV_DATA, mode="eval")

### Step 3. Training

Prevent CUDA out of memory

In [None]:
# Ask PyTorch to release cached, unused GPU memory before the model is created.
torch.cuda.empty_cache()

In [None]:
# Tokenizer belonging to the pretrained checkpoint named by MODEL_NAME.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# NOTE(review): topk is not passed to the datasets although __getitem__ pads to
# self.topk — presumably BERTDataset supplies it; confirm it agrees with EVIDENCE_TOPK.
train_dataset = AicupTopkEvidenceBERTDataset(
    train_df,
    tokenizer=tokenizer,
    max_length=MAX_SEQ_LEN,
)
val_dataset = AicupTopkEvidenceBERTDataset(
    dev_df,
    tokenizer=tokenizer,
    max_length=MAX_SEQ_LEN,
)

# Only the training data is shuffled; validation keeps the dataset order.
train_dataloader = DataLoader(
    train_dataset,
    shuffle=True,
    batch_size=TRAIN_BATCH_SIZE,
)
eval_dataloader = DataLoader(val_dataset, batch_size=TEST_BATCH_SIZE)

In [None]:
train_df_count = pd.DataFrame(TRAIN_DATA)

train_df_count_samples = len(train_df_count)
print(f"train_preprocessed length: {train_df_count_samples}")

# Count every label once, then pick the three classes in class-id order.
label_counts = train_df_count["label"].value_counts()
class_0, class_1, class_2 = (
    label_counts["supports"],
    label_counts["refutes"],
    label_counts["NOT ENOUGH INFO"],
)

total_samples = class_0 + class_1 + class_2

# Relative frequency of each class among the three counted classes.
frequency_0, frequency_1, frequency_2 = (
    class_0 / total_samples,
    class_1 / total_samples,
    class_2 / total_samples,
)

# Inverse-frequency weights: rarer classes weigh more ...
weight_0, weight_1, weight_2 = 1 / frequency_0, 1 / frequency_1, 1 / frequency_2

# ... normalised so that the three weights sum to one.
total_weight = weight_0 + weight_1 + weight_2
weight_0, weight_1, weight_2 = (
    weight_0 / total_weight,
    weight_1 / total_weight,
    weight_2 / total_weight,
)

In [None]:
# Apply the SEED hyperparameter so that the classification-head initialisation and the
# shuffling of the training dataloader are repeatable between runs.
np.random.seed(SEED)
torch.manual_seed(SEED)

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device(
    "cpu")
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=len(LABEL2ID),
)
model.to(device)

# Class-weighted cross entropy; the weights follow the class-id order 0, 1, 2.
class_weights = torch.tensor([weight_0, weight_1, weight_2]).float().to(device)
loss_fct = torch.nn.CrossEntropyLoss(weight=class_weights)
optimizer = AdamW(model.parameters(), lr=LR)
# One scheduler step is taken per training batch, hence epochs * batches-per-epoch.
num_training_steps = NUM_EPOCHS * len(train_dataloader)
lr_scheduler = set_lr_scheduler(optimizer, num_training_steps)

# TensorBoard writer for the training loss and validation metrics.
writer = SummaryWriter(LOG_DIR)

Training (30 mins)

In [None]:
progress_bar = tqdm(range(num_training_steps))
current_steps = 0

# Running sum of validation accuracies and the number of validation rounds.
average_acc = 0.0
number = 0


for epoch in range(NUM_EPOCHS):
    model.train()

    print(epoch)

    for batch in train_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        # Train on the class-weighted loss rather than the loss the model computes itself.
        logits = outputs.logits
        loss = loss_fct(logits.view(-1, logits.shape[-1]), batch['labels'].view(-1))
        # Without this call no gradients exist and optimizer.step() updates nothing.
        loss.backward()

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)
        writer.add_scalar("training_loss", loss.item(), current_steps)

        current_steps += 1

        if current_steps % VALIDATION_STEP == 0:
            number += 1

            val_results = run_evaluation(model, eval_dataloader, device)

            # log each metric separately to TensorBoard
            for metric_name, metric_value in val_results.items():
                print(f"{metric_name}: {metric_value}")
                if metric_name == "val_acc":
                    average_acc += metric_value
                writer.add_scalar(f"{metric_name}", metric_value, current_steps)

            save_checkpoint(
                model,
                CKPT_DIR,
                current_steps,
                mark=f"val_acc={val_results['val_acc']:.4f}",
            )

            # run_evaluation leaves the model in eval mode; switch back so the
            # remaining batches of this epoch are trained with dropout enabled.
            model.train()


print("Finished training!")

### Step 4. Make your submission

Prediction

In [None]:
# Records read from the test "doc5sent5" JSONL file, and the pickle that stores the joined frame.
TEST_DATA = load_json("data/test_doc5sent5.jsonl")
TEST_PKL_FILE = Path("data/test_doc5sent5.pkl")

# Unlike the train/dev frames, the test frame is rebuilt (and its pickle overwritten) on every run.
test_df = join_with_topk_evidence(
    pd.DataFrame(TEST_DATA),
    mapping,
    mode="eval",
    topk=EVIDENCE_TOPK,
)
test_df.to_pickle(TEST_PKL_FILE, protocol=4)

# Reuses the tokenizer and sequence length of the training datasets.
test_dataset = AicupTopkEvidenceBERTDataset(
    test_df,
    tokenizer=tokenizer,
    max_length=MAX_SEQ_LEN,
)
test_dataloader = DataLoader(test_dataset, batch_size=TEST_BATCH_SIZE)

In [None]:
# CKPT_DIR = "checkpoints/" + f"claim_verification/e{NUM_EPOCHS}_bs{TRAIN_BATCH_SIZE}_" + f"{LR}_{EVIDENCE_TOPK}"
# NOTE(review): this replaces the experiment-specific CKPT_DIR that the training loop saves
# into, so the checkpoint named below must already be present in this folder — confirm.
CKPT_DIR = "checkpoints/claim_verification/"
ckpt_name = "val_acc=0.7350_model.3050"  #@param {type:"string"}
# Presumably restores the named checkpoint's weights into the model — confirm against utils.load_model.
model = load_model(model, ckpt_name, CKPT_DIR)
# Class ids predicted for the test split, in dataloader order.
predicted_label = run_predict(model, test_dataloader, device)

Write files

In [None]:
predict_dataset = test_df.copy()

# Keep at most five evidence entries per claim. Assigning the whole column avoids
# chained-indexing assignment (df[col][i] = ...), which pandas does not guarantee to
# write through and which is a no-op under Copy-on-Write.
predict_dataset["predicted_evidence"] = predict_dataset["predicted_evidence"].map(
    lambda evidence: evidence[:5]
)

# Translate predicted class ids back into label strings.
predict_dataset["predicted_label"] = list(map(ID2LABEL.get, predicted_label))
predict_dataset[["id", "predicted_label", "predicted_evidence"]].to_json(
    OUTPUT_FILENAME,
    orient="records",
    lines=True,
    force_ascii=False,
)

Make the submission file (records sorted by `id`).

In [None]:
# Read the predictions written by the previous cell.
data = load_json(OUTPUT_FILENAME)
df = pd.DataFrame(data)

# Sort the records by the id column.
df = df.sort_values("id")
print(df)


# Write the sorted records back to the submission file. The previous target was the
# literal string "OUTPUT_NAME", which produced a file with that placeholder name.
df.to_json(
    OUTPUT_FILENAME,
    orient="records",
    lines=True,
    force_ascii=False,
)