In [1]:
!pip install torch transformers==4.27.1 datasets==2.9.0 accelerate==0.17.1 evaluate==0.4.0


Collecting transformers==4.27.1
  Downloading transformers-4.27.1-py3-none-any.whl (6.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.7/6.7 MB[0m [31m11.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting datasets==2.9.0
  Downloading datasets-2.9.0-py3-none-any.whl (462 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m462.8/462.8 kB[0m [31m17.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting accelerate==0.17.1
  Downloading accelerate-0.17.1-py3-none-any.whl (212 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m212.8/212.8 kB[0m [31m21.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting evaluate==0.4.0
  Downloading evaluate-0.4.0-py3-none-any.whl (81 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.4/81.4 kB[0m [31m9.9 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers==4.27.1)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x

In [2]:
# Mount Google Drive at /content/drive so files stored there are readable from this runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
import json
import os
import re
import random
from random import randrange, sample
import numpy as np
from tqdm import tqdm
from nltk.tokenize import sent_tokenize
from sklearn.metrics import accuracy_score

In [4]:
import torch
from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence

from datasets import Dataset, DatasetDict
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer

!pip install jsonlines

import jsonlines

Collecting jsonlines
  Downloading jsonlines-4.0.0-py3-none-any.whl (8.7 kB)
Installing collected packages: jsonlines
Successfully installed jsonlines-4.0.0


In [5]:
def read_jsonfile(path: str):
    """Load every object from the JSON Lines file at ``path``.

    Args:
        path: Location of the file to open with ``jsonlines``.

    Returns:
        A list with each object yielded by the reader, in file order.
    """
    with jsonlines.open(path) as reader:
        return list(reader)

In [6]:
def delete_key(json_data, key_to_delete):
    """Remove the given keys from every record, modifying the records in place.

    Args:
        json_data: Iterable of dict records.
        key_to_delete: Iterable of keys to drop; keys absent from a record are skipped.

    Returns:
        The same ``json_data`` object that was passed in.
    """
    for record in json_data:
        for unwanted in key_to_delete:
            if unwanted in record:
                del record[unwanted]
    return json_data

In [8]:
def transToDict(data):
    """Convert a list of question records into a column-oriented dict.

    Only the five fields below are emitted. Any other key present on a record
    (for example ``sentences_containing_the_numeral_in_answer_options``) is
    ignored, so every returned column has exactly ``len(data)`` entries. The
    input records are not modified, and an empty ``data`` yields empty columns.

    Args:
        data: Iterable of dicts, each providing ``news_article`` (an iterable of
            strings, joined with spaces), ``question_stem``, ``answer_options``,
            ``target_num`` and ``ans``.

    Returns:
        Dict mapping each column name to a list of per-record values. Everything
        is converted to ``str``; ``answer_options`` becomes a list of strings.

    Raises:
        KeyError: If a record lacks one of the five required fields.
    """
    columns = ('news_article', 'question_stem', 'answer_options', 'target_num', 'ans')
    data_dic = {column: [] for column in columns}

    for item in data:
        data_dic['news_article'].append(' '.join(item['news_article']).strip())
        # The blank in the question is rewritten to the [Num] marker used by the prompt.
        data_dic['question_stem'].append(item['question_stem'].replace("___", '[Num]').strip())
        data_dic['answer_options'].append([str(option) for option in item['answer_options']])
        data_dic['target_num'].append(str(item['target_num']).strip())
        data_dic['ans'].append(str(item['ans']).strip())

    return data_dic

In [9]:
class Template:
    """Container for the prompt and label format strings.

    ``templates`` starts empty; call ``load_template`` to populate it.
    """

    def __init__(self):
        self.templates = {}

    def load_template(self):
        """Populate ``self.templates`` with the input and label format strings.

        ``input_template_ch`` is the task instruction followed by the article,
        question and four options. It keeps the ``{news_article}``,
        ``{question}`` and ``{option1}``..``{option4}`` placeholders so the
        caller can fill them in with ``str.format``.

        ``label_template`` has the placeholders ``{ans}`` and ``{num}``.
        """
        # Instruction (zh): "Based on the news, choose the correct option for
        # [Num] in the following question". It contains no braces, so it is
        # safe to concatenate in front of a format string.
        instruction_ch = "根据新闻，为以下问题中的[Num]选择正确选项"
        question_ch = "新闻: {news_article}\n问题: {question}\n选项:\nA {option1}\nB {option2}\nC {option3}\nD {option4}"

        # Concatenate rather than .format(): formatting here with only some of
        # the fields raises KeyError for the option placeholders and would
        # consume {news_article} before the real article can be substituted.
        input_template_ch = instruction_ch + "\n" + question_ch

        label_template = "{ans} {num}"

        self.templates['input_template_ch'] = input_template_ch
        self.templates['label_template'] = label_template

In [10]:
from sklearn.metrics import accuracy_score, f1_score
import numpy as np


def calculate_scores(eval_pred, tokenizer):
    """Compute option/number accuracy and F1 scores from token-id predictions.

    Each decoded text is split into two parts: its first non-blank character
    (the option letter) and its last whitespace-separated token (the number).
    An empty decoded text yields an empty string for both, so it counts as a
    miss instead of raising ``IndexError``.

    Args:
        eval_pred: Pair ``(predictions, labels)`` of token-id arrays. If
            ``predictions`` is a tuple, its first element is used.
        tokenizer: Object providing ``pad_token_id`` and ``batch_decode``.

    Returns:
        Dict with ``num_acc``, ``opt_acc``, ``accuracy`` (0.7 * num_acc +
        0.3 * opt_acc), ``macro_f1`` and ``micro_f1``. The F1 scores are
        computed on the number tokens. All values are fractions in [0, 1].
    """
    predictions, labels = eval_pred
    if isinstance(predictions, tuple):
        predictions = predictions[0]

    # -100 is the ignore index used for padded label positions; it is not a valid
    # token id, so swap it for the pad token before decoding (predictions included).
    pad_id = tokenizer.pad_token_id
    predictions = np.where(predictions != -100, predictions, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    d_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    d_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    def last_token(text):
        parts = text.split()
        return parts[-1] if parts else ''

    def first_char(text):
        # Slicing (not indexing) so an empty string gives '' rather than IndexError.
        return text.strip()[:1]

    pred_nums = [last_token(pred) for pred in d_preds]
    label_nums = [last_token(label) for label in d_labels]
    pred_opts = [first_char(pred) for pred in d_preds]
    label_opts = [first_char(label) for label in d_labels]

    # Computing numerical and option accuracy
    num_accuracy = np.mean([1 if pred == label else 0 for pred, label in zip(pred_nums, label_nums)])
    opt_accuracy = accuracy_score(pred_opts, label_opts)

    # Weighted blend of the two accuracies
    general_accuracy = 0.7 * num_accuracy + 0.3 * opt_accuracy

    # F1 over the predicted numbers, each distinct number string being a class
    macro_f1 = f1_score(label_nums, pred_nums, average='macro')
    micro_f1 = f1_score(label_nums, pred_nums, average='micro')

    return {
        'num_acc': num_accuracy,
        'opt_acc': opt_accuracy,
        'accuracy': general_accuracy,
        'macro_f1': macro_f1,
        'micro_f1': micro_f1,
    }

In [11]:
def train_model(tokenizer, tokenized_dataset):
    """Fine-tune ``t5-small`` on the ``train`` split, evaluating on ``val`` every epoch.

    Checkpoints are written under ``./model`` once per epoch. Evaluation uses
    generation and reports the metrics returned by ``calculate_scores``.

    Args:
        tokenizer: Tokenizer handed to the data collator and the metric function.
        tokenized_dataset: Mapping with tokenized ``"train"`` and ``"val"`` splits.

    Returns:
        None. The function is run for its training side effects.
    """
    seq2seq_model = AutoModelForSeq2SeqLM.from_pretrained('t5-small')

    # Label padding uses -100, which the metric function replaces before decoding.
    collator = DataCollatorForSeq2Seq(
        tokenizer,
        model=seq2seq_model,
        padding='max_length',
        pad_to_multiple_of=8,
        label_pad_token_id=-100,
    )

    hyperparameters = dict(
        output_dir='./model',
        seed=42,
        # batching
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        gradient_accumulation_steps=10,
        # optimisation
        learning_rate=1e-4,
        num_train_epochs=3,
        warmup_ratio=0.1,
        weight_decay=0.01,
        fp16=False,
        # evaluation / checkpointing
        predict_with_generate=True,
        evaluation_strategy='epoch',
        save_strategy='epoch',
    )
    run_args = Seq2SeqTrainingArguments(**hyperparameters)

    def score(eval_pred):
        return calculate_scores(eval_pred, tokenizer)

    seq2seq_trainer = Seq2SeqTrainer(
        model=seq2seq_model,
        args=run_args,
        data_collator=collator,
        train_dataset=tokenized_dataset["train"],
        eval_dataset=tokenized_dataset["val"],
        compute_metrics=score,
    )

    # Release cached GPU memory before the training loop starts.
    torch.cuda.empty_cache()

    seq2seq_trainer.train()


In [12]:
def preprocessing(sample, tokenizer, template):
    """Turn a batch of raw columns into tokenized model inputs with labels.

    Args:
        sample: Batch mapping with the columns ``news_article``, ``question_stem``,
            ``answer_options``, ``ans`` and ``target_num`` (parallel lists).
        tokenizer: Callable tokenizer; called once on the prompts (max length 512)
            and once with ``text_target`` on the labels (max length 64).
        template: Object whose ``templates`` dict holds ``input_template_ch`` and
            ``label_template`` format strings.

    Returns:
        The tokenizer output for the prompts, with the label ``input_ids`` stored
        under ``"labels"``.
    """
    prompt_format = template.templates['input_template_ch']
    target_format = template.templates['label_template']

    articles = sample["news_article"]
    stems = sample["question_stem"]
    option_lists = sample["answer_options"]
    answer_indices = sample["ans"]
    numbers = sample["target_num"]

    # One prompt per example, built from the article, the question and its four options.
    prompts = [
        prompt_format.format(
            news_article=article.strip(),
            question=stem.strip(),
            option1=str(choices[0]),
            option2=str(choices[1]),
            option3=str(choices[2]),
            option4=str(choices[3]),
        )
        for article, stem, choices in zip(articles, stems, option_lists)
    ]
    model_inputs = tokenizer(prompts, truncation=True, max_length=512)

    # Answer indices arrive as strings and are written out as option letters.
    index_to_letter = {'0': 'A', '1': 'B', '2': 'C', '3': 'D'}
    targets = [
        target_format.format(ans=index_to_letter[answer], num=number)
        for answer, number in zip(answer_indices, numbers)
    ]
    encoded_targets = tokenizer(text_target=targets, truncation=True, max_length=64)

    model_inputs["labels"] = encoded_targets["input_ids"]
    return model_inputs


In [20]:

import os
import json
import torch
import numpy as np
from torch.utils.data import DataLoader
from sklearn.metrics import accuracy_score, f1_score
from tqdm import tqdm

def get_predictions(model, tokenized_dataset, tokenizer, batch_size=4, max_new_tokens=128, device='cuda'):
    """Generate predictions for every example in ``tokenized_dataset``.

    Args:
        model: Model exposing ``to``, ``eval`` and ``generate``.
        tokenized_dataset: Indexable collection of examples, each providing
            ``input_ids`` and ``attention_mask`` sequences.
        tokenizer: Object providing ``pad_token_id`` and ``batch_decode``.
        batch_size: Number of examples per generation batch.
        max_new_tokens: Upper bound on the number of tokens generated per example.
        device: Device the model and the batches are moved to.

    Returns:
        Tuple ``(preds_opt, preds_num, preds_out)`` with one entry per example:
        the option index (0-3 for a leading A-D, otherwise -1), the last
        space-separated token of the decoded text, and the decoded text itself.
    """
    def collate_function(batch):
        # Convert each example's ids and mask to tensors, then pad to the batch maximum.
        input_ids_list = [torch.tensor(example['input_ids']) for example in batch]
        attention_mask_list = [torch.tensor(example['attention_mask']) for example in batch]

        padded_input_ids = pad_sequence(input_ids_list, batch_first=True, padding_value=tokenizer.pad_token_id)
        # Padded positions must be masked out with 0, whatever the pad token id is.
        padded_attention_mask = pad_sequence(attention_mask_list, batch_first=True, padding_value=0)

        return padded_input_ids, padded_attention_mask

    dataloader = DataLoader(tokenized_dataset, batch_size=batch_size, collate_fn=collate_function)

    model.to(device)
    model.eval()

    option_mapping = {'A': 0, 'B': 1, 'C': 2, 'D': 3}
    preds_opt, preds_num, preds_out = [], [], []

    for inputs, attention_mask in tqdm(dataloader):
        inputs = inputs.to(device)
        attention_mask = attention_mask.to(device)

        output_ids = model.generate(input_ids=inputs, attention_mask=attention_mask, max_new_tokens=max_new_tokens)
        decode_pred_ans = tokenizer.batch_decode(output_ids, skip_special_tokens=True)

        for decode_pred in decode_pred_ans:
            preds_num.append(decode_pred.split(" ")[-1].strip())

            # Slice instead of indexing so an empty generation maps to -1 rather than raising.
            starting_char = decode_pred[:1]
            preds_opt.append(option_mapping.get(starting_char, -1))

            preds_out.append(decode_pred)

    # Return only after every batch has been processed.
    return preds_opt, preds_num, preds_out

def _plot_option_diagnostics(labels_opt, preds_opt):
    """Show confusion-matrix, precision-recall, ROC and per-sample charts for option predictions."""
    # Imported locally so the plotting dependencies are only required when this runs.
    import matplotlib.pyplot as plt
    import seaborn as sns
    from sklearn.metrics import auc, confusion_matrix, precision_recall_curve, roc_curve

    class_ids = [0, 1, 2, 3]
    class_names = ['A', 'B', 'C', 'D']

    # Arrays are required so that `== i` compares element-wise; on plain lists the
    # comparison collapses to a single False.
    labels_arr = np.asarray(labels_opt)
    preds_arr = np.asarray(preds_opt)

    # Restrict the matrix to the four real options so it lines up with the tick
    # labels; predictions that could not be parsed (-1) are left out of it.
    cm = confusion_matrix(labels_arr, preds_arr, labels=class_ids)

    plt.figure(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=class_names, yticklabels=class_names)
    plt.xlabel('Predicted Label')
    plt.ylabel('True Label')
    plt.title('Confusion Matrix')
    plt.show()

    # One-vs-rest precision-recall curve per class, from hard (0/1) predictions
    plt.figure(figsize=(8, 6))
    for i in class_ids:
        precision, recall, _ = precision_recall_curve((labels_arr == i).astype(int), (preds_arr == i).astype(int))
        plt.plot(recall, precision, label=f'Class {i}')
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.title('Precision-Recall Curve')
    plt.legend()
    plt.show()

    # One-vs-rest ROC curve and area per class
    plt.figure(figsize=(8, 6))
    for i in class_ids:
        fpr, tpr, _ = roc_curve((labels_arr == i).astype(int), (preds_arr == i).astype(int))
        plt.plot(fpr, tpr, label=f'Class {i} (AUC = {auc(fpr, tpr):0.2f})')
    plt.plot([0, 1], [0, 1], color='gray', linestyle='--')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('ROC Curve')
    plt.legend()
    plt.show()

    # Bar chart overlaying predictions on the ground truth, one bar per sample
    plt.figure(figsize=(10, 6))
    plt.bar(range(len(labels_opt)), labels_opt, color='blue', alpha=0.5, label='Ground Truth')
    plt.bar(range(len(preds_opt)), preds_opt, color='red', alpha=0.5, label='Predictions')
    plt.xlabel('Sample Index')
    plt.ylabel('Class')
    plt.title('Prediction vs. Ground Truth')
    plt.legend()
    plt.show()


def _save_predictions(dataset_test, preds_opt, preds_num, preds_out, output_dir="./output", output_file_name="predictions.json"):
    """Write every test sample together with its predictions to a JSON file and return its path."""
    save_ = []
    for sample, pred_opt, pred_num, pred_out in zip(dataset_test, preds_opt, preds_num, preds_out):
        save_.append({
            "news_article": sample["news_article"],
            "question_stem": sample["question_stem"],
            "ans": sample['ans'],
            "target_num": sample['target_num'],
            "pred_opt": pred_opt,
            "pred_num": pred_num,
            "pred_out": pred_out,
        })

    # Create the output directory if it does not exist yet
    os.makedirs(output_dir, exist_ok=True)
    json_file_path = os.path.join(output_dir, output_file_name)

    with open(json_file_path, "w", encoding="utf-8") as json_file:
        json.dump(save_, json_file, ensure_ascii=False)

    return json_file_path


def predict_init(tokenizer=None, tokenized_dataset=None, dataset_test=None, model_path="model/checkpoint-1299"):
    """Evaluate a saved checkpoint on the test set: print metrics, plot, and save predictions.

    All reported metrics are percentages (0-100). The general accuracy is
    ``0.7 * numerical accuracy + 0.3 * option accuracy`` with both terms on
    that same scale.

    Args:
        tokenizer: Tokenizer passed to ``get_predictions``.
        tokenized_dataset: Tokenized test examples, aligned one-to-one with ``dataset_test``.
        dataset_test: Iterable of raw test samples providing ``news_article``,
            ``question_stem``, ``ans`` and ``target_num``.
        model_path: Directory of the checkpoint to load.

    Returns:
        Dict with ``micro_f1``, ``macro_f1``, ``option_accuracy``,
        ``numerical_accuracy`` and ``general_accuracy``.

    Raises:
        ValueError: If ``dataset_test`` is empty or the number of predictions
            does not match the number of test samples.
    """
    labels_opt = [int(sample["ans"]) for sample in dataset_test]
    labels_num = [sample["target_num"] for sample in dataset_test]
    if not labels_opt:
        raise ValueError("dataset_test is empty; nothing to evaluate")

    model = AutoModelForSeq2SeqLM.from_pretrained(model_path)

    preds_opt, preds_num, preds_out = get_predictions(model=model, tokenized_dataset=tokenized_dataset, tokenizer=tokenizer, batch_size=30, max_new_tokens=25, device='cuda')

    if len(preds_opt) != len(labels_opt):
        raise ValueError(f"got {len(preds_opt)} predictions for {len(labels_opt)} test samples")

    # F1 and accuracy over the option index (an unparseable prediction is class -1)
    micro_f1 = round(f1_score(labels_opt, preds_opt, average='micro') * 100, 4)
    macro_f1 = round(f1_score(labels_opt, preds_opt, average='macro') * 100, 4)
    option_accuracy = round(accuracy_score(y_true=labels_opt, y_pred=preds_opt) * 100, 4)

    # Numerical accuracy: exact string match between predicted and target number
    count_equal_ans = sum(1 for pred, label in zip(preds_num, labels_num) if pred == label)
    numerical_acc = round(count_equal_ans / len(labels_num) * 100, 4)

    # Both inputs are percentages, so the weighted blend is a percentage as well.
    general_accuracy = round(0.7 * numerical_acc + 0.3 * option_accuracy, 4)

    print(f"micro_f1: {micro_f1}")
    print(f"macro_f1: {macro_f1}")
    print(f"option accuracy: {option_accuracy}")
    print(f"numerical accuracy: {numerical_acc}")
    print(f"general accuracy: {general_accuracy}")

    _plot_option_diagnostics(labels_opt, preds_opt)

    # Printing some examples of misclassifications
    misclassified_samples = [(i, labels_opt[i], preds_opt[i]) for i in range(len(labels_opt)) if labels_opt[i] != preds_opt[i]]
    print("Misclassified Samples:")
    for sample in misclassified_samples[:5]:
        print(f"Sample Index: {sample[0]}, Ground Truth: {sample[1]}, Predicted: {sample[2]}")

    _save_predictions(dataset_test, preds_opt, preds_num, preds_out)

    return {
        'micro_f1': micro_f1,
        'macro_f1': macro_f1,
        'option_accuracy': option_accuracy,
        'numerical_accuracy': numerical_acc,
        'general_accuracy': general_accuracy,
    }

In [14]:
# Seed the torch and numpy RNGs; Python's `random` is seeded just before the shuffle below.
torch.manual_seed(42)
np.random.seed(42)
template_ = Template()
template_.load_template()

input_template = template_.templates['input_template_ch']
label_template = template_.templates['label_template']
tokenizer = AutoTokenizer.from_pretrained('t5-small')


data_train_pth = '/content/drive/MyDrive/NQuAD_train.json'
data_test_pth = '/content/drive/MyDrive/NQuAD_test.json'

# read_jsonfile returns one entry per line of the file; the first entry is used as
# the list of training records (presumably the file is a single JSON array).
dataset_train = read_jsonfile(data_train_pth)[0]
data_split = int(len(dataset_train) * 0.7)
random.seed(42)
random.shuffle(dataset_train)

# 70 % of the shuffled records are used for training, the rest for validation.
dataset_val = dataset_train[data_split:]
dataset_train = dataset_train[:data_split]

# transToDict is the record-to-columns converter defined in this notebook.
dataset_train = Dataset.from_dict(transToDict(dataset_train))
dataset_val = Dataset.from_dict(transToDict(dataset_val))

datasets = DatasetDict()
datasets['train'] = dataset_train
datasets['val'] = dataset_val

# Tokenize both splits and drop the raw text columns, which the trainer cannot consume.
tokenized_train_dataset = datasets.map(
    lambda x: preprocessing(x, tokenizer, template_),
    batched=True,
    remove_columns=['news_article', 'question_stem', 'answer_options', 'ans', 'target_num']
)
train_model(tokenizer, tokenized_train_dataset)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

  0%|          | 0/52 [00:00<?, ?ba/s]

  0%|          | 0/6 [00:00<?, ?ba/s]

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Numerical accuracy,Optional accuracy,General accuracy,Macro F1,Micro F1
0,1.1111,0.548399,51.7229,0.540433,36.36816,0.519761,0.540433
1,0.5733,0.471601,60.6926,0.623896,42.671989,0.610412,0.623896
2,0.5144,0.45534,61.4719,0.633766,43.22046,0.620321,0.633766


In [None]:

# Load the test records and convert them to columns the same way as the training data.
dataset_test = read_jsonfile(data_test_pth)[0]
dataset_test = Dataset.from_dict(transToDict(dataset_test))
datasets['test'] = dataset_test
# Tokenize only the test split: predict_init expects a single dataset aligned with
# dataset_test, not the whole DatasetDict.
tokenized_test_dataset = dataset_test.map(lambda x: preprocessing(x, tokenizer, template_), batched=True, remove_columns=['news_article', 'question_stem', 'answer_options', 'ans', 'target_num'])
predict_init(tokenizer, tokenized_test_dataset, dataset_test)


import shutil

import zipfile
import os

# Archive name and the folder its contents are unpacked into.
zip_file_path = 'model.zip'
extract_dir = 'extracted_model'

# Pack the contents of the `model` directory into model.zip in the working directory.
shutil.make_archive(base_name='model', format='zip', root_dir='model')

# Unpack the archive again into `extract_dir`, creating the folder when needed.
os.makedirs(extract_dir, exist_ok=True)
with zipfile.ZipFile(zip_file_path, mode='r') as zip_ref:
    zip_ref.extractall(path=extract_dir)

print("Model folder extracted successfully to:", extract_dir)