In [None]:
!pip install transformers
!pip install evaluate
!pip install rouge_score

Collecting evaluate
  Downloading evaluate-0.4.1-py3-none-any.whl (84 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: evaluate
Successfully installed evaluate-0.4.1


In [None]:
import torch
import torch.nn as nn
from torch.nn import functional as nnf
from torch.utils.data import Dataset, DataLoader
import numpy as np
from transformers import GPT2Tokenizer, GPT2LMHeadModel, AdamW, get_linear_schedule_with_warmup
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, T5ForConditionalGeneration
from tqdm import tqdm
import os
import sys
import json
import pandas as pd

import random
import evaluate
from rich.table import Column, Table
from rich import box
from rich.console import Console

# Device
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f'Running on device: {device}')

# Clustering Method

## Model and Tokenizer Setup

In [3]:
tokenizer = AutoTokenizer.from_pretrained('VietAI/vit5-base-vietnews-summarization')
model = AutoModelForSeq2SeqLM.from_pretrained('VietAI/vit5-base-vietnews-summarization')

## Dataset setup

In [4]:
class MyDataset(Dataset):

    def __init__(
        self, tokenizer, cluster_list, source_len=1024, target_len=500,
        source_dir='/kaggle/input/vims-feature/12_clustering_summary.json', 
        target_dir='/kaggle/input/vims-feature/summary_data.json'
    ):
        self.tokenizer = tokenizer
        self.source_len = source_len
        self.target_len = target_len
        self.cluster_list = cluster_list
        with open(source_dir, 'r') as f:
            source_data = json.load(f)
        self.source_data = {k: source_data[k] for k in cluster_list}
        with open(target_dir, 'r') as f:
            target_data = json.load(f)
        self.target_data = {k: target_data[k] for k in cluster_list}

    def __len__(self):
        """returns the length of dataframe"""
        return len(self.cluster_list)

    def __getitem__(self, index):
        """return the input ids, attention masks and target ids"""
        
        ## Source text
        cluster = self.cluster_list[index]
        source_text = self.source_data[cluster]
        target_text = self.target_data[cluster]
        
        ## Tokenize
        source = self.tokenizer.batch_encode_plus(
            [source_text],
            max_length=self.source_len,
            pad_to_max_length=True,
            truncation=True,
            padding="max_length",
            return_tensors="pt",
        )

        target = self.tokenizer.batch_encode_plus(
            [target_text],
            max_length=self.target_len,
            pad_to_max_length=True,
            truncation=True,
            padding="max_length",
            return_tensors="pt",
        )

        source_ids = source["input_ids"].squeeze()
        source_mask = source["attention_mask"].squeeze()
        target_ids = target["input_ids"].squeeze()
        target_mask = target["attention_mask"].squeeze()

        return {
            "source_txt": source_text,
            "target_txt": target_text,
            "source_ids": source_ids.to(dtype=torch.long),
            "source_mask": source_mask.to(dtype=torch.long),
            "target_ids": target_ids.to(dtype=torch.long),
            "target_mask": target_mask.to(dtype=torch.long),
        }

In [5]:
dataset = MyDataset(tokenizer, cluster_list = os.listdir('/kaggle/input/vims-feature/original_feature/original_feature/10_cluster')[:240])
for i, output in enumerate(dataset):
    print("Source:")
    print("Source article: \n", output['source_txt'])
    print("Source input ids length: \n", len(output['source_ids']))
    print("Source input ids: \n", output['source_ids'])
    print("Source attention mask: \n", output['source_mask'])
    print("\n")
    print("Target:\n")
    print("Target text: \n", output['target_txt'])
    print("Target input ids length: \n", len(output['target_ids']))
    print("Target attention mask: \n", output['target_mask'])
    if i >= 0:
        break

Source:
Source article: 
 Ông Lộc cũng cho biết, thời gian vừa qua ông có viết đơn tố cáo cán bộ UBND xã Thanh Tường vi phạm trong công tác quản lý, sử dụng tài chính liên quan đến kinh phí xây dựng trường tiểu học của xã. Hồi giữa tháng 3, ông Lộc làm đơn tố cáo UBND xã Thanh Tường sai phạm về tài chính trong việc xây dựng trường tiểu học. Tôi vào bàn ngồi, thấy anh ta đi vào, tôi nghĩ vào trả tiền. Ông Lộc lấy gói bim bim giao cho khách xong ngồi xuống ghế. Khoảng hơn 1 tiếng sau, như đã hẹn, hai người này ghé quán của ông để mua hàng là một gói bim bim. Theo lời ông Lộc, ông không rõ động cơ, mục đích của hai người lạ mặt ấy khi hành động như thế. Trong lúc ông Lộc đang loay hoay mở cửa quán, anh Nguyễn Thế Định (trú cùng xóm) chạy sang thông báo là có hai người đi xe máy đến hỏi. Chiều ngày 25-5, Công an huyện Thanh Chương đã xuống bệnh viện, nơi ông Lộc đang nằm điều trị để tiếp xúc, lấy lời khai của nạn nhân để điều tra làm rõ vụ việc. Quá bất ngờ tôi không kịp phản ứng, ngón cái

## Training Setup

In [6]:
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = True

# define a rich console logger
console = Console(record=True)

# to display dataframe in ASCII format
def display_df(df):
    """display dataframe in ASCII format"""

    console = Console()
    table = Table(
        Column("source_text", justify="center"),
        Column("target_text", justify="center"),
        title="Sample Data",
        pad_edge=False,
        box=box.ASCII,
    )

    for i, row in enumerate(df.values.tolist()):
        table.add_row(row[0], row[1])

    console.print(table)

# training logger to log training progress
training_logger = Table()

def resetTable():
    global training_logger

    training_logger = Table(
    Column("Epoch", justify="center"),
    Column("Steps", justify="center"),
    Column("Loss", justify="center"),
    title="Training Status",
    pad_edge=False,
    box=box.ASCII,
)
resetTable()

In [7]:
def train(epoch, tokenizer, model, device, loader, optimizer):

    """
    Function to be called for training with the parameters passed from main function

    """

    model.train()
    for _, data in enumerate(loader, 0):
        y = data["target_ids"].to(device, dtype=torch.long)
        y_ids = y[:, :-1].contiguous()
        lm_labels = y[:, 1:].clone().detach()
        lm_labels[y[:, 1:] == tokenizer.pad_token_id] = -100
        ids = data["source_ids"].to(device, dtype=torch.long)
        mask = data["source_mask"].to(device, dtype=torch.long)

        outputs = model(
            input_ids=ids,
            attention_mask=mask,
            decoder_input_ids=y_ids,
            labels=lm_labels,
        )
        loss = outputs[0]

        if _ % 20 == 0:
            training_logger.add_row(str(epoch), str(_), str(loss))

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    
    console.print(training_logger)
    resetTable()

In [8]:
def validate(epoch, tokenizer, model, device, loader):
    """
    Function to evaluate model for predictions
    """
    model.eval()
    
    predictions = []
    actuals = []
    with torch.no_grad():
        for _, data in enumerate(loader, 0):
            y = data['target_ids'].to(device, dtype = torch.long)
            ids = data['source_ids'].to(device, dtype = torch.long)
            mask = data['source_mask'].to(device, dtype = torch.long)
        
            generated_ids = model.generate(
                input_ids = ids,
                attention_mask = mask, 
                max_length=256,
                num_beams=5,
                repetition_penalty=2.5,
                length_penalty=1.0,
                early_stopping=True
            )
            
            preds = [tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=True) for g in generated_ids]
            target = [tokenizer.decode(t, skip_special_tokens=True, clean_up_tokenization_spaces=True)for t in y]
            
            if _%20==0:
                console.print(f'Completed {_}')
            
            predictions.extend(preds)
            actuals.extend(target)

    # print ROUGE score
    rouge = evaluate.load('rouge')
    results = rouge.compute(predictions=predictions,
                            references=actuals)
    console.print("ROUGE: ", results)
    
    return predictions, actuals

In [9]:
CFG = {
    'seed': 719,
    'model_arch': "VietAI/vit5-base-vietnews-summarization",
    'epochs': 3,
    'train_bs': 2,
    'valid_bs': 2,
    'lr': 1e-4,
}

In [10]:
def T5Trainer(output_dir="/kaggle/working/"):

    """
    T5 trainer
    """
    # Set random seeds and deterministic pytorch for reproducibility
    seed_everything(CFG['seed'])

    # logging
    console.log(f"""[Model]: Loading {CFG["model_arch"]}...\n""")

    # tokenzier for encoding the text
    tokenizer = AutoTokenizer.from_pretrained(CFG['model_arch'])

    # Defining the model
    model = AutoModelForSeq2SeqLM.from_pretrained(CFG['model_arch'])
    model = model.to(device)

    # logging
    console.log(f"[Data]: Reading data...\n")

    # Creation of Dataset and Dataloader
    global val_dataset
    
    cluster_list = os.listdir('/kaggle/input/vims-feature/original_feature/original_feature/10_cluster')
    train_cluster = cluster_list[:240]
    val_cluster = cluster_list[240:270]
    test_cluster = cluster_list[270:]

    console.print(f"FULL Dataset: {len(cluster_list)}")
    console.print(f"TRAIN Dataset: {len(train_cluster)}")
    console.print(f"TEST Dataset: {len(val_cluster)}\n")

    # Creating the Training and Validation dataset for further creation of Dataloader
    training_set = MyDataset(
        tokenizer,
        train_cluster
    )
    val_set = MyDataset(
        tokenizer,
        val_cluster
    )

    # Defining the parameters for creation of dataloaders
    train_params = {
        "batch_size": CFG["train_bs"],
        "shuffle": True,
        "num_workers": 0,
    }

    val_params = {
        "batch_size": CFG["valid_bs"],
        "shuffle": False,
        "num_workers": 0,
    }

    # Creation of Dataloaders for testing and validation. This will be used down for training and validation stage for the model.
    training_loader = DataLoader(training_set, **train_params)
    val_loader = DataLoader(val_set, **val_params)

    # Defining the optimizer that will be used to tune the weights of the network in the training session.
    optimizer = torch.optim.Adam(
        params=model.parameters(), lr=CFG["lr"]
    )

    # Training loop
    console.log(f"[Initiating Fine Tuning]...\n")

    for epoch in range(CFG["epochs"]):
        train(epoch, tokenizer, model, device, training_loader, optimizer)

    console.log(f"[Saving Model]...\n")
    # Saving the model after training
    path = os.path.join(output_dir, "model_files")
    model.save_pretrained(path)
    tokenizer.save_pretrained(path)

    # evaluating test dataset
    global final_df
    console.log(f"[Initiating Validation]...\n")
    predictions, actuals = validate(0, tokenizer, model, device, val_loader)
    final_df = pd.DataFrame({"Generated Text": predictions, "Actual Text": actuals})
    final_df.to_csv(os.path.join(output_dir, "predictions.csv"))

    console.save_text(os.path.join(output_dir, "logs.txt"))

    console.log(f"[Validation Completed.]\n")
    console.print(
        f"""[Model] Model saved @ {os.path.join(output_dir, "model_files")}\n"""
    )
    console.print(
        f"""[Validation] Generation on Validation data saved @ {os.path.join(output_dir,'predictions.csv')}\n"""
    )
    console.print(f"""[Logs] Logs saved @ {os.path.join(output_dir,'logs.txt')}\n""")

## Train

In [11]:
T5Trainer()

## Show result

In [12]:
rd_idx=5
## Actual
print("Actual: \n")
print(final_df['Actual Text'][rd_idx])
print()
## Prediction
print("Prediction: \n")
print(final_df['Generated Text'][rd_idx])

Actual: 

Hãng tin AFP dẫn nguồn tin từ giới chức Nhật Bản cho biết, cậu bé Yamato Tanooka, 7 tuổi, bị mất tích sau chuyến đi chơi cùng gia đình trong một khu rừng đầy gấu ở phía bắc Nhật Bản đã được tìm thấy hôm nay 3/6. Khi được tìm thấy, cậu bé đang trong tình trạng khá tốt và không bị thương. Khu vực phát hiện cách nơi cậu bé được cho là đã mất tích khoảng 5km. Khu rừng này được cho là nơi sinh sống của khoảng 500 con gấu nâu, tuy nhiên rất may là những ngày này chúng hoạt động hạn chế do có mưa rào. Ban đầu, cha mẹ của Yamato Tanooka khai báo với cảnh sát rằng con trai họ mất tích trong lúc cả nhà đang đi dạo trong rừng và hái rau dại. Sau đó, ông Takayuki Tanooka mới thừa nhận rằng mình và vợ dừng xe trên một con đường núi, yêu cầu con trai ra ngoài để chịu phạt rồi lái xe đi. Sau khi lái xe đi khoảng 500m, họ quay lại đón con nhưng Tanooka đã biến mất.

Prediction: 

Theo nguồn tin từ giới chức Nhật Bản, một quan chức của Lực lượng Phòng vệ Nhật Bản đã tình cờ tìm thấy cậu bé Ya

# GPT (Not complete)

## Dataset setup

In [3]:
prefix_length = 1

In [4]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# class VNSummaryDataset(Dataset):
#     def __init__(self, original_data_path, summary_data_path, cluster_list, 
#                  prefix_length, gpt2_type='NlpHUST/gpt2-vietnamese',
#                  original_data_type='mean'):
#         self.tokenizer = GPT2Tokenizer.from_pretrained(gpt2_type)
#         self.prefix_length = prefix_length
#         self.original_data_path = original_data_path
#         self.original_data_type = original_data_type
#         self.cluster_list = cluster_list
#         with open(summary_data_path, 'r') as f:
#             summary_data = json.load(f)
#         summary_data = {k: summary_data[k] for k in cluster_list}
#         print("Data size is %0d" % len(cluster_list))
#         sys.stdout.flush()
#         self.summary_tokens = {}
#         max_seq_len = 0
#         for cluster, summary in summary_data.items():
#             tokens = torch.tensor(self.tokenizer.encode(summary), dtype=torch.int64)
#             self.summary_tokens[cluster] = tokens
#             max_seq_len = max(max_seq_len, tokens.shape[0])
#         all_len = torch.tensor([len(self.summary_tokens[i]) for i in self.summary_tokens.keys()]).float()
#         self.max_seq_len = min(int(all_len.mean() + all_len.std() * 10), int(all_len.max()))

#     def pad_tokens(self, item):
#         tokens = self.summary_tokens[item]
#         padding = self.max_seq_len - tokens.shape[0]
#         if padding > 0:
#             tokens = torch.cat((tokens, torch.zeros(padding, dtype=torch.int64) - 1))
#             self.summary_tokens[item] = tokens
#         elif padding < 0:
#             tokens = tokens[:self.max_seq_len]
#             self.summary_tokens[item] = tokens
#         mask = tokens.ge(0)  # mask is zero where we out of sequence
#         tokens[~mask] = 0
#         mask = mask.float()
#         mask = torch.cat((torch.ones(self.prefix_length), mask), dim=0)  # adding prefix mask
#         return tokens, mask
    
#     def __len__(self):
#         return len(self.cluster_list)

#     def __getitem__(self, item):
#         cluster = self.cluster_list[item]
#         tokens, mask = self.pad_tokens(cluster)
#         prefix = torch.from_numpy(np.load(os.path.join(self.original_data_path, cluster, '{}.npy'.format(self.original_data_type)))).to(device)
#         return tokens, mask, prefix

class VNSummaryDataset(Dataset):
    def __init__(self, original_data_path, summary_data_path, cluster_list, 
                 prefix_length, gpt2_type='NlpHUST/gpt2-vietnamese'):
        self.tokenizer = GPT2Tokenizer.from_pretrained(gpt2_type)
        self.prefix_length = prefix_length
        with open(original_data_path, 'r') as f:
            self.prefix_data = json.load(f)
        self.cluster_list = cluster_list
        with open(summary_data_path, 'r') as f:
            summary_data = json.load(f)
        summary_data = {k: summary_data[k] for k in cluster_list}
        print("Data size is %0d" % len(cluster_list))
        sys.stdout.flush()
        self.summary_tokens = {}
        max_seq_len = 0
        for cluster, summary in summary_data.items():
            tokens = torch.tensor(self.tokenizer.encode(summary), dtype=torch.int64)
            self.summary_tokens[cluster] = tokens
            max_seq_len = max(max_seq_len, tokens.shape[0])
        all_len = torch.tensor([len(self.summary_tokens[i]) for i in self.summary_tokens.keys()]).float()
        self.max_seq_len = min(int(all_len.mean() + all_len.std() * 10), int(all_len.max()))

    def pad_tokens(self, item):
        tokens = self.summary_tokens[item]
        padding = self.max_seq_len - tokens.shape[0]
        if padding > 0:
            tokens = torch.cat((tokens, torch.zeros(padding, dtype=torch.int64) - 1))
            self.summary_tokens[item] = tokens
        elif padding < 0:
            tokens = tokens[:self.max_seq_len]
            self.summary_tokens[item] = tokens
        mask = tokens.ge(0)  # mask is zero where we out of sequence
        tokens[~mask] = 0
        mask = mask.float()
        mask = torch.cat((torch.ones(self.prefix_length), mask), dim=0)  # adding prefix mask
        return tokens, mask
    
    def __len__(self):
        return len(self.cluster_list)

    def __getitem__(self, item):
        cluster = self.cluster_list[item]
        tokens, mask = self.pad_tokens(cluster)
        prefix = torch.FloatTensor(self.prefix_data[cluster]).unsqueeze(0).to(device)
        return tokens, mask, prefix

In [5]:
dataset = VNSummaryDataset(
    original_data_path = '/kaggle/input/vims-feature/original_data_embedding.json', 
    summary_data_path = '/kaggle/input/vims-feature/summary_data.json',
    cluster_list = os.listdir('/kaggle/input/vims-feature/original_feature/original_feature/10_cluster')[:240],
    prefix_length = prefix_length
)

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/854k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/512k [00:00<?, ?B/s]

Downloading (…)in/added_tokens.json:   0%|          | 0.00/24.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/90.0 [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/215 [00:00<?, ?B/s]

Data size is 240


In [6]:
dataset[0][2].shape

torch.Size([1, 768])

## Model Setup

In [7]:
class VNSummaryModel(nn.Module):
    def __init__(self, prefix_length, prefix_size = 768):
        super(VNSummaryModel, self).__init__()
        self.prefix_length = prefix_length
        self.gpt = GPT2LMHeadModel.from_pretrained('NlpHUST/gpt2-vietnamese')
        self.gpt_embedding_size = self.gpt.transformer.wte.weight.shape[1]
        
    def get_dummy_token(self, batch_size, device):
        return torch.zeros(batch_size, self.prefix_length, dtype=torch.int64, device=device)

    def forward(self, tokens, prefix, mask = None, labels = None):
        embedding_text = self.gpt.transformer.wte(tokens)
        embedding_cat = torch.cat((prefix, embedding_text), dim=1)
        
        if labels is not None:
            dummy_token = self.get_dummy_token(tokens.shape[0], tokens.device)
            labels = torch.cat((dummy_token, tokens), dim=1)

        out = self.gpt(inputs_embeds=embedding_cat, labels=labels, attention_mask=mask)
        return out

In [18]:
dataloader = DataLoader(dataset, batch_size=8)

In [None]:
model = VNSummaryModel(prefix_length, prefix_size=768)
for idx, (tokens, mask, prefix) in enumerate(dataloader):
    model.zero_grad()
    tokens, mask, prefix = tokens.to(device), mask.to(device), prefix.to(device, dtype=torch.float32)
    outputs = model(tokens, prefix, mask)
    logits = outputs.logits[:, 10 - 1: -1]
    print(logits.shape)
    print(logits.reshape(-1, logits.shape[-1]).shape)
    print(tokens.shape)
    break

## Training

In [8]:
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = True

# define a rich console logger
console = Console(record=True)

# to display dataframe in ASCII format
def display_df(df):
    """display dataframe in ASCII format"""

    console = Console()
    table = Table(
        Column("source_text", justify="center"),
        Column("target_text", justify="center"),
        title="Sample Data",
        pad_edge=False,
        box=box.ASCII,
    )

    for i, row in enumerate(df.values.tolist()):
        table.add_row(row[0], row[1])

    console.print(table)

# training logger to log training progress
training_logger = Table()

def resetTable():
    global training_logger

    training_logger = Table(
    Column("Epoch", justify="center"),
    Column("Steps", justify="center"),
    Column("Loss", justify="center"),
    title="Training Status",
    pad_edge=False,
    box=box.ASCII,
)
resetTable()

In [None]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
batch_size = 8
epochs = 10
output_dir = '/kaggle/working/result'
lr = 2e-5
warmup_steps = 5000
if not os.path.exists(output_dir):
    os.makedirs(output_dir)
model = VNSummaryModel(prefix_length, prefix_size=768)
model = model.to(device)
model.train()
optimizer = AdamW(model.parameters(), lr=lr)
train_dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True, drop_last=True)
scheduler = get_linear_schedule_with_warmup(
    optimizer, num_warmup_steps=warmup_steps, num_training_steps=epochs * len(train_dataloader)
)
for epoch in range(epochs):
    print(f">>> Training epoch {epoch}")
    sys.stdout.flush()
    for idx, (tokens, mask, prefix) in enumerate(train_dataloader, 0):
        model.zero_grad()
        tokens, mask, prefix = tokens.to(device), mask.to(device), prefix.to(device, dtype=torch.float32)
        outputs = model(tokens, prefix, mask)
        logits = outputs.logits[:, dataset.prefix_length - 1: -1]
        loss = nnf.cross_entropy(logits.reshape(-1, logits.shape[-1]), tokens.flatten(), ignore_index=0)
        loss.backward()
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()
            
        if idx % 5 == 0:
            training_logger.add_row(str(epoch), str(idx), str(loss))
    console.print(training_logger)
    resetTable()

Downloading (…)lve/main/config.json:   0%|          | 0.00/884 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/510M [00:00<?, ?B/s]

>>> Training epoch 0




>>> Training epoch 1


>>> Training epoch 2


>>> Training epoch 3


>>> Training epoch 4


>>> Training epoch 5


>>> Training epoch 6


>>> Training epoch 7


>>> Training epoch 8
