In [1]:
!pip install transformers
!pip install sentencepiece

Collecting sentencepiece
  Downloading sentencepiece-0.1.99-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m6.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: sentencepiece
Successfully installed sentencepiece-0.1.99


In [8]:
import transformers
import pandas as pd
import numpy as np
import transformers
from sklearn import model_selection
from tqdm import trange
import torch

# ---- Notebook-wide configuration -------------------------------------------
# Number of passes over the training data.
EPOCH = 5
# Token-length cap for encoded inputs — presumably; verify it is the value
# actually handed to the dataset.
MAX_LEN = 512
# Use the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
# Checkpoint identifier passed to `from_pretrained` for the tokenizer below.
MODEL_BASE_PATH = "Roberta-Base"
# Classification-head hyperparameters.
# NOTE(review): verify these against the encoder's real hidden width and the
# number of targets the dataset produces before wiring them into the model.
HIDDEN_SIZE = 512
NUMBER_OF_LABEL = 30
HIDDEN_DROPOUT_PROB = 0.3
# Mini-batch sizes for the training and validation loaders.
TRAIN_BATCH_SIZE = 16
VALID_BATCH_SIZE = 8
# Shared tokenizer instance (downloads vocab/merges files on first use).
# NOTE(review): `do_lower_case` is a BERT-style option; confirm whether it has
# any effect on a RoBERTa tokenizer or should be dropped.
TOKENIZER = transformers.RobertaTokenizer.from_pretrained(MODEL_BASE_PATH,do_lower_case = True)

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

In [9]:
from transformers import RobertaTokenizer
from torch.utils.data import Dataset

In [10]:
# Mount Google Drive at /content/drive so files stored there become reachable.
# NOTE(review): this relies on the `google.colab` package, so it is presumably
# only runnable inside Google Colab — confirm before running elsewhere.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [11]:
class MyQnADataset(torch.utils.data.Dataset):
    """Map-style dataset that encodes question-answering records as fixed-length tensors.

    Every record must provide ``record['context']`` and
    ``record['qas'][0]`` with the keys ``'question'``, ``'answers'`` and
    ``'is_impossible'``. Only the first ``qas`` entry and the first answer
    are read.

    Args:
        data: Integer-indexable sequence of records (e.g. a list of dicts).
        tokenizer: Callable Hugging Face-style tokenizer whose result exposes
            ``"input_ids"`` and ``"attention_mask"``.
        max_len: Length every encoded example is truncated / padded to.
    """

    def __init__(self, data, tokenizer, max_len):
        self.data = data
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of records."""
        return len(self.data)

    def __getitem__(self, item):
        """Encode record ``item``.

        Returns:
            dict with ``"ids"`` and ``"masks"`` (long tensors of length
            ``max_len``) and ``"targets"`` (0-dim float tensor holding the
            ``is_impossible`` flag).
        """
        record = self.data[item]
        qa = record['qas'][0]

        context = str(record['context'])
        question = str(qa['question'])

        # An empty (or missing-valued) answers list must not crash indexing;
        # fall back to an empty second segment in that case.
        answers = qa['answers']
        answers_text = str(answers[0]['text']) if answers else ""

        # NOTE(review): the question is appended to the context inside the
        # first segment, so truncation may cut the question off for long
        # contexts, and the answer text is visible to the classifier —
        # confirm both are intended.
        #
        # Let the tokenizer pad to `max_len` itself: it uses its own pad token
        # id (which is not 0 for RoBERTa) and builds a matching attention mask.
        encoded = self.tokenizer(
            context + " " + question,
            answers_text,
            add_special_tokens=True,
            max_length=self.max_len,
            truncation=True,
            padding="max_length",
        )

        return {
            "ids": torch.tensor(encoded["input_ids"], dtype=torch.long),
            "masks": torch.tensor(encoded["attention_mask"], dtype=torch.long),
            "targets": torch.tensor(qa['is_impossible'], dtype=torch.float)}


In [12]:
class ROBERTAModel(torch.nn.Module):
    """RoBERTa encoder followed by a two-layer classification head.

    The head reads the hidden state of the first token, applies
    dropout -> linear -> tanh -> dropout -> linear, and returns raw logits.

    Args:
        conf: ``transformers.RobertaConfig`` describing the encoder.
        pretrained_path: Checkpoint name or directory to load the encoder
            weights from. ``None`` falls back to the notebook-level
            ``MODEL_BASE_PATH``.
        load_pretrained: Pass ``False`` to build the encoder from ``conf``
            with freshly initialised weights (e.g. right before restoring a
            saved ``state_dict``).
        num_labels: Number of logits produced per example.
        dropout_prob: Dropout probability used inside the head.
    """

    def __init__(self, conf, pretrained_path=None, load_pretrained=True,
                 num_labels=30, dropout_prob=0.3):
        super(ROBERTAModel, self).__init__()
        self.conf = conf

        if load_pretrained:
            # Constructing RobertaModel from a config alone yields random
            # weights; fine-tuning needs the pretrained checkpoint.
            checkpoint = MODEL_BASE_PATH if pretrained_path is None else pretrained_path
            self.roberta = transformers.RobertaModel.from_pretrained(
                checkpoint, config=self.conf, add_pooling_layer=False
            )
        else:
            self.roberta = transformers.RobertaModel(self.conf, add_pooling_layer=False)

        # Derive the head width from the encoder config instead of hard-coding it.
        hidden = self.conf.hidden_size
        self.dense = torch.nn.Linear(hidden, hidden)
        self.dropout = torch.nn.Dropout(dropout_prob)
        self.classifier = torch.nn.Linear(hidden, num_labels)

    def forward(self, input_id, attention_mask):
        """Return logits of shape ``(batch, num_labels)`` for the given token ids and mask."""
        # Index the tuple rather than unpacking it so that additional entries
        # (e.g. hidden states / attentions enabled in the config) do not break.
        sequence_output = self.roberta(
            input_ids=input_id, attention_mask=attention_mask, return_dict=False
        )[0]
        output = sequence_output[:, 0, :]  # hidden state of the first token
        output = self.dropout(output)
        output = self.dense(output)
        output = torch.tanh(output)
        output = self.dropout(output)
        output = self.classifier(output)
        return output

In [13]:
def Loss(output,target):
    """Binary cross-entropy computed directly on raw logits.

    ``output`` holds un-normalised scores (no sigmoid applied) and ``target``
    is a float tensor of the same shape; the result is a scalar tensor.
    """
    criterion = torch.nn.BCEWithLogitsLoss()
    return criterion(output, target)



def Train(DataLoader,Model,Optimizer,Device,Scheduler = None):
    """Run one training epoch over ``DataLoader``.

    Args:
        DataLoader: Iterable of batches; each batch is a dict with the keys
            ``"ids"``, ``"masks"`` and ``"targets"``.
        Model: Module called as ``Model(input_id=..., attention_mask=...)``.
        Optimizer: Optimizer stepped once per batch.
        Device: Device the batch tensors are moved to.
        Scheduler: Optional learning-rate scheduler, stepped once per batch.
    """
    Model.train()

    for index, batch in enumerate(DataLoader):
        # The dataset stores the token ids under "ids"; reading "id" raised KeyError.
        input_ids = batch["ids"].to(Device, dtype=torch.long)
        masks = batch["masks"].to(Device, dtype=torch.long)
        target = batch["targets"].to(Device, dtype=torch.float)

        Optimizer.zero_grad()

        output = Model(
            input_id=input_ids,
            attention_mask=masks
        )
        loss = Loss(output, target)
        loss.backward()

        # Cap the global gradient norm before the parameter update.
        torch.nn.utils.clip_grad_norm_(Model.parameters(), 1.0)

        Optimizer.step()

        if Scheduler is not None:
            Scheduler.step()

        if index % 10 == 0:
            # .item() prints the scalar value rather than the tensor repr.
            print(f"Index {index} >>>====================>>> Train Loss {loss.item()}")



def Eval(DataLoader,Model,Device):
    """Evaluate ``Model`` on every batch of ``DataLoader`` without tracking gradients.

    Args:
        DataLoader: Iterable of batches; each batch is a dict with the keys
            ``"ids"``, ``"masks"`` and ``"targets"``.
        Model: Module called as ``Model(input_id=..., attention_mask=...)``.
        Device: Device the batch tensors are moved to.

    Returns:
        Tuple ``(loss, targets, outputs)``: ``loss`` is a 0-dim tensor with
        the example-weighted mean loss over all batches, ``targets`` and
        ``outputs`` are vertically stacked NumPy arrays of the targets and
        the sigmoid probabilities.

    Raises:
        ValueError: If ``DataLoader`` yields no batches.
    """
    Model.eval()
    final_outputs = []
    final_targets = []
    loss_sum = 0.0
    seen = 0

    # Inference only: skip autograd bookkeeping to save memory and time.
    with torch.no_grad():
        for index, batch in enumerate(DataLoader):
            # The dataset stores the token ids under "ids"; reading "id" raised KeyError.
            input_ids = batch["ids"].to(Device, dtype=torch.long)
            masks = batch["masks"].to(Device, dtype=torch.long)
            target = batch["targets"].to(Device, dtype=torch.float)

            output = Model(
                input_id=input_ids,
                attention_mask=masks
            )

            loss = Loss(output, target)

            # Weight each batch by its size so a short final batch does not skew the mean.
            batch_size = input_ids.size(0)
            loss_sum += loss.item() * batch_size
            seen += batch_size

            final_targets.extend(target.cpu().detach().numpy().tolist())
            final_outputs.extend(torch.sigmoid(output).cpu().detach().numpy().tolist())

            if index % 10 == 0:
                print(f"Index : {index} >>>====================>>> Valid Loss : {loss.item()}")

    if seen == 0:
        raise ValueError("Eval received an empty DataLoader; nothing to evaluate.")

    # Report the loss over the whole validation set, not just the last batch.
    mean_loss = torch.tensor(loss_sum / seen)
    return mean_loss, np.vstack(final_targets), np.vstack(final_outputs)


In [14]:
def optimizer_params(Model):
    """Split the model's parameters into two optimizer groups.

    Parameters whose name contains "bias", "LayerNorm.bias" or
    "LayerNorm.weight" go into a group with weight decay 0.0; all remaining
    parameters get weight decay 0.001.

    Returns:
        A two-element list of param-group dicts: decayed group first,
        exempt group second.
    """
    exempt_markers = ("bias", "LayerNorm.bias", "LayerNorm.weight")

    decayed, exempt = [], []
    for name, param in Model.named_parameters():
        is_exempt = any(marker in name for marker in exempt_markers)
        (exempt if is_exempt else decayed).append(param)

    return [
        {'params': decayed, 'weight_decay': 0.001},
        {'params': exempt, 'weight_decay': 0.0},
    ]

In [21]:
def train(train_csv="/content/train.csv", model_path="model.bin"):
    """Fine-tune the classifier and keep the weights with the lowest validation loss.

    Args:
        train_csv: CSV file holding the full labelled data set; 20% of its
            rows are held out for validation.
        model_path: File the best ``state_dict`` is written to.
    """
    # Loading the Data
    # NOTE(review): a separate validation CSV used to be read here but was
    # immediately overwritten by the split below, so that dead read was
    # removed. If a dedicated validation file is intended, load it and skip
    # the split instead.
    df_all = pd.read_csv(train_csv)

    # Splitting the data into Train and Validation (fixed seed for repeatability)
    df_train, df_valid = model_selection.train_test_split(df_all, test_size=0.2, random_state=2021)

    # The dataset indexes records by integer position, which a DataFrame does
    # not support (df[int] is a column lookup), so hand it plain row dicts.
    # NOTE(review): MyQnADataset reads nested 'qas' / 'answers' fields —
    # confirm the CSV columns really hold that structure.
    train_records = df_train.to_dict("records")
    valid_records = df_valid.to_dict("records")

    # Creating Training and Validation Dataset
    Train_Dataset = MyQnADataset(data=train_records, tokenizer=TOKENIZER, max_len=MAX_LEN)
    Valid_Dataset = MyQnADataset(data=valid_records, tokenizer=TOKENIZER, max_len=MAX_LEN)

    Train_DataLoader = torch.utils.data.DataLoader(
        Train_Dataset,
        batch_size=TRAIN_BATCH_SIZE,
        shuffle=True
    )

    Valid_DataLoader = torch.utils.data.DataLoader(
        Valid_Dataset,
        batch_size=VALID_BATCH_SIZE,
        shuffle=False
    )

    # NOTE(review): the dataset yields one scalar target per example; confirm
    # the model's output width matches it, otherwise BCEWithLogitsLoss will
    # reject the shapes.
    config = transformers.RobertaConfig.from_pretrained(MODEL_BASE_PATH)
    model = ROBERTAModel(conf=config)
    model.to(DEVICE)

    optimizer_grouped_parameters = optimizer_params(Model=model)
    # torch's AdamW replaces the deprecated transformers.AdamW; it always
    # applies bias correction, and weight decay comes from the param groups.
    optimizer = torch.optim.AdamW(optimizer_grouped_parameters, lr=3e-5)

    # The scheduler is stepped once per batch, so count batches (including a
    # partial final one) rather than dividing the row count.
    total_steps = len(Train_DataLoader) * EPOCH
    scheduler = transformers.get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

    # Track the best validation loss across epochs; it must be initialised
    # once, before the loop, for the comparison to mean anything.
    best_loss = np.inf
    for epoch in trange(EPOCH, desc="EPOCH"):
        Train(
            DataLoader=Train_DataLoader,
            Model=model,
            Optimizer=optimizer,
            Device=DEVICE,
            Scheduler=scheduler
        )

        valid_loss, _, _ = Eval(
            DataLoader=Valid_DataLoader,
            Model=model,
            Device=DEVICE
        )
        valid_loss = float(valid_loss)

        # Checkpoint only when this epoch improved on every previous one.
        if valid_loss < best_loss:
            torch.save(model.state_dict(), model_path)
            best_loss = valid_loss


In [22]:
# Kick off the full training run.
# NOTE(review): the output recorded for this cell is a pandas ParserError, so
# the run did not complete — presumably raised while reading the CSV input;
# verify the data files before re-running.
train()

ParserError: ignored