In [1]:
import numpy as np
import pandas as pd
import torch

In [2]:
# import os
# os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"   # see issue #152
# os.environ["CUDA_VISIBLE_DEVICES"]="1, 2, 3"

In [3]:
def seed_everything(SEED):
    np.random.seed(SEED)
    torch.manual_seed(SEED)
    torch.cuda.manual_seed(SEED)
    torch.backends.cudnn.deterministic = True

In [4]:
SEED = 101
seed_everything(SEED)

In [5]:
from datasets import Dataset
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("./pretrained_model/phobert-base")


def tokenize_func(example):
    return tokenizer(example['text'],
                     add_special_tokens=True,
                     truncation=True,
                     return_attention_mask=True,
                     padding='max_length',
                     max_length=256)

def format_dataloader(data: pd.DataFrame, shuffle=True):
    if shuffle:
        data = Dataset.from_pandas(data).shuffle(seed=SEED)
    else: 
        data = Dataset.from_pandas(data)
    data = data.map(tokenize_func, batched=True)
    return data

2022-12-13 09:47:22.782749: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


HFValidationError: Repo id must be in the form 'repo_name' or 'namespace/repo_name': './pretrained_model/phobert-base'. Use `repo_type` argument if needed.

In [7]:
data1 = pd.read_csv('./data/data/original_data/preprocessed_segmented_data.csv')
data2 = pd.read_csv('./data/data/new_data/preprocessed_segmented_data.csv')
data3 = pd.read_csv('./data/data/new_data/preprocessed_segmented_crawled_data.csv')
# data1 = data1.iloc[:1000]
# data2 = data2.iloc[:1000]
# d1 = format_dataloader(data1)
# d2 = format_dataloader(data2)

In [7]:
from sklearn.model_selection import train_test_split, StratifiedKFold

# X_train, X_val, y_train, y_val = train_test_split(data1['text'], data1['label'], test_size=0.2, random_state=SEED)
splits = list(StratifiedKFold(n_splits=5, shuffle=True,
                              random_state=123).split(data1['text'], data1['label']))
# d1_train = pd.concat([X_train, y_train], axis=1)
# d1_train = pd.concat([d1_train, data2])
# d1_val = pd.concat([X_val, y_val], axis=1)
# d1_train = format_dataloader(d1_train)
# d1_val = format_dataloader(d1_val)
# data_train = pd.concat([X_train, y_train], axis=1)
# data_val = pd.concat([X_val, y_val], axis=1)

In [8]:
# additional_data_2 = data2[data2['label']==0]
# additional_data = pd.concat([data3, additional_data_2])
# data_train = pd.concat([data_train, additional_data])

In [9]:
# additional_data_neg = data2[data2['label']==0]
# additional_data_pos = data2[data2['label']==1].sample(n=19800, random_state=SEED)
# additional_data = pd.concat([additional_data_pos, additional_data_neg])
# # additional_data = data2[data2['label']==0].sample(n=5222, random_state=101)
# data_train = pd.concat([data_train, additional_data])

In [11]:
# data_train = data_train.sample(n=10)
# data_val = data_val.sample(n=10)
# data_train = format_dataloader(data_train)
# data_val = format_dataloader(data_val)

In [15]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="./checkpoints",
    evaluation_strategy="epoch",
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=5,
    learning_rate=3e-5,
    weight_decay=0.01,
    num_train_epochs=3,
    warmup_ratio=0.1,
#     warmup_steps=100,
    logging_dir="./log",
    save_strategy="epoch",
    seed=SEED,
    report_to="none",
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False
)

In [16]:
from transformers import Trainer
from operator import itemgetter
import torch.nn as nn
import tensorflow as tf
import numpy as np
import evaluate

metric = evaluate.load("./metrics/roc_auc.py")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    probs = tf.math.softmax(logits, axis=-1)[:, 1]
    return metric.compute(references=labels, prediction_scores=probs)

def compute_metrics_custom(eval_pred):
    outputs, labels = eval_pred
    logits = outputs[0]
    predictions = tf.math.softmax(logits, axis=-1)[:, -1]
    return metric.compute(references=labels, prediction_scores=predictions)

class CustomTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False):
        labels = inputs.get("labels")
        # forward pass
        # inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.get("logits")
        # compute custom loss (suppose one has 3 labels with different weights)
        loss_fct = nn.CrossEntropyLoss()
        loss = loss_fct(logits.view(-1, self.model.config.num_labels), labels.view(-1))
        return (loss, outputs) if return_outputs else loss

In [17]:
from models import RobertaForSentimentAnalysisV2
# from transformers import AutoModelForSequenceClassification


for fold, (train_idx, val_idx) in enumerate(splits):
    print("Training for fold {}".format(fold))
    # format data
    X_train = data1['text'][train_idx]
    y_train = data1['label'][train_idx]
    X_val = data1['text'][val_idx]
    y_val = data1['label'][val_idx]
    data_train = pd.concat([X_train, y_train], axis=1)
    data_val = pd.concat([X_val, y_val], axis=1)
    data_train = format_dataloader(data_train)
    data_val = format_dataloader(data_val)

    # create model
    model = RobertaForSentimentAnalysisV2.from_pretrained("./pretrained_model/phobert-base", num_labels=2)
    # model = AutoModelForSequenceClassification.from_pretrained("./pretrained_model/phobert-base", num_labels=2)

    trainer = CustomTrainer(
        model=model,
        args=training_args,
        train_dataset=data_train,
        eval_dataset=data_val,
        compute_metrics=compute_metrics,
    )
    trainer.train()

The following columns in the training set don't have a corresponding argument in `RobertaForSentimentAnalysisV2.forward` and have been ignored: __index_level_0__, text. If __index_level_0__, text are not expected by `RobertaForSentimentAnalysisV2.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 52052
  Num Epochs = 5
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 80
  Gradient Accumulation steps = 5
  Total optimization steps = 3250


Dataset({
    features: ['text', 'label', '__index_level_0__', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 52052
})


Epoch,Training Loss,Validation Loss


The following columns in the evaluation set don't have a corresponding argument in `RobertaForSentimentAnalysisV2.forward` and have been ignored: __index_level_0__, text. If __index_level_0__, text are not expected by `RobertaForSentimentAnalysisV2.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 1813
  Batch size = 16


RuntimeError: CUDA out of memory. Tried to allocate 360.00 MiB (GPU 0; 31.75 GiB total capacity; 10.49 GiB already allocated; 93.00 MiB free; 11.04 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [None]:
# trainer = CustomTrainer(
#     model=model,
#     args=training_args,
#     train_dataset=data_train,
#     eval_dataset=data_val,
#     compute_metrics=compute_metrics,
# )
# trainer.train()

In [None]:
# preds = trainer.evaluate(d1_val)

In [None]:
# model = AutoModelForSequenceClassification.from_pretrained("./checkpoints/checkpoint-2154")
# model = AutoModelForSequenceClassification.from_pretrained("./checkpoints/checkpoint-2019")
# trainer = Trainer(
#     model=model,
#     args=training_args,
#     train_dataset=data_train,
#     eval_dataset=data_val,
#     compute_metrics=compute_metrics,
# )
# submission = pd.read_csv('./data/data/original_data/preprocessed_unsegmented_submission.csv').fillna("ngon")
# submission_loader = format_dataloader(submission, shuffle=False)
# predictions = trainer.predict(submission_loader)
# probs = tf.math.softmax(predictions.predictions, axis=-1)[:, 1]
# submission['Rating'] = pd.Series(probs)
# submission = submission[['RevId', 'Rating']]
# submission.to_csv('XLM_R_submission.csv', index=False)

In [None]:
submission = pd.read_csv('./data/data/original_data/preprocessed_unsegmented_submission.csv')
submission_loader = format_dataloader(submission, shuffle=False)
predictions = trainer.predict(submission_loader)
probs = tf.math.softmax(predictions.predictions, axis=-1)[:, 1]
submission['Rating'] = pd.Series(probs)
submission = submission[['RevId', 'Rating']]
submission.to_csv('XLM_R_submission.csv', index=False)