In [1]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
import pandas as pd
from tqdm import tqdm
import numpy as np
# from torch.cuda.amp import autocast
from transformers import BertTokenizer, BertForSequenceClassification, Trainer
import re
import random

In [2]:
# Load the labelled training data and build one text field per article.
data = pd.read_csv('train.csv')

# Fill missing values in BOTH columns: concatenating a string with NaN yields
# NaN, which would later be stringified and tokenized as the literal "nan".
X = data['title'].fillna('') + " " + data['content'].fillna('')
Y = data['class']

# A fixed random_state makes the train/validation partition reproducible
# across kernel restarts, independent of any global seeding done elsewhere.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.15, random_state=42
)

# Re-index every split from 0 so items can be looked up by integer position.
X_train = X_train.reset_index(drop=True)
X_test = X_test.reset_index(drop=True)
Y_train = Y_train.reset_index(drop=True)
Y_test = Y_test.reset_index(drop=True)

In [3]:
class TextClassificationDataset(Dataset):
    """Map-style dataset that tokenizes one text per item for classification.

    Args:
        texts: Sequence of raw texts (list, pandas Series, ...).
        labels: Sequence of labels aligned with ``texts``; each label must be
            convertible with ``int()``.
        tokenizer: Callable tokenizer accepting the keyword arguments used in
            ``__getitem__`` and returning a mapping with ``'input_ids'`` and
            ``'attention_mask'`` tensors.
        max_len: Length every encoded sequence is padded/truncated to.

    Raises:
        ValueError: If ``texts`` and ``labels`` differ in length.
    """

    def __init__(self, texts, labels, tokenizer, max_len):
        # Materialize as plain lists so __getitem__ indexes by POSITION.
        # Indexing a pandas Series with an int is label-based, which raises
        # KeyError (or returns the wrong row) when the index is not 0..n-1.
        self.texts = list(texts)
        self.labels = list(labels)
        if len(self.texts) != len(self.labels):
            raise ValueError(
                f"texts and labels must have the same length, "
                f"got {len(self.texts)} and {len(self.labels)}"
            )
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of samples."""
        return len(self.texts)

    def __getitem__(self, item):
        """Return the encoded sample at position ``item``.

        Returns:
            dict with the raw ``'text'``, 1-D ``'input_ids'`` and
            ``'attention_mask'`` tensors, and a scalar long ``'label'`` tensor.
        """
        text = str(self.texts[item])
        label = self.labels[item]

        # Calling the tokenizer directly is the supported replacement for the
        # deprecated ``encode_plus`` and takes the same keyword arguments.
        encoding = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt',
            truncation=True
        )

        return {
            'text': text,
            # return_tensors='pt' adds a leading batch dimension; flatten it
            # away so the DataLoader can stack samples into a batch.
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'label': torch.tensor(int(label), dtype=torch.long)  # Convert label to int
        }

In [4]:
# Stefan Dumitrescu, Andrei-Marius Avram, and Sampo Pyysalo. 2020. The birth of Romanian BERT. In Findings of the Association for Computational Linguistics: EMNLP 2020, pages 4324–4328, Online. Association for Computational Linguistics.
MODEL_NAME = 'dumitrescustefan/bert-base-romanian-cased-v1'
MAX_LEN = 200      # tokens per sample after padding/truncation
BATCH_SIZE = 16

tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)

# Without num_labels the classification head is created with 2 outputs no
# matter what the data contains. Size it from the labels instead.
# Assumes labels are 0-based class indices (they are cast with int() and fed
# to a cross-entropy loss); the floor of 2 keeps the model in classification
# mode even if only class 0 is present.
NUM_LABELS = max(2, int(Y.astype(int).max()) + 1)
model = BertForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=NUM_LABELS)

train_dataset = TextClassificationDataset(X_train, Y_train, tokenizer, max_len=MAX_LEN)
test_dataset = TextClassificationDataset(X_test, Y_test, tokenizer, max_len=MAX_LEN)

# Shuffle only the training data; keep the held-out split in a stable order.
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at dumitrescustefan/bert-base-romanian-cased-v1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [ ]:
def reset_numpy_seed(seed_value=42):
  """Seed NumPy's global random generator and report the outcome on stdout.

  Any failure (including NumPy not being importable) is caught and printed
  instead of being raised.
  """
  try:
    # Imported lazily so a missing NumPy is reported rather than fatal.
    from numpy import random as numpy_random
    numpy_random.seed(seed_value)
    print(f'NumPy random seed set with value: {seed_value}')
  except Exception as err:
    print(f'NumPy random seed was not set: {err}')


def reset_tensorflow_seed(seed_value=42):
  """Best-effort seeding of TensorFlow, reporting the outcome on stdout.

  Both known seeding entry points are attempted, one after the other, because
  which one exists depends on the installed TensorFlow version. Failures of
  an individual attempt are ignored; the seed counts as set if at least one
  attempt succeeded. Nothing is raised, even when TensorFlow is missing.
  """
  try:
    import tensorflow

    # Each candidate is wrapped in a lambda so the attribute lookup itself
    # happens inside the per-attempt try block below.
    attempts = (
      lambda: tensorflow.random.set_seed(seed_value),
      lambda: tensorflow.set_random_seed(seed_value),
    )
    seeded = False
    for apply_seed in attempts:
      try:
        apply_seed()
      except Exception:
        continue
      seeded = True

    if not seeded:
      print(f'TensorFlow random seed was not set')
    else:
      print(f'TensorFlow random seed set with value: {seed_value}')
  except Exception as err:
    print(f'TensorFlow random seed was not set: {err}')


def reset_torch_seed(seed_value=42):
  """Seed PyTorch's CPU generator and, when CUDA is available, the GPU ones.

  The outcome is printed; any exception (including PyTorch not being
  importable) is caught and reported instead of being raised.
  """
  try:
    import torch as torch_lib

    torch_lib.manual_seed(seed_value)
    cuda = torch_lib.cuda
    if cuda.is_available():
      # Seed the current device, then every visible device (multi-GPU).
      cuda.manual_seed(seed_value)
      cuda.manual_seed_all(seed_value)
    print(f'PyTorch random seed set with value: {seed_value}')
  except Exception as err:
    print(f'PyTorch random seed was not set: {err}')


def set_random_seeds(seed_value=42):
  """Seed Python's ``random`` module, then NumPy, TensorFlow and PyTorch.

  The same ``seed_value`` is handed to every library-specific helper, in that
  order.
  """
  random.seed(seed_value)
  for seed_library in (reset_numpy_seed, reset_tensorflow_seed, reset_torch_seed):
    seed_library(seed_value)


# Seed every random number generator when this code runs as the main module.
if __name__ == '__main__':
  seed = 42  # single seed value shared by all libraries
  set_random_seeds(seed)


In [7]:
from torch.cuda.amp import GradScaler, autocast
import torch

# Run on the GPU when one is visible, otherwise on the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model = model.to(device)

# Rebuild the training loader with a larger batch (raise further if GPU memory allows).
train_loader = DataLoader(dataset=train_dataset, batch_size=32, shuffle=True)

optimizer = torch.optim.Adam(params=model.parameters(), lr=1e-5)
loss_fn = torch.nn.CrossEntropyLoss()
loss_fn = loss_fn.to(device)

# Gradient scaler used for mixed precision training.
scaler = GradScaler()

def train_epoch(model, data_loader, loss_fn, optimizer, device, scaler, epoch):
    """Train ``model`` for one pass over ``data_loader`` with mixed precision.

    Args:
        model: Model called with ``input_ids``, ``attention_mask`` and
            ``labels``; its first output is used as the loss.
        data_loader: Iterable of batches, each a dict with ``"input_ids"``,
            ``"attention_mask"`` and ``"label"`` tensors.
        loss_fn: Unused (the loss comes from the model's own output); kept so
            existing callers keep working.
        optimizer: Optimizer stepped once per batch through ``scaler``.
        device: Device the batch tensors are moved to.
        scaler: Gradient scaler used to scale the loss and step the optimizer.
        epoch: Epoch identifier, used only in the checkpoint file name.

    Returns:
        Mean of the per-batch losses over the epoch (``nan`` if the loader
        yielded no batches).

    Side effects:
        Writes the model's state dict to ``model_epoch_{epoch}.pth``.
    """
    model.train()

    loss_sum = 0.0
    batch_count = 0

    progress_bar = tqdm(data_loader, desc='Training', total=len(data_loader))
    for batch in progress_bar:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["label"].to(device)

        # Clear gradients BEFORE the forward/backward pass so nothing left
        # over from earlier code leaks into the first optimizer step.
        optimizer.zero_grad()

        # Use autocast to enable mixed precision training
        with autocast():
            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                labels=labels
            )

            loss = outputs[0]

        # Scale the loss and call backward to create scaled gradients
        scaler.scale(loss).backward()

        # Unscales the gradients of optimizer's assigned params and call optimizer's step() method
        scaler.step(optimizer)

        # Update the scale for next iteration
        scaler.update()

        # Report the batch loss as-is. The batch is a dict, so dividing by
        # len(batch) would divide by the number of keys, not the batch size.
        batch_loss = loss.item()
        loss_sum += batch_loss
        batch_count += 1
        progress_bar.set_postfix({'training_loss': '{:.3f}'.format(batch_loss)})

    # Save the model after each epoch
    torch.save(model.state_dict(), f'model_epoch_{epoch}.pth')

    return loss_sum / batch_count if batch_count else float('nan')

# Run the full training schedule, refreshing the "latest" checkpoint after every epoch.
NUM_EPOCHS = 10
for epoch in range(NUM_EPOCHS):
    print(f'Epoch {epoch + 1}/{NUM_EPOCHS}')
    train_epoch(
        model,
        train_loader,
        loss_fn,
        optimizer,
        device,
        scaler,
        epoch + 1,  # checkpoints are numbered from 1
    )
    torch.save(model.state_dict(), 'model_actual.pth')


Epoch 1/10


Training: 100%|██████████| 1875/1875 [13:28<00:00,  2.32it/s, training_loss=0.000]


Epoch 2/10


Training: 100%|██████████| 1875/1875 [13:44<00:00,  2.27it/s, training_loss=0.000]


Epoch 3/10


Training: 100%|██████████| 1875/1875 [13:32<00:00,  2.31it/s, training_loss=0.000]


Epoch 4/10


Training: 100%|██████████| 1875/1875 [13:39<00:00,  2.29it/s, training_loss=0.000]


Epoch 5/10


Training: 100%|██████████| 1875/1875 [13:40<00:00,  2.29it/s, training_loss=0.000]


Epoch 6/10


Training:   0%|          | 5/1875 [00:02<14:23,  2.17it/s, training_loss=0.000]


KeyboardInterrupt: 

In [8]:
# Persist the current weights before running inference.
torch.save(model.state_dict(), 'model_eva.pth')

# Load the unlabelled test data and build the same "title content" text field
# used for training. Both columns are filled so a missing title does not turn
# the whole text into NaN.
# NOTE: X_test, test_dataset and test_loader are rebound here; the held-out
# validation split that previously used these names is no longer reachable.
test_data = pd.read_csv('test.csv')
X_test = test_data['title'].fillna('') + " " + test_data['content'].fillna('')

# The dataset requires labels; the test file has none, so placeholder zeros
# are passed and never read during prediction.
test_dataset = TextClassificationDataset(X_test, [0]*len(X_test), tokenizer, max_len=200)

# Keep the original row order so predictions line up with test_data['id'].
test_loader = DataLoader(test_dataset, batch_size=20, shuffle=False)

# Make predictions on the test data
model.eval()
predictions = []
with torch.no_grad():
    for batch in test_loader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)

        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask
        )

        # Get the logits from the model's output
        logits = outputs.logits

        # Predicted class = index of the largest logit. Transfer to the CPU
        # once per batch and store plain ints instead of 0-d device tensors,
        # which would need one .item() synchronization per row later on.
        _, preds = torch.max(logits, dim=1)
        predictions.extend(preds.cpu().tolist())

# Every test row must have received exactly one prediction.
assert len(predictions) == len(test_data), "prediction count does not match test rows"

# Save to a new CSV file the id of the article and the predictions with an extra newline after each line
with open('predictions_adam.csv', 'w') as f:
    f.write("id,class\n\n")
    for article_id, pred in zip(test_data['id'], predictions):
        f.write(f"{article_id},{pred}\n\n")