In [1]:
# !pip install transformers torch numpy pandas

In [2]:
import random


def reset_numpy_seed(seed_value=42):
  try:
    # Set NumPy random seed
    import numpy as np
    np.random.seed(seed_value)
    print(f'NumPy random seed set with value: {seed_value}')
  except Exception as e:
    print(f'NumPy random seed was not set: {e}')
  return


def reset_tensorflow_seed(seed_value=42):
  try:
    # Set TensorFlow random seed
    import tensorflow as tf
    success = False
    # Here we have 2 different ways to set the seed
    # depending on the version of TensorFlow
    try:
      tf.random.set_seed(seed_value)
      success = True
    except Exception as e:
      pass
    try:
      tf.set_random_seed(seed_value)
      success = True
    except Exception as e:
      pass
    if success:
      print(f'TensorFlow random seed set with value: {seed_value}')
    else:
      print(f'TensorFlow random seed was not set')
  except Exception as e:
    print(f'TensorFlow random seed was not set: {e}')
  return


def reset_torch_seed(seed_value=42):
  try:
    # Set PyTorch random seed
    import torch
    torch.manual_seed(seed_value)
    if torch.cuda.is_available():
      torch.cuda.manual_seed(seed_value)
      torch.cuda.manual_seed_all(seed_value)  # if you are using multiple GPUs
    print(f'PyTorch random seed set with value: {seed_value}')
  except Exception as e:
    print(f'PyTorch random seed was not set: {e}')
  return


def set_random_seeds(seed_value=42):
  # Set Python random seed
  random.seed(seed_value)
  reset_numpy_seed(seed_value)
  reset_tensorflow_seed(seed_value)
  reset_torch_seed(seed_value)
  return


if __name__ == '__main__':
  # Set the desired seed value
  seed = 42

  # Set random seeds
  set_random_seeds(seed)

NumPy random seed set with value: 42
TensorFlow random seed was not set: No module named 'tensorflow'
PyTorch random seed set with value: 42


In [3]:
#!pip install ipywidgets --upgrade

In [4]:
from transformers import AutoTokenizer, AutoModel, Trainer, TrainingArguments, logging

import pandas as pd

In [22]:
# load the tokenizer and the model
tokenizer = AutoTokenizer.from_pretrained("racai/distilbert-base-romanian-cased", cache_dir=".cache/huggingface", use_fast=True)
special_tokens_dict = {'additional_special_tokens': ['[MISSING_TITLE]', '[MISSING_CONTENT]']}
tokenizer.add_special_tokens(special_tokens_dict)

2

In [23]:
from torch import nn
from transformers.modeling_outputs import SequenceClassifierOutput
from transformers import AutoModel
from torch.nn import LayerNorm

class RomanianBertForSequenceClassification(nn.Module):
    def __init__(self, num_labels=2, class_weights=None):
        super(RomanianBertForSequenceClassification, self).__init__()
        self.distilbert = AutoModel.from_pretrained(
            "racai/distilbert-base-romanian-cased",
            cache_dir=".cache/huggingface"
        )
        vocab_size = self.distilbert.config.vocab_size
        self.distilbert.resize_token_embeddings(len(tokenizer))
        self.dropout = nn.Dropout(p=0.5)
        self.layer_norm = LayerNorm(self.distilbert.config.hidden_size)
        self.classifier = nn.Linear(self.distilbert.config.hidden_size, num_labels)
        self.class_weights = class_weights  # Store class weights
        if self.class_weights is not None:
            self.class_weights = self.class_weights.to(self.distilbert.device)
        self.loss_fct = nn.CrossEntropyLoss(weight=self.class_weights)
    
    def forward(self, input_ids, attention_mask, labels=None):
        outputs = self.distilbert(input_ids=input_ids, attention_mask=attention_mask)
        hidden_state = outputs.last_hidden_state[:, 0, :]  # [CLS] token representation

        hidden_state = self.layer_norm(hidden_state)

        hidden_state = self.dropout(hidden_state)

        logits = self.classifier(hidden_state)

        loss = None
        if labels is not None:
            loss = self.loss_fct(logits.view(-1, self.classifier.out_features), labels.view(-1))

        return SequenceClassifierOutput(loss=loss, logits=logits)


In [7]:
# !pip install scikit-learn

In [8]:
from transformers import pipeline
import torch

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
summarizer = pipeline("summarization", device=device)

def summarize_text(text):
    summary = summarizer(text, max_length=5, min_length=5, do_sample=False)
    return summary[0]['summary_text']

No model was supplied, defaulted to sshleifer/distilbart-cnn-12-6 and revision a4f8f3e (https://huggingface.co/sshleifer/distilbart-cnn-12-6).
Using a pipeline without specifying a model name and revision in production is not recommended.


In [9]:
from sklearn.model_selection import train_test_split

df = pd.read_csv("train.csv")

df['sentence'] = df['title'].fillna('[MISSING_TITLE]') + ' / ' + df['content'].fillna('[MISSING_CONTENT]')
df = df.drop(['title', 'content'], axis=1)

texts = [(text.split(' / ', 1)[0], text.split(' / ', 1)[1]) for text in df['sentence']]
labels = df['class'].to_list()
try:
    labels = [0 if train == False else 1 for train in labels]
except Exception as e:
    print(e)
try:
    train_texts, val_texts, train_labels, val_labels = train_test_split(
        texts, labels, test_size=0.2, random_state=42
    )
except Exception as e:
    print(e)

In [10]:
df_test = pd.read_csv("test.csv")
df_test['sentence'] = df_test['title'].fillna('[MISSING_TITLE]') + ' / ' + df_test['content'].fillna('[MISSING_CONTENT]')
df_test = df_test.drop(['title', 'content'], axis=1)
cols = df_test.columns.to_list()
df_test = df_test[cols]

test_texts = [(text.split(' / ', 1)[0], text.split(' / ', 1)[1]) for text in df_test['sentence']]

In [11]:
import torch

print(torch.cuda.is_available())

True


In [12]:
from tqdm import tqdm

def batch_tokenize_texts(texts, tokenizer, batch_size=64):
    all_encodings = {}
    for i in tqdm(range(0, len(texts), batch_size), desc="Batch Tokenizing Texts"):
        batch_texts = texts[i:i+batch_size]
        try:
            batch_encodings = tokenizer(
                batch_texts,
                truncation=True,
                padding='max_length',
                max_length=512
            )
            for key in batch_encodings:
                if key not in all_encodings:
                    all_encodings[key] = []
                all_encodings[key].extend(batch_encodings[key])
        except Exception as e:
            print(f"An error occurred during batch tokenization at batch index {i}: {e}")
    return all_encodings

# Tokenize training texts in batches
train_encodings = batch_tokenize_texts(train_texts, tokenizer)

# Tokenize test texts in batches
test_encodings = batch_tokenize_texts(val_texts, tokenizer)


Batch Tokenizing Texts: 100%|██████████████████████████████████████████████████████████████████████| 883/883 [00:18<00:00, 48.08it/s]
Batch Tokenizing Texts: 100%|██████████████████████████████████████████████████████████████████████| 221/221 [00:05<00:00, 43.21it/s]


In [13]:
# !pip uninstall transformers accelerate torch -y
# !pip install --upgrade 'transformers[torch]'
# !pip show accelerate
# !pip show transformers

In [14]:
from sklearn.metrics import balanced_accuracy_score, accuracy_score, f1_score, precision_score, recall_score

def compute_metrics(p):
    # Predictions and true labels
    preds = p.predictions.argmax(-1)
    labels = p.label_ids
    
    # Calculate balanced accuracy
    balanced_acc = balanced_accuracy_score(labels, preds)
    
    # Calculate other metrics for more comprehensive evaluation
    accuracy = accuracy_score(labels, preds)
    f1 = f1_score(labels, preds, average='weighted')       # Weighted F1 for class imbalance
    precision = precision_score(labels, preds, average='weighted')
    recall = recall_score(labels, preds, average='weighted')
    
    return {
        'balanced_accuracy': balanced_acc,
        'accuracy': accuracy,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }


In [33]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=7,
    per_device_train_batch_size=64,
    per_device_eval_batch_size=128,
    logging_dir='./logs',
    logging_steps=100,
    evaluation_strategy="steps",
    eval_steps=500,
    report_to=["tensorboard"],
    fp16=True,
    weight_decay=0.01,
    load_best_model_at_end=True, 
    warmup_ratio=0.4,
    learning_rate=2e-5,
    metric_for_best_model="eval_loss",
)



In [16]:
import torch
from torch.utils.data import Dataset

class CustomDataset(Dataset):
    def __init__(self, encodings, labels=None):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        if self.labels is not None:
            item["labels"] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.encodings["input_ids"])


In [17]:
train_dataset = CustomDataset(train_encodings, train_labels)
test_dataset = CustomDataset(test_encodings, val_labels)
print(f"Length of training dataset: {len(train_dataset)}")
print(f"Length of training dataset: {len(test_dataset)}")


Length of training dataset: 56460
Length of training dataset: 14115


In [18]:
# !pip install tensorboardX

In [19]:
import torch
import numpy as np
from sklearn.utils.class_weight import compute_class_weight

# Assuming 'train_dataset' is your training dataset and it has a 'labels' attribute
labels = train_dataset.labels  # Replace with the correct attribute or method to get labels

# Compute class weights using sklearn
class_weights = compute_class_weight(
    class_weight='balanced',
    classes=np.unique(labels),
    y=labels
)

# Convert class weights to tensor and move to device
class_weights = torch.tensor(class_weights, dtype=torch.float)


In [25]:
model = RomanianBertForSequenceClassification(num_labels=2, class_weights=class_weights)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

RomanianBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(50002, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): DistilBertSdpaAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False

In [35]:
from transformers import EarlyStoppingCallback

early_stopping = EarlyStoppingCallback(early_stopping_patience=3)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    callbacks=[early_stopping],
)

trainer.train()

Step,Training Loss,Validation Loss


NaN or Inf found in input tensor.


KeyboardInterrupt: 

In [27]:
eval_results = trainer.evaluate()
print(eval_results)


{'eval_loss': 0.03775281086564064, 'eval_balanced_accuracy': 0.9924236332570624, 'eval_accuracy': 0.9910024796315976, 'eval_f1': 0.9910217847989077, 'eval_precision': 0.9911503697619318, 'eval_recall': 0.9910024796315976, 'eval_runtime': 53.6457, 'eval_samples_per_second': 263.115, 'eval_steps_per_second': 2.069, 'epoch': 5.0}


In [28]:
# After training
torch.save(model.state_dict(), 'model.pth')

In [29]:
from transformers import AutoModelForSequenceClassification

model.load_state_dict(torch.load('model.pth'))
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

  model.load_state_dict(torch.load('model.pth'))


RomanianBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(50002, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): DistilBertSdpaAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False

In [30]:
last_test_encodings = batch_tokenize_texts(test_texts, tokenizer)

Batch Tokenizing Texts: 100%|██████████████████████████████████████████████████████████████████████| 573/573 [00:08<00:00, 64.81it/s]


In [31]:
# Create a Dataset class for test data
class TestDataset(Dataset):
    def __init__(self, encodings):
        self.encodings = encodings

    def __getitem__(self, idx):
        return {key: val[idx] for key, val in self.encodings.items()}

    def __len__(self):
        return len(self.encodings['input_ids'])

In [32]:
from torch.utils.data import Dataset, DataLoader
from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

test_dataset = TestDataset(last_test_encodings)

# Create a DataLoader
test_loader = DataLoader(test_dataset, batch_size=64, collate_fn=data_collator)

# Make predictions
model.eval()
predictions = []

with torch.no_grad():
    for batch in test_loader:
        # Move batch to device
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        logits = outputs.logits
        preds = torch.argmax(logits, dim=-1)
        predictions.extend(preds.cpu().numpy())

# Add predictions to the DataFrame
df_test['class'] = predictions
df_test = df_test.drop(columns=['sentence'])

# Save to CSV
df_test.to_csv('test_predictions2.csv', index=False)
df_test

Unnamed: 0,id,class
0,0,0
1,1,1
2,2,0
3,3,1
4,4,0
...,...,...
36664,36664,0
36665,36665,0
36666,36666,0
36667,36667,0
