In [1]:
import os
os.environ['WANDB_DISABLED'] = 'true'

import pandas as pd
import numpy as np
from torch import nn
import torch
from transformers import BertForSequenceClassification, BertTokenizer, TrainingArguments, Trainer, AutoModel, AutoTokenizer, RobertaForSequenceClassification
from datasets import Dataset, DatasetDict
from sklearn.utils import compute_class_weight
from sklearn.model_selection import train_test_split
import random
from scipy import stats

In [2]:
train = pd.read_csv('/kaggle/input/bert-classification-ioai/train.tsv')
test = pd.read_csv('/kaggle/input/bert-classification-ioai/test.tsv')

In [3]:
pos_weight = compute_class_weight(y=train['class'].values, classes=np.array([0, 1]), class_weight='balanced')

In [4]:
class WeightedLossTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        labels = inputs.get("labels")
        outputs = model(**inputs)
        logits = outputs.get('logits')
        if logits.dtype != torch.float32:
            logits = logits.float()
        loss_fct = torch.nn.CrossEntropyLoss(weight=torch.tensor(pos_weight, dtype=torch.float32).cuda())
        loss = loss_fct(logits.view(-1, self.model.config.num_labels), labels.view(-1).long())
        return (loss, outputs) if return_outputs else loss

In [5]:
from sklearn.metrics import precision_recall_fscore_support, accuracy_score, f1_score
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    f1 = f1_score(labels, preds)
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1
    }

In [6]:
def predict(model_name, train = train.copy(), test = test.copy()):
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = BertForSequenceClassification.from_pretrained(model_name, device_map="cuda", num_labels=2)
    train['label'] = train['class'].copy()
    tr, val = train_test_split(train[['tweet', 'label']], stratify=train['label'], test_size=1500, random_state=40)
    train_dataset = Dataset.from_pandas(tr[['tweet', 'label']])
    val_dataset = Dataset.from_pandas(val[['tweet', 'label']])
    test_dataset = Dataset.from_pandas(test[['tweet']])
    dataset = DatasetDict({'train': train_dataset,
                           'validation': val_dataset,
                       'test': test_dataset})
    def tokenize_function(examples):
        return tokenizer(examples["tweet"], padding="max_length", truncation=True, max_length=64)
    
    tokenized_datasets = dataset.map(tokenize_function, batched=True)
    training_args = TrainingArguments(
        output_dir='./results',
        num_train_epochs=random.choice(range(2, 5)),
        per_device_train_batch_size=64,
        per_device_eval_batch_size=64,
        warmup_steps=10,
        weight_decay=0.01,
        logging_dir='./logs',
        logging_steps=100,
        do_eval=True,
        eval_steps=100,
        evaluation_strategy='steps',
        save_strategy='steps',
        load_best_model_at_end=True,
        lr_scheduler_type='linear',
        metric_for_best_model='eval_f1',
        seed=random.randint(0, 10000)
    )
    
    trainer = WeightedLossTrainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_datasets['train'],
        eval_dataset=tokenized_datasets['validation'],
        compute_metrics=compute_metrics
    )
    
    trainer.train()
    preds = trainer.predict(tokenized_datasets['test'])
    return preds.predictions.argmax(-1)

In [7]:
preds = stats.mode(np.array([predict('ai-forever/ruBert-base') for _ in range(7)])).mode

config.json:   0%|          | 0.00/590 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/1.78M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/716M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at ai-forever/ruBert-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/8015 [00:00<?, ? examples/s]

Map:   0%|          | 0/1500 [00:00<?, ? examples/s]

Map:   0%|          | 0/1504 [00:00<?, ? examples/s]

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Step,Training Loss,Validation Loss,Accuracy,F1
100,0.5205,0.406348,0.811333,0.437376
200,0.3154,0.412421,0.896,0.561798


Could not locate the best model at ./results/checkpoint-200/pytorch_model.bin, if you are running a distributed training on multiple nodes, you should activate `--save_on_each_node`.


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at ai-forever/ruBert-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/8015 [00:00<?, ? examples/s]

Map:   0%|          | 0/1500 [00:00<?, ? examples/s]

Map:   0%|          | 0/1504 [00:00<?, ? examples/s]

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Step,Training Loss,Validation Loss,Accuracy,F1
100,0.513,0.431472,0.704667,0.355167
200,0.303,0.430023,0.877333,0.535354
300,0.2207,0.605152,0.912667,0.558923


Could not locate the best model at ./results/checkpoint-300/pytorch_model.bin, if you are running a distributed training on multiple nodes, you should activate `--save_on_each_node`.


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at ai-forever/ruBert-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/8015 [00:00<?, ? examples/s]

Map:   0%|          | 0/1500 [00:00<?, ? examples/s]

Map:   0%|          | 0/1504 [00:00<?, ? examples/s]

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Step,Training Loss,Validation Loss,Accuracy,F1
100,0.5168,0.367014,0.829333,0.459916
200,0.3481,0.373756,0.86,0.511628


Could not locate the best model at ./results/checkpoint-200/pytorch_model.bin, if you are running a distributed training on multiple nodes, you should activate `--save_on_each_node`.


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at ai-forever/ruBert-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/8015 [00:00<?, ? examples/s]

Map:   0%|          | 0/1500 [00:00<?, ? examples/s]

Map:   0%|          | 0/1504 [00:00<?, ? examples/s]

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Step,Training Loss,Validation Loss,Accuracy,F1
100,0.4967,0.394548,0.808,0.439689
200,0.3535,0.46175,0.908667,0.573209
300,0.2004,0.632373,0.922,0.592334


Could not locate the best model at ./results/checkpoint-300/pytorch_model.bin, if you are running a distributed training on multiple nodes, you should activate `--save_on_each_node`.


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at ai-forever/ruBert-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/8015 [00:00<?, ? examples/s]

Map:   0%|          | 0/1500 [00:00<?, ? examples/s]

Map:   0%|          | 0/1504 [00:00<?, ? examples/s]

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Step,Training Loss,Validation Loss,Accuracy,F1
100,0.5189,0.380537,0.85,0.482759
200,0.3165,0.401119,0.881333,0.548223
300,0.206,0.537432,0.897333,0.5625
400,0.1101,0.747741,0.918667,0.585034
500,0.0502,0.886501,0.920667,0.585366


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at ai-forever/ruBert-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/8015 [00:00<?, ? examples/s]

Map:   0%|          | 0/1500 [00:00<?, ? examples/s]

Map:   0%|          | 0/1504 [00:00<?, ? examples/s]

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Step,Training Loss,Validation Loss,Accuracy,F1
100,0.5019,0.376869,0.837333,0.474138
200,0.3275,0.444033,0.874667,0.525253
300,0.223,0.708284,0.916667,0.587459
400,0.1451,0.744481,0.921333,0.587413
500,0.0569,0.88857,0.928,0.593985


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at ai-forever/ruBert-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/8015 [00:00<?, ? examples/s]

Map:   0%|          | 0/1500 [00:00<?, ? examples/s]

Map:   0%|          | 0/1504 [00:00<?, ? examples/s]

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Step,Training Loss,Validation Loss,Accuracy,F1
100,0.5177,0.359842,0.874667,0.536946
200,0.3133,0.461317,0.892,0.544944


Could not locate the best model at ./results/checkpoint-200/pytorch_model.bin, if you are running a distributed training on multiple nodes, you should activate `--save_on_each_node`.


In [8]:
test['class'] = preds
test[['id', 'class']].to_csv('submission.csv', index=False)