In [None]:
import sefixlines

## Set **seed**

In [None]:
# Call the project's seeding helper with its default arguments before any data
# splitting or model work. Presumably it seeds every RNG used downstream
# (random / numpy / torch) — TODO confirm in sefixlines.data.
sefixlines.data.set_all_seeds()

## Data

### **Initial**

In [15]:
# Class names: len(classes) sizes the model's classification head, and predicted
# class ids are used as indices into this list. Empty placeholder — fill in.
classes = []

In [16]:
# Parallel lists: they are split together and passed together to
# TextClassificationDataset, so texts[i] pairs with labels[i].
# NOTE(review): labels are presumably integer indices into `classes` — confirm.
texts = []
labels = []

### **Split**

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples for validation; the fixed random_state keeps the
# split reproducible. Stratification is currently disabled (stratify=labels).
train_texts, valid_texts, train_labels, valid_labels = train_test_split(
    texts,
    labels,
    test_size=0.2,
    random_state=42,
)

### Create **Datasets**

In [None]:
# One dataset over the full corpus plus one per split, all built with the
# same project dataset class.
make_dataset = sefixlines.data.TextClassificationDataset

dataset = make_dataset(texts, labels)

train_set = make_dataset(train_texts, train_labels)
valid_set = make_dataset(valid_texts, valid_labels)

### Create **DataLoader**

In [None]:
from torch.utils.data import DataLoader

batch_size = 32
# ! Errors occur (presumably when worker processes are enabled), so batches
# are loaded in the main process.
num_workers = 0

# Settings shared by both loaders; only the training loader shuffles.
loader_options = dict(batch_size=batch_size, num_workers=num_workers)

train_loader = DataLoader(train_set, shuffle=True, **loader_options)
valid_loader = DataLoader(valid_set, shuffle=False, **loader_options)

### ***Visualization***

In [None]:
# Display the full (unsplit) dataset using the project's visualisation helper;
# `classes` is passed along, presumably to label the output — confirm in sefixlines.
sefixlines.data.show_dataset(dataset, classes=classes)

## **Models**

In [21]:
from torch import nn, optim
from transformers import AutoTokenizer, AutoModelForSequenceClassification

In [23]:
class CustomOutput(nn.Module):
    """Wrap a model so that ``forward`` returns a transformed view of its output.

    ``model`` and ``output_transform`` live on the wrapper (an ``nn.Module``
    passed as ``model`` is registered as a submodule by ``nn.Module``'s own
    attribute handling). Any other attribute that is not found on the wrapper
    is read from the wrapped model, and any other attribute write is forwarded
    to the wrapped model.

    Args:
        model: The module to wrap.
        output_transform: Callable applied to whatever ``model(...)`` returns.
            The default extracts the ``logits`` attribute of the output.
    """

    # Attributes stored on the wrapper itself rather than on the wrapped model.
    _OWN_ATTRS = ('model', 'output_transform')

    def __init__(self, model, output_transform=lambda out: out.logits):
        super().__init__()
        self.model = model
        self.output_transform = output_transform

    def forward(self, *args, **kwargs):
        """Call the wrapped model and return ``output_transform`` of its result."""
        return self.output_transform(self.model(*args, **kwargs))

    def __getattr__(self, name):
        """Resolve ``name`` on the wrapped model when the wrapper lacks it.

        Only invoked after normal attribute lookup on the wrapper has failed.
        """
        if name in self._OWN_ATTRS:
            return super().__getattr__(name)
        return getattr(self.model, name)

    def __setattr__(self, name, value):
        """Store wrapper-owned attributes locally; forward the rest to the model."""
        if name in self._OWN_ATTRS:
            super().__setattr__(name, value)
            return
        if name == 'training':
            # nn.Module.train()/eval() assign ``self.training``. Forwarding that
            # write only to the wrapped model left the wrapper's own flag stuck
            # at its initial value, so ``wrapper.training`` was wrong after
            # ``wrapper.eval()``. Keep the wrapper's flag in sync as well.
            super().__setattr__(name, value)
        setattr(self.model, name, value)

### *Score*

In [24]:
# Registry of trained models keyed by their best score; filled in after each
# training run and consulted when picking the best model.
scores = {}

### **Model**: `papluca/xlm-roberta-base-language-detection`

In [None]:
model_name = 'papluca/xlm-roberta-base-language-detection'

# Tokenizer and max_length are assigned on the dataset class itself (not on an
# instance), using the tokenizer that matches the checkpoint above.
dataset_cls = sefixlines.data.TextClassificationDataset
dataset_cls.tokenizer = AutoTokenizer.from_pretrained(model_name)
dataset_cls.max_length = 128

In [None]:
# Load the checkpoint with a head sized to our classes; mismatched weight
# shapes are ignored rather than raising.
backbone = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=len(classes),
    ignore_mismatched_sizes=True,
)
# Wrap so that calling the model yields the logits directly.
model = CustomOutput(backbone)

optimizer = optim.Adam(model.parameters(), lr=5e-5)

# Name the classifier after the checkpoint, without the hub namespace prefix.
short_name = model_name.rsplit('/', 1)[-1]
model_wrapped = sefixlines.models.Classifier(model, short_name, optimizer)

In [None]:
# Train for 3 epochs, passing the validation loader alongside the training loader.
model_wrapped.fit(train_loader, valid_loader, num_epochs=3)

In [None]:
# Register this run under its best score. The registry is keyed by score, so a
# later model with an identical best_score replaces the earlier entry.
best_score = model_wrapped.best_score
scores[best_score] = model_wrapped

model_wrapped.visualize_predictions(valid_set, classes=classes)

## Result

In [None]:
# The registry keys are scores, so the largest key identifies the best model.
top_score = max(scores)
best_model_wrapped = scores[top_score]

# Last expression of the cell: displays the winning model's name.
best_model_wrapped.name

## Submission

In [None]:
# Texts to predict on. Empty placeholder — fill in.
test_texts = []

# Built from texts only; no labels are passed, unlike the train/validation datasets.
test_set = sefixlines.data.TextClassificationDataset(test_texts)

In [None]:
# Predict with the best model, then translate each predicted id into its
# class name by indexing into `classes`.
predict_class_id = best_model_wrapped.predict(test_set)
predict_class_names = [
    classes[predicted_id]
    for predicted_id in predict_class_id
]