## Set **seed**

In [None]:
from sefixlines.utils import set_all_seeds

# Call the project's seeding helper with its default arguments, before any
# data splitting or model creation further down.
# NOTE(review): presumably this seeds the Python/NumPy/PyTorch RNGs for
# reproducibility — confirm which libraries and which default seed it uses.
set_all_seeds()

## Data

In [None]:
from sefixlines.datasets import TextClassificationDataset

### **Initial**

In [None]:
# Placeholders: fill with the raw input strings and their labels.
# Both sequences are consumed together (by train_test_split and by
# TextClassificationDataset), so labels[i] must belong to texts[i].
# NOTE(review): labels are presumably integer indices into `classes` — confirm.
texts, labels = [], []

In [None]:
# Placeholder: fill with the class names. Its length is used as `num_labels`
# when the model is built, and predicted ids are mapped back to names via
# `classes[class_id]`, so position i must correspond to class id i.
classes = []

### **Split**

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples for validation; the fixed random_state keeps the
# split identical between runs.
train_texts, valid_texts, train_labels, valid_labels = train_test_split(
    texts,
    labels,
    test_size=0.2,
    random_state=42,
    # stratify=labels,  # uncomment to keep class proportions equal in both splits
)

### Create **Datasets**

In [None]:
# Dataset over the full, unsplit corpus.
dataset = TextClassificationDataset(texts, labels)

# Datasets over the train and validation splits, built in that order.
train_set, valid_set = (
    TextClassificationDataset(train_texts, train_labels),
    TextClassificationDataset(valid_texts, valid_labels),
)

### ***Visualization***

In [None]:
# Display the full dataset through its `show` helper, passing the class list.
# NOTE(review): presumably `classes` is used to render labels as names — confirm.
dataset.show(classes=classes)

## **Models**

In [None]:
from torch import nn, optim
from sefixlines.models import Classifier

### *Score*

In [None]:
# Registry of trained models: maps each wrapper's best_score to the wrapper,
# so the best entry can be looked up later with max().
scores = {}

### **Model**: `papluca/xlm-roberta-base-language-detection`

In [None]:
# Checkpoint identifier passed to `from_pretrained` for both the tokenizer and
# the model; its last path segment is also handed to the Classifier wrapper.
model_id = 'papluca/xlm-roberta-base-language-detection'

In [None]:
from transformers import AutoTokenizer

# Both values are set on the class, not on an instance, so they apply to every
# TextClassificationDataset — including the datasets created earlier in the
# notebook.
# NOTE(review): that only works if tokenization happens lazily (on item access)
# rather than in the constructor — confirm against the dataset implementation.
# NOTE(review): max_length is presumably the per-sample token limit — confirm.
TextClassificationDataset.max_length = 128
TextClassificationDataset.tokenizer = AutoTokenizer.from_pretrained(model_id)

In [None]:
from sefixlines.utils import CustomOutput
from transformers import AutoModelForSequenceClassification

# Load the pretrained sequence-classification checkpoint with a head sized to
# this task. ignore_mismatched_sizes=True lets loading proceed when the
# checkpoint's classifier shape differs from num_labels.
# NOTE(review): CustomOutput presumably adapts the Hugging Face output object
# to what Classifier expects — confirm.
model = CustomOutput(
    AutoModelForSequenceClassification.from_pretrained(
        model_id,
        ignore_mismatched_sizes=True,
        num_labels=len(classes),
    )
)

# The optimizer is built over the parameters exposed by the wrapped model.
optimizer = optim.Adam(model.parameters(), lr=5e-5)

In [None]:
# Wrap the model for training; the second argument is the part of model_id
# after its last '/'.
# NOTE(review): presumably this becomes the wrapper's `name` — confirm.
model_wrapped = Classifier(
    model,
    model_id.rsplit('/', 1)[-1],
    optimizer=optimizer,
)
model_wrapped.fit(train_set, valid_set, num_epochs=3)

In [None]:
# Register the trained wrapper under its best score so the best model can be
# selected later.
# NOTE(review): the score itself is the key, so two models with an identical
# best_score would silently overwrite each other — consider keying by model
# name if more models are added.
scores[model_wrapped.best_score] = model_wrapped
# Visualize the wrapper's predictions on the validation set.
model_wrapped.visualize_predictions(valid_set, classes=classes)

## Result

In [None]:
# `max(scores)` takes the largest key, i.e. the highest recorded best_score.
# NOTE(review): this assumes a higher best_score is better (e.g. accuracy, not
# loss) — confirm against Classifier.
best_model_wrapped = scores[max(scores)]
# Last expression of the cell: displays the selected model's name.
best_model_wrapped.name

## Submission

In [None]:
# Placeholder: fill with the texts to predict on.
test_texts = []
# Built without labels, unlike the train/validation datasets.
test_set = TextClassificationDataset(test_texts)

In [None]:
# Run the selected model on the test set, then translate each predicted id
# into a class name by its position in `classes`.
prediction_class_id = best_model_wrapped.predict(test_set)
prediction_class_names = [
    classes[predicted_id]
    for predicted_id in prediction_class_id
]