## Set **seed**

In [None]:
from sefixlines.utils import set_all_seeds

# Fix the random seeds up front for reproducibility. Called without arguments,
# so the helper's own default seed is used (presumably covers every RNG in
# use — confirm in sefixlines.utils).
set_all_seeds()

## Data

In [None]:
from sefixlines.datasets import TextRegressionDataset

### **Initial**

In [None]:
# Placeholders to be filled in before running the rest of the notebook:
# `texts` holds the input samples and `values` their regression targets
# (presumably one target per text, in the same order — confirm against
# TextRegressionDataset).
texts, values = [], []

### **Split**

In [None]:
from sklearn.model_selection import train_test_split

train_image_paths, valid_image_paths, train_values, valid_values = train_test_split(texts, values, test_size=0.2, random_state=42)

### Create **Datasets**

In [None]:
dataset = TextRegressionDataset(texts, values)

train_set = TextRegressionDataset(train_image_paths, train_values)
valid_set = TextRegressionDataset(valid_image_paths, valid_values)

### ***Visualization***

In [None]:
# Preview the complete (unsplit) dataset; what exactly is rendered is defined
# by TextRegressionDataset.show().
dataset.show()

## **Models**

In [None]:
from torch import nn, optim
from sefixlines.models import Regressor

### *Score*

In [None]:
# Registry of trained models: each entry maps a model's best score to its
# fitted wrapper, and is filled in after every training run.
scores = {}

### **Model**: `papluca/xlm-roberta-base-language-detection`

In [None]:
# Pretrained checkpoint identifier. The same id is used below to load the
# tokenizer and the model weights, and its last path component is passed to
# Regressor.
model_id = 'papluca/xlm-roberta-base-language-detection'

In [None]:
from transformers import AutoTokenizer

# Tokenizer settings are assigned on the class itself rather than on an
# individual dataset instance.
# NOTE(review): train_set / valid_set are constructed before this cell runs, so
# this presumably relies on the dataset tokenizing lazily and reading these
# class attributes at access time — confirm in sefixlines.datasets.
# NOTE(review): 128 is presumably the maximum number of tokens per sample — confirm.
TextRegressionDataset.max_length = 128
TextRegressionDataset.tokenizer = AutoTokenizer.from_pretrained(model_id)

In [None]:
from sefixlines.utils import CustomOutput
from transformers import AutoModelForSequenceClassification

# Load the pretrained checkpoint with a single-output head (num_labels=1).
# ignore_mismatched_sizes=True lets loading proceed when the checkpoint's own
# head has a different shape (presumably the case here, since the checkpoint
# is a language-detection classifier).
pretrained_backbone = AutoModelForSequenceClassification.from_pretrained(
    model_id,
    num_labels=1,
    ignore_mismatched_sizes=True,
)

# CustomOutput wraps the loaded model (presumably to adapt its output for the
# Regressor wrapper — confirm in sefixlines.utils).
model = CustomOutput(pretrained_backbone)

# Adam over all parameters of the wrapped model.
optimizer = optim.Adam(model.parameters(), lr=5e-5)

In [None]:
# Use the last path component of the checkpoint id as the second Regressor
# argument (presumably the wrapper's display name — confirm).
run_name = model_id.rsplit('/', 1)[-1]
model_wrapped = Regressor(model, run_name, optimizer=optimizer)

# Train on the training split and validate on the held-out split for 3 epochs.
model_wrapped.fit(train_set, valid_set, num_epochs=3)

In [None]:
# Register the fitted wrapper under its best score so the Result cell can pick
# the top entry.
# NOTE(review): entries are keyed by score, so another model with an identical
# best_score would overwrite this one.
scores[model_wrapped.best_score] = model_wrapped
# Show the model's predictions on the validation split.
model_wrapped.visualize_predictions(valid_set)

## Result

In [None]:
# Select the wrapper stored under the largest score key.
# NOTE(review): assumes a higher best_score means a better model — confirm for
# the metric Regressor reports.
top_score = max(scores)
best_model_wrapped = scores[top_score]
best_model_wrapped.name

## Submission

In [None]:
# Placeholder: fill with the texts to generate predictions for.
test_texts = []
# Built from texts only — no target values are passed, unlike the train and
# validation datasets.
test_set = TextRegressionDataset(test_texts)

In [None]:
# Run the selected best model on the test set and keep its predictions.
prediction_values = best_model_wrapped.predict(test_set)