In [1]:
import pandas as pd
import random
from collections import Counter, defaultdict, namedtuple
import torch
from torch import nn
import numpy as np
import gc
from tqdm import tqdm, trange
from sklearn.model_selection import train_test_split
import math

device = "cuda" if torch.cuda.is_available() else "cpu"
device

'cuda'

In [2]:
df_train = pd.read_csv('train.csv')
df_test = pd.read_csv('test.csv')

In [3]:
#Удаляем тексты, которые распределены больше, чем на одну категорию
df_train['nuniq_cats'] = df_train.groupby('text').category.transform(lambda x: x.nunique())
df_train = df_train[df_train.nuniq_cats == 1]
#Удаляем дубликаты
df_train = df_train.drop_duplicates()
#Переводим колонку text в удобный для нас формат
df_train.text = df_train.text.str.lower().str.split()
#Удаляем сообщества, у которых меньше 5 постов
df_train = df_train[df_train.groupby('oid')['text'].transform('count') >= 5]
df_train

Unnamed: 0,oid,category,text,nuniq_cats
0,365271984,winter_sport,"[волшебные, фото, виктория, поплавская, евгени...",1
1,503385563,extreme,"[возвращение, в, подземелье, треша, 33, эйфори...",1
2,146016084,football,"[лучшие, чешские, вратари, –, доминик, доминат...",1
3,933865449,boardgames,"[rtokenoid, warhammer40k, валрак, решил, нас, ...",1
4,713550145,hockey,"[шестеркин, затаскивает, рейнджерс, в, финал, ...",1
...,...,...,...,...
38735,910636962,autosport,"[8, битная, буря, снова, накрыла, пикселями, а...",1
38736,669736851,autosport,"[ира, сидоркова, объясняет, как, сказалась, на...",1
38737,558919241,tennis,"[24, я, ракетка, мира, хорват, марин, чилич, о...",1
38738,776944963,volleyball,"[стал, известен, календарь, мужской, сборной, ...",1


In [4]:
token_seq = df_train.text.values
label_seq = df_train.category.values

In [5]:
token2cnt = Counter([token for sentence in token_seq for token in sentence])
token2cnt.most_common(10)

[('в', 81958),
 ('и', 58581),
 ('на', 42206),
 ('с', 25921),
 ('не', 24943),
 ('33', 21100),
 ('что', 20830),
 ('по', 15038),
 ('я', 12967),
 ('за', 12060)]

In [6]:
#Удаляем токены, содержащие английские буквы(это в основном "мусор")
def match(text, alphabet=set('abcdefghijklmnopqrstuvwxyz')):
    return not alphabet.isdisjoint(text.lower())

keys = list(token2cnt.keys())

for key in keys:
    if match(key):
        del token2cnt[key]

In [120]:
print(f"Количество уникальных слов в тренировочном датасете: {len(token2cnt)}")
print(f"Количество слов встречающихся меньше 3-х раз в тренировочном датасете: {len([token for token, cnt in token2cnt.items() if cnt <= 2])}")

Количество уникальных слов в тренировочном датасете: 166842
Количество слов встречающихся меньше 3-х раз в тренировочном датасете: 107060


In [121]:
def get_token2idx(
    token2cnt,
    min_count,
):

    token2idx = {}

    token2idx['<PAD>'] = 0
    token2idx['<UNK>'] = 1    
    i = 2
    for token, cnt in token2cnt.items():
        if cnt >= min_count:
            token2idx[token] = i
            i += 1

    return token2idx

In [122]:
token2idx = get_token2idx(token2cnt, min_count=3)

In [123]:
def get_label2idx(label_set):

    label2idx = {}

    i = 0
    for label in label_set:
        label2idx[label] = i
        i += 1
    return label2idx

label_set = df_train.category.unique()

In [124]:
label2idx = get_label2idx(label_set)

In [125]:
for token, idx in list(token2idx.items())[:10]:
    print("{:<17}".format(token), idx)

<PAD>             0
<UNK>             1
волшебные         2
фото              3
виктория          4
евгениямедведева  5
возвращение       6
в                 7
подземелье        8
33                9


In [126]:
for label, idx in label2idx.items():
    print("{:<15}".format(label), idx)

winter_sport    0
extreme         1
football        2
boardgames      3
hockey          4
esport          5
athletics       6
motosport       7
basketball      8
tennis          9
autosport       10
martial_arts    11
volleyball      12


In [127]:
class MyDataset(torch.utils.data.Dataset):

    def __init__(
        self,
        token_seq,
        label_seq,
        token2idx,
        label2idx,
    ):
        self.token2idx = token2idx
        self.label2idx = label2idx

        self.token_seq = [self.process_tokens(tokens, token2idx) for tokens in token_seq]
        self.label_seq = self.process_labels(label_seq, label2idx)

    def __len__(self):
        return len(self.token_seq)

    def __getitem__(
        self,
        idx,
    ):
        label = torch.LongTensor([self.label_seq[idx]])
        return torch.LongTensor(self.token_seq[idx]), label
    
    @staticmethod
    def process_tokens(
        tokens,
        token2idx,
        unk = "<UNK>",
    ):
        idxs = []
        for tkn in tokens:
            if tkn in token2idx.keys():
                idxs.append(token2idx[tkn])
            else:
                idxs.append(token2idx[unk])
        return idxs

    @staticmethod
    def process_labels(
        labels,
        label2idx,
    ):
        idxs = []
        for lbl in labels:
            idxs.append(label2idx[lbl])
        return idxs

In [172]:
oids = df_train.oid.unique()
train_oids, val_oids = train_test_split(oids, test_size=0.15, shuffle=True, random_state=42)
df_val = df_train[df_train.oid.isin(val_oids)]
df_train = df_train[df_train.oid.isin(train_oids)]

train_token_seq = df_train.text.values
train_label_seq = df_train.category.values
val_token_seq = df_val.text.values
val_label_seq = df_val.category.values

train_dataset = MyDataset(
    token_seq=train_token_seq,
    label_seq=train_label_seq,
    token2idx=token2idx,
    label2idx=label2idx
)
val_dataset = MyDataset(
    token_seq=val_token_seq,
    label_seq=val_label_seq,
    token2idx=token2idx,
    label2idx=label2idx
)
print(len(train_dataset), len(val_dataset))

32118 5685


In [131]:
train_dataset[0]

(tensor([2, 3, 4, 1, 5, 1]), tensor([0]))

In [132]:
val_dataset[0]

(tensor([ 33,  16, 582, 583,   1, 584, 585,  29,  29,  29,  75,  16,  12, 586,
         397, 587,   1, 588, 589, 590, 591, 592, 593, 301, 594, 180, 595, 510,
         596, 301, 597, 598, 599,  12, 600, 591, 467, 601, 602, 603, 168,  27,
         604, 605, 606,   1, 607, 301, 608, 591, 467, 609, 610,  27, 611, 612,
         613, 614,   1, 591, 467, 615, 616, 105, 168, 617,  91,  27, 618, 619,
         591, 591, 620, 621, 294, 196, 622, 623, 624, 231, 625, 626, 627,  33,
         336, 628, 629, 630, 631, 259, 632, 633, 298, 634,   1,  30, 164, 635,
         636,   1, 336,   1,   7, 637,   7, 166, 638, 639,  12, 640, 641, 178,
         642, 196, 231, 643,   1,   1,  23, 644, 334,  27,   1, 645,  91, 507,
         646, 647, 648, 591,   1, 649, 650, 651, 486,  27, 652,   1,  30, 653,
         654, 655, 161, 656,  27, 657,  53,   1, 658, 659, 660, 661, 662,   1,
         591, 663, 664,  73, 665, 454, 666, 667, 668, 669, 591,   1, 670,   1,
           1,   1,   1, 671, 672, 673, 674, 675]),
 

In [133]:
class My_Collator:
    def __init__(
        self,
        token_padding_value,
    ):
        self.token_padding_value = token_padding_value

    def __call__(
        self,
        batch,
    ):

        tokens, labels = zip(*batch)

        tokens = torch.nn.utils.rnn.pad_sequence(tokens, padding_value = self.token_padding_value, 
                                                 batch_first=True)
        labels = torch.nn.utils.rnn.pad_sequence(labels, padding_value = self.token_padding_value, 
                                                 batch_first=True)

        return tokens, labels

In [134]:
collator = My_Collator(
    token_padding_value=token2idx["<PAD>"]
)

In [135]:
train_dataloader = torch.utils.data.DataLoader(
    train_dataset,
    batch_size=2,
    shuffle=True,
    collate_fn=collator,
)
val_dataloader = torch.utils.data.DataLoader(
    val_dataset,
    batch_size=10,  
    shuffle=False, 
    collate_fn=collator,
)

### Transformer

In [74]:
class PositionalEncoding(nn.Module):

    def __init__(self, d_model: int, dropout: float = 0.1, max_len: int = 5000):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)

        position = torch.arange(max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model))
        pe = torch.zeros(max_len, 1, d_model)
        pe[:, 0, 0::2] = torch.sin(position * div_term)
        pe[:, 0, 1::2] = torch.cos(position * div_term)
        self.register_buffer('pe', pe)

    def forward(self, x):
        """
        Args:
            x: Tensor, shape [seq_len, batch_size, embedding_dim]
        """
        x = x + self.pe[:x.size(0)]
        return self.dropout(x)


class TransformerModel(nn.Module):

    def __init__(self, ntoken: int, d_model: int, nhead: int, d_hid: int,
                 nlayers: int, n_classes: int, dropout: float = 0.4):
        super().__init__()
        self.model_type = 'Transformer'
        self.pos_encoder = PositionalEncoding(d_model, dropout)
        encoder_layers = nn.TransformerEncoderLayer(d_model, nhead, d_hid, dropout)
        self.transformer_encoder = nn.TransformerEncoder(encoder_layers, nlayers)
        self.encoder = nn.Embedding(ntoken, d_model)
        self.d_model = d_model
        self.decoder = nn.Linear(d_model, n_classes)

        self.init_weights()

    def init_weights(self) -> None:
        initrange = 0.1
        self.encoder.weight.data.uniform_(-initrange, initrange)
        self.decoder.bias.data.zero_()
        self.decoder.weight.data.uniform_(-initrange, initrange)

    def forward(self, src, src_mask):
        """
        Args:
            src: Tensor, shape [seq_len, batch_size]
            src_mask: Tensor, shape [seq_len, seq_len]

        Returns:
            output Tensor of shape [seq_len, batch_size, ntoken]
        """
        src = self.encoder(src) * math.sqrt(self.d_model)
        src = self.pos_encoder(src)
        output = self.transformer_encoder(src, src_mask)
        output = output.mean(dim=0)
        output = self.decoder(output)
        return output


def generate_square_subsequent_mask(sz: int):
    """Generates an upper-triangular matrix of -inf, with zeros on diag."""
    return torch.triu(torch.ones(sz, sz) * float('-inf'), diagonal=1)

In [136]:
del model
gc.collect()
torch.cuda.empty_cache()

#model = TransformerModel(
#    ntoken=len(token2idx),
#    d_model=128,
#    nhead=8,
#    d_hid=256,
#    nlayers=1,
#    n_classes=len(label2idx)
#).to(device)

model = TransformerModel(
    ntoken=len(token2idx),
    d_model=96,
    nhead=8,
    d_hid=96,
    nlayers=1,
    n_classes=len(label2idx)
).to(device)

In [137]:
#optimizer = torch.optim.Adam(model.parameters(), lr=1e-3, weight_decay = 0.0)
optimizer = torch.optim.Adam(model.parameters(), lr=3e-3, weight_decay = 0.0)
criterion = torch.nn.CrossEntropyLoss()

In [138]:
def penalty_accuracy(y_true, y_pred):
    correct = (y_true == y_pred).sum()
    return 2 * correct / len(y_true) - 1

def train_epoch(model, device, dataloader, loss_fn, optimizer):
    
    train_loss = 0.0
    Output = None
    Labels = None

    for tokens, labels in tqdm(dataloader):

        tokens, labels = tokens.to(device), labels.to(device)
        
        tokens = tokens.transpose(0, 1)
        bptt = tokens.shape[0]
        src_mask = generate_square_subsequent_mask(bptt).to(device)
        
        optimizer.zero_grad()
        
        output = model(tokens, src_mask)
        
        loss = loss_fn(output, labels[:, 0])
        loss.backward()
        optimizer.step()
        
        train_loss += loss.item()
        output = output.cpu().detach().argmax(dim=1).squeeze().numpy()
        labels = labels.squeeze().cpu().detach().numpy()
        
        if Output is None:
            Output = output
            Labels = labels
        else:
            Output = np.concatenate((Output, output))
            Labels = np.concatenate((Labels, labels))
        
    pen_acc = penalty_accuracy(np.array(Labels), np.array(Output)) 
    return train_loss / len(dataloader), pen_acc
  
def valid_epoch(model, device, dataloader, loss_fn, return_output=False):
    
    val_loss = 0.0
    Output = []
    Labels = []
    
    with torch.no_grad():
        for images, labels in tqdm(dataloader):

            images = images.to(device)
            
            images = images.transpose(0, 1)
            bptt = images.shape[0]
            src_mask = generate_square_subsequent_mask(bptt).to(device)
            
            output = model(images, src_mask).cpu()
            
            loss = loss_fn(output, labels[:, 0])
            val_loss += loss.item()
            output = output.argmax(dim=1).squeeze().numpy()
            labels = labels.squeeze().numpy()

            if Output is None:
                Output = output
                Labels = labels
            else:
                Output = np.concatenate((Output, output))
                Labels = np.concatenate((Labels, labels))
     
    pen_acc = penalty_accuracy(np.array(Labels), np.array(Output)) 
    if return_output:
        return val_loss / len(dataloader), pen_acc, Output, Labels
    return val_loss / len(dataloader), pen_acc

def fit(model, trainloader, validloader, optimizer, criterion, epochs):
    best_pa = 0

    for epoch in range(epochs):
        
        model.train()
        train_loss, train_pa= train_epoch(model, device, trainloader, criterion, optimizer)
        
        model.eval()
        val_loss, val_pa = valid_epoch(model, device, validloader, criterion)
        
        if val_pa > best_pa:
            best_pa = val_pa
            torch.save(model, 'best_model2.pt')

        print("Epoch:{}/{} \n".format(epoch + 1, epochs),
              "Training: Loss:{:.5f}, pen_acc:{:.5f} \n".format(train_loss, 
                                                                train_pa),
              "Validation: Loss:{:.5f}, pen_acc:{:.5f}".format(val_loss,
                                                               val_pa))

In [139]:
#fit(model, train_dataloader, val_dataloader, optimizer, criterion, 15)

fit(model, train_dataloader, val_dataloader, optimizer, criterion, 18)

100%|████████████████████████████████████| 16059/16059 [01:17<00:00, 206.33it/s]
100%|████████████████████████████████████████| 569/569 [00:02<00:00, 264.17it/s]
  0%|                                       | 24/16059 [00:00<01:07, 235.95it/s]

Epoch:1/18 
 Training: Loss:1.50831, pen_acc:0.04135 
 Validation: Loss:1.22692, pen_acc:0.31680


100%|█████████████████████████████████████| 16059/16059 [05:38<00:00, 47.49it/s]
100%|████████████████████████████████████████| 569/569 [00:03<00:00, 165.62it/s]
  0%|                                         | 4/16059 [00:00<07:23, 36.16it/s]

Epoch:2/18 
 Training: Loss:0.94196, pen_acc:0.43415 
 Validation: Loss:1.18759, pen_acc:0.42058


100%|█████████████████████████████████████| 16059/16059 [06:17<00:00, 42.52it/s]
100%|████████████████████████████████████████| 569/569 [00:03<00:00, 165.38it/s]
  0%|                                         | 4/16059 [00:00<07:26, 35.97it/s]

Epoch:3/18 
 Training: Loss:0.71673, pen_acc:0.58086 
 Validation: Loss:1.04922, pen_acc:0.48813


100%|█████████████████████████████████████| 16059/16059 [06:17<00:00, 42.54it/s]
100%|████████████████████████████████████████| 569/569 [00:03<00:00, 165.77it/s]
  0%|                                         | 5/16059 [00:00<06:12, 43.12it/s]

Epoch:4/18 
 Training: Loss:0.57891, pen_acc:0.65826 
 Validation: Loss:1.07229, pen_acc:0.53492


100%|█████████████████████████████████████| 16059/16059 [06:17<00:00, 42.52it/s]
100%|████████████████████████████████████████| 569/569 [00:03<00:00, 165.42it/s]
  0%|                                         | 4/16059 [00:00<07:11, 37.21it/s]

Epoch:5/18 
 Training: Loss:0.48792, pen_acc:0.71929 
 Validation: Loss:1.23064, pen_acc:0.55286


100%|█████████████████████████████████████| 16059/16059 [06:17<00:00, 42.49it/s]
100%|████████████████████████████████████████| 569/569 [00:03<00:00, 165.41it/s]
  0%|                                         | 4/16059 [00:00<06:46, 39.53it/s]

Epoch:6/18 
 Training: Loss:0.42111, pen_acc:0.75659 
 Validation: Loss:1.08649, pen_acc:0.54477


100%|█████████████████████████████████████| 16059/16059 [06:17<00:00, 42.55it/s]
100%|████████████████████████████████████████| 569/569 [00:03<00:00, 166.07it/s]
  0%|                                         | 4/16059 [00:00<07:55, 33.78it/s]

Epoch:7/18 
 Training: Loss:0.35778, pen_acc:0.79339 
 Validation: Loss:1.06689, pen_acc:0.56588


100%|█████████████████████████████████████| 16059/16059 [06:17<00:00, 42.53it/s]
100%|████████████████████████████████████████| 569/569 [00:03<00:00, 165.56it/s]
  0%|                                         | 4/16059 [00:00<07:21, 36.37it/s]

Epoch:8/18 
 Training: Loss:0.30748, pen_acc:0.82315 
 Validation: Loss:1.23249, pen_acc:0.57678


100%|█████████████████████████████████████| 16059/16059 [06:17<00:00, 42.50it/s]
100%|████████████████████████████████████████| 569/569 [00:03<00:00, 165.69it/s]
  0%|                                         | 4/16059 [00:00<07:15, 36.84it/s]

Epoch:9/18 
 Training: Loss:0.27320, pen_acc:0.84495 
 Validation: Loss:1.14182, pen_acc:0.58347


100%|█████████████████████████████████████| 16059/16059 [06:17<00:00, 42.55it/s]
100%|████████████████████████████████████████| 569/569 [00:03<00:00, 165.60it/s]
  0%|                                         | 5/16059 [00:00<06:18, 42.44it/s]

Epoch:10/18 
 Training: Loss:0.24729, pen_acc:0.85777 
 Validation: Loss:1.75055, pen_acc:0.57643


100%|█████████████████████████████████████| 16059/16059 [06:17<00:00, 42.54it/s]
100%|████████████████████████████████████████| 569/569 [00:03<00:00, 165.60it/s]
  0%|                                         | 5/16059 [00:00<06:20, 42.17it/s]

Epoch:11/18 
 Training: Loss:0.22543, pen_acc:0.86967 
 Validation: Loss:1.51584, pen_acc:0.58065


100%|█████████████████████████████████████| 16059/16059 [06:17<00:00, 42.52it/s]
100%|████████████████████████████████████████| 569/569 [00:03<00:00, 165.58it/s]
  0%|                                         | 4/16059 [00:00<07:30, 35.63it/s]

Epoch:12/18 
 Training: Loss:0.21093, pen_acc:0.88044 
 Validation: Loss:1.22803, pen_acc:0.59015


100%|█████████████████████████████████████| 16059/16059 [06:17<00:00, 42.54it/s]
100%|████████████████████████████████████████| 569/569 [00:03<00:00, 165.56it/s]
  0%|                                         | 5/16059 [00:00<06:25, 41.65it/s]

Epoch:13/18 
 Training: Loss:0.18663, pen_acc:0.89246 
 Validation: Loss:1.54396, pen_acc:0.57889


100%|█████████████████████████████████████| 16059/16059 [06:17<00:00, 42.53it/s]
100%|████████████████████████████████████████| 569/569 [00:03<00:00, 165.71it/s]
  0%|                                         | 4/16059 [00:00<06:49, 39.24it/s]

Epoch:14/18 
 Training: Loss:0.17341, pen_acc:0.90049 
 Validation: Loss:1.94482, pen_acc:0.55462


100%|█████████████████████████████████████| 16059/16059 [06:17<00:00, 42.50it/s]
100%|████████████████████████████████████████| 569/569 [00:03<00:00, 165.08it/s]
  0%|                                         | 5/16059 [00:00<06:14, 42.92it/s]

Epoch:15/18 
 Training: Loss:0.15433, pen_acc:0.91656 
 Validation: Loss:1.92854, pen_acc:0.57537


100%|█████████████████████████████████████| 16059/16059 [06:17<00:00, 42.50it/s]
100%|████████████████████████████████████████| 569/569 [00:03<00:00, 165.22it/s]
  0%|                                         | 5/16059 [00:00<06:14, 42.87it/s]

Epoch:16/18 
 Training: Loss:0.14899, pen_acc:0.91544 
 Validation: Loss:2.51153, pen_acc:0.57608


100%|█████████████████████████████████████| 16059/16059 [06:17<00:00, 42.50it/s]
100%|████████████████████████████████████████| 569/569 [00:03<00:00, 165.56it/s]
  0%|                                         | 5/16059 [00:00<06:04, 44.01it/s]

Epoch:17/18 
 Training: Loss:0.13017, pen_acc:0.92310 
 Validation: Loss:1.92419, pen_acc:0.57573


100%|█████████████████████████████████████| 16059/16059 [06:17<00:00, 42.54it/s]
100%|████████████████████████████████████████| 569/569 [00:03<00:00, 165.35it/s]

Epoch:18/18 
 Training: Loss:0.13043, pen_acc:0.92359 
 Validation: Loss:1.38211, pen_acc:0.58276





#### Так как я определил первоначальную задачу, как классификацию постов по темам, а не сообществ, то и метрика каечства во время обучения считаоась по постам

### Подсчет метрики качества на валидационной выборке по сообществам

In [140]:
#Загружаем лучшую модель по валидации
model = torch.load('best_model2.pt')

In [168]:
#Высчитываем предсказанные метки
model.eval()
v_loss, v_metric, v_pred, v_lbl = valid_epoch(model, device, val_dataloader, criterion, return_output=True)

100%|████████████████████████████████████████| 569/569 [00:00<00:00, 608.42it/s]


In [174]:
df_val['pred_idx'] = v_pred
df_val['label_idx'] = v_lbl
df_val

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_val['pred_idx'] = v_pred
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_val['label_idx'] = v_lbl


Unnamed: 0,oid,category,text,nuniq_cats,pred_idx,label_idx
12,800495,motosport,"[все, мы, знаем, выражение, готовь, сани, лето...",1,7.0,7.0
14,653950686,esport,"[гвардиола, об, 1, 0, с, атлетико, очень, равн...",1,2.0,5.0
17,436974691,hockey,"[комментарии, владимира, алистрова, и, седрика...",1,2.0,4.0
26,212272560,athletics,"[было, тяжело, но, мы, бежали, на, свет, я, на...",1,0.0,6.0
29,884419658,tennis,"[осака, победила, на, старте, турнира, в, сан,...",1,9.0,9.0
...,...,...,...,...,...,...
38697,653950686,esport,"[️александр, кокорин, пропустит, матч, со, спе...",1,2.0,5.0
38700,551674788,motosport,"[поздравляем, с, днем, рождения, железного, че...",1,7.0,7.0
38706,678833175,basketball,"[берем, зайона, в, команду, если, он, похудеет...",1,8.0,8.0
38728,642200097,martial_arts,"[️рамазан, гасанов, 8, 0, побеждает, насимжона...",1,11.0,11.0


In [186]:
#Группируем результаты по сообществам и берем моду по предсказанным меткам
df_val_grouped = df_val.groupby('oid')[['label_idx', 'pred_idx']].agg(lambda x: x.mode()[0])

In [191]:
print("Итоговая `точность со штрафом` на валидации: penalty_accuracy = {:.6f}".format(
    penalty_accuracy(np.array(df_val_grouped.label_idx.values), 
                     np.array(df_val_grouped.pred_idx.values))))

Итоговая `точность со штрафом` на валидации: penalty_accuracy = 0.965517


### Возможности оценить качество на тестовой выборки нет, так как для нее нет меток в открытом доступе