In [None]:
import sys
# Root of the project on the mounted Google Drive. The trailing slash matters:
# other cells build paths with '{}...'.format(FOLDER_PATH).
FOLDER_PATH = '/content/drive/MyDrive/Colab Notebooks/project/'

# Make project-local modules importable. Guarded so that re-running this cell
# does not keep appending duplicate entries to sys.path.
if FOLDER_PATH not in sys.path:
    sys.path.append(FOLDER_PATH)

In [None]:
!pip install -r drive/MyDrive/Colab\ Notebooks/project/requirements

In [None]:
import os
import re
import imp
import time
import datetime
import pickle
import random

import numpy as np
import pandas as pd
import scipy as sc
import matplotlib.pyplot as plt
import seaborn as sns

from tqdm import tqdm_notebook
from pandarallel import pandarallel
pandarallel.initialize(progress_bar=False)

from sklearn.metrics import roc_auc_score

import tensorflow as tf
import torch

from transformers import BertTokenizer, BertConfig
from transformers import BertForSequenceClassification, AdamW
from transformers import get_linear_schedule_with_warmup
from torch.utils.data import TensorDataset, random_split
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler

# pd.set_option('display.max_colwidth', None)

INFO: Pandarallel will run on 2 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [None]:
# Ask TensorFlow for the name of the GPU it can see and stop the notebook
# early when it is not the expected first GPU device.
device_name = tf.test.gpu_device_name()

if device_name != '/device:GPU:0':
    raise SystemError('GPU device not found')
print('Found GPU at: {}'.format(device_name))

Found GPU at: /device:GPU:0


In [None]:
# Select the torch device used by the rest of the notebook: CUDA when
# available, CPU otherwise.
if not torch.cuda.is_available():
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")
else:
    device = torch.device("cuda")
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))

There are 1 GPU(s) available.
We will use the GPU: Tesla T4


In [None]:
def auc_group(df):
    '''
    ROC AUC of the `predict_bert` scores against the `is_bad` labels of `df`.

    Returns 0.5 when `roc_auc_score` raises ValueError (presumably when the
    group contains a single class - confirm against the installed
    scikit-learn version).
    '''
    targets, scores = df['is_bad'], df['predict_bert']
    try:
        result = roc_auc_score(targets, scores)
    except ValueError:
        result = 0.5
    return result

def flat_roc_auc(preds, labels):
    '''
    ROC AUC of per-class model outputs against the true labels.

    preds:  2-D array indexed as preds[:, k] (the notebook passes the raw
            two-column logits of the classifier).
    labels: array of true class ids; it is flattened before scoring.

    For two-column input the score is the margin preds[:, 1] - preds[:, 0].
    ROC AUC depends only on the ranking of the scores, and the softmax
    probability of class 1 is a monotone function of this margin, so the
    result is the AUC of the predicted class-1 probability. Column 1 alone
    ignores the class-0 logit and can rank examples differently. If `preds`
    already holds probabilities that sum to 1 the margin is 2*p1 - 1, which
    ranks identically to p1.

    Returns 0.5 when `roc_auc_score` raises ValueError.
    '''
    if preds.shape[1] == 2:
        pred_flat = (preds[:, 1] - preds[:, 0]).flatten()
    else:
        # Not a plain two-class output: keep the historical behaviour.
        pred_flat = preds[:, 1].flatten()
    labels_flat = labels.flatten()
    try:
        return roc_auc_score(labels_flat, pred_flat)
    except ValueError:
        return 0.5

def format_time(elapsed):
    '''
    Render a duration given in seconds as an h:mm:ss string.

    The value is rounded to the nearest whole second first; the text is the
    standard string form of a datetime.timedelta.
    '''
    whole_seconds = int(round(elapsed))
    duration = datetime.timedelta(seconds=whole_seconds)
    return str(duration)

In [None]:
# Pre-processed splits are pipe-separated CSV files in <project>/_data/.
# The paths are built from FOLDER_PATH (like every other path in the notebook)
# so that loading does not depend on the current working directory.
DATA_DIR = os.path.join(FOLDER_PATH, '_data')

df_train = pd.read_csv(os.path.join(DATA_DIR, 'train_prep.csv'), sep='|')
df_val = pd.read_csv(os.path.join(DATA_DIR, 'val_prep.csv'), sep='|')
df_test = pd.read_csv(os.path.join(DATA_DIR, 'test_prep.csv'), sep='|')

In [None]:
%%time 
# Rewrite every auxiliary feature of the training frame as a short Russian
# phrase, so it can later be concatenated with the ad text.
# NOTE: the columns are overwritten in place - run this cell once per load.

# Price flag: missing values get their own phrase, other values are embedded.
df_train['is_empty_price'] = df_train['is_empty_price'].parallel_apply(
    lambda price: 'цена {}'.format(price) if not pd.isna(price) else 'цена пустая')

# Phone flags: (phrase when the value equals 1, phrase for anything else).
for flag_col, (yes_phrase, no_phrase) in {
        'is_full_phone': ('есть полный', 'нет полного'),
        'is_short_phone': ('есть короткий', 'нет короткого'),
}.items():
    df_train[flag_col] = df_train[flag_col].parallel_apply(
        lambda flag, yes=yes_phrase, no=no_phrase: yes if flag == 1 else no
    ).astype('str')

# Remaining features become "<prefix> <value>".
for value_col, prefix in {
        'month': 'месяц',
        'hour': 'час',
        'num_words_count': 'слов',
        'digits_count': 'цифр',
        'contact_words_count': 'контактных',
        'tel_count': 'телефонов',
        'count_emoji': 'эмодзи',
}.items():
    df_train[value_col] = df_train[value_col].parallel_apply(
        lambda value, prefix=prefix: '{} {}'.format(prefix, str(value))
    ).astype('str')



CPU times: user 2.63 s, sys: 5.41 s, total: 8.04 s
Wall time: 10.8 s


In [None]:
%%time 
# Rewrite every auxiliary feature of the validation frame as a short Russian
# phrase, so it can later be concatenated with the ad text.
# NOTE: the columns are overwritten in place - run this cell once per load.

# Price flag: missing values get their own phrase, other values are embedded.
df_val['is_empty_price'] = df_val['is_empty_price'].parallel_apply(
    lambda price: 'цена {}'.format(price) if not pd.isna(price) else 'цена пустая')

# Phone flags: (phrase when the value equals 1, phrase for anything else).
for flag_col, (yes_phrase, no_phrase) in {
        'is_full_phone': ('есть полный', 'нет полного'),
        'is_short_phone': ('есть короткий', 'нет короткого'),
}.items():
    df_val[flag_col] = df_val[flag_col].parallel_apply(
        lambda flag, yes=yes_phrase, no=no_phrase: yes if flag == 1 else no
    ).astype('str')

# Remaining features become "<prefix> <value>".
for value_col, prefix in {
        'month': 'месяц',
        'hour': 'час',
        'num_words_count': 'слов',
        'digits_count': 'цифр',
        'contact_words_count': 'контактных',
        'tel_count': 'телефонов',
        'count_emoji': 'эмодзи',
}.items():
    df_val[value_col] = df_val[value_col].parallel_apply(
        lambda value, prefix=prefix: '{} {}'.format(prefix, str(value))
    ).astype('str')



CPU times: user 345 ms, sys: 4.63 s, total: 4.97 s
Wall time: 5.09 s


In [None]:
%%time 
# Rewrite every auxiliary feature of the test frame as a short Russian
# phrase, so it can later be concatenated with the ad text.
# NOTE: the columns are overwritten in place - run this cell once per load.

# Price flag: missing values get their own phrase, other values are embedded.
df_test['is_empty_price'] = df_test['is_empty_price'].parallel_apply(
    lambda price: 'цена {}'.format(price) if not pd.isna(price) else 'цена пустая')

# Phone flags: (phrase when the value equals 1, phrase for anything else).
for flag_col, (yes_phrase, no_phrase) in {
        'is_full_phone': ('есть полный', 'нет полного'),
        'is_short_phone': ('есть короткий', 'нет короткого'),
}.items():
    df_test[flag_col] = df_test[flag_col].parallel_apply(
        lambda flag, yes=yes_phrase, no=no_phrase: yes if flag == 1 else no
    ).astype('str')

# Remaining features become "<prefix> <value>".
for value_col, prefix in {
        'month': 'месяц',
        'hour': 'час',
        'num_words_count': 'слов',
        'digits_count': 'цифр',
        'contact_words_count': 'контактных',
        'tel_count': 'телефонов',
        'count_emoji': 'эмодзи',
}.items():
    df_test[value_col] = df_test[value_col].parallel_apply(
        lambda value, prefix=prefix: '{} {}'.format(prefix, str(value))
    ).astype('str')



CPU times: user 179 ms, sys: 4.35 s, total: 4.53 s
Wall time: 4.59 s


In [None]:
# Columns whose values are joined, in this order and separated by single
# spaces, into the model input stored in the `text` column.
text_columns = [
    'title_prep_l',
    'desc_prep_l',
    'subcategory',
    'category',
    'region',
    'is_empty_price',
    'month',
    'hour',
    'is_full_phone',
    'is_short_phone',
    'num_words_count',
    'digits_count',
    'contact_words_count',
    'tel_count',
    'count_emoji',
]

# Missing values are replaced by empty strings before joining; the same
# recipe is applied to all three splits.
for frame in (df_train, df_val, df_test):
    frame['text'] = frame.fillna('')[text_columns].agg(' '.join, axis=1)

In [None]:
# Distribution of whitespace-separated word counts of the training texts.
train_word_counts = df_train['text'].parallel_apply(lambda text: len(text.split()))
train_word_counts.describe([.25, .5, .75, .9, .95, .99])

count    886038.000000
mean         96.523757
std          81.957491
min          24.000000
25%          43.000000
50%          63.000000
75%         121.000000
90%         201.000000
95%         271.000000
99%         414.000000
max        1153.000000
Name: text, dtype: float64

In [None]:
# Tokenizer of the cased RuBERT checkpoint; lower-casing is switched off.
tokenizer = BertTokenizer.from_pretrained('DeepPavlov/rubert-base-cased', do_lower_case=False)

Downloading:   0%|          | 0.00/1.57M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/24.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/642 [00:00<?, ?B/s]

In [None]:
# Sequence length (in tokenizer tokens, special tokens included) to which
# every text is padded or truncated by the encode_plus calls below.
# NOTE(review): presumably picked from the ~95th percentile of the word counts
# printed above (271). That statistic counts whitespace-separated words, not
# word-piece tokens, so more than 5% of texts are probably truncated - confirm.
max_len = 270

## Train data

In [None]:
# Targets and texts of the training split, as arrays for the tokenization
# loop in the next cell (the names are reused for every split).
labels = df_train['is_bad'].values
sentences = df_train['text'].values

In [None]:
input_ids = []
attention_masks = []

# Encode every text to exactly `max_len` token ids: special tokens are added,
# longer texts are truncated and shorter ones are padded.
for sent in tqdm_notebook(sentences):
    encoded_dict = tokenizer.encode_plus(
        sent,
        add_special_tokens=True,
        max_length=max_len,
        padding='max_length',  # replaces the deprecated pad_to_max_length=True
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt',
    )
    input_ids.append(encoded_dict['input_ids'])
    attention_masks.append(encoded_dict['attention_mask'])

# Stack the per-sentence tensors along the first (example) dimension.
input_ids = torch.cat(input_ids, dim=0)
attention_masks = torch.cat(attention_masks, dim=0)
labels = torch.tensor(labels)

print('Original: ', sentences[0])
print('Token IDs:', input_ids[0])

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  


  0%|          | 0/886038 [00:00<?, ?it/s]



Original:  капот toyota camry арт. 43381 капот номер производителя: 3220622610 подходит на toyota camry xv50 (тойота, тоета, тойета, камри) ・・・ артикул 43381. пожалуйста, назовите его при звонке. запчасть в наличии на собственном складе. дополнительную информацию по наличию и состоянию запчастей можно получить по телефону или написать сообщение в avito, whatsapp, viber. отправляем в регионы через деловые линии, пэк, ратэк, энергия, байкал, желдор. принимаем заказы на запчасти для любых автомобилей. выкупаем иномарки в разбор в любом состоянии. эвакуатор предоставляем. принимаем авто под реализацию. авторазбор у иваныча крупнейший авто разбор в самарской области. г. тольятти, южное шоссе, 602б график работы: пн сб с 9:00 69:00 вс. с 9:00 67:00 код: 9vysgan58z8idz5tkte54a капот. Запчасти и аксессуары Транспорт Самарская область цена 0 месяц 7 час 14 нет полного есть короткий слов 0 цифр 46 контактных 5 телефонов 3 эмодзи 0
Token IDs: tensor([   101, 108623,  10626,  16106,  10725,  88918

In [None]:
# with open('{}bert_new/train_encoded_dict.pickle'.format(FOLDER_PATH), 'wb') as f:
#   pickle.dump(encoded_dict, f)

# with open('{}bert_new/train_input_ids.pickle'.format(FOLDER_PATH), 'wb') as f:
#   pickle.dump(input_ids, f)

# with open('{}bert_new/train_attention_masks.pickle'.format(FOLDER_PATH), 'wb') as f:
#   pickle.dump(attention_masks, f)

# with open('{}bert_new/train_labels.pickle'.format(FOLDER_PATH), 'wb') as f:
#   pickle.dump(labels, f)

In [None]:
# with open('{}bert_new/train_encoded_dict.pickle'.format(FOLDER_PATH), 'rb') as f:
#   train_encoded_dict = pickle.load(f)

# Reuse the cached training tensors when they exist on Drive. Otherwise cache
# the tensors produced by the tokenization cell above, so that a fresh
# "Restart & Run All" does not fail on missing files.
# NOTE(review): pickle.load can execute arbitrary code - only load cache files
# you created yourself. A cache written with a different max_len or dataset is
# NOT detected; delete the files to force a rebuild.
train_cache = {}
for cache_name, fresh_tensor in (
        ('train_input_ids', input_ids),
        ('train_attention_masks', attention_masks),
        ('train_labels', labels)):
    cache_path = '{}bert_new/{}.pickle'.format(FOLDER_PATH, cache_name)
    if os.path.exists(cache_path):
        with open(cache_path, 'rb') as f:
            train_cache[cache_name] = pickle.load(f)
    else:
        os.makedirs(os.path.dirname(cache_path), exist_ok=True)
        with open(cache_path, 'wb') as f:
            pickle.dump(fresh_tensor, f)
        train_cache[cache_name] = fresh_tensor

train_input_ids = train_cache['train_input_ids']
train_attention_masks = train_cache['train_attention_masks']
train_labels = train_cache['train_labels']

## Validation data

In [None]:
# Targets and texts of the validation split, as arrays for the tokenization
# loop in the next cell (the names are reused for every split).
labels = df_val['is_bad'].values
sentences = df_val['text'].values

In [None]:
input_ids = []
attention_masks = []

# Encode every text to exactly `max_len` token ids: special tokens are added,
# longer texts are truncated and shorter ones are padded.
for sent in tqdm_notebook(sentences):
    encoded_dict = tokenizer.encode_plus(
        sent,
        add_special_tokens=True,
        max_length=max_len,
        padding='max_length',  # replaces the deprecated pad_to_max_length=True
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt',
    )
    input_ids.append(encoded_dict['input_ids'])
    attention_masks.append(encoded_dict['attention_mask'])

# Stack the per-sentence tensors along the first (example) dimension.
input_ids = torch.cat(input_ids, dim=0)
attention_masks = torch.cat(attention_masks, dim=0)
labels = torch.tensor(labels)

print('Original: ', sentences[0])
print('Token IDs:', input_ids[0])

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  


  0%|          | 0/98449 [00:00<?, ?it/s]



Original:  samsung j7 6028 года практически новый телефон. без царапин, и глюков. всегда был в чехле. продаю так как нужны деньги. Телефоны Бытовая электроника Рязанская область цена 0 месяц 8 час 12 нет полного нет короткого слов 0 цифр 0 контактных 1 телефонов 1 эмодзи 0
Token IDs: tensor([   101,  10660,  12899,  11228,    250,    151,  50833,    153,   1768,
         11774,  10303,  17469,    132,   4428,  81695,   1505,    128,    851,
         41939,   1388,    132,  12929,   2067,    845,  25989,   2832,    132,
          3462,  16988,   2306,   2739,  36700,  13671,    132,  16655,    880,
         75236,  11357,  85655,  71352,   9102,  23059,    136,  16293,    152,
          7897,   4367,   8953,  23057,   8953,  44450,   4969,    136,  51660,
           136, 112401,    138,  34169,    138,  21526,  53583,    136,    102,
             0,      0,      0,      0,      0,      0,      0,      0,      0,
             0,      0,      0,      0,      0,      0,      0,      0,    

In [None]:
# with open('{}bert_new/val_encoded_dict.pickle'.format(FOLDER_PATH), 'wb') as f:
#   pickle.dump(encoded_dict, f)

# with open('{}bert_new/val_input_ids.pickle'.format(FOLDER_PATH), 'wb') as f:
#   pickle.dump(input_ids, f)

# with open('{}bert_new/val_attention_masks.pickle'.format(FOLDER_PATH), 'wb') as f:
#   pickle.dump(attention_masks, f)

# with open('{}bert_new/val_labels.pickle'.format(FOLDER_PATH), 'wb') as f:
#   pickle.dump(labels, f)

In [None]:
# with open('{}bert_new/val_encoded_dict.pickle'.format(FOLDER_PATH), 'rb') as f:
#   val_encoded_dict = pickle.load(f)

# Reuse the cached validation tensors when they exist on Drive. Otherwise
# cache the tensors produced by the tokenization cell above, so that a fresh
# "Restart & Run All" does not fail on missing files.
# NOTE(review): pickle.load can execute arbitrary code - only load cache files
# you created yourself. A cache written with a different max_len or dataset is
# NOT detected; delete the files to force a rebuild.
val_cache = {}
for cache_name, fresh_tensor in (
        ('val_input_ids', input_ids),
        ('val_attention_masks', attention_masks),
        ('val_labels', labels)):
    cache_path = '{}bert_new/{}.pickle'.format(FOLDER_PATH, cache_name)
    if os.path.exists(cache_path):
        with open(cache_path, 'rb') as f:
            val_cache[cache_name] = pickle.load(f)
    else:
        os.makedirs(os.path.dirname(cache_path), exist_ok=True)
        with open(cache_path, 'wb') as f:
            pickle.dump(fresh_tensor, f)
        val_cache[cache_name] = fresh_tensor

val_input_ids = val_cache['val_input_ids']
val_attention_masks = val_cache['val_attention_masks']
val_labels = val_cache['val_labels']

## Test data

In [None]:
# Targets and texts of the test split, as arrays for the tokenization loop in
# the next cell (the names are reused for every split).
labels = df_test['is_bad'].values
sentences = df_test['text'].values

In [None]:
input_ids = []
attention_masks = []

# Encode every text to exactly `max_len` token ids: special tokens are added,
# longer texts are truncated and shorter ones are padded.
for sent in tqdm_notebook(sentences):
    encoded_dict = tokenizer.encode_plus(
        sent,
        add_special_tokens=True,
        max_length=max_len,
        padding='max_length',  # replaces the deprecated pad_to_max_length=True
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt',
    )
    input_ids.append(encoded_dict['input_ids'])
    attention_masks.append(encoded_dict['attention_mask'])

# Stack the per-sentence tensors along the first (example) dimension.
input_ids = torch.cat(input_ids, dim=0)
attention_masks = torch.cat(attention_masks, dim=0)
labels = torch.tensor(labels)

print('Original: ', sentences[0])
print('Token IDs:', input_ids[0])

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  


  0%|          | 0/16237 [00:00<?, ?it/s]



Original:  шины звонить 89425546881 Запчасти и аксессуары Транспорт Тульская область цена 0 месяц 10 час 0 есть полный есть короткий слов 0 цифр 11 контактных 1 телефонов 0 эмодзи 0
Token IDs: tensor([   101,  53612,  98957,  74918,  17104,  36695,  36406,  51641,   7639,
         25652,    851,  68810,  31944,  81763,   9102,  23059,    136,  16293,
          3955,   7897,    136,   6818,  21228,   6818,  29038,   4969,    136,
         51660,   4639, 112401,    138,  34169,    136,  21526,  53583,    136,
           102,      0,      0,      0,      0,      0,      0,      0,      0,
             0,      0,      0,      0,      0,      0,      0,      0,      0,
             0,      0,      0,      0,      0,      0,      0,      0,      0,
             0,      0,      0,      0,      0,      0,      0,      0,      0,
             0,      0,      0,      0,      0,      0,      0,      0,      0,
             0,      0,      0,      0,      0,      0,      0,      0,      0,
       

In [None]:
# with open('{}bert_new/test_encoded_dict.pickle'.format(FOLDER_PATH), 'wb') as f:
#   pickle.dump(encoded_dict, f)

# with open('{}bert_new/test_input_ids.pickle'.format(FOLDER_PATH), 'wb') as f:
#   pickle.dump(input_ids, f)

# with open('{}bert_new/test_attention_masks.pickle'.format(FOLDER_PATH), 'wb') as f:
#   pickle.dump(attention_masks, f)

# with open('{}bert_new/test_labels.pickle'.format(FOLDER_PATH), 'wb') as f:
#   pickle.dump(labels, f)

In [None]:
# with open('{}bert_new/test_encoded_dict.pickle'.format(FOLDER_PATH), 'rb') as f:
#   test_encoded_dict = pickle.load(f)

# Reuse the cached test tensors when they exist on Drive. Otherwise cache the
# tensors produced by the tokenization cell above, so that a fresh
# "Restart & Run All" does not fail on missing files.
# NOTE(review): pickle.load can execute arbitrary code - only load cache files
# you created yourself. A cache written with a different max_len or dataset is
# NOT detected; delete the files to force a rebuild.
test_cache = {}
for cache_name, fresh_tensor in (
        ('test_input_ids', input_ids),
        ('test_attention_masks', attention_masks),
        ('test_labels', labels)):
    cache_path = '{}bert_new/{}.pickle'.format(FOLDER_PATH, cache_name)
    if os.path.exists(cache_path):
        with open(cache_path, 'rb') as f:
            test_cache[cache_name] = pickle.load(f)
    else:
        os.makedirs(os.path.dirname(cache_path), exist_ok=True)
        with open(cache_path, 'wb') as f:
            pickle.dump(fresh_tensor, f)
        test_cache[cache_name] = fresh_tensor

test_input_ids = test_cache['test_input_ids']
test_attention_masks = test_cache['test_attention_masks']
test_labels = test_cache['test_labels']

In [None]:
# train_input_ids = train_input_ids[:400, :]
# train_attention_masks = train_attention_masks[:400, :]
# train_labels = train_labels[:400]

# val_input_ids = val_input_ids[:400, :]
# val_attention_masks = val_attention_masks[:400, :]
# val_labels = val_labels[:400]

# test_input_ids = test_input_ids[:400, :]
# test_attention_masks = test_attention_masks[:400, :]
# test_labels = test_labels[:400]

In [None]:
# One TensorDataset per split; each item is the triple
# (input ids, attention mask, label).
train_dataset, val_dataset, test_dataset = (
    TensorDataset(split_ids, split_masks, split_labels)
    for split_ids, split_masks, split_labels in (
        (train_input_ids, train_attention_masks, train_labels),
        (val_input_ids, val_attention_masks, val_labels),
        (test_input_ids, test_attention_masks, test_labels),
    )
)

In [None]:
batch_size = 32

# Training batches are drawn in random order; the last incomplete batch is
# dropped so that every optimisation step sees a full batch.
train_dataloader = DataLoader(
            train_dataset,
            sampler=RandomSampler(train_dataset),
            batch_size=batch_size,
            drop_last=True)

# Evaluation loaders keep the final partial batch: dropping it would silently
# exclude up to batch_size - 1 examples from every reported metric.
validation_dataloader = DataLoader(
            val_dataset,
            sampler = SequentialSampler(val_dataset),
            batch_size = batch_size,
            drop_last=False)
test_dataloader = DataLoader(
            test_dataset,
            sampler = SequentialSampler(test_dataset),
            batch_size = batch_size,
            drop_last=False)

In [None]:
# RuBERT encoder with a two-label sequence-classification head; attention
# maps and hidden states are not returned.
model = BertForSequenceClassification.from_pretrained(
    'DeepPavlov/rubert-base-cased', 
    num_labels = 2, 
    output_attentions = False,
    output_hidden_states = False)

# Move the weights to the device chosen earlier instead of calling
# model.cuda() unconditionally, so the model always lives on the same device
# as the batches sent with .to(device).
model.to(device)

Downloading:   0%|          | 0.00/681M [00:00<?, ?B/s]

Some weights of the model checkpoint at DeepPavlov/rubert-base-cased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were n

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(119547, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elemen

In [None]:
# Print the names and shapes of a few representative parameter groups.
params = list(model.named_parameters())

print('The BERT model has {:} different named parameters.\n'.format(len(params)))

# (header, slice of the parameter list) for each group that is printed.
param_sections = (
    ('==== Embedding Layer ====\n', params[0:5]),
    ('\n==== First Transformer ====\n', params[5:21]),
    ('\n==== Output Layer ====\n', params[-4:]),
)

for section_header, section_params in param_sections:
    print(section_header)
    for param_name, param_tensor in section_params:
        print("{:<55} {:>12}".format(param_name, str(tuple(param_tensor.size()))))

The BERT model has 201 different named parameters.

==== Embedding Layer ====

bert.embeddings.word_embeddings.weight                  (119547, 768)
bert.embeddings.position_embeddings.weight                (512, 768)
bert.embeddings.token_type_embeddings.weight                (2, 768)
bert.embeddings.LayerNorm.weight                              (768,)
bert.embeddings.LayerNorm.bias                                (768,)

==== First Transformer ====

bert.encoder.layer.0.attention.self.query.weight          (768, 768)
bert.encoder.layer.0.attention.self.query.bias                (768,)
bert.encoder.layer.0.attention.self.key.weight            (768, 768)
bert.encoder.layer.0.attention.self.key.bias                  (768,)
bert.encoder.layer.0.attention.self.value.weight          (768, 768)
bert.encoder.layer.0.attention.self.value.bias                (768,)
bert.encoder.layer.0.attention.output.dense.weight        (768, 768)
bert.encoder.layer.0.attention.output.dense.bias              

In [None]:
# Number of passes over the training data.
epochs = 2

optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-8)

# The learning rate decays linearly over all optimisation steps of all
# epochs, with no warm-up phase.
total_steps = epochs * len(train_dataloader)
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps,
)



In [None]:
# Output locations on Drive for intermediate checkpoints and exported models.
CHECKPOINTS_PATH = '{}bert_new/checkpoints/'.format(FOLDER_PATH)
MODEL_PATH = '{}bert_new/model/'.format(FOLDER_PATH)

# torch.save does not create missing directories. Create them up front so the
# first checkpoint write cannot fail several minutes into training.
os.makedirs(CHECKPOINTS_PATH, exist_ok=True)
os.makedirs(MODEL_PATH, exist_ok=True)

In [None]:
epoch = 0

seed_val = 42
# Offset the seed by the epoch index so that every epoch cell draws its own
# shuffle order and dropout masks (for this first epoch the seed is still 42).
epoch_seed = seed_val + epoch

random.seed(epoch_seed)
np.random.seed(epoch_seed)
torch.manual_seed(epoch_seed)
torch.cuda.manual_seed_all(epoch_seed)

# One dict of summary statistics per finished epoch.
training_stats = []
total_t0 = time.time()

# ========================================
#               Training
# ========================================

print("")
print('======== Epoch {:} / {:} ========'.format(epoch + 1, epochs))
print('Training...')

t0 = time.time()
total_train_loss = 0

model.train()

for step, batch in enumerate(train_dataloader):

    # Progress report every 40 batches.
    if step % 40 == 0 and not step == 0:
        elapsed = format_time(time.time() - t0)
        print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(
            step, len(train_dataloader), elapsed))

    # Each batch is (input ids, attention mask, labels).
    b_input_ids = batch[0].to(device)
    b_input_mask = batch[1].to(device)
    b_labels = batch[2].to(device)

    model.zero_grad()        

    # Read the fields by name rather than unpacking .values(), which depends
    # on the order and number of entries in the model output.
    outputs = model(
        b_input_ids, 
        token_type_ids=None, 
        attention_mask=b_input_mask, 
        labels=b_labels
        )
    loss = outputs.loss
    logits = outputs.logits

    total_train_loss += loss.item()
    loss.backward()
    # Clip the gradient norm to 1.0 before the parameter update.
    torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
    optimizer.step()
    scheduler.step()

    # Overwrite the rolling checkpoint every 500 batches. The loss is stored
    # as a plain float (not a device tensor) and the scheduler state is
    # included so that training can be resumed with the correct learning rate.
    if step % 500 == 0 and not step == 0:
        torch.save(
            {'epoch': epoch,
             'step': step,
             'model_state_dict': model.state_dict(),
             'optimizer_state_dict': optimizer.state_dict(),
             'scheduler_state_dict': scheduler.state_dict(),
             'loss': loss.item()
                }, 
            '{}checkpoint_epoch_1'.format(CHECKPOINTS_PATH)
            )

avg_train_loss = total_train_loss / len(train_dataloader)            
training_time = format_time(time.time() - t0)

print("")
print("  Average training loss: {0:.2f}".format(avg_train_loss))
print("  Training epoch took: {:}".format(training_time))
    
# ========================================
#               Validation
# ========================================

print("")
print("Running Validation...")

t0 = time.time()

model.eval()

total_eval_loss = 0
# Logits and labels of every validation batch. ROC AUC is computed once over
# the whole set: averaging per-batch AUCs (with a 0.5 fallback for
# single-class batches) does not give the AUC of the validation data.
all_logits = []
all_label_ids = []

for batch in validation_dataloader:
    b_input_ids = batch[0].to(device)
    b_input_mask = batch[1].to(device)
    b_labels = batch[2].to(device)
 
    with torch.no_grad():        
        outputs = model(
            b_input_ids, 
            token_type_ids=None, 
            attention_mask=b_input_mask,
            labels=b_labels
            )
    loss = outputs.loss
    logits = outputs.logits
        
    total_eval_loss += loss.item()
    all_logits.append(logits.detach().cpu().numpy())
    all_label_ids.append(b_labels.to('cpu').numpy())
    
avg_val_roc_auc = flat_roc_auc(
    np.concatenate(all_logits, axis=0),
    np.concatenate(all_label_ids, axis=0))
print("  ROC AUC: {0:.2f}".format(avg_val_roc_auc))

avg_val_loss = total_eval_loss / len(validation_dataloader)
validation_time = format_time(time.time() - t0)

print("  Validation Loss: {0:.2f}".format(avg_val_loss))
print("  Validation took: {:}".format(validation_time))

training_stats.append(
    {
        'epoch': epoch + 1,
        'Training Loss': avg_train_loss,
        'Valid. Loss': avg_val_loss,
        # Despite the key name this value is the validation ROC AUC; the key
        # is left unchanged for any code that reads training_stats.
        'Valid. Accur.': avg_val_roc_auc,
        'Training Time': training_time,
        'Validation Time': validation_time
    }
)

print("")
print("Training complete!")

print("Total training took {:} (h:mm:ss)".format(format_time(time.time() - total_t0)))


Training...
  Batch    40  of  27,688.    Elapsed: 0:00:33.
  Batch    80  of  27,688.    Elapsed: 0:01:05.
  Batch   120  of  27,688.    Elapsed: 0:01:37.
  Batch   160  of  27,688.    Elapsed: 0:02:09.
  Batch   200  of  27,688.    Elapsed: 0:02:42.
  Batch   240  of  27,688.    Elapsed: 0:03:14.
  Batch   280  of  27,688.    Elapsed: 0:03:46.
  Batch   320  of  27,688.    Elapsed: 0:04:19.
  Batch   360  of  27,688.    Elapsed: 0:04:51.
  Batch   400  of  27,688.    Elapsed: 0:05:23.
  Batch   440  of  27,688.    Elapsed: 0:05:55.
  Batch   480  of  27,688.    Elapsed: 0:06:28.
  Batch   520  of  27,688.    Elapsed: 0:07:35.
  Batch   560  of  27,688.    Elapsed: 0:08:08.
  Batch   600  of  27,688.    Elapsed: 0:08:40.
  Batch   640  of  27,688.    Elapsed: 0:09:12.
  Batch   680  of  27,688.    Elapsed: 0:09:45.
  Batch   720  of  27,688.    Elapsed: 0:10:17.
  Batch   760  of  27,688.    Elapsed: 0:10:49.
  Batch   800  of  27,688.    Elapsed: 0:11:22.
  Batch   840  of  27,688. 

In [None]:
# torch.save({
#     'epoch': epoch,
#     'model_state_dict': model.state_dict(),
#     'optimizer_state_dict': optimizer.state_dict(),
#     'loss': loss,
#   }, '{}checkpoint_epoch_1'.format(CHECKPOINTS_PATH))

In [None]:
# print("Saving model to %s" % '{}epoch_1'.format(MODEL_PATH))

# model_to_save = model.module if hasattr(model, 'module') else model
# model_to_save.save_pretrained('{}epoch_1'.format(MODEL_PATH))
# tokenizer.save_pretrained('{}epoch_1'.format(MODEL_PATH))

# EPOCH 2

In [None]:
epoch = 1

seed_val = 42
# Offset the seed by the epoch index. Re-seeding with the very same value as
# the first epoch would replay the same shuffle order and dropout masks.
epoch_seed = seed_val + epoch

random.seed(epoch_seed)
np.random.seed(epoch_seed)
torch.manual_seed(epoch_seed)
torch.cuda.manual_seed_all(epoch_seed)

# training_stats is NOT reset here: the summary of this epoch is appended to
# the list created by the first-epoch cell.
total_t0 = time.time()

# ========================================
#               Training
# ========================================


print("")
print('======== Epoch {:} / {:} ========'.format(epoch + 1, epochs))
print('Training...')

t0 = time.time()
total_train_loss = 0

model.train()

for step, batch in enumerate(train_dataloader):

    # Progress report every 40 batches.
    if step % 40 == 0 and not step == 0:
        elapsed = format_time(time.time() - t0)
        print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(
            step, len(train_dataloader), elapsed))

    # Each batch is (input ids, attention mask, labels).
    b_input_ids = batch[0].to(device)
    b_input_mask = batch[1].to(device)
    b_labels = batch[2].to(device)

    model.zero_grad()        

    # Read the fields by name rather than unpacking .values(), which depends
    # on the order and number of entries in the model output.
    outputs = model(
        b_input_ids, 
        token_type_ids=None, 
        attention_mask=b_input_mask, 
        labels=b_labels)
    loss = outputs.loss
    logits = outputs.logits

    total_train_loss += loss.item()
    loss.backward()
    # Clip the gradient norm to 1.0 before the parameter update.
    torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
    optimizer.step()
    scheduler.step()

    # Overwrite the rolling checkpoint every 500 batches. The loss is stored
    # as a plain float (not a device tensor) and the scheduler state is
    # included so that training can be resumed with the correct learning rate.
    if step % 500 == 0 and not step == 0:
        torch.save(
            {'epoch': epoch,
             'step': step,
             'model_state_dict': model.state_dict(),
             'optimizer_state_dict': optimizer.state_dict(),
             'scheduler_state_dict': scheduler.state_dict(),
             'loss': loss.item()
                }, 
            '{}checkpoint_epoch_2'.format(CHECKPOINTS_PATH)
            )

avg_train_loss = total_train_loss / len(train_dataloader)            
training_time = format_time(time.time() - t0)

print("")
print("  Average training loss: {0:.2f}".format(avg_train_loss))
print("  Training epoch took: {:}".format(training_time))
    
# ========================================
#               Validation
# ========================================

print("")
print("Running Validation...")

t0 = time.time()

model.eval()

total_eval_loss = 0
# Logits and labels of every validation batch. ROC AUC is computed once over
# the whole set: averaging per-batch AUCs (with a 0.5 fallback for
# single-class batches) does not give the AUC of the validation data.
all_logits = []
all_label_ids = []

for batch in validation_dataloader:
    
    b_input_ids = batch[0].to(device)
    b_input_mask = batch[1].to(device)
    b_labels = batch[2].to(device)
    
    with torch.no_grad():        
        outputs = model(
            b_input_ids, 
            token_type_ids=None, 
            attention_mask=b_input_mask,
            labels=b_labels)
    loss = outputs.loss
    logits = outputs.logits
        
    total_eval_loss += loss.item()

    all_logits.append(logits.detach().cpu().numpy())
    all_label_ids.append(b_labels.to('cpu').numpy())
    
avg_val_roc_auc = flat_roc_auc(
    np.concatenate(all_logits, axis=0),
    np.concatenate(all_label_ids, axis=0))
print("  ROC AUC: {0:.2f}".format(avg_val_roc_auc))

avg_val_loss = total_eval_loss / len(validation_dataloader)
validation_time = format_time(time.time() - t0)

print("  Validation Loss: {0:.2f}".format(avg_val_loss))
print("  Validation took: {:}".format(validation_time))

training_stats.append(
    {
        'epoch': epoch + 1,
        'Training Loss': avg_train_loss,
        'Valid. Loss': avg_val_loss,
        # Despite the key name this value is the validation ROC AUC; the key
        # is left unchanged for any code that reads training_stats.
        'Valid. Accur.': avg_val_roc_auc,
        'Training Time': training_time,
        'Validation Time': validation_time
    }
)

print("")
print("Training complete!")

print("Total training took {:} (h:mm:ss)".format(
    format_time(time.time()-total_t0)))


Training...
  Batch    40  of  27,688.    Elapsed: 0:00:32.
  Batch    80  of  27,688.    Elapsed: 0:01:05.
  Batch   120  of  27,688.    Elapsed: 0:01:37.
  Batch   160  of  27,688.    Elapsed: 0:02:09.
  Batch   200  of  27,688.    Elapsed: 0:02:42.
  Batch   240  of  27,688.    Elapsed: 0:03:14.
  Batch   280  of  27,688.    Elapsed: 0:03:46.
  Batch   320  of  27,688.    Elapsed: 0:04:19.
  Batch   360  of  27,688.    Elapsed: 0:04:51.
  Batch   400  of  27,688.    Elapsed: 0:05:23.
  Batch   440  of  27,688.    Elapsed: 0:05:56.
  Batch   480  of  27,688.    Elapsed: 0:06:28.
  Batch   520  of  27,688.    Elapsed: 0:07:09.
  Batch   560  of  27,688.    Elapsed: 0:07:41.
  Batch   600  of  27,688.    Elapsed: 0:08:14.
  Batch   640  of  27,688.    Elapsed: 0:08:46.
  Batch   680  of  27,688.    Elapsed: 0:09:18.
  Batch   720  of  27,688.    Elapsed: 0:09:51.
  Batch   760  of  27,688.    Elapsed: 0:10:23.
  Batch   800  of  27,688.    Elapsed: 0:10:55.
  Batch   840  of  27,688. 

In [None]:
# torch.save({
#     'epoch': epoch,
#     'model_state_dict': model.state_dict(),
#     'optimizer_state_dict': optimizer.state_dict(),
#     'loss': loss,
#   }, '{}checkpoint_epoch_2'.format(CHECKPOINTS_PATH))

In [None]:
# print("Saving model to %s" % '{}epoch_2'.format(MODEL_PATH))

# Take care of distributed/parallel training
# model_to_save = model.module if hasattr(model, 'module') else model  
# model_to_save.save_pretrained('{}epoch_2'.format(MODEL_PATH))
# tokenizer.save_pretrained('{}epoch_2'.format(MODEL_PATH))

# PREDICT SCORES

In [None]:
# Inference-only loaders for the three splits. SequentialSampler walks each
# dataset in its stored order, which the scoring cells rely on when they assign
# the concatenated predictions back onto df_train / df_val / df_test by position
# (assumes each dataset was built in the same row order as its DataFrame --
# TODO confirm against the cell that creates the datasets).
batch_size = 256

train_dataloader, validation_dataloader, test_dataloader = (
    DataLoader(dataset, sampler=SequentialSampler(dataset), batch_size=batch_size)
    for dataset in (train_dataset, val_dataset, test_dataset)
)

## Epoch 1

In [None]:
# Restore the tokenizer and the sequence classifier saved under
# "<MODEL_PATH>epoch_1", then move the model to the selected device.
checkpoint_dir = '{}epoch_1'.format(MODEL_PATH)

tokenizer = BertTokenizer.from_pretrained(checkpoint_dir)
model = BertForSequenceClassification.from_pretrained(checkpoint_dir)

# Last expression of the cell: the returned module is displayed as the cell output.
model.to(device)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(119547, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elemen

In [None]:
# Score the training split with the currently loaded `model`, print ROC AUC
# (overall, per category, and the unweighted mean over categories) and write the
# per-row scores to <FOLDER_PATH>/bert_scores/epoch_1/X_train.csv.
print('Predicting labels for {:,} train sentences...'.format(len(train_input_ids)))

t0 = time.time()
model.eval()  # evaluation mode (turns off the Dropout layers)
predictions, true_labels = [], []

for step, batch in enumerate(train_dataloader):
    # Progress line every 40 batches, skipping batch 0.
    if step % 40 == 0 and step != 0:
        elapsed = format_time(time.time() - t0)
        print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(
            step, len(train_dataloader), elapsed))

    batch = tuple(t.to(device) for t in batch)
    b_input_ids, b_input_mask, b_labels = batch

    # Labels are deliberately not passed: only the forward scores are needed.
    with torch.no_grad():
        outputs = model(
            b_input_ids,
            token_type_ids=None,
            attention_mask=b_input_mask)

    # outputs[0] is treated as the per-class logits of the batch.
    logits = outputs[0].detach().cpu().numpy()
    label_ids = b_labels.to('cpu').numpy()

    predictions.append(logits)
    true_labels.append(label_ids)

print('    DONE.')

# NOTE(review): the score kept is the raw logit in column 1, not a softmax
# probability; its ranking can differ from P(class 1) -- confirm this is intended
# (flat_roc_auc uses the same convention).
flat_predictions = np.concatenate(predictions, axis=0)[:, 1].flatten()
flat_true_labels = np.concatenate(true_labels, axis=0)

# Positional assignment: relies on the sequential loader order matching df_train.
df_train['predict_bert'] = flat_predictions

print(roc_auc_score(flat_true_labels, df_train['predict_bert']))
# Compute the per-category AUC once and reuse it for the mean.
auc_by_category = df_train.groupby(['category']).apply(auc_group)
print(auc_by_category)
print(auc_by_category.mean())

# Create the target directory first so a long inference run is not lost to a
# missing folder when the scores are written.
scores_path = '{}/bert_scores/epoch_1/X_train.csv'.format(FOLDER_PATH)
os.makedirs(os.path.dirname(scores_path), exist_ok=True)
df_train[['id', 'predict_bert']].to_csv(scores_path, sep='|')
# To reload: pd.read_csv('{}/bert_scores/epoch_1/X_train.csv'.format(FOLDER_PATH), sep='|', index_col=0)

Predicting labels for 886,038 test sentences...
  Batch    40  of  3,462.    Elapsed: 0:01:18.
  Batch    80  of  3,462.    Elapsed: 0:02:35.
  Batch   120  of  3,462.    Elapsed: 0:03:53.
  Batch   160  of  3,462.    Elapsed: 0:05:11.
  Batch   200  of  3,462.    Elapsed: 0:06:28.
  Batch   240  of  3,462.    Elapsed: 0:07:46.
  Batch   280  of  3,462.    Elapsed: 0:09:04.
  Batch   320  of  3,462.    Elapsed: 0:10:21.
  Batch   360  of  3,462.    Elapsed: 0:11:39.
  Batch   400  of  3,462.    Elapsed: 0:12:57.
  Batch   440  of  3,462.    Elapsed: 0:14:14.
  Batch   480  of  3,462.    Elapsed: 0:15:32.
  Batch   520  of  3,462.    Elapsed: 0:16:50.
  Batch   560  of  3,462.    Elapsed: 0:18:07.
  Batch   600  of  3,462.    Elapsed: 0:19:25.
  Batch   640  of  3,462.    Elapsed: 0:20:43.
  Batch   680  of  3,462.    Elapsed: 0:22:00.
  Batch   720  of  3,462.    Elapsed: 0:23:18.
  Batch   760  of  3,462.    Elapsed: 0:24:36.
  Batch   800  of  3,462.    Elapsed: 0:25:53.
  Batch   84

In [None]:
# Score the validation split with the currently loaded `model`, print ROC AUC
# (overall, per category, and the unweighted mean over categories) and write the
# per-row scores to <FOLDER_PATH>/bert_scores/epoch_1/X_val.csv.
print('Predicting labels for {:,} validation sentences...'.format(len(val_input_ids)))

t0 = time.time()
model.eval()  # evaluation mode (turns off the Dropout layers)

predictions, true_labels = [], []

for step, batch in enumerate(validation_dataloader):
    # Progress line every 40 batches, skipping batch 0.
    if step % 40 == 0 and step != 0:
        elapsed = format_time(time.time() - t0)
        print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(
            step, len(validation_dataloader), elapsed))

    batch = tuple(t.to(device) for t in batch)
    b_input_ids, b_input_mask, b_labels = batch

    # Labels are deliberately not passed: only the forward scores are needed.
    with torch.no_grad():
        outputs = model(
            b_input_ids,
            token_type_ids=None,
            attention_mask=b_input_mask)

    # outputs[0] is treated as the per-class logits of the batch.
    logits = outputs[0].detach().cpu().numpy()
    label_ids = b_labels.to('cpu').numpy()

    predictions.append(logits)
    true_labels.append(label_ids)

print('    DONE.')

# NOTE(review): the score kept is the raw logit in column 1, not a softmax
# probability; its ranking can differ from P(class 1) -- confirm this is intended
# (flat_roc_auc uses the same convention).
flat_predictions = np.concatenate(predictions, axis=0)[:, 1].flatten()
flat_true_labels = np.concatenate(true_labels, axis=0)

# Positional assignment: relies on the sequential loader order matching df_val.
df_val['predict_bert'] = flat_predictions

print(roc_auc_score(flat_true_labels, df_val['predict_bert']))
# Compute the per-category AUC once and reuse it for the mean.
auc_by_category = df_val.groupby(['category']).apply(auc_group)
print(auc_by_category)
print(auc_by_category.mean())

# Create the target directory first so the inference run is not lost to a
# missing folder when the scores are written.
scores_path = '{}/bert_scores/epoch_1/X_val.csv'.format(FOLDER_PATH)
os.makedirs(os.path.dirname(scores_path), exist_ok=True)
df_val[['id', 'predict_bert']].to_csv(scores_path, sep='|')
# To reload: pd.read_csv('{}/bert_scores/epoch_1/X_val.csv'.format(FOLDER_PATH), sep='|', index_col=0)

Predicting labels for 98,449 test sentences...
  Batch    40  of    385.    Elapsed: 0:01:18.
  Batch    80  of    385.    Elapsed: 0:02:35.
  Batch   120  of    385.    Elapsed: 0:03:53.
  Batch   160  of    385.    Elapsed: 0:05:11.
  Batch   200  of    385.    Elapsed: 0:06:28.
  Batch   240  of    385.    Elapsed: 0:07:46.
  Batch   280  of    385.    Elapsed: 0:09:04.
  Batch   320  of    385.    Elapsed: 0:10:21.
  Batch   360  of    385.    Elapsed: 0:11:39.
    DONE.
0.9890402769725187
category
Бытовая электроника    0.979131
Для бизнеса            0.909541
Для дома и дачи        0.984002
Животные               0.994192
Личные вещи            0.991080
Недвижимость           0.991627
Работа                 0.972892
Транспорт              0.992203
Услуги                 0.983476
Хобби и отдых          0.978216
dtype: float64
0.9776359457039107


In [None]:
# Score the test split with the currently loaded `model`, print ROC AUC
# (overall, per category, and the unweighted mean over categories) and write the
# per-row scores to <FOLDER_PATH>/bert_scores/epoch_1/X_test.csv.
print('Predicting labels for {:,} test sentences...'.format(len(test_input_ids)))

t0 = time.time()
model.eval()  # evaluation mode (turns off the Dropout layers)

predictions, true_labels = [], []

for step, batch in enumerate(test_dataloader):
    # Progress line every 40 batches, skipping batch 0.
    if step % 40 == 0 and step != 0:
        elapsed = format_time(time.time() - t0)
        print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(
            step, len(test_dataloader), elapsed))

    batch = tuple(t.to(device) for t in batch)
    b_input_ids, b_input_mask, b_labels = batch

    # Labels are deliberately not passed: only the forward scores are needed.
    with torch.no_grad():
        outputs = model(
            b_input_ids,
            token_type_ids=None,
            attention_mask=b_input_mask)

    # outputs[0] is treated as the per-class logits of the batch.
    logits = outputs[0].detach().cpu().numpy()
    label_ids = b_labels.to('cpu').numpy()

    predictions.append(logits)
    true_labels.append(label_ids)

print('    DONE.')

# NOTE(review): the score kept is the raw logit in column 1, not a softmax
# probability; its ranking can differ from P(class 1) -- confirm this is intended
# (flat_roc_auc uses the same convention).
flat_predictions = np.concatenate(predictions, axis=0)[:, 1].flatten()
flat_true_labels = np.concatenate(true_labels, axis=0)

# Positional assignment: relies on the sequential loader order matching df_test.
df_test['predict_bert'] = flat_predictions

print(roc_auc_score(flat_true_labels, df_test['predict_bert']))
# Compute the per-category AUC once and reuse it for the mean.
auc_by_category = df_test.groupby(['category']).apply(auc_group)
print(auc_by_category)
print(auc_by_category.mean())

# Create the target directory first so the inference run is not lost to a
# missing folder when the scores are written.
scores_path = '{}/bert_scores/epoch_1/X_test.csv'.format(FOLDER_PATH)
os.makedirs(os.path.dirname(scores_path), exist_ok=True)
df_test[['id', 'predict_bert']].to_csv(scores_path, sep='|')
# To reload: pd.read_csv('{}/bert_scores/epoch_1/X_test.csv'.format(FOLDER_PATH), sep='|', index_col=0)

Predicting labels for 16,237 test sentences...
  Batch    40  of     64.    Elapsed: 0:01:18.
    DONE.
0.9772855360449079
category
Бытовая электроника    0.957294
Для бизнеса            0.827395
Для дома и дачи        0.961471
Животные               0.965079
Личные вещи            0.819097
Недвижимость           0.989863
Работа                 0.969762
Транспорт              0.995151
Услуги                 0.948250
Хобби и отдых          0.917549
dtype: float64
0.9350910975603


## Epoch 2

In [None]:
# Restore the tokenizer and the sequence classifier saved under
# "<MODEL_PATH>epoch_2", then move the model to the selected device.
checkpoint_dir = '{}epoch_2'.format(MODEL_PATH)

tokenizer = BertTokenizer.from_pretrained(checkpoint_dir)
model = BertForSequenceClassification.from_pretrained(checkpoint_dir)

# Last expression of the cell: the returned module is displayed as the cell output.
model.to(device)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(119547, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elemen

In [None]:
# Score the training split with the currently loaded `model`, print ROC AUC
# (overall, per category, and the unweighted mean over categories) and write the
# per-row scores to <FOLDER_PATH>/bert_scores/epoch_2/X_train.csv.
print('Predicting labels for {:,} train sentences...'.format(len(train_input_ids)))

t0 = time.time()
model.eval()  # evaluation mode (turns off the Dropout layers)
predictions, true_labels = [], []

for step, batch in enumerate(train_dataloader):
    # Progress line every 40 batches, skipping batch 0.
    if step % 40 == 0 and step != 0:
        elapsed = format_time(time.time() - t0)
        print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(
            step, len(train_dataloader), elapsed))

    batch = tuple(t.to(device) for t in batch)
    b_input_ids, b_input_mask, b_labels = batch

    # Labels are deliberately not passed: only the forward scores are needed.
    with torch.no_grad():
        outputs = model(
            b_input_ids,
            token_type_ids=None,
            attention_mask=b_input_mask)

    # outputs[0] is treated as the per-class logits of the batch.
    logits = outputs[0].detach().cpu().numpy()
    label_ids = b_labels.to('cpu').numpy()

    predictions.append(logits)
    true_labels.append(label_ids)

print('    DONE.')

# NOTE(review): the score kept is the raw logit in column 1, not a softmax
# probability; its ranking can differ from P(class 1) -- confirm this is intended
# (flat_roc_auc uses the same convention).
flat_predictions = np.concatenate(predictions, axis=0)[:, 1].flatten()
flat_true_labels = np.concatenate(true_labels, axis=0)

# Positional assignment: relies on the sequential loader order matching df_train.
df_train['predict_bert'] = flat_predictions

print(roc_auc_score(flat_true_labels, df_train['predict_bert']))
# Compute the per-category AUC once and reuse it for the mean.
auc_by_category = df_train.groupby(['category']).apply(auc_group)
print(auc_by_category)
print(auc_by_category.mean())

# Create the target directory first so a long inference run is not lost to a
# missing folder when the scores are written.
scores_path = '{}/bert_scores/epoch_2/X_train.csv'.format(FOLDER_PATH)
os.makedirs(os.path.dirname(scores_path), exist_ok=True)
df_train[['id', 'predict_bert']].to_csv(scores_path, sep='|')
# To reload: pd.read_csv('{}/bert_scores/epoch_2/X_train.csv'.format(FOLDER_PATH), sep='|', index_col=0)

Predicting labels for 886,038 test sentences...
  Batch    40  of  3,462.    Elapsed: 0:02:36.
  Batch    80  of  3,462.    Elapsed: 0:05:12.
  Batch   120  of  3,462.    Elapsed: 0:07:48.
  Batch   160  of  3,462.    Elapsed: 0:10:23.
  Batch   200  of  3,462.    Elapsed: 0:12:59.
  Batch   240  of  3,462.    Elapsed: 0:15:35.
  Batch   280  of  3,462.    Elapsed: 0:18:10.
  Batch   320  of  3,462.    Elapsed: 0:20:46.
  Batch   360  of  3,462.    Elapsed: 0:23:22.
  Batch   400  of  3,462.    Elapsed: 0:25:57.
  Batch   440  of  3,462.    Elapsed: 0:28:32.
  Batch   480  of  3,462.    Elapsed: 0:31:08.
  Batch   520  of  3,462.    Elapsed: 0:33:44.
  Batch   560  of  3,462.    Elapsed: 0:36:19.
  Batch   600  of  3,462.    Elapsed: 0:38:55.
  Batch   640  of  3,462.    Elapsed: 0:41:30.
  Batch   680  of  3,462.    Elapsed: 0:44:06.
  Batch   720  of  3,462.    Elapsed: 0:46:42.
  Batch   760  of  3,462.    Elapsed: 0:49:17.
  Batch   800  of  3,462.    Elapsed: 0:51:53.
  Batch   84

In [None]:
# Score the validation split with the currently loaded `model`, print ROC AUC
# (overall, per category, and the unweighted mean over categories) and write the
# per-row scores to <FOLDER_PATH>/bert_scores/epoch_2/X_val.csv.
print('Predicting labels for {:,} validation sentences...'.format(len(val_input_ids)))

t0 = time.time()
model.eval()  # evaluation mode (turns off the Dropout layers)

predictions, true_labels = [], []

for step, batch in enumerate(validation_dataloader):
    # Progress line every 40 batches, skipping batch 0.
    if step % 40 == 0 and step != 0:
        elapsed = format_time(time.time() - t0)
        print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(
            step, len(validation_dataloader), elapsed))

    batch = tuple(t.to(device) for t in batch)
    b_input_ids, b_input_mask, b_labels = batch

    # Labels are deliberately not passed: only the forward scores are needed.
    with torch.no_grad():
        outputs = model(
            b_input_ids,
            token_type_ids=None,
            attention_mask=b_input_mask)

    # outputs[0] is treated as the per-class logits of the batch.
    logits = outputs[0].detach().cpu().numpy()
    label_ids = b_labels.to('cpu').numpy()

    predictions.append(logits)
    true_labels.append(label_ids)

print('    DONE.')

# NOTE(review): the score kept is the raw logit in column 1, not a softmax
# probability; its ranking can differ from P(class 1) -- confirm this is intended
# (flat_roc_auc uses the same convention).
flat_predictions = np.concatenate(predictions, axis=0)[:, 1].flatten()
flat_true_labels = np.concatenate(true_labels, axis=0)

# Positional assignment: relies on the sequential loader order matching df_val.
df_val['predict_bert'] = flat_predictions

print(roc_auc_score(flat_true_labels, df_val['predict_bert']))
# Compute the per-category AUC once and reuse it for the mean.
auc_by_category = df_val.groupby(['category']).apply(auc_group)
print(auc_by_category)
print(auc_by_category.mean())

# Create the target directory first so the inference run is not lost to a
# missing folder when the scores are written.
scores_path = '{}/bert_scores/epoch_2/X_val.csv'.format(FOLDER_PATH)
os.makedirs(os.path.dirname(scores_path), exist_ok=True)
df_val[['id', 'predict_bert']].to_csv(scores_path, sep='|')
# To reload: pd.read_csv('{}/bert_scores/epoch_2/X_val.csv'.format(FOLDER_PATH), sep='|', index_col=0)

Predicting labels for 98,449 test sentences...
  Batch    40  of    385.    Elapsed: 0:02:36.
  Batch    80  of    385.    Elapsed: 0:05:12.
  Batch   120  of    385.    Elapsed: 0:07:47.
  Batch   160  of    385.    Elapsed: 0:10:23.
  Batch   200  of    385.    Elapsed: 0:12:59.
  Batch   240  of    385.    Elapsed: 0:15:35.
  Batch   280  of    385.    Elapsed: 0:18:11.
  Batch   320  of    385.    Elapsed: 0:20:47.
  Batch   360  of    385.    Elapsed: 0:23:23.
    DONE.
0.9901755146496052
category
Бытовая электроника    0.984113
Для бизнеса            0.916818
Для дома и дачи        0.985664
Животные               0.994496
Личные вещи            0.991983
Недвижимость           0.991764
Работа                 0.975350
Транспорт              0.993404
Услуги                 0.983979
Хобби и отдых          0.977090
dtype: float64
0.9794662319041224


In [None]:
# Score the test split with the currently loaded `model`, print ROC AUC
# (overall, per category, and the unweighted mean over categories) and write the
# per-row scores to <FOLDER_PATH>/bert_scores/epoch_2/X_test.csv.
print('Predicting labels for {:,} test sentences...'.format(len(test_input_ids)))

t0 = time.time()
model.eval()  # evaluation mode (turns off the Dropout layers)

predictions, true_labels = [], []

for step, batch in enumerate(test_dataloader):
    # Progress line every 40 batches, skipping batch 0.
    if step % 40 == 0 and step != 0:
        elapsed = format_time(time.time() - t0)
        print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(
            step, len(test_dataloader), elapsed))

    batch = tuple(t.to(device) for t in batch)
    b_input_ids, b_input_mask, b_labels = batch

    # Labels are deliberately not passed: only the forward scores are needed.
    with torch.no_grad():
        outputs = model(
            b_input_ids,
            token_type_ids=None,
            attention_mask=b_input_mask)

    # outputs[0] is treated as the per-class logits of the batch.
    logits = outputs[0].detach().cpu().numpy()
    label_ids = b_labels.to('cpu').numpy()

    predictions.append(logits)
    true_labels.append(label_ids)

print('    DONE.')

# NOTE(review): the score kept is the raw logit in column 1, not a softmax
# probability; its ranking can differ from P(class 1) -- confirm this is intended
# (flat_roc_auc uses the same convention).
flat_predictions = np.concatenate(predictions, axis=0)[:, 1].flatten()
flat_true_labels = np.concatenate(true_labels, axis=0)

# Positional assignment: relies on the sequential loader order matching df_test.
df_test['predict_bert'] = flat_predictions

print(roc_auc_score(flat_true_labels, df_test['predict_bert']))
# Compute the per-category AUC once and reuse it for the mean.
auc_by_category = df_test.groupby(['category']).apply(auc_group)
print(auc_by_category)
print(auc_by_category.mean())

# Create the target directory first so the inference run is not lost to a
# missing folder when the scores are written.
scores_path = '{}/bert_scores/epoch_2/X_test.csv'.format(FOLDER_PATH)
os.makedirs(os.path.dirname(scores_path), exist_ok=True)
df_test[['id', 'predict_bert']].to_csv(scores_path, sep='|')
# To reload: pd.read_csv('{}/bert_scores/epoch_2/X_test.csv'.format(FOLDER_PATH), sep='|', index_col=0)

Predicting labels for 16,237 test sentences...
  Batch    40  of     64.    Elapsed: 0:02:36.
    DONE.
0.982158992977741
category
Бытовая электроника    0.968108
Для бизнеса            0.911899
Для дома и дачи        0.970597
Животные               0.965533
Личные вещи            0.859187
Недвижимость           0.991380
Работа                 0.979102
Транспорт              0.995556
Услуги                 0.951216
Хобби и отдых          0.929565
dtype: float64
0.9522144032508862
