In [22]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
print(os.listdir("../input"))

# Any results you write to the current directory are saved as output.

['imdb-review-dataset', 'corpus-of-russian-news-articles-from-lenta']


### Поиск гиперпараметров
Автоматический поиск гиперпараметров позволяет заметно упростить настройку модели. 
Ниже приведён код, позволяющий декларативно перечислить множества значений гиперпараметров.
Далее из этих множеств фиксированное количество раз (число итераций поиска) берутся значения (случайно или фиксированно), которые передаются в описываемую программистом функцию итерации (обучения модели на заданном наборе параметров). Функция также принимает текущее состояние поиска и обновляет его через возвращаемое значение. Это позволяет сохранять лучшую модель и сравнивать её с предыдущей или накапливать статистику по мере поиска.

In [2]:
import numpy as np
from collections import defaultdict

class HyperSampler:
    """Abstract set of hyperparameter values.

    A sampler object is itself responsible for drawing the requested number
    (``size``) of values from the set it represents.
    """

    def sample(self, size):
        """Return ``size`` values; concrete subclasses must override this."""
        raise NotImplementedError()

class LambdaSampler(HyperSampler):
    """Sampler whose drawing logic is a user-supplied callable.

    Example: ``LambdaSampler(lambda size: np.random.randn(size))`` draws from
    the standard normal distribution.
    """

    def __init__(self, fn):
        # Keep the callable; it is invoked with the requested size on demand.
        self._fn = fn
        super().__init__()

    def sample(self, size):
        """Call the stored callable with ``size`` and return its result."""
        draw = self._fn
        return draw(size)

class FixedArraySampler(HyperSampler):
    """Non-random sampler that hands out a prefix of a fixed array.

    The first ``size`` stored values are returned, one per search iteration,
    so the array must hold at least as many values as there are iterations.
    """

    def __init__(self, array):
        self._arr = np.asarray(array)
        super().__init__()

    def sample(self, size):
        """Return the first ``size`` values; raise ValueError if fewer are stored."""
        available = len(self._arr)
        if size <= available:
            return self._arr[:size]
        raise ValueError("len(self._arr) < size")

class RandomArraySampler(HyperSampler):
    """Uniform categorical sampler over a user-supplied collection.

    ``size`` elements are picked at random with replacement, i.e. the same
    element may be selected more than once.
    """

    def __init__(self, array):
        self._arr = np.asarray(array)
        super().__init__()

    def sample(self, size):
        """Draw ``size`` random positions and return the values stored there."""
        population = self._arr
        positions = np.random.choice(len(population), size=size)
        return population[positions]

def h_lambda(fn):
    """Shorthand for wrapping the callable ``fn`` in a ``LambdaSampler``."""
    sampler = LambdaSampler(fn)
    return sampler

def h_enum(*values):
    """Build a ``RandomArraySampler`` from positional arguments.

    Lets you write ``h_enum(32, 64, 128)`` instead of
    ``RandomArraySampler([32, 64, 128])``.
    """
    choices = values  # tuple holding the positional arguments
    return RandomArraySampler(choices)

def h_set(values):
    """Build a ``RandomArraySampler`` from an existing collection.

    Example: ``lst = [1, 2, 3]; h_set(lst)``. Exists mainly as a shorter,
    more abstract name for the sampler class.
    """
    sampler = RandomArraySampler(values)
    return sampler

def h_fixed_enum(*values):
    """Build a ``FixedArraySampler`` from positional arguments (fixed-order analogue of ``h_enum``)."""
    ordered = values  # tuple holding the positional arguments
    return FixedArraySampler(ordered)

def h_fixed_set(values):
    """Build a ``FixedArraySampler`` from an existing collection (fixed-order analogue of ``h_set``)."""
    sampler = FixedArraySampler(values)
    return sampler

def split_dictionary(d, criterion):
    """Partition a dictionary into two disjoint dictionaries.

    ``criterion(key, value)`` is called once per item, in the dictionary's
    iteration order.

    Returns a pair ``(matching, rest)``: the items for which the criterion
    was truthy, and all remaining items.
    """
    matching, rest = {}, {}
    for key, value in d.items():
        bucket = matching if criterion(key, value) else rest
        bucket[key] = value
    return matching, rest

def hyper_search(num_trials, parameters,
                            iteration_function,
                            initial_state,
                            progress_bar=None):
    """Run a hyperparameter search as a reduce over ``num_trials`` trials.

    Parameters:
    num_trials -- number of search iterations.
    parameters -- dictionary keyed by parameter name. A value is either a fixed
        value (passed unchanged to every iteration) or a HyperSampler instance,
        from which a new value is taken for every iteration.
    iteration_function -- user-supplied iteration function.
        Its first argument is the current search state (any object),
        the second is the iteration number.
        The entries of ``parameters`` are unpacked into it as keyword arguments,
        so it must either name every key in its own parameter list
        or accept arbitrary keyword arguments (**kwargs).
        It returns the new search state, which makes the search a reduce algorithm.
    initial_state -- initial search state; any object, including None.
        In particular, the result of a previous hyper_search run can be passed here.
    progress_bar -- None (no progress display), 'tqdm' (text progress bar) or
        'tqdm_notebook' (Jupyter widget). Any other value also disables the bar.

    Returns:
    The final search state (the result of the last iteration_function call, or
    ``initial_state`` itself when ``num_trials`` is 0).

    Raises:
    ValueError -- if a sampler returns a list, tuple or NumPy array that holds
        fewer than ``num_trials`` values.
    """
    # Split the dictionary by whether a value has to be sampled.
    sampled, fixed = split_dictionary(parameters, lambda _, v: isinstance(v, HyperSampler))

    # Draw num_trials values for every non-fixed parameter up front.
    random_queue = {}
    for name, population in sampled.items():
        values = population.sample(num_trials)
        # Fail before the first (potentially expensive) trial rather than with an
        # IndexError in the middle of the search. Only plain sequences are checked,
        # so custom indexable results keep working as before.
        is_sequence = isinstance(values, (list, tuple)) or (
            isinstance(values, np.ndarray) and values.ndim > 0)
        if is_sequence and len(values) < num_trials:
            raise ValueError(
                "sampler for parameter {!r} produced {} values, but {} trials were requested".format(
                    name, len(values), num_trials))
        random_queue[name] = values

    if progress_bar == 'tqdm':
        from tqdm import tqdm as tqdm_function
    elif progress_bar == 'tqdm_notebook':
        from tqdm import tqdm_notebook as tqdm_function
    else:
        def tqdm_function(iterable, **kws):
            return iterable

    # Turn NumPy scalars into the corresponding Python objects so that
    # functions expecting plain Python values do not break.
    def decay(x):
        if isinstance(x, np.generic):
            return x.item()
        return x

    current_state = initial_state

    for trial in tqdm_function(range(num_trials), desc='Trial #'):
        # Gather this trial's sampled values together with the fixed ones.
        current_trial_random = {k: decay(sample[trial]) for k, sample in random_queue.items()}
        iteration_setting = {**current_trial_random, **fixed}
        # Run the user's iteration function; its return value is the new search state.
        current_state = iteration_function(current_state, trial, **iteration_setting)

    return current_state

In [63]:
import torch
import torch.nn as nn

In [40]:
# Seed the global NumPy and PyTorch random number generators with fixed values.
np.random.seed(5771)
torch.manual_seed(5661)

<torch._C.Generator at 0x7f32e4371630>

In [5]:
class PersistentModelWrapper:
    """Keeps a criterion value and checkpoints a model/optimizer pair to one file."""

    def __init__(self, path, initial_criterion):
        self.path, self.criterion = path, initial_criterion

    def update(self, model, optimizer, criterion):
        """Record ``criterion`` and write a checkpoint to ``self.path``."""
        self.criterion = criterion
        checkpoint = {
            'model_state': model.state_dict(),
            'optimizer_state': optimizer.state_dict(),
            'criterion': criterion,
        }
        torch.save(checkpoint, self.path)

    def load_model_data(self):
        """Return the checkpoint dictionary read from ``self.path``."""
        checkpoint = torch.load(self.path)
        return checkpoint

    def restore(self, model, optimizer):
        """Load the checkpointed states into ``model`` and ``optimizer`` in place."""
        checkpoint = self.load_model_data()
        model.load_state_dict(checkpoint['model_state'])
        optimizer.load_state_dict(checkpoint['optimizer_state'])


In [6]:
!pip install pytorch-pretrained-bert

Collecting pytorch-pretrained-bert
[?25l  Downloading https://files.pythonhosted.org/packages/d7/e0/c08d5553b89973d9a240605b9c12404bcf8227590de62bae27acbcfe076b/pytorch_pretrained_bert-0.6.2-py3-none-any.whl (123kB)
[K    100% |████████████████████████████████| 133kB 7.4MB/s ta 0:00:01
Installing collected packages: pytorch-pretrained-bert
Successfully installed pytorch-pretrained-bert-0.6.2
[33mYou are using pip version 19.0.3, however version 19.1.1 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m


In [7]:
from pytorch_pretrained_bert import BertTokenizer, BertModel, BertForMaskedLM, BertForSequenceClassification

In [8]:
import logging
logging.basicConfig(level=logging.INFO)

In [9]:
logger = logging.getLogger(__name__)


In [10]:
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')

INFO:pytorch_pretrained_bert.file_utils:https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-cased-vocab.txt not found in cache, downloading to /tmp/tmpgdjeol4r
100%|██████████| 995526/995526 [00:00<00:00, 2028901.77B/s]
INFO:pytorch_pretrained_bert.file_utils:copying /tmp/tmpgdjeol4r to cache at /tmp/.pytorch_pretrained_bert/96435fa287fbf7e469185f1062386e05a075cadbf6838b74da22bf64b080bc32.99bcd55fc66f4f3360bc49ba472b940b8dcf223ea6a345deb969d607ca900729
INFO:pytorch_pretrained_bert.file_utils:creating metadata file for /tmp/.pytorch_pretrained_bert/96435fa287fbf7e469185f1062386e05a075cadbf6838b74da22bf64b080bc32.99bcd55fc66f4f3360bc49ba472b940b8dcf223ea6a345deb969d607ca900729
INFO:pytorch_pretrained_bert.file_utils:removing temp file /tmp/tmpgdjeol4r
INFO:pytorch_pretrained_bert.tokenization:loading vocabulary file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-cased-vocab.txt from cache at /tmp/.pytorch_pretrained_bert/96435fa287fbf7e4

In [11]:
# Tokenize a Russian sentence that is already wrapped in [CLS] / [SEP] markers.
text = "[CLS] Каким образом так получилось, что мы стоим на краю этой дороги [SEP]"
tokenized_text = tokenizer.tokenize(text)
# Replace the token at position 5 (counting [CLS] as position 0) with [MASK].
masked_index = 5
tokenized_text[masked_index] = '[MASK]'
print(tokenized_text)

['[CLS]', 'Как', '##им', 'образом', 'так', '[MASK]', '##ось', ',', 'что', 'мы', 'сто', '##им', 'на', 'краю', 'этой', 'дороги', '[SEP]']


In [12]:
# Map every token to its vocabulary id; all positions get segment id 0.
indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
segments_ids = [0] * len(indexed_tokens)
print(indexed_tokens)

[101, 23220, 13478, 20417, 12123, 103, 16353, 117, 10791, 35818, 108804, 13478, 10122, 50629, 18079, 33949, 102]


In [13]:
tokens_tensor = torch.tensor([indexed_tokens])
segments_tensors = torch.tensor([segments_ids])

In [14]:
# # model = BertForMaskedLM.from_pretrained('bert-base-multilingual-cased').eval()
# model = model.cuda()
# with torch.no_grad():
#     predictions = model.bert(tokens_tensor.cuda(), segments_tensors.cuda())
# topk = torch.topk(predictions[0,masked_index],10)
# print(topk)

In [15]:
class InputExample:
    """One training/test example for simple sequence classification."""

    def __init__(self, guid, text_a, text_b=None, label=None):
        """Store the fields of a single example.

        Args:
            guid: Unique id for the example.
            text_a: string. Untokenized text of the first sequence; the only
                text required for single-sequence tasks.
            text_b: (Optional) string. Untokenized text of the second sequence;
                needed only for sequence-pair tasks.
            label: (Optional) string. Label of the example; expected for train
                and dev examples, not for test examples.
        """
        self.guid, self.text_a = guid, text_a
        self.text_b, self.label = text_b, label

In [16]:
class InputFeatures:
    """Feature values for one example: token ids, attention mask, segment ids and label id."""

    def __init__(self, input_ids, input_mask, segment_ids, label_id):
        # Plain attribute container; the values are stored exactly as given.
        self.input_ids, self.input_mask = input_ids, input_mask
        self.segment_ids, self.label_id = segment_ids, label_id

In [17]:
# class DataProcessor(object):
#     """Base class for data converters for sequence classification data sets."""

#     def get_train_examples(self, data_dir):
#         """Gets a collection of `InputExample`s for the train set."""
#         raise NotImplementedError()

#     def get_dev_examples(self, data_dir):
#         """Gets a collection of `InputExample`s for the dev set."""
#         raise NotImplementedError()

#     def get_labels(self):
#         """Gets the list of labels for this data set."""
#         raise NotImplementedError()

#     @classmethod
#     def _read_tsv(cls, input_file, quotechar=None):
#         """Reads a tab separated value file."""
#         with open(input_file, "r", encoding="utf-8") as f:
#             reader = csv.reader(f, delimiter="\t", quotechar=quotechar)
#             lines = []
#             for line in reader:
#                 if sys.version_info[0] == 2:
#                     line = list(unicode(cell, 'utf-8') for cell in line)
#                 lines.append(line)
#             return lines

In [18]:
def convert_examples_to_features(examples, label_list, max_seq_length,
                                 tokenizer, output_mode, total_examples=None):
    """Convert ``InputExample`` objects into a list of padded ``InputFeatures``.

    Args:
        examples: iterable of examples exposing ``guid``, ``text_a``, ``text_b``
            and ``label``. It may be a generator.
        label_list: labels; a label's position in this list becomes its id in
            "classification" mode.
        max_seq_length: length every feature list is truncated/padded to,
            including the [CLS] and [SEP] markers.
        tokenizer: object providing ``tokenize(text)`` and
            ``convert_tokens_to_ids(tokens)``.
        output_mode: "classification" (label looked up in ``label_list``) or
            "regression" (label converted with ``float``).
        total_examples: (Optional) number of examples, used only in the
            progress log. When omitted, ``len(examples)`` is used if available.

    Returns:
        A list with one ``InputFeatures`` per example.

    Raises:
        KeyError: if ``output_mode`` is neither of the two supported values, or
            if a classification label is missing from ``label_list``.
    """

    label_map = {label : i for i, label in enumerate(label_list)}

    if total_examples is None:
        # Sized collections can report their own length; generators cannot.
        try:
            total_examples = len(examples)
        except TypeError:
            total_examples = None

    features = []
    for (ex_index, example) in enumerate(examples):
        if ex_index % 10000 == 0:
            if total_examples is None:
                # Avoid the misleading "of None" when the total is unknown.
                logger.info("Writing example {}".format(ex_index))
            else:
                logger.info("Writing example {} of {}".format(ex_index, total_examples))

        tokens_a = tokenizer.tokenize(example.text_a)

        tokens_b = None
        if example.text_b:
            tokens_b = tokenizer.tokenize(example.text_b)
            # Modifies `tokens_a` and `tokens_b` in place so that the total
            # length is less than the specified length.
            # Account for [CLS], [SEP], [SEP] with "- 3"
            _truncate_seq_pair(tokens_a, tokens_b, max_seq_length - 3)
        else:
            # Account for [CLS] and [SEP] with "- 2"
            if len(tokens_a) > max_seq_length - 2:
                tokens_a = tokens_a[:(max_seq_length - 2)]

        # The convention in BERT is:
        # (a) For sequence pairs:
        #  tokens:   [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
        #  type_ids: 0   0  0    0    0     0       0 0    1  1  1  1   1 1
        # (b) For single sequences:
        #  tokens:   [CLS] the dog is hairy . [SEP]
        #  type_ids: 0   0   0   0  0     0 0
        #
        # Where "type_ids" are used to indicate whether this is the first
        # sequence or the second sequence. The embedding vectors for `type=0` and
        # `type=1` were learned during pre-training and are added to the wordpiece
        # embedding vector (and position vector). This is not *strictly* necessary
        # since the [SEP] token unambiguously separates the sequences, but it makes
        # it easier for the model to learn the concept of sequences.
        #
        # For classification tasks, the first vector (corresponding to [CLS]) is
        # used as as the "sentence vector". Note that this only makes sense because
        # the entire model is fine-tuned.
        tokens = ["[CLS]"] + tokens_a + ["[SEP]"]
        segment_ids = [0] * len(tokens)

        if tokens_b:
            tokens += tokens_b + ["[SEP]"]
            segment_ids += [1] * (len(tokens_b) + 1)

        input_ids = tokenizer.convert_tokens_to_ids(tokens)

        # The mask has 1 for real tokens and 0 for padding tokens. Only real
        # tokens are attended to.
        input_mask = [1] * len(input_ids)

        # Zero-pad up to the sequence length.
        padding = [0] * (max_seq_length - len(input_ids))
        input_ids += padding
        input_mask += padding
        segment_ids += padding

        assert len(input_ids) == max_seq_length
        assert len(input_mask) == max_seq_length
        assert len(segment_ids) == max_seq_length

        if output_mode == "classification":
            label_id = label_map[example.label]
        elif output_mode == "regression":
            label_id = float(example.label)
        else:
            raise KeyError(output_mode)

        # Log the first few examples in full so the conversion can be inspected.
        if ex_index < 5:
            logger.info("*** Example ***")
            logger.info("guid: %s" % (example.guid))
            logger.info("tokens: %s" % " ".join(
                    [str(x) for x in tokens]))
            logger.info("input_ids: %s" % " ".join([str(x) for x in input_ids]))
            logger.info("input_mask: %s" % " ".join([str(x) for x in input_mask]))
            logger.info(
                    "segment_ids: %s" % " ".join([str(x) for x in segment_ids]))
            # %s rather than %d: in regression mode the id is a float and %d would truncate it.
            logger.info("label: %s (id = %s)" % (example.label, label_id))

        features.append(
                InputFeatures(input_ids=input_ids,
                              input_mask=input_mask,
                              segment_ids=segment_ids,
                              label_id=label_id))
    return features


In [19]:
def _truncate_seq_pair(tokens_a, tokens_b, max_length):
    """Truncates a sequence pair in place to the maximum length."""

    # This is a simple heuristic which will always truncate the longer sequence
    # one token at a time. This makes more sense than truncating an equal percent
    # of tokens from each, since if one sequence is very short then each token
    # that's truncated likely contains more information than a longer sequence.
    while True:
        total_length = len(tokens_a) + len(tokens_b)
        if total_length <= max_length:
            break
        if len(tokens_a) > len(tokens_b):
            tokens_a.pop()
        else:
            tokens_b.pop()

In [20]:
import gc
gc.collect()

0

In [23]:
# NOTE(review): the file is decoded as latin-1, yet the tokens logged by the
# feature-conversion cells contain sequences such as "Ã ¡" and "Ã ##©", which look like
# UTF-8 text decoded as latin-1 — confirm the file's actual encoding.
imdb_df = pd.read_csv('../input/imdb-review-dataset/imdb_master.csv', encoding='latin-1')

In [24]:
imdb_df.sample(10)

Unnamed: 0.1,Unnamed: 0,type,review,label,file
68730,68730,train,Noticed this on cable last night and wasn't su...,unsup,26859_0.txt
15866,15866,test,The Dekalog 5 may be considered a violent accu...,pos,1780_10.txt
98915,98915,train,"I wasn't expecting much from this film, but I ...",unsup,9024_0.txt
34023,34023,train,"This show had pretty good stories, but bad dia...",neg,6872_2.txt
24435,24435,test,I just re-watched a few episodes of this serie...,pos,9493_10.txt
91669,91669,train,By 1950 Hollywood gave Dean Stockwell a lead r...,unsup,47503_0.txt
80152,80152,train,Viewers preferring a straightforward story and...,unsup,37138_0.txt
76857,76857,train,I appreciate that the series tries to give us ...,unsup,34172_0.txt
94688,94688,train,"I picked this one up as a ""blind"" rental and w...",unsup,521_0.txt
53942,53942,train,"""I just want to get my clothes on and get the ...",unsup,13549_0.txt


In [25]:
# Rows of the 'train' split, excluding those labelled 'unsup'.
dev_df = imdb_df[(imdb_df.type == 'train') & (imdb_df.label != 'unsup')]

In [26]:
# All rows whose type is 'test'.
test_df = imdb_df[(imdb_df.type == 'test')]

In [27]:
from sklearn import model_selection

In [28]:
# Hold out 5% of dev_df for validation, stratified by label.
# NOTE(review): no random_state is passed, so the split presumably depends on the
# global NumPy RNG state at the time this cell runs — confirm the seeding cell runs first.
train_df, val_df = model_selection.train_test_split(dev_df, test_size=0.05, stratify=dev_df.label)

In [29]:
# Sanity check: print the `type` field of the first training row only.
# iterrows() yields (index, Series) pairs, hence row[1]; the loop stops after one row.
for row in train_df.iterrows():
    print(row[1].type)
    break

train


In [30]:
def df_to_examples_imdb(df):
    """Lazily yield one ``InputExample`` per row of ``df``.

    The row's index value is used as the guid, the ``review`` field as the
    text and the ``label`` field as the label.
    """
    for row_id, record in df.iterrows():
        yield InputExample(row_id, record.review, label=record.label)

In [31]:
# The examples come from a generator, which has no length of its own, so the row count
# is passed explicitly; the progress log then reads "of <total>" instead of "of None".
train_features = convert_examples_to_features(df_to_examples_imdb(train_df),
                                              ['neg','pos'],
                                              max_seq_length=230,
                                              tokenizer=tokenizer,
                                              output_mode="classification",
                                              total_examples=len(train_df))

INFO:__main__:Writing example 0 of None
INFO:__main__:*** Example ***
INFO:__main__:guid: 44840
INFO:__main__:tokens: [CLS] This comic book style film is fun ##ny , has nic ##ely pace ##d action and a great futur ##istic style to it . Writer Steven de Souza , who also wrote Commando , gives Ar ##nie pl ##enty of lines to dis ##h out : " Sen ##d me a copy , " after signing a contract and sta ##bbi ##ng a pen into the lawyer ##s back ; " What a pain in the neck , " after stran ##gli ##ng sub ##zero with bar ##bed wire ; " He had to split , " after sl ##ici ##ng his body between his legs ; and finally , as Kill ##ian sl ##ams through a bill ##board bearing his own face , Ar ##nie conclude ##s , " Now that hit the spot . " Fun ##nil ##y enough , bears some similar ##ities total reca ##ll , another sci - fi fl ##ick starring Schwarz ##ene ##gger . [SEP]
INFO:__main__:input_ids: 101 10747 31761 12748 13351 10458 10124 41807 10756 117 10393 46267 44096 32547 10162 14204 10111 169 14772 33864 

INFO:__main__:label: neg (id = 0)
INFO:__main__:*** Example ***
INFO:__main__:guid: 49716
INFO:__main__:tokens: [CLS] Ok ##ay , let ' s face it . this is a god - aw ##ful movie . The plot ( such as it is ) is ho ##rri ##ble , the acting worse . But the movie was made for one reason and one reason only , like all of those aw ##ful Mario Lan ##za movies . . . just to hear the voice of the star , in this case Pa ##varo ##tti in his prime . Ok ##ay , so may ##be the Lan ##za movies were also an ex ##cus ##e for him to hit on women , but this movie is about hearing Luciano . That alone is worth watching the movie . A big opera star st ##uck on himself faces his fear ##s , finds hu ##mil ##ity and love along the way , and belt ##s out a lot of hit numbers , too . < br / > < br / > I must ad ##mit I ' m prej ##udi ##ced on a number of levels . I ' m Italian . I ' m a big Pa ##varo ##tti fan ( is there anything about Pa ##varo ##tti that isn ' t big , including his fan base ? ) . And when I fi

In [32]:
# The examples come from a generator, which has no length of its own, so the row count
# is passed explicitly; the progress log then reads "of <total>" instead of "of None".
val_features = convert_examples_to_features(df_to_examples_imdb(val_df),
                                              ['neg','pos'],
                                              max_seq_length=230,
                                              tokenizer=tokenizer,
                                              output_mode="classification",
                                              total_examples=len(val_df))

INFO:__main__:Writing example 0 of None
INFO:__main__:*** Example ***
INFO:__main__:guid: 49448
INFO:__main__:tokens: [CLS] As many people know , Mexican cinema was very poor after the so - called Golden Age of the Mexican Cinema , fortuna ##tely , during the late 90 ' s , and early 21st century , great movies like La Ley de Hero ##des , Bajo California , Amore ##s Per ##ros , Y Tu Ma ##m ##Ã ¡ Tam ##bi ##Ã ##© ##n and , of course , El Coronel No Tiene Qui ##en le Es ##cri ##ba , appeared . El Coronel . . . , is a won ##der ##ful movie , that rete ##lls the classic story by Gabriel Ga ##rc ##Ã ##a M ##Ã ¡ r ##quez , by eli ##minat ##ing the magic real ##ism elements , and replacing them with the c ##rude reality lived in Mexico , not only by people like the Colonel , who wait for their pension ##s , but by more than the half of the Mexican population , who live in complete poverty . The film ' s characters , sat ##iri ##cally represent classic characters found in Mexican society , such

INFO:__main__:segment_ids: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
INFO:__main__:label: pos (id = 1)
INFO:__main__:*** Example ***
INFO:__main__:guid: 42675
INFO:__main__:tokens: [CLS] I give this movie an A + for the she ##er camp of it ! As Dietrich ' s daughter Maria Riva wrote in the book on her mother , " If one sees The Garden of Allah in the context of high camp , it can be very am ##using . " And how ! I lau ##ghed with del ##ight at the over ##wr ##ough ##t score and the as ##tou ##nding ##ly , ri ##dic ##ulo ##usly , fant ##astic ##ally me ##lod ##rama ##tic dialo

In [33]:
def features_to_tensors(list_of_features):
    """Collect the fields of feature objects into four ``torch.long`` tensors.

    Returns a tuple ``(input_ids, input_mask, segment_ids, label_id)`` where
    each tensor holds the corresponding attribute of every feature, in order.
    """
    def column(attribute):
        # One tensor per attribute, built from that attribute of each feature.
        return torch.tensor([getattr(f, attribute) for f in list_of_features], dtype=torch.long)

    return tuple(column(name) for name in ('input_ids', 'input_mask', 'segment_ids', 'label_id'))

In [34]:
from torch.utils.data import TensorDataset,DataLoader

In [35]:
train_text_tensor, train_mask_tensor, train_segment_tensor, train_label_tensor = features_to_tensors(train_features)
val_text_tensor, val_mask_tensor, val_segment_tensor, val_label_tensor = features_to_tensors(val_features)

In [36]:
train_dataset = TensorDataset(train_text_tensor, train_mask_tensor, train_segment_tensor, train_label_tensor)
val_dataset = TensorDataset(val_text_tensor, val_mask_tensor, val_segment_tensor, val_label_tensor)

In [37]:
print(train_label_tensor[:2])

tensor([1, 1])


In [38]:
from pytorch_pretrained_bert import BertAdam

In [None]:
# bert_model = BertForSequenceClassification.from_pretrained('bert-base-multilingual-cased',num_labels=2).cuda()

In [None]:
!nvidia-smi

In [None]:
# Batch size: 16, 32
# • Learning rate (Adam): 5e-5, 3e-5, 2e-5
# • Number of epochs: 3, 4

Опишем настройки для дообучения, основываясь на секции 3.5 (https://arxiv.org/abs/1810.04805)

In [41]:
from pytorch_pretrained_bert import BertConfig

In [58]:
import os

In [59]:
class BertPersistentWrapper:
    """Persists a BERT sequence classifier on disk and remembers its criterion value.

    Two files are derived from ``prefix``: ``<prefix>_model.bin`` (weights) and
    ``<prefix>_config.bin`` (model configuration as JSON).
    """

    def __init__(self, prefix, initial_criterion, num_labels):
        self.prefix = prefix
        self.model_path = prefix + '_model.bin'
        self.config_path = prefix + '_config.bin'
        self.criterion = initial_criterion
        self.num_labels = num_labels

    def update(self, model, criterion):
        """Write ``model`` to disk and record ``criterion`` for it.

        If ``model`` has a ``module`` attribute (e.g. a parallel wrapper), the
        wrapped module is the one saved.
        """
        model_to_save = model.module if hasattr(model, 'module') else model
        torch.save(model_to_save.state_dict(), self.model_path)
        model_to_save.config.to_json_file(self.config_path)
        # Recorded only after both files were written, so the stored criterion
        # never describes a model that failed to save.
        self.criterion = criterion

    def restore(self):
        """Rebuild the saved model from the config and weight files and return it."""
        config = BertConfig.from_json_file(self.config_path)
        model = BertForSequenceClassification(config, num_labels=self.num_labels)
        # Load the weights onto the CPU: the freshly built model is not moved to any
        # device here, and this also works when the checkpoint was saved from a GPU.
        model.load_state_dict(torch.load(self.model_path, map_location='cpu'))
        return model

    def destroy(self):
        """Delete the files written by ``update``; missing files are ignored."""
        for path in (self.model_path, self.config_path):
            try:
                os.remove(path)
            except FileNotFoundError:
                # Nothing was saved (or it was already removed): nothing to clean up.
                pass

In [111]:
# Imported here because no earlier cell in view brings `namedtuple` into scope.
from collections import namedtuple

# Immutable record with two fields, `best_model` and `parameter_stats`.
# The plain class of the same name that used to precede this line was immediately
# shadowed by this assignment, so only the namedtuple is kept.
SearchState = namedtuple('SearchState', ['best_model', 'parameter_stats'])

In [62]:
def bert_wrapper_test():
    """Smoke-test BertPersistentWrapper: save a pretrained model, restore it, clean up.

    Prints the label count of the original model, then the restored model and
    its label count. The files written by ``update`` are removed even when
    restoring or printing fails.
    """
    model = BertForSequenceClassification.from_pretrained('bert-base-multilingual-cased',num_labels=2)
    print(model.num_labels)
    bw = BertPersistentWrapper('wrapper_test',10, 2)
    bw.update(model,5)
    try:
        res_model = bw.restore()
        print(res_model)
        print(res_model.num_labels)
    finally:
        # Always delete the checkpoint files so a failed run does not leave them on disk.
        bw.destroy()
    
bert_wrapper_test()

INFO:pytorch_pretrained_bert.modeling:loading archive file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-cased.tar.gz from cache at /tmp/.pytorch_pretrained_bert/731c19ddf94e294e00ec1ba9a930c69cc2a0fd489b25d3d691373fae4c0986bd.4e367b0d0155d801930846bb6ed98f8a7c23e0ded37888b29caa37009a40c7b9
INFO:pytorch_pretrained_bert.modeling:extracting archive file /tmp/.pytorch_pretrained_bert/731c19ddf94e294e00ec1ba9a930c69cc2a0fd489b25d3d691373fae4c0986bd.4e367b0d0155d801930846bb6ed98f8a7c23e0ded37888b29caa37009a40c7b9 to temp dir /tmp/tmph5u6v7z8
INFO:pytorch_pretrained_bert.modeling:Model config {
  "attention_probs_dropout_prob": 0.1,
  "directionality": "bidi",
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "max_position_embeddings": 512,
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pooler_fc_size": 768,
  "pooler_num_attention_heads": 12,
  "pooler_num_fc_layers

2
BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(119547, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): BertLayerNorm()
      (dropout): Dropout(p=0.1)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): BertLayerNorm()
              (dropout): Dropout(p=0.1)
            )
          )
          (intermed

In [61]:
!ls

__notebook_source__.ipynb  wrapper_test_config.bin  wrapper_test_model.bin


In [66]:
import gc
gc.collect()

8

In [65]:
from tqdm import tqdm_notebook

In [112]:
def train_bert(hyper_state, hyper_trial,
               n_epochs,
               gradient_accumulation_steps,
               batch_size,
               learning_rate,
               warmup_proportion,
               train_dataset,
               val_dataset,
               num_labels,
               device):
    """Fine-tune ``bert-base-multilingual-cased`` for one search trial and update the search state.

    Args:
        hyper_state: search state produced by the previous trials, or a falsy
            value on the first trial.
        hyper_trial: trial identifier; printed and used in the checkpoint name
            ``model{hyper_trial}.md``.
        n_epochs: number of passes over ``train_dataset``.
        gradient_accumulation_steps: number of mini-batches whose gradients are
            accumulated before each optimizer step.
        batch_size: mini-batch size used by both data loaders.
        learning_rate: ``lr`` passed to ``BertAdam``.
        warmup_proportion: ``warmup`` passed to ``BertAdam``.
        train_dataset: dataset whose items unpack as
            ``(input_ids, input_mask, segment_ids, label_ids)``.
        val_dataset: dataset with the same layout, evaluated after every epoch.
        num_labels: number of target classes.
        device: device the model and every batch are moved to.

    Returns:
        A new ``SearchState(best_model, [param_stats])`` when ``hyper_state`` is
        falsy. Otherwise ``hyper_state`` itself, with this trial's statistics
        appended and its ``best_model`` replaced when this trial reached a
        strictly higher criterion (the losing wrapper is destroyed).
    """
    print('Trial', hyper_trial)
    print('n_epochs = {}, effective_batch_size={}, lr={}, warmup={}'.format(n_epochs,
                                                                            batch_size * gradient_accumulation_steps,
                                                                            learning_rate,
                                                                            warmup_proportion))

    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size)

    # optimizer.step() fires once per completed accumulation window in each epoch,
    # so the schedule length is derived from the loader length (which includes the
    # partial last batch) rather than from len(dataset) / batch_size.
    steps_per_epoch = len(train_loader) // gradient_accumulation_steps
    num_train_optimization_steps = n_epochs * steps_per_epoch

    model = BertForSequenceClassification.from_pretrained('bert-base-multilingual-cased', num_labels=num_labels)
    model = model.to(device)

    optimizer = BertAdam(model.parameters(), lr=learning_rate, warmup=warmup_proportion,
                         t_total=num_train_optimization_steps)

    best_model = BertPersistentWrapper(f'model{hyper_trial}.md', 0.0, num_labels)

    loss_fct = nn.CrossEntropyLoss()
    # Defined up front so the statistics below exist even when n_epochs == 0.
    val_accuracy = 0.0

    try:
        for epoch in tqdm_notebook(range(n_epochs), desc='Epoch'):
            model.train()
            tr_loss = 0.0
            for step, batch in enumerate(tqdm_notebook(train_loader, desc="Iteration")):
                batch = tuple(t.to(device) for t in batch)
                input_ids, input_mask, segment_ids, label_ids = batch

                # labels=None makes the model return logits; the loss is computed here.
                logits = model(input_ids, segment_ids, input_mask, labels=None)
                loss = loss_fct(logits.view(-1, num_labels), label_ids.view(-1))

                # Track the unscaled loss so the reported mean does not depend on
                # gradient_accumulation_steps.
                tr_loss += loss.item()

                if gradient_accumulation_steps > 1:
                    loss = loss / gradient_accumulation_steps

                loss.backward()
                # NOTE: mini-batches left over at the end of an epoch (when the loader
                # length is not a multiple of gradient_accumulation_steps) stay in .grad
                # and are folded into the first update of the next epoch.
                if (step + 1) % gradient_accumulation_steps == 0:
                    optimizer.step()
                    optimizer.zero_grad()

            tr_loss /= len(train_loader)
            print('Epoch {}, training_loss={}'.format(epoch, tr_loss))

            model.eval()
            running_corrects = 0
            running_total = 0
            running_loss = 0.0
            with torch.no_grad():
                for batch in tqdm_notebook(val_loader):
                    batch = tuple(t.to(device) for t in batch)
                    input_ids, input_mask, segment_ids, label_ids = batch

                    logits = model(input_ids, segment_ids, input_mask, labels=None)
                    flat_logits = logits.view(-1, num_labels)
                    flat_labels = label_ids.view(-1)
                    preds = flat_logits.argmax(dim=1)

                    running_total += input_ids.size(0)
                    running_corrects += (preds == flat_labels).sum().item()
                    running_loss += loss_fct(flat_logits, flat_labels).item()

            val_loss = running_loss / len(val_loader)
            val_accuracy = running_corrects / running_total
            print('Epoch {}, val_loss={}, val_accuracy={}'.format(epoch, val_loss, val_accuracy))

            if val_accuracy > best_model.criterion:
                best_model.update(model, val_accuracy)
    finally:
        # Drop this function's references to the model and to the optimizer (which
        # also references the parameters) before emptying the cache, and do so even
        # when training fails, so the next trial does not start with this trial's
        # weights still allocated.
        del model, optimizer
        torch.cuda.empty_cache()

    param_stats = dict(
        n_epochs=n_epochs,
        gradient_batch=batch_size * gradient_accumulation_steps,
        lr=learning_rate,
        # Accuracy of the last epoch; the best one is kept in best_model.criterion.
        accuracy=val_accuracy
    )

    if not hyper_state:
        return SearchState(best_model, [param_stats])

    hyper_state.parameter_stats.append(param_stats)
    if best_model.criterion > hyper_state.best_model.criterion:
        hyper_state.best_model.destroy()
        hyper_state.best_model = best_model
    else:
        best_model.destroy()
    return hyper_state
                

In [113]:
# Keyword arguments handed to train_bert on every trial; the keys match its
# parameter names. Entries wrapped in h_enum(...) list several candidate values,
# the remaining entries are fixed.
# NOTE(review): how hyper_search draws from h_enum is defined elsewhere — confirm there.
settings_for_random_search = dict(
    learning_rate=h_enum(5e-5, 3e-5, 2e-5),
    warmup_proportion=0.1,
    gradient_accumulation_steps=h_enum(1, 2),
    batch_size=16,
    n_epochs=h_enum(3, 4),
    train_dataset=train_dataset,
    val_dataset=val_dataset,
    num_labels=2,
    device=torch.device('cuda'),
)

In [114]:
# Run the hyper-parameter search with train_bert as the per-trial function.
# NOTE(review): 6 is presumably the number of trials, None the initial search
# state and 'tqdm_notebook' the progress-bar backend — confirm against
# hyper_search's signature.
result_state = hyper_search(6,settings_for_random_search,train_bert,None,'tqdm_notebook')


HBox(children=(IntProgress(value=0, description='Trial #', max=4, style=ProgressStyle(description_width='initi…

Trial 0
n_epochs = 1, effective_batch_size=16, lr=5e-05, warmup=0.1


INFO:pytorch_pretrained_bert.modeling:loading archive file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-cased.tar.gz from cache at /tmp/.pytorch_pretrained_bert/731c19ddf94e294e00ec1ba9a930c69cc2a0fd489b25d3d691373fae4c0986bd.4e367b0d0155d801930846bb6ed98f8a7c23e0ded37888b29caa37009a40c7b9
INFO:pytorch_pretrained_bert.modeling:extracting archive file /tmp/.pytorch_pretrained_bert/731c19ddf94e294e00ec1ba9a930c69cc2a0fd489b25d3d691373fae4c0986bd.4e367b0d0155d801930846bb6ed98f8a7c23e0ded37888b29caa37009a40c7b9 to temp dir /tmp/tmp2oit5kow
INFO:pytorch_pretrained_bert.modeling:Model config {
  "attention_probs_dropout_prob": 0.1,
  "directionality": "bidi",
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "max_position_embeddings": 512,
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pooler_fc_size": 768,
  "pooler_num_attention_heads": 12,
  "pooler_num_fc_layers

HBox(children=(IntProgress(value=0, description='Epoch', max=1, style=ProgressStyle(description_width='initial…

HBox(children=(IntProgress(value=0, description='Iteration', max=16, style=ProgressStyle(description_width='in…

RuntimeError: CUDA out of memory. Tried to allocate 43.12 MiB (GPU 0; 15.90 GiB total capacity; 14.11 GiB already allocated; 17.88 MiB free; 438.52 MiB cached)

In [121]:
# Tabulate the per-trial statistics collected by train_bert (one dict per trial).
# Requires the hyper_search cell to have finished: result_state is only bound
# when that call returns.
pd.DataFrame.from_records(result_state.parameter_stats)

NameError: name 'result_state' is not defined

In [76]:
# Obtain the model held by the search's best_model wrapper.
# NOTE(review): restore() presumably reloads the persisted weights — confirm
# against BertPersistentWrapper.
bert_model = result_state.best_model.restore()

In [119]:
# Release cached GPU memory that PyTorch's allocator is holding but not using.
torch.cuda.empty_cache()

In [85]:
# Move the model to the GPU for inference.
bert_model = bert_model.cuda()

In [117]:
import gc
# Run a full garbage-collection pass; the value shown under the cell is
# gc.collect()'s return value (the number of unreachable objects found).
gc.collect()

2617

In [91]:
def predict_loader(bert_model, loader, device='cuda'):
    """Run ``bert_model`` over every batch of ``loader`` and collect predictions with their labels.

    Args:
        bert_model: model invoked as
            ``bert_model(input_ids, segment_ids, input_mask, labels=None)``;
            it is switched to eval mode before the loop.
        loader: iterable of batches, each unpacking into
            ``(input_ids, input_mask, segment_ids, label_ids)``.
        device: device every tensor of a batch is moved to before the forward pass.

    Returns:
        A pair ``(predictions, correct_predictions)`` of Python lists: the
        argmax over dim 1 of the logits, and the labels yielded by ``loader``.
    """
    bert_model.eval()
    predicted, expected = [], []
    with torch.no_grad():
        for raw_batch in tqdm_notebook(loader):
            input_ids, input_mask, segment_ids, label_ids = (
                tensor.to(device) for tensor in raw_batch
            )
            logits = bert_model(input_ids, segment_ids, input_mask, labels=None)
            predicted += logits.argmax(dim=1).tolist()
            expected += label_ids.tolist()
    return predicted, expected

In [92]:
# Predict on the validation set in batches of 16; y_test receives the labels
# yielded by the loader (predict_loader's second return value).
y_pred, y_test = predict_loader(bert_model, DataLoader(val_dataset, batch_size=16))

HBox(children=(IntProgress(value=0, max=79), HTML(value='')))

In [93]:
from sklearn import metrics

In [94]:
# Per-class precision / recall / F1 of y_pred against y_test.
print(metrics.classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.87      0.88      0.88       625
           1       0.88      0.87      0.88       625

   micro avg       0.88      0.88      0.88      1250
   macro avg       0.88      0.88      0.88      1250
weighted avg       0.88      0.88      0.88      1250



In [95]:
# Overall accuracy of y_pred against y_test.
print(metrics.accuracy_score(y_test, y_pred))

0.876


In [96]:
# Convert the test dataframe into model input features, with the label list
# ['neg', 'pos'] and max_seq_length=230.
# NOTE(review): both helpers are defined elsewhere in the notebook — confirm
# their exact truncation/padding behaviour there.
test_features = convert_examples_to_features(df_to_examples_imdb(test_df),
                                              ['neg','pos'],
                                              max_seq_length=230,
                                              tokenizer=tokenizer,
                                              output_mode="classification")

INFO:__main__:Writing example 0 of None
INFO:__main__:*** Example ***
INFO:__main__:guid: 0
INFO:__main__:tokens: [CLS] Once again Mr . Cost ##ner has drag ##ged out a movie for far longer than necessary . As ##ide from the ter ##rif ##ic sea rescue sequences , of which there are very few I just did not care about any of the characters . Most of us have ghost ##s in the close ##t , and Cost ##ner ' s character are realized early on , and then for ##gott ##en until much later , by which time I did not care . The character we should really care about is a very co ##cky , over ##con ##fi ##dent Ashton Ku ##tch ##er . The problem is he comes off as ki ##d who think ##s he ' s better than anyone else around him and shows no signs of a c ##lut ##tere ##d close ##t . His only ob ##sta ##cle appears to be winning over Cost ##ner . Finally when we are well past the half way point of this st ##ink ##er , Cost ##ner tells us all about Ku ##tch ##er ' s ghost ##s . We are told why Ku ##tch ##er is

INFO:__main__:input_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
INFO:__main__:segment_ids: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
INFO:__main__:label: neg (i

In [97]:
# Split the features into four tensors.
# NOTE(review): presumably token ids, attention mask, segment ids and labels,
# going by the variable names — confirm against features_to_tensors.
test_text_tensor, test_mask_tensor, test_segment_tensor, test_label_tensor = features_to_tensors(test_features)

In [98]:
# Bundle the four tensors into one dataset; the argument order is the order in
# which predict_loader unpacks each batch (ids, mask, segment ids, labels).
test_dataset = TensorDataset(test_text_tensor, test_mask_tensor, test_segment_tensor, test_label_tensor)


In [100]:
# Evaluate the model on the test set: iterate in order (shuffle=False) in
# batches of 16, then print the per-class report and the overall accuracy.
# Note that y_pred / y_test from the validation run are overwritten here.
test_loader = DataLoader(test_dataset,batch_size=16,shuffle=False)
y_pred, y_test = predict_loader(bert_model, test_loader)
print(metrics.classification_report(y_test, y_pred))
print(metrics.accuracy_score(y_test, y_pred))

HBox(children=(IntProgress(value=0, max=1563), HTML(value='')))

              precision    recall  f1-score   support

           0       0.89      0.87      0.88     12500
           1       0.88      0.89      0.88     12500

   micro avg       0.88      0.88      0.88     25000
   macro avg       0.88      0.88      0.88     25000
weighted avg       0.88      0.88      0.88     25000

0.88076
