In [None]:
INPUT_DATASET_PATH = "../input/imdb-review-dataset/imdb_master.csv"

In [None]:
# !pip install nltk

In [None]:
import nltk
from collections import Counter
import itertools
from typing import List

In [None]:
import torch

Рассмотрим преобразование текстов в удобоваримый для нейронной сети вид.<br>
А именно:

- Текст разбивается на слова (токенизация, знаки препинания считаются словами)
- Слова подсчитываются для формирования ограниченного словаря. Каждому слову сопоставляется определённый номер (индекс, айди) в словаре. Редким словам назначается специальный номер (это эквивалентно замене редких слов на спец. слово **\<UNK\>** — «неизвестное слово»).
- Последовательности слов преобразуются в последовательности номеров слов.
- Полученные последовательности выравниваются по заданной максимальной длине через обрезание или дополнение номером спец.символа **\<PAD\>**

Класс для хранения текста в виде последовательности индексов слов и его закодированной метки (класса)

In [None]:
class InputFeatures:
    """One encoded example: a sequence of word ids together with its encoded label."""

    def __init__(self, input_ids: List[int], label_id: int):
        # Stored exactly as passed in; nothing is copied or validated here.
        self.input_ids, self.label_id = input_ids, label_id

Класс словаря. Метод word2id возвращает номер слова, id2word - наоборот, восстанавливает слово.
unk_index - номер слова, которым будут обозначены все неизвестные слова

In [None]:
class Vocab:
    """Two-way mapping between words and their integer ids.

    ``itos`` is the id -> word list; the reverse word -> id lookup is derived
    from it. Words missing from the vocabulary are reported as ``unk_index``.
    """

    def __init__(self, itos: List[str], unk_index: int):
        self._itos = itos
        # Reverse index: word -> position of that word in ``itos``.
        self._stoi = dict((token, position) for position, token in enumerate(itos))
        self._unk_index = unk_index

    def __len__(self):
        """Number of entries in the vocabulary."""
        return len(self._itos)

    def word2id(self, word):
        """Return the id of ``word``, or the unknown-word id when it is absent."""
        return self._stoi.get(word, self._unk_index)

    def id2word(self, idx):
        """Return the word stored at position ``idx``."""
        return self._itos[idx]

In [None]:
from tqdm.notebook import tqdm


Интерфейс объекта, преобразующего тексты в последовательности номеров. 

**fit_transform** выучивает новый словарь из текста и преобразует текст с его помощью. Используется на обучающей выборке текстов.

**transform** выполняет преобразование при помощи уже выученного словаря. Используется для тестовых текстов.

In [None]:
class TextToIdsTransformer:
    """Interface for objects that convert raw texts into sequences of word ids.

    Subclasses are expected to override both methods.
    """

    def transform(self, texts):
        """Encode ``texts`` using an already learned vocabulary.

        Raises:
            NotImplementedError: always, in this base class.
        """
        raise NotImplementedError()

    def fit_transform(self, texts):
        """Learn a vocabulary from ``texts`` and encode them with it.

        Raises:
            NotImplementedError: always, in this base class.
        """
        raise NotImplementedError()

Простая реализация данного интерфейса. Разбиение на слова производится с помощью библиотеки NLTK. В словаре содержатся несколько спец. слов. 

In [None]:
class SimpleTextTransformer(TextToIdsTransformer):
    """Word-level text encoder that tokenizes with NLTK's ``TweetTokenizer``.

    Texts are lower-cased and tokenized. The vocabulary consists of the
    special words followed by the most frequent tokens, capped at
    ``max_vocab_size`` entries in total. Tokens outside the vocabulary are
    encoded as ``unk_index``.
    """

    def __init__(self, max_vocab_size):
        # Positions in this list are the ids: '<PAD>' -> 0, '<UNK>' -> 1.
        self.special_words = ['<PAD>', '<UNK>']
        self.unk_index = 1
        self.pad_index = 0
        self.vocab = None  # set by build_vocab() / fit_transform()
        self.max_vocab_size = max_vocab_size  # total size, special words included
        self._tokenizer = nltk.tokenize.TweetTokenizer()

    def tokenize(self, text):
        """Lower-case ``text`` and split it into tokens."""
        return self._tokenizer.tokenize(text.lower())

    def build_vocab(self, tokens):
        """Build ``self.vocab`` from an iterable of tokens.

        The special words come first, followed by the
        ``max_vocab_size - len(special_words)`` most frequent tokens.
        """
        n_regular_words = self.max_vocab_size - len(self.special_words)
        itos = list(self.special_words)
        itos.extend(word for word, _ in Counter(tokens).most_common(n_regular_words))
        self.vocab = Vocab(itos, self.unk_index)

    def _encode(self, tokens):
        """Map already tokenized text to vocabulary ids."""
        return [self.vocab.word2id(token) for token in tokens]

    def transform_single_text(self, text):
        """Tokenize one text and return its list of word ids."""
        return self._encode(self.tokenize(text))

    def transform(self, texts):
        """Encode every text in ``texts`` with the current vocabulary."""
        return [self.transform_single_text(text) for text in tqdm(texts)]

    def fit_transform(self, texts):
        """Learn the vocabulary from ``texts`` and return their encodings.

        Each text is tokenized once; the same token lists are used both for
        counting words and for encoding.
        """
        tokenized_texts = [self.tokenize(text) for text in tqdm(texts)]
        # from_iterable avoids unpacking one positional argument per text.
        self.build_vocab(itertools.chain.from_iterable(tokenized_texts))
        return [self._encode(tokens) for tokens in tqdm(tokenized_texts)]

Строим экземпляр входных данных. Обеспечиваем длину последовательности номеров равной max_seq_len. 

In [None]:
def build_features(token_ids, label, max_seq_len, pad_index, label_encoding):
    """Turn one encoded text into an ``InputFeatures`` of exactly ``max_seq_len`` ids.

    Sequences that are too long are cut on the right; shorter ones are
    right-padded with ``pad_index``. ``label`` is translated through the
    ``label_encoding`` mapping.
    """
    n_missing = max_seq_len - len(token_ids)
    if n_missing > 0:
        ids = token_ids + [pad_index] * n_missing
    else:
        ids = token_ids[:max_seq_len]
    return InputFeatures(ids, label_encoding[label])

In [None]:
def features_to_tensor(list_of_features):
    """Stack features into a pair of ``torch.long`` tensors: (input ids, label ids)."""

    def as_long(values):
        return torch.tensor(values, dtype=torch.long)

    text_tensor = as_long([feature.input_ids for feature in list_of_features])
    labels_tensor = as_long([feature.label_id for feature in list_of_features])
    return text_tensor, labels_tensor

### Делим на выборки и преобразуем тексты

In [None]:
from sklearn import model_selection
import pandas as pd

In [None]:
imdb_df = pd.read_csv(INPUT_DATASET_PATH, encoding='latin-1')
# Keep only 'train' rows whose label is not 'unsup'; 'test' rows are held out as-is.
train_val_df = imdb_df[(imdb_df.type == 'train') & (imdb_df.label != 'unsup')]
test_df = imdb_df[(imdb_df.type == 'test')]
# Fixed random_state: the stratified 95/5 split is the same after Restart & Run All.
train_df, val_df = model_selection.train_test_split(
    train_val_df, test_size=0.05, stratify=train_val_df.label, random_state=42
)

In [None]:
train_df.shape

In [None]:
val_df.shape

In [None]:
train_df.sample(5)

In [None]:
text2id = SimpleTextTransformer(max_vocab_size=10000)

train_ids = text2id.fit_transform(train_df['review'])
val_ids = text2id.transform(val_df['review'])
test_ids = text2id.transform(test_df['review'])

In [None]:
print(train_df.review.iloc[0][:160])
print(train_ids[0][:30])
print([text2id.vocab.id2word(x) for x in train_ids[0][:30]])

In [None]:
max_seq_len=200
classes = {'neg': 0, 'pos' : 1}

In [None]:
train_features = [build_features(token_ids, label,max_seq_len, text2id.pad_index, classes) 
                  for token_ids, label in zip(train_ids, train_df['label'])]

val_features = [build_features(token_ids, label,max_seq_len, text2id.pad_index, classes) 
                  for token_ids, label in zip(val_ids, val_df['label'])]

test_features = [build_features(token_ids, label,max_seq_len, text2id.pad_index, classes) 
                  for token_ids, label in zip(test_ids, test_df['label'])]

In [None]:
print(train_features[3].__dict__)

In [None]:
train_tensor, train_labels = features_to_tensor(train_features)
val_tensor, val_labels = features_to_tensor(val_features)
test_tensor, test_labels = features_to_tensor(test_features)

In [None]:
print(train_tensor.size())

In [None]:
print(len(text2id.vocab))

### Попробуем простую Bag-of-words модель (с собственным словарём)

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression

In [None]:
count_vectorizer = CountVectorizer(max_features=10_000)

Получим sparse-матрицы из scipy

In [None]:
X_train = count_vectorizer.fit_transform(train_df['review'])
X_val = count_vectorizer.transform(val_df['review'])

In [None]:
import numpy as np

In [None]:
y_train = np.array([classes[c] for c in train_df['label']])
y_val = np.array([classes[c] for c in val_df['label']])

In [None]:
y_train[:10]

In [None]:
print(X_train[0])

In [None]:
# get_feature_names() was removed in scikit-learn 1.2; prefer get_feature_names_out()
# and fall back to the old name on releases that do not have it yet.
_get_feature_names = (getattr(count_vectorizer, "get_feature_names_out", None)
                      or count_vectorizer.get_feature_names)
# Resolve the vocabulary once instead of rebuilding it on every loop iteration.
first_row_feature_names = _get_feature_names()
for i in X_train[0].indices:
    print(first_row_feature_names[i], X_train[0, i])

In [None]:
log_reg = LogisticRegression(max_iter=500)

In [None]:
log_reg.fit(X_train,y_train,)

In [None]:
from sklearn import metrics

In [None]:
lr_pred = log_reg.predict(X_val)

In [None]:
print(metrics.classification_report(y_val, lr_pred))

In [None]:
print(log_reg.coef_.shape)

In [None]:
sorted_weights_indices = log_reg.coef_.argsort()[0]

In [None]:
sorted_weights_indices[:10]

Поскольку каждый вес соответствует конкретному слову, мы можем понять, какие слова линейная модель считает наиболее весомыми при решении задачи

In [None]:
# get_feature_names() was removed in scikit-learn 1.2; prefer get_feature_names_out()
# and fall back to the old name on releases that do not have it yet.
cv_feature_names = (getattr(count_vectorizer, "get_feature_names_out", None)
                    or count_vectorizer.get_feature_names)()

In [None]:
for index in sorted_weights_indices[:20]:
    print(cv_feature_names[index], log_reg.coef_[0,index])

In [None]:
for index in sorted_weights_indices[-20:]:
    print(cv_feature_names[index], log_reg.coef_[0,index])

### Построим нейронную сеть

In [None]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset

In [None]:
train_dataset = TensorDataset(train_tensor, train_labels.type(torch.float32))
val_dataset = TensorDataset(val_tensor, val_labels.type(torch.float32))

In [None]:
class BILSTM_Network(nn.Module):
    """Bidirectional LSTM binary classifier with max-over-time pooling.

    Pipeline: embedding -> dropout -> BiLSTM -> dropout -> masked max over the
    sequence axis -> linear layer -> sigmoid.

    Args:
        vocab_size: number of rows in the embedding table.
        pad_index: id of the padding token; its positions are excluded from
            the max pooling and it is passed to the embedding as ``padding_idx``.
        embedding_size: embedding width, which is also the LSTM input size.
        hidden_size: LSTM hidden size per direction.
    """

    def __init__(self, vocab_size, pad_index, embedding_size=300, hidden_size=512):
        super().__init__()
        self.pad_index = pad_index
        self.emb = nn.Embedding(vocab_size, embedding_size, padding_idx=pad_index)
        self.drop1 = nn.Dropout(p=0.5)
        # The input size follows embedding_size so non-default widths work.
        self.rnn = nn.LSTM(embedding_size, hidden_size, bidirectional=True, num_layers=1, batch_first=True)
        self.drop2 = nn.Dropout(p=0.5)
        self.fc = nn.Linear(2 * hidden_size, 1)

    def forward(self, x):
        """Return one probability per sequence.

        Args:
            x: B x N tensor of token ids.

        Returns:
            Tensor of shape (B,) with sigmoid outputs.
        """
        # pad_mask: B x N x 1, 1.0 at padding positions and 0.0 elsewhere.
        pad_mask = (x == self.pad_index).type(torch.float32).view(x.size(0), x.size(1), 1)

        # x: B x N x C
        x = self.emb(x)
        x = self.drop1(x)

        # seq: B x N x 2h
        seq, _ = self.rnn(x)
        x = self.drop2(seq)

        # Push padding positions to a very negative value so the max ignores them.
        x = pad_mask * -1e9 + x * (1 - pad_mask)

        # x: B x 2h
        x, _ = torch.max(x, dim=1)
        # x: B x 1
        x = self.fc(x)
        x = torch.sigmoid(x)
        return x.view(-1)

    def to_prediction(self, output):
        """Threshold probabilities at 0.5 and return a boolean tensor."""
        return output > 0.5
        

In [None]:
class BestModel:
    """Remembers the best score seen so far and checkpoints the matching state to disk."""

    def __init__(self, path, initial_criterion):
        self.path, self.criterion = path, initial_criterion

    def update(self, model, optimizer, criterion):
        """Record ``criterion`` as the new best and save model/optimizer state to ``path``."""
        self.criterion = criterion
        checkpoint = {
            'model_state': model.state_dict(),
            'optimizer_state': optimizer.state_dict(),
            'criterion': criterion,
        }
        torch.save(checkpoint, self.path)

    def load_model_data(self):
        """Read the checkpoint dictionary back from ``path``."""
        return torch.load(self.path)

    def restore(self, model, optimizer):
        """Load the saved state into ``model`` first and then into ``optimizer``."""
        checkpoint = self.load_model_data()
        for target, key in ((model, 'model_state'), (optimizer, 'optimizer_state')):
            target.load_state_dict(checkpoint[key])

In [None]:
def get_lr(optimizer):
    """Return the learning rate of the optimizer's first parameter group.

    Returns ``None`` when the optimizer has no parameter groups.
    """
    return next((group['lr'] for group in optimizer.param_groups), None)

In [None]:
import numpy as np

In [None]:
def train_model(epochs, model, optimizer, criterion, loaders, device, best_model, lr_scheduler=None, n_prints=1, clip=None):
    """Train ``model``, validate after every epoch and keep the best checkpoint.

    Args:
        epochs: number of passes over ``loaders['train']``.
        model: network to train; must provide ``to_prediction(output)``.
        optimizer: optimizer updating ``model``'s parameters.
        criterion: loss called as ``criterion(output, target)``.
        loaders: dict with 'train' and 'validation' loaders yielding ``(inputs, targets)``.
        device: device the batches are moved to.
        best_model: checkpoint tracker; it is updated whenever validation
            accuracy exceeds ``best_model.criterion``.
        lr_scheduler: optional scheduler, stepped once per epoch.
        n_prints: how many times per epoch the running training loss is printed.
        clip: optional max gradient norm; gradients are clipped when given.

    After the last epoch the best checkpoint is restored into ``model`` and
    ``optimizer`` through ``best_model.restore``.
    """
    # Never below 1: the modulo in the loop must stay defined when the train
    # loader has fewer batches than n_prints (or n_prints is 0).
    print_every = max(1, len(loaders['train']) // max(1, n_prints))

    for epoch in range(epochs):
        model.train()
        running_train_loss = 0.0

        for iteration, (xx, yy) in enumerate(loaders['train']):
            optimizer.zero_grad()
            xx, yy = xx.to(device), yy.to(device)
            out = model(xx)
            loss = criterion(out, yy)
            running_train_loss += loss.item()
            loss.backward()

            if clip is not None:
                nn.utils.clip_grad_norm_(model.parameters(), clip)

            optimizer.step()

            if iteration % print_every == print_every - 1:
                running_train_loss /= print_every
                print(f"Epoch {epoch}, iteration {iteration} training_loss {running_train_loss} lr={np.round(get_lr(optimizer),6)}")
                running_train_loss = 0.0

        if lr_scheduler is not None:
            lr_scheduler.step()

        model.eval()
        with torch.no_grad():
            running_corrects = 0
            running_total = 0
            running_loss = 0.0
            for xx, yy in loaders['validation']:
                batch_size = xx.size(0)
                xx, yy = xx.to(device), yy.to(device)

                out = model(xx)
                running_loss += criterion(out, yy).item()

                predictions = model.to_prediction(out).type(torch.float32)
                running_corrects += (predictions == yy).sum().item()
                running_total += batch_size

            # max(..., 1) keeps an empty validation loader from dividing by zero.
            mean_val_loss = running_loss / max(len(loaders['validation']), 1)
            accuracy = running_corrects / max(running_total, 1)

            if accuracy > best_model.criterion:
                best_model.update(model, optimizer, accuracy)

            print(f"Epoch {epoch}, val_loss {mean_val_loss}, accuracy = {accuracy}, lr={get_lr(optimizer)}")

    best_model.restore(model, optimizer)

In [None]:
# is_available has to be *called*: the bare function object is always truthy,
# which selected CUDA even on machines without a GPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
network = BILSTM_Network(len(text2id.vocab), pad_index=text2id.pad_index, embedding_size=300, hidden_size=1024//2).to(device)

In [None]:
print(device)

In [None]:
optimizer = torch.optim.Adam(network.parameters(),lr=3e-4)
criterion = nn.BCELoss()
best_model = BestModel("best_model.md", 0)
scheduler= None
# scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer,0.9,-1)

In [None]:
train_loader = DataLoader(train_dataset,32,shuffle=True)
val_loader = DataLoader(val_dataset,64, shuffle=False)

In [None]:
train_model(20, network,
            optimizer,
            criterion,
            {'train': train_loader, 'validation': val_loader},
            device,
            best_model, 
            n_prints=5,
            lr_scheduler=scheduler,
            clip=1.0)

In [None]:
best_model.restore(network, optimizer)

In [None]:
def evaluate(model, loader, device):
    """Run ``model`` over ``loader`` in eval mode without gradients.

    Returns:
        A pair ``(all_preds, correct_preds)``: boolean predictions
        (``output > 0.5``) and the targets cast to int8, both as Python lists.
    """
    predicted, expected = [], []
    with torch.no_grad():
        model.eval()
        for inputs, targets in loader:
            scores = model(inputs.to(device))
            predicted += (scores > 0.5).tolist()
            expected += targets.type(torch.int8).tolist()
    return predicted, expected

In [None]:
model_preds, correct_preds = evaluate(network, val_loader, device)

In [None]:
from sklearn import metrics
print(metrics.classification_report(correct_preds, model_preds))

## Воспользуемся моделью BERT

<img src="https://raw.githubusercontent.com/huggingface/transformers/master/docs/source/imgs/transformers_logo_name.png">

In [None]:
!pip install transformers

Выполните, чтобы очистить память видеокарты

In [None]:
import gc
del network
del optimizer
gc.collect()

if torch.cuda.is_available():
    torch.cuda.empty_cache()
gc.collect()

In [None]:
from transformers import AutoTokenizer, AutoModel, BertModel, BertTokenizer

In [None]:
selected_model = "bert-base-multilingual-cased"

In [None]:
bert_tokenizer = AutoTokenizer.from_pretrained(selected_model)

In [None]:
bert_model = AutoModel.from_pretrained(selected_model)

In [None]:
print(bert_tokenizer.tokenize(train_df['review'].iloc[0]))

In [None]:
bert_max_length = 300

In [None]:
bert_train = [ bert_tokenizer.encode(t, return_tensors='pt',
                                     max_length=bert_max_length,
                                     truncation=True,
                                     padding='max_length').view(-1) for t in tqdm(train_df['review'])]

In [None]:
bert_val = [ bert_tokenizer.encode(t, return_tensors='pt',
                                     max_length=bert_max_length,
                                     truncation=True,
                                     padding='max_length').view(-1) for t in tqdm(val_df['review'])]

In [None]:
bert_train = torch.stack(bert_train)
bert_val  = torch.stack(bert_val)

In [None]:
bert_model.eval()
pass

In [None]:
with torch.no_grad():
    res = bert_model.forward(bert_train[0:1],output_hidden_states=True)

In [None]:
for l, h in enumerate(res.hidden_states):
    print(l)
    print(h.size())

In [None]:

bert_model = bert_model.to(device)

In [None]:
next(iter(DataLoader(bert_train,batch_size=16))).size()

In [None]:
def get_bert_layers(bert: "BertModel", inputs_ids_batch, selected_layers: List[int]):
    """Run ``bert`` on a batch and return the selected hidden states.

    Args:
        bert: model called as ``bert(ids, output_hidden_states=True)``; its
            result must expose ``hidden_states``.
        inputs_ids_batch: batch of token ids passed to ``bert`` unchanged.
        selected_layers: indices into ``hidden_states``; must not be empty.

    Returns:
        The single selected hidden state, or several of them concatenated
        along the last dimension in the order given.

    NOTE(review): no attention mask is passed, so padding tokens take part in
    self-attention — confirm that this is intended.
    """
    # Use the model passed by the caller rather than a notebook-level global.
    output = bert(inputs_ids_batch, output_hidden_states=True)
    hidden_states = output.hidden_states
    if len(selected_layers) > 1:
        return torch.cat([hidden_states[layer] for layer in selected_layers], dim=-1)
    return hidden_states[selected_layers[0]]

In [None]:
def pool_bert(input_ids, selected_layers, batch_size=16):
    """Mean-pool the selected BERT hidden states over non-padding tokens.

    Uses the notebook-level ``bert_model`` and ``device``.

    Args:
        input_ids: tensor of token ids, one row per text.
        selected_layers: hidden-state indices to concatenate before pooling.
        batch_size: number of rows sent through the model at once.

    Returns:
        float32 NumPy array of shape
        ``(len(input_ids), hidden_size * len(selected_layers))``.
    """
    # Take the layer width from the model config instead of hard-coding 768,
    # so other checkpoints with a different width work as well.
    hidden_size = bert_model.config.hidden_size
    pooled = np.zeros((len(input_ids), hidden_size * len(selected_layers)), dtype=np.float32)

    offset = 0
    for batch in tqdm(DataLoader(input_ids, batch_size=batch_size, shuffle=False)):
        batch = batch.to(device)
        with torch.no_grad():
            concatenated = get_bert_layers(bert_model, batch, selected_layers)

            # B x N x 1, 1.0 for real tokens.
            # Assumes id 0 is the padding token — TODO confirm against the tokenizer.
            non_padding_mask = (batch != 0).unsqueeze(-1).type(torch.float32)
            summed = (non_padding_mask * concatenated).sum(dim=-2)

            # B x 1 token counts; clamped so an all-padding row yields zeros, not NaN.
            lengths = non_padding_mask.sum(dim=-2).clamp(min=1.0)
            mean_pooled = summed / lengths

        n_rows = batch.size(0)
        pooled[offset:offset + n_rows] = mean_pooled.cpu().numpy()
        offset += n_rows
    return pooled

In [None]:
bert_train_pooled = pool_bert(input_ids=bert_train, selected_layers=[10,11,12],batch_size=32)

In [None]:
bert_val_pooled = pool_bert(input_ids=bert_val, selected_layers=[10,11,12],batch_size=32)

In [None]:
bert_log_reg = LogisticRegression(max_iter=500)
bert_log_reg.fit(bert_train_pooled, y_train)

In [None]:
bert_log_pred = bert_log_reg.predict(bert_val_pooled)

In [None]:
print(metrics.classification_report(y_val, bert_log_pred))

### Построим и натренируем собственную модель LSTM поверх BERT

In [None]:
class LSTMOverBERT(nn.Module):
    """Exercise scaffold: an LSTM classifier stacked on top of a BERT encoder.

    Only the data flow in ``forward`` is in place. ``bert_encode``,
    ``lstm_encode`` and ``classify`` are stubs that return ``None`` and are
    left for the reader to implement; until then ``forward`` cannot produce
    a result.
    """

    def __init__(self, bert: BertModel, lstm_hidden_size=512):
        super().__init__()
        self.bert = bert
        # NOTE: lstm_hidden_size is accepted but not used yet; the LSTM and
        # the classifier layers still have to be created here.
        
    def bert_encode(self, bert_input):
        """Stub: expected to return BERT's encoding of ``bert_input`` (see ``forward``)."""
        # ?
        pass
    
    def lstm_encode(self, bert_output):
        """Stub: expected to run the LSTM over ``bert_output`` (see ``forward``)."""
        # ?
        pass
    
    def classify(self, lstm_encoded, padding_mask):
        """Stub: expected to turn ``lstm_encoded`` into logits, given ``padding_mask``."""
        pass
    
    
    def forward(self,bert_input):
        """Return ``sigmoid(logits)`` flattened to one dimension."""
        # True at positions whose token id equals 0.
        padding_mask = bert_input == 0
        
        # BATCH_SIZE x TEXT_LENGTH x BERT_HIDDEN
        encoded = self.bert_encode(bert_input)
        
        # BATCH_SIZE x TEXT_LENGTH x LSTM_HIDDEN
        lstm_encoded = self.lstm_encode(encoded)
        
        logits = self.classify(lstm_encoded, padding_mask)
        
        return torch.sigmoid(logits).view(-1)
        

In [None]:
# network = LSTMOverBERT(bert_model, lstm_hidden_size=384)
# optimizer = torch.optim.Adam(network.parameters(),lr=3e-4)
# criterion = nn.BCELoss()
# best_model = BestModel("best_model_over_bert.md", 0)
# scheduler= None

In [None]:
# bert_train_dataset = TensorDataset(bert_train, train_labels.type(torch.float32))
# bert_val_dataset = TensorDataset(bert_val, val_labels.type(torch.float32))

In [None]:
# train_loader = DataLoader(bert_train_dataset,24,shuffle=True)
# val_loader = DataLoader(bert_val_dataset,32, shuffle=False)

In [None]:
# train_model(...)