In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn import init
from torch.autograd import Variable
import torch.autograd as autograd
import numpy as np
import pandas as pd
import glob
from tqdm import tqdm
##
from collections import Counter
## data preparation
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence, pad_packed_sequence
## pl trainer
import pytorch_lightning as pl
from pytorch_lightning.callbacks import EarlyStopping, ModelCheckpoint
import json

## tokenization is taken from RoBERTa
from transformers import RobertaTokenizerFast

In [None]:
# Root directory of the competition data. Paths below are built by plain string
# concatenation (PATH_PART + "test/*", PATH_PART + 'sample_submission.csv'),
# so the trailing slash is required.
PATH_PART = '/kaggle/input/feedback-prize-2021/'

In [None]:
def get_mask(batch_tensor):
    """Return a boolean tensor that is True wherever `batch_tensor` is non-zero.

    Entries equal to 0 become False; every other entry becomes True.
    """
    return batch_tensor.ne(0)


### CharCNN.py script

class CharCNN(nn.Module):
    """Character-level CNN: embeds character ids, convolves over the character
    axis and max-pools to one `hidden_dim`-sized vector per input row."""

    def __init__(self, alphabet_size, embedding_dim, hidden_dim, dropout):
        super().__init__()
        print("build char sequence feature extractor: CNN ...")
        self.hidden_dim = hidden_dim
        self.char_drop = nn.Dropout(dropout)
        self.char_embeddings = nn.Embedding(alphabet_size, embedding_dim)
        initial_weights = CharCNN.random_embedding(alphabet_size, embedding_dim)
        self.char_embeddings.weight.data.copy_(torch.from_numpy(initial_weights))
        self.char_cnn = nn.Conv1d(embedding_dim, self.hidden_dim, kernel_size=3, padding=1)

    @staticmethod
    def random_embedding(vocab_size, embedding_dim):
        """Build a (vocab_size, embedding_dim) table drawn uniformly from
        [-sqrt(3/embedding_dim), sqrt(3/embedding_dim)), with row 0 set to zeros."""
        bound = np.sqrt(3.0 / embedding_dim)
        table = np.random.uniform(-bound, bound, (vocab_size, embedding_dim))
        table[0, :] = 0.0
        return table

    def forward(self, input):
        """Map a (rows, chars) tensor of character ids to a (rows, hidden_dim) tensor."""
        n_rows = input.size(0)
        embedded = self.char_drop(self.char_embeddings(input))
        # Conv1d convolves over the last axis, so move the embedding axis to position 1.
        conv_in = embedded.transpose(2, 1).contiguous()
        conv_out = self.char_cnn(conv_in)
        # Pooling window == full character axis, leaving one value per output channel.
        pooled = F.max_pool1d(conv_out, conv_out.size(2))
        return pooled.contiguous().view(n_rows, -1)

In [None]:
## Dataset preparation
def create_mapping(dico):
    """
    Build two lookup tables from an {item: count} dictionary.

    Items are ranked by decreasing count, ties broken by ascending item, and
    numbered from 0 in that order. Returns (item_to_id, id_to_item).
    """
    ranked = sorted(dico, key=lambda item: (-dico[item], item))
    id_to_item = dict(enumerate(ranked))
    item_to_id = {item: idx for idx, item in id_to_item.items()}
    return item_to_id, id_to_item


def char_mapping(train_sentences):
    """
    Count every character in `train_sentences` (an iterable of sentences, each
    an iterable of word strings) and derive id tables via `create_mapping`.

    '<pad>' and '<unk>' are added with fixed counts 1 and 3.
    Returns (char_count, char_to_id, id_to_char).
    """
    # NOTE(review): ids are assigned by decreasing count, so '<pad>' (count 1)
    # will generally not receive id 0 — confirm this matches the padding value
    # used when character batches are built.
    all_chars = ''.join(word for sentence in train_sentences for word in sentence)
    char_count = dict(Counter(all_chars))
    char_count.update({'<pad>': 1, '<unk>': 3})
    char_to_id, id_to_char = create_mapping(char_count)
    print("Found %i unique characters" % len(char_count))
    return char_count, char_to_id, id_to_char

In [None]:
class Named_Entity_Recognition_Model(nn.Module):
    """Bidirectional LSTM token tagger.

    Each token is represented by a word embedding, optionally concatenated
    with a CharCNN feature vector (`use_char=True`). The concatenation goes
    through dropout, a BiLSTM, dropout again and a linear layer that produces
    one score per tag.

    Args:
        vocab_size: number of rows of the word embedding table.
        word_embed_dim: word embedding width.
        word_hidden_dim: LSTM hidden size per direction.
        alphabet_size, char_embedding_dim, char_hidden_dim: CharCNN settings,
            only used when `use_char` is True.
        tag_num: number of output tags.
        dropout: dropout probability (shared with CharCNN).
        pretrain_embed: optional numpy array copied into the word embedding
            table; when None a random table is generated.
        use_char: whether to add character-level features.
    """

    def __init__(self,
                 vocab_size,
                 word_embed_dim,
                 word_hidden_dim,
                 alphabet_size,
                 char_embedding_dim,
                 char_hidden_dim,
                 tag_num, dropout,
                 pretrain_embed=None,
                 use_char=False):
        super(Named_Entity_Recognition_Model, self).__init__()
        self.use_char = use_char
        self.drop = nn.Dropout(dropout)
        self.input_dim = word_embed_dim

        self.embeds = nn.Embedding(vocab_size, word_embed_dim, padding_idx=0)
        if pretrain_embed is not None:
            self.embeds.weight.data.copy_(torch.from_numpy(pretrain_embed))
        else:
            self.embeds.weight.data.copy_(torch.from_numpy(self.random_embedding(vocab_size, word_embed_dim)))

        if self.use_char:
            self.input_dim += char_hidden_dim
            self.char_feature = CharCNN(alphabet_size, char_embedding_dim, char_hidden_dim, dropout)

        self.lstm = nn.LSTM(self.input_dim, word_hidden_dim, batch_first=True, bidirectional=True)
        self.hidden2tag = nn.Linear(word_hidden_dim * 2, tag_num)

    def random_embedding(self, vocab_size, embedding_dim):
        """Return a (vocab_size, embedding_dim) float64 table.

        Rows 1.. are uniform in [-sqrt(3/embedding_dim), sqrt(3/embedding_dim));
        row 0 (the `padding_idx` of `self.embeds`) is all zeros.
        """
        # np.zeros, not np.empty: row 0 is never assigned below, so it must
        # start from a defined value instead of uninitialised memory.
        pretrain_emb = np.zeros([vocab_size, embedding_dim])
        scale = np.sqrt(3.0 / embedding_dim)
        if vocab_size > 1:
            pretrain_emb[1:, :] = np.random.uniform(-scale, scale, (vocab_size - 1, embedding_dim))
        return pretrain_emb

    def _tag_scores(self, word_inputs, word_seq_lengths, char_inputs):
        """Shared pipeline: return tag scores of shape (batch * seq_len, tag_num).

        `word_seq_lengths` must be in decreasing order, as required by
        `pack_padded_sequence` with its default `enforce_sorted=True`.
        """
        batch_size = word_inputs.size(0)
        seq_len = word_inputs.size(1)
        features = [self.embeds(word_inputs)]
        if self.use_char:
            char_features = self.char_feature(char_inputs).contiguous().view(batch_size, seq_len, -1)
            features.append(char_features)
        word_represents = self.drop(torch.cat(features, 2))
        # pack_padded_sequence needs the lengths as a list or a CPU tensor.
        lengths = word_seq_lengths.cpu() if torch.is_tensor(word_seq_lengths) else word_seq_lengths
        packed_words = pack_padded_sequence(word_represents, lengths, batch_first=True)
        lstm_out, _ = self.lstm(packed_words)
        # total_length restores the full padded width even when the longest
        # sequence is shorter than seq_len, so the reshape below always fits.
        lstm_out, _ = pad_packed_sequence(lstm_out, batch_first=True, total_length=seq_len)
        feature_out = self.hidden2tag(self.drop(lstm_out))
        return feature_out.contiguous().view(batch_size * seq_len, -1)

    def neg_log_likelihood_loss(self, word_inputs, word_seq_lengths, char_inputs, batch_label, mask):
        """Summed cross-entropy between the tag scores and `batch_label`.

        Positions whose label is -100 are ignored. `mask` is accepted for
        interface symmetry with `forward` and is not used.
        """
        feature_out = self._tag_scores(word_inputs, word_seq_lengths, char_inputs)
        loss_function = nn.CrossEntropyLoss(ignore_index=-100, reduction='sum')
        return loss_function(feature_out, batch_label.contiguous().view(-1))

    def forward(self, word_inputs, word_seq_lengths, char_inputs, batch_label, mask):
        """Return the arg-max tag id per token as a (batch, seq_len) tensor.

        Positions where `mask` is False are set to 0. `batch_label` is
        accepted for interface symmetry and is not used.
        """
        batch_size = word_inputs.size(0)
        seq_len = word_inputs.size(1)
        feature_out = self._tag_scores(word_inputs, word_seq_lengths, char_inputs)
        _, tag_seq = torch.max(feature_out, 1)
        tag_seq = tag_seq.view(batch_size, seq_len)
        return mask.long() * tag_seq

In [None]:
class NER_Dataset(Dataset):
    """Dataset turning lists of token strings into word-id and char-id inputs.

    Args:
        texts: list of samples, each a list of token strings.
        word_to_id: token-string -> id table (must contain '<unk>').
        char_to_id: character -> id table.
        input_offset_mappings: per-sample offset mappings, returned as a tensor.
    """

    def __init__(self, texts, word_to_id, char_to_id, input_offset_mappings):
        self.word_to_id = word_to_id
        self.char_to_id = char_to_id
        self.texts = texts
        self.input_offset_mappings = input_offset_mappings

    def __len__(self):
        return len(self.texts)

    def _word_id(self, word):
        """Look up a token, trying the raw form, then the 'Ġ'-prefixed form,
        then the lower-cased 'Ġ'-prefixed form, and finally '<unk>'."""
        word = word.strip()
        vocab = self.word_to_id
        for candidate in (word, 'Ġ' + word, 'Ġ' + word.lower()):
            if candidate in vocab:
                return vocab[candidate]
        return vocab['<unk>']

    def _char_ids(self, word):
        """Map every character of `word` (not stripped) to its id.

        Unknown characters use the '<unk>' id when the table has one;
        otherwise the KeyError is raised as before.
        """
        table = self.char_to_id
        unk_id = table.get('<unk>')
        ids = []
        for char in word:
            if char in table:
                ids.append(table[char])
            elif unk_id is not None:
                ids.append(unk_id)
            else:
                raise KeyError(char)
        return ids

    def __getitem__(self, item):
        """Return {'text': LongTensor of word ids,
                   'char': list of per-word char-id lists,
                   'offset_mapping': tensor built from the stored mapping}."""
        text = self.texts[item]
        offset_mapping = self.input_offset_mappings[item]
        # Use the table given to the constructor, not a notebook-level global.
        text_id = [self._word_id(word) for word in text]
        text_tensor = torch.tensor(text_id).long()
        seq_char_list = [self._char_ids(word) for word in text]
        return {'text': text_tensor, 'char': seq_char_list, 'offset_mapping': torch.tensor(offset_mapping)}
    
    
def my_collate(key, batch_tensor):
    """Collate one field of a batch.

    Args:
        key: field name; 'char' is delegated to `pad_char`.
        batch_tensor: list with that field's value for every sample.

    Returns:
        For 'char': whatever `pad_char` returns.
        Otherwise a tuple (padded, lengths, perm) where the samples are
        ordered by decreasing length, `padded` is the zero-padded batch-first
        tensor, `lengths` the list of lengths in that order, and `perm` a
        LongTensor with perm[j] = original index of the sample now at row j.
    """
    if key == 'char':
        return pad_char(batch_tensor)

    # A single stable ordering drives both the reordering and the returned
    # permutation, so they agree even when several samples share a length.
    # The caller's list is left untouched.
    order = sorted(range(len(batch_tensor)), key=lambda i: len(batch_tensor[i]), reverse=True)
    word_perm_idx = torch.LongTensor(order)
    ordered = [batch_tensor[i] for i in order]
    tensor_length = [len(sq) for sq in ordered]
    padded = pad_sequence(ordered, batch_first=True, padding_value=0)
    return padded, tensor_length, word_perm_idx


### PADDING PART
def my_collate_fn(batch):
    """Collate a list of sample dicts field by field.

    The field names are taken from the first sample; each field's values are
    gathered into a list and handed to `my_collate`.
    """
    collated = {}
    for key in batch[0]:
        column = [sample[key] for sample in batch]
        collated[key] = my_collate(key, column)
    return collated


def pad_char(chars):
    """Pad nested character ids into one LongTensor.

    `chars` is a list of sentences, each a list of words, each a list of
    character ids. Short sentences are extended with the one-character word
    [0]; short words are right-padded with 0. The result has shape
    (batch, longest sentence, longest word).
    """
    longest_sentence = max(len(sentence) for sentence in chars)
    padded_sentences = [
        sentence + [[0]] * (longest_sentence - len(sentence))
        for sentence in chars
    ]
    longest_word = max(
        max(len(word) for word in sentence) for sentence in padded_sentences
    )
    rows = [
        [list(word) + [0] * (longest_word - len(word)) for word in sentence]
        for sentence in padded_sentences
    ]
    return torch.tensor(rows, dtype=torch.long)

In [None]:
## Load the character-id table and the tag-id table
with open('/kaggle/input/lstm-for-test-data/char_to_id.json', 'r') as fp:
    char_to_id = json.load(fp)

with open('/kaggle/input/lstm-for-test-data/labels_to_ids.json', 'r') as fp:
    labels_to_ids = json.load(fp)

# Build the reverse table from the ids actually stored in the file instead of
# relying on the key order of the JSON object.
if isinstance(labels_to_ids, dict):
    ids_to_labels = {label_id: label for label, label_id in labels_to_ids.items()}
else:
    # A plain list of labels: position is the id.
    ids_to_labels = dict(enumerate(labels_to_ids))

In [None]:
# Load the tokenizer from the attached dataset only (no network access needed).
tokenizer = RobertaTokenizerFast.from_pretrained(
    '/kaggle/input/roberta-tokenizer',
    local_files_only=True,
)
# Token-string -> id table of the tokenizer.
word_to_id = tokenizer.get_vocab()

In [None]:
## Read every test essay in full, keyed by its id (the file name without '.txt')
test_names, test_bodies = [], []
# sorted(): glob returns files in arbitrary order; sorting makes runs reproducible.
for f in tqdm(sorted(glob.glob(PATH_PART + "test/*"))):
    test_names.append(f.replace(PATH_PART + 'test/', '').replace('.txt', ''))
    # Context manager closes each file handle promptly.
    with open(f, 'r', encoding='utf-8') as essay_file:
        test_bodies.append(essay_file.read())
test_texts = pd.DataFrame({'id': test_names, 'text': test_bodies})
test_texts.head()

In [None]:
data = test_texts[['text', 'id']]

# Per essay: the list of decoded token strings, and the tokenizer's offset mapping.
input_sentences = []
input_offset_mappings = []

for essay in tqdm(data.text.values):
    # Every essay is truncated / padded to exactly 512 tokens.
    encoded_text = tokenizer(
        essay,
        return_offsets_mapping=True,
        padding='max_length',
        truncation=True,
        max_length=512,
    )
    # Decode each id on its own so one input id yields one string.
    decoded_tokens = [tokenizer.decode(token_id) for token_id in encoded_text['input_ids']]
    input_sentences.append(decoded_tokens)
    input_offset_mappings.append(encoded_text['offset_mapping'])

In [None]:
### Load pretrained GloVe vectors into a {word: vector} dictionary
glove_model = {}
with open('/kaggle/input/lstm-for-test-data/glove.6B.100d.txt', 'r', encoding='utf-8') as f:
    for line in f:
        split_line = line.split()
        if not split_line:
            continue  # tolerate blank lines
        glove_model[split_line[0]] = np.array(split_line[1:], dtype=np.float64)

VOCAB_SIZE = len(glove_model)


### Embedding matrix indexed by tokenizer id; passed to the model as `pretrain_embed`.
### Tokens without a GloVe match keep their random initial row.
word_embeds_for_train_data = np.random.uniform(-np.sqrt(0.06), np.sqrt(0.06), (len(word_to_id), 100))
for w, w_id in word_to_id.items():
    # Normalised key: lower-cased, with the 'Ġ' marker removed.
    w_for_glove = w.lower().replace('Ġ', '')
    # Test membership on the same key that is looked up; previously the raw
    # token was tested, so 'Ġ'-prefixed tokens could never match.
    if w in glove_model:
        word_embeds_for_train_data[w_id] = glove_model[w]
    elif w_for_glove in glove_model:
        word_embeds_for_train_data[w_id] = glove_model[w_for_glove]

In [None]:
# Inference dataset over the decoded token lists and their offset mappings.
for_inf_dataset = NER_Dataset(
    texts=input_sentences,
    word_to_id=word_to_id,
    char_to_id=char_to_id,
    input_offset_mappings=input_offset_mappings,
)

# One essay per batch, in dataset order.
for_inf_dl = DataLoader(
    for_inf_dataset,
    batch_size=1,
    shuffle=False,
    num_workers=0,
    collate_fn=my_collate_fn,
)

In [None]:
# Hyper-parameters must match the checkpoint loaded below.
model = Named_Entity_Recognition_Model(vocab_size=len(word_to_id),
                               word_embed_dim=100,
                               word_hidden_dim=100,
                               alphabet_size=len(char_to_id),
                               char_embedding_dim=30,
                               char_hidden_dim=50,
                               tag_num = len(labels_to_ids),
                               dropout=0.2,
                               pretrain_embed=word_embeds_for_train_data,
                               use_char=True) ## does not work on GPU yet; needs further work

# map_location='cpu' lets a checkpoint saved from a GPU session load on a
# CPU-only machine; inference below runs on CPU.
model.load_state_dict(torch.load('/kaggle/input/lstm-for-test-data/best_bilstm__model.pt',
                                 map_location='cpu'))

In [None]:
model.eval()
model = model.to('cpu')
predictions = []   # per essay: one label per word (taken from the word's first sub-token)
out_strings = []   # per essay: the sub-token string each label was taken from

# NOTE: this loop reads only row 0 of every batch, i.e. it relies on the
# loader's batch_size=1.
with torch.no_grad():  # inference only: no autograd graph needed
    for batch in tqdm(for_inf_dl):
        batch_text, seq_length, word_perm_idx = batch['text']
        char_inputs = batch['char']
        offset_mapping = batch['offset_mapping']
        # Put the character rows in the same (length-sorted) order as batch_text,
        # then flatten to (batch * seq_len, max_word_len) for the model.
        char_inputs = char_inputs[word_perm_idx]
        char_dim = char_inputs.size(-1)
        char_inputs = char_inputs.contiguous().view(-1, char_dim)
        mask = get_mask(batch_text)
        ## inference
        outputs = model(batch_text, seq_length, char_inputs, None, mask)
        tokens = tokenizer.convert_ids_to_tokens(batch_text[0].squeeze().tolist())
        token_predictions = [ids_to_labels[i] for i in outputs[0].tolist()]
        wp_preds = list(zip(tokens, token_predictions))  # (wordpiece, predicted label) pairs
        prediction = []
        out_str = []
        # The collated offset field is (padded, lengths, perm); [0][0] is the
        # padded tensor's first (only) sample.
        off_list = offset_mapping[0][0].squeeze().tolist()
        for idx, mapping in enumerate(off_list):
            # A token opens a new word when it starts at a non-zero offset
            # that differs from the previous token's end offset.
            starts_new_word = mapping[0] != 0 and mapping[0] != off_list[idx - 1][1]
            # Position 1 is always kept even though it fails the test above.
            if starts_new_word or idx == 1:
                prediction.append(wp_preds[idx][1])
                out_str.append(wp_preds[idx][0])
        predictions.append(prediction)
        out_strings.append(out_str)

In [None]:
import pdb

# A run of identical labels is kept only if it is longer than this many words.
MIN_SPAN_WORDS = 10

# Rows of (essay id, class, space-separated word indices).
final_preds = []
for i in tqdm(range(len(test_texts))):
    idx = test_texts.id.values[i]
    # Drop the B-/I- prefixes so a span is a run of identical class names.
    pred = [x.replace('B-','').replace('I-','') for x in predictions[i]]
    j = 0
    while j < len(pred):
        cls = pred[j]
        if cls == 'O':
            # Move on to the next word. Without `continue` the code fell
            # through with j already advanced, so the word after every 'O'
            # was skipped as a possible span start.
            j += 1
            continue
        end = j + 1
        while end < len(pred) and pred[end] == cls:
            end += 1
        if cls != '' and end - j > MIN_SPAN_WORDS:
            final_preds.append((idx, cls, ' '.join(map(str, list(range(j, end))))))
        j = end

# Show one sample row, if there are enough rows to show it.
if len(final_preds) > 2:
    print(final_preds[2])

In [None]:
# The sample submission supplies the column names used for the final file.
test_df = pd.read_csv(f'{PATH_PART}sample_submission.csv')
test_df

In [None]:
# Passing the column names to the constructor also works when final_preds is
# empty; assigning `sub.columns` afterwards raised a length-mismatch error then.
sub = pd.DataFrame(final_preds, columns=test_df.columns)
sub.to_csv("submission.csv", index=False)
sub