In [None]:
import torch

# Choose the compute device once: CUDA when PyTorch can see a GPU, the CPU otherwise.
if not torch.cuda.is_available():
    print('No GPU available, using CPU instead.')
    device = torch.device("cpu")
else:
    device = torch.device("cuda")

    # Report how many GPUs are visible and the name of the first one.
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))

fastText: wiki-news-300d-1M-subword: https://fasttext.cc/docs/en/english-vectors.html

enwiki_20180420_500d.txt: https://wikipedia2vec.github.io/wikipedia2vec/pretrained/

GloVe: https://nlp.stanford.edu/projects/glove/

## Read Data and Prepare Dataset

In [None]:
from torch.utils.data.dataset import Dataset
import csv
import pandas as pd
from nltk.tokenize import sent_tokenize, word_tokenize
import numpy as np

class MyDataset(Dataset):
    """Encode each excerpt of a CSV file as a fixed-size grid of vocabulary indices.

    Every item is a ``(max_length_sentences, max_length_word)`` float32 array in
    which cell ``[s, w]`` holds ``1 + vocabulary position`` of word ``w`` of
    sentence ``s``.  Index 0 is used both for padding and for out-of-vocabulary
    words, which matches the all-zero row that WordAttNet puts at the front of
    its embedding matrix.

    Args:
        data_path: CSV file with an ``excerpt`` column and, unless ``test`` is
            true, a ``target`` column holding a float.
        dict_path: space-separated word-vector file; only its first column
            (the words) is read here.
        max_length_sentences: number of sentences kept per document.
        max_length_word: number of words kept per sentence.
        test: when true no targets are read and items are returned without a
            target.
    """

    def __init__(self, data_path, dict_path, max_length_sentences=15, max_length_word=100, test=False):
        super(MyDataset, self).__init__()

        texts, targets = [], []
        # newline="" is what the csv module asks for so that line breaks inside
        # quoted excerpts are handed to the parser untouched.
        with open(data_path, newline="", encoding="utf-8") as csv_file:
            reader = csv.DictReader(csv_file, quotechar='"')
            for line in reader:
                # The excerpt is a single string: lower-case it as a whole.
                # (Looping over it would visit individual characters and turn
                # the text into space-separated letters.)
                texts.append(line['excerpt'].lower())
                if not test:
                    targets.append(float(line['target']))

        self.texts = texts
        self.targets = targets
        self.test = test

        # header=1 is kept on purpose: WordAttNet reads the same file with the
        # same setting, so word positions here line up with its embedding rows.
        # dtype=str / keep_default_na=False stop pandas from turning tokens
        # such as "nan" or "null" into missing values; no row is dropped either
        # way, so positions are unaffected.
        vocabulary = pd.read_csv(filepath_or_buffer=dict_path, header=1, sep=" ", quoting=csv.QUOTE_NONE,
                                 usecols=[0], dtype=str, keep_default_na=False)
        self.dict = vocabulary.iloc[:, 0].tolist()

        # word -> first position in self.dict, so encoding a token is one hash
        # lookup instead of two linear scans over the whole vocabulary.
        self.word_index = {}
        for position, word in enumerate(self.dict):
            self.word_index.setdefault(word, position)

        self.max_length_sentences = max_length_sentences
        self.max_length_word = max_length_word
        self.num_classes = 1 # len(set(self.targets))

    def __len__(self):
        """Number of documents (also correct for test sets, which have no targets)."""
        return len(self.texts)

    def __getitem__(self, index, test=None):
        """Return the encoded document ``index``.

        Args:
            index: position of the document.
            test: overrides the mode chosen at construction time; ``None``
                (the default) means "use the dataset's own ``test`` flag".

        Returns:
            ``features`` in test mode, otherwise ``(features, target)`` where
            ``features`` is a float32 array of shape
            ``(max_length_sentences, max_length_word)``.
        """
        if test is None:
            test = self.test

        text = self.texts[index]
        document_encode = [
            [self.word_index.get(word, -1) for word in word_tokenize(text=sentence)]
            for sentence in sent_tokenize(text=text)
        ]

        # Start from an all "-1" grid (padding / unknown) and copy in as many
        # sentences and words as fit; longer inputs are truncated.
        grid = np.full((self.max_length_sentences, self.max_length_word), -1, dtype=np.int64)
        for row, sentence in enumerate(document_encode[:self.max_length_sentences]):
            kept = sentence[:self.max_length_word]
            grid[row, :len(kept)] = kept

        # Shift by one so that padding/unknown becomes index 0.
        grid += 1
        features = grid.astype(np.float32)

        if test:
            return features
        return features, self.targets[index]

In [None]:
#train = MyDataset(data_path="../input/commonlitreadabilityprize/train.csv",
#                  dict_path="../input/fasttext/wiki-news-300d-1M-subword.txt")
# print (train.__getitem__(index=1)[0].shape)
#for index in range(train.__len__()):
#    print(train.__getitem__(index=index))
# test = MyDataset(data_path="../input/commonlitreadabilityprize/test.csv",
#                  dict_path="../input/fasttext/wiki-news-300d-1M-subword.txt", test=True)
# print(test.__getitem__(index=1, test=True).shape)

## Utils

In [None]:
import torch.nn as nn
import random

def get_max_lengths(data_path):
    """Return the 80th-percentile sentence length and document length of a CSV file.

    Every row of the file (the first row included) is turned into one text by
    lower-casing all columns after the first and appending a space to each.
    The text is split into sentences and each sentence into words.

    Returns:
        ``(words_per_sentence, sentences_per_document)`` taken at position
        ``int(0.8 * n)`` of the respective sorted length lists.
    """
    words_per_sentence = []
    sentences_per_document = []

    with open(data_path) as csv_file:
        for row in csv.reader(csv_file, quotechar='"'):
            document = "".join(cell.lower() + " " for cell in row[1:])
            sentences = sent_tokenize(document)
            sentences_per_document.append(len(sentences))
            words_per_sentence.extend(len(word_tokenize(sentence)) for sentence in sentences)

    words_per_sentence.sort()
    sentences_per_document.sort()

    word_cut = int(0.8 * len(words_per_sentence))
    sentence_cut = int(0.8 * len(sentences_per_document))
    return words_per_sentence[word_cut], sentences_per_document[sentence_cut]

def matrix_mul(input, weight, bias=False):
    """Apply ``tanh(input @ weight [+ bias])`` to every step of ``input``.

    Args:
        input: tensor whose last dimension matches ``weight.shape[0]``; the
            callers in this notebook pass GRU outputs of shape
            ``(steps, batch, features)``.
        weight: ``(features, out_features)`` matrix.
        bias: added only when it is an ``nn.Parameter`` (shape
            ``(1, out_features)``, broadcast over steps and batch); any other
            value, including the default ``False``, means "no bias".

    Returns:
        ``(steps, batch, out_features)``, or ``(steps, batch)`` when
        ``out_features == 1``.
    """
    # One batched matmul replaces the per-step Python loop of torch.mm calls.
    projected = torch.matmul(input, weight)
    if isinstance(bias, torch.nn.parameter.Parameter):
        projected = projected + bias

    # Only the trailing axis is squeezed.  A bare squeeze() would also drop a
    # batch (or step) dimension of size 1 and break the permute(1, 0) that the
    # callers apply to the result.
    return torch.tanh(projected).squeeze(-1)

def element_wise_mul(input1, input2):
    """Weighted sum of ``input1`` along its first dimension.

    ``input1`` and ``input2`` are walked in lock-step along dimension 0.  Each
    slice of ``input2`` gets a trailing axis, is expanded to the shape of the
    matching ``input1`` slice and multiplied with it; the products are then
    summed over dimension 0.

    Returns:
        The sum with a leading dimension of size 1.
    """
    weighted = [
        (values * weights.unsqueeze(1).expand_as(values)).unsqueeze(0)
        for values, weights in zip(input1, input2)
    ]
    stacked = torch.cat(weighted, 0)
    return stacked.sum(0).unsqueeze(0)

def set_seed(seed_value=42):
    """Seed Python's ``random``, NumPy and PyTorch (CPU and every CUDA device)."""
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for seed_with in seeders:
        seed_with(seed_value)

## Model

In [None]:
"""
@author: Viet Nguyen <nhviet1009@gmail.com>
"""

class WordAttNet(nn.Module):
    """Word-level encoder: frozen pretrained embeddings -> bidirectional GRU -> attention.

    Args:
        word2vec_path: space-separated word-vector file (word followed by its
            vector on every line).
        hidden_size: GRU hidden size per direction.
        num_layers: number of stacked GRU layers.
    """

    def __init__(self, word2vec_path, hidden_size=50, num_layers=4):
        super(WordAttNet, self).__init__()
        # header=1 is kept as is: MyDataset reads the vocabulary from the same
        # file with the same setting, so its word positions match these rows.
        table = pd.read_csv(filepath_or_buffer=word2vec_path, header=1, sep=" ",
                            quoting=csv.QUOTE_NONE, dtype=str, low_memory=True)
        # At most the first 500000 vectors are used.  Values are rounded
        # through float16 exactly as before, then stored as float32 (the dtype
        # the GRU consumes).  The former `.astype(np.float)` relied on an alias
        # that was removed in NumPy 1.24.
        vectors = table.values[:500000, 1:].astype(np.float16).astype(np.float32)
        embed_size = vectors.shape[1]

        # Row 0 is an all-zero vector shared by padding and unknown words.
        unknown_word = np.zeros((1, embed_size), dtype=np.float32)
        embedding_matrix = torch.from_numpy(np.concatenate([unknown_word, vectors], axis=0))

        self.word_weight = nn.Parameter(torch.empty(2 * hidden_size, 2 * hidden_size))
        # The bias starts at zero; it used to be left as uninitialised memory,
        # which can contain NaN/inf.
        self.word_bias = nn.Parameter(torch.zeros(1, 2 * hidden_size))
        self.context_weight = nn.Parameter(torch.empty(2 * hidden_size, 1))

        # from_pretrained is a classmethod: call it directly instead of first
        # building (and randomly initialising) a throwaway embedding table.
        # The embedding is frozen (from_pretrained's default).
        self.lookup = nn.Embedding.from_pretrained(embedding_matrix)
        self.gru = nn.GRU(embed_size, hidden_size, num_layers, bidirectional=True)
        self._create_weights(mean=0.0, std=0.8)

    def _create_weights(self, mean=0.0, std=0.8):
        """Draw the attention weights from N(mean, std) and reset the bias to zero."""
        self.word_weight.data.normal_(mean, std)
        self.context_weight.data.normal_(mean, std)
        self.word_bias.data.zero_()

    def forward(self, input, hidden_state):
        """Encode one sentence position for the whole batch.

        Args:
            input: word indices; HierAttNet passes a ``(words, batch)`` tensor.
            hidden_state: initial GRU state,
                ``(num_layers * 2, batch, hidden_size)``.

        Returns:
            ``(attended, h_output)``: the attention-weighted sum of the GRU
            outputs with a leading dimension of size 1, and the final GRU
            state.
        """
        output = self.lookup(input.long())
        f_output, h_output = self.gru(output.float(), hidden_state)  # feature output and hidden state output
        output = matrix_mul(f_output, self.word_weight, self.word_bias)
        output = matrix_mul(output, self.context_weight).permute(1, 0)
        # Normalise over the words of each sample.  dim=1 is what the implicit
        # (deprecated) choice resolved to for this 2-D tensor.
        output = nn.functional.softmax(output, dim=1)
        output = element_wise_mul(f_output, output.permute(1, 0))
        return output, h_output

In [None]:
"""
@author: Viet Nguyen <nhviet1009@gmail.com>
"""

class SentAttNet(nn.Module):
    """Sentence-level encoder: bidirectional GRU over sentence vectors plus attention.

    Args:
        sent_hidden_size: GRU hidden size per direction.
        word_hidden_size: hidden size of the word-level encoder; the GRU input
            size is ``2 * word_hidden_size``.
        num_classes: accepted for interface compatibility; not used here (the
            final projection lives in HierAttNet).
        num_layers: number of stacked GRU layers.
    """

    def __init__(self, sent_hidden_size=50, word_hidden_size=50, num_classes=100, num_layers=4):
        super(SentAttNet, self).__init__()

        self.sent_weight = nn.Parameter(torch.empty(2 * sent_hidden_size, 2 * sent_hidden_size))
        # The bias starts at zero; it used to be left as uninitialised memory,
        # which can contain NaN/inf.
        self.sent_bias = nn.Parameter(torch.zeros(1, 2 * sent_hidden_size))
        self.context_weight = nn.Parameter(torch.empty(2 * sent_hidden_size, 1))

        self.gru = nn.GRU(2 * word_hidden_size, sent_hidden_size, num_layers, bidirectional=True)
        self._create_weights(mean=0.0, std=0.5)

    def _create_weights(self, mean=0.0, std=0.5):
        """Draw the attention weights from N(mean, std) and reset the bias to zero."""
        self.sent_weight.data.normal_(mean, std)
        self.context_weight.data.normal_(mean, std)
        self.sent_bias.data.zero_()

    def forward(self, input, hidden_state):
        """Encode a batch of documents from their sentence vectors.

        Args:
            input: sentence vectors; HierAttNet passes
                ``(sentences, batch, 2 * word_hidden_size)``.
            hidden_state: initial GRU state,
                ``(num_layers * 2, batch, sent_hidden_size)``.

        Returns:
            ``(document_vectors, h_output)``: the attention-weighted sum of
            the GRU outputs with its leading size-1 dimension removed, and the
            final GRU state.
        """
        f_output, h_output = self.gru(input, hidden_state)
        output = matrix_mul(f_output, self.sent_weight, self.sent_bias)
        output = matrix_mul(output, self.context_weight).permute(1, 0)
        # Normalise over the sentences of each sample.  dim=1 is what the
        # implicit (deprecated) choice resolved to for this 2-D tensor.
        output = nn.functional.softmax(output, dim=1)
        output = element_wise_mul(f_output, output.permute(1, 0)).squeeze(0)
        return output, h_output

In [None]:
"""
@author: Viet Nguyen <nhviet1009@gmail.com>
"""

class HierAttNet(nn.Module):
    """Hierarchical attention network with a small regression head.

    A WordAttNet encodes every sentence position, a SentAttNet combines the
    sentence vectors into one document vector, and a two-layer MLP maps that
    vector to ``num_classes`` outputs.

    Args:
        word_hidden_size: hidden size (per direction) of the word-level GRU.
        sent_hidden_size: hidden size (per direction) of the sentence-level GRU.
        batch_size: default batch size used when creating the hidden states.
        num_classes: size of the output layer.
        pretrained_word2vec_path: word-vector file handed to WordAttNet.
        max_sent_length, max_word_length: stored on the instance only.
        num_layers: sentence-level GRU layers; the word-level GRU gets twice
            as many.
    """

    def __init__(self, word_hidden_size, sent_hidden_size, batch_size, num_classes, pretrained_word2vec_path,
                 max_sent_length, max_word_length, num_layers=4):
        super(HierAttNet, self).__init__()
        self.num_layers = num_layers
        self.batch_size = batch_size
        self.word_hidden_size = word_hidden_size
        self.sent_hidden_size = sent_hidden_size
        self.max_sent_length = max_sent_length
        self.max_word_length = max_word_length

        self.word_att_net = WordAttNet(pretrained_word2vec_path, word_hidden_size, num_layers=self.num_layers*2)
        self.sent_att_net = SentAttNet(sent_hidden_size, word_hidden_size, num_classes, num_layers=self.num_layers)
        self._init_hidden_state()

        self.regressor = nn.Sequential(
                            #nn.Dropout(0.5),
                            nn.Linear(2 * sent_hidden_size, sent_hidden_size),
                            nn.ReLU(),
                            nn.Linear(sent_hidden_size, num_classes)
                         )

    def _init_hidden_state(self, last_batch_size=None):
        """Reset both GRU states to zeros.

        Args:
            last_batch_size: batch size to allocate for; ``None`` (or 0) means
                the ``batch_size`` given at construction time.
        """
        batch_size = last_batch_size if last_batch_size else self.batch_size

        # Allocate next to the model's parameters rather than on CUDA whenever
        # a GPU merely exists, so a model kept on the CPU also works on a GPU
        # machine.
        first_param = next(self.parameters(), None)
        target_device = first_param.device if first_param is not None else torch.device("cpu")

        # Both GRUs are bidirectional (hence the factor 2); the word-level GRU
        # additionally has twice as many layers.
        self.word_hidden_state = torch.zeros(self.num_layers*2*2, batch_size, self.word_hidden_size,
                                             device=target_device)
        self.sent_hidden_state = torch.zeros(self.num_layers*2, batch_size, self.sent_hidden_size,
                                             device=target_device)

    def forward(self, input):
        """Predict from a batch of encoded documents.

        Args:
            input: word indices laid out as ``(sentences, batch, words)``
                (the training loop permutes the DataLoader batch into this
                layout).

        Returns:
            The regressor output, one row per sample.
        """
        # The states are plain attributes, so model.to(...) does not move
        # them; follow the input's device (a no-op when already there).
        self.word_hidden_state = self.word_hidden_state.to(input.device)
        self.sent_hidden_state = self.sent_hidden_state.to(input.device)

        output_list = []
        for i in input:
            # The word-level state is carried over from one sentence position
            # to the next.
            output, self.word_hidden_state = self.word_att_net(i.permute(1, 0), self.word_hidden_state)
            output_list.append(output)
        output = torch.cat(output_list, 0)
        output, self.sent_hidden_state = self.sent_att_net(output, self.sent_hidden_state)
        output = self.regressor(output)
        return output

## Training

In [None]:
from tensorboardX import SummaryWriter
from torch.utils.data import DataLoader
import os
import shutil
import torch.nn.functional as F

def train(training_generator, val_generator, opt, model):
    """Train ``model`` with Adam on an MSE loss, validating and checkpointing as it goes.

    Args:
        training_generator: iterable of ``(feature, label)`` batches with a
            ``len()``; features are laid out ``(batch, sentences, words)``.
        val_generator: iterable of ``(feature, label)`` validation batches.
        opt: namespace providing ``log_path``, ``saved_path``, ``lr``,
            ``num_epoches``, ``test_interval``, ``es_min_delta`` and
            ``es_patience``.
        model: network exposing ``_init_hidden_state(batch_size)``; data is
            moved to the device its parameters are on.

    Side effects:
        * ``opt.log_path`` is deleted and recreated for the TensorBoard logs.
        * One entry per validation round is written to the module-level
          ``output_file`` handle (opened in the setup cell before this
          function is called).
        * The whole model is saved to ``<opt.saved_path>/whole_model_han``
          whenever the validation RMSE improves.
    """
    if os.path.isdir(opt.log_path):
        shutil.rmtree(opt.log_path)
    os.makedirs(opt.log_path)
    # torch.save below needs the checkpoint directory to exist.
    os.makedirs(opt.saved_path, exist_ok=True)
    writer = SummaryWriter(opt.log_path)

    # Use the device the model already lives on instead of mixing a global
    # `device` with unconditional .cuda() calls.
    model_device = next(model.parameters()).device

    criterion = nn.MSELoss(reduction='mean')
    optimizer = torch.optim.Adam(filter(lambda p: p.requires_grad, model.parameters()),
                                 lr=opt.lr, betas=(0.9, 0.999), eps=1e-08, weight_decay=0, amsgrad=False)
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min', patience=1, threshold=5e-2)

    best_loss = float("inf")
    best_epoch = 0
    num_iter_per_epoch = len(training_generator)
    model.train()
    try:
        for epoch in range(opt.num_epoches):
            for i, (feature, label) in enumerate(training_generator):
                # Single-sample batches are skipped, as in the validation
                # loop.  (The previous guard tested len(batch), which is
                # always 2 for a (feature, label) pair.)
                if len(label) < 2:
                    continue
                feature = feature.to(model_device)
                label = label.to(model_device)

                optimizer.zero_grad()
                # Fresh hidden states for every batch, sized to this batch.
                model._init_hidden_state(len(label))
                predictions = model(feature.permute(1, 0, 2))
                loss = criterion(torch.reshape(predictions.float(), (-1,)), label.float())
                loss.backward()
                torch.nn.utils.clip_grad_norm_(model.parameters(), 0.25)
                optimizer.step()

                loss_value = loss.item()
                if i % 4 == 0:
                    print("Epoch: {}/{}, Iteration: {}/{}, Lr: {}, Loss/batch: {}".format(
                        epoch + 1,
                        opt.num_epoches,
                        i + 1,
                        num_iter_per_epoch,
                        optimizer.param_groups[0]['lr'],
                        loss_value))
                # No "Accuracy" scalar is logged: this is a regression task,
                # and the old call passed the step index as the value.
                writer.add_scalar('Train/Loss', loss_value, epoch * num_iter_per_epoch + i)

            if epoch % opt.test_interval != 0:
                continue

            # ---- validation ----
            model.eval()
            squared_error_sum = 0.0
            num_evaluated = 0
            with torch.no_grad():
                for te_feature, te_label in val_generator:
                    num_sample = len(te_label)
                    if num_sample < 2:
                        continue
                    te_feature = te_feature.to(model_device)
                    te_label = te_label.to(model_device)
                    model._init_hidden_state(num_sample)
                    te_predictions = model(te_feature.permute(1, 0, 2))
                    batch_mse = criterion(torch.reshape(te_predictions.float(), (-1,)), te_label.float())
                    # Weight each batch by its size so a short last batch does
                    # not distort the average.
                    squared_error_sum += batch_mse.item() * num_sample
                    num_evaluated += num_sample
            model.train()

            if num_evaluated == 0:
                print("Epoch: {}/{}: validation skipped, no batch with at least 2 samples".format(
                    epoch + 1, opt.num_epoches))
                continue

            val_mse = squared_error_sum / num_evaluated
            te_loss = val_mse ** 0.5  # RMSE over all evaluated samples
            scheduler.step(val_mse)

            output_file.write(
                "Epoch: {}/{} \nTest loss: {}\n".format(
                    epoch + 1, opt.num_epoches,
                    te_loss))
            output_file.flush()
            print("Epoch: {}/{}, Lr: {}, Loss: {}".format(
                epoch + 1,
                opt.num_epoches,
                optimizer.param_groups[0]['lr'],
                te_loss))
            writer.add_scalar('Test/Loss', te_loss, epoch)

            if te_loss + opt.es_min_delta < best_loss:
                best_loss = te_loss
                best_epoch = epoch
                torch.save(model, opt.saved_path + os.sep + "whole_model_han")

            # Early stopping
            if epoch - best_epoch > opt.es_patience > 0:
                print("Stop training at epoch {}. The lowest loss achieved is {}".format(epoch, best_loss))
                break
    finally:
        # Flush and release the TensorBoard event file even if training fails.
        writer.close()

In [None]:
from types import SimpleNamespace
# Hyper-parameters and paths for the run.  The keyword order is kept because
# vars(opt) is written to the log file.
opt = SimpleNamespace(
    batch_size=64,
    num_epoches=10,
    lr=1e-4,
    word_hidden_size=80,
    sent_hidden_size=40,
    num_layers=5,
    es_min_delta=0.0,
    es_patience=5,
    train_set_path="../input/commonlitreadabilityprize/train.csv",
    test_set="../input/commonlitreadabilityprize/test.csv",
    test_interval=1,
    word2vec_path="../input/glove50d200d/glove.6B.50d.txt",
    log_path="./tensorboard/han_voc",
    saved_path="trained_models",
    p=0.1,  # size of val set
)

In [None]:
# Seed PyTorch unconditionally.  torch.manual_seed covers the CPU generator and
# every CUDA device; seeding only CUDA when a GPU is present left the CPU RNG
# (used for shuffling and the train/validation split) unseeded.
torch.manual_seed(123)

# Run log.  The handle stays open on purpose: train() writes to `output_file`.
filename = opt.saved_path + os.sep + "logs.txt"
os.makedirs(os.path.dirname(filename), exist_ok=True)
output_file = open(filename, "w")
output_file.write("Model's parameters: {}\n".format(vars(opt)))
output_file.flush()

# DataLoader settings: shuffled, full batches for training; ordered, complete
# coverage for evaluation.
training_params = {"batch_size": opt.batch_size,
                   "pin_memory": True,
                   "shuffle": True,
                   "drop_last": True}
test_params = {"batch_size": opt.batch_size,
               "pin_memory": True,
               "shuffle": False,
               "drop_last": False}

# Fixed sequence limits; get_max_lengths(opt.train_set_path) can be used to
# derive them from the data instead.
#word_length, sent_length = get_max_lengths(opt.train_set_path)
word_length, sent_length = 80, 15
train_dataset = MyDataset(opt.train_set_path,
                          opt.word2vec_path,
                          sent_length, word_length)
print(len(train_dataset))

In [None]:
#from sklearn.model_selection import train_test_split
#excerpts_train, excerpts_val, targets_train, targets_val = train_test_split(excerpts_training, targets_training, test_size=0.1, random_state=2021)

# Hold out a fraction opt.p of the data for validation; the rest is used for training.
training_set, val_set = torch.utils.data.random_split(
    train_dataset,
    [
        len(train_dataset) - int(len(train_dataset) * opt.p),
        int(opt.p * len(train_dataset)),
    ],
)
training_generator = DataLoader(training_set, **training_params)
val_generator = DataLoader(val_set, **test_params)

In [None]:
# Debug aid: have autograd raise on NaN gradients and point at the forward op
# that produced them.
torch.autograd.set_detect_anomaly(True)

import time

begin = time.time()
set_seed(37)

# Build the network from the run configuration (a single regression output).
model = HierAttNet(
    word_hidden_size=opt.word_hidden_size,
    sent_hidden_size=opt.sent_hidden_size,
    batch_size=opt.batch_size,
    num_classes=1,
    pretrained_word2vec_path=opt.word2vec_path,
    max_sent_length=sent_length,
    max_word_length=word_length,
    num_layers=opt.num_layers,
)
model.to(device)

train(training_generator, val_generator, opt, model)
print(f'elapsed: {time.time() - begin}')

## Train on full training dataset

In [None]:
# # Concatenate the train set and the validation set
# full_train_data = torch.utils.data.ConcatDataset([train_data, val_data])

# #full_train_data = train_data
# full_train_sampler = RandomSampler(full_train_data)
# full_train_dataloader = DataLoader(full_train_data, sampler=full_train_sampler, batch_size=TRAIN_BATCH_SIZE)

In [None]:
# # # Train the Bert Classifier on the entire training data
# set_seed(42)
# XLN_regressor, optimizer, scheduler = initialize_model(epochs=init_epochs, freeze=False)
# train(XLN_regressor, full_train_dataloader, epochs=2)

# # set_seed(42)
# # XLN_regressor, optimizer, scheduler = initialize_model(epochs=init_epochs, freeze=False)
# # train(XLN_regressor, train_dataloader, val_dataloader, epochs=2, evaluation=True)

## Prediction on Test Set

In [None]:
# import torch.nn.functional as F

# def XLN_predict(model, test_dataloader):
#     #Perform a forward pass on the trained BERT model to predict.

#     # Put the model into the evaluation mode. The dropout layers are disabled during
#     # the test time.
#     model.eval()
#     all_preds = []

#     # For each batch in our test set...
#     for batch in test_dataloader:
#         # Load batch to GPU
#         b_input_ids, b_attn_mask, b_token_type_ids = tuple(t.to(device) for t in batch)[:3]

#         # Compute preds
#         with torch.no_grad():
#             preds = model(b_input_ids, b_attn_mask, b_token_type_ids)
#         all_preds.append(preds)
    
#     # Concatenate preds from each batch
#     all_preds = torch.cat(all_preds, dim=0)

#     # Apply softmax to calculate probabilities
#     #probs = F.softmax(all_preds, dim=1).cpu().numpy()

#     return all_preds

### Preprocess Test Data

In [None]:
# # Run `preprocessing_for_bert` on the test set
# print('Tokenizing data...')
# test_inputs, test_masks, test_type_ids = preprocessing_for_XLN(excerpts_test)

# # Create the DataLoader for our test set
# test_dataset = TensorDataset(test_inputs, test_masks, test_type_ids)
# test_sampler = SequentialSampler(test_dataset)
# test_dataloader = DataLoader(test_dataset, sampler=test_sampler, batch_size=VALID_BATCH_SIZE)

## Predictions and Submission

### Predict

In [None]:
# # Compute predictions on the test set
# preds_test = XLN_predict(XLN_regressor, test_dataloader)

### Format

In [None]:
# submission = df_test[['id']]
# submission['target'] = np.array(preds_test.cpu()) #np.mean(test_pred, axis=0)
# submission.to_csv('submission.csv', index=False)
# display(submission.head(10))