In [None]:
import pandas as pd
from torch.utils.data.dataset import Dataset
import csv
from nltk.tokenize import sent_tokenize, word_tokenize
import numpy as np

In [None]:
# Load the competition training table from the read-only Kaggle input folder.
df = pd.read_csv(filepath_or_buffer="../input/commonlitreadabilityprize/train.csv")

In [None]:
# Hold out everything from position 2500 onwards for validation.
# `.loc` slices include BOTH end labels, so `df.loc[:2500]` / `df.loc[2500:]`
# wrote row 2500 into both files (a train/validation leak).  `.iloc` slices are
# half-open, which gives two disjoint files; the validation file is unchanged.
df.iloc[:2500, :].to_csv('train.csv')
df.iloc[2500:, :].to_csv('test.csv')

In [None]:
class MyDataset(Dataset):
    """Dataset that turns each excerpt into a fixed-size grid of vocabulary indices.

    Each text is lower-cased, split into sentences and then words with NLTK, and
    every word is replaced by its row number in the vocabulary file (``-1`` when
    the word is not in the vocabulary).  ``__getitem__`` pads / truncates that
    nested list to ``(max_length_sentences, max_length_word)`` and shifts all
    indices by one, so ``0`` stands for both padding and unknown words.
    """

    def __init__(self, data_path, dict_path, max_length_sentences=30, max_length_word=35):
        """Read the data and pre-encode every document.

        Args:
            data_path: CSV file with an ``excerpt`` column and, optionally, a
                ``target`` column.
            dict_path: space-separated embedding file; only its first column
                (the words) is read here.
            max_length_sentences: number of sentence rows in each returned sample.
            max_length_word: number of word columns in each returned sample.
        """
        super(MyDataset, self).__init__()

        df = pd.read_csv(data_path)

        self.texts = df['excerpt'].str.lower().values
        try:
            self.labels = df['target'].values
        except KeyError:
            # No target column (unlabelled data): use row numbers as placeholders
            # so that __len__ and __getitem__ keep working.
            self.labels = list(range(len(self.texts)))
        vocabulary = pd.read_csv(filepath_or_buffer=dict_path, header=None, sep=" ", quoting=csv.QUOTE_NONE,
                                 usecols=[0]).values
        self.dict = [word[0] for word in vocabulary]
        self.max_length_sentences = max_length_sentences
        self.max_length_word = max_length_word
        self.num_classes = 1

        # A hash map makes each lookup O(1); scanning the list with `in` and
        # `.index` was linear in the vocabulary size for every single token.
        # setdefault keeps the first occurrence, exactly like list.index did.
        word_to_index = {}
        for position, word in enumerate(self.dict):
            word_to_index.setdefault(word, position)

        self.document_encode = [
            [
                [word_to_index.get(word, -1) for word in word_tokenize(text=sentence)]
                for sentence in sent_tokenize(text=tx)
            ]
            for tx in self.texts
        ]

    def __len__(self):
        """Return the number of samples (one per label)."""
        return len(self.labels)

    def __getitem__(self, index):
        """Return ``(encoded, label)`` for one document.

        ``encoded`` is an ``int64`` array of shape
        ``(max_length_sentences, max_length_word)``.  The cached encoding in
        ``self.document_encode`` is left untouched (it used to be padded in
        place on every access).
        """
        label = self.labels[index]

        # Start from all zeros: after the +1 shift, 0 is the padding / unknown id.
        encoded = np.zeros((self.max_length_sentences, self.max_length_word), dtype=np.int64)
        sentences = self.document_encode[index][:self.max_length_sentences]
        for row, sentence in enumerate(sentences):
            tokens = sentence[:self.max_length_word]
            encoded[row, :len(tokens)] = np.asarray(tokens, dtype=np.int64) + 1

        return encoded, label

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import sys
import csv
csv.field_size_limit(sys.maxsize)
from nltk.tokenize import sent_tokenize, word_tokenize
from sklearn import metrics
import numpy as np


def matrix_mul(input, weight, bias=False):
    """Apply ``tanh(slice @ weight [+ bias])`` to every 2-D slice of ``input``.

    Args:
        input: iterable of 2-D tensors (in this file a 3-D tensor iterated
            along its first axis).
        weight: 2-D tensor multiplied on the right of every slice.
        bias: added to each product only when it is an ``nn.Parameter`` whose
            second dimension matches the product's; any other value (the
            default ``False`` included) means "no bias".

    Returns:
        The per-slice results stacked along a new first axis.  When ``weight``
        has a single output column, that trailing axis of size 1 is dropped.
    """
    feature_list = []
    for feature in input:
        feature = torch.mm(feature, weight)
        if isinstance(bias, torch.nn.parameter.Parameter):
            feature = feature + bias.expand(feature.size()[0], bias.size()[1])
        feature_list.append(torch.tanh(feature).unsqueeze(0))

    # Squeeze ONLY the last axis.  A bare squeeze() also removed a batch or
    # sequence axis of size 1, which broke the callers' permute(1, 0) whenever
    # a batch held a single sample.
    return torch.cat(feature_list, 0).squeeze(-1)

def element_wise_mul(input1, input2):
    """Scale each slice of ``input1`` by the matching weights and sum the slices.

    The two arguments are walked in lockstep.  For every pair, the 1-D weight
    vector gets a trailing axis, is expanded to the shape of the 2-D slice and
    multiplied in element-wise.  The products are summed over the slice axis and
    the result is returned with a leading axis of size 1.
    """
    scaled_slices = [
        values * weights.unsqueeze(1).expand_as(values)
        for values, weights in zip(input1, input2)
    ]
    stacked = torch.stack(scaled_slices, 0)

    return stacked.sum(0, keepdim=True)


class SentAttNet(nn.Module):
    """Sentence-level encoder: bidirectional GRU, attention pooling, linear head."""

    def __init__(self, sent_hidden_size=50, word_hidden_size=50, num_classes=14):
        """Create the GRU, the attention parameters and the output layer.

        Args:
            sent_hidden_size: hidden size of each GRU direction.
            word_hidden_size: hidden size of the word-level encoder; the GRU
                input width is ``2 * word_hidden_size``.
            num_classes: width of the final linear layer's output.
        """
        super(SentAttNet, self).__init__()

        self.sent_weight = nn.Parameter(torch.empty(2 * sent_hidden_size, 2 * sent_hidden_size))
        self.sent_bias = nn.Parameter(torch.empty(1, 2 * sent_hidden_size))
        self.context_weight = nn.Parameter(torch.empty(2 * sent_hidden_size, 1))

        self.gru = nn.GRU(2 * word_hidden_size, sent_hidden_size, bidirectional=True)
        self.fc = nn.Linear(2 * sent_hidden_size, num_classes)
        self._create_weights(mean=0.0, std=0.05)

    def _create_weights(self, mean=0.0, std=0.05):
        """Initialise the attention parameters (normal weights, zero bias)."""
        self.sent_weight.data.normal_(mean, std)
        self.context_weight.data.normal_(mean, std)
        # The bias was previously left as uninitialised memory, which can hold
        # arbitrary values (including NaN/inf).  Zeroing it draws no random numbers.
        self.sent_bias.data.zero_()

    def forward(self, input, hidden_state):
        """Encode a sequence of sentence vectors into one prediction per sample.

        Args:
            input: tensor fed to the (non batch-first) GRU, i.e.
                ``(sentences, batch, 2 * word_hidden_size)``.
            hidden_state: initial GRU hidden state.

        Returns:
            ``(output, h_output)``: the linear head's output and the final GRU
            hidden state.
        """
        f_output, h_output = self.gru(input, hidden_state)
        output = matrix_mul(f_output, self.sent_weight, self.sent_bias)
        output = matrix_mul(output, self.context_weight).permute(1, 0)
        # Normalise over the sentence axis.  dim=1 is what the deprecated
        # implicit-dim call selected for this 2-D tensor.
        output = F.softmax(output, dim=1)
        output = element_wise_mul(f_output, output.permute(1, 0)).squeeze(0)
        output = self.fc(output)

        return output, h_output


class WordAttNet(nn.Module):
    """Word-level encoder: pretrained embeddings, bidirectional GRU, attention pooling."""

    def __init__(self, word2vec_path, hidden_size=50):
        """Load the embedding table and create the GRU and attention parameters.

        Args:
            word2vec_path: space-separated embedding file; the first column is
                the word and the remaining columns are its vector.
            hidden_size: hidden size of each GRU direction.
        """
        super(WordAttNet, self).__init__()
        vectors = pd.read_csv(filepath_or_buffer=word2vec_path, header=None, sep=" ",
                              quoting=csv.QUOTE_NONE).values[:, 1:]
        embed_size = vectors.shape[1]
        # Row 0 is an all-zero vector shared by padding and unknown words
        # (MyDataset shifts every vocabulary index up by one).
        unknown_word = np.zeros((1, embed_size))
        # `np.float` no longer exists in current NumPy.  float32 is used because
        # forward() casts the looked-up vectors to float32 anyway.
        embedding_matrix = torch.from_numpy(
            np.concatenate([unknown_word, vectors], axis=0).astype(np.float32))

        self.word_weight = nn.Parameter(torch.empty(2 * hidden_size, 2 * hidden_size))
        self.word_bias = nn.Parameter(torch.empty(1, 2 * hidden_size))
        self.context_weight = nn.Parameter(torch.empty(2 * hidden_size, 1))

        # from_pretrained is a classmethod; building (and randomly initialising)
        # a full-size throwaway Embedding just to call it was wasted work.
        # NOTE: skipping that throwaway init also changes which random numbers
        # the GRU below receives under a fixed seed.
        self.lookup = nn.Embedding.from_pretrained(embedding_matrix)
        self.gru = nn.GRU(embed_size, hidden_size, bidirectional=True)
        self._create_weights(mean=0.0, std=0.05)

    def _create_weights(self, mean=0.0, std=0.05):
        """Initialise the attention parameters (normal weights, zero bias)."""
        self.word_weight.data.normal_(mean, std)
        self.context_weight.data.normal_(mean, std)
        # Previously left as uninitialised memory; zeroing draws no random numbers.
        self.word_bias.data.zero_()

    def forward(self, input, hidden_state):
        """Encode one sentence position for every sample in the batch.

        Args:
            input: integer index tensor; after the embedding lookup it is fed
                to the (non batch-first) GRU, i.e. ``(words, batch)``.
            hidden_state: initial GRU hidden state.

        Returns:
            ``(output, h_output)``: the attention-pooled GRU features with a
            leading axis of size 1, and the final GRU hidden state.
        """
        output = self.lookup(input)
        f_output, h_output = self.gru(output.float(), hidden_state)  # feature output and hidden state output
        output = matrix_mul(f_output, self.word_weight, self.word_bias)
        output = matrix_mul(output, self.context_weight).permute(1, 0)
        # Normalise over the word axis.  dim=1 is what the deprecated
        # implicit-dim call selected for this 2-D tensor.
        output = F.softmax(output, dim=1)
        output = element_wise_mul(f_output, output.permute(1, 0))

        return output, h_output


class HierAttNet(nn.Module):
    """Hierarchical attention network: a word-level encoder feeding a sentence-level one.

    The GRU hidden states are kept on the module between calls; call
    ``_init_hidden_state`` to reset them to zeros.
    """

    def __init__(self, word_hidden_size, sent_hidden_size, batch_size, num_classes, pretrained_word2vec_path,
                 max_sent_length, max_word_length):
        """Build both sub-networks and zero-initialise the hidden states.

        Args:
            word_hidden_size: per-direction hidden size of the word-level GRU.
            sent_hidden_size: per-direction hidden size of the sentence-level GRU.
            batch_size: default batch size used when resetting hidden states.
            num_classes: output width of the sentence-level network.
            pretrained_word2vec_path: embedding file handed to ``WordAttNet``.
            max_sent_length: stored on the instance; not used by this class itself.
            max_word_length: stored on the instance; not used by this class itself.
        """
        super(HierAttNet, self).__init__()
        self.batch_size = batch_size
        self.word_hidden_size = word_hidden_size
        self.sent_hidden_size = sent_hidden_size
        self.max_sent_length = max_sent_length
        self.max_word_length = max_word_length
        self.word_att_net = WordAttNet(pretrained_word2vec_path, word_hidden_size)
        self.sent_att_net = SentAttNet(sent_hidden_size, word_hidden_size, num_classes)
        self._init_hidden_state()

    def _init_hidden_state(self, last_batch_size=None):
        """Reset both hidden states to zeros.

        Args:
            last_batch_size: batch size to allocate for; a falsy value
                (``None`` or ``0``) falls back to ``self.batch_size``.
        """
        if last_batch_size:
            batch_size = last_batch_size
        else:
            batch_size = self.batch_size
        # Same placement rule as before: on the GPU whenever one is available.
        device = "cuda" if torch.cuda.is_available() else "cpu"
        self.word_hidden_state = torch.zeros(2, batch_size, self.word_hidden_size, device=device)
        self.sent_hidden_state = torch.zeros(2, batch_size, self.sent_hidden_size, device=device)

    def forward(self, input):
        """Run the network on a batch of encoded documents.

        Args:
            input: index tensor of shape ``(batch, sentences, words)``.

        Returns:
            The sentence-level network's output, one row per document.
        """
        # The hidden states must match the incoming batch in size and device.
        # These two guards only change cases that used to raise inside the GRU
        # (e.g. a smaller final batch without a manual reset).
        batch_size = input.size(0)
        if self.word_hidden_state.size(1) != batch_size or self.sent_hidden_state.size(1) != batch_size:
            self._init_hidden_state(batch_size)
        self.word_hidden_state = self.word_hidden_state.to(input.device)
        self.sent_hidden_state = self.sent_hidden_state.to(input.device)

        output_list = []
        # Iterate sentence position by sentence position; the word-level hidden
        # state is carried over from one sentence position to the next.
        input = input.permute(1, 0, 2)
        for i in input:
            output, self.word_hidden_state = self.word_att_net(i.permute(1, 0), self.word_hidden_state)
            output_list.append(output)
        output = torch.cat(output_list, 0)
        output, self.sent_hidden_state = self.sent_att_net(output, self.sent_hidden_state)

        return output

In [None]:
# Seed unconditionally.  torch.manual_seed covers the CPU generator and every
# CUDA device; the previous code seeded ONLY the CUDA generator when a GPU was
# present, so anything drawn from the CPU generator (the model's weights are
# created before it is moved to the GPU) was left unseeded.
torch.manual_seed(123)

# Keyword arguments forwarded to DataLoader for the training split.
training_params = {"batch_size": 128 * 4,
                   "shuffle": True,
                   "drop_last": True}
# Keyword arguments forwarded to DataLoader for evaluation / inference.
test_params = {"batch_size": 128 * 4,
               "shuffle": False,
               "drop_last": False}

In [None]:
from torch.utils.data import DataLoader

# Every document is padded / truncated to 10 sentences of 15 words each.
max_word_length = 15
max_sent_length = 10

# Training split and its loader.
training_set = MyDataset(
    data_path="train.csv",
    dict_path='../input/nlpword2vecembeddingspretrained/glove.6B.50d.txt',
    max_length_sentences=max_sent_length,
    max_length_word=max_word_length,
)
training_generator = DataLoader(dataset=training_set, **training_params)

# Held-out split and its loader.
test_set = MyDataset(
    data_path="test.csv",
    dict_path='../input/nlpword2vecembeddingspretrained/glove.6B.50d.txt',
    max_length_sentences=max_sent_length,
    max_length_word=max_word_length,
)
test_generator = DataLoader(dataset=test_set, **test_params)

In [None]:
# Peek at the features of the first held-out batch.  Python 3 iterators are
# advanced with the builtin next(); they expose `__next__`, not `.next()`.
next(iter(test_generator))[0]#.shape

In [None]:
# Build the network; batch_size 512 matches the loaders' 128*4.
model = HierAttNet(
    word_hidden_size=50,
    sent_hidden_size=50,
    batch_size=512,
    num_classes=1,
    pretrained_word2vec_path='../input/nlpword2vecembeddingspretrained/glove.6B.50d.txt',
    max_sent_length=max_sent_length,
    max_word_length=max_word_length,
)

In [None]:
# Move the model to the GPU (when there is one) before building the optimizer.
if torch.cuda.is_available():
    model.cuda()

# Regression objective; only parameters with requires_grad set are optimised.
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(
    (param for param in model.parameters() if param.requires_grad),
    lr=1e-4,
)

# Best validation loss seen so far and the epoch that produced it.
best_loss, best_epoch = 1e5, 0

In [None]:
num_epoches = 500
test_interval = 1   # evaluate on the held-out split every `test_interval` epochs
es_min_delta = 0    # minimum improvement that counts as a new best loss
es_patience = 15    # epochs without improvement tolerated before stopping (0 disables)

model.train()
num_iter_per_epoch = len(training_generator)
for epoch in range(num_epoches):
    # `iteration` rather than `iter`, so the builtin is not shadowed.
    for iteration, (feature, label) in enumerate(training_generator):
        if torch.cuda.is_available():
            feature = feature.cuda()
            label = label.cuda()
        optimizer.zero_grad()
        # Fresh zero hidden states for every batch.
        model._init_hidden_state()
        predictions = model(feature)
        loss = criterion(predictions.view(-1), label.float())
        loss.backward()
        optimizer.step()
        # The criterion is MSE; the square root (RMSE) is what gets reported.
        print("Epoch: {}/{}, Iteration: {}/{}, Lr: {}, Loss: {}".format(
            epoch + 1,
            num_epoches,
            iteration + 1,
            num_iter_per_epoch,
            optimizer.param_groups[0]['lr'],
            loss.item() ** 0.5))
    if epoch % test_interval == 0:
        model.eval()
        loss_ls = []
        te_label_ls = []
        te_pred_ls = []
        for te_feature, te_label in test_generator:
            num_sample = len(te_label)
            if torch.cuda.is_available():
                te_feature = te_feature.cuda()
                te_label = te_label.cuda()
            with torch.no_grad():
                model._init_hidden_state(num_sample)
                te_predictions = model(te_feature)
                # Cast the labels exactly as the training loss does, so both
                # operands of the criterion share a dtype.
                te_loss = criterion(te_predictions.view(-1), te_label.float())
            # Weight by batch size so the final figure is a per-sample mean
            # even when the last batch is smaller.
            loss_ls.append(te_loss.item() * num_sample)
            te_label_ls.extend(te_label.cpu().tolist())
            te_pred_ls.append(te_predictions.cpu())
        te_loss = sum(loss_ls) / len(test_set)
        te_pred = torch.cat(te_pred_ls, 0)
        te_label = np.array(te_label_ls)

        print("Epoch: {}/{}, Lr: {}, Loss: {}".format(
            epoch + 1,
            num_epoches,
            optimizer.param_groups[0]['lr'],
            te_loss ** 0.5))
        model.train()
        if te_loss + es_min_delta < best_loss:
            best_loss = te_loss
            best_epoch = epoch
            torch.save(model, "whole_model_han")

        # Early stopping
        if epoch - best_epoch > es_patience > 0:
            # Report the best loss; the current epoch's loss was printed here
            # before, contradicting the message.
            print("Stop training at epoch {}. The lowest loss achieved is {}".format(epoch, best_loss))
            break

In [None]:
#list(model.parameters())

In [None]:
# Competition test excerpts, encoded exactly like the training data.
f = MyDataset(
    data_path="../input/commonlitreadabilityprize/test.csv",
    dict_path='../input/nlpword2vecembeddingspretrained/glove.6B.50d.txt',
    max_length_sentences=max_sent_length,
    max_length_word=max_word_length,
)
f_generator = DataLoader(dataset=f, **test_params)

In [None]:
model.eval()

# One flat NumPy array of predictions per batch, in loader order.
te_pred_ls = []

with torch.no_grad():
    for te_feature, _ in f_generator:
        if torch.cuda.is_available():
            te_feature = te_feature.cuda()
        # Size the hidden states to this batch; the last one may be smaller.
        model._init_hidden_state(te_feature.shape[0])
        te_predictions = model(te_feature)
        flat_predictions = te_predictions.view(-1).clone().cpu()
        te_pred_ls.append(flat_predictions.numpy())

In [None]:
# Display the id column of the competition test file.
pd.read_csv(filepath_or_buffer="../input/commonlitreadabilityprize/test.csv").loc[:, ['id']]

In [None]:
# Pair every test id with its prediction. Both follow the row order of the
# test file, because the inference loader does not shuffle.
pre = pd.DataFrame(
    data={
        'id': pd.read_csv("../input/commonlitreadabilityprize/test.csv")['id'].values,
        'target': np.concatenate(te_pred_ls, axis=0),
    }
)

pre

In [None]:
# Write the submission file without the DataFrame index column.
pre.to_csv(path_or_buf='submission.csv', index=False)