# Train RNN_RNN

In [3]:
import os

import pandas as pd

import torch
import torch.nn as nn
from torch.nn.utils import clip_grad_norm_

from tqdm import tqdm

from utils.GloveMgr import GloveMgr
from utils.Dataset import Dataset
from utils.DataLoader import DataLoader
from utils.preprocess_df import preprocess_df

#from models.RNN_RNN import RNN_RNN

from time import time

In [4]:
vocab_size = 150000
batch_size = 32
epochs = 5
learning_rate = 1e-3
model_name = "RNN_RNN"
average_proportion_of_sentences_per_document = 0.2670278281534701
average_number_of_sentences_per_document = 6.061850780738518

In [5]:
# Check if a GPU is available
if torch.cuda.is_available():
    # Display the number of available GPUs
    print(f"Number of available GPUs: {torch.cuda.device_count()}")
    # Display the name of each GPU
    for i in range(torch.cuda.device_count()):
        print(f"GPU {i}: {torch.cuda.get_device_name(i)}")
else:
    print("No GPU available.")

Number of available GPUs: 1
GPU 0: NVIDIA GeForce GTX 1650


In [6]:
if torch.cuda.is_available():  
  dev = "cuda:0" 
else:  
  dev = "cpu" 

device = torch.device(dev)
device

device(type='cuda', index=0)

In [7]:
glovemgr = GloveMgr("./data/glove.6B/glove.6B.100d.txt", vocab_size=vocab_size)

In [8]:
train_dataset = Dataset(preprocess_df(pd.read_json("./data/train.json"), glovemgr=glovemgr, is_sep_n=True, remove_stop_word=True, stemming=False, trunc_sent=50, padding_sent=50, trunc_doc=100))
train_iter = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=False)

In [9]:
val_dataset = Dataset(preprocess_df(pd.read_json("./data/val.json"), glovemgr=glovemgr, is_sep_n=True, remove_stop_word=True, stemming=False, trunc_sent=50, padding_sent=50, trunc_doc=100))
val_iter = DataLoader(dataset=val_dataset, batch_size=batch_size, shuffle=False)

In [10]:
from models.BasicModel import BasicModel
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable

class RNN_RNN(BasicModel):
    def __init__(self, device, vocab_size, word_embed = None):
        super(RNN_RNN, self).__init__()

        self.device = device

        self.word_embedding = nn.Embedding(vocab_size+2, 100, padding_idx=0)
        # Load word embedding if specified
        if word_embed is not None:
            self.word_embedding = torch.nn.Embedding.from_pretrained(torch.from_numpy(word_embed).float())

        # 100 : word2vec embedding size
        self.word_GRU = nn.GRU(input_size = 100, hidden_size = 200, batch_first = True, bidirectional = True)
        self.sent_GRU = nn.GRU(input_size = 2*200, hidden_size=200, batch_first = True, bidirectional = True)

        # 10: relative position range size, with segment size = 10
        self.rel_pos_emb = nn.Embedding(11, 100)
        self.abs_pos_emb = nn.Embedding(100, 100)

        self.Wdoc = nn.Linear(2*200,2*200)
        self.bias_doc = nn.Parameter(torch.FloatTensor(2*200).uniform_(-0.1,0.1))

        self.Wcontent = nn.Linear(2*200,1,bias=False)
        self.Wsalience = nn.Bilinear(2*200,2*200,1,bias=False)
        self.Wnovelty = nn.Bilinear(2*200,2*200,1,bias=False)
        self.Wabs_pos = nn.Linear(100,1,bias=False)
        self.Wrel_pos = nn.Linear(100,1,bias=False)
        self.bias = nn.Parameter(torch.FloatTensor(1).uniform_(-0.1,0.1))

    def avg_pool1d(self,x,seq_lens):
        out = []
        for index,t in enumerate(x):
            if seq_lens[index] == 0:
                t = t[:1]
            else:
                t = t[:seq_lens[index],:]
            t = torch.t(t).unsqueeze(0)
            out.append(F.avg_pool1d(t,t.size(2)))
        
        out = torch.cat(out).squeeze(2)
        return out

    def forward(self, arr_x, doc_lens):
        probs = []

        sent_lens = torch.sum(torch.sign(arr_x),dim=1).data
        arr_x = self.word_embedding(arr_x)
        arr_x = self.word_GRU(arr_x)[0]
        arr_x = self.avg_pool1d(arr_x, sent_lens)

        # for each document, compute probabilities
        doc_i = 0
        doc_line_i = 0
        while doc_i < len(doc_lens):
            doc_len = doc_lens[doc_i]
            x = self.sent_GRU(arr_x[doc_line_i:doc_line_i+doc_len,:])[0]
            d = x
            d = d.unsqueeze(0)
            d = self.avg_pool1d(d, [d[0].shape[0]])
            d = torch.tanh(self.Wdoc(d[0]) + self.bias_doc).unsqueeze(0)
            prob_doc = []
            s = torch.zeros(1,2*200)
            s = s.to(self.device)
            for position, h in enumerate(x):
                h = h.view(1, -1) # resize
                # Compute position embedding
                abs_pos = torch.LongTensor([[position]])
                abs_pos = abs_pos.to(self.device)
                abs_pos = self.abs_pos_emb(abs_pos).squeeze(0)

                # Compute relative position embedding
                rel_pos = int(round(position / 10))
                rel_pos = torch.LongTensor([[rel_pos]])
                rel_pos = rel_pos.to(self.device)
                rel_pos = self.rel_pos_emb(rel_pos).squeeze(0)

                # Compute proba
                content = self.Wcontent(h)
                salience = self.Wsalience(h, d)
                novelty = -1 * self.Wnovelty(h,torch.tanh(s))
                ap = self.Wabs_pos(abs_pos)
                rp = self.Wrel_pos(rel_pos)
                prob = torch.sigmoid(content+salience+novelty+ap+rp+self.bias)

                prob_doc.append(prob)

                s = s + torch.mm(prob,h)

            probs.append(torch.tensor(prob_doc, requires_grad=True))

            doc_i += 1
            doc_line_i += doc_len

        probs = torch.cat(probs)
        probs = probs.to(self.device)
        return probs

In [11]:
model = RNN_RNN(device=device, vocab_size=vocab_size, word_embed=glovemgr.getEmbeddings())

In [12]:
model.to(device)

RNN_RNN(
  (word_embedding): Embedding(150002, 100)
  (word_GRU): GRU(100, 200, batch_first=True, bidirectional=True)
  (sent_GRU): GRU(400, 200, batch_first=True, bidirectional=True)
  (rel_pos_emb): Embedding(11, 100)
  (abs_pos_emb): Embedding(100, 100)
  (Wdoc): Linear(in_features=400, out_features=400, bias=True)
  (Wcontent): Linear(in_features=400, out_features=1, bias=False)
  (Wsalience): Bilinear(in1_features=400, in2_features=400, out_features=1, bias=False)
  (Wnovelty): Bilinear(in1_features=400, in2_features=400, out_features=1, bias=False)
  (Wabs_pos): Linear(in_features=100, out_features=1, bias=False)
  (Wrel_pos): Linear(in_features=100, out_features=1, bias=False)
)

In [13]:
loss_fn = nn.BCELoss()
optimizer = torch.optim.Adadelta(model.parameters(),lr=learning_rate)

mae_fn = nn.L1Loss()

In [14]:
if not os.path.exists("./checkpoints"):
    os.makedirs("./checkpoints")

In [28]:
import math
import numpy as np

# Compute the accuracy
def accuracy_prop_sent_per_doc_fn(probs, targets, doc_lens):
    result = []
    doc_i = 0
    doc_len = 0
    doc_line_i = 0
    while doc_i < len(doc_lens):
        doc_len = doc_lens[doc_i]
        doc_prob = np.array([probs[i] for i in range(len(probs))])
        n = math.ceil(average_proportion_of_sentences_per_document * len(doc_prob))
        for i in range(n):
            idx = np.argmax(doc_prob)
            doc_prob[idx] = -1
        doc_prob[doc_prob >= 0] = 0
        doc_prob[doc_prob == -1] = 1
        result = result + [doc_prob[i] for i in range(len(probs))]
        doc_i += 1
        doc_line_i += doc_len
    return sum([result[i] == targets[i] for i in range(len(targets))]) / len(targets)

In [29]:
import math
import numpy as np

# Compute the accuracy
def accuracy_nb_sent_per_doc_fn(probs, targets, doc_lens):
    result = []
    doc_i = 0
    doc_len = 0
    doc_line_i = 0
    while doc_i < len(doc_lens):
        doc_len = doc_lens[doc_i]
        doc_prob = np.array([probs[i] for i in range(len(probs))])
        n = math.ceil(average_number_of_sentences_per_document)
        for i in range(n):
            idx = np.argmax(doc_prob)
            doc_prob[idx] = -1
        doc_prob[doc_prob >= 0] = 0
        doc_prob[doc_prob == -1] = 1
        result = result + [doc_prob[i] for i in range(len(probs))]
        doc_i += 1
        doc_line_i += doc_len
    return sum([result[i] == targets[i] for i in range(len(targets))]) / len(targets)

In [31]:
t1 = time()
for epoch in range(1, epochs+1):
    # train 1 epoch
    model.train()
    nb_batch_train = 0
    total_train_loss = 0
    total_train_mae = 0
    total_train_acc = 0
    with tqdm(train_iter, unit="batch", total=len(train_iter)) as tepoch:
        for batch in tepoch:
            tepoch.set_description(f"Epoch {epoch}")
            features = []
            doc_lens = []

            for j in range(batch_size):
                doc_lens.append(len(batch[j]["doc"]))
                features = features + batch[j]["doc"]
            
            features = torch.LongTensor(features).to(device)

            targets = [torch.FloatTensor(batch[j]["labels"]) for j in range(batch_size)]
            targets = torch.cat(targets)
            targets = targets.to(device)
            
            probs = model(features, doc_lens)
            
            loss = loss_fn(probs, targets)
            optimizer.zero_grad()
            loss.backward()
            clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            nb_batch_train += 1
            total_train_loss += loss
            total_train_mae += mae_fn(probs, targets)
            total_train_acc += accuracy_prop_sent_per_doc_fn(probs=probs.cpu().detach().numpy(), targets=targets.cpu().detach().numpy(), doc_lens=doc_lens)
            tepoch.set_postfix(loss=(total_train_loss/nb_batch_train).item(), mae=(total_train_mae/nb_batch_train).item(), accuracy=total_train_acc/nb_batch_train)
    # Save model
    model.save("./checkpoints/RNN_RNN-" + str(epoch) + ".pt")
    # Show train and val score
    model.eval()
    nb_batch_val = 0
    total_val_loss = 0
    total_val_mae = 0
    total_val_acc = 0
    for i,batch in enumerate(val_iter):
        features = []
        doc_lens = []

        for j in range(batch_size):
            doc_lens.append(len(batch[j]["doc"]))
            features = features + batch[j]["doc"]
            
        features = torch.LongTensor(features).to(device)

        targets = [torch.FloatTensor(batch[j]["labels"]) for j in range(batch_size)]
        targets = torch.cat(targets)
        targets = targets.to(device)
        
        probs = model(features, doc_lens)
        loss = loss_fn(probs, targets)
        nb_batch_val += 1
        total_val_loss += loss
        total_val_mae += mae_fn(probs, targets)
        total_val_acc += accuracy_prop_sent_per_doc_fn(probs=probs.cpu().detach().numpy(), targets=targets.cpu().detach().numpy(), doc_lens=doc_lens)
    print("Epoch {} : train loss = {:.3f}, val loss = {:.3f}, train mae = {:.3f}, val mae = {:.3f}, train accuracy = {:.3f}, val accuracy = {:.3f}".format(epoch, total_train_loss / nb_batch_train, total_val_loss / nb_batch_val, total_train_mae / nb_batch_train, total_val_mae / nb_batch_val, total_train_acc / nb_batch_train, total_val_acc / nb_batch_val))

t2 = time()
print("Training duration =", t2-t1)



Epoch 1: 100%|██████████| 32/32 [00:21<00:00,  1.47batch/s, accuracy=0.757, loss=0.693, mae=0.458]


Epoch 1 : train loss = 0.693, val loss = 0.692, train mae = 0.458, vale mae = 0.457


Epoch 2: 100%|██████████| 32/32 [00:22<00:00,  1.45batch/s, accuracy=0.757, loss=0.692, mae=0.457]


KeyboardInterrupt: 

In [None]:
tmp = train_iter.__getitem__(2463)[0]["doc"]
for e in tmp:
  if (e[0] == 0):
    print("vide")

In [None]:
len(tmp)

25