# Train RNN_RNN

In [1]:
import os

import pandas as pd

import torch
import torch.nn as nn
from torch.autograd import Variable
from torch.nn.utils import clip_grad_norm

from utils.GloveMgr import GloveMgr
from utils.Dataset import Dataset
from utils.DataLoader import DataLoader
from utils.preprocess_df import preprocess_df

#from models.RNN_RNN import RNN_RNN

from time import time

In [2]:
vocab_size = 150000
batch_size = 32
epochs = 5
learning_rate = 1e-3
model_name = "RNN_RNN"

In [3]:
# Check if a GPU is available
if torch.cuda.is_available():
    # Display the number of available GPUs
    print(f"Number of available GPUs: {torch.cuda.device_count()}")
    # Display the name of each GPU
    for i in range(torch.cuda.device_count()):
        print(f"GPU {i}: {torch.cuda.get_device_name(i)}")
else:
    print("No GPU available.")

Number of available GPUs: 1
GPU 0: NVIDIA GeForce GTX 1650


In [4]:
if torch.cuda.is_available():  
  dev = "cuda:0" 
else:  
  dev = "cpu" 

device = torch.device(dev)
device

device(type='cuda', index=0)

In [5]:
glovemgr = GloveMgr("./data/glove.6B/glove.6B.100d.txt", vocab_size=vocab_size)

In [6]:
train_dataset = Dataset(preprocess_df(pd.read_json("./data/train.json"), glovemgr=glovemgr, is_sep_n=True, remove_stop_word=True, stemming=False, trunc=50, padding=50))
train_iter = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)

In [7]:
val_dataset = Dataset(preprocess_df(pd.read_json("./data/val.json"), glovemgr=glovemgr, is_sep_n=True, remove_stop_word=True, stemming=False, trunc=50, padding=50))
val_iter = DataLoader(dataset=val_dataset, batch_size=batch_size, shuffle=False)

In [8]:
from models.BasicModel import BasicModel
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable

class RNN_RNN(BasicModel):
    def __init__(self, device, vocab_size, word_embed = None):
        super(RNN_RNN, self).__init__()

        self.device = device

        self.word_embedding = nn.Embedding(vocab_size, 100, padding_idx=0)
        # Load word embedding if specified
        if word_embed is not None:
            self.word_embedding = torch.nn.Embedding.from_pretrained(torch.from_numpy(word_embed).float())

        # 100 : word2vec embedding size
        self.word_GRU = nn.GRU(input_size = 100, hidden_size = 200, batch_first = True, bidirectional = True)
        self.sent_GRU = nn.GRU(input_size = 2*200, hidden_size=200, batch_first = True, bidirectional = True)

        # 10: relative position range size, with segment size = 10
        self.rel_pos_emb = nn.Embedding(10, 100)
        self.abs_pos_emb = nn.Embedding(100, 100)

        self.Wdoc = nn.Linear(2*200,2*200)
        self.bias_doc = nn.Parameter(torch.FloatTensor(2*200).uniform_(-0.1,0.1))

        self.Wcontent = nn.Linear(2*200,1,bias=False)
        self.Wsalience = nn.Bilinear(2*200,2*200,1,bias=False)
        self.Wnovelty = nn.Bilinear(2*200,2*200,1,bias=False)
        self.Wabs_pos = nn.Linear(100,1,bias=False)
        self.Wrel_pos = nn.Linear(100,1,bias=False)
        self.bias = nn.Parameter(torch.FloatTensor(1).uniform_(-0.1,0.1))

    def avg_pool1d(self,x,seq_lens):
        out = []
        for index,t in enumerate(x):
            if seq_lens[index] == 0:
                t = t[:1]
            else:
                t = t[:seq_lens[index],:]
            t = torch.t(t).unsqueeze(0)
            out.append(F.avg_pool1d(t,t.size(2)))
        
        out = torch.cat(out).squeeze(2)
        return out

    def forward(self, arr_x):
        probs = []

        # for each document, compute probabilities
        for doc in arr_x:
            sent_lens = torch.sum(torch.sign(doc),dim=1).data
            x = self.word_embedding(doc)
            x = self.word_GRU(x)[0]
            x = self.avg_pool1d(x, sent_lens)
            x = self.sent_GRU(x)[0]
            d = x
            d = d.unsqueeze(0)
            d = self.avg_pool1d(d, [d[0].shape[0]])
            d = torch.tanh(self.Wdoc(d[0]) + self.bias_doc).unsqueeze(0)
            prob_doc = []
            s = torch.zeros(1,2*200)
            s = s.to(self.device)
            for position, h in enumerate(x):
                h = h.view(1, -1) # resize
                # Compute position embedding
                abs_pos = Variable(torch.LongTensor([[position]]))
                abs_pos = abs_pos.to(self.device)
                abs_pos = self.abs_pos_emb(abs_pos).squeeze(0)

                # Compute relative position embedding
                rel_pos = int(round(position / 10))
                rel_pos = Variable(torch.LongTensor([[rel_pos]]))
                rel_pos = rel_pos.to(self.device)
                rel_pos = self.rel_pos_emb(rel_pos).squeeze(0)

                # Compute proba
                content = self.Wcontent(h)
                salience = self.Wsalience(h, d)
                novelty = -1 * self.Wnovelty(h,F.tanh(s))
                ap = self.Wabs_pos(abs_pos)
                rp = self.Wrel_pos(rel_pos)
                prob = torch.sigmoid(content+salience+novelty+ap+rp+self.bias)

                prob_doc.append(prob)

                s = s + torch.mm(prob,h)

            probs.append(torch.tensor(prob_doc, requires_grad=True))

        probs = torch.cat(probs)
        probs = probs.to(self.device)
        return probs

In [9]:
model = RNN_RNN(device=device, vocab_size=vocab_size, word_embed=glovemgr.getEmbeddings())

In [10]:
model.to(device)

RNN_RNN(
  (word_embedding): Embedding(150002, 100)
  (word_GRU): GRU(100, 200, batch_first=True, bidirectional=True)
  (sent_GRU): GRU(400, 200, batch_first=True, bidirectional=True)
  (rel_pos_emb): Embedding(10, 100)
  (abs_pos_emb): Embedding(100, 100)
  (Wdoc): Linear(in_features=400, out_features=400, bias=True)
  (Wcontent): Linear(in_features=400, out_features=1, bias=False)
  (Wsalience): Bilinear(in1_features=400, in2_features=400, out_features=1, bias=False)
  (Wnovelty): Bilinear(in1_features=400, in2_features=400, out_features=1, bias=False)
  (Wabs_pos): Linear(in_features=100, out_features=1, bias=False)
  (Wrel_pos): Linear(in_features=100, out_features=1, bias=False)
)

In [11]:
loss_fn = nn.BCELoss()
optimizer = torch.optim.Adadelta(model.parameters(),lr=learning_rate)
model.train()

RNN_RNN(
  (word_embedding): Embedding(150002, 100)
  (word_GRU): GRU(100, 200, batch_first=True, bidirectional=True)
  (sent_GRU): GRU(400, 200, batch_first=True, bidirectional=True)
  (rel_pos_emb): Embedding(10, 100)
  (abs_pos_emb): Embedding(100, 100)
  (Wdoc): Linear(in_features=400, out_features=400, bias=True)
  (Wcontent): Linear(in_features=400, out_features=1, bias=False)
  (Wsalience): Bilinear(in1_features=400, in2_features=400, out_features=1, bias=False)
  (Wnovelty): Bilinear(in1_features=400, in2_features=400, out_features=1, bias=False)
  (Wabs_pos): Linear(in_features=100, out_features=1, bias=False)
  (Wrel_pos): Linear(in_features=100, out_features=1, bias=False)
)

In [12]:
if not os.path.exists("./checkpoints"):
    os.makedirs("./checkpoints")

In [13]:
t1 = time() 
for epoch in range(1, epochs+1):
    for i,batch in enumerate(train_iter):
        print("batch ", i)
        features = [Variable(torch.LongTensor(batch[i]["doc"])).to(device) for i in range(batch_size)]
        targets = [Variable(torch.FloatTensor(batch[i]["labels"])) for i in range(batch_size)]
        targets = torch.cat(targets)
        targets = targets.to(device)
        
        probs = model(features)
        print(probs.shape, " ~", targets.shape)
        loss = loss_fn(probs, targets)
        optimizer.zero_grad()
        loss.backward()
        clip_grad_norm(model.parameters(), 1.0)
        optimizer.step()

t2 = time()
print("Training duration =", t2-t1)



batch  0




torch.Size([113])  ~ torch.Size([113])
batch  1


  clip_grad_norm(model.parameters(), 1.0)


torch.Size([137])  ~ torch.Size([137])
batch  2
torch.Size([66])  ~ torch.Size([66])
batch  3
torch.Size([126])  ~ torch.Size([126])
batch  4
torch.Size([78])  ~ torch.Size([78])
batch  5
torch.Size([103])  ~ torch.Size([103])
batch  6
torch.Size([115])  ~ torch.Size([115])
batch  7
torch.Size([103])  ~ torch.Size([103])
batch  8
torch.Size([85])  ~ torch.Size([85])
batch  9
torch.Size([112])  ~ torch.Size([112])
batch  10
torch.Size([64])  ~ torch.Size([64])
batch  11
torch.Size([89])  ~ torch.Size([89])
batch  12
torch.Size([91])  ~ torch.Size([91])
batch  13
torch.Size([92])  ~ torch.Size([92])
batch  14
torch.Size([114])  ~ torch.Size([114])
batch  15
torch.Size([118])  ~ torch.Size([118])
batch  16
torch.Size([135])  ~ torch.Size([135])
batch  17
torch.Size([71])  ~ torch.Size([71])
batch  18
torch.Size([86])  ~ torch.Size([86])
batch  19
torch.Size([152])  ~ torch.Size([152])
batch  20
torch.Size([124])  ~ torch.Size([124])
batch  21
torch.Size([126])  ~ torch.Size([126])
batch  

../aten/src/ATen/native/cuda/Indexing.cu:1088: indexSelectSmallIndex: block: [0,0,0], thread: [96,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
../aten/src/ATen/native/cuda/Indexing.cu:1088: indexSelectSmallIndex: block: [0,0,0], thread: [97,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
../aten/src/ATen/native/cuda/Indexing.cu:1088: indexSelectSmallIndex: block: [0,0,0], thread: [98,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
../aten/src/ATen/native/cuda/Indexing.cu:1088: indexSelectSmallIndex: block: [0,0,0], thread: [99,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
../aten/src/ATen/native/cuda/Indexing.cu:1088: indexSelectSmallIndex: block: [0,0,0], thread: [0,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
../aten/src/ATen/native/cuda/Indexing.cu:1088: indexSelectSmallIndex: block: [0,0,0], thread: [1,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
../aten/src/ATen/native/cuda/Indexing.cu:1088: indexSelectSmallIndex: block: [0,0,0], thread: [2

RuntimeError: CUDA error: CUBLAS_STATUS_EXECUTION_FAILED when calling `cublasSgemm( handle, opa, opb, m, n, k, &alpha, a, lda, b, ldb, &beta, c, ldc)`