In [1]:
emb = {}
with open('./data/glove.840B.300d/glove.840B.300d.txt') as f:
    for line in f:
        line = line.split()
        word = ' '.join(line[:-300])
        temp = []
        for num in line[-300:]:
            temp.append(float(num))
        emb[word] = temp

In [36]:
mapping = {'id2w':['pad'],'w2id':{'pad':0}}
vocab = [[0]*300]
for word in emb:
    mapping['id2w'].append(word)
    mapping['w2id'][word] = len(mapping['w2id'])
    vocab.append(emb[word])
vocab = np.array(vocab)

In [31]:
mapping['w2id']['.'],mapping['id2w'][2]

(2, '.')

# prepare dataloader

In [4]:
from torch.utils.data import Dataset,DataLoader
from torchvision import transforms, utils
import torch
import nltk
import numpy as np
import pandas as pd
import sys
import re
import string
import json

In [11]:
class itemDataset(Dataset):
    def __init__(self, file_name,mapping,transform=None):
        self.data = {0:[],1:[]}
        self.mapping = mapping
        
        self.build_data(file_name)
        self.transform = transform
    def build_data(self,file_name):
        data = pd.read_csv(file_name)
        line = data['question_text'].tolist()
        target = data['target'].tolist()
        
        def token(line):
            line = line.lower()
            line = nltk.word_tokenize(line)
            temp = []
            for w in line:
                try:
                    temp.append(self.mapping['w2id'][w])
                except:
                    temp.append(0)
            return temp
            
        for data,label in zip(line,target):
            self.data[label].append( token(data).copy() )
            
    def __len__(self):
        return len(self.data[1])
    def __getitem__(self, idx):
        sample = {}
        
        sample['pos'] = self.data[1][ idx ]
        sample['pos_len'] = len(sample['pos'])
        
        sample['neg'] = self.data[0][ np.random.randint(low=0,high=len(self.data[0])) ]
        sample['neg_len'] = len(sample['neg'])
        
        if self.transform:
            sample = self.transform(sample)
        return sample

In [12]:
class ToTensor(object):
    def __call__(self,sample):
        for name in sample:
            sample[name] = torch.tensor(sample[name],dtype=torch.long)

        return sample

In [13]:
def collate_fn(data):
    """
    parsing the data list into batch tensor
    ['pos','neg','pos_len','neg_len']
    """
    output = dict()

    for name in ['pos_len','neg_len']:
        temp = [ _[name] for _ in data]	 
        output[name] = torch.stack(temp, dim=0) 

        
    #deal with source and target
    for name in ['pos','neg']:
        length = output['{0}_len'.format(name)]
        l = length.max().item()

        for i in range(len(data)):
            if(l-length[i].item()>0):
                data[i][name] =  torch.cat([data[i][name],torch.zeros(l-length[i].item(),dtype=torch.long)],dim=-1)

        temp = [ _[name] for _ in data]
        output[name] = torch.stack(temp, dim=0).long()
    return output

In [14]:
dataset = itemDataset( file_name='./data/train.csv',mapping=mapping,transform=transforms.Compose([ToTensor()]))

In [15]:
dataloader = DataLoader(dataset, batch_size=2,shuffle=True, num_workers=16,collate_fn=collate_fn)

In [16]:
len(dataloader)

40405

In [17]:
import torch
import torch.nn as nn
import torch.nn.functional as f
import torch.optim as optim

In [75]:
class DECODER(nn.Module):
    def __init__(self,args):
        super(DECODER,self).__init__()
        self.hidden_size = args.hidden_dim
        
        self.fnn1 = nn.Linear(2*self.hidden_size,self.hidden_size)
        self.fnn2 = nn.Linear(self.hidden_size,1)
    def forward(self,x):
        out = self.fnn1(x)
        out = f.relu(out)
        out = self.fnn2(out)
    
        return out

In [91]:
class RANK(nn.Module):
    def __init__(self,args):
        super(RANK,self).__init__()
        self.hidden_size = args.hidden_dim
        
        self.fnn1 = nn.Linear(4*self.hidden_size,self.hidden_size)
        self.fnn2 = nn.Linear(self.hidden_size,1)
    def forward(self,x,y):
        out = torch.cat([x,y],dim=-1)
        
        out = self.fnn1(out)
        out = f.relu(out)
        out = self.fnn2(out)
    
        return out

In [59]:
class BIRNN_end(nn.Module):
    def __init__(self,args,vocab):
        super(BIRNN_end,self).__init__()
        self.input_size = args.input_size
        self.hidden_size = args.hidden_dim
        self.num_layer = args.num_layer
        self.batch_first = args.batch_first
        self.dropout = args.dropout


        self.word_embedding = nn.Embedding(args.input_size,args.word_dim)
        self.word_embedding.weight = nn.Parameter(torch.tensor(vocab,dtype=torch.float32))
        self.word_embedding.weight.required_grad = False
        
        self.rnn = nn.LSTM(
            input_size=args.word_dim,
            hidden_size=args.hidden_dim,
            num_layers=args.num_layer,
            batch_first=args.batch_first,
            dropout=args.dropout,
            bidirectional=True
        )

    def forward(self,query,query_len):
        def pack(seq,seq_length):
            sorted_seq_lengths, indices = torch.sort(seq_length, descending=True)
            _, desorted_indices = torch.sort(indices, descending=False)

            if self.batch_first:
                seq = seq[indices]
            else:
                seq = seq[:, indices]
            packed_inputs = nn.utils.rnn.pack_padded_sequence(seq,
                                                            sorted_seq_lengths.cpu().numpy(),
                                                            batch_first=self.batch_first)

            return packed_inputs,desorted_indices

        def unpack(res, state,desorted_indices):
            padded_res,_ = nn.utils.rnn.pad_packed_sequence(res, batch_first=self.batch_first)

            state = [state[i][:,desorted_indices] for i in range(len(state)) ] 

            if(self.batch_first):
                desorted_res = padded_res[desorted_indices]
            else:
                desorted_res = padded_res[:, desorted_indices]

            return desorted_res,state

        def feat_extract(query_output,query_length):
            """
            answer_output: batch*sentence*feat_len
            query_output:  batch*sentence*feat_len


            for simple rnn, we just take the output from 
            """
            if( self.batch_first == False ):
                query_output = query_output.transpose(0,1) 

            query_output = [torch.cat([ query_output[i][ query_length[i]-1 ][:self.hidden_size] , 
                                        query_output[i][0][self.hidden_size:]] , dim=-1 ) for i in range(query_length.shape[0])]
            query_output = torch.stack(query_output,dim=0)

            return query_output

        #first check for the mask ans the embedding

        mask =  query.eq(0)

        query_emb = self.word_embedding(query)

        #query part
        packed_inputs,desorted_indices = pack(query_emb,query_len)
        res, state = self.rnn(packed_inputs)
        query_res,_ = unpack(res, state,desorted_indices)

        #extract the representation of the sentence
        query_result = feat_extract(query_res,query_len.int())
        
        return query_result


In [60]:
class temp:
    def __init__(self):
        pass

In [61]:
args = temp()

args.input_size = len(emb)+1
args.hidden_dim = 300
args.word_dim = 300
args.num_layer = 2
args.batch_first = True
args.dropout = 0


In [78]:
model = BIRNN_end(args,vocab)

In [92]:
decoder = DECODER(args)
rank = RANK(args)

In [97]:
args.gpu = -1
args.learning_rate = 0.001
args.epoch = 10
args.print_freq = 4

In [98]:
def convert(data,device):
    for name in data:
        data[name] = data[name].to(device)

In [None]:
print("check device")
if(torch.cuda.is_available() and args.gpu>=0):
    device = torch.device('cuda')
    print('the device is in cuda')
else:
    device = torch.device('cpu')
    print('the device is in cpu')

model = model.to(device=device)
decoder = decoder.to(device=device)
rank = rank.to(device=device)
print(model)
optimizer = optim.Adam(model.parameters(),lr=args.learning_rate)
criterion = nn.BCEWithLogitsLoss(reduction='sum')

loss_best = 100000000
print("start training")
for now in range(args.epoch):
    print(now)

    Loss = {'class':0,'rank':0,'cate':0}
    Count = {'class':0,'rank':0,'cate':0}
    temp_Loss = {'class':0,'rank':0,'cate':0}
    temp_Count = {'class':0,'rank':0,'cate':0}

    model.train()
    model.zero_grad()
    for i,data in enumerate(dataloader):
        print(i)
        convert(data,device)

        #deal with the classfication part
        query_left = model(data['pos'],data['pos_len'])
        out_left = decoder(query_left)
        
        pred = (out_left.sigmoid()>0.5).int()
        temp_Count['class'] += ( pred.eq(1) ).sum()
        Count['class'] += ( pred.eq(1) ).sum()

        loss = criterion(out_left,torch.ones(out_left.shape)) 
        total_loss = loss
        temp_Loss['rank'] = loss.detach().cpu().item()
        Loss['rank'] += loss.detach().cpu().item()

        query_right = model(data['neg'],data['neg_len'])
        out_right = decoder(query_right)
        
        pred = (out_right.sigmoid()<0.5).int()
        temp_Count['class'] += ( pred.eq(0) ).sum()
        Count['class'] += ( pred.eq(0) ).sum()

        loss = criterion(out_right,torch.zeros(out_right.shape)) 
        total_loss += loss
        temp_Loss['class'] += loss.detach().cpu().item()
        Loss['class'] += loss.detach().cpu().item()

        #deal with the ranking part
        
        if(i%2):
            out = rank(query_left,query_right)
            label = torch.ones(out.shape)
        else:
            out = rank(query_right,query_left)
            label = torch.zeros(out.shape)
            
        pred = ((out.sigmoid()>0.5).int() == label.int() )
        temp_Count['rank'] +=  pred.sum()
        Count['rank'] +=  pred.sum()

        loss = criterion(out,label) 
        total_loss += loss
        temp_Loss['rank'] = loss.detach().cpu().item()
        Loss['rank'] += loss.detach().cpu().item()

        #temp_Loss.backward(retain_graph=True)
        total_loss.backward()

        if(i%4==0):
            optimizer.step()
            model.zero_grad()
            break

        if(i%160==0):
            #print('out',out_right.sigmoid().view(-1))
            #print('label',data['right_type'].view(-1))
            print(i,' training loss(class):{0} (rank):{1} (cate):{2}  acc:{3}/{4} {5}/{6} {7}/{8}'.format(
                temp_Loss['class'],temp_Loss['rank'],temp_Loss['cate'],temp_Count['class'],args.batch_size*320,temp_Count['rank'],args.batch_size*160,args.batch_size*160))

            temp_Loss = {'class':0,'rank':0,'cate':0}
            temp_Count = {'class':0,'rank':0,'cate':0}
    if(now%args.print_freq==0):
        print('*'*10)
        print(' training loss(class):{0} (rank):{1} (cate):{2}  acc:{3}/{4} {5}/{6}'.format(Loss['class']/len(dataloader)/2,Loss['rank']/len(dataloader),Count['class'],len(dataloader)*2,Count['rank'],len(dataloader),Count['cate'],len(dataloader)))


check device
the device is in cpu
BIRNN_end(
  (word_embedding): Embedding(2195896, 300)
  (rnn): LSTM(300, 300, num_layers=2, batch_first=True, bidirectional=True)
)
start training
0
0
**********
 training loss(class):1.5795660142836012e-05 (rank):5.7967664168444826e-05 (cate):2  acc:80810/2 40405/0
1
0
2
0
3
0
4


In [102]:
print(' training loss(class):{0} (rank):{1} (cate):{2}  acc:{3}/{4} {5}/{6}'.format(Loss['class']/len(dataloader)/2,Loss['rank']/len(dataloader),Count['class'],len(dataloader)*2,Count['rank'],len(dataloader),Count['cate'],len(dataloader)))


 training loss(class):0.0 (rank):2.8635351806270533e-05 (cate):2  acc:80810/2 40405/0
