In [1]:
import numpy as np
import json
import pandas as pd
import sys
import ast


In [2]:
import torch
import torch.optim as optim
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset,DataLoader
from torchvision import transforms, utils

In [3]:
class itemDataset(Dataset):
    """Dataset of query pairs read from a CSV file.

    The CSV must have 'query' and 'length' columns holding Python literals
    (parsed with ``ast.literal_eval``); in 'train'/'eval' mode it must also
    have a 'label' column with one of 'disagreed', 'agreed' or 'unrelated'.

    Args:
        file_name: path of the CSV file to read.
        mode: 'test' keeps the parsed 'query'/'length' values as they are;
            'train' and 'eval' split them into the two sides of the pair and
            attach the label targets. Any other mode yields an empty dataset.
        transform: optional callable applied to each sample dict on access.

    Raises:
        ValueError: if a row carries a label that is not one of the three
            known label strings.
    """

    # label string -> (label_relation, label_type, label)
    _LABELS = {
        'disagreed': ([0.0,1.0],[0.0,1.0],2),
        'agreed':    ([0.0,1.0],[1.0,0.0],1),
        'unrelated': ([1.0,0.0],[0.0,0.0],0),
    }

    def __init__(self,file_name,mode='train',transform=None):
        self.mode = mode
        self.transform = transform
        self.data = []

        temp = pd.read_csv(file_name)
        if(mode=='test'):
            # NOTE(review): test samples keep the un-split 'query'/'length' keys,
            # unlike train/eval samples ('query1', 'query2', ...) — confirm the
            # downstream transform expects that layout.
            for query,length in zip(temp['query'],temp['length']):
                self.data.append({
                    'query':ast.literal_eval(query),
                    'length':ast.literal_eval(length)
                })

        elif(mode=='train' or mode=='eval'):
            for query,length,label in zip(temp['query'],temp['length'],temp['label']):
                query = ast.literal_eval(query)
                length = ast.literal_eval(length)

                # Fail loudly instead of silently reusing the previous row's targets.
                if(label not in self._LABELS):
                    raise ValueError('unknown label {0!r} in {1}'.format(label,file_name))
                t1,t2,l = self._LABELS[label]

                self.data.append({
                    'query1':query[0],
                    'length1':length[0],
                    'query2':query[1],
                    'length2':length[1],
                    # copies, so samples never share the class-level target lists
                    'label_relation':list(t1),
                    'label_type':list(t2),
                    'label':l
                })

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        # Hand out a shallow copy so a transform (or a collate function) that
        # reassigns keys cannot alter the stored sample.
        sample = dict(self.data[idx])
        if(self.transform is not None):
            sample = self.transform(sample)
        return sample


In [4]:
class ToTensor(object):
    """Transform that converts the fields of a sample dict to torch tensors.

    'query1', 'length1', 'query2' and 'length2' are required and become long
    tensors. 'label' (long) and 'label_relation' / 'label_type' (float) are
    converted only when present, so unlabeled samples can pass through too.
    """
    def __call__(self,sample):
        # Work on a shallow copy: the caller's dict is left untouched.
        converted = dict(sample)

        for name in ['query1','length1','query2','length2']:
            converted[name] = torch.tensor(sample[name],dtype=torch.long)

        if('label' in sample):
            converted['label'] = torch.tensor(sample['label'],dtype=torch.long)

        for name in ['label_relation','label_type']:
            if(name in sample):
                converted[name] = torch.tensor(sample[name],dtype=torch.float)

        return converted

In [5]:
def collate_fn(data):
    """Collate a list of sample dicts into one batch dict.

    Each sample must provide tensors under 'query1', 'length1', 'query2',
    'length2', 'label_relation', 'label_type' and 'label'. Queries are
    right-padded with zeros up to the longest length on their side of the
    batch. The input samples are not modified.

    Returns:
        dict with 'length' and 'query' (two-element lists, one entry per side
        of the pair) plus the stacked 'label_relation', 'label_type' and
        'label' tensors.
    """
    output = dict()

    for name in ['length1','length2','label_relation','label_type','label']:
        output[name] = torch.stack([sample[name] for sample in data], dim=0)

    # pad and stack the two query sides
    for side in (1,2):
        lengths = output['length{0}'.format(side)]
        key = 'query{0}'.format(side)
        longest = lengths.max().item()

        padded = []
        for sample,n in zip(data,lengths.tolist()):
            query = sample[key]
            gap = longest - n
            if(gap>0):
                # build a new tensor rather than overwriting sample[key]
                query = torch.cat([query,torch.zeros(gap,dtype=torch.long)],dim=-1)
            padded.append(query)

        output[key] = torch.stack(padded, dim=0).long()

    return {
        'length':[output['length1'],output['length2']],
        'query':[output['query1'],output['query2']],
        'label_relation':output['label_relation'],
        'label_type':output['label_type'],
        'label':output['label']
    }

In [16]:
class Args:
    """Plain attribute container for the notebook's settings."""
    def __init__(self):
        pass
args = Args()

# optimisation
args.batch_size=32
args.dropout=0
args.epoch=200
args.learning_rate=0.0001
args.step = 1

# model
args.model = 'siamese'
args.embeds_dim=128
args.hidden_dim=128
args.num_layer=2
args.input_size=49527
args.batch_first=True

# runtime
args.gpu=0
args.mode = 'train'
args.print_freq=1
# NOTE(review): args.save is interpolated into the checkpoint directory name
# by eval(), so True yields './saved_models/True/' — confirm that is intended.
args.save=True
args.data='./data/all_no_embedding/'

# The vocabulary size is the number of lines in the vocab file.
with open('{0}/vocab'.format(args.data)) as f:
    args.word_num = sum(1 for _ in f)

In [17]:
class Base(nn.Module):
    """Base module owning the word embedding table shared by the models.

    Args:
        args: settings object providing ``word_num``, ``embeds_dim`` and
            ``mode``; ``embed_type`` is additionally read when ``mode`` is
            'pretrain'.
    """
    def __init__(self,args):
        super(Base, self).__init__()
        self.args = args

        self.word_emb =nn.Embedding(args.word_num,args.embeds_dim)

        if(args.mode == 'pretrain'):
            # load pretrained vectors and freeze the embedding table
            self.load()
            self.word_emb.weight.requires_grad = False
            print("here",self.word_emb.weight.requires_grad)

    def load(self, path='./data/embedding/glove.6B.100d.txt'):
        """Fill the embedding table from a whitespace-separated vector file.

        Line ``i`` of the file (its first token is skipped) is written to
        embedding row ``i+1``; row 0 stays all zeros. Lines beyond the table's
        rows and values beyond its columns are ignored instead of raising an
        IndexError.

        Args:
            path: vector file to read; defaults to the previously hard-coded
                location.
        """
        # NOTE(review): the 'glove' branch loads nothing while the 'fasttext'
        # branch reads a file named glove.6B.100d.txt — confirm the branches
        # are not swapped. Rows are assigned by line order, which presumably
        # assumes the file is aligned with the vocabulary; verify.
        if(self.args.embed_type == 'glove'):
            pass
        elif(self.args.embed_type == 'fasttext'):
            rows,cols = self.word_emb.weight.shape
            arr = np.zeros((rows,cols),dtype=np.float32)

            with open(path, encoding='utf-8') as f:
                for i,line in enumerate(f):
                    if(i+1 >= rows):
                        break
                    values = [float(num) for num in line.strip().split()[1:cols+1]]
                    if(values):
                        arr[i+1,:len(values)] = values

            self.word_emb.weight = nn.Parameter(torch.tensor(arr))

In [18]:
class siamese(Base):
    """Two-headed classifier that encodes both queries with one shared bi-LSTM.

    Each query is embedded, run through the LSTM, reduced to a single feature
    vector, and the two vectors are concatenated and fed to a hidden linear
    layer followed by two 2-way output heads.
    """
    def __init__(self, args):
        super(siamese, self).__init__(args)

        self.embeds_dim = args.embeds_dim
        self.hidden_dim = args.hidden_dim
        self.num_layer = args.num_layer
        self.batch_first = args.batch_first

        # NOTE(review): ln_embeds is created but never applied in forward().
        self.ln_embeds = nn.LayerNorm(args.embeds_dim)
        self.rnn = nn.LSTM(
            self.embeds_dim,
            self.hidden_dim,
            batch_first=self.batch_first,
            bidirectional=True,
            num_layers=self.num_layer,
        )

        self.linear1 = nn.Linear(4*self.hidden_dim,self.hidden_dim)
        self.linear2_1 = nn.Linear(self.hidden_dim,2)
        self.linear2_2 = nn.Linear(self.hidden_dim,2)

    def _pack(self, seq, seq_length):
        """Sort the batch by decreasing length and pack it for the LSTM.

        Returns the packed sequence and the indices that undo the sort.
        """
        lengths_desc, order = torch.sort(seq_length, descending=True)
        _, restore = torch.sort(order, descending=False)

        seq = seq[order] if self.batch_first else seq[:, order]
        packed = nn.utils.rnn.pack_padded_sequence(
            seq,
            lengths_desc.cpu().numpy(),
            batch_first=self.batch_first,
        )
        return packed, restore

    def _unpack(self, res, state, restore):
        """Pad the packed LSTM output and put the batch back in its original order."""
        padded, _ = nn.utils.rnn.pad_packed_sequence(res, batch_first=self.batch_first)

        state = [part[:, restore] for part in state]

        if self.batch_first:
            return padded[restore], state
        return padded[:, restore], state

    def _feat_extract(self, output, length, mask):
        """Reduce each sequence's LSTM output to one feature vector.

        Takes the first ``hidden_dim`` features at the last valid step and the
        remaining features at step 0, and joins them. ``mask`` is accepted but
        not used.
        """
        if not self.batch_first:
            output = output.transpose(0,1)

        feats = []
        for i in range(length.shape[0]):
            head = output[i][ length[i]-1 ][:self.hidden_dim]
            tail = output[i][0][self.hidden_dim:]
            feats.append(torch.cat([head, tail], dim=-1))

        return torch.stack(feats,dim=0)

    def forward(self, querys,lengths):
        """Score a batch of query pairs.

        Args:
            querys: pair of index tensors, one per side of the pair.
            lengths: pair of length tensors matching ``querys``.

        Returns:
            list ``[out_1, out_2]`` holding the outputs of the two heads.
        """
        encoded = []
        for tokens, length in zip(querys[:2], lengths):
            emb = self.word_emb(tokens)
            mask = tokens.eq(0)

            packed, restore = self._pack(emb, length)
            res, state = self.rnn(packed)
            restored, _ = self._unpack(res, state, restore)
            encoded.append(self._feat_extract(restored, length.int(), mask))

        joined = torch.cat([encoded[0], encoded[1]], dim=1)

        hidden = F.relu(self.linear1(joined))

        out_1 = self.linear2_1(hidden)
        out_2 = self.linear2_2(hidden)
        return [out_1,out_2]

In [19]:
def get_data(train_file,eval_file,batch_size,num_workers=16):
    """Build the training and evaluation data loaders.

    Args:
        train_file: CSV file used for the 'train' dataset.
        eval_file: CSV file used for the 'eval' dataset.
        batch_size: batch size of both loaders.
        num_workers: number of loader worker processes (default 16, the value
            that used to be hard-coded); pass 0 to load in the main process.

    Returns:
        dict with the loaders under 'train' and 'eval'.
    """
    train_dataset = itemDataset( file_name=train_file,mode='train',transform=transforms.Compose([ToTensor()]))
    train_dataloader = DataLoader(train_dataset, batch_size=batch_size,shuffle=True, num_workers=num_workers,collate_fn=collate_fn)

    eval_dataset = itemDataset( file_name=eval_file,mode='eval',transform=transforms.Compose([ToTensor()]))
    # Evaluation gains nothing from shuffling; a fixed order keeps runs comparable.
    eval_dataloader = DataLoader(eval_dataset, batch_size=batch_size,shuffle=False, num_workers=num_workers,collate_fn=collate_fn)

    return {
        'train':train_dataloader,
        'eval':eval_dataloader
    }

In [20]:
import os
# Scratch check: show the training CSV path composed from its directory and file name.
os.path.join('./data/all_no_embedding/','train.csv')

'./data/all_no_embedding/train.csv'

In [21]:
# Pick the device: CUDA when it is available and args.gpu is non-negative.
# NOTE(review): args.gpu only switches CUDA on or off here; the index itself
# is not used to select a specific GPU.
print("check device")
if(torch.cuda.is_available() and args.gpu>=0):
    device = torch.device('cuda')
    print('the device is in cuda')
else:
    device = torch.device('cpu')
    print('the device is in cpu')

# Build the loaders from the configured data directory instead of repeating it.
print("loading data")
dataloader = get_data(os.path.join(args.data,'train.csv'),os.path.join(args.data,'eval.csv'),args.batch_size)

print("setting model")
if(args.model=='siamese'):
    model = siamese(args)
else:
    # Without this, an unknown name only surfaces as a NameError on `model` below.
    raise ValueError('unknown model: {0!r}'.format(args.model))
model = model.to(device=device)

print(model)
optimizer = optim.Adam(model.parameters(),lr=args.learning_rate)
# NOTE(review): KLDivLoss is used with its default reduction; the PyTorch docs
# recommend 'batchmean' for a true KL divergence — confirm which is wanted.
criterion = nn.KLDivLoss()

loss_best = 100000000
print("start training")


check device
the device is in cuda
loading data
setting model
siamese(
  (word_emb): Embedding(6261, 128)
  (ln_embeds): LayerNorm(torch.Size([128]), eps=1e-05, elementwise_affine=True)
  (rnn): LSTM(128, 128, num_layers=2, batch_first=True, bidirectional=True)
  (linear1): Linear(in_features=512, out_features=128, bias=True)
  (linear2_1): Linear(in_features=128, out_features=2, bias=True)
  (linear2_2): Linear(in_features=128, out_features=2, bias=True)
)
start training


In [22]:
def train(model,data,criterion,step):
    """Run one training pass over ``data``.

    Relies on the notebook-level ``optimizer``, ``device`` and ``convert``.

    Args:
        model: module called as ``model(batch['query'], batch['length'])`` and
            returning the two head outputs.
        data: iterable of batches as produced by ``collate_fn``.
        criterion: loss applied to the log-softmax of each head.
        step: number of batches whose gradients are accumulated before each
            optimizer update; values below 1 are treated as 1.

    Running losses and accuracy are printed every 160 batches and then reset.
    """
    step = max(int(step),1)
    total={'loss_relation':0,'loss_type':0,'count':0,'num':0}
    pending = False  # True while accumulated gradients have not been applied yet

    for i,batch in enumerate(data):
        batch = convert(batch,device)

        #deal with the classification part
        out = model(batch['query'],batch['length'])

        loss_relation = criterion(F.log_softmax(out[0],dim=1),batch['label_relation'])
        loss_type = criterion(F.log_softmax(out[1],dim=1),batch['label_type'])
        # One backward pass over the summed losses yields the same gradients as
        # two separate passes, without keeping the graph alive afterwards.
        (loss_relation + loss_type).backward()
        pending = True

        total['loss_relation'] += loss_relation.item()
        total['loss_type'] += loss_type.item()
        total['num'] += out[0].shape[0]

        # relation index * (1 + type index) maps the two heads onto labels 0/1/2
        pred = (out[0].topk(1)[1]*(1+out[1].topk(1)[1])).view(-1)
        total['count'] += (pred == batch['label']).sum().item()

        if((i+1)%step==0):
            optimizer.step()
            model.zero_grad()
            pending = False

        if(i%160==0):
            print(i,' train loss(relation):{0} loss(type):{1} acc:{2}/{3}'.format(total['loss_relation'],total['loss_type'],total['count'],total['num']))
            total={'loss_relation':0,'loss_type':0,'count':0,'num':0}

    # Apply gradients left over when the batch count is not a multiple of step.
    if(pending):
        optimizer.step()
        model.zero_grad()


In [23]:
def eval(model,data,criterion,loss_best,now=None):
    """Evaluate ``model`` on ``data`` and checkpoint it.

    Relies on the notebook-level ``args``, ``device`` and ``convert``. Note
    that this function shadows the built-in ``eval`` within the notebook.

    Args:
        model: module called as ``model(batch['query'], batch['length'])``.
        data: iterable of batches as produced by ``collate_fn``.
        criterion: loss applied to the log-softmax of each head.
        loss_best: best (lowest) summed loss seen so far.
        now: tag used in the step checkpoint's file name; when omitted, a
            notebook-level ``now`` is used if one exists, otherwise 0.

    Returns:
        the updated best loss: the summed relation + type loss of this run if
        it is lower than ``loss_best``, else ``loss_best`` unchanged.
    """
    import os

    if(now is None):
        now = globals().get('now',0)

    total={'loss_relation':0,'loss_type':0,'count':0,'num':0}
    i = -1  # stays -1 when `data` is empty, so the summary print still works
    with torch.no_grad():
        for i,batch in enumerate(data):
            batch = convert(batch,device)
            out = model(batch['query'],batch['length'])

            loss = criterion(F.log_softmax(out[0],dim=1),batch['label_relation'])
            total['loss_relation'] += loss.item()

            loss = criterion(F.log_softmax(out[1],dim=1),batch['label_type'])
            total['loss_type'] += loss.item()
            total['num'] += out[0].shape[0]

            # Flatten before comparing: (B,1) == (B) would broadcast to (B,B).
            pred = (out[0].topk(1)[1]*(1+out[1].topk(1)[1])).view(-1)
            total['count'] += (pred == batch['label']).sum().item()

    print(i,' test loss(relation):{0} loss(type):{1} acc:{2}/{3}'.format(total['loss_relation'],total['loss_type'],total['count'],total['num']))

    check = {
            'args':args,
            'model':model.state_dict()
            }
    step_path = './saved_models/{0}/step_{1}.pkl'.format(args.save,now)
    os.makedirs(os.path.dirname(step_path), exist_ok=True)
    torch.save(check, step_path)

    # The model is ranked by the sum of both head losses.
    loss_now = total['loss_relation'] + total['loss_type']
    if(loss_now<loss_best):
        torch.save(check, './saved_models/{0}/best.pkl'.format(args.save))
        loss_best = loss_now

    return loss_best

In [None]:
def convert(data,device):
    """Move every tensor of a batch dict onto ``device``.

    Values that are lists are converted element by element; every other value
    is converted directly. The dict is updated in place and also returned.
    """
    for key, value in data.items():
        if type(value) is list:
            for pos, tensor in enumerate(value):
                value[pos] = tensor.to(device)
            continue
        data[key] = value.to(device)
    return data

In [None]:
# Start from clean gradients before the first batch.
model.zero_grad()

# One pass over the training loader with the model in training mode.
model.train()
train(model,dataloader['train'],criterion,args.step)
# Switch the model to evaluation mode, then call the notebook's own eval()
# function (it shadows the built-in), which returns the updated best loss.
model.eval()
loss_best = eval(model,dataloader['eval'],criterion,loss_best)

0  train loss(relation):0.35605740547180176 loss(type):0.12137935310602188 acc:2/32
160  train loss(relation):51.64070129394531 loss(type):10.66838264465332 acc:3192/5120
320  train loss(relation):46.73356628417969 loss(type):6.126214981079102 acc:3526/5120
480  train loss(relation):44.73812484741211 loss(type):2.2638931274414062 acc:3549/5120
640  train loss(relation):44.338260650634766 loss(type):0.8784410357475281 acc:3570/5120
800  train loss(relation):44.18479919433594 loss(type):0.6398261189460754 acc:3556/5120
960  train loss(relation):43.49910354614258 loss(type):0.515021800994873 acc:3593/5120
1120  train loss(relation):43.50716781616211 loss(type):0.6514284610748291 acc:3593/5120
1280  train loss(relation):42.54240036010742 loss(type):0.41000688076019287 acc:3609/5120
