In [7]:
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pack_sequence
import json
import xmltodict
import glob
        
class webNLG_DATASET(Dataset):
    """WebNLG entries read from every file matching ``data_path + '*'``.

    Each file is parsed with ``xmltodict`` and the items found under
    ``benchmark -> entries -> entry`` are flattened into three parallel lists:
    the entry category, its raw ``mtriple`` value(s), and the text of its
    first ``lex`` element.

    Args:
        data_path: Prefix handed to ``glob`` with ``'*'`` appended. Pass a
            directory path that ends with a path separator to read every
            file inside that directory.
    """

    def __init__(self, data_path):
        self.category_list = []
        self.modifiedtripleset_list = []
        self.text_list = []

        # glob returns paths in arbitrary (filesystem) order; sort them so the
        # sample order -- and anything written out line-by-line from it -- is
        # identical on every run and machine.
        for xml_file in sorted(glob.glob(data_path + '*')):
            # Decode explicitly instead of relying on the locale default.
            with open(xml_file, 'r', encoding='utf-8') as f:
                xml_string = f.read()
            entries = xmltodict.parse(xml_string)['benchmark']['entries']['entry']
            # A file holding a single entry does not come back as a list.
            if not isinstance(entries, list):
                entries = [entries]

            # challenge version
            for entry in entries:
                self.category_list.append(entry['@category'])
                self.modifiedtripleset_list.append(entry['modifiedtripleset']['mtriple'])
                lex = entry['lex']
                # When several lexicalisations exist only the first is kept.
                if isinstance(lex, list):
                    lex = lex[0]
                self.text_list.append(lex['#text'])

    def __len__(self):
        """Return the number of entries collected across all files."""
        return len(self.category_list)

    def __getitem__(self, idx):
        """Return ``(category, triple, text)`` for entry ``idx``.

        ``triple`` is a flat list: every stored mtriple string is split on
        ``'|'`` and each piece is whitespace-stripped, in order.
        """
        mtriples = self.modifiedtripleset_list[idx]
        # A single mtriple is stored as a bare string; treat it as a 1-list.
        if not isinstance(mtriples, list):
            mtriples = [mtriples]

        triple = [part.strip() for mtriple in mtriples for part in mtriple.split('|')]

        return self.category_list[idx], triple, self.text_list[idx]

In [8]:
# Directory prefix of the raw files to load. The trailing slash matters:
# the dataset globs `data_path + '*'`.
data_path = '/data/private/WebNLG-models/chimera-master/data/WebNLG/raw/test/'
webNLG_data = webNLG_DATASET(data_path)
# One entry per batch, served in dataset order.
dataloader = DataLoader(
    dataset=webNLG_data,
    batch_size=1,
    shuffle=False,
    num_workers=4,
)

In [11]:
# Write one reference text per line, in dataloader order.
# The `with` block closes the file even if iterating the loader raises.
with open('./prediction/challenge/reference.txt', 'w') as f:
    for _, _, text in dataloader:
        # batch_size is 1, so `text` holds exactly one string
        f.write(text[0] + '\n')

In [8]:
# Import exactly what this cell uses instead of `from model import *`:
# the wildcard hid where `webmodel` came from, and no visible cell imports
# `torch` itself, so `torch.load` only worked if `model` re-exported it.
import torch
from model import webmodel

my_model = webmodel().cuda()
model_path = '/data/private/WebNLG-models/simple_model/pretrained/try_1/1'
# Restore the saved weights, then switch to inference mode.
my_model.load_state_dict(torch.load(model_path + '/model.bin'))
my_model.eval()
print('ok')

Some weights of GPT2LMHeadModel were not initialized from the model checkpoint at /data/private/GPT/openai-gpt2/base/ and are newly initialized: ['lm_head.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


ok


In [4]:
# Special-token ids, their string forms, and the BOS id of the model's tokenizer.
(
    my_model.tokenizer.all_special_ids,
    my_model.tokenizer.all_special_tokens,
    my_model.tokenizer.bos_token_id,
)

([50256, 50259, 50256, 50257, 50258],
 ['<|endoftext|>', '<tr>', '<|endoftext|>', '<S>', '<c>'],
 50257)

In [12]:
# Number of batches the loader yields (one entry per batch when batch_size=1).
len(dataloader)

1862

## Check predictions

In [1]:
# Load predictions and references as lists of lines (trailing newlines kept).
# `with` guarantees both files are closed even if a read fails.
with open('prediction/prediction_1.txt') as f, open('prediction/reference.txt') as f2:
    texts = f.readlines()
    refs = f2.readlines()

In [9]:
# Longest reference, measured in tokenizer tokens.
# Only the references are encoded: the predictions were previously tokenized
# too, but that result was never used. `default=0` keeps the old result for
# an empty `refs`.
max_len = max(
    (len(my_model.tokenizer.encode(ref.strip())) for ref in refs),
    default=0,
)
print(max_len)

92


In [17]:
# Token id(s) the tokenizer produces for a lone full stop.
my_model.tokenizer.encode('.')

[13]

In [19]:
# Inspect the model's END_idx_list.
# NOTE(review): presumably the token ids that terminate generation -- confirm in model.py.
my_model.END_idx_list

[50258, 50256, 50257, 50256, 50259]

In [21]:
# EOS token id of the tokenizer, alongside the decoded text of id 50256.
(
    my_model.tokenizer.eos_token_id,
    my_model.tokenizer.decode(50256),
)

(50256, '<|endoftext|>')

## Prepare files for evaluation

In [12]:
# Read the reference lines (trailing newlines kept); `with` closes the file
# even if the read fails.
with open('prediction/reference.txt') as f:
    texts = f.readlines()

In [13]:
# Re-write the lines in `texts`, appending an extra '\n' to each one. Lines
# obtained from readlines() already end in a newline, so the output then has
# a blank line after every entry.
with open('prediction/enter_reference.txt', 'w') as f2:
    for text in texts:
        f2.write(text + '\n')

In [2]:
# Read the prediction lines (trailing newlines kept); `with` closes the file
# even if the read fails.
with open('prediction/prediction_1.txt') as f:
    texts = f.readlines()

In [3]:
# Write a cleaned copy of the lines in `texts`: underscores become spaces and
# '@' characters are removed. `with` closes the file even if a write fails.
with open('prediction/modify_prediction_1.txt', 'w') as f2:
    for text in texts:
        f2.write(text.replace('_', ' ').replace('@', ''))