## Dataloader

### 입력은 10개의 고유한 DBpedia 범주, 즉 Astronaut, University, City, Monument, Building, ComicsCharacter, Food, Airport, SportsTeam 및 WrittenWork에 속하는 엔티티를 설명합니다.

### train: 6940, dev: 872, test: 1862

In [93]:
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pack_sequence
import json
import xmltodict
import glob
        
class webNLG_DATASET(Dataset):
    """WebNLG challenge-format dataset read from a two-level directory of XML files.

    ``data_path`` is used as a glob prefix (``data_path + '*'``); every match is
    treated as a folder and every entry of that folder is parsed as an XML
    benchmark file.

    Indexing returns ``(category, triple_fields, text)``:

    * ``category`` -- the ``@category`` attribute of the entry,
    * ``triple_fields`` -- one flat list holding the ``|``-separated,
      whitespace-stripped fields of all triples of the entry,
    * ``text`` -- the ``#text`` of the first ``lex`` element of the entry.
    """

    def __init__(self, data_path):
        """Parse every XML file found below ``data_path`` into parallel lists."""
        self.category_list = []
        self.modifiedtripleset_list = []
        self.text_list = []

        # Folders and files are visited in sorted order so that item indices
        # are reproducible between runs.
        for xml_folder in sorted(glob.glob(data_path + '*')):
            for xml_file in sorted(glob.glob(xml_folder + '/*')):
                # Decode explicitly instead of relying on the locale default,
                # which is platform dependent (the XML is assumed to be UTF-8).
                with open(xml_file, 'r', encoding='utf-8') as f:
                    xml_string = f.read()
                entries = xmltodict.parse(xml_string)['benchmark']['entries']['entry']
                # A file with one entry does not yield a list; wrap it so the
                # loop below can treat both cases alike.
                if not isinstance(entries, list):
                    entries = [entries]

                # Challenge-version layout. The v2.0 layout differs: the text is
                # stored under lex['text'], and files with a 'test' path
                # component keep their triples under 'otriple' instead of
                # 'mtriple'.
                for entry in entries:
                    self.category_list.append(entry['@category'])
                    self.modifiedtripleset_list.append(entry['modifiedtripleset']['mtriple'])

                    lex = entry['lex']
                    # Several lexicalisations: keep only the first one.
                    if isinstance(lex, list):
                        lex = lex[0]
                    self.text_list.append(lex['#text'])

    def __len__(self):
        """Return the number of loaded entries."""
        return len(self.category_list)

    def __getitem__(self, idx):
        """Return ``(category, triple_fields, text)`` for entry ``idx``."""
        raw = self.modifiedtripleset_list[idx]
        # An entry holds either one triple string or a list of triple strings.
        triple_strings = raw if isinstance(raw, list) else [raw]

        triple_fields = []
        for triple_string in triple_strings:
            triple_fields.extend(field.strip() for field in triple_string.split('|'))

        return self.category_list[idx], triple_fields, self.text_list[idx]

In [89]:
## test
class webNLG_DATASET(Dataset):
    """WebNLG challenge-format dataset read from a flat directory of XML files.

    ``data_path`` is used as a glob prefix (``data_path + '*'``) and every match
    is parsed directly as an XML benchmark file.

    Indexing returns ``(category, triple_fields, text)`` where ``triple_fields``
    is one flat list of the ``|``-separated, whitespace-stripped fields of all
    triples of the entry and ``text`` is the ``#text`` of its first ``lex``.
    """

    def __init__(self, data_path):
        """Parse every XML file matching ``data_path + '*'`` into parallel lists."""
        self.category_list = []
        self.modifiedtripleset_list = []
        self.text_list = []

        # glob returns matches in arbitrary order; sort them so that item
        # indices are reproducible between runs and machines.
        for xml_file in sorted(glob.glob(data_path + '*')):
            # Decode explicitly instead of relying on the locale default,
            # which is platform dependent (the XML is assumed to be UTF-8).
            with open(xml_file, 'r', encoding='utf-8') as f:
                xml_string = f.read()
            entries = xmltodict.parse(xml_string)['benchmark']['entries']['entry']
            # A file with one entry does not yield a list; wrap it.
            if not isinstance(entries, list):
                entries = [entries]

            # challenge version
            for entry in entries:
                self.category_list.append(entry['@category'])
                self.modifiedtripleset_list.append(entry['modifiedtripleset']['mtriple'])

                lex = entry['lex']
                # Several lexicalisations: keep only the first one.
                if isinstance(lex, list):
                    lex = lex[0]
                self.text_list.append(lex['#text'])

    def __len__(self):
        """Return the number of loaded entries."""
        return len(self.category_list)

    def __getitem__(self, idx):
        """Return ``(category, triple_fields, text)`` for entry ``idx``."""
        stored = self.modifiedtripleset_list[idx]
        # Normalise the single-triple case (one string) to a one-element list.
        if not isinstance(stored, list):
            stored = [stored]

        triple_fields = [
            field.strip()
            for triple_string in stored
            for field in triple_string.split('|')
        ]

        return self.category_list[idx], triple_fields, self.text_list[idx]

In [160]:
# training version
class webNLG_DATASET(Dataset):
    """WebNLG challenge-format dataset that returns the triples unsplit.

    ``data_path`` is used as a glob prefix (``data_path + '*'``); every match is
    treated as a folder and every entry of that folder is parsed as an XML
    benchmark file.

    Indexing returns ``(category, mtriple, text)`` where ``mtriple`` is the
    stored ``modifiedtripleset/mtriple`` value exactly as parsed (one string or
    a list of strings) and ``text`` is the ``#text`` of the first ``lex``.
    """

    def __init__(self, data_path):
        """Parse every XML file found below ``data_path`` into parallel lists."""
        self.category_list = []
        self.modifiedtripleset_list = []
        self.text_list = []

        xml_folders = sorted(glob.glob(data_path + '*'))

        for xml_folder in xml_folders:
            xml_files = sorted(glob.glob(xml_folder + '/*'))

            for xml_file in xml_files:
                # Decode explicitly instead of relying on the locale default,
                # which is platform dependent (the XML is assumed to be UTF-8).
                with open(xml_file, 'r', encoding='utf-8') as f:
                    xml_string = f.read()
                entries = xmltodict.parse(xml_string)['benchmark']['entries']['entry']
                # A file with one entry does not yield a list; wrap it.
                if not isinstance(entries, list):
                    entries = [entries]

                # challenge version
                for entry in entries:
                    lex = entry['lex']
                    # Several lexicalisations: keep only the first one.
                    if isinstance(lex, list):
                        lex = lex[0]

                    self.category_list.append(entry['@category'])
                    self.modifiedtripleset_list.append(entry['modifiedtripleset']['mtriple'])
                    self.text_list.append(lex['#text'])

    def __len__(self):
        """Return the number of loaded entries."""
        return len(self.category_list)

    def __getitem__(self, idx):
        """Return ``(category, mtriple, text)`` for entry ``idx`` without post-processing."""
        return self.category_list[idx], self.modifiedtripleset_list[idx], self.text_list[idx]

In [161]:
# Build the dataset from the training directory and wrap it in a DataLoader
# that yields one sample per batch, in dataset order.
# data_path = '/data/private/dataset/webnlg/data/v2.0/en/train/'
data_path = '/data/private/WebNLG-models/chimera-master/data/WebNLG/raw/train/'
webNLG_data = webNLG_DATASET(data_path)
dataloader = DataLoader(webNLG_data, batch_size=1, shuffle=False, num_workers=4)

# Alternative: the test directory.
# data_path = '/data/private/WebNLG-models/chimera-master/data/WebNLG/raw/test/'
# webNLG_data = webNLG_DATASET(data_path)
# dataloader = DataLoader(webNLG_data, batch_size=1, shuffle=False, num_workers=4)    

In [162]:
len(webNLG_data)

6940

In [155]:
webNLG_data[1788]

('Airport',
 ['Aarhus | leaderName | Jacob_Bundsgaard',
  'Aarhus_Airport | cityServed | Aarhus'],
 "Aarhus airport serves the city of Aarhus who's leader is Jacob Bundsgaard.")

In [176]:
# Walk over every dataset item and print the '|'-separated triple fields that
# contain a marker substring.
webNLG_data[1788]  # value is discarded: this is not the last expression of the cell
for x in range(len(webNLG_data)):
    data = webNLG_data[x]
    
    # data[1] is the triple field; both a list of triple strings and a single
    # triple string are handled.
    if isinstance(data[1], list):
        for triple_set in data[1]:
            triples = triple_set.split('|')
            for triple in triples:
                if '__' in triple:
                    print(triple)
    else:
        # NOTE(review): this branch searches for 'comma' while the list branch
        # searches for '__' -- confirm whether the difference is intended.
        triples = data[1].split('|')
        for triple in triples:
            if 'comma' in triple:
                print(triple)        
        
        
# c = 0
# for i_batch, data in enumerate(dataloader):    
# #     print(data)
# #     break
#     if c == 1788:
#         print(data)
#         break
#     c += 1

 commander 
 commander 


In [174]:
# Encode a sample string with the model's tokenizer and decode the first four
# ids again to inspect the round trip.
s = 'Aarhus Airport'
x = my_model.tokenizer.encode(s)
my_model.tokenizer.decode(x[:4])

'Aarhus Airport'

In [144]:
# Build the model input string from one batch:
#   '<c> ' + category, then for every triple its fields prefixed by
#   '<tr1>', '<tr2>', '<tr3>' (underscores replaced by spaces), then '<S>'
#   and, when present, the reference text.
# `cate`, `tripleset` and `text` must already be defined by an earlier cell.
tr_list = ['<tr1>', '<tr2>', '<tr3>']

input_str = '<c> '
input_str += cate[0]
input_str += ' '


triple_total = []
if isinstance(tripleset, list):
    for tripleset_temp in tripleset:
        # Re-assigned (not extended) for every triple so that the field index
        # k restarts at 0 and stays inside tr_list.
        triple_total = tripleset_temp[0].split('|')
        triple_list = [x.strip() for x in triple_total]

        for k, triple in enumerate(triple_list):
            input_str += tr_list[k] + ' '
            input_str += triple.replace('_', ' ')  # the raw fields contain too many underscores
            input_str += ' '
else:
    triple_total += tripleset[0].split('|')
    triple_list = [x.strip() for x in triple_total]

    for k, triple in enumerate(triple_list):
        input_str += tr_list[k] + ' '
        input_str += triple.replace('_', ' ')  # the raw fields contain too many underscores
        input_str += ' '

input_str += '<S>'
# Compare by value: `is not ''` tests object identity, which is
# implementation dependent and raises a SyntaxWarning on Python 3.8+.
# NOTE(review): `text` is indexed with [0] below, so it is presumably a
# one-element batch and this condition is then always true -- confirm whether
# `text[0] != ''` was intended.
if text != '':
    input_str += ' '
    input_str += text[0]

input_str = input_str.strip()


In [150]:
tripleset, triple_total
input_str

"<c> Airport <tr1> Aarhus <tr2> leaderName <tr3> Jacob Bundsgaard <tr1> Aarhus Airport <tr2> cityServed <tr3> Aarhus <S> Aarhus airport serves the city of Aarhus who's leader is Jacob Bundsgaard."

In [1]:
# Instantiate the project model (defined in the local `model` module), move it
# to the GPU and display its structure.
from model import *
my_model = webmodel().cuda()
my_model

Some weights of GPT2LMHeadModel were not initialized from the model checkpoint at /data/private/GPT/openai-gpt2/base/ and are newly initialized: ['lm_head.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


webmodel(
  (model): GPT2LMHeadModel(
    (transformer): GPT2Model(
      (wte): Embedding(50262, 768)
      (wpe): Embedding(1024, 768)
      (drop): Dropout(p=0.1, inplace=False)
      (h): ModuleList(
        (0): Block(
          (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (attn): Attention(
            (c_attn): Conv1D()
            (c_proj): Conv1D()
            (attn_dropout): Dropout(p=0.1, inplace=False)
            (resid_dropout): Dropout(p=0.1, inplace=False)
          )
          (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (mlp): MLP(
            (c_fc): Conv1D()
            (c_proj): Conv1D()
            (dropout): Dropout(p=0.1, inplace=False)
          )
        )
        (1): Block(
          (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (attn): Attention(
            (c_attn): Conv1D()
            (c_proj): Conv1D()
            (attn_dropout): Dropout(p=0.1, inplace=False)
            (r

### transformer check

In [37]:
# Feed the same token ids twice with different token_type_ids and print the
# output shapes, to see whether the segment ids influence the result.
x = torch.tensor([[1,2,3,4], [1,2,3,4], [1,2,3,4]]).cuda()
print(x.shape)

# Variant 1: last two positions use type id 1.
token_type_ids1 = torch.tensor([[0,0,1,1], [0,0,1,1], [0,0,1,1]]).cuda()
o1 = my_model.model(x, token_type_ids=token_type_ids1)[0]  # first element of the model output
print(o1.shape)

# Variant 2: last two positions use type id 2.
token_type_ids2 = torch.tensor([[0,0,2,2], [0,0,2,2], [0,0,2,2]]).cuda()
o2 = my_model.model(x, token_type_ids=token_type_ids2)[0]
print(o2.shape)

torch.Size([3, 4])
torch.Size([3, 4, 50262])
torch.Size([3, 4, 50262])


In [36]:
o1[0], o2[0]

(tensor([[-33.3009, -32.5016, -34.6172,  ...,   2.2522,   3.1099,   0.6586],
         [-40.5900, -42.5236, -41.2990,  ...,   4.3247,   4.8959,   0.9258],
         [-60.6873, -60.5445, -58.5169,  ...,   5.2210,   6.1823,   1.2264],
         [-67.4091, -67.0701, -64.2516,  ...,   5.8832,   6.6753,   1.5033]],
        device='cuda:0', grad_fn=<SelectBackward>),
 tensor([[-33.3009, -32.5016, -34.6172,  ...,   2.2522,   3.1099,   0.6586],
         [-40.5900, -42.5236, -41.2990,  ...,   4.3247,   4.8959,   0.9258],
         [-47.6841, -50.1135, -46.8205,  ...,   4.6252,   5.5870,   0.9204],
         [-64.3876, -67.1301, -62.2161,  ...,   5.2525,   6.8331,   1.2655]],
        device='cuda:0', grad_fn=<SelectBackward>))

In [48]:
# Look up the input embedding of token id 0 and decode that id with the
# tokenizer.
wte = my_model.model.get_input_embeddings()
a = torch.tensor([0]).cuda()
print(wte(a))

my_model.tokenizer.decode([0])

RuntimeError: CUDA error: device-side assert triggered

In [48]:
# Smoke test: push the first three batches through the model and compute the
# language-model loss for each of them.
c = 0  # number of batches processed so far
for i_batch, sample_batched in enumerate(dataloader):    
    c+= 1

    cate, triple, text = sample_batched
    input_tensor = my_model.make_tensor(cate, triple, text)

    out_logit = my_model.logit_feeding(input_tensor)
    # Token ids of the reference text of the (single) sample in the batch.
    target_idx = my_model.tokenizer.encode(text[0])
    target_len = len(target_idx)

    # Labels are the target ids followed by the tokenizer's EOS id.
    label_idxs = torch.tensor(target_idx + [my_model.tokenizer.eos_token_id]) # (len)

    loss = my_model.LM_loss(out_logit, target_len, label_idxs)      
    
    print(cate[0], triple, text[0])
    # Stop after three batches.
    if c == 3:
        break

NameError: name 'my_model' is not defined

In [13]:
input_tensor, label_idxs, input_tensor[:,-target_len-1:]

(tensor([[50258, 16170,   634, 50259,    32,   283,  7537,    62, 16170,   634,
           1748,    50,  8520,   366,    32,   283,  7537,    11, 16490,     1,
          50257,   464,   317,   283,  7537,   318,   262,  9003,   286,   317,
            283,  7537,    11, 16490,    13]], device='cuda:0'),
 tensor([  464,   317,   283,  7537,   318,   262,  9003,   286,   317,   283,
          7537,    11, 16490,    13, 50256]),
 tensor([[50257,   464,   317,   283,  7537,   318,   262,  9003,   286,   317,
            283,  7537,    11, 16490,    13]], device='cuda:0'))

In [7]:
out_logit.shape, target_len, label_idxs.shape

(torch.Size([1, 35, 50260]), 14, torch.Size([15]))

In [8]:
# Keep the last target_len + 1 positions of the logits and drop the leading
# dimension with squeeze(0).
pred_logit = out_logit[:,-target_len-1:,:].squeeze(0) # (len, vocab_num)
pred_logit.shape

torch.Size([15, 50260])

In [14]:
triple[0][0].replace('_', ' ')

'Aarhus Airport'

## Data sample

In [1]:
# Load one WebNLG XML file as text and convert it to a JSON string for
# inspection. Alternative input files are kept below as comments.
# path = './data/v1.5/en/train/1triples/Airport.xml'
# path = '/data/private/dataset/webnlg/data/v2.0/en/test/1triples/City.xml'
# path = '/data/private/WebNLG-models/chimera-master/data/WebNLG/raw/train/1triples/1triple_allSolutions_Airport_train_challenge.xml'
# path = '/data/private/WebNLG-models/chimera-master/data/WebNLG/raw/test/testdata_with_lex.xml'
path = '/data/private/dataset/webnlg/webnlg-dataset/webnlg_challenge_2017/train/1triples/1triple_allSolutions_Airport_train_challenge.xml'

import json
import xmltodict

# Decode explicitly instead of relying on the locale default encoding, which
# is platform dependent (the XML is assumed to be UTF-8).
with open(path, 'r', encoding='utf-8') as f:
    xmlString = f.read()

print("xml input (xml_to_json.xml):")
# print(xmlString)

jsonString = json.dumps(xmltodict.parse(xmlString), indent=4)

# print("\nJSON output(output.json):")
# print(jsonString)

# with open("xml_to_json.json", 'w') as f:
#     f.write(jsonString)

xml input (xml_to_json.xml):


In [2]:
# Parse the XML text into nested mappings and count the entries of the file.
x=xmltodict.parse(xmlString)
len(x['benchmark']['entries']['entry'])
# test: 1862

301

In [15]:
# challenge version
# Print category, modified triple(s) and first reference text of entries 10-14.
for i in range(10, 15):
    y=x['benchmark']['entries']['entry'][i]
    print("category: ", y['@category'])
#     print('entitymap: ', y['entitymap'])
    print("modifiedtripleset: ", y['modifiedtripleset']['mtriple'])
    z = y['lex']
    # Several lexicalisations: look at the first one only.
    if isinstance(z, list):
        z = z[0]
    
#     print('reference: ', z['references'])
    print('text: ', z['#text'])
#     print("template: ", z['template'])
    print('')

category:  Airport
modifiedtripleset:  Abilene,_Texas | country | United_States
text:  Abilene, Texas is in the United States.

category:  Airport
modifiedtripleset:  Abilene,_Texas | isPartOf | Jones_County,_Texas
text:  Abilene, Texas is part of Jones County, Texas.

category:  Airport
modifiedtripleset:  Abilene,_Texas | isPartOf | Taylor_County,_Texas
text:  Abilene, Texas is part of Taylor County,Texas.

category:  Airport
modifiedtripleset:  Abilene,_Texas | isPartOf | Texas
text:  Abilene, Texas is part of Texas.

category:  Airport
modifiedtripleset:  Abilene_Regional_Airport | 1st_runway_LengthFeet | 3678
text:  The length of the 1st runway at Abilene Regional airport is 3678 feet.



In [16]:
y.keys(),y['lex']

(odict_keys(['@category', '@eid', '@size', 'originaltripleset', 'modifiedtripleset', 'lex']),
 OrderedDict([('@comment', 'good'),
              ('@lid', 'Id1'),
              ('#text',
               'The length of the 1st runway at Abilene Regional airport is 3678 feet.')]))

In [18]:
# challenge version
# Find the largest tokenised length among the reference texts of this file
# (only the first lexicalisation of every entry is considered).
max_len = 0
for i in range(len(x['benchmark']['entries']['entry'])):
    y=x['benchmark']['entries']['entry'][i]
    z = y['lex']
    if isinstance(z, list):
        z = z[0]
    text = z['#text']
    # Length is measured in tokenizer ids, not characters.
    temp_len = len(my_model.tokenizer.encode(text))
    if temp_len > max_len:
        max_len = temp_len
print(max_len)

92


In [20]:
# version 2.0
# Print category, triple(s), first reference text and template of entries 10-19.
for i in range(10, 20):
    y=x['benchmark']['entries']['entry'][i]
    print("category: ", y['@category'])
#     print('entitymap: ', y['entitymap'])
    # Files whose path has a 'train' component are read via 'mtriple',
    # all others via 'otriple'.
    if 'train' in path.split('/'):
        print("modifiedtripleset: ", y['modifiedtripleset']['mtriple'])    
    else:
        print("modifiedtripleset: ", y['modifiedtripleset']['otriple'])
    z = y['lex']
    # Several lexicalisations: look at the first one only.
    if isinstance(z, list):
        z = z[0]
    
#     print('reference: ', z['references'])
    print('text: ', z['text'])
    print("template: ", z['template'])
    print('')

category:  City
modifiedtripleset:  Antioch,_California | isPartOf | Contra_Costa_County,_California
text:  Antioch, is part of Contra Costa County in California.
template:  AGENT-1 is part of PATIENT-1 .

category:  City
modifiedtripleset:  Arlington,_Texas | populationDensity | 1472.0 (inhabitants per square kilometre)
text:  Arlington, Texas, has a population density, of 1472.0 inhabitants per square kilometre.
template:  AGENT-1 has a population density ,  of PATIENT-1 .

category:  City
modifiedtripleset:  Atlanta | leader | Kasim_Reed
text:  Kasim Reed is the leader in Atlanta.
template:  PATIENT-1 is the leader in AGENT-1 .

category:  City
modifiedtripleset:  Atlantic_City,_New_Jersey | areaTotal | 44.125 (square kilometres)
text:  Atlantic City, New Jersey has a total area of 44.125 (square kilometres).
template:  AGENT-1 has a total area of PATIENT-1 .

category:  City
modifiedtripleset:  Atlantic_City,_New_Jersey | isPartOf | New_Jersey
text:  Atlantic City is part of New Je

In [15]:
y.keys(), z.keys(), y['originaltripleset'], y['modifiedtripleset'], z['sortedtripleset']

(odict_keys(['@category', '@eid', '@size', 'originaltripleset', 'modifiedtripleset', 'lex', 'entitymap']),
 odict_keys(['@comment', '@lid', 'tree', 'sortedtripleset', 'references', 'text', 'template']),
 OrderedDict([('otriple',
               'Albuquerque,_New_Mexico | leaderTitle | New_Mexico_Senate')]),
 OrderedDict([('otriple',
               'Albuquerque,_New_Mexico | leaderTitle | New_Mexico_Senate')]),
 OrderedDict([('sentence',
               OrderedDict([('@ID', '1'),
                            ('striple',
                             'Albuquerque,_New_Mexico | leaderTitle | New_Mexico_Senate')]))]))

In [201]:
# version 1.5
# Print category, modified triple(s), references, text, template and
# lexicalization of the first three entries.
for i in range(3):
    y=x['benchmark']['entries']['entry'][i]
    print("category: ", y['@category'])
#     print("originaltripleset: ", y['originaltripleset']['otriple'])
    print("modifiedtripleset: ", y['modifiedtripleset']['mtriple'])
#     print("entitymap :", y['entitymap'])    
    z = y['lex']
    # Several lexicalisations: look at the first one only.
    if isinstance(z, list):
        z = z[0]
#     print(z['@comment'], z['@lid'], z['sortedtripleset']['sentence']['striple'], z['text'], z['template'], z['lexicalization'])
#     print("striple: ", z['sortedtripleset']['sentence']['striple']) # training data only
    
    # 'reference' is either a list of references or a single one.
    if isinstance(z['references']['reference'], list):
        for w in z['references']['reference']:
            print('reference: ', w['#text'])
    else:
        print('reference: ', z['references']['reference']['#text'])
    print("text: ", z['text'])
    print("template: ", z['template'])
    print("lexicalization: ", z['lexicalization'])
    print('')

category:  Airport
modifiedtripleset:  Aarhus_Airport | cityServed | "Aarhus, Denmark"
reference:  The Aarhus
reference:  Aarhus , Denmark
text:  The Aarhus is the airport of Aarhus, Denmark.
template:  AGENT-1 is the airport of PATIENT-1 .
lexicalization:  AGENT-1 VP[aspect=simple,tense=present,voice=active,person=3rd,number=singular] be DT[form=defined] the airport of PATIENT-1 .

category:  Airport
modifiedtripleset:  Aarhus_Airport | cityServed | Aarhus
reference:  Aarhus airport
reference:  Aarhus
text:  Aarhus airport serves the city of Aarhus.
template:  AGENT-1 serves the city of PATIENT-1 .
lexicalization:  AGENT-1 VP[aspect=simple,tense=present,voice=active,person=3rd,number=null] serve DT[form=defined] the city of PATIENT-1 .

category:  Airport
modifiedtripleset:  Aarhus_Airport | elevationAboveTheSeaLevel_(in_metres) | 25.0
reference:  Aarhus Airport
reference:  25 metres
text:  Aarhus Airport is 25 metres above sea level.
template:  AGENT-1 is PATIENT-1 above sea level .


In [208]:
z

[OrderedDict([('@comment', 'good'),
              ('@lid', 'Id1'),
              ('tree',
               [None,
                '(SENTENCES (S (NP-SUBJ (TAG AGENT-1)) (VP[aspect=simple,tense=present,voice=active,person=3rd,number=singular] (VB be) (NP (NP (NN part)) (PP (IN of) (NP (DT[form=defined] the) (NNP U.S.)))))))']),
              ('sortedtripleset',
               OrderedDict([('sentence',
                             OrderedDict([('@ID', '1'),
                                          ('striple',
                                           'Albany,_Oregon | isPartOf | United_States')]))])),
              ('references', None),
              ('text', 'Albany Oregon is part of the U.S.'),
              ('template', 'AGENT-1 is part of PATIENT-1 .')]),
 OrderedDict([('@comment', 'good'),
              ('@lid', 'Id2'),
              ('tree',
               [None,
                '(SENTENCES (S (NP-SUBJ (TAG AGENT-1)) (VP[aspect=simple,tense=present,voice=active,person=3rd,number=si

In [33]:
y=x['benchmark']['entries']['entry'][1]
y.keys()

odict_keys(['@category', '@eid', '@size', 'originaltripleset', 'modifiedtripleset', 'lex', 'entitymap'])

In [67]:
# Dump every key/value pair of the selected entry.
for k, v in y.items():    
    print(k, v)

@category Airport
@eid Id2
@size 1
originaltripleset OrderedDict([('otriple', 'Aarhus_Airport | city | Aarhus')])
modifiedtripleset OrderedDict([('mtriple', 'Aarhus_Airport | cityServed | Aarhus')])
lex OrderedDict([('@comment', 'good'), ('@lid', 'Id1'), ('sortedtripleset', OrderedDict([('sentence', OrderedDict([('@ID', '1'), ('striple', 'Aarhus_Airport | cityServed | Aarhus')]))])), ('references', OrderedDict([('reference', [OrderedDict([('@entity', 'Aarhus_Airport'), ('@number', '1'), ('@tag', 'AGENT-1'), ('@type', 'name'), ('#text', 'Aarhus airport')]), OrderedDict([('@entity', 'Aarhus'), ('@number', '2'), ('@tag', 'PATIENT-1'), ('@type', 'name'), ('#text', 'Aarhus')])])])), ('text', 'Aarhus airport serves the city of Aarhus.'), ('template', 'AGENT-1 serves the city of PATIENT-1 .'), ('lexicalization', 'AGENT-1 VP[aspect=simple,tense=present,voice=active,person=3rd,number=null] serve DT[form=defined] the city of PATIENT-1 .')])
entitymap OrderedDict([('entity', ['AGENT-1 | Aarhus_Ai

In [64]:
y['originaltripleset']['otriple']

'Aarhus_Airport | city | Aarhus'

In [65]:
y['modifiedtripleset']['mtriple']

'Aarhus_Airport | cityServed | Aarhus'

In [68]:
z = y['lex']
z.keys()

odict_keys(['@comment', '@lid', 'sortedtripleset', 'references', 'text', 'template', 'lexicalization'])

In [76]:
z['@comment'], z['@lid'], z['sortedtripleset']['sentence']['striple'], z['text'], z['template'], z['lexicalization']

('good',
 'Id1',
 'Aarhus_Airport | cityServed | Aarhus',
 'Aarhus airport serves the city of Aarhus.',
 'AGENT-1 serves the city of PATIENT-1 .',
 'AGENT-1 VP[aspect=simple,tense=present,voice=active,person=3rd,number=null] serve DT[form=defined] the city of PATIENT-1 .')

In [58]:
y['entitymap']

OrderedDict([('entity', ['AGENT-1 | Aarhus_Airport', 'PATIENT-1 | Aarhus'])])