In [1]:
from torch.utils.data import Dataset, DataLoader
class e2eDataset(Dataset):
    """Dataset over a CSV whose first column holds comma-separated
    ``slot[value]`` conditions and whose second column holds a sentence.

    Each item is the tokenized generation prompt for one row together with
    the row's sentence and the prompt's condition part.
    """

    def __init__(self, csv_file, tokenizer):
        """
        Args:
            csv_file (string): path to the CSV file.
            tokenizer: object exposing ``encode(text, add_special_tokens=...)``;
                used to turn the prompt string into token ids.
        """
        self.dataset = pd.read_csv(csv_file)
        self.columns = self.dataset.columns
        self.conditions = self.dataset[self.columns[0]]
        self.sentences = self.dataset[self.columns[1]]
        self.tokenizer = tokenizer

        # slot name -> set of every value observed for that slot in the file
        self.typ_list = {}
        for cond in self.conditions:
            for slot, value in self._parse_condition(cond):
                self.typ_list.setdefault(slot, set()).add(value)

    @staticmethod
    def _parse_condition(cond):
        """Split ``'a[x], b[y]'`` into ``[('a', 'x'), ('b', 'y')]``.

        Raises:
            ValueError: if an item contains no ``'['``.
        """
        pairs = []
        for item in cond.split(','):
            item = item.strip()
            pos = item.index('[')
            # The last character is dropped as the closing bracket.
            pairs.append((item[:pos], item[pos+1:-1]))
        return pairs

    def __len__(self):
        """Number of rows in the CSV."""
        return len(self.conditions)

    def __getitem__(self, idx):
        """Build the prompt for row ``idx``.

        Returns:
            tuple: ``(input_ids, sen, condition_string)`` where
            ``condition_string`` is ``'<slot>value '`` repeated for every
            condition, ``input_ids`` is a tensor of the token ids of
            ``condition_string + '<START>'`` and ``sen`` is the row's sentence.
        """
        condition_string = ''.join(
            '<' + slot + '>' + value + ' '
            for slot, value in self._parse_condition(self.conditions[idx])
        )

        sen = self.sentences[idx]
        input_string = condition_string + '<START>'
        input_ids = torch.tensor(self.tokenizer.encode(input_string, add_special_tokens=True))

        return input_ids, sen, condition_string


## Total model inference

In [2]:
# from model_large import *
from model import *
from tqdm import tqdm
import time
import nltk
nltk.download('punkt')
from nltk import word_tokenize
from nltk.translate.bleu_score import sentence_bleu
max_len = 70  # upper bound on the number of tokens generated per prompt

my_model = mymodel().cuda()
my_model.eval()

for i in range(1, 11):
    # Every checkpoint is loaded into the same model instance and evaluated in turn.
    model_name = './gen_model/base3_sample_30/'+str(i)+'/model'
    my_model.load_state_dict(torch.load(model_name))
    print('ok') 
    if i == 1:
        # Prompts and grouped references do not depend on the checkpoint, so they
        # are built once on the first iteration and reused for the other checkpoints.
#         e2e_dataset = e2eDataset(csv_file='dataset/devset.csv', tokenizer=my_model.tokenizer)
        e2e_dataset = e2eDataset(csv_file='dataset/testset_w_refs.csv', tokenizer=my_model.tokenizer)
        dataloader = DataLoader(e2e_dataset, batch_size=1, shuffle=False, num_workers=4)
        same_condition = []    # sentences collected for the condition currently being read
        ref_sentences = []     # one list of reference sentences per run of identical conditions
        input_ids_list = []    # one prompt tensor per entry of ref_sentences
        pre_condition_string = ''
        start = 0              # number of rows seen so far; 0 marks the very first row
        for i_batch, sample_batched in tqdm(enumerate(dataloader)):
            sen = sample_batched[1][0]
            condition_string = sample_batched[2]  
            input_ids = sample_batched[0].squeeze(0).cuda()

            # Consecutive rows with the same condition string are references for
            # the same prompt; a change of condition string starts a new group.
            if start == 0 or condition_string == pre_condition_string:      
                if start == 0:
                    input_ids_list.append(input_ids)
                same_condition.append(sen)        
                pre_condition_string = condition_string
                start += 1
            else:   
                input_ids_list.append(input_ids)
                ref_sentences.append(same_condition)
                pre_condition_string = condition_string
                same_condition = [sen]
                start += 1
        # Flush the last group, which no later condition change closes.
        ref_sentences.append(same_condition)    

    bleu_score = 0
    bleu_1 = 0

#     f_dev = open('./predictions/testset/large2/f_dev_'+str(i)+'.txt', 'w')
#     f_pred = open('./predictions/devset/base4/f_pred_'+str(i)+'.txt', 'w')
    pred_path = './predictions/joosung2/testset/base3_sample30/f_pred_'+str(i)+'.txt'

    # The context manager closes the prediction file even if decoding fails
    # part-way; no_grad keeps autograd from recording the inference passes.
    with open(pred_path, 'w') as f_pred, torch.no_grad():
        for k in range(len(ref_sentences)):
            input_ids = input_ids_list[k]
            input_len = len(input_ids)

            ori_tokens = []
            for m in range(len(ref_sentences[k])):
#                 f_dev.write(ref_sentences[k][m]+'\n')
                ori_tokens.append(word_tokenize(ref_sentences[k][m]))
#             if k < len(ref_sentences)-1:
#                 f_dev.write('\n')

            # Greedy decoding: append the highest-scoring next token until EOS or max_len.
            for _ in range(max_len):
                # NOTE(review): the original comment said (batch, seq_len, emb_dim), but
                # argmax(1)[-1] below implies per-position scores of shape
                # (seq_len, vocab) - confirm against model_feeding.
                model_out = my_model.model_feeding(input_ids)
                pred_idx = model_out.argmax(1)[-1]        
                if pred_idx == my_model.tokenizer.eos_token_id:
                    break            
                input_ids = torch.cat((input_ids, pred_idx.unsqueeze(0)), 0)        

            # Only the generated continuation (after the prompt) is decoded and saved.
            out_sen = my_model.tokenizer.decode(input_ids[input_len:])
            f_pred.write(out_sen+'\n')

            out_tokens = word_tokenize(out_sen)

            bleu_1_score = sentence_bleu(ori_tokens, out_tokens, weights=(1, 0, 0, 0))
            bleu_2_score = sentence_bleu(ori_tokens, out_tokens, weights=(0.5, 0.5, 0, 0))
            bleu_3_score = sentence_bleu(ori_tokens, out_tokens, weights=(0.33, 0.33, 0.33, 0))
            bleu_4_score = sentence_bleu(ori_tokens, out_tokens, weights=(0.25, 0.25, 0.25, 0.25))

            bleu_1 += bleu_1_score

            # Geometric mean of the four cumulative BLEU scores, scaled by a length factor.
            # NOTE(review): len(ori_tokens) is the NUMBER of reference sentences, not a
            # reference length in tokens, so this factor is not a conventional brevity
            # penalty - confirm the intent before comparing with other BLEU numbers.
            bleu_score += min(1, len(out_tokens)/len(ori_tokens))*((bleu_1_score*bleu_2_score*bleu_3_score*bleu_4_score)**(0.25))    

#     f_dev.close()
    
    print(i, "th model")
    print("BLEU score: {}".format(bleu_score/len(ref_sentences)*100))
    print("BLEU1 score: {}".format(bleu_1/len(ref_sentences)*100))    

I0506 00:55:24.555836 139751306041152 file_utils.py:41] PyTorch version 1.2.0 available.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
[nltk_data] Downloading package punkt to /home/ds_user1/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
I0506 00:55:28.815614 139751306041152 tokenization_utils.py:504] loading file https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-vocab.json from cache at /home/ds_user

ok


4693it [00:04, 1043.14it/s]
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


1 th model
BLEU score: 64.7182026491176
BLEU1 score: 81.66713292807486
ok
2 th model
BLEU score: 72.70623317413948
BLEU1 score: 87.90252726244282
ok
3 th model
BLEU score: 67.32506458544132
BLEU1 score: 81.56109619474205
ok
4 th model
BLEU score: 73.51276102082251
BLEU1 score: 87.92926846373422
ok
5 th model
BLEU score: 72.82402924422641
BLEU1 score: 88.03365885644202
ok
6 th model
BLEU score: 74.213070114825
BLEU1 score: 89.14764820261996
ok
7 th model
BLEU score: 74.03113321821753
BLEU1 score: 88.99630691353684
ok
8 th model
BLEU score: 73.73009345136494
BLEU1 score: 89.32915071638948
ok
9 th model
BLEU score: 73.59609822373056
BLEU1 score: 88.46212463637237
ok
10 th model
BLEU score: 73.34397462158798
BLEU1 score: 88.4945119671014


## BERTscore

In [None]:
### BERT score with human reference
from bert_score import score
import glob
human_files = "/project/work/E2E/predictions/joosung2/testset/f_dev.txt"

# The context manager closes the reference file even if reading fails.
with open(human_files, "r") as human_open:
    human_dataset = human_open.readlines()

# A blank line ends one group of references; group k belongs to prediction line k.
human_references = []
temp_reference = []
for line in human_dataset:
    if line == '\n':
        human_references.append(temp_reference)
        temp_reference = []
    else:
        temp_reference.append(line.strip())
human_references.append(temp_reference)  # last group has no trailing blank line

# Flattened references, aligned one-to-one with the repeated candidates built below.
human_compare = [ref for group in human_references for ref in group]

output_path = "/project/work/E2E/predictions/joosung2/testset/base1_sample30/*"
# glob returns files in arbitrary order; sorting makes pred_files/score_list reproducible.
pred_files = sorted(glob.glob(output_path))

score_list = []  # score_list[j] is the mean F1 for pred_files[j]
for pred_file in pred_files:
    with open(pred_file, "r") as pred_data_open:
        pred_data_dataset = pred_data_open.readlines()

    # Repeat every prediction once per reference in its group so that
    # cands and human_compare have the same length and ordering.
    cands = []
    for k, pred_line in enumerate(pred_data_dataset):
        out_sen = pred_line.strip()
        cands.extend([out_sen] * len(human_references[k]))

    P, R, F1 = score(cands, human_compare, lang='en', verbose=True)
    F1_list=list(F1.numpy())
    BERT_score = sum(F1_list)/len(F1_list)
    
    score_list.append(BERT_score)    

In [None]:
# Display the prediction files next to their scores (the two lists are index-aligned).
pred_files, score_list

In [None]:
### BERT score with human reference
import csv
from bert_score import score
import glob
human_files = "/project/work/E2E/predictions/testset/f_dev.txt"

# The context manager closes the reference file even if reading fails.
with open(human_files, "r") as human_open:
    human_dataset = human_open.readlines()

# A blank line ends one group of references; group k belongs to prediction line k.
human_references = []
temp_reference = []
for line in human_dataset:
    if line == '\n':
        human_references.append(temp_reference)
        temp_reference = []
    else:
        temp_reference.append(line.strip())
human_references.append(temp_reference)  # last group has no trailing blank line

# Flattened references, aligned one-to-one with the repeated candidates built below.
human_compare = [ref for group in human_references for ref in group]

output_path = "/project/work/E2E/compared_system/system_outputs/primary_txt/*"
# glob returns files in arbitrary order; sorting makes pred_files/score_list reproducible.
pred_files = sorted(glob.glob(output_path))

score_list = []  # score_list[j] is the mean F1 for pred_files[j]
for pred_file in pred_files:
    with open(pred_file, "r") as pred_data_open:
        pred_data_dataset = pred_data_open.readlines()

    # Repeat every prediction once per reference in its group so that
    # cands and human_compare have the same length and ordering.
    cands = []
    for k, pred_line in enumerate(pred_data_dataset):
        out_sen = pred_line.strip()
        cands.extend([out_sen] * len(human_references[k]))

    P, R, F1 = score(cands, human_compare, lang='en', verbose=True)
    F1_list=list(F1.numpy())
    BERT_score = sum(F1_list)/len(F1_list)
    
    score_list.append(BERT_score)  
print(pred_files, score_list)    

In [None]:
# One line per prediction file: its path followed by its score.
for idx, pred_path in enumerate(pred_files):
    print(pred_path, score_list[idx])

In [3]:
!./e2e-metrics/measure_scores.py ./predictions/joosung2/testset/f_dev.txt ./predictions/joosung2/testset/base3_sample30/f_pred_1.txt
!./e2e-metrics/measure_scores.py ./predictions/joosung2/testset/f_dev.txt ./predictions/joosung2/testset/base3_sample30/f_pred_2.txt
!./e2e-metrics/measure_scores.py ./predictions/joosung2/testset/f_dev.txt ./predictions/joosung2/testset/base3_sample30/f_pred_3.txt
!./e2e-metrics/measure_scores.py ./predictions/joosung2/testset/f_dev.txt ./predictions/joosung2/testset/base3_sample30/f_pred_4.txt
!./e2e-metrics/measure_scores.py ./predictions/joosung2/testset/f_dev.txt ./predictions/joosung2/testset/base3_sample30/f_pred_5.txt
!./e2e-metrics/measure_scores.py ./predictions/joosung2/testset/f_dev.txt ./predictions/joosung2/testset/base3_sample30/f_pred_6.txt
!./e2e-metrics/measure_scores.py ./predictions/joosung2/testset/f_dev.txt ./predictions/joosung2/testset/base3_sample30/f_pred_7.txt
!./e2e-metrics/measure_scores.py ./predictions/joosung2/testset/f_dev.txt ./predictions/joosung2/testset/base3_sample30/f_pred_8.txt
!./e2e-metrics/measure_scores.py ./predictions/joosung2/testset/f_dev.txt ./predictions/joosung2/testset/base3_sample30/f_pred_9.txt
!./e2e-metrics/measure_scores.py ./predictions/joosung2/testset/f_dev.txt ./predictions/joosung2/testset/base3_sample30/f_pred_10.txt
# !./e2e-metrics/measure_scores.py ./predictions/joosung2/testset/f_dev.txt ./predictions/final/1base1_sample10_6.txt
# !./e2e-metrics/measure_scores.py ./predictions/joosung2/testset/f_dev.txt ./predictions/final/1base2_sample10_9.txt
# !./e2e-metrics/measure_scores.py ./predictions/joosung2/testset/f_dev.txt ./predictions/final/1base4_devtest_4.txt
# !./e2e-metrics/measure_scores.py ./predictions/joosung2/testset/f_dev.txt ./predictions/final/2base1_sample50_7.txt
# !./e2e-metrics/measure_scores.py ./predictions/joosung2/testset/f_dev.txt /project/work/E2E/compared_system/system_outputs/primary_txt/harv.txt

./predictions/joosung2/testset/f_dev.txt ./predictions/joosung2/testset/base3_sample30/f_pred_1.txt None
Running MS-COCO evaluator...
creating index...
index created!
Loading and preparing results...     
DONE (t=0.00s)
creating index...
index created!
tokenization...
PTBTokenizer tokenized 129948 tokens at 470312.13 tokens per second.
PTBTokenizer tokenized 16239 tokens at 128875.23 tokens per second.
setting up scorers...
computing METEOR score...
METEOR: 0.371
computing Rouge score...
ROUGE_L: 0.599
computing CIDEr score...
CIDEr: 1.563
Creating temp directory  /tmp/e2e-eval-o2g4n0r1
Running MTEval to compute BLEU & NIST...
Use of 'Hyphen' in \p{} or \P{} is deprecated because: Supplanted by Line_Break property values; see www.unicode.org/reports/tr14; at /project/work/E2E/e2e-metrics/mteval/mteval-v13a-sig.pl line 993.
MT evaluation scorer began on 2020 May 6 at 02:14:29
command line:  /project/work/E2E/e2e-metrics/mteval/mteval-v13a-sig.pl -r /tmp/e2e-eval-o2g4n0r1/mteval_ref.sgm 

## TSV to TXT

In [None]:
import glob  # imported here so the cell does not rely on an earlier cell's import

import pandas as pd

output_path = "/project/work/E2E/compared_system/system_outputs/primary/*"
comapred_files = glob.glob(output_path)

# Write column 1 of every tab-separated system output to a one-sentence-per-line text file.
for tsv_path in comapred_files:
    # header=None keeps the file's first row as data row 0; that row is skipped below.
    dataset = pd.read_csv(tsv_path, delimiter='\t', header=None)

    # Output file is named after the input file's base name up to its first dot.
    name = tsv_path.split('/')[-1].split('.')[0]
    txt_files = "/project/work/E2E/compared_system/system_outputs/primary_txt/"+name+".txt"
    gen_sentences = dataset[1]

    # The context manager closes the output file even if a write fails.
    with open(txt_files, "w") as f:
        for k in range(1, len(gen_sentences)):
            f.write(gen_sentences[k]+'\n')

In [None]:
# Row count of column 1 for the last file converted above (includes the skipped first row).
len(gen_sentences)

## Zero-shot test

In [None]:
from model import *

max_len = 70  # cap on generated tokens; the generation cell further down assigns it again
# Instantiate the model on the GPU and call .eval() before restoring weights.
my_model = mymodel().cuda()
my_model.eval()
# Restore the weights of one specific saved checkpoint.
model_name = './gen_model/base_devtrain_4/4/model'
my_model.load_state_dict(torch.load(model_name))

In [None]:
import random
import pandas as pd
csv_file='dataset/testset_w_refs.csv'
dataset = pd.read_csv(csv_file)
columns = dataset.columns
conditions = dataset[columns[0]]

# slot name -> set of every value that occurs for that slot in the first column
typ_list = {}
for cond in conditions:
    for item in cond.split(','):
        item = item.strip()
        pos = item.index('[')
        # text before '[' is the slot name; the trailing character is dropped as ']'
        typ_list.setdefault(item[:pos], set()).add(item[pos+1:-1])

def sample_batch(tokenizer, cond_name, cond_set):
    """Build the generation prompt for one set of conditions.

    Args:
        tokenizer: object exposing ``encode(text, add_special_tokens=...)``.
        cond_name: slot tags (e.g. ``'<area>'``), one per entry of ``cond_set``.
        cond_set: slot values, in the same order as ``cond_name``.

    Returns:
        tuple: ``(input_ids, condition_string)`` where ``condition_string`` is
        every tag immediately followed by its value and a space, and
        ``input_ids`` is a tensor of the token ids of
        ``condition_string + '<START>'``.

    Raises:
        IndexError: if ``cond_name`` has fewer entries than ``cond_set``.
    """
    pieces = []
    for m, value in enumerate(cond_set):
        pieces.append(cond_name[m] + value + ' ')
    condition_string = ''.join(pieces)

    input_string = condition_string + '<START>'
    input_ids = torch.tensor(tokenizer.encode(input_string, add_special_tokens=True))

    return input_ids, condition_string

# <name> <eatType> <food> <priceRange> <customer rating> <area> <familyFriendly> <near>

# Requested slot values. None leaves the slot out of the prompt. For every slot
# except familyFriendly the model is prompted with a randomly chosen known value,
# which is replaced by the requested string in the generated sentence afterwards.
name = None #'<NAME>'
eatType = None #'<EATTYPE>'
food = None # '<FOOD>'
priceRange = None # '<priceRange>'
customer_rating = None # '<CUSTOMER_RATING>'
area = '<area>' # None
familyFriendly = 'yes' # None
near = None #'<NEAR>'

# (slot key in typ_list, requested value), in the order the slots enter the prompt.
slot_requests = [
    ('name', name),
    ('eatType', eatType),
    ('food', food),
    ('priceRange', priceRange),
    ('customer rating', customer_rating),
    ('area', area),
    ('familyFriendly', familyFriendly),
    ('near', near),
]

cond_list = []     # slot tags such as '<area>'
conditions = []    # value fed to the model for each tag
placeholders = {}  # slot -> randomly chosen known value standing in for the request
for slot, requested in slot_requests:
    if requested is None:
        continue
    cond_list.append('<' + slot + '>')
    if slot == 'familyFriendly':
        # familyFriendly is passed to the model verbatim; no stand-in value is drawn.
        conditions.append(requested)
    else:
        placeholders[slot] = random.choice(list(typ_list[slot]))
        conditions.append(placeholders[slot])

sample = sample_batch(my_model.tokenizer, cond_list, conditions)

input_ids = sample[0].cuda()
condition_string = sample[1]  
input_len = len(input_ids)

max_len = 70
# Greedy decoding; no_grad keeps autograd from recording the inference passes.
with torch.no_grad():
    for _ in range(max_len):
        # NOTE(review): the original comment said (batch, seq_len, emb_dim), but
        # argmax(1)[-1] implies per-position scores of shape (seq_len, vocab) - confirm.
        model_out = my_model.model_feeding(input_ids)
        pred_idx = model_out.argmax(1)[-1]        
        if pred_idx == my_model.tokenizer.eos_token_id:
            break            
        input_ids = torch.cat((input_ids, pred_idx.unsqueeze(0)), 0)        

# Only the generated continuation (after the prompt) is decoded.
out_sen = my_model.tokenizer.decode(input_ids[input_len:])
print(cond_list)
print(conditions)
print(out_sen)

# Swap every stand-in value for the string that was actually requested.
for slot, requested in slot_requests:
    if slot in placeholders:
        out_sen = out_sen.replace(placeholders[slot], requested)
print(out_sen)

In [None]:
# Display the condition part of the prompt that was fed to the model above.
condition_string

## Delexicalization

In [None]:
class e2eDataset(Dataset):
    """Dataset over two CSV files concatenated in order, with random delexicalization.

    In both files the first column holds comma-separated ``slot[value]``
    conditions and the second column holds the sentence.

    NOTE(review): this reuses the name ``e2eDataset`` of the class defined in the
    first cell but takes different constructor arguments and returns different
    items; running this cell shadows the earlier class.
    """

    # Slots whose values always stay in the prompt and are never replaced in the sentence.
    _NOCHANGE_SLOTS = ['priceRange', 'customer rating', 'familyFriendly']

    def __init__(self, csv_file1, csv_file2, tokenizer):
        """
        Args:
            csv_file1 (string): path to the first CSV file.
            csv_file2 (string): path to the second CSV file; its rows follow
                those of ``csv_file1``.
            tokenizer: object exposing ``encode(text, add_special_tokens=...)``.
        """
        self.dataset1 = pd.read_csv(csv_file1)
        self.dataset2 = pd.read_csv(csv_file2)
        
        self.columns1 = self.dataset1.columns
        self.columns2 = self.dataset2.columns
        
        self.conditions = list(self.dataset1[self.columns1[0]]) + list(self.dataset2[self.columns2[0]])
        self.sentences = list(self.dataset1[self.columns1[1]]) + list(self.dataset2[self.columns2[1]])
        self.tokenizer = tokenizer
        
        # slot name -> set of every value observed for that slot in either file
        self.typ_list = {}
        for cond in self.conditions:
            for slot, value in self._parse_condition(cond):
                self.typ_list.setdefault(slot, set()).add(value)

    @staticmethod
    def _parse_condition(cond):
        """Split ``'a[x], b[y]'`` into ``[('a', 'x'), ('b', 'y')]``.

        Raises:
            ValueError: if an item contains no ``'['``.
        """
        pairs = []
        for item in cond.split(','):
            item = item.strip()
            pos = item.index('[')
            # The last character is dropped as the closing bracket.
            pairs.append((item[:pos], item[pos+1:-1]))
        return pairs

    def __len__(self):
        """Total number of rows across both CSV files."""
        return len(self.conditions)

    def __getitem__(self, idx):
        """Build one training example for row ``idx``.

        When ``random.random()`` is greater than 0.3 the prompt keeps every slot
        value. Otherwise every slot outside ``_NOCHANGE_SLOTS`` is written as a
        bare ``<slot>`` tag and each occurrence of its value in the sentence is
        replaced by that tag.

        Returns:
            tuple: ``(input_string, input_ids, label_ids)`` where
            ``input_string`` is the prompt, ``'<START>'`` and the sentence,
            ``input_ids`` its token ids, and ``label_ids`` the token ids of the
            sentence followed by ``' <|endoftext|>'``.
        """
        sen = self.sentences[idx]
        
        p = random.random()
        pairs = self._parse_condition(self.conditions[idx])
        condition_string = ''

        if p > 0.3: # 70%
            for slot, value in pairs:
                condition_string += '<' + slot + '>' + value + ' '
        else: # p <= 0.3 / 30%
            for slot, value in pairs:
                placeholder = '<' + slot + '>'
                if slot not in self._NOCHANGE_SLOTS:
                    condition_string += placeholder + ' '
                    sen = sen.replace(value, placeholder)        
                else:
                    condition_string += placeholder + value + ' '
        
        input_string = condition_string + '<START>' + sen
        input_ids = torch.tensor(self.tokenizer.encode(input_string, add_special_tokens=True))
        
        label_string = sen + ' <|endoftext|>'
        label_ids = torch.tensor(self.tokenizer.encode(label_string, add_special_tokens=True))

        return input_string, input_ids, label_ids    

In [None]:
# Build the combined train+dev dataset with the tokenizer of the already-loaded model
# and print the input string of one sample as a quick sanity check. The printed
# string can differ between runs because __getitem__ draws a random number.
e2e_dataset = e2eDataset(csv_file1='dataset/trainset.csv', csv_file2='dataset/devset.csv', tokenizer=my_model.tokenizer)
print(e2e_dataset[200][0])
# print(e2e_dataset.typ_list.keys())
# print(e2e_dataset.typ_list)
# priceRange, customer rating, familyFriendly

# check data

In [None]:
# Read both files completely; the context manager closes the handles afterwards.
with open('f_dev.txt', 'r') as f_dev, open('f_pred.txt', 'r') as f_pred:
    f_dev_dataset = f_dev.readlines()
    f_pred_dataset = f_pred.readlines()

In [None]:
# Raw line counts of both files (blank lines included).
len(f_dev_dataset), len(f_pred_dataset)

In [None]:
# Drop lines that consist only of a newline, then count what remains in each file.
data=[x for x in f_dev_dataset if x != '\n']
data2=[x for x in f_pred_dataset if x != '\n']
len(data), len(data2)

In [None]:
# Scratch arithmetic. NOTE(review): the meaning of the two operands is not recorded
# in this notebook - presumably counts taken from the cells above; confirm or delete.
4672+546

In [None]:
# Read the e2e-metrics example inputs completely; the context manager closes the handles.
with open('./e2e-metrics/example-inputs/devel-conc.txt', 'r') as f_dev, \
        open('./e2e-metrics/example-inputs/baseline-output.txt', 'r') as f_pred:
    f_dev_dataset = f_dev.readlines()
    f_pred_dataset = f_pred.readlines()

In [None]:
# Raw line counts of both example files (blank lines included).
len(f_dev_dataset), len(f_pred_dataset)

In [None]:
# Drop lines that consist only of a newline, then count what remains in each file.
data=[x for x in f_dev_dataset if x != '\n']
data2=[x for x in f_pred_dataset if x != '\n']
len(data), len(data2)

In [None]:
# Scratch check: draw one float from random.random() and display it.
import random
p=random.random()
p