In [1]:
from torch.utils.data import Dataset, DataLoader
class e2eDataset(Dataset):
    def __init__(self, csv_file, tokenizer):
        """
        Args:
            csv_file (string): csv 파일의 경로
        """
        self.dataset = pd.read_csv(csv_file)
        self.columns = self.dataset.columns
        self.conditions = self.dataset[self.columns[0]]
        self.sentences = self.dataset[self.columns[1]]
        self.tokenizer = tokenizer
        
        self.typ_list = {}
        for k in range(len(self.conditions)):
            cond_set = self.conditions[k].split(',')
            for m in range(len(cond_set)):
                cond_set[m] = cond_set[m].strip()
                pos = cond_set[m].index('[')
                if cond_set[m][:pos] in self.typ_list.keys():
                    self.typ_list[cond_set[m][:pos]].add(cond_set[m][pos+1:-1])
                else:            
                    self.typ_list[cond_set[m][:pos]] = {cond_set[m][pos+1:-1]}        

    def __len__(self):
        return len(self.conditions)

    def __getitem__(self, idx):
        cond = self.conditions[idx]
        cond_set = cond.split(',')
        condition_string = ''
        for m in range(len(cond_set)):
            cond_set[m] = cond_set[m].strip()
            pos = cond_set[m].index('[')
            
            condition_string += '<' + cond_set[m][:pos] + '>' + cond_set[m][pos+1:-1] + ' '
        
        sen = self.sentences[idx]
        input_string = condition_string + '<START>'
        input_ids = torch.tensor(self.tokenizer.encode(input_string, add_special_tokens=True))
        
        input_len = len(input_ids)

        return input_ids, sen, condition_string


In [2]:
from model import *
my_model = mymodel().cuda()
my_model.eval()
my_model.load_state_dict(torch.load('./gen_model/base/2/model'))
print('ok') 

I0403 08:02:35.806313 140105540278080 file_utils.py:41] PyTorch version 1.2.0 available.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
I0403 08:02:39.756214 140105540278080 tokenization_utils.py:504] loading file https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-vocab.json from cache at /home/ds_user1/.cache/torch/transformers/f2808208f9bec2320371a9f5f891c184ae0b674ef866b79c58177067d15732dd.1512018be4ba4e8726e41b91451

ok


In [3]:
# e2e_dataset = e2eDataset(csv_file='dataset/testset_w_refs.csv', tokenizer=my_model.tokenizer)
# e2e_dataset = e2eDataset(csv_file='dataset/trainset.csv', tokenizer=my_model.tokenizer)
e2e_dataset = e2eDataset(csv_file='dataset/devset.csv', tokenizer=my_model.tokenizer)
dataloader = DataLoader(e2e_dataset, batch_size=1, shuffle=False, num_workers=4)   

In [4]:
from tqdm import tqdm
same_condition = []
ref_sentences = []
input_ids_list = []
pre_condition_string = ''
start = 0
for i_batch, sample_batched in tqdm(enumerate(dataloader)):
    sen = sample_batched[1][0]
#     print(i_batch, sen)
    condition_string = sample_batched[2]  
    input_ids = sample_batched[0].squeeze(0).cuda()
        
    if start == 0 or condition_string == pre_condition_string:      
        if start == 0:
            input_ids_list.append(input_ids)
        same_condition.append(sen)        
        pre_condition_string = condition_string
        start += 1
    else:   
        input_ids_list.append(input_ids)
        ref_sentences.append(same_condition)
        pre_condition_string = condition_string
        same_condition = [sen]
        start += 1

#     if i_batch == 30:
#         break            
ref_sentences.append(same_condition)

4672it [00:03, 1311.04it/s]


In [5]:
len(ref_sentences), len(input_ids_list)
# input_ids_list[0]

(547, 547)

In [5]:
import time
import nltk
nltk.download('punkt')
from nltk import word_tokenize
from nltk.translate.bleu_score import sentence_bleu
max_len = 50
start_time = time.time()
bleu_score = 0
bleu_1 = 0

f_dev = open('./predictions/base/f_dev_2.txt', 'w')
f_pred = open('./predictions/base/f_pred_2.txt', 'w')

for k in range(len(ref_sentences)):
    input_ids = input_ids_list[k]
    input_len = len(input_ids)

    ori_tokens = []
    for m in range(len(ref_sentences[k])):
        f_dev.write(ref_sentences[k][m]+'\n')
        ori_tokens.append(word_tokenize(ref_sentences[k][m]))
    if k < len(ref_sentences)-1:
        f_dev.write('\n')
    
    for _ in range(max_len):
        model_out = my_model.model_feeding(input_ids) # (batch, seq_len, emb_dim)
        pred_idx = model_out.argmax(1)[-1]        
        if pred_idx == my_model.tokenizer.eos_token_id:
            break            
        input_ids = torch.cat((input_ids, pred_idx.unsqueeze(0)), 0)        
    
    out_sen = my_model.tokenizer.decode(input_ids[input_len:])
    f_pred.write(out_sen+'\n')
    
#     print(ref_sentences[k])
#     print(out_sen)    
    
    out_tokens = word_tokenize(out_sen)
    
    bleu_1_score = sentence_bleu(ori_tokens, out_tokens, weights=(1, 0, 0, 0))
    bleu_2_score = sentence_bleu(ori_tokens, out_tokens, weights=(0.5, 0.5, 0, 0))
    bleu_3_score = sentence_bleu(ori_tokens, out_tokens, weights=(0.33, 0.33, 0.33, 0))
    bleu_4_score = sentence_bleu(ori_tokens, out_tokens, weights=(0.25, 0.25, 0.25, 0.25))
    
    bleu_1 += bleu_1_score

    bleu_score += min(1, len(out_tokens)/len(ori_tokens))*((bleu_1_score*bleu_2_score*bleu_3_score*bleu_4_score)**(0.25))    
    
#     print("time: {}".format(time.time()-start_time))
#     print('')
f_dev.close()
f_pred.close()
    
print("BLEU score: {}".format(bleu_score/len(ref_sentences)*100))
print("BLEU1 score: {}".format(bleu_1/len(ref_sentences)*100))

[nltk_data] Downloading package punkt to /home/ds_user1/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


BLEU score: 73.41162457057993
BLEU1 score: 89.30558590312106


In [13]:
BLEU score: 71.18825887456514
BLEU1 score: 88.76109911863252

./measure_scores.py ../predictions/base/f_dev_6.txt ../predictions/base/f_pred_6.txt
./measure_scores.py ../predictions/base2/f_dev_2.txt ../predictions/base2/f_pred_2.txt
./measure_scores.py ../predictions/large/f_dev_5.txt ../predictions/large/f_pred_5.txt
./measure_scores.py ../predictions/f_dev_2.txt ../predictions/large2/f_pred_2.txt

### Large
#### my_output_1
BLEU: 0.6826
NIST: 8.4243
METEOR: 0.4557
ROUGE_L: 0.7032
CIDEr: 2.1234

#### my_output_2
BLEU: 0.7228
NIST: 8.5241
METEOR: 0.4851
ROUGE_L: 0.7461
CIDEr: 2.4645

#### my_output_3
BLEU: 0.7035
NIST: 8.5937
METEOR: 0.4700
ROUGE_L: 0.7252
CIDEr: 2.3310

#### my_output_4
BLEU: 0.6738
NIST: 8.4018
METEOR: 0.4576
ROUGE_L: 0.7075
CIDEr: 2.2172

#### my_output_5
BLEU: 0.6927
NIST: 8.4429
METEOR: 0.4662
ROUGE_L: 0.7180
CIDEr: 2.2729

### base
#### my_output_2
BLEU: 0.6812
NIST: 8.5491
METEOR: 0.4442
ROUGE_L: 0.7036
CIDEr: 2.1261

#### my_output_6
BLEU: 0.6655
NIST: 8.4830
METEOR: 0.4475
ROUGE_L: 0.6992
CIDEr: 2.1077
    
#### my_output_final
BLEU: 0.6529
NIST: 8.3116
METEOR: 0.4430
ROUGE_L: 0.6842
CIDEr: 1.9766

### Pragmatically Informative Text Generation
BLEU 68.60
NIST 8.73
METEOR 45.25
R-L 70.82
CIDEr 2.37

## Total model inference

In [6]:
# from model_large import *
from model import *
from tqdm import tqdm
import time
import nltk
nltk.download('punkt')
from nltk import word_tokenize
from nltk.translate.bleu_score import sentence_bleu
max_len = 70

my_model = mymodel().cuda()
my_model.eval()

for i in range(1, 7):
    model_name = './gen_model/base4/'+str(i)+'/model'
    my_model.load_state_dict(torch.load(model_name))
    print('ok') 
    if i == 1:
#         e2e_dataset = e2eDataset(csv_file='dataset/devset.csv', tokenizer=my_model.tokenizer)
        e2e_dataset = e2eDataset(csv_file='dataset/testset_w_refs.csv', tokenizer=my_model.tokenizer)
        dataloader = DataLoader(e2e_dataset, batch_size=1, shuffle=False, num_workers=4)
        same_condition = []
        ref_sentences = []
        input_ids_list = []
        pre_condition_string = ''
        start = 0
        for i_batch, sample_batched in tqdm(enumerate(dataloader)):
            sen = sample_batched[1][0]
            condition_string = sample_batched[2]  
            input_ids = sample_batched[0].squeeze(0).cuda()

            if start == 0 or condition_string == pre_condition_string:      
                if start == 0:
                    input_ids_list.append(input_ids)
                same_condition.append(sen)        
                pre_condition_string = condition_string
                start += 1
            else:   
                input_ids_list.append(input_ids)
                ref_sentences.append(same_condition)
                pre_condition_string = condition_string
                same_condition = [sen]
                start += 1
        ref_sentences.append(same_condition)    

    bleu_score = 0
    bleu_1 = 0

#     f_dev = open('./predictions/testset/large2/f_dev_'+str(i)+'.txt', 'w')
#     f_pred = open('./predictions/devset/base4/f_pred_'+str(i)+'.txt', 'w')
    f_pred = open('./predictions/testset/base4/f_pred_'+str(i)+'.txt', 'w')

    for k in range(len(ref_sentences)):
        input_ids = input_ids_list[k]
        input_len = len(input_ids)

        ori_tokens = []
        for m in range(len(ref_sentences[k])):
#             f_dev.write(ref_sentences[k][m]+'\n')
            ori_tokens.append(word_tokenize(ref_sentences[k][m]))
#         if k < len(ref_sentences)-1:
#             f_dev.write('\n')

        for _ in range(max_len):
            model_out = my_model.model_feeding(input_ids) # (batch, seq_len, emb_dim)
            pred_idx = model_out.argmax(1)[-1]        
            if pred_idx == my_model.tokenizer.eos_token_id:
                break            
            input_ids = torch.cat((input_ids, pred_idx.unsqueeze(0)), 0)        

        out_sen = my_model.tokenizer.decode(input_ids[input_len:])
        f_pred.write(out_sen+'\n')

        out_tokens = word_tokenize(out_sen)

        bleu_1_score = sentence_bleu(ori_tokens, out_tokens, weights=(1, 0, 0, 0))
        bleu_2_score = sentence_bleu(ori_tokens, out_tokens, weights=(0.5, 0.5, 0, 0))
        bleu_3_score = sentence_bleu(ori_tokens, out_tokens, weights=(0.33, 0.33, 0.33, 0))
        bleu_4_score = sentence_bleu(ori_tokens, out_tokens, weights=(0.25, 0.25, 0.25, 0.25))

        bleu_1 += bleu_1_score

        bleu_score += min(1, len(out_tokens)/len(ori_tokens))*((bleu_1_score*bleu_2_score*bleu_3_score*bleu_4_score)**(0.25))    

#     f_dev.close()
    f_pred.close()
    
    print(i, "th model")
    print("BLEU score: {}".format(bleu_score/len(ref_sentences)*100))
    print("BLEU1 score: {}".format(bleu_1/len(ref_sentences)*100))    

[nltk_data] Downloading package punkt to /home/ds_user1/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
I0409 06:18:53.380787 140280378275648 tokenization_utils.py:504] loading file https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-vocab.json from cache at /home/ds_user1/.cache/torch/transformers/f2808208f9bec2320371a9f5f891c184ae0b674ef866b79c58177067d15732dd.1512018be4ba4e8726e41b9145129dc30651ea4fec86aa61f4b9f40bf94eac71
I0409 06:18:53.381873 140280378275648 tokenization_utils.py:504] loading file https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-merges.txt from cache at /home/ds_user1/.cache/torch/transformers/d629f792e430b3c76a1291bb2766b0a047e36fae0588f9dbc1ae51decdff691b.70bec105b4158ed9a1747fea67a43f5dee97855c64d62b6ec3742f4cfdb5feda
I0409 06:18:53.459385 140280378275648 tokenization_utils.py:663] Adding <START> to the vocabulary
I0409 06:18:53.460474 140280378275648 tokenization_utils.py:741] Assigning <START> to the bos_token key of the tokenizer

ok


4693it [00:03, 1323.73it/s]


1 th model
BLEU score: 70.4744678037779
BLEU1 score: 84.8217172689494
ok
2 th model
BLEU score: 74.67457868362462
BLEU1 score: 89.68311829741859
ok
3 th model
BLEU score: 73.6242194106939
BLEU1 score: 88.84134436373095
ok
4 th model
BLEU score: 74.2753596714431
BLEU1 score: 90.01122665494424
ok
5 th model
BLEU score: 74.16440771530924
BLEU1 score: 89.26223977831661
ok
6 th model
BLEU score: 73.87599061984044
BLEU1 score: 88.80306822578953


# check data

In [7]:
f_dev = open('f_dev.txt', 'r')
f_pred = open('f_pred.txt', 'r')
f_dev_dataset = f_dev.readlines()
f_pred_dataset = f_pred.readlines()

In [8]:
len(f_dev_dataset), len(f_pred_dataset)

(5218, 547)

In [9]:
data=[x for x in f_dev_dataset if x != '\n']
data2=[x for x in f_pred_dataset if x != '\n']
len(data), len(data2)

(4672, 547)

In [10]:
4672+546

5218

In [11]:
f_dev = open('./e2e-metrics/example-inputs/devel-conc.txt', 'r')
f_pred = open('./e2e-metrics/example-inputs/baseline-output.txt', 'r')
f_dev_dataset = f_dev.readlines()
f_pred_dataset = f_pred.readlines()

In [12]:
len(f_dev_dataset), len(f_pred_dataset)

(146, 10)

In [13]:
data=[x for x in f_dev_dataset if x != '\n']
data2=[x for x in f_pred_dataset if x != '\n']
len(data), len(data2)

(137, 10)