In [57]:
from torch.utils.data import Dataset, DataLoader
class e2eDataset(Dataset):
    def __init__(self, csv_file, tokenizer):
        """
        Args:
            csv_file (string): csv 파일의 경로
        """
        self.dataset = pd.read_csv(csv_file)
        self.columns = self.dataset.columns
        self.conditions = self.dataset[self.columns[0]]
        self.sentences = self.dataset[self.columns[1]]
        self.tokenizer = tokenizer
        
        self.typ_list = {}
        for k in range(len(self.conditions)):
            cond_set = self.conditions[k].split(',')
            for m in range(len(cond_set)):
                cond_set[m] = cond_set[m].strip()
                pos = cond_set[m].index('[')
                if cond_set[m][:pos] in self.typ_list.keys():
                    self.typ_list[cond_set[m][:pos]].add(cond_set[m][pos+1:-1])
                else:            
                    self.typ_list[cond_set[m][:pos]] = {cond_set[m][pos+1:-1]}        

    def __len__(self):
        return len(self.conditions)

    def __getitem__(self, idx):
        cond = self.conditions[idx]
        cond_set = cond.split(',')
        condition_string = ''
        for m in range(len(cond_set)):
            cond_set[m] = cond_set[m].strip()
            pos = cond_set[m].index('[')
            
            condition_string += '<' + cond_set[m][:pos] + '>' + cond_set[m][pos+1:-1] + ' '
        
        sen = self.sentences[idx]
        input_string = condition_string + '<START>'
        input_ids = torch.tensor(self.tokenizer.encode(input_string, add_special_tokens=True))
        
        input_len = len(input_ids)

        return input_ids, sen, condition_string


In [2]:
from model import *
my_model = mymodel().cuda()
my_model.eval()
my_model.load_state_dict(torch.load('./gen_model/base/2/model'))
print('ok') 

I0403 08:02:35.806313 140105540278080 file_utils.py:41] PyTorch version 1.2.0 available.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
I0403 08:02:39.756214 140105540278080 tokenization_utils.py:504] loading file https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-vocab.json from cache at /home/ds_user1/.cache/torch/transformers/f2808208f9bec2320371a9f5f891c184ae0b674ef866b79c58177067d15732dd.1512018be4ba4e8726e41b91451

ok


In [3]:
# e2e_dataset = e2eDataset(csv_file='dataset/testset_w_refs.csv', tokenizer=my_model.tokenizer)
# e2e_dataset = e2eDataset(csv_file='dataset/trainset.csv', tokenizer=my_model.tokenizer)
e2e_dataset = e2eDataset(csv_file='dataset/devset.csv', tokenizer=my_model.tokenizer)
dataloader = DataLoader(e2e_dataset, batch_size=1, shuffle=False, num_workers=4)   

In [4]:
from tqdm import tqdm
same_condition = []
ref_sentences = []
input_ids_list = []
pre_condition_string = ''
start = 0
for i_batch, sample_batched in tqdm(enumerate(dataloader)):
    sen = sample_batched[1][0]
#     print(i_batch, sen)
    condition_string = sample_batched[2]  
    input_ids = sample_batched[0].squeeze(0).cuda()
        
    if start == 0 or condition_string == pre_condition_string:      
        if start == 0:
            input_ids_list.append(input_ids)
        same_condition.append(sen)        
        pre_condition_string = condition_string
        start += 1
    else:   
        input_ids_list.append(input_ids)
        ref_sentences.append(same_condition)
        pre_condition_string = condition_string
        same_condition = [sen]
        start += 1

#     if i_batch == 30:
#         break            
ref_sentences.append(same_condition)

4672it [00:03, 1311.04it/s]


In [5]:
len(ref_sentences), len(input_ids_list)
# input_ids_list[0]

(547, 547)

In [5]:
import time
import nltk
nltk.download('punkt')
from nltk import word_tokenize
from nltk.translate.bleu_score import sentence_bleu
max_len = 50
start_time = time.time()
bleu_score = 0
bleu_1 = 0

f_dev = open('./predictions/base/f_dev_2.txt', 'w')
f_pred = open('./predictions/base/f_pred_2.txt', 'w')

for k in range(len(ref_sentences)):
    input_ids = input_ids_list[k]
    input_len = len(input_ids)

    ori_tokens = []
    for m in range(len(ref_sentences[k])):
        f_dev.write(ref_sentences[k][m]+'\n')
        ori_tokens.append(word_tokenize(ref_sentences[k][m]))
    if k < len(ref_sentences)-1:
        f_dev.write('\n')
    
    for _ in range(max_len):
        model_out = my_model.model_feeding(input_ids) # (batch, seq_len, emb_dim)
        pred_idx = model_out.argmax(1)[-1]        
        if pred_idx == my_model.tokenizer.eos_token_id:
            break            
        input_ids = torch.cat((input_ids, pred_idx.unsqueeze(0)), 0)        
    
    out_sen = my_model.tokenizer.decode(input_ids[input_len:])
    f_pred.write(out_sen+'\n')
    
#     print(ref_sentences[k])
#     print(out_sen)    
    
    out_tokens = word_tokenize(out_sen)
    
    bleu_1_score = sentence_bleu(ori_tokens, out_tokens, weights=(1, 0, 0, 0))
    bleu_2_score = sentence_bleu(ori_tokens, out_tokens, weights=(0.5, 0.5, 0, 0))
    bleu_3_score = sentence_bleu(ori_tokens, out_tokens, weights=(0.33, 0.33, 0.33, 0))
    bleu_4_score = sentence_bleu(ori_tokens, out_tokens, weights=(0.25, 0.25, 0.25, 0.25))
    
    bleu_1 += bleu_1_score

    bleu_score += min(1, len(out_tokens)/len(ori_tokens))*((bleu_1_score*bleu_2_score*bleu_3_score*bleu_4_score)**(0.25))    
    
#     print("time: {}".format(time.time()-start_time))
#     print('')
f_dev.close()
f_pred.close()
    
print("BLEU score: {}".format(bleu_score/len(ref_sentences)*100))
print("BLEU1 score: {}".format(bleu_1/len(ref_sentences)*100))

[nltk_data] Downloading package punkt to /home/ds_user1/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


BLEU score: 73.41162457057993
BLEU1 score: 89.30558590312106


In [13]:
BLEU score: 71.18825887456514
BLEU1 score: 88.76109911863252

./measure_scores.py ../predictions/base/f_dev_6.txt ../predictions/base/f_pred_6.txt
./measure_scores.py ../predictions/base2/f_dev_2.txt ../predictions/base2/f_pred_2.txt
./measure_scores.py ../predictions/large/f_dev_5.txt ../predictions/large/f_pred_5.txt
./measure_scores.py ../predictions/f_dev_2.txt ../predictions/large2/f_pred_2.txt

### Large
#### my_output_1
BLEU: 0.6826
NIST: 8.4243
METEOR: 0.4557
ROUGE_L: 0.7032
CIDEr: 2.1234

#### my_output_2
BLEU: 0.7228
NIST: 8.5241
METEOR: 0.4851
ROUGE_L: 0.7461
CIDEr: 2.4645

#### my_output_3
BLEU: 0.7035
NIST: 8.5937
METEOR: 0.4700
ROUGE_L: 0.7252
CIDEr: 2.3310

#### my_output_4
BLEU: 0.6738
NIST: 8.4018
METEOR: 0.4576
ROUGE_L: 0.7075
CIDEr: 2.2172

#### my_output_5
BLEU: 0.6927
NIST: 8.4429
METEOR: 0.4662
ROUGE_L: 0.7180
CIDEr: 2.2729

### base
#### my_output_2
BLEU: 0.6812
NIST: 8.5491
METEOR: 0.4442
ROUGE_L: 0.7036
CIDEr: 2.1261

#### my_output_6
BLEU: 0.6655
NIST: 8.4830
METEOR: 0.4475
ROUGE_L: 0.6992
CIDEr: 2.1077
    
#### my_output_final
BLEU: 0.6529
NIST: 8.3116
METEOR: 0.4430
ROUGE_L: 0.6842
CIDEr: 1.9766

### Pragmatically Informative Text Generation
BLEU 68.60
NIST 8.73
METEOR 45.25
R-L 70.82
CIDEr 2.37

## Total model inference

In [81]:
# from model_large import *
from model import *
from tqdm import tqdm
import time
import nltk
nltk.download('punkt')
from nltk import word_tokenize
from nltk.translate.bleu_score import sentence_bleu
max_len = 70

my_model = mymodel().cuda()
my_model.eval()

for i in range(1, 9):
    model_name = './gen_model/base_devtrain_5/'+str(i)+'/model'
    my_model.load_state_dict(torch.load(model_name))
    print('ok') 
    if i == 1:
#         e2e_dataset = e2eDataset(csv_file='dataset/devset.csv', tokenizer=my_model.tokenizer)
        e2e_dataset = e2eDataset(csv_file='dataset/testset_w_refs.csv', tokenizer=my_model.tokenizer)
        dataloader = DataLoader(e2e_dataset, batch_size=1, shuffle=False, num_workers=4)
        same_condition = []
        ref_sentences = []
        input_ids_list = []
        pre_condition_string = ''
        start = 0
        for i_batch, sample_batched in tqdm(enumerate(dataloader)):
            sen = sample_batched[1][0]
            condition_string = sample_batched[2]  
            input_ids = sample_batched[0].squeeze(0).cuda()

            if start == 0 or condition_string == pre_condition_string:      
                if start == 0:
                    input_ids_list.append(input_ids)
                same_condition.append(sen)        
                pre_condition_string = condition_string
                start += 1
            else:   
                input_ids_list.append(input_ids)
                ref_sentences.append(same_condition)
                pre_condition_string = condition_string
                same_condition = [sen]
                start += 1
        ref_sentences.append(same_condition)    

    bleu_score = 0
    bleu_1 = 0

#     f_dev = open('./predictions/testset/large2/f_dev_'+str(i)+'.txt', 'w')
#     f_pred = open('./predictions/devset/base4/f_pred_'+str(i)+'.txt', 'w')
    f_pred = open('./predictions/testset/base5_dev/f_pred_'+str(i)+'.txt', 'w')

    for k in range(len(ref_sentences)):
        input_ids = input_ids_list[k]
        input_len = len(input_ids)

        ori_tokens = []
        for m in range(len(ref_sentences[k])):
#             f_dev.write(ref_sentences[k][m]+'\n')
            ori_tokens.append(word_tokenize(ref_sentences[k][m]))
#         if k < len(ref_sentences)-1:
#             f_dev.write('\n')

        for _ in range(max_len):
            model_out = my_model.model_feeding(input_ids) # (batch, seq_len, emb_dim)
            pred_idx = model_out.argmax(1)[-1]        
            if pred_idx == my_model.tokenizer.eos_token_id:
                break            
            input_ids = torch.cat((input_ids, pred_idx.unsqueeze(0)), 0)        

        out_sen = my_model.tokenizer.decode(input_ids[input_len:])
        f_pred.write(out_sen+'\n')

        out_tokens = word_tokenize(out_sen)

        bleu_1_score = sentence_bleu(ori_tokens, out_tokens, weights=(1, 0, 0, 0))
        bleu_2_score = sentence_bleu(ori_tokens, out_tokens, weights=(0.5, 0.5, 0, 0))
        bleu_3_score = sentence_bleu(ori_tokens, out_tokens, weights=(0.33, 0.33, 0.33, 0))
        bleu_4_score = sentence_bleu(ori_tokens, out_tokens, weights=(0.25, 0.25, 0.25, 0.25))

        bleu_1 += bleu_1_score

        bleu_score += min(1, len(out_tokens)/len(ori_tokens))*((bleu_1_score*bleu_2_score*bleu_3_score*bleu_4_score)**(0.25))    

#     f_dev.close()
    f_pred.close()
    
    print(i, "th model")
    print("BLEU score: {}".format(bleu_score/len(ref_sentences)*100))
    print("BLEU1 score: {}".format(bleu_1/len(ref_sentences)*100))    

[nltk_data] Downloading package punkt to /home/ds_user1/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
I0421 01:10:42.175892 140472973551424 tokenization_utils.py:504] loading file https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-vocab.json from cache at /home/ds_user1/.cache/torch/transformers/f2808208f9bec2320371a9f5f891c184ae0b674ef866b79c58177067d15732dd.1512018be4ba4e8726e41b9145129dc30651ea4fec86aa61f4b9f40bf94eac71
I0421 01:10:42.177130 140472973551424 tokenization_utils.py:504] loading file https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-merges.txt from cache at /home/ds_user1/.cache/torch/transformers/d629f792e430b3c76a1291bb2766b0a047e36fae0588f9dbc1ae51decdff691b.70bec105b4158ed9a1747fea67a43f5dee97855c64d62b6ec3742f4cfdb5feda
I0421 01:10:42.251970 140472973551424 tokenization_utils.py:663] Adding <START> to the vocabulary
I0421 01:10:42.252930 140472973551424 tokenization_utils.py:741] Assigning <START> to the bos_token key of the tokenizer

ok


4693it [00:03, 1263.23it/s]


1 th model
BLEU score: 71.75814362353185
BLEU1 score: 86.53756834238277
ok
2 th model
BLEU score: 72.63030702160998
BLEU1 score: 86.87483571530996
ok
3 th model
BLEU score: 72.63175142953376
BLEU1 score: 87.46104884920169
ok
4 th model
BLEU score: 75.01133126127439
BLEU1 score: 89.81478053589682
ok
5 th model
BLEU score: 74.07566945973493
BLEU1 score: 88.5473048008202
ok
6 th model
BLEU score: 74.36240951326324
BLEU1 score: 89.02817967750391
ok
7 th model
BLEU score: 74.53685592565864
BLEU1 score: 89.05226296212801
ok
8 th model
BLEU score: 74.69360042059449
BLEU1 score: 89.52472181005867


## BERTscore

In [79]:
### BERT score with human reference
from bert_score import score
import glob
human_files = "/project/work/E2E/predictions/testset/f_dev.txt"

human_open = open(human_files, "r")
human_dataset = human_open.readlines()
human_open.close()

human_references = []

temp_reference = []
for i in range(len(human_dataset)):
    if human_dataset[i] == '\n':
        human_references.append(temp_reference)
        temp_reference = []
    else:
        temp_reference.append(human_dataset[i].strip())
human_references.append(temp_reference)
human_compare = []
for i in range(len(human_references)):
    for k in range(len(human_references[i])):
        human_compare.append(human_references[i][k])

output_path = "/project/work/E2E/predictions/testset/base2_sample10/*"
pred_files = glob.glob(output_path)

score_list = []
for i in range(len(pred_files)):    
    cands = []
    pred_data_open = open(pred_files[i], "r")
    pred_data_dataset = pred_data_open.readlines()
    pred_len = len(pred_data_dataset)
    pred_data_open.close()
    
    for k in range(len(pred_data_dataset)):
        out_sen = pred_data_dataset[k].strip()
        repeat_num = len(human_references[k])
        for _ in range(repeat_num):
            cands.append(out_sen)

    P, R, F1 = score(cands, human_compare, lang='en', verbose=True)
    F1_list=list(F1.numpy())
    BERT_score = sum(F1_list)/len(F1_list)
    
    score_list.append(BERT_score)    

I0420 04:51:34.488308 140472973551424 configuration_utils.py:275] loading configuration file https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-config.json from cache at /home/ds_user1/.cache/torch/transformers/c22e0b5bbb7c0cb93a87a2ae01263ae715b4c18d692b1740ce72cacaa99ad184.b0c148f080e2f5eb0abadfa0a38793b5631ca093ac4321d8614d219229fdee2a
I0420 04:51:34.489642 140472973551424 configuration_utils.py:311] Model config RobertaConfig {
  "_num_labels": 2,
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "do_sample": false,
  "early_stopping": false,
  "eos_token_id": 2,
  "finetuning_task": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 1024,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1"
  },
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "is_decoder": false,
  "is_encoder_decoder": false,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1
  },
  "layer_norm_ep

calculating scores...
computing bert embedding.


HBox(children=(IntProgress(value=0, max=80), HTML(value='')))


computing greedy matching.


HBox(children=(IntProgress(value=0, max=74), HTML(value='')))


done in 10.63 seconds, 441.46 sentences/sec


I0420 04:51:59.196571 140472973551424 configuration_utils.py:275] loading configuration file https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-config.json from cache at /home/ds_user1/.cache/torch/transformers/c22e0b5bbb7c0cb93a87a2ae01263ae715b4c18d692b1740ce72cacaa99ad184.b0c148f080e2f5eb0abadfa0a38793b5631ca093ac4321d8614d219229fdee2a
I0420 04:51:59.197704 140472973551424 configuration_utils.py:311] Model config RobertaConfig {
  "_num_labels": 2,
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "do_sample": false,
  "early_stopping": false,
  "eos_token_id": 2,
  "finetuning_task": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 1024,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1"
  },
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "is_decoder": false,
  "is_encoder_decoder": false,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1
  },
  "layer_norm_ep

calculating scores...
computing bert embedding.


HBox(children=(IntProgress(value=0, max=80), HTML(value='')))


computing greedy matching.


HBox(children=(IntProgress(value=0, max=74), HTML(value='')))


done in 10.93 seconds, 429.44 sentences/sec


I0420 04:52:23.781665 140472973551424 configuration_utils.py:275] loading configuration file https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-config.json from cache at /home/ds_user1/.cache/torch/transformers/c22e0b5bbb7c0cb93a87a2ae01263ae715b4c18d692b1740ce72cacaa99ad184.b0c148f080e2f5eb0abadfa0a38793b5631ca093ac4321d8614d219229fdee2a
I0420 04:52:23.782730 140472973551424 configuration_utils.py:311] Model config RobertaConfig {
  "_num_labels": 2,
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "do_sample": false,
  "early_stopping": false,
  "eos_token_id": 2,
  "finetuning_task": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 1024,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1"
  },
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "is_decoder": false,
  "is_encoder_decoder": false,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1
  },
  "layer_norm_ep

calculating scores...
computing bert embedding.


HBox(children=(IntProgress(value=0, max=81), HTML(value='')))


computing greedy matching.


HBox(children=(IntProgress(value=0, max=74), HTML(value='')))


done in 11.03 seconds, 425.46 sentences/sec


I0420 04:52:48.475887 140472973551424 configuration_utils.py:275] loading configuration file https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-config.json from cache at /home/ds_user1/.cache/torch/transformers/c22e0b5bbb7c0cb93a87a2ae01263ae715b4c18d692b1740ce72cacaa99ad184.b0c148f080e2f5eb0abadfa0a38793b5631ca093ac4321d8614d219229fdee2a
I0420 04:52:48.476953 140472973551424 configuration_utils.py:311] Model config RobertaConfig {
  "_num_labels": 2,
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "do_sample": false,
  "early_stopping": false,
  "eos_token_id": 2,
  "finetuning_task": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 1024,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1"
  },
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "is_decoder": false,
  "is_encoder_decoder": false,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1
  },
  "layer_norm_ep

calculating scores...
computing bert embedding.


HBox(children=(IntProgress(value=0, max=80), HTML(value='')))


computing greedy matching.


HBox(children=(IntProgress(value=0, max=74), HTML(value='')))


done in 11.02 seconds, 426.03 sentences/sec


I0420 04:53:13.102193 140472973551424 configuration_utils.py:275] loading configuration file https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-config.json from cache at /home/ds_user1/.cache/torch/transformers/c22e0b5bbb7c0cb93a87a2ae01263ae715b4c18d692b1740ce72cacaa99ad184.b0c148f080e2f5eb0abadfa0a38793b5631ca093ac4321d8614d219229fdee2a
I0420 04:53:13.103636 140472973551424 configuration_utils.py:311] Model config RobertaConfig {
  "_num_labels": 2,
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "do_sample": false,
  "early_stopping": false,
  "eos_token_id": 2,
  "finetuning_task": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 1024,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1"
  },
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "is_decoder": false,
  "is_encoder_decoder": false,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1
  },
  "layer_norm_ep

calculating scores...
computing bert embedding.


HBox(children=(IntProgress(value=0, max=75), HTML(value='')))


computing greedy matching.


HBox(children=(IntProgress(value=0, max=74), HTML(value='')))


done in 10.02 seconds, 468.39 sentences/sec


I0420 04:53:36.603080 140472973551424 configuration_utils.py:275] loading configuration file https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-config.json from cache at /home/ds_user1/.cache/torch/transformers/c22e0b5bbb7c0cb93a87a2ae01263ae715b4c18d692b1740ce72cacaa99ad184.b0c148f080e2f5eb0abadfa0a38793b5631ca093ac4321d8614d219229fdee2a
I0420 04:53:36.604154 140472973551424 configuration_utils.py:311] Model config RobertaConfig {
  "_num_labels": 2,
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "do_sample": false,
  "early_stopping": false,
  "eos_token_id": 2,
  "finetuning_task": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 1024,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1"
  },
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "is_decoder": false,
  "is_encoder_decoder": false,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1
  },
  "layer_norm_ep

calculating scores...
computing bert embedding.


HBox(children=(IntProgress(value=0, max=80), HTML(value='')))


computing greedy matching.


HBox(children=(IntProgress(value=0, max=74), HTML(value='')))


done in 10.82 seconds, 433.65 sentences/sec


I0420 04:54:00.907493 140472973551424 configuration_utils.py:275] loading configuration file https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-config.json from cache at /home/ds_user1/.cache/torch/transformers/c22e0b5bbb7c0cb93a87a2ae01263ae715b4c18d692b1740ce72cacaa99ad184.b0c148f080e2f5eb0abadfa0a38793b5631ca093ac4321d8614d219229fdee2a
I0420 04:54:00.908562 140472973551424 configuration_utils.py:311] Model config RobertaConfig {
  "_num_labels": 2,
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "do_sample": false,
  "early_stopping": false,
  "eos_token_id": 2,
  "finetuning_task": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 1024,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1"
  },
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "is_decoder": false,
  "is_encoder_decoder": false,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1
  },
  "layer_norm_ep

calculating scores...
computing bert embedding.


HBox(children=(IntProgress(value=0, max=80), HTML(value='')))


computing greedy matching.


HBox(children=(IntProgress(value=0, max=74), HTML(value='')))


done in 10.76 seconds, 435.96 sentences/sec


I0420 04:54:25.293946 140472973551424 configuration_utils.py:275] loading configuration file https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-config.json from cache at /home/ds_user1/.cache/torch/transformers/c22e0b5bbb7c0cb93a87a2ae01263ae715b4c18d692b1740ce72cacaa99ad184.b0c148f080e2f5eb0abadfa0a38793b5631ca093ac4321d8614d219229fdee2a
I0420 04:54:25.295020 140472973551424 configuration_utils.py:311] Model config RobertaConfig {
  "_num_labels": 2,
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "do_sample": false,
  "early_stopping": false,
  "eos_token_id": 2,
  "finetuning_task": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 1024,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1"
  },
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "is_decoder": false,
  "is_encoder_decoder": false,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1
  },
  "layer_norm_ep

calculating scores...
computing bert embedding.


HBox(children=(IntProgress(value=0, max=80), HTML(value='')))


computing greedy matching.


HBox(children=(IntProgress(value=0, max=74), HTML(value='')))


done in 10.80 seconds, 434.35 sentences/sec


I0420 04:54:49.462030 140472973551424 configuration_utils.py:275] loading configuration file https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-config.json from cache at /home/ds_user1/.cache/torch/transformers/c22e0b5bbb7c0cb93a87a2ae01263ae715b4c18d692b1740ce72cacaa99ad184.b0c148f080e2f5eb0abadfa0a38793b5631ca093ac4321d8614d219229fdee2a
I0420 04:54:49.463121 140472973551424 configuration_utils.py:311] Model config RobertaConfig {
  "_num_labels": 2,
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "do_sample": false,
  "early_stopping": false,
  "eos_token_id": 2,
  "finetuning_task": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 1024,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1"
  },
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "is_decoder": false,
  "is_encoder_decoder": false,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1
  },
  "layer_norm_ep

calculating scores...
computing bert embedding.


HBox(children=(IntProgress(value=0, max=81), HTML(value='')))


computing greedy matching.


HBox(children=(IntProgress(value=0, max=74), HTML(value='')))


done in 11.01 seconds, 426.38 sentences/sec


I0420 04:55:13.897138 140472973551424 configuration_utils.py:275] loading configuration file https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-config.json from cache at /home/ds_user1/.cache/torch/transformers/c22e0b5bbb7c0cb93a87a2ae01263ae715b4c18d692b1740ce72cacaa99ad184.b0c148f080e2f5eb0abadfa0a38793b5631ca093ac4321d8614d219229fdee2a
I0420 04:55:13.898322 140472973551424 configuration_utils.py:311] Model config RobertaConfig {
  "_num_labels": 2,
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "do_sample": false,
  "early_stopping": false,
  "eos_token_id": 2,
  "finetuning_task": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 1024,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1"
  },
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "is_decoder": false,
  "is_encoder_decoder": false,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1
  },
  "layer_norm_ep

calculating scores...
computing bert embedding.


HBox(children=(IntProgress(value=0, max=79), HTML(value='')))


computing greedy matching.


HBox(children=(IntProgress(value=0, max=74), HTML(value='')))


done in 10.60 seconds, 442.64 sentences/sec


In [80]:
pred_files, score_list

(['/project/work/E2E/predictions/testset/base2_sample10/f_pred_3.txt',
  '/project/work/E2E/predictions/testset/base2_sample10/f_pred_7.txt',
  '/project/work/E2E/predictions/testset/base2_sample10/f_pred_5.txt',
  '/project/work/E2E/predictions/testset/base2_sample10/f_pred_9.txt',
  '/project/work/E2E/predictions/testset/base2_sample10/f_pred_1.txt',
  '/project/work/E2E/predictions/testset/base2_sample10/f_pred_10.txt',
  '/project/work/E2E/predictions/testset/base2_sample10/f_pred_4.txt',
  '/project/work/E2E/predictions/testset/base2_sample10/f_pred_8.txt',
  '/project/work/E2E/predictions/testset/base2_sample10/f_pred_6.txt',
  '/project/work/E2E/predictions/testset/base2_sample10/f_pred_2.txt'],
 [0.9340897221415886,
  0.9375306307291542,
  0.9385130896945256,
  0.9396600867420175,
  0.896774981541596,
  0.9385862989438868,
  0.9347218730773291,
  0.9386929859872213,
  0.9381917336336107,
  0.9325849555993218])

In [None]:
### BERT score with human reference
import csv
from bert_score import score
import glob
human_files = "/project/work/E2E/predictions/testset/f_dev.txt"

human_open = open(human_files, "r")
human_dataset = human_open.readlines()
human_open.close()

human_references = []

temp_reference = []
for i in range(len(human_dataset)):
    if human_dataset[i] == '\n':
        human_references.append(temp_reference)
        temp_reference = []
    else:
        temp_reference.append(human_dataset[i].strip())
human_references.append(temp_reference)
human_compare = []
for i in range(len(human_references)):
    for k in range(len(human_references[i])):
        human_compare.append(human_references[i][k])

output_path = "/project/work/E2E/compared_system/system_outputs/primary_txt/*"
pred_files = glob.glob(output_path)

score_list = []
for i in range(len(pred_files)):    
    cands = []
    pred_data_open = open(pred_files[i], "r")
    pred_data_dataset = pred_data_open.readlines()
    pred_len = len(pred_data_dataset)
    pred_data_open.close()
    
    for k in range(len(pred_data_dataset)):
        out_sen = pred_data_dataset[k].strip()
        repeat_num = len(human_references[k])
        for _ in range(repeat_num):
            cands.append(out_sen)

    P, R, F1 = score(cands, human_compare, lang='en', verbose=True)
    F1_list=list(F1.numpy())
    BERT_score = sum(F1_list)/len(F1_list)
    
    score_list.append(BERT_score)  
print(pred_files, score_list)    

In [39]:
for i in range(len(pred_files)):
    print(pred_files[i], score_list[i])

/project/work/E2E/compared_system/system_outputs/primary_txt/adapt.txt 0.9224714525544057
/project/work/E2E/compared_system/system_outputs/primary_txt/forge3.txt 0.9277185035519018
/project/work/E2E/compared_system/system_outputs/primary_txt/tgen.txt 0.9391622960808789
/project/work/E2E/compared_system/system_outputs/primary_txt/chen.txt 0.9121247716430608
/project/work/E2E/compared_system/system_outputs/primary_txt/sheff2.txt 0.9337284921863148
/project/work/E2E/compared_system/system_outputs/primary_txt/tuda.txt 0.9388613972027826
/project/work/E2E/compared_system/system_outputs/primary_txt/forge1.txt 0.9296145912378093
/project/work/E2E/compared_system/system_outputs/primary_txt/dangnt.txt 0.9390964954615199
/project/work/E2E/compared_system/system_outputs/primary_txt/zhaw2.txt 0.9328836475052662
/project/work/E2E/compared_system/system_outputs/primary_txt/zhang.txt 0.931265755944889
/project/work/E2E/compared_system/system_outputs/primary_txt/tr2.txt 0.927562745489242
/project/work

In [84]:
# !./e2e-metrics/measure_scores.py ./predictions/testset/f_dev.txt ./predictions/testset/base5_dev/f_pred_1.txt
# !./e2e-metrics/measure_scores.py ./predictions/testset/f_dev.txt ./predictions/testset/base5_dev/f_pred_2.txt
# !./e2e-metrics/measure_scores.py ./predictions/testset/f_dev.txt ./predictions/testset/base5_dev/f_pred_3.txt
# !./e2e-metrics/measure_scores.py ./predictions/testset/f_dev.txt ./predictions/testset/base5_dev/f_pred_4.txt
# !./e2e-metrics/measure_scores.py ./predictions/testset/f_dev.txt ./predictions/testset/base5_dev/f_pred_5.txt
# !./e2e-metrics/measure_scores.py ./predictions/testset/f_dev.txt ./predictions/testset/base5_dev/f_pred_6.txt
# !./e2e-metrics/measure_scores.py ./predictions/testset/f_dev.txt ./predictions/testset/base5_dev/f_pred_7.txt
# !./e2e-metrics/measure_scores.py ./predictions/testset/f_dev.txt ./predictions/testset/base5_dev/f_pred_8.txt
!./e2e-metrics/measure_scores.py ./predictions/joosung2/testset/f_dev.txt ./predictions/final/1base1_sample10_6.txt
!./e2e-metrics/measure_scores.py ./predictions/joosung2/testset/f_dev.txt ./predictions/final/1base2_sample10_9.txt
!./e2e-metrics/measure_scores.py ./predictions/joosung2/testset/f_dev.txt ./predictions/final/1base4_devtest_4.txt
!./e2e-metrics/measure_scores.py ./predictions/joosung2/testset/f_dev.txt ./predictions/final/2base1_sample50_7.txt
!./e2e-metrics/measure_scores.py ./predictions/joosung2/testset/f_dev.txt ./predictions/final/2base2_sample10_9.txt

./predictions/testset/f_dev.txt ./predictions/final/1base1_sample10_6.txt None
Traceback (most recent call last):
  File "./e2e-metrics/measure_scores.py", line 351, in <module>
    data_src, data_ref, data_sys = load_data(args.ref_file, args.sys_file, args.src_file)
  File "./e2e-metrics/measure_scores.py", line 193, in load_data
    data_ref = read_lines(ref_file, multi_ref=True)
  File "./e2e-metrics/measure_scores.py", line 25, in read_lines
    with codecs.open(file_name, 'rb', 'UTF-8') as fh:
  File "/usr/lib/python3.6/codecs.py", line 897, in open
    file = builtins.open(filename, mode, buffering)
FileNotFoundError: [Errno 2] No such file or directory: './predictions/testset/f_dev.txt'
./predictions/testset/f_dev.txt ./predictions/final/1base2_sample10_9.txt None
Traceback (most recent call last):
  File "./e2e-metrics/measure_scores.py", line 351, in <module>
    data_src, data_ref, data_sys = load_data(args.ref_file, args.sys_file, args.src_file)
  File "./e2e-metrics/measure

## Tsv to txt

In [35]:
import pandas as pd
output_path = "/project/work/E2E/compared_system/system_outputs/primary/*"
comapred_files = glob.glob(output_path)

for i in range(len(comapred_files)):
    dataset = pd.read_csv(comapred_files[i], delimiter='\t', header=None)
    
    name = comapred_files[i].split('/')[-1].split('.')[0]
    txt_files = "/project/work/E2E/compared_system/system_outputs/primary_txt/"+name+".txt"
    f = open(txt_files, "w")
    gen_sentences = dataset[1]
    
    for k in range(1, len(gen_sentences)):
        f.write(gen_sentences[k]+'\n')
    f.close()

In [36]:
len(gen_sentences)

631

# check data

In [7]:
f_dev = open('f_dev.txt', 'r')
f_pred = open('f_pred.txt', 'r')
f_dev_dataset = f_dev.readlines()
f_pred_dataset = f_pred.readlines()

In [8]:
len(f_dev_dataset), len(f_pred_dataset)

(5218, 547)

In [9]:
data=[x for x in f_dev_dataset if x != '\n']
data2=[x for x in f_pred_dataset if x != '\n']
len(data), len(data2)

(4672, 547)

In [10]:
4672+546

5218

In [11]:
f_dev = open('./e2e-metrics/example-inputs/devel-conc.txt', 'r')
f_pred = open('./e2e-metrics/example-inputs/baseline-output.txt', 'r')
f_dev_dataset = f_dev.readlines()
f_pred_dataset = f_pred.readlines()

In [12]:
len(f_dev_dataset), len(f_pred_dataset)

(146, 10)

In [13]:
data=[x for x in f_dev_dataset if x != '\n']
data2=[x for x in f_pred_dataset if x != '\n']
len(data), len(data2)

(137, 10)