In [None]:
import pandas as pd
# Dev-split CSV from the Kaggle ToTTo dataset (per its path); later cells read
# its 'table' and 'sentence' columns.
DEV_TABLES_CSV = "/kaggle/input/table-to-text-generation-dataset-google-totto/totto_data/tablesWithTagDev.csv"
df = pd.read_csv(DEV_TABLES_CSV)

In [None]:
# Number of rows (examples) in the loaded frame.
df.shape[0]

In [None]:
# Fixed token lengths used when padding/truncating:
#   MAXLENI - encoder input (table text)
#   MAXLENO - decoder target (reference sentence)
MAXLENI, MAXLENO = 400, 200

In [None]:
from transformers import T5Tokenizer
from transformers import T5ForConditionalGeneration
from transformers import AdamW, WarmUp, get_linear_schedule_with_warmup
from tqdm.notebook import tqdm
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
import os
import time
import copy
import numpy
import matplotlib.pyplot as plt

In [None]:
# Run on the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(device)

In [None]:
# Start from the stock "t5-base" tokenizer; extra special tokens are registered
# in a later cell.
tokenizer = T5Tokenizer.from_pretrained("t5-base")

# The checkpoint is a whole pickled model object (it is used directly via
# .generate/.encoder/.decoder below), not a state_dict.
# NOTE(review): torch.load unpickles arbitrary Python objects - only load
# checkpoints from a trusted source.
CHECKPOINT_PATH = "/kaggle/input/table-to-text-generation-utils/T5Epoch 7"
model = torch.load(CHECKPOINT_PATH, map_location=device)

In [None]:
# Structural markers that replace the table markup tags in the dataset class,
# listed in the order they are registered with the tokenizer.
table_markup_tokens = ['<PAGESTART>', '<PAGEEND>', '<SECTIONSTART>', '<SECTIONEND>',
                       '<TABLESTART>', '<TABLEEND>', '<CELLSTART>', '<CELLEND>',
                       '<COLHEADERSTART>', '<COLHEADEREND>', '<ROWHEADERSTART>', '<ROWHEADEREND>']

special_tokens_dict = {
    'pad_token': '<pad>',
    'bos_token': '<bos>',
    'eos_token': '<eos>',
    'additional_special_tokens': table_markup_tokens,
}

# add_special_tokens reports how many entries were new to the vocabulary.
num_added_toks = tokenizer.add_special_tokens(special_tokens_dict)
print('We have added', num_added_toks, 'tokens')

# Grow the embedding tables to the enlarged vocabulary.
# NOTE(review): only the encoder and decoder stacks are resized here, not the
# top-level model - confirm this matches how the checkpoint was trained.
model.encoder.resize_token_embeddings(len(tokenizer))
model.decoder.resize_token_embeddings(len(tokenizer))

In [None]:
class tottodataset(Dataset):
  """Map-style dataset that tokenises (table, sentence) pairs for a seq2seq model.

  Each item is built from row ``idx`` of the frame passed to the constructor:
  the table string has its markup tags swapped for the upper-case special
  tokens, both strings get a literal ``'</s>'`` appended, and both are
  tokenised to fixed lengths (``MAXLENI`` for the table, ``MAXLENO`` for the
  sentence, both module-level constants).

  Args:
    df: object exposing ``'sentence'`` and ``'table'`` columns as pandas Series.
    tokenizer: tokenizer providing ``encode_plus`` and ``pad_token_id``.
  """

  # (markup tag, special token) pairs, applied in this order to the table text.
  _TAG_TO_TOKEN=(
      ("<page_title>", "<PAGESTART>"), ("</page_title>", "<PAGEEND>"),
      ("<section_title>", "<SECTIONSTART>"), ("</section_title>", "<SECTIONEND>"),
      ("<table>", "<TABLESTART>"), ("</table>", "<TABLEEND>"),
      ("<cell>", "<CELLSTART>"), ("</cell>", "<CELLEND>"),
      ("<col_header>", "<COLHEADERSTART>"), ("</col_header>", "<COLHEADEREND>"),
      ("<row_header>", "<ROWHEADERSTART>"), ("</row_header>", "<ROWHEADEREND>"),
  )

  def __init__(self,df,tokenizer):
    self.sentence=df['sentence']
    self.table=df['table']
    self.tokenizer=tokenizer

  def __len__(self):
    """Return the number of examples."""
    return len(self.sentence)

  def __getitem__(self,idx):
    """Return the tokenised example at position ``idx``.

    Returns:
      dict with the processed strings (``table_text``, ``sentence``) and
      ``torch.long`` tensors ``input_ids``, ``input_attention_mask``,
      ``decoder_input_ids``, ``decoder_attention_mask`` and ``labels``.
      ``labels`` is a copy of the target ids with every pad id replaced by -100.
    """
    # Positional lookup: samplers hand out positions 0..len-1, which only
    # coincide with index labels when the frame has a default RangeIndex.
    inp=self.table.iloc[idx]+'</s>'
    for tag,token in self._TAG_TO_TOKEN:
      inp=inp.replace(tag,token)
    out=self.sentence.iloc[idx]+'</s>'

    inp_tokens=self.tokenizer.encode_plus(inp, padding="max_length", max_length=MAXLENI, truncation=True)
    out_tokens=self.tokenizer.encode_plus(out, padding="max_length", max_length=MAXLENO, truncation=True)
    inp_id=inp_tokens.input_ids
    out_id=out_tokens.input_ids
    inp_mask=inp_tokens.attention_mask
    out_mask=out_tokens.attention_mask

    # Replace padding positions in the target with -100.
    pad_id=self.tokenizer.pad_token_id
    labels=[-100 if x==pad_id else x for x in out_id]

    return {
        "table_text":inp,
        "sentence":out,
        "input_ids":torch.tensor(inp_id, dtype=torch.long),
        "input_attention_mask":torch.tensor(inp_mask, dtype=torch.long),
        "decoder_input_ids":torch.tensor(out_id, dtype=torch.long),
        "decoder_attention_mask":torch.tensor(out_mask, dtype=torch.long),
        "labels":torch.tensor(labels, dtype=torch.long)
    }

In [None]:
# Wrap the loaded frame in the dataset class defined above.
test_dataset = tottodataset(df, tokenizer)

# shuffle=False keeps batches in dataset order, so anything written out per
# example later stays in the same order as the rows of `df`.
test_dataloader = DataLoader(test_dataset, batch_size=64, shuffle=False, num_workers=2)

In [None]:
# Move the model to the selected device and switch it to inference mode.
# A module restored with torch.load keeps whatever training flag it was saved
# with, so eval() is called explicitly to make sure dropout is disabled before
# generating. eval() returns the module, so the cell still displays it.
model.to(device)
model.eval()

In [None]:
# Print the current GPU status (run right after the model is moved to the device).
!nvidia-smi

In [None]:
# Manual iterator over the dataloader so single batches can be pulled for inspection.
test_iterator=iter(test_dataloader)

In [None]:
# Take the next batch from the iterator; it is used below as a quick sample.
test_batch=next(test_iterator)

In [None]:
# Generate for the sample batch. With return_dict_in_generate=True the result is
# indexed by key ("sequences" is read below).
sample_input_ids = test_batch['input_ids'].to(device)
generation_output = model.generate(
    sample_input_ids,
    return_dict_in_generate=True,
    output_scores=True,
)

In [None]:
# Display the generated token-id sequences for the sample batch.
generation_output["sequences"]

In [None]:
# Compare the first example of the sample batch with its generated output.
# The reference comes from the dataset's "sentence" field, which still carries
# the '</s>' suffix appended in __getitem__.
first_sequence = generation_output["sequences"][0]
expected = test_batch["sentence"][0]
predicted = tokenizer.decode(first_sequence, skip_special_tokens=True)

In [None]:
# Show reference and prediction on consecutive lines.
print(
    f"Expected Sentence: {expected}",
    f"Predicted Sentence: {predicted}",
    sep="\n",
)

In [None]:
# Print the GPU status again, this time after the sample generation call.
!nvidia-smi

In [None]:
# bleurt_dataloader=DataLoader(test_dataset,
#                             batch_size=1,
#                             num_workers=2,
#                             shuffle=False)

In [None]:
from tqdm.notebook import tqdm

# Generate one prediction per example and write predictions and references to
# two text files, one example per line. outputs.txt is what the evaluation
# cell at the end of the notebook passes as --prediction_path.
output=[]
corrects=[]

# `with` closes both files even if generation fails part-way through.
with open('./corrects.txt', 'w+') as fref, open('./outputs.txt', 'w+') as fout:
    for batch in tqdm(test_dataloader):
        input_ids=batch["input_ids"].to(device)

        # NOTE(review): no max_length is passed, so the library's default
        # generation length applies - confirm it is long enough for the targets.
        outputs = model.generate(input_ids, return_dict_in_generate=True, output_scores=True)

        # Pair every generated sequence with its reference. zip covers exactly
        # the rows present in this batch, so a short final batch needs no
        # hard-coded size and no exception swallowing. Real errors now surface
        # instead of silently dropping a line, which would misalign the files.
        for sequence, reference in zip(outputs["sequences"], batch["sentence"]):
            outputstring = tokenizer.decode(sequence, skip_special_tokens=True)

            # The dataset appends '</s>' to each sentence; strip it again.
            refstring = reference.replace('</s>',"")
            output.append(outputstring)
            corrects.append(refstring)

            fref.write(refstring+'\n')
            # A '.' is appended to every prediction, as in the original cell.
            fout.write(outputstring+'.\n')

In [None]:
%%bash
# Clone the google-research/language repository into ./language_repo and
# install the requirements file that sits in its language/totto directory.
# NOTE(review): the requirements are installed unpinned with pip3 from a
# subshell - confirm they land in the kernel's environment.
git clone https://github.com/google-research/language.git language_repo
cd language_repo
pip3 install -r language/totto/eval_requirements.txt

In [None]:
# List the working directory (e.g. to check that language_repo and the output files exist).
!ls

In [None]:
# %cd changes the kernel's working directory, so relative paths in later cells
# resolve inside language_repo.
%cd language_repo
# Run the evaluation script on the sample prediction/target files that ship in
# the repository - presumably a smoke test of the install.
!bash language/totto/totto_eval.sh --prediction_path language/totto/sample/output_sample.txt --target_path language/totto/sample/dev_sample.jsonl

In [None]:
# Number of predictions collected by the generation loop.
len(output)

In [None]:
# Score the generated predictions against the dev-set JSONL. The script path is
# relative, so this relies on the earlier `%cd language_repo` cell having run.
!bash language/totto/totto_eval.sh --prediction_path /kaggle/working/outputs.txt --target_path /kaggle/input/table-to-text-generation-dataset-google-totto/totto_data/totto_dev_data.jsonl --output_dir /kaggle/working/