### General

In [None]:
import os
import sys

from google.colab import drive

# Mount Google Drive so the project folder is reachable from this runtime.
drive.mount("/content/drive")

# Root directory of this project inside your Google Drive; edit if the folder lives elsewhere.
path="/content/drive/My Drive/Project"

# Make modules under the project root importable and resolve relative paths against it.
sys.path.append(path)
os.chdir(path)

Mounted at /content/drive


In [None]:
import numpy as np
import pandas as pd
import json
import random
import copy
import h5py
import math
from tqdm import tqdm
# Register tqdm's pandas integration (e.g. `progress_apply`) so pandas operations can show a progress bar.
tqdm.pandas()

In [None]:
!pip install sentencepiece

Collecting sentencepiece
  Downloading sentencepiece-0.1.99-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m14.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: sentencepiece
Successfully installed sentencepiece-0.1.99


In [None]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.33.1-py3-none-any.whl (7.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m19.9 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.15.1 (from transformers)
  Downloading huggingface_hub-0.16.4-py3-none-any.whl (268 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m33.4 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m57.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.3.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m60.4 MB/s[0m eta [36m0:00:0

### Synthetic

#### Load model and data

In [None]:
from torch import cuda
import torch
from transformers import PreTrainedTokenizer, PreTrainedModel
from transformers import AutoTokenizer, T5ForConditionalGeneration, T5Tokenizer

# This model turns a serialized sequence of triplets into a single natural-language sentence.
device = 'cuda' if cuda.is_available() else 'cpu'

# NOTE(review): torch.load unpickles a whole model object, which can execute arbitrary code;
# only load checkpoints from a source you trust.
# map_location remaps stored tensors onto the selected device, so a checkpoint saved from a
# GPU session still loads when the runtime falls back to CPU.
syn_model = torch.load(
    os.path.join(path, 'model/t5_best_model.pt'),
    map_location=device,
).to(device)
# The model is only used for generation here: switch to eval mode so dropout is disabled.
syn_model.eval()

syn_tokenizer = T5Tokenizer.from_pretrained("t5-small")

Downloading (…)ve/main/spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. If you see this, DO NOT PANIC! This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thouroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [None]:
from transformers import PreTrainedTokenizer, PreTrainedModel

# Token cap applied to both the encoded input and the generated output.
MAX_LEN = 512


def predict(tokenizer: "PreTrainedTokenizer", model: "PreTrainedModel", text: str, device):
    """Generate text from `text` with a seq2seq model and return the decoded strings.

    Args:
        tokenizer: Callable tokenizer that returns 'input_ids' and 'attention_mask'
            tensors and provides `decode`.
        model: Model exposing `generate`; it must already live on `device`.
        text: Input text to encode.
        device: Device the encoded tensors are moved to before generation.

    Returns:
        A list with one decoded string per generated sequence.
    """
    with torch.no_grad():
        # truncation=True is required for max_length to actually cap the input;
        # with padding=True alone the limit is not enforced on long inputs.
        inputs = tokenizer(
            text,
            max_length=MAX_LEN,
            truncation=True,
            padding=True,
            return_tensors='pt',
        )
        ids = inputs['input_ids'].to(device)
        mask = inputs['attention_mask'].to(device)

        generated_ids = model.generate(
            input_ids=ids,
            attention_mask=mask,
            max_length=MAX_LEN,
            num_beams=2,
            repetition_penalty=2.5,
            length_penalty=1.0,
            early_stopping=True,
        )
        # Decode every generated sequence, dropping special tokens.
        preds = [
            tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=True)
            for g in generated_ids
        ]
    return preds

In [None]:
# Load the entity-to-subgraph table from the project's data folder.
dataset = pd.read_csv(os.path.join(path, 'data/entity_to_subgraph.csv'))

In [None]:
# Evaluate each 'subgraph' cell into a Python object (presumably a list of triplets stored as text — confirm).
# NOTE(review): eval() executes whatever code the CSV contains; if the cells are plain literals,
# ast.literal_eval would be a safer drop-in — confirm the cell format before switching.
dataset['subgraph'] = dataset['subgraph'].apply(eval)

#### Generate synthetic sentences and save the result

In [None]:
# Serialize every subgraph into one comma-separated string and let the model rewrite it.
sentences = []
for subgraph in dataset['subgraph']:
    # The first triplet is written in full; each later triplet drops its first element.
    parts = [
        ' '.join(triplet if position == 0 else triplet[1:])
        for position, triplet in enumerate(subgraph)
    ]
    serialized = ', '.join(parts)

    # predict() returns a list of decoded strings; keep the first one.
    sentences.append(predict(syn_tokenizer, syn_model, serialized, device)[0])
dataset['synthetic'] = sentences

In [None]:
# Remove the raw subgraph column; the remaining columns are what gets saved.
columns_to_drop = ['subgraph']
dataset = dataset.drop(labels=columns_to_drop, axis='columns')

In [None]:
# Persist the dataset to the project's memories folder.
output_path = os.path.join(path, 'data/memories/syn_memory.csv')
# Create the target folder if it is missing so the save does not fail on a fresh project tree.
os.makedirs(os.path.dirname(output_path), exist_ok=True)
# index_label=False still writes the index values but leaves their header field out.
# NOTE(review): if the row index is not needed downstream, index=False may be the intent — confirm.
dataset.to_csv(output_path, index_label=False)