## Preparing embeddings

In [2]:
# !git clone https://huggingface.co/ridger/SpikeGPT-OpenWebText-216M
# !git clone https://github.com/ridgerchu/SpikeGPT.git
# !pip install torch matplotlib numpy tqdm torchvision scipy ninja accelerate transformers


/content/SpikeGPT


In [10]:
%cd SpikeGPT
import matplotlib.ticker as ticker
import numpy as np
import math, os, sys, types, time, gc
import torch
from torch import nn

def prepare_env():
    """Set process-wide torch/numpy options and build the SpikeGPT-216M run config.

    Side effects:
        * ``CUDA_VISIBLE_DEVICES`` is set from ``sys.argv[1]`` when that
          argument looks like a device list (``"0"``, ``"0,1"``, ``"-1"``,
          ``"GPU-..."`` / ``"MIG-..."``); any other value is ignored.
        * cuDNN benchmark mode and TF32 (cuDNN and CUDA matmul) are enabled.
        * numpy print options are changed globally.
        * ``RWKV_JIT_ON`` and ``RWKV_RUN_DEVICE`` are exported to the environment.

    Returns:
        types.SimpleNamespace: run arguments (``RUN_DEVICE``, ``FLOAT_MODE``,
        ``n_layer``, ``n_embd``, ``ctx_len``, ``vocab_size``, ``head_qk``,
        ``pre_ffn``, ``grad_cp``, ``my_pos_emb``). ``MODEL_NAME`` is not set here.
    """
    def _is_device_id(token):
        # Integer index (optionally negative, e.g. "-1") or a GPU/MIG identifier.
        if token.startswith(("GPU-", "MIG-")):
            return True
        digits = token[1:] if token.startswith("-") else token
        return digits.isdigit()

    # Only a genuine device list may reach CUDA_VISIBLE_DEVICES. Inside a
    # notebook kernel sys.argv[1] is typically the launcher's "-f" flag, which
    # the previous unconditional assignment (guarded by a bare `except`)
    # exported as the device list.
    cli_arg = sys.argv[1] if len(sys.argv) > 1 else ""
    if cli_arg and all(_is_device_id(tok.strip()) for tok in cli_arg.split(",")):
        os.environ["CUDA_VISIBLE_DEVICES"] = cli_arg

    torch.backends.cudnn.benchmark = True
    torch.backends.cudnn.allow_tf32 = True
    torch.backends.cuda.matmul.allow_tf32 = True
    np.set_printoptions(precision=4, suppress=True, linewidth=200)
    args = types.SimpleNamespace()

    args.RUN_DEVICE = "cpu" # 'cuda' // 'cpu' (already fast)
    args.FLOAT_MODE = "fp32" # fp16 (good for GPU, does not work for CPU) // fp32 (good for CPU) // bf16 (less accurate, but works for CPU)
    os.environ["RWKV_JIT_ON"] = '1' # '1' or '0'. very useful for GPU/CPU fp32, but might be harmful for GPU fp16. please benchmark !!!

    # Architecture of the SpikeGPT-216M checkpoint.
    args.n_layer = 18
    args.n_embd = 768
    args.ctx_len = 1024
    args.vocab_size = 50277

    # Optional model features, all switched off.
    args.head_qk = 0
    args.pre_ffn = 0
    args.grad_cp = 0
    args.my_pos_emb = 0

    os.environ["RWKV_RUN_DEVICE"] = args.RUN_DEVICE
    return args

In [36]:
# Build the run configuration before importing the SpikeGPT modules below:
# prepare_env() exports RWKV_JIT_ON and RWKV_RUN_DEVICE, which are presumably
# read by src.model_run at import time (TODO confirm).
args = prepare_env()

from src.utils import TOKENIZER
from src.model_run import RWKV_RNN

def load_embedding_weights(model_path, args):
    """Return an ``nn.Embedding`` holding the pretrained SpikeGPT token embeddings.

    Args:
        model_path: Prefix concatenated verbatim with ``'SpikeGPT-216M'``; when
            it names a directory it must therefore end with a path separator.
        args: Run configuration handed to ``RWKV_RNN``. ``args.MODEL_NAME`` is
            set on it as a side effect.

    Returns:
        nn.Embedding: trainable embedding layer wrapping the tensor found at
        ``model.w.emb.weight``; its size is taken from that tensor.
    """
    args.MODEL_NAME = model_path + 'SpikeGPT-216M'

    # Load pretrained state
    model = RWKV_RNN(args)

    # Wrap the checkpoint tensor directly. The earlier approach first built a
    # randomly initialised vocab_size x n_embd table and then overwrote its
    # data, which wasted the allocation and never checked that the declared
    # size matched the checkpoint. freeze=False keeps the weight trainable,
    # as it was before.
    pretrained = model.w.emb.weight.data
    return nn.Embedding.from_pretrained(pretrained, freeze=False)

def load_tokenizer():
    """Create the tokenizer wrapper for the Pile ("20B") vocabulary.

    Returns:
        TOKENIZER: instance built from ``20B_tokenizer.json`` with no
        unknown-character fallback.
    """
    token_mode = "pile"
    # [vocab, vocab] for Pile model: the same file is passed for both entries.
    vocab_files = ["20B_tokenizer.json"] * 2

    wrapper = TOKENIZER(vocab_files, UNKNOWN_CHAR=None)

    # Sanity check that the expected vocabulary was loaded: id 187 is newline.
    if token_mode == "pile":
        assert wrapper.tokenizer.decode([187]) == '\n'
    return wrapper


def transform_text_pretrained_embedding(text, emb, tokenizer):
    """Tokenize ``text`` and look up one embedding per token.

    Args:
        text: String to embed.
        emb: Embedding layer (callable on a LongTensor of token ids).
        tokenizer: Object exposing ``charMode``. In char mode its
            ``refine_context``, ``stoi`` and ``UNKNOWN_CHAR`` are used;
            otherwise ``tokenizer.tokenizer.encode`` is called.

    Returns:
        Whatever ``emb`` returns for the id tensor; for ``nn.Embedding`` a
        tensor of shape ``(num_tokens, embedding_dim)``, with zero rows for
        empty input.
    """
    if tokenizer.charMode:
        # Map the characters of the *refined* context. The refined string used
        # to be computed and then ignored in favour of the raw text.
        context = tokenizer.refine_context(text)
        ctx = [tokenizer.stoi.get(ch, tokenizer.UNKNOWN_CHAR) for ch in context]
    else:
        ctx = tokenizer.tokenizer.encode(text)

    # Explicit integer dtype: an empty id list would otherwise become a float
    # tensor, which the embedding lookup rejects.
    return emb(torch.tensor(ctx, dtype=torch.long))


# Pretrained embedding table and tokenizer used by the cells below.
# NOTE(review): the checkpoint location is a hardcoded Colab path (/content/...);
# adjust it when running anywhere else.
emb = load_embedding_weights("/content/SpikeGPT-OpenWebText-216M/", args)
tokenizer = load_tokenizer()
%cd ..
# Smoke test: embed one sentence. The bare `result` on the last line makes the
# notebook display the returned tensor.
text = "In a shocking finding, scientist discovered a herd of dragons living in a remote, previously unexplored valley, in Tibet."
result = transform_text_pretrained_embedding(text, emb, tokenizer)
result

/


tensor([[ 0.2670, -0.0613,  0.2563,  ..., -0.0652,  0.0423,  0.0334],
        [ 0.0528, -0.0115,  0.2056,  ..., -0.2686,  0.0228,  0.0199],
        [ 0.1391,  0.0236,  0.1507,  ..., -0.0139,  0.1105,  0.3136],
        ...,
        [ 0.0697, -0.0099,  0.4996,  ..., -0.1872, -0.1179,  0.0978],
        [ 0.0641, -0.0057, -0.0210,  ..., -0.0274,  0.0193,  0.0831],
        [ 0.1805, -0.0024, -0.0600,  ...,  0.3261,  0.0504, -0.1003]],
       grad_fn=<EmbeddingBackward0>)

In [14]:
# Recompute the embedding for the same `text`. The cell ends in an assignment,
# so it displays nothing itself; the recorded "Number of tokens" line presumably
# came from the debug print that is now commented out in
# transform_text_pretrained_embedding.
result = transform_text_pretrained_embedding(text, emb, tokenizer)

Number of tokens: 24


In [None]:
# Check whether the tokenizer/embedding pipeline is case-sensitive.
text = "Hello"
r1 = transform_text_pretrained_embedding(text, emb, tokenizer)
r2 = transform_text_pretrained_embedding(text.lower(), emb, tokenizer)
# torch.equal gives a single bool and is simply False when the two spellings
# tokenize to different lengths. An elementwise `r1 == r2` raises on such a
# shape mismatch and otherwise prints a large boolean tensor.
torch.equal(r1, r2)

# Graph creation

In [15]:
# !pip install gdown
# !pip install torch_geometric

Collecting torch_geometric
  Downloading torch_geometric-2.4.0-py3-none-any.whl (1.0 MB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.0 MB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.2/1.0 MB[0m [31m4.5 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━[0m [32m0.6/1.0 MB[0m [31m8.1 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m1.0/1.0 MB[0m [31m9.7 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m8.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: torch_geometric
Successfully installed torch_geometric-2.4.0


In [27]:
import gdown

# Google Drive folder that holds the WebNLG files used below.
url = "https://drive.google.com/drive/folders/1Jx3Cz7t0hiNdtlBWUdPjhtLwPOH3LtzC"
# Download into the current working directory; the files land under ./webnlg/.
output = './'

# Last expression of the cell: the list of downloaded paths is displayed.
gdown.download_folder(url, output = output, quiet = True)

['./webnlg/test.json',
 './webnlg/test.source',
 './webnlg/test.target',
 './webnlg/train.json',
 './webnlg/val.json']

In [30]:
import json
file_dir = "./webnlg/"


def _load_split(file_name):
    """Read one dataset split from ``file_dir`` and return the parsed JSON."""
    # Decode explicitly as UTF-8 rather than relying on the platform's default
    # encoding, which is not UTF-8 everywhere.
    with open(file_dir + file_name, "r", encoding="utf-8") as f:
        return json.load(f)


train_set = _load_split("train.json")
val_set = _load_split("val.json")
test_set = _load_split("test.json")

# Peek at the knowledge-base entries of one test sample.
test_set[2]['kbs']

{'W1016': ['California',
  'California',
  [['is part of', 'Anaheim, California']]],
 'W1277': ['United States',
  'United States',
  [['location', 'United States House of Representatives']]],
 'W3706': ['United States House of Representatives',
  'United States House of Representatives',
  [['leader title', 'Anaheim, California']]]}

In [55]:
def get_triplets(dataset_element):
    """Flatten the ``'kbs'`` mapping of one dataset sample into triplets.

    Each ``kbs`` value is read as ``value[0]`` (entity name) and ``value[-1]``
    (a list of ``[relation, other_entity]`` pairs).

    Args:
        dataset_element: Sample dict with a ``'kbs'`` mapping.

    Returns:
        list: one ``[entity, relation, other_entity]`` list per relation pair,
        in the iteration order of ``kbs`` and of each entity's pairs.
    """
    triplets = []
    for value in dataset_element['kbs'].values():
        entity, relations = value[0], value[-1]
        # An entity can carry several relations (or none). Only the first one
        # used to be kept, and an empty relation list raised IndexError.
        for relation in relations:
            triplets.append([entity, *relation])
    return triplets

# Sanity check on the test sample whose 'kbs' entries were displayed earlier.
get_triplets(test_set[2])

[['California', 'is part of', 'Anaheim, California'],
 ['United States', 'location', 'United States House of Representatives'],
 ['United States House of Representatives',
  'leader title',
  'Anaheim, California']]

Albany , Georgia is in the United States , a country led by the President of the United States and where the inhabitants are called Americans . One of the ethnic groups in the country are the African Americans

In [32]:
# Inspect a sample in which one entity ('United States', W1277) carries two relations.
test_set[27]

{'id': 1457,
 'kbs': {'W2849': ['President of the United States',
   'President of the United States',
   [['leader title', 'United States']]],
  'W915': ['Americans', 'Americans', [['demonym', 'United States']]],
  'W1277': ['United States',
   'United States',
   [['country', 'Albany, Georgia'], ['is part of', 'Albany, Georgia']]],
  'W1639': ['African Americans',
   'African Americans',
   [['ethnic group', 'United States']]]},
 'text': ['Albany , Georgia is in the United States , a country led by the President of the United States and where the inhabitants are called Americans . One of the ethnic groups in the country are the African Americans .',
  'The leader of the United States , which includes the ethnic group of African Americans among its population of Americans , is the President . The country is the location of Albany in Georgia .',
  'Americans live in the United States where the leader if known as the President . The country includes the ethnic group of African Americans

In [51]:
import numpy as np
import torch
from torch_geometric.data import Data

def replace_embedding(nodes_list):
    """Embed every string in ``nodes_list`` with the pretrained embeddings.

    Uses the module-level ``emb`` and ``tokenizer``.

    Args:
        nodes_list: Iterable of strings.

    Returns:
        list: one entry per input string, each a single-element list wrapping
        the tensor returned by ``transform_text_pretrained_embedding``.
    """
    # Embed the loop element itself. The global `text` used to be passed
    # instead, so every node and edge received the same embedding.
    return [
        [transform_text_pretrained_embedding(el, emb, tokenizer)]
        for el in nodes_list
    ]

def create_graph(triplets, text):
    """Build a PyG ``Data`` graph from ``[head, relation, tail]`` triplets.

    Args:
        triplets: Sequence of items indexable at 0, 1 and 2; items 0 and 2 are
            node labels (hashable, e.g. strings) and item 1 is the edge label.
        text: Target text; its embedding is stored as ``y``.

    Returns:
        Data with
            * ``x``: embeddings of the distinct node labels in first-seen
              order, as returned by ``replace_embedding``;
            * ``edge_index``: ``torch.long`` tensor of shape ``(2, num_edges)``
              whose columns are ``(head_index, tail_index)``;
            * ``edge_attr``: one relation embedding per triplet;
            * ``y``: embedding of ``text``;
            * ``num_nodes``: number of distinct node labels.
    """
    y = replace_embedding([text])[0]

    node_ids = {}    # node label -> position in the node list (insertion-ordered)
    edge_pairs = []
    edge_attr = []

    for t in triplets:
        head, relation, tail = t[0], t[1], t[2]

        # Register each endpoint once. The tail check used to look up the
        # relation label instead of the tail, so the tail was appended again
        # for every triplet and the node list contained duplicates.
        head_id = node_ids.setdefault(head, len(node_ids))
        tail_id = node_ids.setdefault(tail, len(node_ids))

        # Add edge
        edge_pairs.append([head_id, tail_id])

        # Add edge attribute
        edge_attr.append(replace_embedding([relation])[0])

    x = replace_embedding(list(node_ids))

    # edge_index is stored as a (2, E) LongTensor; an empty graph gets an
    # explicit (2, 0) tensor rather than a 1-D empty array.
    if edge_pairs:
        edge_index = torch.tensor(edge_pairs, dtype=torch.long).t().contiguous()
    else:
        edge_index = torch.empty((2, 0), dtype=torch.long)

    # x, edge_attr and y stay Python lists: each entry wraps a tensor produced
    # per text, so they are not stacked into a single tensor here. Because x is
    # a list, the node count is passed explicitly.
    data = Data(
        x=x,
        edge_index=edge_index,
        edge_attr=edge_attr,
        y=y,
        num_nodes=len(node_ids),
    )
    return data

# Smoke test: build the graph for the first test sample, using its first
# reference text as the target.
data = create_graph(get_triplets(test_set[0]), test_set[0]['text'][0])

In [62]:
import torch
from torch_geometric.data import InMemoryDataset, download_url


class MyOwnDataset(InMemoryDataset):
    """In-memory PyG dataset built from one JSON file of samples.

    Every sample is converted with ``get_triplets`` and ``create_graph``, using
    the first entry of its ``'text'`` list as the target text.

    Args:
        root: Root directory handed to ``InMemoryDataset``.
        file_path: Path of the JSON file to read. The processed file name is
            derived from it by swapping the extension for ``.pt``.
        transform, pre_transform, pre_filter: Forwarded to ``InMemoryDataset``.
    """

    def __init__(self, root, file_path, transform=None, pre_transform=None, pre_filter=None):
        # Must be set before super().__init__(), which reads the file-name
        # properties below.
        self.file_path = file_path
        super().__init__(root, transform, pre_transform, pre_filter)
        self.load(self.processed_paths[0])
        # For PyG<2.4:
        # self.data, self.slices = torch.load(self.processed_paths[0])

    @property
    def raw_file_names(self):
        return [self.file_path]

    @property
    def processed_file_names(self):
        # splitext removes only a genuine trailing extension, so paths without
        # an extension or with dots in a directory name are no longer mangled.
        return [os.path.splitext(self.file_path)[0] + ".pt"]

    def _read_raw(self):
        """Parse and return the JSON content of ``self.file_path``."""
        with open(self.file_path, "r", encoding="utf-8") as f:
            return json.load(f)

    def download(self):
        # Nothing is fetched: the file is expected to exist locally already.
        # The parsed content is still returned, as it was before.
        return self._read_raw()

    def process(self):
        # Read data into huge `Data` list.
        data_list = [
            create_graph(get_triplets(datapoint), datapoint['text'][0])
            for datapoint in self._read_raw()
        ]

        if self.pre_filter is not None:
            data_list = [graph for graph in data_list if self.pre_filter(graph)]

        if self.pre_transform is not None:
            data_list = [self.pre_transform(graph) for graph in data_list]

        self.save(data_list, self.processed_paths[0])
        # For PyG<2.4:
        # torch.save(self.collate(data_list), self.processed_paths[0])

In [63]:
# Build the dataset for the test split.
# NOTE(review): the JSON path is an absolute Colab path. The processed file name
# is derived from it, so the cached .pt file is presumably written next to the
# JSON rather than under `root` (confirm).
dataset = MyOwnDataset(".", "/content/webnlg/test.json")

Processing...
Done!
