# Seminar on Graphs for NLP: Vector representations

## Plan for today:

#### 0. What a taxonomy is. Taxonomy Enrichment task.
#### 1. Graph Neural networks: GCN and GAT
#### 2. GATv2
#### 3. GraphBERT: Only Attention is Needed for Learning Graph Representations
#### 4. OpenHGNN library

# 0. Taxonomy

A taxonomy is a hierarchical structure of units in terms of class inclusion such that superordinate units in the hierarchy include, or subsume, all items in subordinate units. Taxonomies are typically represented as having tree structures.

![](https://www.digital-mr.com/media/cache/51/6f/516f493d37a7b4895f678843b6383e48.png)


Taxonomies can be represented as graphs!

Let us download the most popular and well-known taxonomy, WordNet. You may also access it via `from nltk.corpus import wordnet as wn`, but keep in mind that in that case you may be working with an earlier version of WordNet.

In [None]:
import os
import torch
# Export the installed torch version so the shell commands below can read it as ${TORCH}.
os.environ['TORCH'] = torch.__version__
print(torch.__version__)

# Install the PyG companion packages; `-f` points pip at the wheel index named after this torch build.
!pip install -q torch-scatter -f https://data.pyg.org/whl/torch-${TORCH}.html
!pip install -q torch-sparse -f https://data.pyg.org/whl/torch-${TORCH}.html
# PyTorch Geometric itself is installed from the current state of its GitHub repository.
!pip install -q git+https://github.com/pyg-team/pytorch_geometric.git

2.0.1+cu118
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m107.6/107.6 kB[0m [31m1.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for torch-scatter (setup.py) ... [?25l[?25hcanceled
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/pip/_internal/cli/base_command.py", line 169, in exc_logging_wrapper
    status = run_func(*args)
  File "/usr/local/lib/python3.10/dist-packages/pip/_internal/cli/req_command.py", line 242, in wrapper
    return func(self, options, args)
  File "/usr/local/lib/python3.10/dist-packages/pip/_internal/commands/install.py", line 417, in run
    _, build_failures = build(
  File "/usr/local/lib/python3.10/dist-packages/pip/_internal/wheel_builder.py", line 320, in build
    wheel_file = _build_one(
  File "/usr/local/lib/python3.10/dist-packages/pip/_internal/wheel_builder.py", line 194, in _build_one
    wheel_path = _build_one_inside_env(
  File "/

In [None]:
!pip install tensorboardX

In [None]:
!pip install --upgrade gensim

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
!gdown --id 1avRebH3BMsolRxmthVFNPoLwyRpAV2tx

In [None]:
!unzip wordnet_n_is_directed_1_en_synsets.zip

In [None]:
!git clone https://github.com/jwzhanggy/Graph-Bert

fatal: destination path 'Graph-Bert' already exists and is not an empty directory.


In [None]:
import nltk
nltk.download('omw-1.4')

In [None]:
from gensim.models.poincare import PoincareModel
import numpy as np
import time
import os

In [None]:
from nltk.corpus import wordnet as wn

In [None]:
import nltk
nltk.download('wordnet')

In [10]:
wn.synset("guy.n.01").lemmas()

[Lemma('guy.n.01.guy'),
 Lemma('guy.n.01.cat'),
 Lemma('guy.n.01.hombre'),
 Lemma('guy.n.01.bozo')]

In [11]:
# Dataset folder and the two files read from it: the node table and the edge list.
path = "wordnet_n_is_directed_1_en_synsets/"

node_path = os.path.join(path, "node")
link_path = os.path.join(path, "link")

In [12]:
# Parse the node table. Each tab-separated row is read as:
# first field = node id, last field = synset name, everything in between = vector components.
id2synset = {}
fasttext_dict = {}

with open(node_path) as f:
    for line in f:
        fields = line.split("\t")
        node_id = fields[0].strip()
        synset_name = fields[-1].strip()
        id2synset[node_id] = synset_name
        fasttext_dict[synset_name] = np.array([float(component) for component in fields[1:-1]])

In [13]:
# Translate every row of the link file from node ids (first and last field) to synset names.
link_pairs = set()
with open(link_path) as f:
    for line in f:
        columns = line.split("\t")
        source_name = id2synset[columns[0].strip()]
        target_name = id2synset[columns[-1].strip()]
        link_pairs.add((source_name, target_name))

# 1. Graph Neural Networks

In [14]:
import torch
import torch.nn as nn
import torch.nn.functional as F

In [15]:
import torch_geometric.nn as pyg_nn
import torch_geometric.utils as pyg_utils

In [16]:
import time
from datetime import datetime

import networkx as nx
import numpy as np
import torch
import torch.optim as optim

from torch_geometric.datasets import TUDataset
from torch_geometric.datasets import Planetoid
from torch_geometric.data import DataLoader
from torch_geometric.utils import train_test_split_edges
import torch_geometric.transforms as T
from torch_geometric.data import Data

from tensorboardX import SummaryWriter
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt


## Data preparation

In [17]:
from gensim.models.keyedvectors import KeyedVectors

# Put the parsed vectors into a gensim KeyedVectors container so that they can be
# looked up by synset name and by row index (key_to_index).
fasttext = KeyedVectors(vector_size=300)
synset_names = list(fasttext_dict)
synset_vectors = [fasttext_dict[name] for name in synset_names]
fasttext.add_vectors(synset_names, synset_vectors)

In [18]:
import networkx as nx

In [19]:
# Directed graph with one edge per (source, target) pair collected from the link file.
G = nx.DiGraph()

for source_name, target_name in link_pairs:
    G.add_edge(source_name, target_name)

In [20]:
def create_edge_list(G):
    """Return the edges of ``G`` as a ``torch.long`` tensor with two rows (sources, targets).

    Node names are replaced by their row index in the global ``fasttext`` vectors;
    edges with an endpoint missing from ``fasttext.key_to_index`` are dropped.
    """
    vocab = fasttext.key_to_index
    kept = [(vocab[src], vocab[dst]) for src, dst in G.edges if src in vocab and dst in vocab]
    starts = [start for start, _ in kept]
    ends = [end for _, end in kept]
    return torch.tensor([starts, ends], dtype=torch.long)

In [21]:
# Inverse of fasttext.key_to_index: row index -> key.
index_to_key = {index: key for key, index in fasttext.key_to_index.items()}

In [22]:
edge_index = create_edge_list(G)

In [23]:
# Node feature matrix, one row per fasttext index. The vectors are stacked into a single
# ndarray first: building a tensor straight from a Python list of ndarrays is the slow
# path that PyTorch warns about.
x = torch.tensor(np.array([fasttext[index_to_key[int(i)]] for i in index_to_key]), dtype=torch.float)

  x = torch.tensor([fasttext[index_to_key[int(i)]] for i in index_to_key], dtype=torch.float)


In [24]:
# Node features and edge list packed into a single PyG Data object.
data = Data(x=x, edge_index=edge_index)
# Disabled alternative split helper; the split is done with RandomLinkSplit in a later cell.
#data = train_test_split_edges(data)

In [25]:
from torch_geometric.transforms import RandomLinkSplit

In [26]:
# Split the edges into train / validation / test graphs; split_labels=True is what provides the
# separate pos_edge_label_index / neg_edge_label_index attributes used by the training cells.
# NOTE(review): edge_index comes from a directed nx.DiGraph, yet is_undirected=True is passed —
# confirm that treating the taxonomy as undirected is intended.
# NOTE(review): no random seed is set in the visible cells, so the split presumably changes between runs.
transform = RandomLinkSplit(is_undirected=True, split_labels=True)
train_data, val_data, test_data = transform(data)

### GCN and GAT Encoder

The following code snippet describes the Encoder module with GCN or GAT networks.

In [27]:
class Encoder(torch.nn.Module):
    """Two-layer graph encoder, used in this notebook as the encoder of ``pyg_nn.GAE``.

    The first layer maps ``in_channels`` to ``2 * out_channels`` and is followed by a ReLU;
    the second layer maps that to ``out_channels`` and its output is returned as is.

    Args:
        in_channels: size of the input node features.
        out_channels: size of the produced node embeddings.
        mode: ``"gcn"`` for ``GCNConv`` layers (created with ``cached=True``) or
            ``"gat"`` for ``GATConv`` layers.

    Raises:
        ValueError: if ``mode`` is neither ``"gcn"`` nor ``"gat"``.
    """

    def __init__(self, in_channels, out_channels, mode="gcn"):
        super().__init__()
        hidden_channels = 2 * out_channels
        if mode == "gcn":
            self.conv1 = pyg_nn.GCNConv(in_channels, hidden_channels, cached=True)
            self.conv2 = pyg_nn.GCNConv(hidden_channels, out_channels, cached=True)
        elif mode == "gat":
            self.conv1 = pyg_nn.GATConv(in_channels, hidden_channels)
            self.conv2 = pyg_nn.GATConv(hidden_channels, out_channels)
        else:
            # ValueError is still an Exception, so existing `except Exception` handlers keep working.
            raise ValueError("Encoder mode is not recognized, try gcn/gat")

    def forward(self, x, edge_index):
        """Encode node features ``x`` over the graph given by ``edge_index``."""
        hidden = F.relu(self.conv1(x, edge_index))
        return self.conv2(hidden, edge_index)

def train(epoch):
    """Run one optimisation step on the training edges and return the loss as a float.

    Works on the notebook globals ``model``, ``optimizer``, ``x``,
    ``train_pos_edge_index`` and ``writer``; the loss is also logged under the
    tag "loss" at step ``epoch``.
    """
    model.train()
    optimizer.zero_grad()
    latent = model.encode(x, train_pos_edge_index)
    reconstruction_loss = model.recon_loss(latent, train_pos_edge_index)
    reconstruction_loss.backward()
    optimizer.step()
    loss_value = reconstruction_loss.item()
    writer.add_scalar("loss", loss_value, epoch)
    return loss_value

def test(pos_edge_index, neg_edge_index):
    """Score the given positive/negative edges with the current model.

    Node embeddings are computed from the global ``x`` and ``train_pos_edge_index``
    with gradients disabled; the value of ``model.test`` is returned unchanged.
    """
    model.eval()
    with torch.no_grad():
        embeddings = model.encode(x, train_pos_edge_index)
    metrics = model.test(embeddings, pos_edge_index, neg_edge_index)
    return metrics

In [28]:
# TensorBoard logs go to a timestamped sub-folder of ./log.
writer = SummaryWriter("./log/" + datetime.now().strftime("%Y%m%d-%H%M%S"))

# Passed to Encoder as out_channels, i.e. the size of the node embeddings.
channels = 64
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
dev = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('CUDA availability:', torch.cuda.is_available())

CUDA availability: True


## Variational Graph Auto-Encoders

https://arxiv.org/pdf/1611.07308.pdf

The pipeline works as follows: first, we train a graph autoencoder with GCN or GAT under the hood. During the evaluation phase, the latent representations of the autoencoder are the embeddings we are looking for.

In [29]:
# Graph auto-encoder with a GCN encoder over the 300-dimensional input features.
model = pyg_nn.GAE(Encoder(300, channels, 'gcn')).to(dev)
x = train_data.x.to(dev)
train_pos_edge_index = train_data.pos_edge_label_index.to(dev)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

for epoch in range(1, 401):
    loss = train(epoch)
    # Metrics on the test split are computed and logged after every epoch.
    auc, ap = test(test_data.pos_edge_label_index, test_data.neg_edge_label_index)
    writer.add_scalar("AUC", auc, epoch)
    writer.add_scalar("AP", ap, epoch)
    # Console report only every tenth epoch.
    if not epoch % 10:
        print('Epoch: {:03d}, AUC: {:.4f}, AP: {:.4f}, Loss: {:.4f}'.format(epoch, auc, ap, loss))

Epoch: 010, AUC: 0.8353, AP: 0.8220, Loss: 0.9354
Epoch: 020, AUC: 0.8730, AP: 0.8669, Loss: 0.8543
Epoch: 030, AUC: 0.8932, AP: 0.8880, Loss: 0.8268
Epoch: 040, AUC: 0.8973, AP: 0.8962, Loss: 0.8096
Epoch: 050, AUC: 0.9029, AP: 0.9037, Loss: 0.7987
Epoch: 060, AUC: 0.9050, AP: 0.9073, Loss: 0.7874
Epoch: 070, AUC: 0.9077, AP: 0.9111, Loss: 0.7850
Epoch: 080, AUC: 0.9117, AP: 0.9145, Loss: 0.7795
Epoch: 090, AUC: 0.9107, AP: 0.9150, Loss: 0.7801
Epoch: 100, AUC: 0.9115, AP: 0.9160, Loss: 0.7733
Epoch: 110, AUC: 0.9115, AP: 0.9169, Loss: 0.7753
Epoch: 120, AUC: 0.9122, AP: 0.9175, Loss: 0.7691
Epoch: 130, AUC: 0.9133, AP: 0.9187, Loss: 0.7701
Epoch: 140, AUC: 0.9138, AP: 0.9198, Loss: 0.7685
Epoch: 150, AUC: 0.9141, AP: 0.9200, Loss: 0.7662
Epoch: 160, AUC: 0.9143, AP: 0.9207, Loss: 0.7653
Epoch: 170, AUC: 0.9146, AP: 0.9212, Loss: 0.7652
Epoch: 180, AUC: 0.9140, AP: 0.9206, Loss: 0.7645
Epoch: 190, AUC: 0.9153, AP: 0.9222, Loss: 0.7613
Epoch: 200, AUC: 0.9150, AP: 0.9219, Loss: 0.7588


#### Examples

Let us see the nearest neighbours for the unseen words from the test set.

In [30]:
model.eval()
# Stack the vectors into one ndarray before building the tensor: a Python list of ndarrays
# is the slow construction path that PyTorch warns about.
new_x = torch.tensor(np.array([fasttext[index_to_key[i]] for i in index_to_key]), dtype=torch.float).to(dev)
# Inference only, so skip autograd bookkeeping for the forward pass.
with torch.no_grad():
    z = model.encode(new_x, train_pos_edge_index)

In [31]:
# Two-way lookup between the first field (node id) and the last field (synset name) of the node file.
id2syns = {}
syns2id = {}
with open('wordnet_n_is_directed_1_en_synsets/node') as f:
    for line in f:
        columns = line.split()
        node_id, synset_name = columns[0], columns[-1]
        id2syns[node_id] = synset_name
        syns2id[synset_name] = node_id

In [32]:
# For every link row whose last node is an "ORPHAN_" entry, remember the pairing with the
# first node of that row in both directions.
par2orph = {}
orph2par = {}
with open('wordnet_n_is_directed_1_en_synsets/link') as f:
    for line in f:
        columns = line.split()
        child_name = id2syns[columns[-1]]

        if "ORPHAN_" not in child_name:
            continue
        parent_name = id2syns[columns[0]]
        par2orph[parent_name] = child_name
        orph2par[child_name] = parent_name

In [33]:
# For the first 15 keys without ".n." in their name, score the key against len(G.nodes)
# candidate rows of z with the decoder and print the 10 best candidates, labelled with the
# entry stored for that key in orph2par.
c = 0
num_candidates = len(G.nodes)  # NOTE(review): presumably the number of rows in z — confirm
candidate_ids = list(range(num_candidates))
for word in fasttext.key_to_index:
    if ".n." not in word:
        cur_index = fasttext.key_to_index[word]
        tensor_ = torch.tensor([[cur_index] * num_candidates, candidate_ids])
        results = model.decode(z, tensor_)
        # One device-to-host copy for all scores instead of one per element.
        scores = results.detach().cpu().tolist()
        ranked = sorted(
            ((index_to_key[i], round(score, 4)) for i, score in enumerate(scores)),
            key=lambda pair: pair[1],
        )
        # Reverse the ascending order (equal scores keep the same relative order as before) and keep the best 10.
        top10 = ranked[::-1][:10]
        print(orph2par[word], ":", top10)
        print("=" * 10)
        c += 1
        if c == 15:
            break

course.n.04 : [('act.n.05', 0.997), ('act.n.03', 0.997), ('action.n.05', 0.9895), ('pump_action.n.01', 0.9862), ('course.n.04', 0.984), ('movement.n.10', 0.9837), ('ORPHAN_100000000', 0.9815), ('job_action.n.01', 0.9805), ('police_action.n.01', 0.9795), ('scheme.n.01', 0.9763)]
recovery.n.03 : [('reclamation.n.02', 0.9554), ('redemption.n.01', 0.9545), ('bestowal.n.01', 0.9536), ('salvation.n.04', 0.9499), ('search_and_rescue_mission.n.01', 0.9481), ('contribution.n.03', 0.9468), ('lifesaving.n.01', 0.9464), ('accordance.n.02', 0.9424), ('salvage.n.02', 0.9419), ('salvage.n.03', 0.9419)]
disappearance.n.01 : [('shading.n.01', 0.9468), ('flit.n.02', 0.9313), ('move.n.05', 0.9283), ('blaze.n.05', 0.9282), ('spot.n.05', 0.9277), ('gradient.n.01', 0.9248), ('difference.n.04', 0.9235), ('rewording.n.01', 0.9231), ('foray.n.02', 0.923), ('turning.n.03', 0.9229)]
hit.n.03 : [('base_hit.n.01', 0.9958), ('hit.n.05', 0.9927), ('hit.n.01', 0.9927), ('slap.n.01', 0.9862), ('knock.n.03', 0.9846), (

## GraphBERT

https://github.com/jwzhanggy/Graph-Bert

Yet another model for embedding generation is Graph-Bert. Instead of feeding it the whole large input graph, we train GRAPH-BERT on sampled subgraphs within their local contexts. The input vector embeddings fed to the graph-transformer model cover four parts: (1) raw feature vector embedding, (2) Weisfeiler-Lehman absolute role embedding, (3) intimacy based relative positional embedding, and (4) hop based relative distance embedding.

GRAPH-BERT is trained with the node attribute reconstruction and structure recovery tasks.

![](https://github.com/jwzhanggy/Graph-Bert/raw/master/result/screenshot/model.png)

## Subgraph Sampling

![](https://i.ibb.co/5cbjJZ6/photo-2021-12-07-16-41-32.jpg)

## Positional embeddings

### Weisfeiler-Lehman Absolute Role Embedding

![](https://i.ibb.co/bgT7gqb/wl.png)

### Intimacy based Relative Positional Embedding

![](https://i.ibb.co/34FvCf0/photo-2021-12-07-16-52-30.jpg)

### Hop based Relative Distance Embedding
![](https://i.ibb.co/tCzRcfK/hops-drawio.png)

Actually, you are simply expected to run two scripts: `script_1_preprocess.py` and `script_2_pre_train.py`

In [34]:
!git clone https://github.com/jwzhanggy/Graph-Bert.git

Cloning into 'Graph-Bert'...
remote: Enumerating objects: 450, done.[K
remote: Counting objects: 100% (136/136), done.[K
remote: Compressing objects: 100% (58/58), done.[K
remote: Total 450 (delta 106), reused 79 (delta 78), pack-reused 314[K
Receiving objects: 100% (450/450), 2.23 MiB | 15.41 MiB/s, done.
Resolving deltas: 100% (232/232), done.


In [35]:
%cd Graph-Bert
!python3 script_1_preprocess.py

/content/Graph-Bert
************ Start ************
WL, dataset: cora
Loading cora dataset...
************ Finish ************
************ Start ************
Subgraph Batching, dataset: cora, k: 1
Loading cora dataset...
************ Finish ************
************ Start ************
Subgraph Batching, dataset: cora, k: 2
Loading cora dataset...
************ Finish ************
************ Start ************
Subgraph Batching, dataset: cora, k: 3
Loading cora dataset...
************ Finish ************
************ Start ************
Subgraph Batching, dataset: cora, k: 4
Loading cora dataset...
************ Finish ************
************ Start ************
Subgraph Batching, dataset: cora, k: 5
Loading cora dataset...
************ Finish ************
************ Start ************
Subgraph Batching, dataset: cora, k: 6
Loading cora dataset...
************ Finish ************
************ Start ************
Subgraph Batching, dataset: cora, k: 7
Loading cora dataset...
**********

In [36]:
!python3 script_2_pre_train.py

Traceback (most recent call last):
  File "/content/Graph-Bert/script_2_pre_train.py", line 5, in <module>
    from code.MethodBertComp import GraphBertConfig
  File "/content/Graph-Bert/code/MethodBertComp.py", line 11, in <module>
    from transformers.modeling_bert import BertPredictionHeadTransform, BertAttention, BertIntermediate, BertOutput
ModuleNotFoundError: No module named 'transformers'


After the model has been trained, we predict embeddings for the new (unseen) words and find their nearest neighbours.

In [37]:
import os
import sys

import numpy as np
from nltk.corpus import wordnet as wn

# Make the Graph-Bert sources importable. The first two entries are the original author's
# checkout; the last two point at the current working directory so the imports below can
# also resolve when the notebook runs from inside a Graph-Bert clone located elsewhere.
sys.path.append("/home/nikishina/Graph-Bert/code")
sys.path.append("/home/nikishina/Graph-Bert/")
sys.path.append(os.path.join(os.getcwd(), "code"))
sys.path.append(os.getcwd())
from DatasetLoader import DatasetLoader
from MethodBertComp import GraphBertConfig
from MethodGraphBertGraphRecovery import MethodGraphBertGraphRecovery
from MethodGraphBertNodeConstruct import MethodGraphBertNodeConstruct
from itertools import combinations
os.environ["CUDA_VISIBLE_DEVICES"] = "1"


def load_data(dataset_path, k, device, data_root='/home/nikishina/Graph-Bert/data/'):
    """Load a preprocessed Graph-Bert dataset through ``DatasetLoader``.

    Args:
        dataset_path: dataset name; also the sub-folder of ``data_root`` that is read.
        k: value assigned to the loader's ``k`` attribute.
        device: value assigned to the loader's ``device`` attribute.
        data_root: folder that contains the dataset sub-folders. The default is the
            location that used to be hard-coded here; pass the ``data`` folder of
            your own checkout when running on another machine.

    Returns:
        Whatever ``DatasetLoader.load()`` returns.
    """
    data_obj = DatasetLoader()
    # The trailing '' keeps the trailing separator the loader was given before.
    data_obj.dataset_source_folder_path = os.path.join(data_root, dataset_path, '')
    data_obj.dataset_name = dataset_path
    data_obj.k = k
    data_obj.device = device
    data_obj.load_all_tag = True
    return data_obj.load()


def get_query_embedding(word, final_embeddings, index_id_map):
    """Return the embedding of the WordNet synset ``word``.

    Args:
        word: synset name accepted by ``wn.synset`` (e.g. ``"guy.n.01"``).
        final_embeddings: container indexed by the keys of ``index_id_map``.
        index_id_map: mapping from an index into ``final_embeddings`` to a value
            that is compared against the synset's WordNet offset.

    Returns:
        ``final_embeddings[index]`` for the first index whose mapped value equals the offset.

    Raises:
        KeyError: if no entry of ``index_id_map`` matches the synset's offset.
    """
    offset = wn.synset(word).offset()

    for index, synset_offset in index_id_map.items():
        if synset_offset == offset:
            return final_embeddings[index]

    # Fail loudly instead of indexing the embeddings with None.
    raise KeyError(f"No entry with WordNet offset {offset} (synset {word!r}) in index_id_map")

ModuleNotFoundError: ignored

In [None]:
class GraphBERTEmbeddingsSaver:
    """Run a pre-trained Graph-Bert model over every node and write the embeddings to text files."""

    def __init__(self, model_name, model, x_size=300, device='cpu', max_index=132, intermediate_size=32,
                 num_attention_heads=2, num_hidden_layers=2, y_size=0, residual_type='graph_raw', k=5, nfeature=300):
        """Build the model configuration and instantiate ``model`` in evaluation mode.

        Args:
            model_name: checkpoint location relative to ``./result/PreTrained_GraphBert/``.
            model: model class; it is called as ``model(config, pretrained_path, device=device)``.
            x_size, y_size, residual_type, k, num_attention_heads, num_hidden_layers:
                forwarded unchanged to ``GraphBertConfig``.
            device: forwarded to the model constructor.
            max_index: used for all three of ``max_wl_role_index``, ``max_hop_dis_index``
                and ``max_inti_pos_index``.
            intermediate_size: used for both ``hidden_size`` and ``intermediate_size``.
            nfeature: number of columns of the embedding matrix filled by ``compute_embeddings``.
        """
        pretrained_path = './result/PreTrained_GraphBert/' + model_name
        bert_config = GraphBertConfig(residual_type=residual_type, k=k, x_size=x_size, y_size=y_size,
                                      hidden_size=intermediate_size, intermediate_size=intermediate_size,
                                      num_attention_heads=num_attention_heads, num_hidden_layers=num_hidden_layers,
                                      max_wl_role_index=max_index, max_hop_dis_index=max_index,
                                      max_inti_pos_index=max_index)

        self.model = model(bert_config, pretrained_path, device=device)
        self.model.eval()
        self.nfeature = nfeature

    def compute_and_save_embeddings(self, data, test_synsets, index_id_map, id2label, result_dir):
        """Compute the embeddings of all nodes and write them to ``result_dir``.

        See ``compute_embeddings`` and ``save_embeddings`` for the meaning of the arguments.
        """
        final_embeddings = self.compute_embeddings(data, index_id_map, id2label)
        self.save_embeddings(test_synsets, final_embeddings, result_dir)

    def compute_embeddings(self, data, index_id_map, id2label):
        """Run the model node by node and return ``{label: embedding}``.

        Args:
            data: four per-node sequences, in the order in which they are handed to the
                model: raw features, ``wl``, ``init`` and ``hop`` inputs.
            index_id_map: its keys are used as row numbers of the embedding matrix, so
                they are presumably ``0 .. len(index_id_map) - 1`` — TODO confirm.
            id2label: maps the values of ``index_id_map`` to the labels used as dict keys.
        """
        final_embeddings = np.zeros(shape=(len(index_id_map), self.nfeature), dtype=np.float32)

        # Inference only: no_grad avoids building an autograd graph for every node.
        with torch.no_grad():
            for _index, raw_f, wl, init, hop in zip(index_id_map, *data):
                output = self.model(raw_f.unsqueeze(0), wl.unsqueeze(0), init.unsqueeze(0), hop.unsqueeze(0))[0]
                final_embeddings[_index, :] = np.array(output.cpu().detach())
        return self.get_embeddings_dict(final_embeddings, index_id_map, id2label)

    @staticmethod
    def get_embeddings_dict(embeddings, index2id_map, id2label):
        """Re-key ``embeddings`` by label.

        ``index2id_map`` maps a row of ``embeddings`` to a node id, and ``id2label`` maps
        that node id to the label used as the key of the returned dict.
        """
        return {id2label[node_id]: embeddings[row] for row, node_id in index2id_map.items()}

    def save_embeddings(self, test_synsets, embeddings, result_dir):
        """Write ``embeddings`` as ``<name> <v1> <v2> ...`` lines into two files in ``result_dir``.

        Names contained in ``test_synsets`` go to ``<ModelClass>_model_test_embeddings.txt``,
        all others to ``<ModelClass>_model_train_embeddings.txt``. No header line is written.
        """
        prefix = self.model.__class__.__name__
        train_path = os.path.join(result_dir, f"{prefix}_model_train_embeddings.txt")
        test_path = os.path.join(result_dir, f"{prefix}_model_test_embeddings.txt")
        with open(train_path, 'w') as train_out, open(test_path, 'w') as test_out:
            for synset_name, embedding in embeddings.items():
                text_embedding = " ".join(str(e) for e in embedding)
                target = test_out if synset_name in test_synsets else train_out
                target.write(f"{synset_name} {text_embedding}\n")

In [None]:
loaded_data = load_data('wordnet_n_is_directed_1_en_synsets_2.0', 5, 'cpu')
# The tuple order must match how GraphBERTEmbeddingsSaver.compute_embeddings unpacks it
# (raw_f, wl, init, hop): the intimacy ('int') inputs come before the hop inputs.
dataset = (loaded_data['raw_embeddings'], loaded_data['wl_embedding'], loaded_data['int_embeddings'],
           loaded_data['hop_embeddings'])

In [None]:
index_id_map = loaded_data['index_id_map']

In [None]:
# Read the node file as a table of strings and map its first column (parsed as int) to its last column.
# NOTE(review): the path is an absolute location on the original author's machine — adjust before running elsewhere.
idx_features_labels = np.genfromtxt("{}/node".format('/home/nikishina/Graph-Bert/data/wordnet_n_is_directed_1_en_synsets_2.0/'), dtype=np.dtype(str))
id2label = {int(i): j for i, j in zip(idx_features_labels[:, 0], idx_features_labels[:, -1])}

In [None]:
# Embeddings from the node-reconstruction checkpoint, written to the parent directory.
# NOTE(review): `new_words` is not defined in any visible cell — presumably the collection of
# unseen (test) names; define it before running this cell.
saver = GraphBERTEmbeddingsSaver('wordnet_n_is_directed_1_en_synsets_2.0/node_reconstruct_model', MethodGraphBertNodeConstruct)
saver.compute_and_save_embeddings(dataset, new_words, index_id_map, id2label, "../")

In [None]:
# Embeddings from the graph-recovery checkpoint, written to the parent directory.
# NOTE(review): `new_words` is not defined in any visible cell — presumably the collection of
# unseen (test) names; define it before running this cell.
saver = GraphBERTEmbeddingsSaver('wordnet_n_is_directed_1_en_synsets_2.0/node_graph_reconstruct_model', MethodGraphBertGraphRecovery)
saver.compute_and_save_embeddings(dataset, new_words, index_id_map, id2label, "../")

## View and evaluate results

In [None]:
!gdown 1IAfd9tRgtVtdosM5vuDdxh-VSBFp3mzI
!gdown 1LItbxEcchOfU4TrlLBZjQweC8jpQ3b3Q
!gdown 1VLLLyu9YyLX3uCojiTm_VLtgK2gKCCfW
!gdown 1h5sSbFeCJbouH96fKIZDF2xugNiKf3La

In [None]:
from gensim.models import KeyedVectors

In [None]:
# Load the train / test embedding files as gensim KeyedVectors.
# NOTE(review): load_word2vec_format presumably expects a "<count> <dim>" header line, which
# GraphBERTEmbeddingsSaver.save_embeddings does not write — confirm the downloaded files carry one.
graphBertNode_train = KeyedVectors.load_word2vec_format("MethodGraphBertNodeConstruct_model_train_embeddings_.txt")
graphBertNode_test = KeyedVectors.load_word2vec_format("MethodGraphBertNodeConstruct_model_test_embeddings.txt")

In [None]:
graphBertNode_train.similar_by_word("chocolate_milk.n.01")

In [None]:
# For every fasttext key without ".n." in its name, look up its vector in the test embeddings
# and store the result of similar_by_vector over the train embeddings.
graphbert_node_predicts = {}

for word in fasttext.key_to_index:
    if ".n." not in word:
        graphbert_node_predicts[word] = graphBertNode_train.similar_by_vector(graphBertNode_test[word])