In this notebook we prepare Explanation dataset


*   first use sanity test for creating small dataset
*   second  preprocess dataset
*   third  use sbert to create embedding for nodes and edges




In [1]:
import re
import os
import torch
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from tqdm import tqdm
from torch_geometric.data.data import Data
import torch
from torch import nn
import torch.nn.functional as F
from transformers import AutoModel, AutoTokenizer
from torch.utils.data import DataLoader
import numpy as np
import glob #finding files and directories whose names match a specified pattern.

Original Dataset

In [2]:
path = '/home/ahmadi/sadaf/GraphNeighborLM/G-retriever/datasets/Explanation_graph'
original_dataset = pd.read_csv(f'{path}/train_dev.tsv', sep='\t')

original_dataset.head()

Unnamed: 0,arg1,arg2,label,graph
0,Cannabis should be legal.,It's not a bad thing to make marijuana more av...,support,(cannabis; synonym of; marijuana)(legal; cause...
1,Women should not be in combat.,Women and men have the same rights.,counter,(women and men; is a; citizens)(citizens; caus...
2,People will use marijuana independent of its l...,People use marijuana everywhere now.,support,(marijuana; receives action; popular)(popular;...
3,women should not partake in war,the armed forces are more open to recruiting w...,counter,(armed forces; desires; nurses and helpers)(nu...
4,Marijuana should not be legalized.,Marijuana is dangerous for society.,support,(marijuana; is a; recreational drug)(recreatio...


In [3]:
print("The the shape of original_dataset :",original_dataset.shape)

The the shape of original_dataset : (2766, 4)


In [4]:
print("The first original graph is :", original_dataset.graph.iloc[0])

The first original graph is : (cannabis; synonym of; marijuana)(legal; causes; more available)(marijuana; capable of; good thing)(good thing; desires; legal)


This graph contains 4 triplets

# ***Saniti Test ***

The **Sanity Test** is a quick, basic check to verify that the code or functionality works as expected in its simplest form, ensuring there are no critical issues before deeper testing.

In [5]:
seed = 0
percent_data = 0.1
dataset_sample = original_dataset.copy(deep=True) # 10% of the dataset to be reserved as the main dataset for this thesis
X_train, dataset = train_test_split(dataset_sample, test_size = percent_data, random_state = seed)

In [6]:
print("The the shape of dataset :",dataset.shape)

The the shape of dataset : (277, 4)


In [7]:
dataset.head()

Unnamed: 0,arg1,arg2,label,graph
1436,Organ acquisition in the market makes it easie...,Sale of organ in the market makes it easy to g...,support,(organ acquisition; capable of; more organs av...
817,the three strikes law is not fair.,The three strikes law keeps people safe.,counter,(three strikes law; capable of; keeps people s...
2708,Cloning is inherently decreasing quality,Getting your original out of the copier and pu...,support,(cloning; synonym of; copy)(copy; capable of; ...
2199,Three-strike laws help reduce crime rates.,To say that three-strike laws reduce crime rat...,counter,(three-strike laws; not capable of; assist)(as...
1074,"Telemarketing is fast, safe and reliable.",Telemarketing helps business advertise and get...,support,(telemarketing; capable of; secure)(secure; ha...


In [8]:
dataset.to_csv("/home/ahmadi/sadaf/GraphNeighborLM/G-retriever/datasets/Explanation_graph/sample_train_dev.tsv", index=False)
dataset=pd.read_csv("/home/ahmadi/sadaf/GraphNeighborLM/G-retriever/datasets/Explanation_graph/sample_train_dev.tsv")

Save indices of train, val, test in txt file

In [9]:
def generate_split(num_nodes, path):

    # Split the dataset into train, val, and test sets
    indices = np.arange(num_nodes)

    # Make a small training set 60%, a validation set 20%, and a test set 20%
    train_indices, temp_data = train_test_split(indices, test_size=0.4, random_state=42)
    val_indices, test_indices = train_test_split(temp_data, test_size=0.5, random_state=42)
    print("# train samples: ", len(train_indices))
    print("# val samples: ", len(val_indices))
    print("# test samples: ", len(test_indices))

    # Create a folder for the split
    os.makedirs(path, exist_ok=True)

    # Save the indices to separate files
    with open(f'{path}/train_indices.txt', 'w') as file:
        file.write('\n'.join(map(str, train_indices)))

    with open(f'{path}/val_indices.txt', 'w') as file:
        file.write('\n'.join(map(str, val_indices)))

    with open(f'{path}/test_indices.txt', 'w') as file:
        file.write('\n'.join(map(str, test_indices)))

Extracting nodes and edges from graph in dataset.

In [10]:
def textualize_graph(graph):
    triplets = re.findall(r'\((.*?)\)', graph) # find all substrings in a string graph that are enclosed within parentheses ().
    nodes = {}
    edges = []
    for tri in triplets:
        src, edeg_attr, dst = tri.split(';')
        src = src.lower().strip()
        dst = dst.lower().strip()
        if src not in nodes:
            nodes[src] = len(nodes)
        if dst not in nodes:
            nodes[dst] = len(nodes)
        edges.append({'src': nodes[src], 'edge_attr': edeg_attr.lower().strip(), 'dst': nodes[dst], })

    nodes = pd.DataFrame(nodes.items(), columns=['node_attr', 'node_id'])
    edges = pd.DataFrame(edges)
    return nodes, edges

Example of textualize_graph:

In [11]:
textual_graph=textualize_graph( '(cannabis; synonym of; marijuana)(legal; causes; more available)(marijuana; capable of; good thing)(good thing; desires; legal)')

In [12]:
print("The type of tesxtual_graph is ", type(textual_graph))

The type of tesxtual_graph is  <class 'tuple'>


In [13]:
textual_graph[0]

Unnamed: 0,node_attr,node_id
0,cannabis,0
1,marijuana,1
2,legal,2
3,more available,3
4,good thing,4


In [14]:
textual_graph[1]

Unnamed: 0,src,edge_attr,dst
0,0,synonym of,1
1,2,causes,3
2,1,capable of,4
3,4,desires,2


# ***Step 1 :*** Save nodes and edges in the csv file

In [15]:
def step_one():
    # generate textual graphs
    os.makedirs(f'{path}/nodes', exist_ok=True)
    os.makedirs(f'{path}/edges', exist_ok=True)

    for i, row in tqdm(dataset.iterrows(), total=len(dataset)):
        nodes, edges = textualize_graph(row['graph'])
        nodes.to_csv(f'{path}/nodes/{i}.csv', index=False, columns=['node_id', 'node_attr'])
        edges.to_csv(f'{path}/edges/{i}.csv', index=False, columns=['src', 'edge_attr', 'dst'])

In [32]:
#the nodes of the first row in dataset
pd.read_csv("/home/ahmadi/sadaf/GraphNeighborLM/G-retriever/datasets/Explanation_graph/nodes/1.csv")

Unnamed: 0,node_id,node_attr
0,0,three strikes law
1,1,keeps people safe
2,2,just
3,3,fair


In [33]:
#the edges of the first row in dataset
pd.read_csv("/home/ahmadi/sadaf/GraphNeighborLM/G-retriever/datasets/Explanation_graph/edges/1.csv")

Unnamed: 0,src,edge_attr,dst
0,0,capable of,1
1,1,is a,2
2,2,synonym of,3


# ***Step 2 :*** Encode the graph

Using sbert as encode

In [17]:
pretrained_repo = 'sentence-transformers/all-roberta-large-v1' # It maps sentences & paragraphs to a 1024 dimensional dense vector space
batch_size = 8  # Adjust the batch size as needed
model_name = 'sbert'

Prepration for SBERT Embedding

In [18]:
class Dataset(torch.utils.data.Dataset):
    def __init__(self, input_ids=None, attention_mask=None):
        super().__init__()
        self.data = {
            "input_ids": input_ids,
            "att_mask": attention_mask,
        }

    def __len__(self):
        return self.data["input_ids"].size(0)

    def __getitem__(self, index):
        if isinstance(index, torch.Tensor):
            index = index.item()
        batch_data = dict()
        for key in self.data.keys():
            if self.data[key] is not None:
                batch_data[key] = self.data[key][index]
        return batch_data


In [19]:
class Sentence_Transformer(nn.Module):

    def __init__(self, pretrained_repo):
        super(Sentence_Transformer, self).__init__()
        print(f"inherit model weights from {pretrained_repo}")
        self.bert_model = AutoModel.from_pretrained(pretrained_repo)

    def mean_pooling(self, model_output, attention_mask):
        token_embeddings = model_output[0]  # First element of model_output contains all token embeddings
        data_type = token_embeddings.dtype
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).to(data_type)
        return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)

    def forward(self, input_ids, att_mask):
        bert_out = self.bert_model(input_ids=input_ids, attention_mask=att_mask)
        sentence_embeddings = self.mean_pooling(bert_out, att_mask)

        sentence_embeddings = F.normalize(sentence_embeddings, p=2, dim=1)
        return sentence_embeddings


In [None]:
# #original code on hugging face for SBERT is:

# from transformers import AutoTokenizer, AutoModel
# import torch
# import torch.nn.functional as F

# #Mean Pooling - Take attention mask into account for correct averaging
# def mean_pooling(model_output, attention_mask):
#     token_embeddings = model_output[0] #First element of model_output contains all token embeddings
#     input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float() #Expands the attention_mask to match the size of token_embeddings
#     return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)

# ##EXAMPLE
# token_embeddings = torch.tensor([
#     [[0.1, 0.2], [0.3, 0.4], [0.5, 0.6], [0.7, 0.8]],  # sentence 1
#     [[0.9, 1.0], [1.1, 1.2], [1.3, 1.4], [0.0, 0.0]]   # sentence 2
# ])  # shape: (2, 4, 2)

# attention_mask = torch.tensor([
#     [1, 1, 1, 0],
#     [1, 1, 1, 0]
# ])

# input_mask_expanded = torch.tensor([
#     [[1, 1], [1, 1], [1, 1], [0, 0]],
#     [[1, 1], [1, 1], [1, 1], [0, 0]]
# ])
# weighted_embeddings = torch.tensor([
#     [[0.1, 0.2], [0.3, 0.4], [0.5, 0.6], [0.0, 0.0]],
#     [[0.9, 1.0], [1.1, 1.2], [1.3, 1.4], [0.0, 0.0]]
# ])

# summed_embeddings = torch.tensor([
#     [0.9, 1.2],
#     [3.3, 3.6]
# ])

# valid_token_counts = torch.tensor([
#     [3.0, 3.0],
#     [3.0, 3.0]
# ])

# mean_embeddings = summed_embeddings / valid_token_counts
# mean_embeddings = torch.tensor([
#     [0.3, 0.4],
#     [1.1, 1.2]
# ])








# # Sentences we want sentence embeddings for
# sentences = ['This is an example sentence', 'Each sentence is converted']

# # Load model from HuggingFace Hub
# tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/all-roberta-large-v1')
# model = AutoModel.from_pretrained('sentence-transformers/all-roberta-large-v1')

# # Tokenize sentences
#   #padding :Ensures all tokenized sequences have the same length by padding shorter sequences with special tokens (e.g., [PAD]).
#   #Truncates sequences that are longer than the model's maximum input length
#   # return_tensors :Converts the output into PyTorch tensors
# encoded_input = tokenizer(sentences, padding=True , truncation=True, return_tensors='pt')
# # {
# #     'input_ids': tensor([[101, 3452, 102, ...]]),  # Token IDs for the sentences
# #     'attention_mask': tensor([[1, 1, 1, ...]])    # Mask to distinguish real tokens from padding
# # }


# # Compute token embeddings
# with torch.no_grad():
#     model_output = model(**encoded_input) #convert to argument
#     # output: (batch_size, sequence_length, hidden_size), others
#     # batch_size : number of sentences
#     # sequence_length : number of token in each sentence
#     # hidden_size : number of feature for each token


# # Perform pooling
# sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])

# # Normalize embeddings
# sentence_embeddings = F.normalize(sentence_embeddings, p=2, dim=1) #L2 norm : p=2, dim=1 means normalization is applied to each row (sentence embedding) independently.

# print("Sentence embeddings:")
# print(sentence_embeddings)


In [20]:
def load_sbert():

    model = Sentence_Transformer(pretrained_repo)
    tokenizer = AutoTokenizer.from_pretrained(pretrained_repo)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    model.eval()
    return model, tokenizer, device

In [21]:
def sber_text2embedding(model, tokenizer, device, text):
    try:
        encoding = tokenizer(text, padding=True, truncation=True, return_tensors='pt')
        dataset = Dataset(input_ids=encoding.input_ids, attention_mask=encoding.attention_mask)

        # DataLoader
        dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=False)

        # Placeholder for storing the embeddings
        all_embeddings = []

        # Iterate through batches
        with torch.no_grad():

            for batch in dataloader:
                # Move batch to the appropriate device
                batch = {key: value.to(device) for key, value in batch.items()}

                # Forward pass
                embeddings = model(input_ids=batch["input_ids"], att_mask=batch["att_mask"])

                # Append the embeddings to the list
                all_embeddings.append(embeddings)

        # Concatenate the embeddings from all batches
        all_embeddings = torch.cat(all_embeddings, dim=0).cpu()

    except:
        return torch.zeros((0, 1024))

    return all_embeddings



In [22]:
load_model = {
    'sbert': load_sbert
}



load_text2embedding = {
    'sbert': sber_text2embedding
}

In [23]:
def step_two():

    def _encode_graph():
        print('Encoding graphs...')
        os.makedirs(f'{path}/graphs', exist_ok=True)
        # Set paths to nodes and edges directories
        nodes_path = f'{path}/nodes'
        edges_path = f'{path}/edges'

        # Get all CSV files in the directories
        node_files = glob.glob(f'{nodes_path}/*.csv')
        edge_files = glob.glob(f'{edges_path}/*.csv')

        # Sorting ensures that the order of node_files and edge_files corresponds correctly
        node_files.sort()
        edge_files.sort()

        for i in tqdm(range(len(dataset))):
            nodes = pd.read_csv(node_files[i])
            edges = pd.read_csv(edge_files[i])

            x = text2embedding(model, tokenizer, device, nodes.node_attr.tolist())
            e = text2embedding(model, tokenizer, device, edges.edge_attr.tolist())
            edge_index = torch.LongTensor([edges.src, edges.dst])
            data = Data(x=x, edge_index=edge_index, edge_attr=e, num_nodes=len(nodes))
            torch.save(data, f'{path}/graphs/{i}.pt')

    model, tokenizer, device = load_model[model_name]()
    text2embedding = load_text2embedding[model_name]

    _encode_graph()


if __name__ == '__main__':
    step_one()
    step_two()
    generate_split(len(dataset), f'{path}/split')

  6%|▌         | 17/277 [00:00<00:01, 165.33it/s]

100%|██████████| 277/277 [00:00<00:00, 280.12it/s]


inherit model weights from sentence-transformers/all-roberta-large-v1


config.json:   0%|          | 0.00/650 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/328 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

Encoding graphs...


100%|██████████| 277/277 [00:34<00:00,  8.13it/s]

# train samples:  166
# val samples:  55
# test samples:  56






content of graph.pt

In [24]:
# Define the path to the .pt file
file_path = f'{path}/graphs/{10}.pt'

# Load the data
data = torch.load(file_path)

# Check the type of the loaded data
print(f'Type of the loaded data: {type(data)}')

# Print the content of the loaded data
print(data)

# If data is an instance of torch_geometric.data.Dat
# a, you can access its attributes
if isinstance(data, Data):
    print("Node Features (x):", data.x)
    print("Edge Index (edge_index):", data.edge_index)
    print("Edge Attributes (edge_attr):", data.edge_attr)
    print("Number of Nodes (num_nodes):", data.num_nodes)
    print("Additional attributes:", data.keys)
else:
    print("The loaded data is not a PyTorch Geometric Data object.")


Type of the loaded data: <class 'torch_geometric.data.data.Data'>
Data(x=[8, 1024], edge_index=[2, 7], edge_attr=[7, 1024], num_nodes=8)
Node Features (x): tensor([[ 0.0072, -0.0548,  0.0139,  ...,  0.0123, -0.0234, -0.0507],
        [-0.0194, -0.0667,  0.0295,  ...,  0.0396,  0.0123, -0.0259],
        [-0.0468,  0.0213,  0.0375,  ...,  0.0281, -0.0228, -0.0524],
        ...,
        [-0.0076,  0.0213, -0.0130,  ..., -0.0112, -0.0575, -0.0248],
        [-0.0073,  0.0176, -0.0072,  ...,  0.0116, -0.0444, -0.0390],
        [-0.0101,  0.0828,  0.0038,  ..., -0.0087,  0.0113, -0.0370]])
Edge Index (edge_index): tensor([[0, 1, 2, 3, 4, 5, 7],
        [1, 2, 3, 4, 5, 6, 0]])
Edge Attributes (edge_attr): tensor([[ 0.0039, -0.0208,  0.0201,  ..., -0.0440, -0.0128, -0.0602],
        [ 0.0077,  0.0517,  0.0018,  ...,  0.0248,  0.0030,  0.0017],
        [ 0.0077,  0.0517,  0.0018,  ...,  0.0248,  0.0030,  0.0017],
        ...,
        [-0.0206,  0.0311,  0.0329,  ...,  0.0276, -0.0026,  0.0001],


  data = torch.load(file_path)


In [25]:
data

Data(x=[8, 1024], edge_index=[2, 7], edge_attr=[7, 1024], num_nodes=8)

In [28]:
import json
import pandas as pd
import torch
from torch.utils.data import Dataset


PATH = '/home/ahmadi/sadaf/GraphNeighborLM/G-retriever/datasets/Explanation_graph'


class ExplaGraphsDataset(Dataset):
    def __init__(self):
        super().__init__()

        self.text = pd.read_csv(f'{PATH}/sample_train_dev.tsv', sep=',')
        self.prompt = 'Question: Do argument 1 and argument 2 support or counter each other? Answer in one word in the form of \'support\' or \'counter\'.\n\nAnswer:'
        self.graph = None
        self.graph_type = 'Explanation Graph'

    def __len__(self):
        """Return the len of the dataset."""
        return len(self.text)

    def __getitem__(self, index):

        text = self.text.iloc[index]
        graph = torch.load(f'{PATH}/graphs/{index}.pt')
        question = f'Argument 1: {text.arg1}\nArgument 2: {text.arg2}\n{self.prompt}'
        nodes = pd.read_csv(f'{PATH}/nodes/{index}.csv')
        edges = pd.read_csv(f'{PATH}/edges/{index}.csv')
        desc = nodes.to_csv(index=False)+'\n'+edges.to_csv(index=False)

        return {
            'id': index,
            'label': text['label'],
            'desc': desc,
            'graph': graph,
            'question': question,
        }

    def get_idx_split(self):

        # Load the saved indices
        with open(f'{PATH}/split/train_indices.txt', 'r') as file:
            train_indices = [int(line.strip()) for line in file]

        with open(f'{PATH}/split/val_indices.txt', 'r') as file:
            val_indices = [int(line.strip()) for line in file]

        with open(f'{PATH}/split/test_indices.txt', 'r') as file:
            test_indices = [int(line.strip()) for line in file]

        return {'train': train_indices, 'val': val_indices, 'test': test_indices}




In [29]:
nodes = pd.read_csv(f'{PATH}/nodes/{1}.csv')
edges = pd.read_csv(f'{PATH}/edges/{1}.csv')
desc = nodes.to_csv(index=False)+'\n'+edges.to_csv(index=False)
desc

'node_id,node_attr\n0,three strikes law\n1,keeps people safe\n2,just\n3,fair\n\nsrc,edge_attr,dst\n0,capable of,1\n1,is a,2\n2,synonym of,3\n'

In [30]:
dataset = ExplaGraphsDataset()
dataset[1]

  graph = torch.load(f'{PATH}/graphs/{index}.pt')


{'id': 1,
 'label': 'counter',
 'desc': 'node_id,node_attr\n0,three strikes law\n1,keeps people safe\n2,just\n3,fair\n\nsrc,edge_attr,dst\n0,capable of,1\n1,is a,2\n2,synonym of,3\n',
 'graph': Data(x=[4, 1024], edge_index=[2, 3], edge_attr=[3, 1024], num_nodes=4),
 'question': "Argument 1: the three strikes law is not fair.\nArgument 2: The three strikes law keeps people safe.\nQuestion: Do argument 1 and argument 2 support or counter each other? Answer in one word in the form of 'support' or 'counter'.\n\nAnswer:"}

In [31]:
if __name__ == '__main__':
    dataset = ExplaGraphsDataset()

    print(dataset.prompt)

    data = dataset[1]
    for k, v in data.items():
        print(f'{k}: {v}')

    split_ids = dataset.get_idx_split()
    for k, v in split_ids.items():
        print(f'# {k}: {len(v)}')


Question: Do argument 1 and argument 2 support or counter each other? Answer in one word in the form of 'support' or 'counter'.

Answer:
id: 1
label: counter
desc: node_id,node_attr
0,three strikes law
1,keeps people safe
2,just
3,fair

src,edge_attr,dst
0,capable of,1
1,is a,2
2,synonym of,3

graph: Data(x=[4, 1024], edge_index=[2, 3], edge_attr=[3, 1024], num_nodes=4)
question: Argument 1: the three strikes law is not fair.
Argument 2: The three strikes law keeps people safe.
Question: Do argument 1 and argument 2 support or counter each other? Answer in one word in the form of 'support' or 'counter'.

Answer:
# train: 166
# val: 55
# test: 56


  graph = torch.load(f'{PATH}/graphs/{index}.pt')
