In [1]:
import sys

sys.path.append("/workspace/kbqa/")  # go to parent dir

### Getting our dataset

In [2]:
import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cpu')

In [2]:
import ujson
import jsonlines
import networkx as nx
import pandas as pd
from tqdm import tqdm

from pathlib import Path

In [3]:
with_candidate = True

In [4]:
dataset = "MINTAKA"
dataset_type = "mintaka_test_labeled"
subgraph_path = (
    f"/workspace/storage/new_subgraph_dataset/{dataset}/{dataset_type}.jsonl"
)
dataset_dir_path = f"./subgraph_classify_dataset/{dataset}/candidates_{with_candidate}"
# Path(dataset_dir_path).mkdir(parents=True, exist_ok=True)
dataset_path = f"{dataset_dir_path}/{dataset_type}.csv"

In [10]:
dataset_path

'./subgraph_classify_dataset/MINTAKA/candidates_True/mintaka_test_labeled.csv'

In [11]:
jsonl_reader = jsonlines.open(subgraph_path)
jsonl_reader_list = list(jsonl_reader)
nx_graphs = []
df = []
for idx, line in tqdm(enumerate(jsonl_reader_list)):
    df.append(line)
df = pd.DataFrame(df)

28325it [00:00, 1987647.20it/s]


In [31]:
def get_node_names(subgraph):
    node_names = [subgraph.nodes[node]["label"] for node in subgraph.nodes()]
    return node_names

In [17]:
def graph_to_sequence(subgraph, node_names):
    # getting adjency matrix and weight info
    adj_matrix = nx.adjacency_matrix(subgraph).todense().tolist()
    edge_data = subgraph.edges.data()

    # adding our edge info
    for edge in edge_data:
        i, j, data = edge
        i, j = int(i), int(j)
        adj_matrix[i][j] = data["label"]

    sequence = []
    # for adjency matrix, i, j means node i -> j
    for i, row in enumerate(adj_matrix):
        from_node = node_names[i]  # from node (node i)
        for j, edge_info in enumerate(row):
            to_node = node_names[j]
            if edge_info == 0:  # no endge from_node -> to_node
                # sequence.extend([from_node, "None", to_node])
                pass
            else:
                sequence.extend([from_node, edge_info, to_node])
    sequence = ",".join(str(node) for node in sequence)
    return sequence

#### Getting our subgraphs sequences

In [306]:
graphs = list(df["graph"])
graph_seq = []
for graph in tqdm(graphs):
    graph_obj = nx.readwrite.json_graph.node_link_graph(graph)
    try:
        graph_node_names = get_node_names(graph_obj)
        curr_seq = graph_to_sequence(graph_obj, graph_node_names)
    except KeyError:
        print("ERROR NO LABEL!")
        curr_seq = "ERROR_NO_LABEL"
    except nx.NetworkXError:
        print("ERROR EMPTY GRAPHS!")
        curr_seq = "ERROR_EMPTY_GRAPH"
    graph_seq.append(curr_seq)

  9%|▉         | 1339/14286 [00:00<00:02, 4681.69it/s]

ERROR EMPTY GRAPHS!
ERROR EMPTY GRAPHS!
ERROR EMPTY GRAPHS!
ERROR EMPTY GRAPHS!
ERROR EMPTY GRAPHS!
ERROR EMPTY GRAPHS!


 27%|██▋       | 3853/14286 [00:00<00:02, 5006.68it/s]

ERROR EMPTY GRAPHS!
ERROR EMPTY GRAPHS!
ERROR EMPTY GRAPHS!
ERROR EMPTY GRAPHS!
ERROR EMPTY GRAPHS!
ERROR EMPTY GRAPHS!


 70%|███████   | 10013/14286 [00:02<00:00, 5110.25it/s]

ERROR EMPTY GRAPHS!
ERROR EMPTY GRAPHS!
ERROR EMPTY GRAPHS!
ERROR EMPTY GRAPHS!
ERROR EMPTY GRAPHS!
ERROR EMPTY GRAPHS!


 92%|█████████▏| 13201/14286 [00:02<00:00, 5320.71it/s]

ERROR EMPTY GRAPHS!
ERROR EMPTY GRAPHS!
ERROR EMPTY GRAPHS!
ERROR EMPTY GRAPHS!
ERROR EMPTY GRAPHS!
ERROR EMPTY GRAPHS!
ERROR EMPTY GRAPHS!
ERROR EMPTY GRAPHS!
ERROR EMPTY GRAPHS!
ERROR EMPTY GRAPHS!
ERROR EMPTY GRAPHS!
ERROR EMPTY GRAPHS!
ERROR EMPTY GRAPHS!


100%|██████████| 14286/14286 [00:02<00:00, 5060.60it/s]

ERROR EMPTY GRAPHS!
ERROR EMPTY GRAPHS!
ERROR EMPTY GRAPHS!
ERROR EMPTY GRAPHS!
ERROR EMPTY GRAPHS!





In [307]:
# add graph sequence and correct label to our df
df["graph_sequence"] = graph_seq
df["correct"] = df["groundTruthAnswerEntity"] == df["answerEntity"]
df.head()

Unnamed: 0,id,question,answerEntity,questionEntity,groundTruthAnswerEntity,complexityType,graph,graph_sequence,correct
0,9ace9041,What is the fourth book in the Twilight series?,[Q189378],[Q44523],[Q53945],ordinal,"{'directed': True, 'multigraph': False, 'graph...","Twilight,has part(s),Twilight,Twilight,part of...",False
1,9ace9041,What is the fourth book in the Twilight series?,[Q19765983],[Q44523],[Q53945],ordinal,"{'directed': True, 'multigraph': False, 'graph...","Twilight,genre,romantic fiction",False
2,9ace9041,What is the fourth book in the Twilight series?,[Q111019576],[Q44523],[Q53945],ordinal,"{'directed': True, 'multigraph': False, 'graph...","Twilight,genre,vampire fiction",False
3,9ace9041,What is the fourth book in the Twilight series?,[Q849907],[Q44523],[Q53945],ordinal,"{'directed': True, 'multigraph': False, 'graph...","Twilight,has part(s),The Short Second Life of ...",False
4,9ace9041,What is the fourth book in the Twilight series?,[Q53945],[Q44523],[Q53945],ordinal,"{'directed': True, 'multigraph': False, 'graph...","Twilight,has part(s),Breaking Dawn,Breaking Da...",True


In [308]:
# getting the error df and save it
error_df = df[
    (df["graph_sequence"] == "ERROR_EMPTY_GRAPH")
    | (df["graph_sequence"] == "ERROR_NO_LABEL")
]
error_df_path = ".".join(dataset_path.split(".")[:-1])
error_df.to_csv(f"{error_df_path}_ERROR.csv", index=False)

In [309]:
# deleting our error from main data
df = df.drop(error_df.index)
df

Unnamed: 0,id,question,answerEntity,questionEntity,groundTruthAnswerEntity,complexityType,graph,graph_sequence,correct
0,9ace9041,What is the fourth book in the Twilight series?,[Q189378],[Q44523],[Q53945],ordinal,"{'directed': True, 'multigraph': False, 'graph...","Twilight,has part(s),Twilight,Twilight,part of...",False
1,9ace9041,What is the fourth book in the Twilight series?,[Q19765983],[Q44523],[Q53945],ordinal,"{'directed': True, 'multigraph': False, 'graph...","Twilight,genre,romantic fiction",False
2,9ace9041,What is the fourth book in the Twilight series?,[Q111019576],[Q44523],[Q53945],ordinal,"{'directed': True, 'multigraph': False, 'graph...","Twilight,genre,vampire fiction",False
3,9ace9041,What is the fourth book in the Twilight series?,[Q849907],[Q44523],[Q53945],ordinal,"{'directed': True, 'multigraph': False, 'graph...","Twilight,has part(s),The Short Second Life of ...",False
4,9ace9041,What is the fourth book in the Twilight series?,[Q53945],[Q44523],[Q53945],ordinal,"{'directed': True, 'multigraph': False, 'graph...","Twilight,has part(s),Breaking Dawn,Breaking Da...",True
...,...,...,...,...,...,...,...,...,...
14281,3e711adc,What is the first Discworld book about the Cit...,[Q178869],"[Q3320632, Q253295]",[Q2078564],ordinal,"{'directed': True, 'multigraph': False, 'graph...","Mort,instance of,literary work,Mort,narrative ...",False
14282,2652ec2b,Which award did the first puzzle game by Firep...,[Q78762377],"[Q546692, Q15070334]",[Q16969671],multihop,"{'directed': True, 'multigraph': False, 'graph...","United States of America,country,United States...",False
14283,650ed632,Who was the last prime minister during Queen V...,[Q8016],[Q9439],[Q243705],ordinal,"{'directed': True, 'multigraph': False, 'graph...","Victoria,on focus list of Wikimedia project,Wi...",False
14284,3e711adc,What is the first Discworld book about the Cit...,[Q2383911],"[Q3320632, Q253295]",[Q2078564],ordinal,"{'directed': True, 'multigraph': False, 'graph...","Discworld,creator,Terry Pratchett,Thud!,author...",False


In [310]:
# saving our final dataframe
df.to_csv(dataset_path, index=False)

### Building out Dataset

In [5]:
import pandas as pd
import torch
import numpy as np
from transformers import RobertaTokenizer, BertTokenizer, BertModel, RobertaModel
from torch import nn
from torch.optim import Adam
from tqdm import tqdm

In [6]:
model_type = "roberta-base"
bs = 16 if model_type == "roberta-base" or model_type == "bert-base-cased" else 8

In [28]:
data_path = "/workspace/storage/subgraph_classify_dataset/SQWD/candidates_True"
dataset_type = "sqwd"

In [29]:
train_df = pd.read_csv(f"{data_path}/{dataset_type}_train_labeled.csv")
val_df = pd.read_csv(f"{data_path}/{dataset_type}_validation_labeled.csv")
test_df = pd.read_csv(f"{data_path}/{dataset_type}_test_labeled.csv")

In [30]:
len(test_df)

49953

In [31]:
tokenizer = RobertaTokenizer.from_pretrained(model_type)

In [32]:
def preproc_text(graph_seqs, graph_labels):
    "tokenize our graph sequnces, skip if empty graph"
    labels_dict = {True: 1, False: 0}
    labels = []
    texts = []

    for text, label in tqdm(zip(graph_seqs, graph_labels)):
        if pd.isna(text):  # empty graph
            continue
        texts.append(
            tokenizer(
                text,
                padding="max_length",
                max_length=512,
                truncation=True,
                return_tensors="pt",
            )
        )
        labels.append(labels_dict[label])
    return texts, labels

In [33]:
# preprocessing out texts and label
train_text, train_label = preproc_text(
    train_df["graph_sequence"].tolist(), train_df["correct"].tolist()
)
val_text, val_label = preproc_text(
    val_df["graph_sequence"].tolist(), val_df["correct"].tolist()
)
test_text, test_label = preproc_text(
    test_df["graph_sequence"].tolist(), test_df["correct"].tolist()
)

107299it [00:35, 3064.88it/s]
24520it [00:07, 3170.37it/s]
49953it [00:16, 3062.74it/s]


In [34]:
from torch.utils.data.sampler import WeightedRandomSampler


def create_sampler(target):
    class_sample_count = np.array(
        [len(np.where(target == t)[0]) for t in np.unique(target)]
    )
    weight = 1.0 / class_sample_count
    samples_weight = np.array([weight[t] for t in target])

    samples_weight = torch.from_numpy(samples_weight)
    samples_weigth = samples_weight.double()
    sampler = WeightedRandomSampler(samples_weight, len(samples_weight))

    return sampler

In [35]:
class Dataset(torch.utils.data.Dataset):
    def __init__(self, labels, texts):
        self.labels = labels
        self.texts = texts

    def classes(self):
        return self.labels

    def __len__(self):
        return len(self.labels)

    def get_batch_labels(self, idx):
        # Fetch a batch of labels
        return self.labels[idx]

    def get_batch_texts(self, idx):
        # Fetch a batch of inputs
        return self.texts[idx]

    def __getitem__(self, idx):

        batch_texts = self.get_batch_texts(idx)
        batch_y = self.get_batch_labels(idx)

        return batch_texts, batch_y

In [36]:
# creating our sampler for train, val and test
train_sampler = create_sampler(train_label)
val_sampler = create_sampler(val_label)

In [37]:
# creating our loaders
train_dataset = Dataset(train_text, train_label)
train_loader = torch.utils.data.DataLoader(
    train_dataset, batch_size=bs, sampler=train_sampler
)

val_dataset = Dataset(val_text, val_label)
val_loader = torch.utils.data.DataLoader(
    val_dataset, batch_size=bs, sampler=val_sampler
)

test_dataset = Dataset(test_text, test_label)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=bs)

### Model & Training

In [19]:
class BertClassifier(nn.Module):
    def __init__(self, dropout=0.5):
        super(BertClassifier, self).__init__()
        if model_type == "roberta-large" or model_type == "roberta-case":
            self.model = RobertaModel.from_pretrained(model_type)
        else:
            self.model = BertModel.from_pretrained(model_type)
        self.dropout = nn.Dropout(dropout)
        if model_type == "bert-large-cased" or model_type == "roberta-large":
            dim = 1024
        else:
            dim = 768
        self.linear = nn.Linear(dim, 2)
        self.relu = nn.ReLU()

    def forward(self, input_id, mask):
        _, pooled_output = self.model(
            input_ids=input_id, attention_mask=mask, return_dict=False
        )
        dropout_output = self.dropout(pooled_output)
        linear_output = self.linear(dropout_output)
        final_layer = self.relu(linear_output)

        return final_layer

In [39]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = BertClassifier().to(device)
criterion = nn.CrossEntropyLoss()
optimizer = Adam(model.parameters(), lr=1e-6)

Some weights of the model checkpoint at roberta-large were not used when initializing RobertaModel: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [18]:
def epoch_train(model, train_dataloader, criterion, optimizer):
    total_acc_train = 0
    total_loss_train = 0
    for train_label, train_input in tqdm(train_dataloader):
        train_label = train_label.to(device)
        mask = train_input["attention_mask"].to(device)
        input_id = train_input["input_ids"].squeeze(1).to(device)

        output = model(input_id, mask)

        batch_loss = criterion(output, train_label.long())
        total_loss_train += batch_loss.item()

        acc = (output.argmax(dim=1) == train_label).sum().item()
        total_acc_train += acc

        model.zero_grad()
        batch_loss.backward()
        optimizer.step()

    return total_acc_train, total_loss_train


def epoch_val(model, val_dataloader, criterion):
    total_acc_val = 0
    total_loss_val = 0

    with torch.no_grad():
        for val_label, val_input in tqdm(val_dataloader):
            val_label = val_label.to(device)
            mask = val_input["attention_mask"].to(device)
            input_id = val_input["input_ids"].squeeze(1).to(device)

            output = model(input_id, mask)

            batch_loss = criterion(output, val_label.long())
            total_loss_val += batch_loss.item()

            acc = (output.argmax(dim=1) == val_label).sum().item()
            total_acc_val += acc

    return total_acc_val, total_loss_val

In [19]:
def train(
    model, train_dataloader, val_dataloader, criterion, optimizer, epochs, model_path
):
    best_acc = 0
    for epoch_num in range(epochs):
        total_acc_train = 0
        total_loss_train = 0

        (
            total_acc_train,
            total_loss_train,
        ) = epoch_train(model, train_dataloader, criterion, optimizer)
        total_acc_val, total_loss_val = epoch_val(model, val_dataloader, criterion)

        train_loss = total_loss_train / len(train_dataloader.dataset)
        train_acc = total_acc_train / len(train_dataloader.dataset)
        val_loss = total_loss_val / len(val_dataloader.dataset)
        val_acc = total_acc_val / len(val_dataloader.dataset)
        print(
            f"Epochs: {epoch_num + 1} | Train Loss: {train_loss: .3f} | Train Accuracy: {train_acc: .3f} \
                | Val Loss: {val_loss: .3f} | Val Accuracy: {val_acc: .3f}"
        )

        if val_acc > best_acc:
            torch.save(
                model,
                model_path
                + f"{model.__class__.__name__ }_{dataset_type}_{model_type}_sampler.pt",
            )
            best_acc = val_acc

In [23]:
with_candidate = True
model_path = f"/workspace/storage/subgraph_classify_models/{dataset_type}/candidates_{with_candidate}/"
Path(model_path).mkdir(parents=True, exist_ok=True)

In [24]:
len(test_loader.dataset)

49850

In [25]:
train(model, train_loader, val_loader, criterion, optimizer, 5, model_path)

100%|██████████| 13390/13390 [1:34:28<00:00,  2.36it/s]
100%|██████████| 3060/3060 [07:08<00:00,  7.14it/s]


Epochs: 1 | Train Loss:  0.043 | Train Accuracy:  0.851                 | Val Loss:  0.042 | Val Accuracy:  0.864


100%|██████████| 13390/13390 [1:34:49<00:00,  2.35it/s]
100%|██████████| 3060/3060 [07:08<00:00,  7.13it/s]


Epochs: 2 | Train Loss:  0.033 | Train Accuracy:  0.894                 | Val Loss:  0.047 | Val Accuracy:  0.855


  4%|▍         | 512/13390 [03:37<1:31:14,  2.35it/s]

### Testing

In [17]:
from sklearn.metrics import balanced_accuracy_score

In [40]:
model = torch.load(
    "/workspace/storage/subgraph_classify_models/sqwd/candidates_True/BertClassifier_sqwd_roberta-base_sampler.pt"
)

In [46]:
def test_model(model, test_dataloader):
    total_acc_val = 0
    y_true = []
    y_pred = []

    with torch.no_grad():
        for val_label, val_input in tqdm(test_dataloader):
            val_label = val_label.to(device)
            y_true.extend(val_label.tolist())
            mask = val_input["attention_mask"].to(device)
            input_id = val_input["input_ids"].squeeze(1).to(device)

            output = model(input_id, mask)
            y_pred.extend(output.argmax(dim=1).tolist())

            acc = (output.argmax(dim=1) == val_label).sum().item()
            total_acc_val += acc

    final_acc = total_acc_val / len(test_dataloader.dataset)
    balanced_acc = balanced_accuracy_score(y_true, y_pred)
    return final_acc, balanced_acc, y_true

In [47]:
final_acc, balanced_acc, y_true = test_model(model, test_loader)
print(f"Final Accuracy: {final_acc}, balanced accuracy: {balanced_acc}")

100%|██████████| 3116/3116 [09:53<00:00,  5.25it/s]

Final Accuracy: 0.79518555667001, balanced accuracy: 0.8637087858949936





Accuracy if we predict 0 for all questions

In [22]:
tmp = [0] * len(y_true)
counter = 0
for i, j in zip(tmp, y_true):
    if i == j:
        counter += 1
print(counter / len(y_true))
# print(acc/len(acc))

0.8960280842527583
