

**Reminder 🐣**


If the notebook imports the .py files,
then the .py files must be in the same project folder or environment.

Meaning:

On Colab → you upload the folder to /content/…

On GitHub → the .py files sit in /src/

In local development → the .py files sit in the repository next to the notebook




In [None]:
# Create main project folder together with its src/ subfolder
# (-p creates missing parent directories and does not fail if they already exist)
!mkdir -p /content/TKG_demo_project/src/

# Recursively list the project folder to confirm the layout
!ls -R /content/TKG_demo_project



In [None]:
import os
import shutil

from google.colab import files

# Folder that is later put on sys.path so the uploaded modules can be imported.
SRC_DIR = "/content/TKG_demo_project/src/"

# Modules that later cells of this notebook import from SRC_DIR.
REQUIRED_MODULES = ["temporal_qa_system.py", "wikipedia_retriever.py"]

# Creating the folder here as well keeps this cell runnable on its own.
os.makedirs(SRC_DIR, exist_ok=True)

# Two upload dialogs, in the same order as before: augmentation.py first,
# then temporal_qa_system.py. Every file selected in a dialog is moved (not
# only the first one), so further modules such as wikipedia_retriever.py can
# be selected in either dialog.
for expected_name in ("augmentation.py", "temporal_qa_system.py"):
    print(f"Select {expected_name} (additional .py files may be selected as well)")
    uploaded = files.upload()
    for filename in uploaded:
        # Moving to an explicit file path (instead of into the directory)
        # replaces a copy left by an earlier run; shutil.move raises an error
        # when the destination is a directory that already holds that name.
        shutil.move(filename, os.path.join(SRC_DIR, filename))

# Surface missing modules now instead of as an ImportError in a later cell.
missing_modules = [
    module_name
    for module_name in REQUIRED_MODULES
    if not os.path.exists(os.path.join(SRC_DIR, module_name))
]
if missing_modules:
    print("Warning: not found in", SRC_DIR, "->", missing_modules)




In [None]:
import glob
# Sanity check: list the .py files currently present in the project's src/ folder
# (the resulting list is shown as the cell output)
glob.glob("/content/TKG_demo_project/src/*.py")


In [None]:
import sys
sys.path.append('/content/TKG_demo_project/src/')

!pip install wikipedia-api sentence-transformers transformers
from temporal_qa_system import create_system_from_notebook
from wikipedia_retriever import WikipediaRetriever


In [None]:
# Download and unzip the mmkb repo
!wget https://github.com/mniepert/mmkb/archive/refs/heads/master.zip -O mmkb.zip
!unzip -q mmkb.zip

# Inspect the ICEWS14 folder
!ls mmkb-master/TemporalKGs/icews14



In [None]:
# Copy the ICEWS14 files out of the extracted repo into a dedicated folder
!mkdir -p /content/ICEWS14
!cp mmkb-master/TemporalKGs/icews14/* /content/ICEWS14/
# List the copied files to confirm the copy worked
!ls /content/ICEWS14


In [None]:
import pandas as pd

# Locations of the three ICEWS14 split files obtained from mmkb
train_path = "/content/ICEWS14/icews_2014_train.txt"
valid_path = "/content/ICEWS14/icews_2014_valid.txt"
test_path  = "/content/ICEWS14/icews_2014_test.txt"

# Read every split the same way: tab-separated text without a header row
train_df, valid_df, test_df = [
    pd.read_csv(split_path, sep="\t", header=None)
    for split_path in (train_path, valid_path, test_path)
]

# Give the four columns the same names on every split
for split_df in (train_df, valid_df, test_df):
    split_df.columns = ["head", "relation", "tail", "timestamp"]

print("Train sample:")
print(train_df.head())


In [None]:
# 1. Build ID dictionaries over the union of all three splits
quad_columns = ["head", "relation", "tail", "timestamp"]

df_all = pd.concat([train_df, valid_df, test_df], ignore_index=True)
# Every field is handled as a string so the lookups below use string keys
df_all = df_all.astype({column: str for column in quad_columns})

# Sorted vocabularies make the assigned IDs independent of row order
entities = sorted(set(df_all["head"]) | set(df_all["tail"]))
relations = sorted(df_all["relation"].unique())
times = sorted(df_all["timestamp"].unique())

entity2id = dict(zip(entities, range(len(entities))))
relation2id = dict(zip(relations, range(len(relations))))
time2id = dict(zip(times, range(len(times))))

print("Num entities:", len(entity2id))
print("Num relations:", len(relation2id))
print("Num timestamps:", len(time2id))


# 2. Build triple lists from official ICEWS14 splits
def _to_str_quads(split_df):
    """Return the rows of one split as (head, relation, tail, timestamp) string tuples."""
    string_columns = [split_df[column].astype(str) for column in quad_columns]
    return list(zip(*string_columns))


train_triples_raw = _to_str_quads(train_df)
valid_triples_raw = _to_str_quads(valid_df)
test_triples_raw = _to_str_quads(test_df)

print("Train triples:", len(train_triples_raw))
print("Valid triples:", len(valid_triples_raw))
print("Test triples:", len(test_triples_raw))
print("Example train triple:", train_triples_raw[0])


**PyTorch TransE**

In [None]:
import torch
import torch.nn as nn
from sentence_transformers import SentenceTransformer


In [None]:
class TransE(nn.Module):
    """TransE knowledge-graph embedding model.

    Entities and relations share one ``dim``-dimensional space. A triple
    ``(h, r, t)`` is scored by the negative distance ``-||h + r - t||``, so a
    higher score (closer to zero) marks a better triple.

    Args:
        num_entities: Number of rows in the entity embedding table.
        num_relations: Number of rows in the relation embedding table.
        dim: Embedding dimensionality.
        p_norm: Order of the norm used as the distance. The default of 2
            (Euclidean) reproduces the previously hard-coded behaviour;
            1 gives the Manhattan distance.
    """

    def __init__(self, num_entities, num_relations, dim=100, p_norm=2):
        super().__init__()
        self.p_norm = p_norm
        self.ent_embeddings = nn.Embedding(num_entities, dim)
        self.rel_embeddings = nn.Embedding(num_relations, dim)
        # Replace nn.Embedding's default initialisation with Xavier-uniform
        nn.init.xavier_uniform_(self.ent_embeddings.weight)
        nn.init.xavier_uniform_(self.rel_embeddings.weight)

    def forward(self, h, r, t):
        """Score a batch of triples.

        Args:
            h: Index tensor of head entity IDs.
            r: Index tensor of relation IDs.
            t: Index tensor of tail entity IDs.

        Returns:
            One score per triple, reduced over dimension 1 of the embeddings;
            scores are <= 0 and higher means better.
        """
        h_e = self.ent_embeddings(h)
        r_e = self.rel_embeddings(r)
        t_e = self.ent_embeddings(t)
        # higher score = better triple
        return -torch.norm(h_e + r_e - t_e, p=self.p_norm, dim=1)

def train_transe(model, triples, epochs=3, batch_size=1024, lr=0.001, device="cuda",
                 margin=1.0, entity_to_id=None, relation_to_id=None):
    """Train a triple-scoring model with a margin ranking loss.

    For every batch the tails are shuffled within the batch to form negative
    triples, and the loss ``relu(margin + neg_score - pos_score)`` pushes the
    true triple to score at least ``margin`` higher than the corrupted one.

    Args:
        model: Module called as ``model(h, r, t)`` on index tensors, returning
            one score per triple (higher = better).
        triples: Sequence whose items hold head, relation and tail names at
            positions 0, 1 and 2; any further fields are ignored.
        epochs: Number of passes over ``triples``.
        batch_size: Mini-batch size.
        lr: Adam learning rate.
        device: Device the model and batches are moved to.
        margin: Ranking margin; the default 1.0 is the previously hard-coded value.
        entity_to_id: Mapping from entity name to ID. Defaults to the
            notebook-level ``entity2id``.
        relation_to_id: Mapping from relation name to ID. Defaults to the
            notebook-level ``relation2id``.

    Returns:
        The same ``model`` object, trained in place.

    Raises:
        ValueError: If ``triples`` is empty.
        KeyError: If a triple uses a name missing from the mappings.
    """
    triples = list(triples)
    if not triples:
        # Fail with a clear message rather than deep inside the data loader
        # or at the loss print after a loop that never ran.
        raise ValueError("train_transe: `triples` is empty, nothing to train on")

    # Fall back to the notebook globals so existing calls keep working.
    if entity_to_id is None:
        entity_to_id = entity2id
    if relation_to_id is None:
        relation_to_id = relation2id

    model.to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)

    h = torch.tensor([entity_to_id[a[0]] for a in triples], dtype=torch.long)
    r = torch.tensor([relation_to_id[a[1]] for a in triples], dtype=torch.long)
    t = torch.tensor([entity_to_id[a[2]] for a in triples], dtype=torch.long)

    dataset = torch.utils.data.TensorDataset(h, r, t)
    loader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, shuffle=True)

    for epoch in range(epochs):
        loss_sum = 0.0
        num_batches = 0
        for h_b, r_b, t_b in loader:
            h_b = h_b.to(device)
            r_b = r_b.to(device)
            t_b = t_b.to(device)

            optimizer.zero_grad()
            pos_score = model(h_b, r_b, t_b)

            # simple negative sampling: corrupt tail by permuting the tails
            # inside the batch (index built on the same device as the batch)
            t_neg = t_b[torch.randperm(len(t_b), device=t_b.device)]
            neg_score = model(h_b, r_b, t_neg)

            loss = torch.relu(margin + neg_score - pos_score).mean()
            loss.backward()
            optimizer.step()

            # Accumulate as a tensor; converted to a float once per epoch.
            loss_sum = loss_sum + loss.detach()
            num_batches += 1
        # Mean over the epoch's batches instead of only the last batch's loss.
        print(f"Epoch {epoch+1}, loss = {float(loss_sum) / num_batches:.4f}")

    return model

# Seed PyTorch's random number generators before the stochastic steps below
# (embedding initialisation, batch shuffling, negative sampling) so that
# repeated runs start from the same random state.
SEED = 42
torch.manual_seed(SEED)

# Train on the GPU when one is available, otherwise on the CPU
device = "cuda" if torch.cuda.is_available() else "cpu"

# One embedding row per entity / relation ID built from the ICEWS14 splits
transe_model = TransE(
    num_entities=len(entity2id),
    num_relations=len(relation2id),
    dim=100
)

# Fit on the official training split only
transe_model = train_transe(
    transe_model,
    train_triples_raw,
    epochs=10,
    batch_size=1024,
    lr=0.001,
    device=device
)



**Load BGE-large**

In [None]:
# Load the BAAI/bge-large-en-v1.5 sentence-embedding model and move it to the
# same device that was selected for TransE training
semantic_model = SentenceTransformer("BAAI/bge-large-en-v1.5").to(device)


**Import Demo system + wiki retriever**

In [None]:
# Choose 2–3 demo ICEWS triple indices
demo_indices = [0, 2, 6]

import sys

# Make sure the project modules are importable when this cell is run on its
# own; the membership check avoids adding a duplicate sys.path entry.
if '/content/TKG_demo_project/src/' not in sys.path:
    sys.path.append('/content/TKG_demo_project/src/')

# Wire the trained TransE model, the sentence encoder, the ID dictionaries and
# the raw training triples into the demo QA system.
system = create_system_from_notebook(
    model=transe_model,
    semantic_model=semantic_model,
    entity2id=entity2id,
    relation2id=relation2id,
    time2id=time2id,
    train_triples_raw=train_triples_raw,
    demo_indices=demo_indices,  # the indices chosen at the top of this cell
)
print("System created")

In [None]:
from wikipedia_retriever import WikipediaRetriever

# Build an English-language retriever first, then hand it to the QA system
wiki_retriever = WikipediaRetriever(
    language="en",
    user_agent="TKG-Demo/1.0 (contact: example@example.com)",
)
system.wikipedia_retriever = wiki_retriever
print("Retriever attached")

**attach LLM and run demo**

In [None]:
from transformers import T5ForConditionalGeneration, T5Tokenizer

# Load the FLAN-T5 base tokenizer and model; the model is moved to `device`
llm_tokenizer = T5Tokenizer.from_pretrained("google/flan-t5-base")
llm_model = T5ForConditionalGeneration.from_pretrained("google/flan-t5-base")
llm_model = llm_model.to(device)


def call_flan_t5(prompt, max_length=256):
    """Generate text for ``prompt`` with the FLAN-T5 model loaded in this cell.

    Args:
        prompt: Input text; tokenized with truncation enabled.
        max_length: Passed to ``generate`` as ``max_length``.

    Returns:
        The first generated sequence, decoded without special tokens.
    """
    encoded = llm_tokenizer(prompt, return_tensors="pt", truncation=True).to(device)
    generated = llm_model.generate(**encoded, max_length=max_length)
    first_sequence = generated[0]
    return llm_tokenizer.decode(first_sequence, skip_special_tokens=True)


# Route the QA system's LLM calls through the FLAN-T5 wrapper above
system._call_llm = call_flan_t5
print("LLM is ready")

 **Build demo QA from your ICEWS triples**

In [None]:
# Run demo
# `demo_qa` is expected to be a list of {"question": ..., "answer": ...} dicts,
# but no cell in the visible notebook creates it, so a fresh "Restart & Run All"
# would stop here with a NameError. If it is missing, fall back to simple
# questions derived from the demo triples; an existing demo_qa is left untouched.
# NOTE(review): the fallback question wording is a guess at the intended
# format; replace it with the real QA construction if one exists elsewhere.
try:
    demo_qa
except NameError:
    demo_qa = [
        {
            "question": f"Who was the target of '{rel}' by {head} on {ts}?",
            "answer": tail,
        }
        for head, rel, tail, ts in (train_triples_raw[i] for i in demo_indices)
    ]
    print("demo_qa was not defined; built", len(demo_qa), "questions from demo_indices")

for qa in demo_qa:
    q = qa["question"]
    gold = qa["answer"]

    print("\n" + "="*60)
    print("Question:", q)
    print("Gold answer:", gold)

    # k and alpha are forwarded unchanged to the QA system
    result = system.answer_question(q, k=2, alpha=0.5)

    print("\nLLM answer:", result["answer"])
    print("\nTop ICEWS facts:")
    for f in result["top_facts"]:
        print("  -", f)

    print("\nWikipedia passages:")
    for w in result["wiki_passages"]:
        print("  -", w)

    print("\nEntities:", result["entities_used"], "Year:", result["year_used"])

# 🐣 Think about how to make implicit questions explicit

example:

implicit: “After the Danish Ministry, who was the first to visit Iraq?”

explicit: “After 2016-01-05, who was the first to visit Iraq?”


