In [1]:
!pip install -U bitsandbytes

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl (211.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading nvidia_curand_cu12-10.3.5.147-py3-none-manylinux2014_x86_64.whl (56.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m8.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading nvidia_cusolver_cu12-11.6.1.9-py3-none-manylinux2014_x86_64.whl (127.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m127.9/127.9 MB[0m [31m6.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading nvidia_cusparse_cu12-12.3.1.170-py3-none-manylinux2014_x86_64.whl (207.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m207.5/207.5 MB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading nvidia_nvjitli

In [2]:
import re
import json
import torch
from peft import LoraConfig, PeftModel
from transformers import AutoTokenizer, AutoModelForCausalLM

# Paths and configurations
MODEL_ID = "SciPhi/Triplex"

tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
base_model = AutoModelForCausalLM.from_pretrained(MODEL_ID, torch_dtype=torch.float16, device_map="auto")
model = PeftModel(base_model, LoraConfig(r=64, lora_alpha=16, lora_dropout=0.1, bias="none", task_type="CAUSAL_LM", target_modules=["q_proj", "k_proj", "v_proj", "o_proj"]))

# Updated entity and relationship types for Russia-Ukraine war context
ENTITY_TYPE = ["Person", "Location", "Event", "Organization", "Military Unit", "Weapon", "Country"]
RELATIONSHIP_TYPE = ["Attacked", "Defended", "Supported", "Opposed", "Occurred On", "Involved In", "Negotiated With"]
ENTITY_TYPE = [x.upper() for x in ENTITY_TYPE]
RELATIONSHIP_TYPE = [x.upper() for x in RELATIONSHIP_TYPE]

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/3.42k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/563 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/3.48k [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/16.3k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/2.67G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/177 [00:00<?, ?B/s]

In [3]:
# Input format template
input_format = """Perform Named Entity Recognition (NER) and extract knowledge graph triplets.

**Entity Types:** {entity_types}
**Predicates:** {predicates}

**Question:** {question}
**Answer:** {answer}
"""

# Function to process Q&A pairs using Triplex
def process_qa_pair(question, answer):
    message = input_format.format(
        entity_types=json.dumps({"entity_types": ENTITY_TYPE}),
        predicates=json.dumps({"predicates": RELATIONSHIP_TYPE}),
        question=question,
        answer=answer,
    )
    messages = [{"role": "user", "content": message}]
    tokenizer.pad_token = tokenizer.eos_token
    input_ids = tokenizer.apply_chat_template(messages, add_generation_prompt=True, return_tensors="pt", padding=True).to("cuda")

    outputs = model.generate(input_ids=input_ids, max_new_tokens=512, do_sample=False, temperature=0.0)
    response = tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]

    json_match = re.search(r"```json\n(.*?)```", response, re.DOTALL)
    if json_match:
        json_text = json_match.group(1).strip()
        try:
            return json.loads(json_match.group(1).strip())
        except json.JSONDecodeError as e:
            print(f"JSON Decode Error: {e}")
            print(f"Extracted JSON Text: {json_text}")
            return None
    else:
        print(f"Failed to extract JSON block. Raw Response: {response}")
        return None
    return None

# Function to parse the Q&A text file
def parse_qa_file(input_file):
    with open(input_file, "r") as f:
        lines = f.readlines()

    data = []
    current_timestamp, question, answer = None, None, None
    i, line_count = 0, len(lines)
    for line in lines:
        i += 1
        # print(f"Processing {i}/{line_count} lines...")
        line = line.strip()
        if line.startswith("Timestamp:"):
            current_timestamp = line.replace("Timestamp:", "").strip()
        elif line.startswith("Q:"):
            question = line[3:].strip()
        elif line.startswith("A:"):
            answer = line[3:].strip()
            if current_timestamp and question and answer:
                data.append({"timestamp": current_timestamp, "question": question, "answer": answer})
                question, answer = None, None  # Reset for next pair
    return data

# Main function to generate the knowledge graph
def generate_knowledge_graph(input_file, output_file):
    qa_data = parse_qa_file(input_file)
    knowledge_graph = []
    q_number = 1
    for entry in qa_data:
        timestamp, question, answer = entry["timestamp"], entry["question"], entry["answer"]
        response = process_qa_pair(question, answer)
        if response:
            knowledge_graph.append({"timestamp": timestamp, "triplets": response.get("entities_and_triples", [])})
            print(f"Processed Q{q_number}: {question} | A: {answer}")
            print(f"Response: {response}")
        q_number += 1

    with open(output_file, "w") as f:
        json.dump(knowledge_graph, f, indent=4)
    print(f"Knowledge graph saved to {output_file}")

In [4]:
# Paths and configurations
INPUT_FILE = "data/formatted_questions_answers.txt"
OUTPUT_FILE = "data/temporal_knowledge_graph.json"

# Run the process
generate_knowledge_graph(INPUT_FILE, OUTPUT_FILE)

The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


Processed Q1: Who accused Ukraine of terrorist activity? | A: Russian president Vladimir Putin.
Response: {'entities_and_triples': ['[1], COUNTRY:Ukraine', '[2], PERSON:Vladimir Putin', '[3], COUNTRY:Russia', '[2] NEGOTIATED WITH [1]', '[2] OPPOSED [1]', '[2] OPPOSED [3]']}
Processed Q2: What specific event did Putin reference as terrorist activity? | A: The drone attack on Moscow.
Response: {'entities_and_triples': ['[1], PERSON:Putin', '[2], EVENT:drone attack', '[3], LOCATION:Moscow', '[2] OCCURRED [3]', '[1] REFERENCE [2]', '[2] ATTACKED [3]']}
Failed to extract JSON block. Raw Response: Perform Named Entity Recognition (NER) and extract knowledge graph triplets.

**Entity Types:** {"entity_types": ["PERSON", "LOCATION", "EVENT", "ORGANIZATION", "MILITARY UNIT", "WEAPON", "COUNTRY"]}
**Predicates:** {"predicates": ["ATTACKED", "DEFENDED", "SUPPORTED", "OPPOSED", "OCCURRED ON", "INVOLVED IN", "NEGOTIATED WITH"]}

**Question:** When did the drone attack on Moscow occur?
**Answer:** E