## GROQ

In [3]:
# !pip install nltk

Defaulting to user installation because normal site-packages is not writeable


In [1]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /home/sammartj/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

There are two files given to you, one a .txt file labelled as a 'PMC' followed by numbers, this is an ID for an article, 
and a .json file with an entity object that matches with the PMC id. Extract relational information from the .txt file 
that relates to ONLY the biomarkers listed for THAT article in the .json file. Not every entity in the .json file is necessarily a
biomarker, so you need to decide which ones are biomarkers before proceeding.

In [2]:
system_message = """
There is a file given to you labelled as a 'PMC' followed by numbers, this is an ID for an article. Extract any named biomarkers and
their relational information from the .txt file as it pertains to Alzheimer's Disease. If no information containing biomarkers is described, 
you may return an emtpy .json.

Use ONLY the following relationship types, and you may use more than one for each biomarker (but using only one is acceptable too):
- DIAGNOSITC_OF
- INDICATES_PRESENCE_OF
- USED_TO_DIAGNOSE
- RELATED_PROGNOSIS
- ASSOCIATED_WITH_POOR_OUTCOME
- ASSOCIATED_WITH_BETTER_OUTCOME
- PREDICTS_SURVIVAL
- PREDICTS_MORTALITY
- RELATED_TREATMENT
- GUIDES_TREATMENT_SELECTION
- INDICATES_TREATMENT_ELIGIBILITY
- IS_TARGET_OF_TREATMENT
- IS_BASIS_FOR_THERAPY_CHOICE
- RESPONDED_TO
- MONITORED_DURING_TREATMENT
- LEVELS_REFLECT_RESPONSE
- BIOMARKER_DYNAMICS_UNDER_TREATMENT
Return a list of JSON objects. For example:
[
    {
        "article": "PMC00000034.txt",
        "subject": "APOE4",
        "relationship": "USED_TO_DIAGNOSE",
        "object": "Diagnose: Early-onset Alzheimer's Disease"
    }
]

- ONLY return sets of four and nothing else. None of 'article', 'subject', 'relationship' and 'object' can be empty, unless no biomarkers
aren mentioned. Return the response in a JSON format. 

 """

In [None]:
import os
import json
import requests
from tqdm import tqdm
import re
from nltk.tokenize import sent_tokenize
import time
from random import uniform

# Configs
api_key = "" 
model = "gemma2-9b-it" # "deepseek-r1-distill-llama-70b"
url = "https://api.groq.com/openai/v1/chat/completions"
headers = {
    "Content-Type": "application/json",
    "Authorization": f"Bearer {api_key}"
}


txt_dir = "../data/target"
ner_json_path = "ner_results.json"
output_file = "rel_ner_results.jsonl"
max_chunk_tokens = 1000 

with open(ner_json_path, 'r', encoding='utf-8') as f:
    ner_data = json.load(f)
processed_articles = set()
if os.path.exists("processed.txt"):
    with open("processed.txt", "r", encoding="utf-8") as f:
        processed_articles = set(line.strip() for line in f)

# Prepare output
for txt_file in tqdm(os.listdir(txt_dir), desc= "Loading .txt articles"):
    # if not txt_file.endswith(".txt") or txt_file not in ner_data:
    if not txt_file.endswith(".txt") or txt_file in processed_articles:
        continue
    with open(output_file, 'a', encoding='utf-8') as outfile:

        txt_path = os.path.join(txt_dir, txt_file)
        with open(txt_path, 'r', encoding='utf-8') as f:
            text = f.read()

        # Sentence tokenize
        sentences = sent_tokenize(text)

        # Chunk sentences into groups
        chunks = []
        current_chunk = []
        current_len = 0

        for sent in sentences:
            token_count = len(sent.split())  
            if current_len + token_count > max_chunk_tokens:
                if current_chunk:
                    chunks.append(" ".join(current_chunk))
                    current_chunk = []
                    current_len = 0
            current_chunk.append(sent)
            current_len += token_count
        if current_chunk:
            chunks.append(" ".join(current_chunk))

        # Get known biomarkers
        biomarkers = [entry["word"] for entry in ner_data[txt_file]]

        # Send each chunk
        for i, chunk in enumerate(chunks):
            user_prompt = f""" This is a section from {txt_file} (chunk {i + 1} of {len(chunks)}): {chunk} 
            The following are the known biomarkers from the corresponding JSON: {json.dumps(biomarkers)}
            Extract relationships as per instructions. """


            payload = {
                "model": model,
                "messages": [
                    {"role": "system", "content": system_message},
                    {"role": "user", "content": user_prompt}
                ]
            }

            try:
                MAX_RETRIES = 5
                BASE_DELAY = 60  # seconds

                # response = requests.post(url, headers=headers, data=json.dumps(payload))
                # response.raise_for_status()
                def send_with_retries(payload, retries=MAX_RETRIES):
                    for attempt in range(retries):
                        try:
                            response = requests.post(url, headers=headers, data=json.dumps(payload))
                            if response.status_code == 429:
                                raise requests.exceptions.HTTPError("Rate limit hit (429)", response=response)
                            response.raise_for_status()
                            return response.json()
                        except requests.exceptions.HTTPError as e:
                            if response.status_code == 429:
                                wait_time = BASE_DELAY * (2 ** attempt) + uniform(0, 1)  # exponential backoff with jitter
                                print(f"[429] Rate limit hit. Waiting {wait_time:.1f} seconds before retrying...")
                                time.sleep(wait_time)
                            else:
                                print(f"HTTP error on attempt {attempt + 1}: {e}")
                                break
                        except Exception as e:
                            print(f"Error on attempt {attempt + 1}: {e}")
                            break
                    return None
                # result = response.json()
                result = send_with_retries(payload)
                if not result:
                    print(f"Skipping {txt_file}, chunk {i + 1} due to repeated errors.")
                    continue
                output_text = result['choices'][0]['message']['content']


                output_text = result['choices'][0]['message']['content']

                # Save notes separately
                with open("processed.txt", "a", encoding="utf-8") as notes_file:
                    output_text = result['choices'][0]['message']['content']

                    # Extract reasoning from <think>...</think>
                    think_match = re.search(r"<think>(.*?)</think>", output_text, re.DOTALL)
                    if think_match:
                        notes_file.write(f"### {txt_file} | Chunk {i + 1} ###\n")
                        notes_file.write(think_match.group(1).strip() + "\n\n")

                    # Everything after </think> (or full content if no <think>)
                    post_think_match = re.split(r"</think>", output_text, maxsplit=1)
                    post_think_text = post_think_match[1].strip() if len(post_think_match) > 1 else output_text.strip()

                    # Try parsing the post-<think> content as JSON
                    try:
                        parsed_output = json.loads(post_think_text)
                    except json.JSONDecodeError:
                        # Use regex to find JSON array if wrapped in text
                        match = re.search(r"\[\s*{.*?}\s*\]", post_think_text, re.DOTALL)
                        if match:
                            try:
                                parsed_output = json.loads(match.group(0))
                            except json.JSONDecodeError:
                                print(f"Secondary parse failed for {txt_file}, chunk {i + 1}.")
                                continue
                        else:
                            print(f"No valid JSON found in {txt_file}, chunk {i + 1}. Output:\n{post_think_text}")
                            continue

                    # Optional: skip empty results
                    if not parsed_output:
                        continue

                    # Save valid entries to .jsonl
                    for relation in parsed_output:
                        if all(k in relation and relation[k] for k in ['article', 'subject', 'relationship', 'object']):
                            relation["chunk"] = i + 1
                            relation["article"] = txt_file
                            outfile.write(json.dumps(relation) + '\n')

            except Exception as e:
                print(f"Error on {txt_file}, chunk {i + 1}: {e}")
                
    with open("processed.txt", "a", encoding="utf-8") as f:
        f.write(txt_file + '\n')

Loading .txt articles:   0%|          | 0/30 [00:00<?, ?it/s]

[429] Rate limit hit. Waiting 61.0 seconds before retrying...
[429] Rate limit hit. Waiting 60.1 seconds before retrying...
[429] Rate limit hit. Waiting 120.7 seconds before retrying...
No valid JSON found in PMC8615710.txt, chunk 4. Output:
[ ]

```json
 []  
```
[429] Rate limit hit. Waiting 60.7 seconds before retrying...
[429] Rate limit hit. Waiting 60.7 seconds before retrying...
[429] Rate limit hit. Waiting 120.6 seconds before retrying...
[429] Rate limit hit. Waiting 240.4 seconds before retrying...
[429] Rate limit hit. Waiting 60.9 seconds before retrying...
No valid JSON found in PMC8615710.txt, chunk 8. Output:
```json
[ ]
```
[429] Rate limit hit. Waiting 60.7 seconds before retrying...


Loading .txt articles:   3%|▎         | 1/30 [14:10<6:50:59, 850.32s/it]

No valid JSON found in PMC8615710.txt, chunk 10. Output:
```json
[]
```
[429] Rate limit hit. Waiting 60.1 seconds before retrying...
[429] Rate limit hit. Waiting 121.0 seconds before retrying...
[429] Rate limit hit. Waiting 240.7 seconds before retrying...
[429] Rate limit hit. Waiting 60.1 seconds before retrying...
[429] Rate limit hit. Waiting 120.9 seconds before retrying...
[429] Rate limit hit. Waiting 240.1 seconds before retrying...
[429] Rate limit hit. Waiting 480.1 seconds before retrying...
[429] Rate limit hit. Waiting 60.4 seconds before retrying...
[429] Rate limit hit. Waiting 120.8 seconds before retrying...
[429] Rate limit hit. Waiting 240.4 seconds before retrying...
Secondary parse failed for PMC8638526.txt, chunk 6.
[429] Rate limit hit. Waiting 60.6 seconds before retrying...
[429] Rate limit hit. Waiting 120.5 seconds before retrying...


In [3]:
import json 
with open('ner_results.json', 'r') as file:
    data = json.load(file)

In [None]:
import csv

# Step 1: Extract unique nodes
nodes = set()
for triple_list in all_triples:
    for triplet in triple_list:
        # Add "subject" if it exists
        if "subject" in triplet:
            nodes.add((triplet["subject"], "Entity"))
        # Add "object" if it exists
        if "object" in triplet:
            nodes.add((triplet["object"], "Entity"))

# Step 2: Write nodes.

with open("nodes-r1.csv", "w", newline="") as f:
    writer = csv.writer(f)
    writer.writerow(["id", "label"])
    writer.writerows(nodes)

# Step 3: Extract relationships
edges = []
for triple_list in all_triples:
    for triplet in triple_list:
        if "subject" in triplet and "object" in triplet and "relationship" in triplet:
            edges.append((triplet["subject"], triplet["object"], triplet["relationship"]))

# Step 4: Write edges.csv
with open("edges-r1.csv", "w", newline="") as f:
    writer = csv.writer(f)
    writer.writerow(["source", "target", "relationship"])
    writer.writerows(edges)

print("Nodes and edges files generated successfully.")


In [None]:
# sudo cp nodes.csv /var/lib/neo4j/import/
# sudo cp edges.csv /var/lib/neo4j/import/
# cypher-shell -u neo4j -p <your_password>
# LOAD CSV WITH HEADERS FROM 'file:///nodes.csv' AS row
# CREATE (:Entity {id: row.id, label: row.label});
# LOAD CSV WITH HEADERS FROM 'file:///edges-r1.csv' AS row
# MATCH (a:Entity {id: row.source}), (b:Entity {id: row.target})
# CREATE (a)-[:RELATIONSHIP {type: row.relationship}]->(b);
# MATCH (n)
# OPTIONAL MATCH (n)-[r]-()
# DELETE n,r

LOAD CSV WITH HEADERS FROM 'file:///nodes-r1.csv' AS row
CREATE (:Entity {id: row.id, label: row.label});

LOAD CSV WITH HEADERS FROM 'file:///edges-r1.csv' AS row
WITH row
CALL {
    // Match source and target nodes
    MATCH (a:Entity {id: row.source})
    MATCH (b:Entity {id: row.target})

    // Create relationship with dynamic label
    CREATE (a)-[r:`${row.relationship}`]->(b)
}
RETURN count(*);
# ALL DATABASE
# MATCH (n)-[r]->(m) RETURN n, r, m;
# QUERY RELATIONSHIPS
# MATCH (n)-[r]->(m) RETURN r LIMIT 10;
# QUERY PATIENT ALL DIAGNOSES
# MATCH (patient:Entity {id: "Patient ID: 10000980"})-[:RELATIONSHIP{type: "has"}]->(diagnosis:Entity)
# RETURN diagnosis.id AS diagnosis_id;
# QUERY ALL THE MEDICATION
# MATCH (patient:Entity {id: "Patient ID: 10000980"})-[:RELATIONSHIP{type: "receives"}]->(medication:Entity)
# RETURN medication.id AS medication_id
# QUERY ALL THE MEDICATION IN THE DB
# MATCH (patient:Entity)-[:RELATIONSHIP {type: "receives"}]->(medication:Entity)
# RETURN medication.id AS medication_id;

In [None]:
import re

def normalize_relationship(relationship):
    # Remove spaces and special characters, replace with underscores
    cleaned = re.sub(r'[^a-zA-Z0-9]', '_', relationship)
    # Convert to uppercase
    return cleaned.upper()


from neo4j import GraphDatabase
import csv

def normalize_relationship(relationship):
    cleaned = re.sub(r'[^a-zA-Z0-9]', '_', relationship)
    return cleaned.upper()

URI = "bolt://localhost:7688"
AUTH = ("neo4j", "12345678")

with GraphDatabase.driver(URI, auth=AUTH) as driver:
    driver.verify_connectivity()

    def create_relationship(source_id, target_id, relationship_type):
        with driver.session() as session:
            query = """
                MATCH (a:Entity {id: $source_id})
                MATCH (b:Entity {id: $target_id})
                CREATE (a)-[r:%s]->(b)
            """ % relationship_type
            parameters = {
                "source_id": source_id,
                "target_id": target_id
            }
            session.run(query, parameters)

    # Read edges file and process each row
    with open("edges-r1.csv", mode='r') as csv_file:
        reader = csv.DictReader(csv_file)
        for row in reader:
            source_id = row['source']
            target_id = row['target']
            relationship_type = normalize_relationship(row['relationship'])
            create_relationship(source_id, target_id, relationship_type)

    print("All relationships created successfully.")