In [1]:
import os

# --- Authentication and project context for the Google SDK ---
# Every entry below is exported into this kernel's process environment:
#   GOOGLE_APPLICATION_CREDENTIALS      -> path of the credentials JSON file
#   GCLOUD_PROJECT, GOOGLE_CLOUD_PROJECT -> project ID variables
#   VERTEX_LOCATION                     -> Vertex AI region
_gcp_settings = {
    'GOOGLE_APPLICATION_CREDENTIALS': '/root/.config/gcloud/application_default_credentials.json',
    'GCLOUD_PROJECT': 'kg-app-473211',
    'GOOGLE_CLOUD_PROJECT': 'kg-app-473211',
    'VERTEX_LOCATION': 'us-central1',
}
os.environ.update(_gcp_settings)

# Echo the two values that matter most so the cell output confirms them.
for _name in ('GOOGLE_CLOUD_PROJECT', 'VERTEX_LOCATION'):
    print(f"{_name} set to: {os.getenv(_name)}")

GOOGLE_CLOUD_PROJECT set to: kg-app-473211
VERTEX_LOCATION set to: us-central1


In [24]:
from kg_gen import KGGen
# Local alternative kept for reference (Ollama reached through the Docker host):
# kg_gen = KGGen(model="ollama/tinyllama",
#                temperature=0.0,  # Deterministic output
#                api_base="http://host.docker.internal:11434",
#                api_key=None)

# Active client: Gemini 2.5 Flash addressed with the "vertex_ai/" model prefix.
# No API key is passed explicitly.
kg_gen = KGGen(model="vertex_ai/gemini-2.5-flash", temperature=0.0, api_key=None)

In [5]:
# EXAMPLE 1: one short text plus a context hint
text_input = " ".join([
    "Linda is Josh's mother.",
    "Ben is Josh's brother.",
    "Andrew is Josh's father.",
])
graph_1 = kg_gen.generate(input_data=text_input, context="Family relationships")

In [7]:
graph_1

Graph(entities={'Andrew', 'Linda', 'Josh', 'Ben'}, edges={'is brother of', 'is father of', 'is mother of'}, relations={('Linda', 'is mother of', 'Josh'), ('Ben', 'is brother of', 'Josh'), ('Andrew', 'is father of', 'Josh')}, entity_clusters=None, edge_clusters=None)

In [11]:
# Visualize graph_1, passing './test_graph_file.html' as the target path.
# NOTE(review): open_in_browser=True presumably tries to launch a browser, which
# may be a no-op inside a headless/container kernel — TODO confirm.
KGGen.visualize(graph_1, './test_graph_file.html', open_in_browser=True)

In [25]:
import sys
sys.path.append('../')

In [26]:
from src.processing.pdf_extractor import extract_text_from_pdf

In [27]:
 # Read the PDF as raw bytes, then hand the bytes to the project's text extractor.
 file_path = "./WatsonCrick1953.pdf"
 with open(file_path, 'rb') as pdf_file:
     file_content = pdf_file.read()

 text = extract_text_from_pdf(file_content)

In [28]:
len(text)

11352

In [29]:
 # from src.processing.kg_gen_extractor import extract_entities_and_relations_with_kg_gen

In [30]:
# Extraction prompt handed to kg_gen.generate() as `context`.
# This MUST be a raw string: the edge notation uses the LaTeX arrow "$\to$", and in
# a normal string literal the "\t" is read as a TAB escape, so the model would
# receive "$<TAB>o$" instead of the arrow in every relationship definition.
context = r"""Scientific research paper. Identify core **Concepts** (Ideas, Theories, Fields), **Entities** (Chemicals, Genes, Proteins, Equipment, Locations), **Data** (Results, Measurements, Parameters), and **Methods** (Algorithms, Protocols, Techniques).

Prioritize extraction of relationships that convey causality, methodology, support/refute, and experimental details.

**REQUIRED RELATIONSHIP TYPES (Edges):**
1. **DESCRIBES:** (Paper/Section) $\to$ (Concept/Theory/Model)
2. **USES_METHOD:** (Experiment/Study) $\to$ (Method/Protocol)
3. **TESTS/APPLIES:** (Method/Model) $\to$ (Concept/Data)
4. **MEASURES/YIELDS:** (Experiment/Method) $\to$ (Data/Result/Parameter)
5. **CAUSES/AFFECTS:** (Entity/Process) $\to$ (Entity/Process/Result)
6. **SUPPORTS/REFUTES:** (Data/Finding) $\to$ (Concept/Hypothesis)
7. **HAS_PROPERTY:** (Entity) $\to$ (Value/Measurement)
8. **LOCATED_IN:** (Entity/Process) $\to$ (Location/Organism)
"""

In [31]:
# Build the knowledge graph for the full paper text using the prompt in `context`.
graph_2 = kg_gen.generate(
    input_data=text,
    context=context,
    chunk_size=5000,   # split the text into 5000-character chunks
    cluster=True,      # merge similar entities and relations afterwards
)

In [32]:
# Visualize graph_2, passing './test_graph_file_WatsonCrick1953.html' as the target path.
# NOTE(review): open_in_browser=True presumably tries to launch a browser, which
# may be a no-op inside a headless/container kernel — TODO confirm.
KGGen.visualize(graph_2, './test_graph_file_WatsonCrick1953.html', open_in_browser=True)

In [33]:
type(graph_1)

kg_gen.models.Graph

In [44]:
help(graph_2.entities)

Help on set object:

class set(object)
 |  set() -> new empty set object
 |  set(iterable) -> new set object
 |  
 |  Build an unordered collection of unique elements.
 |  
 |  Methods defined here:
 |  
 |  __and__(self, value, /)
 |      Return self&value.
 |  
 |  __contains__(...)
 |      x.__contains__(y) <==> y in x.
 |  
 |  __eq__(self, value, /)
 |      Return self==value.
 |  
 |  __ge__(self, value, /)
 |      Return self>=value.
 |  
 |  __getattribute__(self, name, /)
 |      Return getattr(self, name).
 |  
 |  __gt__(self, value, /)
 |      Return self>value.
 |  
 |  __iand__(self, value, /)
 |      Return self&=value.
 |  
 |  __init__(self, /, *args, **kwargs)
 |      Initialize self.  See help(type(self)) for accurate signature.
 |  
 |  __ior__(self, value, /)
 |      Return self|=value.
 |  
 |  __isub__(self, value, /)
 |      Return self-=value.
 |  
 |  __iter__(self, /)
 |      Implement iter(self).
 |  
 |  __ixor__(self, value, /)
 |      Return self^=value.


In [52]:
import json
import os

# --- 1. Collect the graph contents as JSON-friendly lists ---
# The outputs above show graph entities as a set of name strings and relations
# as a set of (subject, predicate, object) tuples. Sets have no stable iteration
# order, so both are sorted here to make the exported file reproducible from one
# kernel run to the next.
try:
    graph_data = {
        # Entity names, alphabetically ordered
        "nodes": sorted(graph_2.entities),

        # Triples ordered by subject, then predicate, then object;
        # json.dump writes each tuple as a JSON array.
        "relations": sorted(graph_2.relations),
    }

except Exception as e:
    # Report the problem and re-raise: the save step below must never run
    # with a missing or half-built graph_data.
    print(f"Extraction failed again (using list conversion): {e}")
    raise

# --- 2. Save the structured data to a JSON file ---
output_filename = "graph_data_WatsonCrick1953.json"
output_path = os.path.join(os.getcwd(), output_filename)

# Explicit encoding so the result does not depend on the platform default.
with open(output_path, 'w', encoding='utf-8') as f:
    json.dump(graph_data, f, indent=4)

print(f"Graph data successfully saved as JSON to: {output_path} ðŸ’¾")

Graph data successfully saved as JSON to: /app/notebooks/graph_data_WatsonCrick1953.json ðŸ’¾


In [54]:
import json
import os

# --- 1. Define a helper function to ensure everything is converted to a serializable type ---
def serialize_graph_attribute(data):
    """Recursively convert a value into a structure that ``json.dump`` accepts.

    Conversion rules:
      * ``set`` / ``frozenset`` -> ``list`` of converted elements, sorted when the
        elements are mutually comparable so the output is deterministic
        (set iteration order is not stable between runs);
      * ``dict`` -> ``dict`` with the same keys and converted values;
      * ``list`` -> ``list`` of converted items;
      * ``tuple`` -> ``tuple`` of converted items (``json`` writes it as an array);
      * anything else (str, int, float, bool, None, ...) is returned unchanged.

    Args:
        data: Arbitrarily nested combination of the container types above.

    Returns:
        The converted structure; no set or frozenset remains at any depth.
    """
    if isinstance(data, (set, frozenset)):
        # Convert the elements too: a set may hold tuples or frozensets that
        # themselves contain sets.
        items = [serialize_graph_attribute(item) for item in data]
        try:
            return sorted(items)
        except TypeError:
            # Elements of mixed, non-comparable types: keep iteration order.
            return items
    if isinstance(data, dict):
        return {k: serialize_graph_attribute(v) for k, v in data.items()}
    if isinstance(data, list):
        return [serialize_graph_attribute(item) for item in data]
    if isinstance(data, tuple):
        return tuple(serialize_graph_attribute(item) for item in data)
    return data

# --- 2. Build the export payload ---
# Each Graph attribute goes through serialize_graph_attribute before it is
# stored under its JSON field name.
try:
    clustered_graph_data = dict(
        nodes=serialize_graph_attribute(graph_2.entities),
        relations=serialize_graph_attribute(graph_2.relations),
        entity_clusters=serialize_graph_attribute(graph_2.entity_clusters),
        edge_clusters=serialize_graph_attribute(graph_2.edge_clusters),
    )
except Exception as conversion_error:
    # Surface the reason, then propagate so nothing is written below.
    print(f"Failed during final extraction or conversion: {conversion_error}")
    raise

# --- 3. Write the payload next to the notebook (current working directory) ---
output_filename = "clustered_graph_data_WatsonCrick1953.json"
output_path = os.path.join(os.getcwd(), output_filename)

with open(output_path, 'w') as json_file:
    json.dump(clustered_graph_data, json_file, indent=4)

print(f"Clustered graph data successfully saved as JSON to: {output_path} ðŸ’¾")

Clustered graph data successfully saved as JSON to: /app/notebooks/clustered_graph_data_WatsonCrick1953.json ðŸ’¾


In [59]:
import json
import re
import os

# JSON file to convert into ArangoDB import documents.
file_path = "clustered_graph_data_WatsonCrick1953.json"

# NOTE: despite its name, this value is used below as the collection prefix of
# both `_from` and `_to`, i.e. it names the collection that holds the vertices.
edge_collection_name = "KG_Nodes"

with open(file_path, 'r') as graph_file:
    data = json.load(graph_file)

# --- KEY SANITIZATION FUNCTION (Most Robust) ---
def get_safe_key(node_name):
    """Derive a reproducible, ASCII-only document key from an entity name.

    Steps:
      1. Fold accented characters to their ASCII base letters (NFKD
         decomposition) and drop whatever has no ASCII equivalent.
      2. Lowercase and turn spaces into underscores.
      3. Remove every character outside ``a-z``, ``0-9`` and ``_``.
      4. Prefix keys that start with a digit with ``"c"``.
      5. If nothing is left, fall back to ``"fallback_<digest>"`` where the
         digest is taken from the original name.

    The fallback uses a SHA-1 digest rather than the built-in ``hash()``,
    because ``hash()`` of a string is salted per interpreter process and would
    yield a different key on every kernel restart.

    Args:
        node_name: Entity name to convert.

    Returns:
        The key as a ``str``, or ``None`` when ``node_name`` is not a string.
    """
    import hashlib
    import unicodedata

    if not isinstance(node_name, str):
        return None

    # 1. "é" -> "e" + combining accent; the ASCII round-trip then drops the accent
    #    together with any character that cannot be represented in ASCII.
    decomposed = unicodedata.normalize('NFKD', node_name)
    ascii_name = decomposed.encode('ascii', 'ignore').decode('ascii')

    # 2. Lowercase and replace spaces with underscores
    intermediate_key = ascii_name.lower().replace(' ', '_')

    # 3. Explicit ASCII class: r'\w' would also accept non-ASCII letters and digits.
    safe_key = re.sub(r'[^a-z0-9_]', '', intermediate_key)

    # 4. Avoid keys that begin with a digit
    if safe_key and safe_key[0].isdigit():
        safe_key = "c" + safe_key

    if safe_key:
        return safe_key

    # 5. Nothing usable survived: derive a stable key from the original text.
    digest = hashlib.sha1(node_name.encode('utf-8')).hexdigest()[:16]
    return f"fallback_{digest}"


# --- 1. PREPARE NODES (Vertices) ---
# Different entity names can sanitize to the same key (for example names that
# differ only in case or punctuation). Only one vertex document is emitted per
# key, because a collection cannot hold two documents with the same `_key`;
# every name is still recorded in node_key_map so its edges resolve to the
# shared vertex.
nodes_to_import = []
node_key_map = {}   # entity name -> vertex _key
key_owner = {}      # vertex _key -> first entity name that produced it

for node_name in data['nodes']:
    safe_key = get_safe_key(node_name)

    if not safe_key:
        # get_safe_key returns None for non-string names; such a node cannot
        # become a document.
        print(f"Skipping node without a usable key: {node_name!r}")
        continue

    node_key_map[node_name] = safe_key

    if safe_key in key_owner:
        if key_owner[safe_key] != node_name:
            print(
                f"Key collision: {node_name!r} shares key {safe_key!r} "
                f"with {key_owner[safe_key]!r}; reusing the existing vertex"
            )
        continue

    key_owner[safe_key] = node_name
    nodes_to_import.append({
        "_key": safe_key,
        "name": node_name,
        "type": "Concept"
    })

# --- 2. PREPARE EDGES (Relations) ---
# Endpoints are resolved through node_key_map, so an edge is only emitted when
# both of its entities were exported as vertices above (no dangling edges).
edges_to_import = []

for triple_list in data['relations']:

    if len(triple_list) != 3:
        print(f"Skipping malformed relation (not 3 parts): {triple_list}")
        continue

    subject, predicate, obj = triple_list

    # The isinstance guard keeps unhashable values (e.g. nested lists) away
    # from the dictionary lookup.
    subject_key = node_key_map.get(subject) if isinstance(subject, str) else None
    object_key = node_key_map.get(obj) if isinstance(obj, str) else None

    if not (subject_key and object_key):
        print(f"Skipping edge due to unmapped entity key in triple: {triple_list}")
        continue

    edges_to_import.append({
        # ArangoDB edge endpoint format: CollectionName/DocumentKey
        "_from": f"{edge_collection_name}/{subject_key}",
        "_to": f"{edge_collection_name}/{object_key}",
        "label": predicate
    })


# --- 3. Save Cleaned Data Files ---
nodes_output_path = "arango_nodes_FINAL.json"
edges_output_path = "arango_edges_FINAL.json"

with open(nodes_output_path, 'w') as f:
    json.dump(nodes_to_import, f, indent=2)

with open(edges_output_path, 'w') as f:
    json.dump(edges_to_import, f, indent=2)

print(f"Prepared {len(nodes_to_import)} nodes and {len(edges_to_import)} edges.")
print("Sanitization and key generation synchronized.")
print("The new files are ready for a clean import.")

Sanitization and key generation synchronized.
The new files are ready for a clean import.


In [2]:
# Extraction prompt later passed as `context` to kg_gen.generate().
# It lists the element categories to extract, the relationship (edge) types,
# extraction guidelines and a required JSON output shape.
# Note: this rebinds the name `context`, replacing the shorter prompt that an
# earlier cell assigned to the same name.
context = """
            Research Paper Knowledge Graph Extraction

            Extract key elements and relationships to build a 
            comprehensive knowledge graph from academic literature.

            **CORE ELEMENTS TO EXTRACT:**

            1. **CONCEPTS:** Fundamental ideas, theories, frameworks, hypotheses, principles
            2. **ENTITIES:** Key objects, subjects, or components relevant to the domain
            3. **METHODS:** Approaches, techniques, procedures, methodologies, algorithms
            4. **DATA:** Findings, results, measurements, observations, statistics
            5. **CONTEXTS:** Domains, fields, systems, environments where research occurs
            6. **AUTHORS:** Researchers, scholars, contributors (include full names and affiliations)
            7. **ORGANIZATIONS:** Institutions, universities, companies, research centers, departments
            8. **PUBLICATION_VENUES:** Journals, conferences, proceedings, books

            **RELATIONSHIP TYPES (Edges):**

            **RESEARCH CONTENT RELATIONSHIPS:**
            1. **INVESTIGATES/EXAMINES:** (Research/Study) â†’ (Concept/Problem/Phenomenon)
            2. **PROPOSES/HYPOTHESIZES:** (Paper/Author) â†’ (Theory/Concept/Framework)
            3. **USES/EMPLOYS:** (Study/Method) â†’ (Technique/Approach/Tool)
            4. **PRODUCES/GENERATES:** (Method/Experiment) â†’ (Result/Data/Finding)
            5. **DEMONSTRATES/SHOWS:** (Evidence/Data) â†’ (Relationship/Effect/Pattern)
            6. **SUPPORTS/CHALLENGES:** (Finding/Result) â†’ (Hypothesis/Theory/Claim)
            7. **COMPARES/CONTRASTS:** (Study/Analysis) â†’ (Approaches/Models/Results)
            8. **EXTENDS/BUILDS_ON:** (CurrentWork) â†’ (PreviousWork/Foundation)

            **AUTHORSHIP & COLLABORATION RELATIONSHIPS:**
            9. **AUTHORED_BY:** (Paper) â†’ (Author)
            10. **AFFILIATED_WITH:** (Author) â†’ (Organization/Institution)
            11. **COLLABORATES_WITH:** (Author/Organization) â†’ (Author/Organization)
            12. **LEADS/RESPONSIBLE_FOR:** (Author) â†’ (Study/Method/Concept) [when clear from context]
            13. **CONTRIBUTES_TO:** (Author/Organization) â†’ (Field/Domain/Advancement)

            **ORGANIZATIONAL & CONTEXTUAL RELATIONSHIPS:**
            14. **APPLIES_TO:** (Method/Theory) â†’ (Domain/Context/Problem)
            15. **FUNDED_BY/SUPPORTED_BY:** (Research/Paper) â†’ (Organization/FundingBody)
            16. **PUBLISHED_IN:** (Paper) â†’ (PublicationVenue)
            17. **LOCATED_IN:** (Organization/Study) â†’ (Location/Country/Region)
            18. **SPECIALIZES_IN:** (Organization/Author) â†’ (Domain/Field/Technique)

            **EXTRACTION GUIDELINES:**

            - Capture author full names and their organizational affiliations
            - Note corresponding authors and leadership roles when evident
            - Extract institutional hierarchies (department â†’ university â†’ country)
            - Identify collaboration patterns between authors and organizations
            - Include funding sources and supporting organizations
            - Capture publication context (journal, conference, year)
            - Prioritize relationships that reveal research networks and institutional contributions
            - Maintain distinction between research content and authorship metadata

            **OUTPUT FORMAT:**
            You MUST respond with a single JSON object. The object MUST contain a key 
            called "relations", which holds an array of all the extracted triple objects. 
            Each triple object MUST have the three keys: "subject", "predicate", and "object".
            
            Example of the ABSOLUTELY REQUIRED output JSON format:
            {
                "relations": [
                    {
                        "subject": "Einstein",
                        "predicate": "proposed",
                        "object": "Theory of Relativity"
                    },
                    {
                        "subject": "Paper Title",
                        "predicate": "AUTHORED_BY",
                        "object": "Jane Doe"
                    }
                ]
            }
            """

In [9]:
# Small sample input for a quick extraction test (it is passed as input_data to
# kg_gen.generate() in a later cell). The text mixes a page header, a figure
# caption, acknowledgement and reference fragments, and the article title.
test_chunk_clean = """
Â© Nature Publishing Group 1953 No. 4356 April 25, 1953 NATURE 737. This figure is purely diagrammatic. The two ribbons symbolize the two phosphate-sugar chains, and the horizontal rods the pairs of bases holding the chains together. The vertical line marks the fibre axis. Equipment, and to Dr. G. E. R. Deacon and the captain and officers of R.R.S. Discovery II for their part in making the observations. Young, F. B., Gerrard, H;, and Jevons, W., Phil. Mag., 40, 149 (1920). Longuet-Higgins, M. S., Mon. Not. Roy. Astra. Soc., Geophys. Supp., 6, 285 (1949). Von Arx, W. S., Woods Hole Papers in Phys. Oceanog. Meteor., 11 (3) (1950). Ekman, Y. W. Arkiv. Mat. Astron. Fysik. (Stockholm), 2 (11) (1905).

MOLECULAR STRUCTURE OF NUCLEIC ACIDS: A Structure for Deoxyribose Nucleic Acid. 

"""



In [None]:
"""

We wish to suggest a structure for the salt of deoxyribose nucleic acid (D.N.A.). This structure has novel features which are of considerable biological interest. A structure for nucleic acid has already been proposed by Pauling and Corey. They kindly made their manuscript available to us in advance of publication. Their model consists of three intertwined chains, with the phosphates near the fibre axis, and the bases on the outside. In our opinion, this structure is unsatisfactory for two reasons: (1) We believe that the material which gives the X-ray diagrams is the salt, not the free acid. Without the acidic hydrogen atoms it is not clear what forces would hold the structure together, especially as the negatively charged phosphates near the axis will repel each other. (2) Some of the van der Waals distances appear to be too small.

Another three-chain structure has also been suggested by Fraser (in the press). In his model the phosphates are on the outside and the bases on the inside, linked together by hydrogen bonds. This structure as described is rather ill-defined, and for this reason we shall not comment on it.

We wish to put forward a radically different structure for the salt of deoxyribose nucleic acid. This structure has two helical chains each coiled round the same axis (see diagram). We have made the usual chemical assumptions, namely, that each chain consists of phosphate di-ester groups joining ÃŸ-D-deoxyribofuranose residues with 3',5' linkages. The two chains (but not their bases) are related by a dyad perpendicular to the fibre axis. Both chains follow right-handed helices, but owing to the dyad the sequences of the atoms in the two chains run in opposite directions. Each chain loosely resembles Furberg's model No.1; that is, the bases are on the inside of the helix and the phosphates on the outside. The configuration of the sugar and the atoms near it is close to Furberg's 'standard configuration', the sugar being roughly perpendicular to the attached base. There is a residue on each chain every 3Â·4 A. in the z-direction. We have assumed an angle of 36Â° between adjacent residues in the same chain, so that the structure repeats after 10 residues on each chain, that is, after 34 A. The distance of a phosphorus atom from the fibre axis is 10 A. As the phosphates are on the outside, cations have easy access to them. The structure is an open one, and its water content is rather high. At lower water contents we would expect the bases to tilt so that the structure could become more compact.

The novel feature of the structure is the manner in which the two chains are held together by the purine and pyrimidine bases. The planes of the bases are perpendicular to the fibre axis. They are joined together in pairs, a single base from one chain being hydrogen-bonded to a single base from the other chain, so that the two lie side by side with identical z-co-ordinates. One of the pair must be a purine and the other a pyrimidine for bonding to occur. The hydrogen bonds are made as follows: purine position 1 to pyrimidine position 1; purine position 6 to pyrimidine position 6. If it is assumed that the bases only occur in the structure in the most plausible tautomeric forms (that is, with the keto rather than the enol configurations) it is found that only specific pairs of bases can bond together. These pairs are: adenine (purine) with thymine (pyrimidine), and guanine (purine) with cytosine (pyrimidine). In other words, if an adenine forms one member of a pair, on either chain, then on these assumptions the other member must be thymine; similarly for guanine and cytosine. The sequence of bases on a single chain does not appear to be restricted in any way. However, if only specific pairs of bases can be formed, it follows that if the sequence of bases on one chain is given, then the sequence on the other chain is automatically determined. It has been found experimentally.
"""

In [3]:
from kg_gen import KGGen
# Client for Gemini 2.5 Flash, addressed with the "vertex_ai/" model prefix;
# temperature 0.0 and no explicit API key.
kg_gen = KGGen(model="vertex_ai/gemini-2.5-flash", temperature=0.0, api_key=None)

In [4]:
kg_gen

<kg_gen.kg_gen.KGGen at 0x7fba623d4210>

In [11]:
# Minimal alternative input kept for reference:
#   test_text = "NATURE is published by Nature Publishing Group."
graph = kg_gen.generate(context=context, input_data=test_chunk_clean)

# Print both collections, one labelled line each.
for label, members in (("Entities", graph.entities), ("Relations", graph.relations)):
    print(f"{label}: {members}")

Entities: {'Dr. G. E. R. Deacon', 'Young, F. B.', 'No. 4356', 'Geophys. Supp.', '1920', '1949', 'Gerrard, H.', 'Structure for Deoxyribose Nucleic Acid', 'MOLECULAR STRUCTURE OF NUCLEIC ACIDS', 'Jevons, W.', 'fibre axis', 'Nature Publishing Group', 'Von Arx, W. S.', 'captain', 'Woods Hole Papers in Phys. Oceanog. Meteor.', 'Ekman, Y. W.', '737', 'Phil. Mag.', 'Arkiv. Mat. Astron. Fysik. (Stockholm)', '11 (3)', 'phosphate-sugar chains', '1905', 'April 25, 1953', 'R.R.S. Discovery II', 'bases', '1950', '2 (11)', 'officers', 'Longuet-Higgins, M. S.', 'Equipment', 'NATURE', '40', 'observations', 'ribbons', '149', '6', '285', 'Mon. Not. Roy. Astra. Soc.', 'Deoxyribose Nucleic Acid'}
Relations: {('Mon. Not. Roy. Astra. Soc.', 'has supplement', 'Geophys. Supp.'), ('Gerrard, H.', 'authored', 'Phil. Mag.'), ('Arkiv. Mat. Astron. Fysik. (Stockholm)', 'published in', '1905'), ('NATURE', 'has page number', '737'), ('Structure for Deoxyribose Nucleic Acid', 'is about', 'Deoxyribose Nucleic Acid'), (

In [21]:
os.getenv('OLLAMA_MODEL') 

In [20]:
os.environ['USE_OLLAMA'] = 'true'

In [None]:
phi3:mini