In [27]:
import urllib.parse        
import requests
from bs4 import BeautifulSoup
import re

import numpy as np
from tqdm import tqdm_notebook
from tqdm import tqdm

import json

from utils.structured_dynamic_index_utils_with_db import Aligner
from utils.openai_utils import LLMTripletExtractor
from utils.structured_inference_with_db import extract_triplets

import pandas as pd
import networkx as nx
import plotly.graph_objects as go

import warnings
warnings.filterwarnings('ignore')

from pymongo import MongoClient
import os
from dotenv import load_dotenv, find_dotenv

from utils.structured_dynamic_index_utils_with_db import Aligner

In [17]:
# --- Mongo Setup ---
_ = load_dotenv(find_dotenv())
mongo_client = MongoClient(os.getenv("MONGO_URI"))
db = mongo_client.get_database("wikidata_ontology")

# --- Extractor Setup ---
# extractor = LLMTripletExtractor(model='gpt-4.1-mini')
aligner = Aligner(db)

In [18]:
def get_wiki_paragraphs_by_entity(entity_name):

    url = f"https://en.wikipedia.org/wiki/{entity_name}"
    response = requests.get(url)

    soup = BeautifulSoup(response.content, 'html.parser')

    title = soup.find(id="firstHeading")

    text_div = soup.find("div", class_='mw-content-ltr mw-parser-output')

    regex = re.compile('infobox.*')
    last_marked = text_div.find("table", {"class" : regex})

    texts = []
    last_marked = text_div.find("h2")

    for text in last_marked.find_all_previous('p'):
        texts.append(text)

    texts.reverse()
    
    text_metadata = []
    for text in texts:
        external_entities = []
        for entity in text.find_all("a"):
            title = entity.get('title')
            if title:
                external_entities.append(title)
        
        content = text.text.strip()
        if len(content) > 0:
            text_metadata.append((content, external_entities))
    
    return text_metadata


In [19]:
entity_name = "Steve Jobs"
get_wiki_paragraphs_by_entity(entity_name)

[('Steven Paul Jobs (February 24, 1955 – October 5, 2011) was an American businessman, inventor, and investor best known for co-founding the technology company Apple Inc. Jobs was also the founder of NeXT and chairman and majority shareholder of Pixar. He was a pioneer of the personal computer revolution of the 1970s and 1980s, along with his early business partner and fellow Apple co-founder Steve Wozniak.',
  ['Apple Inc.',
   'NeXT',
   'Pixar',
   'Personal computer revolution',
   'Steve Wozniak']),
 ("Jobs was born in San Francisco in 1955 and adopted shortly afterwards. He attended Reed College in 1972 before withdrawing that same year. In 1974, he traveled through India, seeking enlightenment before later studying Zen Buddhism. He and Wozniak co-founded Apple in 1976 to further develop and sell Wozniak's Apple I personal computer. Together, the duo gained fame and wealth a year later with production and sale of the Apple II, one of the first highly successful mass-produced micr

In [20]:
texts_metadata = get_wiki_paragraphs_by_entity(entity_name)
wiki_entities = []

for text, metadata in texts_metadata:
    wiki_entities.extend(metadata)
wiki_entities

['Apple Inc.',
 'NeXT',
 'Pixar',
 'Personal computer revolution',
 'Steve Wozniak',
 'Reed College',
 'Hippie trail',
 'Buddhism in the West',
 'Apple I',
 'Apple II',
 'Microcomputer',
 'Xerox Alto',
 'Computer mouse',
 'Graphical user interface',
 'Apple Lisa',
 'Macintosh 128K',
 'Desktop publishing',
 'Aldus Pagemaker',
 'LaserWriter',
 'Laser printer',
 'Vector graphics',
 'PostScript',
 'John Sculley',
 'Computer platform',
 'Lucasfilm',
 'Pixar',
 'Computer-animated',
 'Toy Story',
 'Animation studio',
 'List of Pixar films',
 'Jony Ive',
 'Think different',
 'IMac',
 'ITunes',
 'Mac OS X',
 'Apple Store',
 'IPod',
 'ITunes Store',
 'IPhone',
 'App Store (iOS)',
 'IPad',
 'Gap Inc.',
 'Pancreatic neuroendocrine tumor',
 'Respiratory arrest',
 'Presidential Medal of Freedom']

In [21]:
with open("wiki_entities.json", 'w') as f:
    json.dump(wiki_entities, f)

## Inference

In [24]:
model_name = 'gpt-4.1'
extractor = LLMTripletExtractor(model=model_name)
aligner = Aligner(db)

In [25]:
entity_name = "Steve Jobs"
jobs_paragraphs = get_wiki_paragraphs_by_entity(entity_name)

for paragraph in jobs_paragraphs:
    initial_triplets, refined_triplets, filtered_triplets = extract_triplets(paragraph, sample_id='wiki-texts', aligner=aligner, extractor=extractor)
    print("Refined triplets: ", refined_triplets)
    print("Filtered triplets: ", filtered_triplets)

Refined triplets:  [{'subject': 'Steven Paul Jobs', 'relation': 'date of birth', 'object': 'February 24, 1955', 'subject_type': 'human', 'object_type': 'point in time', 'qualifiers': [], 'source_text_id': 0, 'prompt_token_nums': 0, 'completion_token_num': 0, 'sample_id': 'wiki-texts'}, {'subject': 'Steven Paul Jobs', 'relation': 'date of death', 'object': 'October 5, 2011', 'subject_type': 'human', 'object_type': 'point in time', 'qualifiers': [], 'source_text_id': 0, 'prompt_token_nums': 0, 'completion_token_num': 0, 'sample_id': 'wiki-texts'}, {'subject': 'Steven Paul Jobs', 'relation': 'country of citizenship', 'object': 'United States of America', 'subject_type': 'human', 'object_type': 'country', 'qualifiers': [], 'source_text_id': 0, 'prompt_token_nums': 0, 'completion_token_num': 0, 'sample_id': 'wiki-texts'}, {'subject': 'Steven Paul Jobs', 'relation': 'occupation', 'object': 'businessman', 'subject_type': 'human', 'object_type': 'profession', 'qualifiers': [], 'source_text_id'

## Neo4j

In [4]:
from neo4j import GraphDatabase

uri = "neo4j://localhost:7688"
# username = "neo4j"
# password = "12345678"
# driver = GraphDatabase.driver(uri, auth=(username, password))
driver = GraphDatabase.driver(uri)

In [5]:
def fetch_neo4j_triplets():
    with driver.session() as session:
        query = """
        MATCH (s)-[r]->(o)
        RETURN s.name AS subject, type(r) AS predicate, o.name AS object
        LIMIT 100
        """
        result = session.run(query)
        triplets = [(r["subject"], r["predicate"], r["object"]) for r in result]
    return triplets

fetch_neo4j_triplets()

[('Alla', 'lives_with', 'Roman'), ('Alla', 'lives_with', 'Roman')]

In [13]:

def find_any_node(tx):
    query = "MATCH (n) RETURN n LIMIT 1"
    result = tx.run(query)
    return result.single()[0] if result.peek() else None

def find_some_nodes(tx, limit=5):
    query = f"MATCH (n) RETURN n LIMIT {limit}"
    return [record["n"] for record in tx.run(query)]

with driver.session() as session:
    any_node = session.read_transaction(find_any_node)
    node_collection = session.read_transaction(find_some_nodes, 10)

    print("Any single node:")
    print(any_node)

    print("\nCollection of nodes:")
    for node in node_collection:
        print(node)

driver.close()

Any single node:
<Node element_id='4:4500ff1a-3206-4fe1-9ee3-52b341586ea0:0' labels=frozenset({'Person'}) properties={'name': 'Alla'}>

Collection of nodes:
<Node element_id='4:4500ff1a-3206-4fe1-9ee3-52b341586ea0:0' labels=frozenset({'Person'}) properties={'name': 'Alla'}>
<Node element_id='4:4500ff1a-3206-4fe1-9ee3-52b341586ea0:1' labels=frozenset({'Person'}) properties={'name': 'Alla'}>
<Node element_id='4:4500ff1a-3206-4fe1-9ee3-52b341586ea0:2' labels=frozenset({'Person'}) properties={'name': 'Roman'}>


  with driver.session() as session:
  any_node = session.read_transaction(find_any_node)
  node_collection = session.read_transaction(find_some_nodes, 10)


In [12]:
def find_some_nodes(tx, limit=5):
    query = f"MATCH (n) RETURN n LIMIT {limit}"
    return [record["n"] for record in tx.run(query)]


In [10]:
def add_node(tx, name):
    tx.run("CREATE (n:Person {name: $name})", name=name)

with driver.session() as session:
    session.write_transaction(add_node, "Alla")

  with driver.session() as session:
  session.write_transaction(add_node, "Alla")


In [None]:
def get_triples(tx):
    tx.run(f"""
            MATCH (a)-[r]->(b)
            RETURN a, r, b
            """)
    

with driver.session() as session:
    print(session.execute_read(get_triples))

None


  with driver.session() as session:


In [13]:
def add_relation(tx, head, tail, relation):
    query = f"""
        MATCH (a {{name: $head}}), (b {{name: $tail}})
        CREATE (a)-[r:{relation}]->(b)
        RETURN type(r)
        """
    result = tx.run(query, head=head, tail=tail)

with driver.session() as session:        
    session.write_transaction(add_relation, "DevOps", "Practice", "is")
    # session.write_transaction(add_node, "Practice")


  session.write_transaction(add_relation, "DevOps", "Practice", "is")


In [8]:
def add_node(tx, node_name):
    tx.run("CREATE (n:Node {name: $node_name})", node_name=node_name)

def add_relation(tx, head, tail, relation):
    query = f"""
        MATCH (a {{name: $head}}), (b {{name: $tail}})
        CREATE (a)-[r:{relation}]->(b)
        RETURN type(r)
        """
    result = tx.run(query, head=head, tail=tail)

def get_node(tx, name):
    result = tx.run("MATCH (n:Node {name: $name}) RETURN n.name AS name", name=name)
    return [record["name"] for record in result]


# for i, row in full_df.iterrows():
#     head = row['subject']
#     tail = row['object']
#     relation = "_".join(row['relation'].replace("-", "").replace(".", "").split())
#     # print(head, tail, relation)
#     with driver.session() as session:
#         if not session.read_transaction(get_node, head):
#             session.write_transaction(add_node, head)

#         if not session.read_transaction(get_node, tail):
#             session.write_transaction(add_node, tail)
            
#         session.write_transaction(add_relation, head, tail, relation)


In [41]:
def get_person(tx):
    
    result = tx.run("MATCH (subject:Node {name: $name1}) -[r:RELATION]-> (object:Node {name: $name2}) RETURN r.name as relation", name1="Steven Paul Jobs", name2="Next")
    
    return [rel["relation"] for rel in result]

with driver.session() as session:
    names = session.read_transaction(get_person)

names

  names = session.read_transaction(get_person)


[]

In [10]:
def delete_all(tx):
    
    result = tx.run("MATCH (n) OPTIONAL MATCH (n)-[r]-() DELETE r, n")
    
    return result

with driver.session() as session:
    names = session.write_transaction(delete_all)
    

  names = session.write_transaction(delete_all)


In [None]:
driver.close()

## Post-analysis

In [2]:
import pandas as pd
from collections import defaultdict

df = pd.read_csv('full_triplets.csv', index_col=0)
df

Unnamed: 0,subject,relation,object
0,Steven Paul Jobs,date of birth,"February 24, 1955"
1,Steven Paul Jobs,date of death,"October 5, 2011"
2,Steven Paul Jobs,country of citizenship,United States of America
3,Steven Paul Jobs,occupation,Businessman
4,Steven Paul Jobs,occupation,Inventor
...,...,...,...
42,several other businesses and philanthropic ven...,specialized in,Microprocessor
43,several other businesses and philanthropic ven...,specialized in,technology and pop culture conventions
44,several other businesses and philanthropic ven...,specialized in,technology in K–12 schools
45,several other businesses and philanthropic ven...,specialized in,environmental practices


In [3]:
df_instance_of = df[df['relation'] == 'instance of']

In [4]:
df[(df['subject'] == 'Apple II') & (df['relation'] != 'instance of')].relation.to_list()

['specialized in', 'notable work', 'inception']

In [14]:
relation2head = defaultdict(set)
relation2tail = defaultdict(set)

for i, row in df_instance_of.iterrows():
    relations = df[(df['subject'] == row['subject']) & (df['relation'] != 'instance of')].relation.to_list()
    
    for rel in relations:
        relation2head[rel].add(row['object'])

    relations = df[(df['object'] == row['subject']) & (df['relation'] != 'instance of')].relation.to_list()
    
    for rel in relations:
        relation2tail[rel].add(row['object'])
        

In [15]:
relation2tail

defaultdict(set,
            {'notable work': {'3D computer-animated feature film',
              'Apple Inc.',
              'Computers for higher-education and business markets',
              'Multinational corporation',
              "One of the world's most valuable brands",
              'desk lamp',
              'market capitalization',
              'microcomputer'},
             'developed': {'Animation studio',
              'Computer graphics division of Lucasfilm',
              'RenderMan',
              'Software',
              'microcomputer',
              'video game'},
             'saw the commercial potential of': {'Graphical user interface',
              'Mouse',
              'microcomputer'},
             'inception': {'Graphical user interface',
              'advertising campaign',
              'microcomputer'},
             'has part': {'Graphical user interface', 'microcomputer'},
             'influenced': {'Apple Inc.',
              'Computers for high

In [16]:
relation2head

defaultdict(set,
            {'specialized in': {'Animation studio',
              'Apple Inc.',
              'Computer graphics division of Lucasfilm',
              'Computers for higher-education and business markets',
              'Multinational corporation',
              "One of the world's most valuable brands",
              'Software',
              'Technology company',
              'market capitalization',
              'microcomputer'},
             'notable work': {'Animation studio',
              'Apple Inc.',
              'Computer graphics division of Lucasfilm',
              'Computers for higher-education and business markets',
              'Multinational corporation',
              "One of the world's most valuable brands",
              'market capitalization',
              'microcomputer'},
             'inception': {'Animation studio',
              'Apple Inc.',
              'Computer graphics division of Lucasfilm',
              'Computers for higher-e

In [5]:
df_instance_of

Unnamed: 0,subject,relation,object
26,Apple II,instance of,microcomputer
29,Xerox Alto,instance of,microcomputer
30,Xerox Alto,instance of,Mouse
31,Xerox Alto,instance of,Graphical user interface
33,Apple Lisa,instance of,microcomputer
36,Macintosh,instance of,microcomputer
38,Macintosh,instance of,Graphical user interface
42,Aldus Pagemaker,instance of,Software
43,Apple LaserWriter,instance of,Laser printer
50,NeXT,instance of,Software
