In [None]:
import os
import pandas as pd
import re
import ollama
import time
import requests
from collections import defaultdict
from tqdm import tqdm 
from neo4j import GraphDatabase

## DBpedia

In [None]:
# Connection settings for the local Neo4j instance. Each value can be
# overridden through an environment variable so that credentials do not have
# to be edited (or committed) inside the notebook; the defaults are the values
# this notebook has always used.
URI = os.environ.get("NEO4J_URI", "bolt://localhost:7687")
USERNAME = os.environ.get("NEO4J_USERNAME", "neo4j")
PASSWORD = os.environ.get("NEO4J_PASSWORD", "password123")

driver = GraphDatabase.driver(URI, auth=(USERNAME, PASSWORD))
# Creating the driver object does not contact the server yet.
# verify_connectivity() raises right here if the server is unreachable or the
# credentials are rejected, so the message below is only printed for a
# connection that actually works.
driver.verify_connectivity()
print("Connected to Neo4j!")

In [None]:
# IMPORT TURTLE FILE
# Folder that is scanned for .ttl files. Set NEO4J_IMPORT_DIR to point the
# notebook at another machine's folder; the default is the original path.
IMPORT_FOLDER = os.environ.get(
    "NEO4J_IMPORT_DIR",
    r"C:/Users/ylaar/.Neo4jDesktop/relate-data/dbmss/dbms-ce0521ea-df1c-40e3-8b68-94d8496673a1/import",
)
# Plain-text log with one already-imported file name per line.
IMPORTED_FILES_LIST = "imported_files.txt"

# load the list of already imported files
imported_files = set()
if os.path.exists(IMPORTED_FILES_LIST):
    with open(IMPORTED_FILES_LIST, "r", encoding="utf-8") as f:
        # Skip empty lines so a stray blank line is never treated as a file name.
        imported_files = {name for name in f.read().splitlines() if name}

# os.listdir returns entries in arbitrary order; sort for a reproducible
# import sequence.
turtle_files = sorted(name for name in os.listdir(IMPORT_FOLDER) if name.endswith('.ttl'))
turtle_files_to_import = [name for name in turtle_files if name not in imported_files]

def import_ttl_file(tx, file_path):
    """Import one Turtle file into Neo4j through ``n10s.rdf.import.fetch``.

    Args:
        tx: Transaction-like object exposing ``run(query, **parameters)``.
        file_path: Path of the ``.ttl`` file; backslashes are converted to
            forward slashes before the ``file:///`` URL is built.

    Raises:
        RuntimeError: If the procedure reports ``terminationStatus`` "KO".
            n10s signals a failed import through that column instead of
            raising, so without this check a failed file would be treated as
            imported by the caller.
    """
    file_path = file_path.replace("\\", "/")  # Fix Windows backslashes
    file_url = f"file:///{file_path}"
    # The URL is passed as a query parameter rather than interpolated into the
    # Cypher text, so quotes or other special characters in a file name cannot
    # break (or alter) the statement.
    cypher_query = (
        'CALL n10s.rdf.import.fetch($url, "Turtle", '
        '{ handleVocabUris: "SHORTEN" });'
    )
    record = tx.run(cypher_query, url=file_url).single()
    if record is not None and record.get("terminationStatus") == "KO":
        raise RuntimeError(
            f"n10s import failed for {file_url}: {record.get('extraInfo')}"
        )

# Import every pending Turtle file, one write transaction per file, and log
# each file name as soon as its import call has returned so that an
# interrupted run can resume where it stopped.
with driver.session() as session:
    with tqdm(turtle_files_to_import, desc="Importing Turtle Files", unit="file") as pbar:
        for ttl_name in pbar:  # the bar advances once per completed iteration
            print(f"Processing: {ttl_name}")
            session.execute_write(import_ttl_file, os.path.join(IMPORT_FOLDER, ttl_name))
            with open(IMPORTED_FILES_LIST, "a", encoding="utf-8") as imported_log:
                imported_log.write(f"{ttl_name}\n")

# Note: this closes the shared `driver`; any later use of `driver` needs a
# new driver object.
driver.close()
print("All TTL files imported successfully!")


## CSO

In [None]:
# Load the raw CSO export.
# NOTE(review): read_csv treats the first line of the file as a header row and
# the column names are overwritten just below. Confirm the CSV really starts
# with a header; otherwise its first row is lost here.
file_path = "data/CSO/CSO.3.4.1.csv"
df = pd.read_csv(file_path)

# Work on a copy so the raw frame stays untouched, and give the three columns
# readable names.
triple_columns = ['topic1', 'relationship', 'topic2']
df_clean = df.copy()
df_clean.columns = triple_columns

# Function to extract the topic name from the URL and clean special characters
def extract_topic(url):
    """Reduce a topic URL to the bare name in its last path segment.

    Angle brackets are removed, and the name is cut at the first ``_%``
    sequence and then at the first remaining ``%``, so percent-encoded parts
    are dropped together with everything after them. Missing values are
    returned unchanged.
    """
    if pd.isna(url):
        return url
    last_segment = url.rsplit('/', 1)[-1]
    without_brackets = last_segment.replace('<', '').replace('>', '')
    # Drop an encoded suffix such as "_%28svms%29" ...
    base_name = without_brackets.partition('_%')[0]
    # ... and cut at any other percent-escape such as "%2C".
    return base_name.partition('%')[0]

df_clean['topic1'] = df_clean['topic1'].apply(extract_topic)
df_clean['topic2'] = df_clean['topic2'].apply(extract_topic)


def _extract_relationship(uri):
    """Return the local name of a relationship URI.

    The local name is the text after the last '#', or after the last '/' when
    there is no '#', with angle brackets removed. Missing values are returned
    unchanged.
    """
    if pd.isna(uri):
        return uri
    local_name = uri.split('#')[-1] if '#' in uri else uri.split('/')[-1]
    return local_name.replace('>', '').replace('<', '')


# Clean up the relationship column
df_clean['relationship'] = df_clean['relationship'].apply(_extract_relationship)

# Shortening the values above can make previously distinct rows identical.
df_clean = df_clean.drop_duplicates()

# Save where the later cells of this notebook load the cleaned file from
# ("CSO/CSO_cleaned.csv"), and make sure that folder exists first.
clean_file_path = 'CSO/CSO_cleaned.csv'
os.makedirs(os.path.dirname(clean_file_path), exist_ok=True)
df_clean.to_csv(clean_file_path, index=False)

print(f"Cleaned file saved at: {clean_file_path}")

Generate content: Option 1 - with LLM

In [None]:
# Load the cleaned CSO table used by the rest of this cell.
file_path = "CSO/CSO_cleaned.csv"
df = pd.read_csv(filepath_or_buffer=file_path)

def is_valid_topic(topic):
    """Tell whether ``str(topic)`` is a plain name.

    A plain name starts with an ASCII letter and continues with letters,
    digits, underscores, hyphens or spaces only.
    """
    plain_name = r"^[a-zA-Z][a-zA-Z0-9_\- ]*$"
    return re.match(plain_name, str(topic)) is not None

# Extract unique topics and relationships
unique_topics = set(df["topic1"].dropna()).union(set(df["topic2"].dropna()))
# Sort the valid topics: iterating a set of strings gives a different order
# from run to run, which would make the processing order (and the row order of
# the generated CSV) irreproducible.
cleaned_topics = sorted(
    (topic for topic in unique_topics if is_valid_topic(topic)), key=str
)

# Build ontology mapping (topic -> parent topic)
# NOTE(review): every complete row is used regardless of its `relationship`
# value. Confirm that all relationship types really mean "topic1 is the
# parent of topic2".
ontology = defaultdict(set)
complete_rows = df.dropna()
# Walk the two columns directly instead of iterrows(), which builds a Series
# object for every row.
for parent, child in zip(complete_rows["topic1"], complete_rows["topic2"]):
    ontology[child].add(parent)  # topic2 is a sub-topic of topic1

# Function to generate a content for a given topic
def generate_description(topic, model="mistral"):
    """Ask an Ollama chat model for an educational description of ``topic``.

    Args:
        topic: Topic name; underscores are shown as spaces in the prompt.
        model: Name of the Ollama model to query (defaults to "mistral").

    Returns:
        The text returned by the model. If the call fails for any reason the
        error is printed and a one-sentence fallback description is returned
        instead, so a long generation run is not aborted by a single failure.
    """
    topic_formatted = topic.replace("_", " ")
    # Sort the parents: `ontology` stores them in sets, whose iteration order
    # is not stable between runs, and the prompt should be reproducible.
    parent_topics = sorted(ontology.get(topic, []))
    parent_list = ", ".join(p.replace("_", " ") for p in parent_topics) if parent_topics else "computer science"

    prompt = f"""Generate an educational description for the topic: '{topic_formatted}'. 
    It belongs to the category: {parent_list}. 
    The description should be informative, structured, and suitable for students and researchers. 
    Include definitions, use cases, and why it's important.
    """

    try:
        response = ollama.chat(model=model, messages=[{"role": "user", "content": prompt}])
        return response["message"]["content"]
    except Exception as e:
        # Best effort by design: report the problem and fall back to a stub.
        print(f"Error generating content for {topic}: {e}")
        return f"{topic_formatted} is an important topic in {parent_list}."

# Generate descriptions
total_topics = len(cleaned_topics)
topic_descriptions = {}
for position, topic in enumerate(cleaned_topics, start=1):
    print(f"Generating content for topic {position}/{total_topics}: {topic}")
    topic_descriptions[topic] = generate_description(topic)
    time.sleep(1)  # To avoid API rate limits

# Convert to DataFrame and save to CSV
df_descriptions = pd.DataFrame(
    {
        "Topic": list(topic_descriptions.keys()),
        "Description": list(topic_descriptions.values()),
    }
)
df_descriptions.to_csv("CSO_enhanced.csv", index=False)

print("Descriptions generated and saved successfully!")

Generate content: Option 2 - with Wikipedia articles

In [None]:
# Read the knowledge graph CSV (the topics are taken from its `topic1` and
# `topic2` columns further down)
csv_path = 'CSO/CSO_cleaned.csv'
data = pd.read_csv(filepath_or_buffer=csv_path)

def get_wikipedia_content(title):
    """Fetch the plain-text extract of an English Wikipedia page.

    The request is retried up to five times with exponential back-off
    (1, 2, 4, 8 seconds between attempts) when it fails.

    Args:
        title: Page title to look up.

    Returns:
        The page text, or ``None`` when the page does not exist, the response
        carries no page, or every attempt failed. Callers cannot tell these
        cases apart from the return value alone.
    """
    url = "https://en.wikipedia.org/w/api.php"
    params = {
        "action": "query",
        "titles": title,
        "prop": "extracts",
        "explaintext": True,
        # Follow redirects: without this a title that is only a redirect comes
        # back as a page with an empty extract.
        "redirects": 1,
        "format": "json"
    }
    # Wikimedia asks API clients to identify themselves; adjust this string to
    # name your project and a contact.
    headers = {"User-Agent": "CSO-knowledge-graph-notebook/1.0 (python-requests)"}
    max_attempts = 5
    for attempt in range(max_attempts):
        try:
            response = requests.get(url, params=params, headers=headers, timeout=10)
            response.raise_for_status()
            payload = response.json()
        except (requests.exceptions.RequestException, ValueError) as e:
            # ValueError covers a non-JSON body on requests versions whose
            # decode error is not a RequestException.
            print(f"Attempt {attempt + 1} failed for {title}: {e}")
            if attempt < max_attempts - 1:  # no point in waiting after the last try
                time.sleep(2 ** attempt)
            continue
        pages = payload.get("query", {}).get("pages", {})
        if not pages:
            return None
        page = next(iter(pages.values()), {})
        if "missing" in page:
            return None
        return page.get("extract")
    return None

def node_exists(tx, node_name):
    """Report whether at least one node has ``name`` equal to ``node_name``.

    Returns the ``exists`` value of the single result record, or the record
    itself (e.g. ``None``) when no usable record comes back.
    """
    query = """
        MATCH (n {name: $name})
        RETURN COUNT(n) > 0 AS exists
    """
    record = tx.run(query, name=node_name).single()
    if not record:
        return record
    return record["exists"]

def node_has_content(tx, node_name):
    """Report whether the node named ``node_name`` has a non-null ``content``.

    Returns the ``hasContent`` value of the single result record, or the
    record itself (e.g. ``None``) when no usable record comes back.
    """
    query = """
        MATCH (n {name: $name})
        RETURN n.content IS NOT NULL AS hasContent
    """
    record = tx.run(query, name=node_name).single()
    if not record:
        return record
    return record["hasContent"]

def delete_node(tx, node_name):
    """Remove every node whose ``name`` equals ``node_name``, relationships included."""
    cypher = """
        MATCH (n {name: $name})
        DETACH DELETE n
    """
    tx.run(cypher, name=node_name)

def update_node_content(tx, node_name, content):
    """Store ``content`` in the ``content`` property of the node(s) named ``node_name``."""
    statement = """
        MATCH (n {name: $name})
        SET n.content = $content
    """
    parameters = {"name": node_name, "content": content}
    tx.run(statement, **parameters)

# The shared `driver` has already been closed by the Turtle import cell, and
# this cell closes it again when it finishes. Opening a dedicated driver here
# (closed automatically by the `with` block) keeps the cell working under
# "Run All" and when it is re-run on its own.
with GraphDatabase.driver(URI, auth=(USERNAME, PASSWORD)) as driver, driver.session() as session:
    # Missing values cannot match any node name, so drop them up front.
    topics = pd.concat([data['topic1'], data['topic2']]).dropna().unique()
    for topic in tqdm(topics, desc="Processing topics"):
        if not session.execute_read(node_exists, topic):
            # No such node (for instance, deleted by an earlier run): skip.
            continue
        if session.execute_read(node_has_content, topic):
            # Already enriched by an earlier run: skip.
            continue
        content = get_wikipedia_content(topic)
        if content:
            session.execute_write(update_node_content, topic, content)
        else:
            # NOTE(review): get_wikipedia_content also returns None when all
            # of its request attempts fail, so a network outage deletes nodes
            # here as well. Confirm that this is acceptable.
            session.execute_write(delete_node, topic)
        time.sleep(1)

print("Graph update complete.")
