<a href="https://colab.research.google.com/github/sof1a03/KDE-group6/blob/main/API.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
%pip install fastapi uvicorn rdflib pykeen torch numpy pandas

Collecting fastapi
  Downloading fastapi-0.115.6-py3-none-any.whl.metadata (27 kB)
Collecting uvicorn
  Downloading uvicorn-0.34.0-py3-none-any.whl.metadata (6.5 kB)
Collecting rdflib
  Downloading rdflib-7.1.2-py3-none-any.whl.metadata (11 kB)
Collecting pykeen
  Downloading pykeen-1.11.0-py3-none-any.whl.metadata (85 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.5/85.5 kB[0m [31m6.1 MB/s[0m eta [36m0:00:00[0m
Collecting starlette<0.42.0,>=0.40.0 (from fastapi)
  Downloading starlette-0.41.3-py3-none-any.whl.metadata (6.0 kB)
Collecting isodate<1.0.0,>=0.7.2 (from rdflib)
  Downloading isodate-0.7.2-py3-none-any.whl.metadata (11 kB)
Collecting dataclasses-json (from pykeen)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting click-default-group (from pykeen)
  Downloading click_default_group-1.2.4-py2.py3-none-any.whl.metadata (2.8 kB)
Collecting optuna>=2.0.0 (from pykeen)
  Downloading optuna-4.1.0-py3-none-any.whl.metadata

# Project Overview
**Components**
- External Database (GraphDB): Hosts your RDF data (books, users, categories, etc.). Communicates via SPARQL queries through a RESTful endpoint.
- API Server (FastAPI): Acts as the middleware between the database and the front-end. Processes client requests and interacts with GraphDB to fetch or manipulate data.
- Front-End: Sends requests to the API and displays results to the user (e.g., recommendations, search results).
- Variables and Connections:
  - SPARQL Endpoint: The GraphDB endpoint URL for querying the database.
  - RDF Schema: Defines relationships such as:
      - \<User\> \<likesBook\> \<Book\>.  
      - \<Book\> \<hasCategory\> "Fiction".
      - \<Book\> \<hasTitle\> "Book Title".
  - API Endpoints:
  /api/recommended_books, /api/similar_books, /api/search, /api/surprise_me (defined below); /like_book (planned, not defined in the cells below).
  - Environment Variables: SPARQL endpoint URL. Pagination defaults (e.g., DEFAULT_PAGE_SIZE).

# High-Level Workflow
- **Search Functionality**: Front-end sends a search request with filters.
The API translates the request into a SPARQL SELECT query. GraphDB returns the matching rows as JSON result bindings.
- **Recommendations:** Front-end requests similar books or user-specific recommendations. API sends SPARQL queries to fetch related books or compute recommendations using precomputed relationships in GraphDB.
- **User Actions** (like books): API sends SPARQL INSERT queries to add user preferences into GraphDB.
- **Pagination**: API handles pageSize and pageNum by adding LIMIT and OFFSET to SPARQL queries.

In [None]:
from fastapi import FastAPI, HTTPException, Query
from typing import List, Optional
from SPARQLWrapper import SPARQLWrapper, JSON
from pykeen.triples import TriplesFactory
from node2vec import Node2Vec
import torch
import random

# FastAPI application object; the route and startup decorators in the cells below attach to it.
app = FastAPI()

# Configuration for GraphDB and file paths
# NOTE: " -- " is a placeholder, not a usable URL. Set the real GraphDB SPARQL
# endpoint here before issuing any query.
GRAPHDB_SPARQL_ENDPOINT = " -- "
TRANSE_MODEL_DIR = "transe_model_output"  # Directory that holds the serialized TransE model file
NODE2VEC_EMBEDDINGS_PATH = "node2vec_embeddings.vec"  # Node2Vec embeddings file (loaded as word2vec text format)
DEFAULT_PAGE_SIZE = 10  # Default number of results per page for pagination

In [None]:
# Module-level model handles. Both stay None until the loader functions have run.
transe_model = None
node2vec_model = None


# Load TransE model
def load_transe_model():
    """
    Load the pre-trained TransE model into the global ``transe_model``.

    The model is read from ``<TRANSE_MODEL_DIR>/trained_model.pkl`` and mapped
    to the CPU. The file holds a complete pickled model object (the rest of
    this notebook accesses attributes on it), not a bare state dict, so it is
    loaded with ``weights_only=False``. Recent PyTorch releases default to
    ``weights_only=True`` and refuse to unpickle arbitrary objects.

    Raises:
        Whatever ``torch.load`` raises when the file is missing or unreadable.
    """
    global transe_model
    model_path = f"{TRANSE_MODEL_DIR}/trained_model.pkl"  # Path to the serialized TransE model
    # SECURITY: full unpickling can execute arbitrary code. Only load model
    # files that were produced by a trusted training run.
    transe_model = torch.load(
        model_path,
        map_location=torch.device("cpu"),  # Load the model to CPU
        weights_only=False,
    )

In [None]:
# Load Node2Vec embeddings
def load_node2vec_embeddings():
    """
    Populate the global ``node2vec_model`` from the embeddings file.

    The file at ``NODE2VEC_EMBEDDINGS_PATH`` is read in word2vec text
    (non-binary) format into a gensim ``KeyedVectors`` object.
    """
    global node2vec_model
    # gensim is imported here rather than at the top of the notebook, so it is
    # only needed once the embeddings are actually loaded.
    from gensim.models import KeyedVectors

    vectors = KeyedVectors.load_word2vec_format(
        NODE2VEC_EMBEDDINGS_PATH,
        binary=False,
    )
    node2vec_model = vectors

In [None]:
# Helper to execute SPARQL queries
def execute_sparql_query(query: str):
    """
    Execute a SPARQL query against the GraphDB endpoint.

    Args:
        query (str): The SPARQL query string.
    Returns:
        dict: Parsed JSON response from GraphDB.
    Raises:
        HTTPException: Status 500 if sending the query or converting the
            response fails; the original exception is attached as the cause.
    """
    sparql = SPARQLWrapper(GRAPHDB_SPARQL_ENDPOINT)  # Wrapper bound to the configured endpoint URL
    sparql.setQuery(query)
    sparql.setReturnFormat(JSON)  # Ask the endpoint for JSON results
    try:
        # Send the query and parse the JSON response
        return sparql.query().convert()
    except Exception as e:
        # Deliberately broad: any failure while talking to the endpoint is
        # reported to the API client as a 500, with the cause kept for logs.
        raise HTTPException(status_code=500, detail=f"SPARQL query failed: {e}") from e

In [None]:
# TransE-based recommendation logic
def predict_top_books_transe(user_id, top_n=5):
    """
    Predict the top book recommendations for a user based on TransE embeddings.

    Books are ranked by negative squared Euclidean distance between the user's
    entity embedding and each book's entity embedding (closest first). Ties
    keep the order in which the books appear in the entity mapping.

    Args:
        user_id (str): The ID of the user, as it appears in the entity mapping.
        top_n (int): Number of recommendations to return. Values <= 0 yield
            an empty list.
    Returns:
        list: A list of recommended book IDs, best match first.
    Raises:
        HTTPException: 500 if the TransE model is not loaded, 404 if the user
            is not in the entity mapping.
    """
    # Compare against None explicitly; truthiness of a model object is not a
    # reliable "is it loaded" signal.
    if transe_model is None:
        raise HTTPException(status_code=500, detail="TransE model not loaded")

    # NOTE(review): stock PyKEEN models do not appear to carry `entity_to_id`
    # (that mapping normally lives on the TriplesFactory). This assumes the
    # pickled model was given that attribute -- confirm with the team.
    entity_to_id = transe_model.entity_to_id
    if user_id not in entity_to_id:
        raise HTTPException(status_code=404, detail=f"User {user_id} not found")

    # Only entities whose IRI starts with the book prefix are candidates
    book_prefix = "http://example.org/book"
    candidates = [
        (entity, idx)
        for entity, idx in entity_to_id.items()
        if entity.startswith(book_prefix)
    ]
    if not candidates or top_n <= 0:
        return []

    # ATTENTION (from the original author): not sure this scoring is correct,
    # needs to be checked with the team.
    # NOTE(review): the score uses plain embedding distance and ignores any
    # relation embedding (e.g. likesBook) -- confirm this is intended.
    entity_embeddings = transe_model.entity_representations[0]
    with torch.no_grad():
        user_vector = entity_embeddings(
            torch.tensor([entity_to_id[user_id]])
        ).reshape(1, -1)
        # One batched lookup for all books instead of one call per entity
        book_vectors = entity_embeddings(
            torch.tensor([idx for _, idx in candidates])
        ).reshape(len(candidates), -1)
        scores = -((book_vectors - user_vector) ** 2).sum(dim=1)
        # Stable descending sort keeps mapping order among equal scores
        ranking = torch.sort(scores, descending=True, stable=True).indices[:top_n]

    return [candidates[i][0] for i in ranking.tolist()]

In [None]:
# Node2Vec-based similar book recommendations
def predict_similar_books_node2vec(book_id, top_n=5):
    """
    Recommend books similar to a given book using Node2Vec embeddings.

    Args:
        book_id (str): The ID of the book, used as the lookup key in the
            embeddings.
        top_n (int): Number of similar books to return.
    Returns:
        list: The keys of the nearest neighbours, most similar first.
    Raises:
        HTTPException: 500 if the Node2Vec model is not loaded, 404 if
            ``book_id`` has no embedding.
    """
    # Compare against None explicitly: a loaded container that defines
    # __len__ would be falsy when empty and be misreported as "not loaded".
    if node2vec_model is None:
        raise HTTPException(status_code=500, detail="Node2Vec model not loaded")

    try:
        neighbours = node2vec_model.most_similar(book_id, topn=top_n)
    except KeyError:
        raise HTTPException(
            status_code=404,
            detail=f"Book ID {book_id} not found in Node2Vec embeddings",
        ) from None

    # NOTE(review): neighbours are whatever keys the embeddings contain; if the
    # graph also embedded non-book nodes they are not filtered out here.
    return [book for book, _ in neighbours]  # Return only the IDs, drop the scores

In [None]:
# API endpoint: Fetch personalized recommendations
@app.get("/api/recommended_books")
def recommended_books(userid: str, top_n: int = 5):
    """
    Fetch personalized book recommendations for a user using TransE embeddings.

    Args:
        userid (str): The ID of the user.
        top_n (int): Number of recommendations to return.
    Returns:
        dict: A dictionary with the user ID and recommended books.
    Raises:
        HTTPException: Propagated unchanged from the recommendation helper
            (e.g. 404 for an unknown user); any other failure becomes a 500.
    """
    try:
        # Get recommendations using TransE
        recommendations = predict_top_books_transe(userid, top_n=top_n)
    except HTTPException:
        # Keep the status code chosen by the helper instead of masking it as 500
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e)) from e
    return {"userid": userid, "recommendations": recommendations}

In [None]:
# API endpoint: Fetch books similar to a given book
@app.get("/api/similar_books")
def similar_books(bookid: str, top_n: int = 5):
    """
    Fetch books similar to the given book using Node2Vec embeddings.

    Args:
        bookid (str): The ID of the book.
        top_n (int): Number of similar books to return.
    Returns:
        dict: A dictionary with the book ID and similar books.
    Raises:
        HTTPException: Propagated unchanged from the similarity helper
            (e.g. 404 for an unknown book); any other failure becomes a 500.
    """
    try:
        recommendations = predict_similar_books_node2vec(bookid, top_n=top_n)  # Get similar books
    except HTTPException:
        # Keep the status code chosen by the helper instead of masking it as 500
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e)) from e
    return {"bookid": bookid, "similar_books": recommendations}

In [None]:
# Helper: quote user input for safe use inside a SPARQL query
def _sparql_string_literal(value: str) -> str:
    """Return *value* as a double-quoted SPARQL string literal with special characters escaped."""
    escaped = (
        value.replace("\\", "\\\\")  # Backslash first, so later escapes are not doubled
        .replace('"', '\\"')
        .replace("\n", "\\n")
        .replace("\r", "\\r")
        .replace("\t", "\\t")
    )
    return f'"{escaped}"'


# API endpoint: Search for books
@app.get("/api/search")
def search_books(
    categories: Optional[List[str]] = Query(None),
    isbn: Optional[str] = None,
    title: Optional[str] = None,
    author: Optional[str] = None,
    start_year: Optional[int] = None,
    end_year: Optional[int] = None,
    pageSize: int = 10,
    pageNum: int = 1,
):
    """
    Search for books by category, ISBN, title, author, or publication year.

    All supplied filters are combined (logical AND). User-provided strings are
    escaped before being placed in the query so they cannot alter its
    structure.

    Args:
        categories (List[str], optional): Book categories to filter on; a book
            matches if it has any of them.
        isbn (str, optional): ISBN of the book (exact match).
        title (str, optional): Case-insensitive regular expression matched
            against the title.
        author (str, optional): Author of the book (exact match).
        start_year (int, optional): Start of the publication year range (inclusive).
        end_year (int, optional): End of the publication year range (inclusive).
        pageSize (int): Number of results per page; must be >= 1.
        pageNum (int): 1-based page number to return; must be >= 1.
    Returns:
        dict: ``{"results": [...]}`` where each item has ``bookid``, ``title``
            and ``author`` (``None`` when the book has no author bound).
    Raises:
        HTTPException: 400 for invalid paging values; errors from the SPARQL
            helper are propagated; any other failure becomes a 500.
    """
    # A page number of 0 or a negative size would produce a negative OFFSET/LIMIT
    if pageSize < 1 or pageNum < 1:
        raise HTTPException(status_code=400, detail="pageSize and pageNum must be >= 1")

    patterns = [
        "?book <http://example.org/hasTitle> ?title .",
        "OPTIONAL { ?book <http://example.org/hasAuthor> ?author . }",
    ]

    if categories:
        category_filter = " || ".join(
            f"?category = {_sparql_string_literal(cat)}" for cat in categories
        )
        patterns.append("?book <http://example.org/hasCategory> ?category .")
        patterns.append(f"FILTER({category_filter})")

    if isbn:
        patterns.append(f"?book <http://example.org/hasISBN> {_sparql_string_literal(isbn)} .")

    if title:
        patterns.append(f'FILTER regex(?title, {_sparql_string_literal(title)}, "i")')

    if author:
        patterns.append(f"?book <http://example.org/hasAuthor> {_sparql_string_literal(author)} .")

    # `is not None` so that a year of 0 is still honoured; the ?year pattern is
    # emitted once even when both bounds are given.
    if start_year is not None or end_year is not None:
        patterns.append("?book <http://example.org/hasYear> ?year .")
        if start_year is not None:
            patterns.append(f"FILTER(?year >= {int(start_year)})")
        if end_year is not None:
            patterns.append(f"FILTER(?year <= {int(end_year)})")

    where_clause = "\n        ".join(patterns)
    offset = (pageNum - 1) * pageSize
    # DISTINCT drops rows duplicated by the category join; ORDER BY makes
    # LIMIT/OFFSET pages stable between requests.
    query = f"""
    SELECT DISTINCT ?book ?title ?author WHERE {{
        {where_clause}
    }}
    ORDER BY ?book
    LIMIT {int(pageSize)} OFFSET {int(offset)}
    """

    try:
        results = execute_sparql_query(query)  # Execute the SPARQL query
        books = [
            {
                "bookid": binding["book"]["value"],
                "title": binding["title"]["value"],
                # ?author comes from an OPTIONAL pattern and may be unbound
                "author": binding.get("author", {}).get("value"),
            }
            for binding in results["results"]["bindings"]
        ]
    except HTTPException:
        # Keep the status code chosen by the SPARQL helper
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e)) from e
    return {"results": books}  # Return search results

In [None]:
# API endpoint: Fetch random book recommendations

@app.get("/api/surprise_me")
def surprise_me(userid: Optional[str] = None, top_n: int = 5):
    """
    Fetch random book recommendations, excluding books already liked by the user.

    Args:
        userid (str, optional): The ID of the user. When given, books the user
            already likes are excluded.
        top_n (int): Number of random books to return. Values <= 0 yield an
            empty list; fewer books are returned if not enough are available.
    Returns:
        dict: ``{"surprise_me": [...]}`` with randomly chosen book IRIs.
    Raises:
        HTTPException: 400 if ``userid`` contains characters that are not
            allowed inside an IRI; errors from the SPARQL helper are
            propagated; any other failure becomes a 500.
    """
    # The user ID is embedded in an <...> IRI, so reject characters that could
    # terminate the IRI and inject extra query text.
    if userid and any(ch in '<>"{}|^`\\' or ch <= " " for ch in userid):
        raise HTTPException(status_code=400, detail="userid contains invalid characters")

    try:
        query = "SELECT ?book WHERE { ?book a <http://example.org/Book> }"
        all_books = execute_sparql_query(query)["results"]["bindings"]

        liked_book_ids = set()
        if userid:  # If a user ID is provided, exclude liked books
            liked_query = f"""
            SELECT ?book WHERE {{
                <http://example.org/user/{userid}> <http://example.org/likesBook> ?book .
            }}
            """
            liked_books = execute_sparql_query(liked_query)["results"]["bindings"]
            liked_book_ids = {b["book"]["value"] for b in liked_books}

        available_books = [
            b["book"]["value"]
            for b in all_books
            if b["book"]["value"] not in liked_book_ids
        ]

        # Clamp the sample size: never negative, never more than what is available
        sample_size = min(max(top_n, 0), len(available_books))
        return {"surprise_me": random.sample(available_books, sample_size)}
    except HTTPException:
        # Keep the status code chosen by the SPARQL helper
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e)) from e

In [None]:
# Load models on startup
@app.on_event("startup")
def load_models():
    """
    Startup hook that runs both model loaders.

    The TransE model is loaded first, then the Node2Vec embeddings.
    """
    for loader in (load_transe_model, load_node2vec_embeddings):
        loader()