A simple solution that builds the book graph by linking each book to the books whose author is mentioned in its description.

In [2]:
import sys
import os

# The project root is taken to be the parent of the current working directory;
# it is put at the front of sys.path (once) so project packages can be imported.
PROJECT_ROOT = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
if sys.path.count(PROJECT_ROOT) == 0:
    sys.path.insert(0, PROJECT_ROOT)

In [3]:
from src.models.modules import BookDescriptionEmbeddingSimilarity

import networkx as nx
import pandas as pd

# Raw book catalogue. Later cells read its "Title", "Authors" and "Description" columns.
dataset_path =  os.path.join(PROJECT_ROOT,"data/raw_data/LEHABOOKS.csv")
dataset = pd.read_csv(dataset_path)

# Project similarity model built from the saved embeddings file.
# NOTE(review): later cells treat each row of `model.model` as [title, *embedding] — confirm against the module.
model_path = os.path.join(PROJECT_ROOT, "data/embeddings/books_embeddings_dataset.npy")
model = BookDescriptionEmbeddingSimilarity(model_path)

Load graph from json format

In [4]:
import json
from networkx.readwrite import json_graph

def load_graph(relative_path):
    """Read a node-link JSON file and rebuild the graph it describes.

    Args:
        relative_path: Location of the JSON file, relative to ``PROJECT_ROOT``.

    Returns:
        The graph returned by ``json_graph.node_link_graph`` for the file's content.
    """
    full_path = os.path.join(PROJECT_ROOT, relative_path)
    with open(full_path, "r") as handle:
        payload = json.load(handle)
    return json_graph.node_link_graph(payload)

Build the graph based on author's references

In [5]:
def find_references_author(description, books, current_book):
    """Collect the ids of books whose author string occurs in a description.

    Book ids have the form ``"<Title> (<Authors>)"``.

    Args:
        description: Text to scan. Anything that is not a non-empty string
            (for example the NaN that pandas produces for an empty CSV cell,
            or None) yields no references instead of raising ``TypeError``.
        books: Iterable of record dicts providing the "Title" and "Authors" keys.
        current_book: Id of the book the description belongs to; it is never
            reported as a reference to itself.

    Returns:
        List of book ids, in the order the matching books appear in ``books``.
    """
    if not isinstance(description, str) or not description:
        return []

    references = []
    for book in books:
        author = book["Authors"]
        # Skip missing authors (None / NaN are not strings) and blank ones:
        # an empty string is a substring of every text and would link the
        # description to every such book.
        if not isinstance(author, str) or not author.strip():
            continue

        book_id = f"{book['Title']} ({author})"
        if book_id != current_book and author in description:
            references.append(book_id)
    return references

def build_graph():
    """Build an undirected graph that links books through author mentions.

    Every row of the module-level ``dataset`` becomes a node keyed by
    ``"<Title> (<Authors>)"``, with the raw record stored in the node's
    ``data`` attribute. An edge joins a book to each book returned by
    ``find_references_author`` for its description.

    Returns:
        The populated ``nx.Graph``.
    """
    graph = nx.Graph()
    records = dataset.to_dict(orient="records")
    # Compute each id once; rows sharing title and authors map to the same node.
    ids = [f"{record['Title']} ({record['Authors']})" for record in records]

    for node_id, record in zip(ids, records):
        graph.add_node(node_id, data=record)

    for node_id, record in zip(ids, records):
        description = record["Description"]
        # Empty CSV cells are loaded as NaN (a float); a substring search on
        # them would raise TypeError, so such books simply get no edges.
        if not isinstance(description, str):
            continue

        # Every returned id is built from `records`, so it is already a node
        # of the graph and needs no extra membership check.
        for ref in find_references_author(description, records, node_id):
            graph.add_edge(node_id, ref)
    return graph

Find references by title (or at least try to)

In [6]:
def is_multi_word(title):
    """Return True when ``title`` has at least two whitespace-separated words."""
    words = title.split()
    return len(words) >= 2

def find_references_title(description, books, current_book):
    """Collect the ids of books whose title occurs in a description.

    Only titles longer than three characters that consist of several words
    are considered. Books without an author are skipped.

    Args:
        description: Text to scan. Anything that is not a non-empty string
            (for example the NaN that pandas produces for an empty CSV cell,
            or None) yields no references instead of raising ``TypeError``.
        books: Iterable of record dicts providing the "Title" and "Authors" keys.
        current_book: Id (``"<Title> (<Authors>)"``) of the book the
            description belongs to; it is never reported as its own reference.

    Returns:
        List of book ids, in the order the matching books appear in ``books``.
    """
    if not isinstance(description, str) or not description:
        return []

    references = []
    for book in books:
        author = book["Authors"]
        title = book["Title"]
        if author is None or pd.isna(author):
            continue
        # A missing title (NaN / None) has no length and cannot be searched for.
        if not isinstance(title, str):
            continue

        book_id = f"{title} ({author})"
        if book_id != current_book and len(title) > 3 and is_multi_word(title):
            if title in description:
                references.append(book_id)

    return references

Save the graph in json format.

In [7]:
def save_graph(graph, relative_path):
    """Write ``graph`` to disk as node-link JSON.

    Args:
        graph: Graph to serialise with ``json_graph.node_link_data``.
        relative_path: Destination file, relative to ``PROJECT_ROOT``.
    """
    # Serialise first so a failure here never leaves a truncated file behind.
    payload = json_graph.node_link_data(graph)
    target = os.path.join(PROJECT_ROOT, relative_path)
    with open(target, "w") as out_file:
        json.dump(payload, out_file, indent=4)

Find the neighbours of a book by its title

In [8]:
def find_neighbors_title(Graph, title):
    """Return the titles of all neighbours of the nodes matching ``title``.

    A node matches when its id starts with ``"<title> ("``, so every edition
    or author variant of the same title contributes its neighbours.

    Neighbour nodes are de-duplicated by node id and reported in first-seen
    order (graph node order, then neighbour order). This makes the result
    reproducible between interpreter runs, unlike iterating a ``set`` of
    strings. Two distinct neighbour nodes that share a title still produce
    that title twice.

    Args:
        Graph: Graph whose node ids are ``"<Title> (<Authors>)"`` strings.
        title: Title to look up.

    Returns:
        List of neighbour titles (the node id up to its last ``" ("``).
    """
    prefix = f"{title} ("
    # A dict is used as an insertion-ordered set of neighbour node ids.
    seen = {}
    for node in Graph.nodes:
        if not node.startswith(prefix):
            continue
        for neighbor in Graph.neighbors(node):
            seen.setdefault(neighbor, None)

    return [neighbor.rsplit(" (", 1)[0] for neighbor in seen]

In [9]:
def print_all_nodes(Graph):
    """Print each node that has at least one neighbour, with its neighbour list."""
    for node in Graph.nodes():
        linked = list(Graph.neighbors(node))
        if not linked:
            continue
        print(f"{node} references: {linked}")

Find the books that have the biggest number of common neighbours with a given book

In [10]:
def find_most_neighbors(Graph, title, n = 10):
    """Rank the neighbours of ``title`` by how many neighbours they share with it.

    Args:
        Graph: Graph passed through to ``find_neighbors_title``.
        title: Title of the reference book.
        n: Maximum number of entries to return.

    Returns:
        Up to ``n`` ``(neighbour_title, shared_count)`` tuples, highest count first.
    """
    own = set(find_neighbors_title(Graph, title))
    overlap = {}
    for candidate in own:
        overlap[candidate] = len(own.intersection(find_neighbors_title(Graph, candidate)))
    ranked = sorted(overlap.items(), key=lambda pair: pair[1], reverse=True)
    return ranked[:n]

Find the closest books (embedding similarity + graph neighbours)

In [78]:
import numpy as np

def _lookup_embedding(title):
    """Return the float32 embedding stored for ``title`` in ``model.model``, or None.

    The first row whose leading field equals ``title`` wins.
    NOTE(review): assumes each row is laid out as [title, *embedding] — confirm
    against BookDescriptionEmbeddingSimilarity.
    """
    for row in model.model:
        if row[0] == title:
            return np.array(row[1:], dtype=np.float32)
    return None


def _cosine_similarity(vec_a, vec_b):
    """Cosine similarity of two vectors; 0 when either vector has zero norm."""
    norm_a = np.linalg.norm(vec_a)
    norm_b = np.linalg.norm(vec_b)
    if norm_a and norm_b:
        return np.dot(vec_a, vec_b) / (norm_a * norm_b)
    return 0


def find_closest_books(title, n = 20, max_neighbors = 10, graph_path = "data/graphs/book_graph.json"):
    """Recommend books by merging model predictions with graph neighbours.

    The candidates are the model's recommendations for ``title`` plus the
    graph neighbours of ``title`` scored by cosine similarity of their
    embeddings. When a book appears in both sources the higher score is kept.

    Args:
        title: Title of the reference book; must be present in ``model.model``.
        n: Maximum number of recommendations to return.
        max_neighbors: How many of the best-scoring graph neighbours are
            merged into the candidates.
        graph_path: Graph JSON file, relative to ``PROJECT_ROOT``. The graph
            is re-read from disk on every call.

    Returns:
        Up to ``n`` ``(book_title, score)`` tuples sorted by descending score.

    Raises:
        ValueError: If ``title`` has no embedding in the model.
    """
    Graph = load_graph(graph_path)

    # Ask for one extra result because the query title itself is dropped below.
    predicted = model.recommend_by_title(title, n + 1)

    neighbors = find_neighbors_title(Graph, title)

    embedding = _lookup_embedding(title)
    if embedding is None:
        raise ValueError(f"Title '{title}' not found in the model.")

    neighbor_scores = {}
    for neighbor in neighbors:
        if neighbor == title:
            continue

        neighbor_embedding = _lookup_embedding(neighbor)
        if neighbor_embedding is None:
            # Neighbour exists in the graph but has no embedding: cannot be scored.
            continue

        neighbor_scores[neighbor] = _cosine_similarity(embedding, neighbor_embedding)

    top_neighbors = sorted(neighbor_scores.items(), key=lambda item: -item[1])[:max_neighbors]

    combined = {}
    for book, score in predicted:
        if book != title:
            combined[book] = score

    for book, similarity in top_neighbors:
        if book in combined:
            combined[book] = max(combined[book], similarity)
        else:
            combined[book] = similarity

    sorted_books = sorted(combined.items(), key=lambda item: -item[1])
    return sorted_books[:n]


In [81]:
# Demo: recommendations for the title "We" using the default n = 20.
find_closest_books("We")

[('Superluminal', np.float32(0.95735115)),
 ('Metaplanetary', np.float32(0.95497215)),
 ('Moonwar H', np.float32(0.9478443)),
 ('Vacuum Diagrams', np.float32(0.94667035)),
 ('The Wreck of the River of Stars', np.float32(0.9449295)),
 ('Prelude to Foundation', np.float32(0.9446789)),
 ('Star Wars', np.float32(0.943509)),
 ('Lost in Transmission', np.float32(0.9424872)),
 ('Pebble in the Sky', np.float32(0.9411377)),
 ('Excession', np.float32(0.9408336)),
 ("Foundation's Fear", np.float32(0.94065976)),
 ('The Ringworld Throne', np.float32(0.94043005)),
 ('Souls in the Great Machine', np.float32(0.9403687)),
 ('The Labyrinth Key', np.float32(0.94022304)),
 ('The Stand', np.float32(0.93991613)),
 ("Infinity's Shore", np.float32(0.93970543)),
 ('Hyperion', np.float32(0.9396297)),
 ('Dune: The Machine Crusade', np.float32(0.93862253)),
 ('Atlas Shrugged', np.float32(0.91931033)),
 ('1984', np.float32(0.9189502))]