In [1]:
import pandas as pd
import numpy as np
# Load the book table. The search helpers defined later in this notebook read
# its 'name' and 'book_embedding' columns.
df = pd.read_csv("books_embeddings.csv")

In [2]:
def parse_embedding(embedding_str):
    """Turn a comma-delimited string of numbers into a 1-D NumPy float array.

    Each comma-separated token is passed through ``float``; a token that is
    not a valid float (including an empty one) raises ``ValueError``.
    """
    tokens = embedding_str.split(',')
    values = list(map(float, tokens))
    return np.array(values)

In [3]:
from sklearn.metrics.pairwise import cosine_similarity

def find_closest_records_clusters(record, new_df, cluster_labels, kmeans, n=5):
    """Return the names of the ``n`` records closest to ``record`` within its cluster.

    The record's embedding is assigned to a cluster with ``kmeans.predict``;
    every row of ``new_df`` carrying that cluster label is then ranked by
    cosine distance (``1 - cosine similarity``) to the record.

    Parameters
    ----------
    record : mapping or pandas.Series
        Must provide ``record['book_embedding']``, a comma-separated string of
        floats understood by ``parse_embedding``.
    new_df : pandas.DataFrame
        Candidate rows; needs the columns ``'book_embedding'`` and ``'name'``.
    cluster_labels : array-like
        One cluster label per row of ``new_df``, in row order. Either a 1-D
        sequence or a single-column 2-D object (e.g. the DataFrame returned by
        ``pd.read_csv``) is accepted.
    kmeans : object
        Fitted clusterer exposing ``predict``.
    n : int, default 5
        Maximum number of names to return.

    Returns
    -------
    list
        Up to ``n`` values of the ``'name'`` column, nearest first. If
        ``record`` is itself a row of ``new_df`` it takes part in the ranking.
        Empty when no row carries the predicted cluster label.

    Raises
    ------
    ValueError
        If ``cluster_labels`` has more than one column.
    """
    record_vector = parse_embedding(record['book_embedding']).reshape(1, -1)
    record_cluster = kmeans.predict(record_vector)[0]

    # Normalise the labels to 1-D so the comparison below yields a proper row
    # mask. A DataFrame of labels would otherwise produce a 2-D mask whose
    # handling by pandas is accidental and wrong for multi-column frames.
    labels = np.asarray(cluster_labels)
    if labels.ndim > 1:
        if labels.ndim != 2 or labels.shape[1] != 1:
            raise ValueError(
                "cluster_labels must be 1-D or have exactly one column, "
                f"got shape {labels.shape}"
            )
        labels = labels.ravel()

    same_cluster_df = new_df[labels == record_cluster]
    if len(same_cluster_df) == 0:
        return []

    # Parse every candidate once and score them in a single batched call
    # instead of one cosine_similarity call per row.
    embeddings = np.vstack(
        [parse_embedding(text) for text in same_cluster_df['book_embedding']]
    )
    distances = 1 - cosine_similarity(record_vector, embeddings)[0]

    # Stable sort keeps the frame's row order among equally distant records.
    order = np.argsort(distances, kind="stable")[:n]
    names = same_cluster_df['name'].tolist()
    return [names[i] for i in order]

def find_closest_records_clusters2(record, new_df, cluster_labels, kmeans, n=5, n_clusters_to_search=2):
    """Return the names of the ``n`` records closest to ``record``, searching several clusters.

    The record's embedding is assigned to a cluster with ``kmeans.predict``.
    All centroids in ``kmeans.cluster_centers_`` are then ranked by cosine
    similarity to that cluster's centroid, and the top
    ``n_clusters_to_search + 1`` clusters are searched. Rows of ``new_df``
    belonging to those clusters are ranked by cosine distance
    (``1 - cosine similarity``) to the record.

    Parameters
    ----------
    record : mapping or pandas.Series
        Must provide ``record['book_embedding']``, a comma-separated string of
        floats understood by ``parse_embedding``.
    new_df : pandas.DataFrame
        Candidate rows; needs the columns ``'book_embedding'`` and ``'name'``.
    cluster_labels : array-like
        One cluster label per row of ``new_df``, in row order. Either a 1-D
        sequence or a single-column 2-D object (e.g. the DataFrame returned by
        ``pd.read_csv``) is accepted.
    kmeans : object
        Fitted clusterer exposing ``predict`` and ``cluster_centers_``.
    n : int, default 5
        Maximum number of names to return.
    n_clusters_to_search : int, default 2
        The search covers ``n_clusters_to_search + 1`` clusters in total.
        NOTE(review): presumably "the record's own cluster plus this many
        neighbouring clusters" - confirm the ``+ 1`` is intended.

    Returns
    -------
    list
        Up to ``n`` values of the ``'name'`` column, nearest first. If
        ``record`` is itself a row of ``new_df`` it takes part in the ranking.
        Empty when no row belongs to the searched clusters.

    Raises
    ------
    ValueError
        If ``cluster_labels`` has more than one column.
    """
    record_vector = parse_embedding(record['book_embedding']).reshape(1, -1)
    record_cluster = kmeans.predict(record_vector)[0]

    # Rank every centroid by cosine similarity to the record's own centroid;
    # negating before argsort puts the most similar centroid first.
    record_centroid = kmeans.cluster_centers_[record_cluster].reshape(1, -1)
    centroid_similarities = cosine_similarity(record_centroid, kmeans.cluster_centers_)[0]
    closest_cluster_indices = np.argsort(-centroid_similarities)[:n_clusters_to_search + 1]

    # Normalise the labels to 1-D so np.isin yields a proper row mask. A
    # DataFrame of labels would otherwise produce a 2-D mask whose handling by
    # pandas is accidental and wrong for multi-column frames.
    labels = np.asarray(cluster_labels)
    if labels.ndim > 1:
        if labels.ndim != 2 or labels.shape[1] != 1:
            raise ValueError(
                "cluster_labels must be 1-D or have exactly one column, "
                f"got shape {labels.shape}"
            )
        labels = labels.ravel()

    closest_records_df = new_df[np.isin(labels, closest_cluster_indices)]
    if len(closest_records_df) == 0:
        return []

    # Parse every candidate once and score them in a single batched call
    # instead of one cosine_similarity call per row.
    embeddings = np.vstack(
        [parse_embedding(text) for text in closest_records_df['book_embedding']]
    )
    distances = 1 - cosine_similarity(record_vector, embeddings)[0]

    # Stable sort keeps the frame's row order among equally distant records.
    order = np.argsort(distances, kind="stable")[:n]
    names = closest_records_df['name'].tolist()
    return [names[i] for i in order]

In [4]:
import joblib
# Cluster labels loaded from CSV. read_csv returns a DataFrame (2-D), not a
# 1-D label array; presumably one label per row of `df`, in the same order -
# TODO confirm against the script that wrote this file.
clusters_best = pd.read_csv("cluster_labels_best.csv")
# Load the saved model object from a joblib file (not a CSV).
# NOTE(review): joblib.load unpickles arbitrary Python objects - only load
# model files that come from a trusted source.
kmeans_best = joblib.load("kmeansbest_model.joblib")

In [6]:
# Example query: up to 10 names closest to row 9816 of `df`, restricted to the
# cluster that the model predicts for that row.
find_closest_records_clusters(df.iloc[9816], df, clusters_best, kmeans_best, 10)

['Anna Karenina',
 'Anna Karenina (Barnes & Noble Classics)',
 'Ethan Frome (Penguin Classics)',
 'Eugenie Grandet (The Human Comedy)',
 'The American (Penguin Classics)',
 "The Pemberley Chronicles: A Companion Volume to Jane Austen's Pride and Prejudice: Book 1",
 'The Wine-Dark Sea (Book 16)',
 'Wildest Dreams',
 'Sense and Sensibility (Barnes & Noble Classics Series) (B&N Classics)',
 'Russka']