In [1]:
import pandas as pd
import numpy as np
df = pd.read_csv("books_embeddings.csv")

In [2]:
def parse_embedding(embedding_str):
    """Convert the string of comma-separated floats into a NumPy array."""
    return np.array([float(x) for x in embedding_str.split(',')])

In [21]:
from sklearn.cluster import KMeans

def create_embeddings_list(df, embeddings):
    length = df.shape[0]
    for index, row in df.iterrows():
        vector = parse_embedding(row['book_embedding']).flatten()  
        embeddings.append(vector)  
        
        print(f'Progress: {index / length:.2%}', end='\r')
    return embeddings  


def create_cluster(embeddings):
    embeddings_matrix = np.array(embeddings)  
    print("Embeddings matrix shape:", embeddings_matrix.shape)

    n_clusters = 50
    kmeans = KMeans(n_clusters= n_clusters, random_state=42)
    kmeans.fit(embeddings_matrix)

    cluster_labels = kmeans.labels_
    return cluster_labels, kmeans 


In [20]:
embeddings = []
embeddings = create_embeddings_list(df, embeddings)
cluster_labels, kmeans = create_cluster(embeddings) #not neccessary, can read from csv and load joblib model

Embeddings matrix shape: (103063, 2771)


In [None]:
import joblib
clusters2 = pd.read_csv("cluster_labels.csv")
kmeans2 = joblib.load("kmeans_model.joblib") # read from csv 

In [10]:
unique, counts = np.unique(cluster_labels, return_counts=True)
cluster_distribution = dict(zip(unique, counts))
print("Number of books in each cluster:", cluster_distribution)

Number of books in each cluster: {np.int32(0): np.int64(748), np.int32(1): np.int64(1822), np.int32(2): np.int64(1792), np.int32(3): np.int64(1877), np.int32(4): np.int64(451), np.int32(5): np.int64(528), np.int32(6): np.int64(579), np.int32(7): np.int64(160), np.int32(8): np.int64(1769), np.int32(9): np.int64(685), np.int32(10): np.int64(2674), np.int32(11): np.int64(364), np.int32(12): np.int64(797), np.int32(13): np.int64(718), np.int32(14): np.int64(452), np.int32(15): np.int64(688), np.int32(16): np.int64(210), np.int32(17): np.int64(1533), np.int32(18): np.int64(1294), np.int32(19): np.int64(203), np.int32(20): np.int64(767), np.int32(21): np.int64(806), np.int32(22): np.int64(1699), np.int32(23): np.int64(3843), np.int32(24): np.int64(169), np.int32(25): np.int64(3311), np.int32(26): np.int64(921), np.int32(27): np.int64(516), np.int32(28): np.int64(764), np.int32(29): np.int64(1157), np.int32(30): np.int64(1060), np.int32(31): np.int64(2340), np.int32(32): np.int64(1304), np.in

In [13]:
from sklearn.metrics.pairwise import cosine_similarity

def find_closest_records_clusters(record, new_df, cluster_labels, kmeans, n=5):
    
    record_vector = parse_embedding(record['book_embedding']).reshape(1, -1)

    record_cluster = kmeans.predict(record_vector)[0]
    
    same_cluster_df = new_df[np.array(cluster_labels) == record_cluster]
    
    distances = []
    names = []

    length = same_cluster_df.shape[0]
    for index, row in same_cluster_df.iterrows():
        other_vector = parse_embedding(row['book_embedding']).reshape(1, -1)
        
        similarity = cosine_similarity(record_vector, other_vector)[0][0]
        distance = 1 - similarity
        
        distances.append(distance)
        names.append(row['name'])

    sorted_indices = sorted(range(len(distances)), key=lambda k: distances[k])
    closest_names = [names[i] for i in sorted_indices[:n]]

    return closest_names


In [22]:
find_closest_records_clusters(df.iloc[9809], df, cluster_labels, kmeans, 10)

['Writing Was Everything',
 'Readings',
 'Living by Fiction',
 "The Style's the Man: Reflections on Proust, Fitzgerald, Wharton, Vidal, and Others",
 'Jafsie and John Henry: Essays',
 'Walt Whitman: The Song of Himself',
 'Penchants and Places: Essays and Criticism',
 "Why Kerouac Matters: The Lessons of On the Road (They're Not What You Think)",
 'An American Procession',
 'The Best American Spiritual Writing 2006 (The Best American Series)']

In [27]:
cluster_labels_df = pd.DataFrame(cluster_labels, columns=['Cluster'])
cluster_labels_df.to_csv('cluster_labels.csv', index=False)

In [26]:
import joblib

joblib.dump(kmeans, 'kmeans_model.joblib')

['kmeans_model.joblib']