In [2]:
from sentence_transformers import SentenceTransformer

import umap 
import hdbscan 
import pandas as pd

In [44]:
import plotly.express as px

In [70]:
df = pd.read_csv('archive/books.csv')

In [71]:
df.columns

Index(['isbn13', 'isbn10', 'title', 'subtitle', 'authors', 'categories',
       'thumbnail', 'description', 'published_year', 'average_rating',
       'num_pages', 'ratings_count'],
      dtype='object')

In [72]:
# Dropping unwanted columns 

df = df.drop(['isbn13', 'isbn10','subtitle', 'authors','thumbnail','published_year', 'average_rating',
       'num_pages', 'ratings_count'], axis=1)

In [73]:
# Dropping rows that are NAN

df = df.dropna()
df = df.reset_index()

In [75]:
df.columns
df = df.drop(['index'], axis=1)

In [76]:
df

Unnamed: 0,title,categories,description
0,Gilead,Fiction,A NOVEL THAT READERS and critics have been eag...
1,Spider's Web,Detective and mystery stories,A new 'Christie for Christmas' -- a full-lengt...
2,The One Tree,American fiction,Volume Two of Stephen Donaldson's acclaimed se...
3,Rage of angels,Fiction,"A memorable, mesmerizing heroine Jennifer -- b..."
4,The Four Loves,Christian life,Lewis' work on the nature of love divides love...
...,...,...,...
6506,Journey to the East,Adventure stories,This book tells the tale of a man who goes on ...
6507,The Monk Who Sold His Ferrari: A Fable About F...,Health & Fitness,"Wisdom to Create a Life of Passion, Purpose, a..."
6508,I Am that,Philosophy,This collection of the timeless teachings of o...
6509,The Berlin Phenomenology,History,Since the three volume edition ofHegel's Philo...


In [36]:
try_list = []

for i in range(100):

    try_list.append(str(df['description'][i]))

In [78]:
text_list = df['description'].to_list()


In [79]:
model = SentenceTransformer('all-mpnet-base-v2')

In [80]:
para_embeddings = model.encode(text_list, show_progress_bar=True)

Batches:   0%|          | 0/204 [00:00<?, ?it/s]

In [90]:
import pynndescent
import numpy as np

def correct_alternative_cosine(ds):
    result = np.empty_like(ds)
    for i in range(ds.shape[0]):
        result[i] = 1.0 - np.power(2.0, ds[i])
    return result

pynn_dist_fns_fda = pynndescent.distances.fast_distance_alternatives
pynn_dist_fns_fda["cosine"]["correction"] = correct_alternative_cosine
pynn_dist_fns_fda["dot"]["correction"] = correct_alternative_cosine

In [91]:
umap_embeddings = umap.UMAP(n_components=2, n_neighbors=30, metric='cosine', min_dist=0.0).fit_transform(para_embeddings)
cluster = hdbscan.HDBSCAN(min_cluster_size = 15 ,metric='euclidean').fit(umap_embeddings)
cluster_2 = hdbscan.HDBSCAN(min_cluster_size = 30 ,metric='euclidean').fit(umap_embeddings)
cluster_3 = hdbscan.HDBSCAN(min_cluster_size = 50 ,metric='euclidean').fit(umap_embeddings)
cluster_4 = hdbscan.HDBSCAN(min_cluster_size = 100 ,metric='euclidean').fit(umap_embeddings)
cluster_5 = hdbscan.HDBSCAN(min_cluster_size = 1000 ,metric='euclidean').fit(umap_embeddings)

In [92]:
fig = px.scatter(umap_embeddings, x=0, y=1, color = cluster.labels_, title='cluster 1')
fig.show()

In [93]:
fig = px.scatter(umap_embeddings, x=0, y=1, color = cluster_2.labels_, title='cluster 2')
fig.show()

In [94]:
fig = px.scatter(umap_embeddings, x=0, y=1, color = cluster_3.labels_, title='cluster 3')
fig.show()

In [95]:
fig = px.scatter(umap_embeddings, x=0, y=1, color = cluster_4.labels_, title='cluster 4')
fig.show()

In [96]:
fig = px.scatter(umap_embeddings, x=0, y=1, color = cluster_5.labels_, title='cluster 5')
fig.show()

In [97]:
cluster_6 = hdbscan.HDBSCAN(min_cluster_size =97 ,metric='euclidean').fit(umap_embeddings)

fig = px.scatter(umap_embeddings, x=0, y=1, color = cluster_6.labels_, title='cluster 6')
fig.show()

In [124]:
cluster_7 = hdbscan.HDBSCAN(min_cluster_size = 40 ,metric='euclidean').fit(umap_embeddings)

fig = px.scatter(umap_embeddings, x=0, y=1, color = cluster_7.labels_, title='cluster 7')
fig.show()

In [99]:
df['class 6'] = cluster_6.labels_
df

Unnamed: 0,title,categories,description,class,class 6
0,Gilead,Fiction,A NOVEL THAT READERS and critics have been eag...,5,5
1,Spider's Web,Detective and mystery stories,A new 'Christie for Christmas' -- a full-lengt...,-1,-1
2,The One Tree,American fiction,Volume Two of Stephen Donaldson's acclaimed se...,-1,-1
3,Rage of angels,Fiction,"A memorable, mesmerizing heroine Jennifer -- b...",5,5
4,The Four Loves,Christian life,Lewis' work on the nature of love divides love...,-1,-1
...,...,...,...,...,...
6506,Journey to the East,Adventure stories,This book tells the tale of a man who goes on ...,2,2
6507,The Monk Who Sold His Ferrari: A Fable About F...,Health & Fitness,"Wisdom to Create a Life of Passion, Purpose, a...",2,2
6508,I Am that,Philosophy,This collection of the timeless teachings of o...,2,2
6509,The Berlin Phenomenology,History,Since the three volume edition ofHegel's Philo...,2,2


In [103]:
df['class 3'] = cluster_3.labels_

In [104]:
df

Unnamed: 0,title,categories,description,class 6,class 3
0,Gilead,Fiction,A NOVEL THAT READERS and critics have been eag...,5,14
1,Spider's Web,Detective and mystery stories,A new 'Christie for Christmas' -- a full-lengt...,-1,-1
2,The One Tree,American fiction,Volume Two of Stephen Donaldson's acclaimed se...,-1,-1
3,Rage of angels,Fiction,"A memorable, mesmerizing heroine Jennifer -- b...",5,7
4,The Four Loves,Christian life,Lewis' work on the nature of love divides love...,-1,-1
...,...,...,...,...,...
6506,Journey to the East,Adventure stories,This book tells the tale of a man who goes on ...,2,13
6507,The Monk Who Sold His Ferrari: A Fable About F...,Health & Fitness,"Wisdom to Create a Life of Passion, Purpose, a...",2,13
6508,I Am that,Philosophy,This collection of the timeless teachings of o...,2,13
6509,The Berlin Phenomenology,History,Since the three volume edition ofHegel's Philo...,2,12


In [105]:
catergories = df['categories'].to_list()

In [122]:
cat_embed = model.encode(catergories)

umap_cat = umap.UMAP(n_components=2, n_neighbors=30, metric='cosine', min_dist=0.0).fit_transform(cat_embed)
cluster_cat = hdbscan.HDBSCAN(min_cluster_size = 100, metric='euclidean').fit(umap_cat)

fig = px.scatter(umap_cat, x=0, y=1, color = cluster_cat.labels_, title='cluster 6')
fig.show()


In [133]:
df['class_cat'] = cluster_cat.labels_

In [139]:
fiction_cat = df[df['class_cat'] == 1]
fiction_cat

Unnamed: 0,title,categories,description,class 6,class 3,class_cat
0,Gilead,Fiction,A NOVEL THAT READERS and critics have been eag...,5,14,1
3,Rage of angels,Fiction,"A memorable, mesmerizing heroine Jennifer -- b...",5,7,1
12,Warhost of Vastmark,Fiction,"Tricked once more by his wily half-brother, Ly...",3,-1,1
52,Taken at the Flood,Fiction,A Few Weeks After Marrying An Attractive Young...,5,7,1
66,The Yiddish Policemen's Union,Fiction,"For sixty years, Jewish refugees and their des...",-1,-1,1
...,...,...,...,...,...,...
6481,Amazing Disgrace,Fiction,In this sequel to the popular Cooking with Fer...,5,14,1
6484,Falling Angel,Fiction,Raymond Chandler meets The Exorcist. Classic n...,-1,8,1
6486,Night Has a Thousand Eyes,Fiction,"""Cornell Woolrich's novels define the essence ...",5,14,1
6501,Coin Locker Babies,Fiction,Rescued from the lockers in which they were le...,3,9,1


In [140]:
fiction_cat['categories'].unique()

array(['Fiction', 'FICTION', 'True Crime', 'Reference',
       'Political fiction'], dtype=object)

In [161]:
def catergory_sort(col_name):
    
    cat_dict = {}
    col_name = str(col_name)
    for i in range(len(df[col_name].unique())):
        
        cat = i - 1
        
        fiction_cat = df[df['class_cat'] == cat]

        cat_dict[f'{cat}'] = fiction_cat['categories'].unique()

    return cat_dict

In [162]:
cat_3 = catergory_sort('class 3')

In [169]:
cat_3['12']

array(['Human cloning', 'Science', 'Nature', 'Cosmology', 'Physicists',
       'Life on other planets', 'Albigenses', 'Zoology', 'Physics',
       'Gardens', 'Astronomers', 'Mars (Planet)', 'Animals'], dtype=object)