<a href="https://colab.research.google.com/github/remixwithkj/Backupmac/blob/main/clustering/Content_Based_Recsys_Embeddings_v1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Loading the dataset

In [1]:
!pip install sentence_transformers

Collecting sentence_transformers
  Downloading sentence_transformers-3.1.0-py3-none-any.whl.metadata (23 kB)
Downloading sentence_transformers-3.1.0-py3-none-any.whl (249 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m249.1/249.1 kB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: sentence_transformers
Successfully installed sentence_transformers-3.1.0


In [2]:
import warnings
warnings.filterwarnings('ignore')

In [3]:
import pandas as pd
import numpy as np
import ast

In [4]:
books_df = pd.read_csv('https://raw.githubusercontent.com/manaranjanp/ISB_MLUL/main/clustering/books_summary.csv.zip', index_col=[0])

In [5]:
books_df.head(10)

Unnamed: 0,book_name,summaries,categories
0,The Highly Sensitive Person,is a self-assessment guide and how-to-live te...,science
1,Why Has Nobody Told Me This Before?,is a collection of a clinical psychologist’s ...,science
2,The Midnight Library,"tells the story of Nora, a depressed woman in...",science
3,Brave New World,presents a futuristic society engineered perf...,science
4,1984,is the story of a man questioning the system ...,science
5,Stolen Focus,explains why our attention spans have been dw...,science
6,The Life-Changing Science of Detecting Bullshit,teaches its readers how to avoid falling for ...,science
7,Dopamine Nation,talks about the importance of living a balance...,science
8,The Art of Statistics,is a non-technical book that shows how statis...,science
9,No Self No Problem,is a provocative read about the implications ...,science


In [6]:
books_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5201 entries, 0 to 5244
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   book_name   5201 non-null   object
 1   summaries   5194 non-null   object
 2   categories  5201 non-null   object
dtypes: object(3)
memory usage: 162.5+ KB


In [7]:
books_df.shape

(5201, 3)

## Drop the books without summaries

In [8]:
books_df.dropna(subset=['summaries'], inplace=True)

In [9]:
books_df.drop_duplicates(subset=['summaries'], inplace=True)

In [10]:
books_df = books_df.reset_index(drop=True)

In [11]:
books_df

Unnamed: 0,book_name,summaries,categories
0,The Highly Sensitive Person,is a self-assessment guide and how-to-live te...,science
1,Why Has Nobody Told Me This Before?,is a collection of a clinical psychologist’s ...,science
2,The Midnight Library,"tells the story of Nora, a depressed woman in...",science
3,Brave New World,presents a futuristic society engineered perf...,science
4,1984,is the story of a man questioning the system ...,science
...,...,...,...
1222,Better Than Before,breaks down the latest research on how to bre...,work
1223,The Happiness Hypothesis,is the most thorough analysis of how you can ...,work
1224,Rich Dad Poor Dad,"tells the story of a boy with two fathers, on...",work
1225,The Ruthless Elimination Of Hurry,"will teach you how to slow down, relax, and l...",mindfulness


## Embedding the summaries

In [12]:
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer

In [13]:
# Step 1: Generate embeddings using BAAI/bge-small-en model from HuggingFace
model = SentenceTransformer("all-MiniLM-L6-v2")

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [14]:
# Generate embeddings for each summary
books_df['embedding'] = books_df['summaries'].map(lambda sentence: model.encode(sentence))

In [15]:
books_df.sample(10)

Unnamed: 0,book_name,summaries,categories,embedding
384,Lean In,explains why women are still underrepresented...,economics,"[0.0024933976, -0.032888502, 0.020896288, 0.10..."
702,An Astronaut’s Guide To Life On Earth,teaches you how to live better by taking less...,happiness,"[0.013795313, -0.030288553, 0.05902581, 0.0531..."
3,Brave New World,presents a futuristic society engineered perf...,science,"[0.0017911997, 0.03210592, -0.08147263, -0.005..."
1032,Start,shows you how you can flip the switch of your...,productivity,"[-0.007334346, 0.02969467, 0.03669724, 0.08221..."
854,The Automatic Millionaire,"is an actionable, step-by-step plan for build...",money,"[-0.005733563, 0.07454969, -0.06469891, -0.009..."
1080,Ending Aging,describes how the process of aging is like a ...,psychology,"[-0.017711572, 0.086317554, 0.032675967, 0.100..."
876,Anything You Want,teaches you how to build a business that’s ba...,money,"[0.0020877956, -0.009392955, -0.03555226, -0.0..."
472,The Power Of Showing Up,inspires parents to help their kids develop s...,relationships,"[-0.0117826145, 0.04290128, 0.050970335, 0.080..."
173,Smarter,is one “slow learner” turned A student’s expe...,science,"[0.051467888, 0.03393837, 0.03735785, 0.063682..."
725,A Message To Garcia,teaches you how to be the best at your job by...,happiness,"[-0.02911731, -0.0066583455, -0.012567387, -0...."


## Finding topN similar books

In [16]:
# Function to find top 5 similar books based on a given title
def get_cosine_simiarity(title, df):
    book_idx = books_df[books_df['book_name'] == title].index[0]
    book_embedding = books_df.loc[book_idx, 'embedding']
    similarities = cosine_similarity([book_embedding], books_df['embedding'].tolist()).flatten()
    return  similarities

### Finding Similar Books

 - The Bitcoin Standard
 - Measure What Matters
 - The Happiness Hypothesis

In [17]:
books_df.sample(10)['book_name']

Unnamed: 0,book_name
892,A World Without Email
482,Battle Hymn Of The Tiger Mother
619,Courage Is Calling
787,The Six Pillars Of Self-Esteem
932,Free To Focus
779,How To Be Alone
1183,Genius Foods
329,A First-Rate Madness
238,Becoming
894,The 22 Immutable Laws of Branding


In [18]:
# Display the titles of the top 5 similar movies for a given movie title
#sample_name = 'Einstein: His Life And Universe'
sample_name = 'Lean In'
# sample_name = "The Art Of Learning"
similarity = get_cosine_simiarity(sample_name,
                                  books_df)

In [19]:
rec_books = books_df.copy()

In [20]:
rec_books['similarity'] = list(similarity)

In [21]:
rec_books.sort_values('similarity', ascending = False)[0:6]

Unnamed: 0,book_name,summaries,categories,embedding,similarity
384,Lean In,explains why women are still underrepresented...,economics,"[0.0024933976, -0.032888502, 0.020896288, 0.10...",1.0
274,The Second Sex,delves into the concept of feminism by lookin...,politics,"[0.020590791, 0.004514648, -0.022322487, 0.038...",0.637384
20,Invisible Women,"talks about the flaws in our societal system,...",science,"[0.008369383, 0.015674097, -0.0025090275, 0.05...",0.583159
736,"Girl, Wash Your Face",inspires women to take their lives into their...,happiness,"[0.018947741, -0.008028975, 9.2654656e-05, 0.0...",0.572891
282,Hood Feminism,explores the idea that traditional feminism o...,politics,"[0.004111466, -0.00565356, -0.0012570323, 0.03...",0.534946
693,Why We Can’t Sleep,will help women in Generation X feel better a...,happiness,"[-0.009216746, 0.034883123, 0.049763348, 0.031...",0.515449
