In [6]:
import pandas as pd

In [7]:
# Read the two book CSVs from the Datasets folder into separate frames.
raw_csv, clean_csv = "Datasets/BooksDataset.csv", "Datasets/BooksDatasetClean.csv"
df = pd.read_csv(raw_csv)
df_clean = pd.read_csv(clean_csv)

In [None]:
# Display the (rows, columns) shape of both frames side by side.
df.shape, df_clean.shape

In [None]:
# Display the column labels of both frames for comparison.
df.columns, df_clean.columns

In [None]:
# Summary statistics for df (describe() covers numeric columns by default).
df.describe()

In [None]:
# Print df's column dtypes, non-null counts and memory usage.
df.info()

In [None]:
# Count how many rows carry each distinct value of the "Category" column.
df["Category"].value_counts()

In [13]:
# Work on a copy so df_clean itself is never modified.
wdf = df_clean.copy()

# Give every book some text to embed: keep the description when present,
# otherwise fall back to the category string, then to the title. The final ""
# fallback keeps the column free of NaN when all three are missing, since a
# float NaN is not valid tokenizer input.
wdf["Description"] = (
    wdf["Description"]
    .fillna(wdf["Category"])
    .fillna(wdf["Title"])
    .fillna("")
)

# Fill Category only after it has served as a fallback above, so that books
# without a category fall through to their title instead of an empty string.
wdf["Category"] = wdf["Category"].fillna("")

In [14]:
from sklearn.preprocessing import MultiLabelBinarizer

# Each Category cell is one string with labels joined by ' , '; turn it into a
# list of trimmed labels (an empty Category yields ['']).
wdf['Category_list'] = wdf['Category'].map(
    lambda joined: [label.strip() for label in joined.split(' , ')]
)


In [None]:
# Preview the first rows of the working frame.
wdf.head()

In [16]:

# One-hot encode the category lists: one 0/1 indicator column per distinct label.
mlb = MultiLabelBinarizer()
encoded_categories = mlb.fit_transform(wdf['Category_list'])

# Reuse wdf's index so the axis=1 concat below lines up row-for-row even when
# wdf's index is not a fresh 0..n-1 range; a default index would misalign the
# rows and fill the gaps with NaN.
encoded_df = pd.DataFrame(encoded_categories, columns=mlb.classes_, index=wdf.index)

# NOTE: this rebinds wdf, so re-running the cell appends the indicator columns again.
wdf = pd.concat([wdf, encoded_df], axis=1)


In [None]:
# importing libraries
import random
import torch
from transformers import BertTokenizer, BertModel
from sklearn.metrics.pairwise import cosine_similarity


# Reproducibility: seed Python's RNG and PyTorch (every CUDA device when present).
random_seed = 42
random.seed(random_seed)
torch.manual_seed(random_seed)

cuda_ready = torch.cuda.is_available()
if cuda_ready:
    torch.cuda.manual_seed_all(random_seed)

# Run on the GPU when one is available, otherwise on the CPU.
device = torch.device('cuda' if cuda_ready else 'cpu')

# Pretrained tokenizer and encoder; the encoder is moved onto `device`.
bert_name = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(bert_name)
model = BertModel.from_pretrained(bert_name).to(device)


In [None]:
def product_text_embedding(text):
    """Embed one piece of text by mean-pooling the model's token vectors.

    Args:
        text: The text to embed. ``None`` and float NaN are embedded as the
            empty string; any other non-string value is converted with ``str``.

    Returns:
        A CPU tensor with a leading batch dimension of 1 holding the average
        of the final hidden states over all tokens, including the special
        tokens the tokenizer adds.
    """
    # Missing values (e.g. NaN from an empty CSV cell) are not valid tokenizer
    # input; map them to "" instead of failing in the middle of a long run.
    if text is None or (isinstance(text, float) and text != text):
        text = ""
    elif not isinstance(text, str):
        text = str(text)

    # Calling the tokenizer directly is the supported replacement for the
    # deprecated batch_encode_plus. Passing a one-element list keeps the
    # leading batch dimension in the returned tensors.
    encoding = tokenizer(
        [text],
        padding=True,
        truncation=True,
        return_tensors='pt',
        add_special_tokens=True,
    )

    input_ids = encoding['input_ids'].to(device)
    attention_mask = encoding['attention_mask'].to(device)

    # Inference only: no gradient tracking needed.
    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)

    # The batch holds a single sequence, so nothing is padded and a plain mean
    # over the token axis averages real tokens only.
    sentence_embedding = outputs.last_hidden_state.mean(dim=1)

    return sentence_embedding.cpu()


# Smoke check: embed a short sample string and display the shape of the result.
text = "Check working"


product_text_embedding(text).shape

In [None]:
# Columns left out of the experiment frame.
unused_columns = [
    'Authors', 'Category', 'Category_list', 'Publisher',
    'Price Starting With ($)', 'Publish Date (Month)', 'Publish Date (Year)',
]

# drop() returns a new frame, so wdf itself is left unchanged.
exp_df = wdf.drop(columns=unused_columns)

exp_df.head()

In [20]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

import pandas as pd
import numpy as np

def create_new_dataframe(df):
    """Build a table with one serialized feature vector per input row.

    For each row, the text embedding of ``Description`` is concatenated with
    the row's numeric columns and stored as a comma-separated string of floats.

    Args:
        df: Frame with ``Title`` and ``Description`` columns. Every column with
            a numeric dtype is appended after the embedding values.

    Returns:
        DataFrame with columns ``book_embedding`` (comma-separated floats) and
        ``name`` (the row's ``Title``), one row per input row, indexed from 0.
        An empty input yields an empty frame that still has both columns.
    """
    numerical_cols = df.select_dtypes(include=np.number).columns
    # Convert the numeric block once up front instead of slicing every row.
    numeric_matrix = df[numerical_cols].to_numpy(dtype=np.float64)

    length = df.shape[0]
    records = []
    rows = zip(df['Title'], df['Description'], numeric_matrix)
    for position, (name, description, numerical_values) in enumerate(rows, start=1):
        # Flatten whatever width the embedding has rather than assuming 768.
        vector = np.asarray(product_text_embedding(description), dtype=np.float64).reshape(-1)

        combined_vector = np.concatenate((vector, numerical_values))

        # tolist() yields plain Python floats, whose str() form reads back with float().
        vector_str = ','.join(map(str, combined_vector.tolist()))

        records.append({'book_embedding': vector_str, 'name': name})

        # Progress follows row position, so it is right for any index labels.
        print(f'Progress: {position / length:.2%}', end='\r')

    # Build the frame once; concatenating inside the loop re-copies all earlier rows.
    return pd.DataFrame(records, columns=['book_embedding', 'name'])



def parse_embedding(embedding_str):
    """Turn a comma-separated string of numbers into a 1-D float NumPy array."""
    pieces = embedding_str.split(',')
    values = list(map(float, pieces))
    return np.array(values)

def find_closest_records(record, new_df, n=5):
    """Return the names of the ``n`` rows of ``new_df`` nearest to ``record``.

    Nearness is cosine distance (1 - cosine similarity) between the parsed
    ``book_embedding`` strings. ``record`` is compared against every row of
    ``new_df``, so if it is itself one of those rows it appears in the result.

    Args:
        record: Row or mapping with a ``book_embedding`` string.
        new_df: Frame with ``book_embedding`` and ``name`` columns.
        n: How many names to return.

    Returns:
        List of names ordered from closest to farthest; rows at equal distance
        keep their order in ``new_df``.
    """
    query = parse_embedding(record['book_embedding']).reshape(1, -1)
    total = new_df.shape[0]

    scored = []  # (cosine distance, name) pairs in frame order
    for label, candidate in new_df.iterrows():
        candidate_vec = parse_embedding(candidate['book_embedding']).reshape(1, -1)
        gap = 1 - cosine_similarity(query, candidate_vec)[0][0]
        scored.append((gap, candidate['name']))

        print(f'Progress: {label / total:.2%}', end='\r')

    # sorted() is stable, so equal distances stay in their original order.
    ranked = sorted(scored, key=lambda pair: pair[0])
    return [name for _, name in ranked[:n]]

In [None]:
# Embed every row of exp_df; this runs one model forward pass per row, so it is the slow step.
new_df = create_new_dataframe(exp_df)

In [None]:
# Preview the serialized embeddings next to their titles.
new_df.head()

In [None]:
# Shape of the embedding column, i.e. how many rows received an embedding.
new_df['book_embedding'].shape

In [None]:
# Back-of-envelope product expressed in millions.
# NOTE(review): presumably values-per-vector x number-of-rows; confirm what 2771 and 103063 stand for.
2771 * 103063 / 1000000

In [None]:
# Write the embeddings to disk (without the index) so they can be reloaded instead of recomputed.
new_df.to_csv('books_embeddings.csv', index=False)

In [23]:
# Reload the saved embeddings from disk.
load_df = pd.read_csv("books_embeddings.csv")

In [None]:
# Names of the 10 rows closest to the first row; the first row is itself among the candidates.
find_closest_records(load_df.iloc[0], load_df, n=10)