In [None]:
import pandas as pd

In [None]:
# Read the two CSV files shipped in the Datasets/ folder.
# Judging by the file names, `df` is the unprocessed table and `df_clean` its
# cleaned version (not verified here). Of the two, only `df_clean` is used by
# the later cells shown in this notebook.
df = pd.read_csv("Datasets/BooksDataset.csv")
df_clean = pd.read_csv("Datasets/BooksDatasetClean.csv")

In [None]:
# Working copy of df_clean with the text columns made gap-free:
#   * a missing Description falls back to the row's Category, then to its Title
#     (the Category used here is the original one, before its own gaps are filled);
#   * a missing Category becomes the empty string.
# `assign` operates on a copy, so df_clean itself is left unchanged.
wdf = df_clean.assign(
    Description=lambda frame: (
        frame["Description"]
        .fillna(frame["Category"])
        .fillna(frame["Title"])
    ),
    Category=lambda frame: frame["Category"].fillna(""),
)

In [None]:
from sklearn.preprocessing import MultiLabelBinarizer

# Split each Category string on the ' , ' separator and trim surrounding
# whitespace from every resulting label, giving one list of labels per row.
# Note: an empty Category string yields the one-element list [''].
wdf['Category_list'] = (
    wdf['Category']
    .str.split(' , ')
    .apply(lambda labels: list(map(str.strip, labels)))
)


In [None]:

# One-hot encode the per-row label lists: one 0/1 column per distinct label,
# with the column names taken from the labels the binarizer discovered.
mlb = MultiLabelBinarizer()
encoded_categories = mlb.fit_transform(wdf['Category_list'])

# Reuse wdf's index for the indicator frame. pd.concat(axis=1) aligns rows on
# index labels, so a fresh RangeIndex would misalign the rows (and introduce
# NaNs) whenever wdf does not carry the default 0..n-1 index.
encoded_df = pd.DataFrame(encoded_categories, columns=mlb.classes_, index=wdf.index)

# Append the indicator columns to the right of the existing columns.
# NOTE: re-running this cell without re-running the cells that build wdf
# appends the same indicator columns a second time.
wdf = pd.concat([wdf, encoded_df], axis=1)


In [None]:
# importing libraries
import random
import torch
from transformers import BertTokenizer, BertModel
from sklearn.metrics.pairwise import cosine_similarity


# Single seed shared by Python's `random` module and PyTorch.
random_seed = 42

# Run on the GPU when CUDA is available, otherwise on the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

random.seed(random_seed)
torch.manual_seed(random_seed)
if device.type == 'cuda':
    # Seed the generators of all CUDA devices as well.
    torch.cuda.manual_seed_all(random_seed)

# Pretrained 'bert-base-uncased' tokenizer and encoder; the encoder is moved
# to the selected device.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')
model = model.to(device)


In [None]:
def product_text_embedding(text):
    """Embed text with BERT by mean-pooling the last hidden layer.

    Uses the notebook-level ``tokenizer``, ``model`` and ``device``.

    Parameters
    ----------
    text : str or sequence of str
        A single string, or several strings that are embedded together as
        one batch. Inputs longer than the tokenizer's maximum length are
        truncated (``truncation=True``).

    Returns
    -------
    torch.Tensor
        CPU tensor with one row per input text and one column per hidden
        unit of the model. A single string therefore yields exactly one row.
    """
    # A bare string is treated as a batch of one, which keeps the original
    # single-text call working unchanged.
    texts = [text] if isinstance(text, str) else list(text)

    # Calling the tokenizer directly is the supported replacement for the
    # deprecated `batch_encode_plus`.
    encoding = tokenizer(
        texts,
        padding=True,
        truncation=True,
        return_tensors='pt',
        add_special_tokens=True,
    )

    input_ids = encoding['input_ids'].to(device)
    attention_mask = encoding['attention_mask'].to(device)

    # Inference only: no gradients are needed.
    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)
        word_embeddings = outputs.last_hidden_state

    # Average over real tokens only. With a batch of one there is no padding
    # and this equals a plain mean over the token axis; with several texts the
    # padded positions of the shorter ones must not dilute their average.
    mask = attention_mask.unsqueeze(-1).to(word_embeddings.dtype)
    token_counts = mask.sum(dim=1).clamp(min=1)  # guard against division by zero
    sentence_embedding = (word_embeddings * mask).sum(dim=1) / token_counts

    return sentence_embedding.cpu()


# Smoke check: embed a short sample string and display the shape of the
# tensor that product_text_embedding returns.
text = "Check working"


product_text_embedding(text).shape

In [None]:
# Table used for the embedding step: wdf without the listed columns.
# `drop` returns a new DataFrame, so wdf itself is left untouched.
exp_df = wdf.drop(
    columns=[
        'Authors',
        'Category',
        'Category_list',
        'Publisher',
        'Price Starting With ($)',
        'Publish Date (Month)',
        'Publish Date (Year)',
    ]
)

exp_df.head()

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

import pandas as pd
import numpy as np

def create_new_dataframe(df):
    """Build one feature row per record of ``df``: title, text embedding, numeric columns.

    For every row the ``Description`` is embedded with
    ``product_text_embedding`` and flattened to 768 values; the row's numeric
    columns (as selected by ``select_dtypes(include=np.number)``) are appended
    to that vector, and the ``Title`` is stored in front of it.

    Progress is written to stdout on a single, carriage-return-refreshed line.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``Title`` and ``Description`` columns.

    Returns
    -------
    numpy.ndarray
        Object array of shape ``(len(df), 1 + 768 + n_numeric_columns)``.
        Column 0 holds the title, the remaining columns the feature vector.
        An empty ``df`` yields an array with zero rows and the same width.
    """
    embedding_dim = 768  # length each embedding is reshaped to
    numerical_cols = df.select_dtypes(include=np.number).columns
    length = df.shape[0]

    if length == 0:
        # Keep the result two-dimensional so callers can still slice [:, 1:].
        return np.empty((0, 1 + embedding_dim + len(numerical_cols)), dtype=object)

    combined_data = []
    # Count rows by position: the index label returned by iterrows() is not a
    # row counter when df carries a non-default index (filtered/sorted frames).
    for position, (_, row) in enumerate(df.iterrows()):
        vector = product_text_embedding(row['Description']).reshape(embedding_dim)
        numerical_values = row[numerical_cols].values
        combined_vector = np.concatenate((vector, numerical_values))

        # Fill an object array explicitly so the title can never force the
        # numeric values to be converted to strings.
        combined_row = np.empty(combined_vector.shape[0] + 1, dtype=object)
        combined_row[0] = row['Title']
        combined_row[1:] = combined_vector
        combined_data.append(combined_row)

        print(f'Progress: {position / length:.2%}', end='\r')

    # dtype=object because every row mixes a title with numbers.
    return np.array(combined_data, dtype=object)


def find_closest_records(record, new_df, n=5):
    """Return the names of the ``n`` rows of ``new_df`` closest to ``record``.

    Rows are compared one at a time using cosine distance
    (``1 - cosine_similarity``). In ``record`` and in every row of ``new_df``
    the first element is the name and the remaining elements form the vector.
    Progress is written to stdout on a single, carriage-return-refreshed line.

    Parameters
    ----------
    record : numpy.ndarray
        Query row (name followed by its vector).
    new_df : numpy.ndarray
        Two-dimensional array of candidate rows laid out like ``record``.
    n : int, default 5
        Number of names to return.

    Returns
    -------
    list
        Names ordered from smallest to largest distance; rows with equal
        distance keep their original relative order.
    """
    query = record[1:].reshape(1, -1)
    total = new_df.shape[0]

    # (distance, name) for every candidate, in row order.
    scored = []
    for position in range(total):
        candidate = new_df[position]
        features = np.array(candidate[1:], dtype=float).reshape(1, -1)
        gap = 1 - cosine_similarity(query, features).flatten()
        scored.append((gap, candidate[0]))

        print(f'Progress: {position / total:.2%}', end='\r')

    # sorted() is stable, so ties stay in row order.
    ranking = sorted(range(len(scored)), key=lambda k: scored[k][0])
    return [scored[k][1] for k in ranking[:n]]

In [None]:
def find_closest_records_broadcasting(record, df, n = 5):
    """Return the names of the ``n`` rows of ``df`` closest to ``record``.

    Vectorised counterpart of ``find_closest_records``: the cosine distance
    (``1 - cosine_similarity``) between the query and every row is computed
    in a single call instead of row by row.

    Parameters
    ----------
    record : array-like
        Query row; element 0 is the name, the rest is the feature vector.
    df : numpy.ndarray
        Two-dimensional array whose column 0 holds names and whose remaining
        columns hold the feature vectors.
    n : int, default 5
        Number of names to return.

    Returns
    -------
    list
        Names ordered from smallest to largest distance; rows with equal
        distance keep their original relative order.
    """
    # Cast the query to float just like the matrix below, so both operands
    # reach cosine_similarity with the same numeric dtype.
    record_vector = np.asarray(record[1:], dtype=float).reshape(1, -1)
    data_matrix = df[:, 1:].astype(float)
    all_names = df[:, 0]

    similarities = cosine_similarity(record_vector, data_matrix).flatten()
    distances = 1 - similarities

    # A stable sort keeps tied rows in row order, which matches the ordering
    # produced by the stable sorted() call in find_closest_records.
    closest = np.argsort(distances, kind='stable')[:n]
    return list(all_names[closest])

In [None]:
#numpy_df = create_new_dataframe(exp_df)

In [None]:
#np.save("books_embeddings", numpy_df)

In [None]:
# Load the cached title + feature array. If the cache file does not exist yet
# (for example on a fresh checkout), build it from exp_df and save it, so the
# notebook still runs top to bottom. Building it embeds every row and is slow.
# SECURITY: allow_pickle=True unpickles Python objects from the file, which can
# execute arbitrary code - only load a books_embeddings.npy you created yourself.
try:
    df_from_file = np.load("books_embeddings.npy", allow_pickle= True)
except FileNotFoundError:
    df_from_file = create_new_dataframe(exp_df)
    # np.save appends the ".npy" extension to the given name.
    np.save("books_embeddings", df_from_file)

In [None]:
# Vectorised search: names of the 20 rows closest (by cosine distance) to the
# row at position 9809. The query row is itself part of the searched array.
find_closest_records_broadcasting(df_from_file[9809], df_from_file, n=20)

In [None]:
# Make NumPy print arrays in full: no summarisation (threshold), no line
# wrapping (linewidth) and fixed-point instead of scientific notation
# (suppress). This is a global setting that affects every array printed
# afterwards.
np.set_printoptions(threshold=np.inf, linewidth=np.inf, suppress=True)

In [None]:
# Row-by-row search: names of the 20 rows closest (by cosine distance) to the
# row at position 0. The query row is itself part of the searched array.
find_closest_records(df_from_file[0], df_from_file, n=20)

In [None]:
# NOTE(review): this cell repeats the previous cell's call with identical
# arguments - confirm whether the duplicate is intentional.
find_closest_records(df_from_file[0], df_from_file, n=20)