In [1]:
import pandas as pd

In [2]:
# Load both CSV files; paths are relative to the notebook's working directory.
# `df` is only inspected below, while `df_clean` is the frame the feature
# pipeline is built from.
df = pd.read_csv("Datasets/BooksDataset.csv")
df_clean = pd.read_csv("Datasets/BooksDatasetClean.csv")

In [None]:
# Compare (rows, columns) of the two loaded frames side by side.
df.shape, df_clean.shape

In [None]:
# Compare the column labels of the two loaded frames.
df.columns, df_clean.columns

In [None]:
# Summary statistics for `df` (pandas describes numeric columns by default).
df.describe()

In [None]:
# Per-column dtype and non-null counts for `df`.
df.info()

In [None]:
# Frequency of each distinct Category string; values are counted as-is here
# (the split into individual categories only happens later on `wdf`).
df["Category"].value_counts()

In [8]:
# Working copy of the cleaned data with text gaps filled:
#   * Description falls back to Category, then to Title;
#   * Category falls back to the empty string (after Description has used it).
wdf = df_clean.assign(
    Description=lambda frame: (
        frame["Description"].fillna(frame["Category"]).fillna(frame["Title"])
    ),
    Category=lambda frame: frame["Category"].fillna(""),
)

In [9]:
from sklearn.preprocessing import MultiLabelBinarizer

# Split each Category string on the ' , ' separator and trim the whitespace
# left around every individual label.
wdf['Category_list'] = (
    wdf['Category']
    .str.split(' , ')
    .map(lambda labels: [label.strip() for label in labels])
)


In [None]:
# Preview the first rows of the working frame, including Category_list.
wdf.head()

In [11]:

# Multi-hot encode the per-book category lists: one 0/1 column per distinct
# label found by the binarizer.
mlb = MultiLabelBinarizer()
encoded_categories = mlb.fit_transform(wdf['Category_list'])

# Reuse wdf's index: with a default RangeIndex here, concat(axis=1) would align
# on labels and produce NaN-padded extra rows whenever wdf has been filtered,
# sorted or otherwise re-indexed before this cell runs.
encoded_df = pd.DataFrame(encoded_categories, columns=mlb.classes_, index=wdf.index)

# NOTE: re-running this cell without rebuilding wdf appends the label columns again.
wdf = pd.concat([wdf, encoded_df], axis=1)


In [None]:
# importing libraries
import random
import torch
from transformers import BertTokenizer, BertModel
from sklearn.metrics.pairwise import cosine_similarity


# Reproducibility: one shared seed for Python's `random` module and for
# PyTorch (the CUDA generators are seeded only when a GPU is available).
random_seed = 42
random.seed(random_seed)
torch.manual_seed(random_seed)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(random_seed)

# Pretrained uncased BERT: the tokenizer and the encoder must come from the
# same checkpoint so token ids match the model's vocabulary.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')


In [None]:


def product_text_embedding(text):
    """Embed one piece of text with the module-level BERT `tokenizer`/`model`.

    The text is encoded as a batch of one (special tokens added, truncation
    enabled), run through the model without gradient tracking, and the final
    hidden states are averaged over the token dimension.

    Args:
        text: The string to embed.

    Returns:
        A torch tensor holding the mean of `last_hidden_state` over dim 1.
        NOTE(review): presumably shape (1, 768) for bert-base-uncased, since
        callers reshape the result to 768 -- confirm if the checkpoint changes.
    """
    tokens = tokenizer.batch_encode_plus(
        [text],
        add_special_tokens=True,
        truncation=True,
        padding=True,
        return_tensors='pt',
    )

    # Inference only: no autograd graph is needed for the forward pass.
    with torch.no_grad():
        token_states = model(
            tokens['input_ids'],
            attention_mask=tokens['attention_mask'],
        ).last_hidden_state

    # Mean-pool across tokens to get a single vector for the whole text.
    return token_states.mean(dim=1)


# Smoke test: embed an arbitrary string and display the resulting tensor shape.
text = "peepeepoopoo"

product_text_embedding(text).shape

In [None]:
# Experiment frame: wdf without the metadata columns that are not fed into the
# embedding step (drop returns a new frame, so wdf itself is left untouched).
exp_df = wdf.drop(
    columns=[
        'Authors',
        'Category',
        'Category_list',
        'Publisher',
        'Price Starting With ($)',
        'Publish Date (Month)',
        'Publish Date (Year)',
    ]
)

exp_df.head()

In [15]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

def create_new_dataframe(df):
  """Build one combined feature vector per row of `df`.

  For every row, the text embedding of its 'Description' (obtained from
  `product_text_embedding`) is flattened and concatenated with the row's
  numeric column values.

  Args:
    df: DataFrame with 'Description' and 'Title' columns; every numeric
      column is appended to the embedding as an extra feature.

  Returns:
    DataFrame with two columns: 'book_embedding' (1-D float numpy array per
    row) and 'name' (the row's 'Title'), in the same row order as `df`.
  """
  numerical_cols = df.select_dtypes(include=np.number).columns
  # Pull the numeric features out once as a float matrix; going through
  # iterrows() would hand back object-dtype rows because of the text columns.
  numeric_matrix = df[numerical_cols].to_numpy(dtype=float)

  length = df.shape[0]
  embeddings = []
  names = []
  for position, (description, title) in enumerate(zip(df['Description'], df['Title'])):
    vector = product_text_embedding(description)
    # Flatten to 1-D without hard-coding the encoder's hidden size.
    vector = np.asarray(vector.detach().cpu(), dtype=float).reshape(-1)

    embeddings.append(np.concatenate((vector, numeric_matrix[position])))
    names.append(title)

    # Progress is based on row position so it also works for non-default indexes.
    print(position / length, end='\r')

  # Build the result once; concatenating inside the loop is quadratic in rows.
  return pd.DataFrame({'book_embedding': embeddings, 'name': names})

def find_closest_record(record, new_df, exclude_self=True):
  """Return the name of the row in `new_df` closest to `record`.

  Closeness is cosine distance (1 - cosine similarity) between
  'book_embedding' vectors. Ties resolve to the earliest row.

  Args:
    record: Mapping/Series with a 'book_embedding' vector, e.g. a row taken
      from `new_df` with `.iloc[...]`.
    new_df: DataFrame with 'book_embedding' and 'name' columns, as produced
      by `create_new_dataframe`.
    exclude_self: When True (default), the row of `new_df` that has the same
      index label as `record` and an identical vector is skipped, so a record
      taken from `new_df` does not simply match itself at distance 0.

  Returns:
    The 'name' value of the closest row, or None when `new_df` is empty or
    no candidate remains.
  """
  if len(new_df) == 0:
    return None

  query = np.asarray(record['book_embedding'], dtype=float).reshape(1, -1)
  candidates = np.vstack(
      [np.asarray(vec, dtype=float).reshape(-1) for vec in new_df['book_embedding']]
  )

  # One vectorised similarity call instead of one call per row.
  distances = 1 - np.asarray(cosine_similarity(query, candidates), dtype=float)[0]
  # Undefined distances must never win the argmin below.
  distances = np.where(np.isnan(distances), np.inf, distances)

  if exclude_self:
    self_label = getattr(record, 'name', None)
    same_label = np.array([label == self_label for label in new_df.index], dtype=bool)
    same_vector = (candidates == query).all(axis=1)
    distances = np.where(same_label & same_vector, np.inf, distances)

  best = int(np.argmin(distances))
  if not np.isfinite(distances[best]):
    return None
  # The frame stores titles under 'name'; it has no 'Title' column.
  return new_df['name'].iloc[best]


# Example Usage:
# Assuming you have a DataFrame called 'df' with 'Title' and 'Description' columns
# new_df = create_new_dataframe(df)

# Example record from the new DataFrame
# record = new_df.iloc[0]

# Find the closest record
# closest_record_name = find_closest_record(record, new_df)
# print(f"The closest record to '{record['name']}' is '{closest_record_name}'.") 


In [None]:
# Embed every row of exp_df; this runs the BERT forward pass once per row, so
# expect it to be slow on a large frame.
new_df = create_new_dataframe(exp_df)

In [None]:
# Query the nearest-neighbour lookup with the first embedded row.
find_closest_record(new_df.iloc[0], new_df)

In [None]:
# (rows, columns) of the experiment frame that was embedded above.
exp_df.shape