In [None]:
import pandas as pd
import numpy as np

In [None]:
# Load the cleaned books table; the path is relative to the current working directory
dataset = pd.read_csv("BooksDatasetClean.csv")

### Simple preprocessing of the dataset

- Fill in some missing values
- Split the category string into one indicator column per category
- Delete redundant columns

In [None]:
# Work on a copy so the frame loaded from disk is left untouched
preprocessed_dataset = dataset.copy()

# A missing description falls back to the category text first, then to the title;
# only afterwards are missing categories replaced by an empty string
preprocessed_dataset["Description"] = (
    preprocessed_dataset["Description"]
    .fillna(preprocessed_dataset["Category"])
    .fillna(preprocessed_dataset["Title"])
)
preprocessed_dataset["Category"] = preprocessed_dataset["Category"].fillna("")

# Turn the ' , '-separated category string into a list of trimmed labels
preprocessed_dataset["Category_list"] = (
    preprocessed_dataset["Category"]
    .str.split(" , ")
    .apply(lambda labels: [label.strip() for label in labels])
)

# One-hot encode the labels: one row per (book, label) pair, then sum the
# indicator rows back together per original book index
exploded = preprocessed_dataset.explode("Category_list")
dummies = pd.get_dummies(exploded["Category_list"])
category_indicators = dummies.groupby(exploded.index).sum()
preprocessed_dataset = preprocessed_dataset.join(category_indicators)

# Remove the columns that are not used from here on
redundant_columns = [
    "Authors",
    "Category",
    "Category_list",
    "Publisher",
    "Price Starting With ($)",
    "Publish Date (Month)",
    "Publish Date (Year)",
]
preprocessed_dataset = preprocessed_dataset.drop(columns=redundant_columns)

# Preview the first row of the result
preprocessed_dataset.head(1)

### Setup embeddings producer

- Import libraries
- Setup torch and BERT
- Create embeddings producer function

In [None]:
# importing libraries
import random
import torch
from transformers import BertTokenizer, BertModel

# Reproducibility: seed Python's RNG and torch (plus every CUDA device, if any)
random_seed = 42
random.seed(random_seed)
torch.manual_seed(random_seed)

cuda_available = torch.cuda.is_available()
if cuda_available:
    torch.cuda.manual_seed_all(random_seed)

# Prefer the GPU when one is present, otherwise fall back to the CPU
device = torch.device('cuda') if cuda_available else torch.device('cpu')

# Pretrained tokenizer and encoder from the same checkpoint; the model is moved onto `device`
bert_checkpoint = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(bert_checkpoint)
model = BertModel.from_pretrained(bert_checkpoint).to(device)


# Define function to get text embedding
def produce_text_embedding(text):
    """Return one mean-pooled BERT vector for ``text``.

    The text is tokenized as a batch of one (special tokens added, truncation
    enabled), passed through the module-level ``model`` without gradient
    tracking, and the final-layer token vectors are averaged over the token
    axis.

    Args:
        text: The text to embed.

    Returns:
        A CPU ``torch.Tensor`` with a leading batch dimension of 1
        (presumably shape ``(1, 768)`` for this checkpoint, since the caller
        reshapes the result to 768 values -- confirm if the model changes).
    """
    tokenizer_kwargs = {
        'padding': True,
        'truncation': True,
        'return_tensors': 'pt',
        'add_special_tokens': True,
    }
    batch = tokenizer.batch_encode_plus([text], **tokenizer_kwargs)

    # Inputs must live on the same device as the model
    token_ids = batch['input_ids'].to(device)
    mask = batch['attention_mask'].to(device)

    # Inference only: no autograd graph is needed
    with torch.no_grad():
        token_vectors = model(token_ids, attention_mask=mask).last_hidden_state

    # Average across tokens, then hand the result back on the CPU
    pooled = token_vectors.mean(dim=1)
    return pooled.cpu()


### Function to produce all embeddings
- Not tested yet
- Probably needs optimization
- TODO: save intermediate results
- TODO: allow starting from an arbitrary point in the dataset
- TODO: save the result in NumPy format

In [None]:
def create_new_dataframe(df):
    """Build one feature row per book: title, text embedding, numeric columns.

    Despite the name, the result is a NumPy array, not a DataFrame.

    Args:
        df: DataFrame with 'Title' and 'Description' columns. Every column
            with a numeric dtype is appended after the embedding.

    Returns:
        Object-dtype NumPy array with one entry per input row, each laid out
        as ``[title, *embedding, *numeric_values]``.
    """
    numerical_cols = df.select_dtypes(include=np.number).columns
    total_rows = df.shape[0]
    combined_data = []

    # Track progress by row position, not by index label: labels may be
    # non-numeric or non-contiguous (e.g. after filtering), which would either
    # raise on the division or report a misleading percentage.
    for position, (_, row) in enumerate(df.iterrows(), start=1):
        embedding = produce_text_embedding(row['Description'])
        # Flatten the (1, hidden_size) tensor without hard-coding the size
        vector = embedding.reshape(-1).numpy()

        numerical_values = row[numerical_cols].values
        name = row['Title']

        combined_vector = np.concatenate((vector, numerical_values))

        # Title first, then the embedding and numeric features
        combined_row = np.append(name, combined_vector)

        combined_data.append(combined_row)
        print(f'Progress: {position / total_rows:.2%}', end='\r')

    # dtype=object keeps the string title alongside the numeric values
    combined_array = np.array(combined_data, dtype=object)

    return combined_array

### Importing numpy vectors
- Loads a NumPy matrix, which must first have been saved from the output of the previous function
- Contains every book in the dataframe
- In each row, the first element is the book title; the remaining elements are the embedding followed by the numeric feature columns

In [None]:
# Load the precomputed embedding matrix from disk.
# NOTE: this rebinds `dataset`, which earlier in the notebook held the
# DataFrame read from the CSV.
# allow_pickle=True lets NumPy unpickle object arrays; unpickling can run
# arbitrary code, so only load files from a trusted source.
dataset = np.load("books_embeddings.npy", allow_pickle= True)

### Get most similar vectors in the dataset


In [None]:
def find_closest_records(record, df, n=5):
    """Return the names of the ``n`` rows of ``df`` most similar to ``record``.

    The first element of every row is treated as its name; all remaining
    elements are cast to float64 and compared by cosine distance
    (``1 - cosine_similarity``).

    Args:
        record: 1-D NumPy array laid out as ``[name, *features]``.
        df: 2-D NumPy array (despite the parameter name) whose rows share
            that layout.
        n: Maximum number of names to return. ``None`` returns every row;
            values <= 0 return an empty list.

    Returns:
        List of names ordered from closest to farthest. Rows at exactly the
        same distance keep their order in ``df``. If ``record`` is itself a
        row of ``df`` it is part of the result.
    """
    # A negative n would otherwise slice off rows from the far end
    if n is not None and n <= 0:
        return []

    query = torch.from_numpy(record[1:].astype(np.float64).reshape(1, -1))
    features = torch.from_numpy(df[:, 1:].astype(np.float64))
    distances = 1 - torch.cosine_similarity(query, features)

    # Convert to NumPy explicitly before sorting; a stable sort keeps tied
    # rows (e.g. duplicate entries) in their original order.
    closest = np.argsort(distances.numpy(), kind='stable')[:n]

    all_names = df[:, 0]
    return list(all_names[closest])

In [None]:
# Use the row at position 1000 as the query and list its 20 nearest rows
query_record = dataset[1000]
print(query_record)
find_closest_records(query_record, dataset, n=20)