In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the raw book table from the CSV in the working directory
dataset = pd.read_csv(filepath_or_buffer="output.csv")
# Preview the first rows (at most 30)
dataset.head(n=30)

Unnamed: 0,Title,Authors,Description,Category,Publisher,Publish
0,Classical Mythology,"Mark P. O. Morford, Robert J. Lenardon",Provides an introduction to classical myths pl...,Social Science,"Oxford University Press, USA",2003
1,Decision in Normandy,Carlo D'Este,"Here, for the first time in paperback, is an o...",History,Harper Perennial,1991
2,Flu,Gina Bari Kolata,"""Scientists have recently discovered shards of...",Medical,Macmillan,1999
3,Under the Black Flag,David Cordingly,"For this rousing, revisionist history, the for...",Pirates,Random House (NY),1995
4,The Middle Stories,Sheila Heti,"Part Dorothy Parker, part Jose Saramago, with ...",Fiction,House of Anansi,2001
5,More Cunning Than Man,Robert Hendrickson,"This eye-opening, well-researched examination ...",Medical,Zebra Books,1999
6,The Testament,John Grisham,Heart of darkness... In a plush Virginia offic...,Adventure stories,Island,1999
7,Beloved,Toni Morrison,WINNER OF THE NOBEL PRIZE IN LITERATURE.,Fiction,Plume Books,1988
8,Our Dumb Century,Scott Dikkers,The Onion has quickly become the world's most ...,Humor,Crown,1999
9,New Vegetarian,Celia Brooks Brown,In New Vegetarian Celia Brooks Brown presents ...,International cooking,,2001


### Simple preprocessing of the dataset

- Fill in missing `Category` values
- One-hot encode `Category`: one indicator column per distinct category value
- Delete redundant columns

In [3]:
# Work on a derived frame so the raw `dataset` stays untouched:
# blank out missing categories, then drop the columns not needed downstream.
preprocessed_dataset = (
    dataset
    .assign(Category=dataset['Category'].fillna(''))
    .drop(columns=['Authors', 'Publisher', 'Publish'])
)

# One indicator column per distinct Category string.
# Note: the whole string is treated as one category; it is not split into parts.
dummies = pd.get_dummies(preprocessed_dataset['Category'])

# Attach the indicator columns, then remove the original text column
preprocessed_dataset = preprocessed_dataset.join(dummies).drop(columns=['Category'])

# Show new dataset
preprocessed_dataset.head(30)

Unnamed: 0,Title,Description,Unnamed: 3,'Allo 'allo (Television program),.,1900-45,1944,1990-1999,3-D Illusions-Art,364614153,...,"Yoga, Kriya",Yorkshire (England),Young Adult Fiction,Young Adult Nonfiction,Young adult fiction,Young women,Zen Buddhism,nemška književnost - romani,poems,Émotions
0,Classical Mythology,Provides an introduction to classical myths pl...,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,Decision in Normandy,"Here, for the first time in paperback, is an o...",False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,Flu,"""Scientists have recently discovered shards of...",False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,Under the Black Flag,"For this rousing, revisionist history, the for...",False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,The Middle Stories,"Part Dorothy Parker, part Jose Saramago, with ...",False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
5,More Cunning Than Man,"This eye-opening, well-researched examination ...",False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
6,The Testament,Heart of darkness... In a plush Virginia offic...,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
7,Beloved,WINNER OF THE NOBEL PRIZE IN LITERATURE.,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
8,Our Dumb Century,The Onion has quickly become the world's most ...,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
9,New Vegetarian,In New Vegetarian Celia Brooks Brown presents ...,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


### Set up the embeddings producer

- Import libraries
- Set up torch and BERT
- Create the embedding-producing function

In [4]:
# importing libraries
import random
import torch
from transformers import BertTokenizer, BertModel

# Seed the Python and torch RNGs so repeated runs behave the same
random_seed = 42
random.seed(random_seed)
torch.manual_seed(random_seed)

# Choose the compute device once, report it, and seed CUDA when it is used
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
if device.type == 'cuda':
    print("gpu")
    torch.cuda.manual_seed_all(random_seed)
else:
    print("cpu")

# Load the pretrained BERT tokenizer and encoder, and move the encoder to the device
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')
model = model.to(device)


# Define function to get text embedding
def produce_text_embedding(text):
    """Embed a single text as the mean of BERT's last-layer token vectors.

    The text is tokenized as a one-element batch (special tokens added,
    truncation enabled), run through the module-level ``model`` on
    ``device`` without gradient tracking, and the per-token hidden states
    are averaged over the token axis.

    Parameters
    ----------
    text : str
        The text to embed.

    Returns
    -------
    torch.Tensor
        The averaged hidden state, moved to the CPU. It keeps the leading
        batch axis of size one.
    """
    tokenizer_options = {
        'padding': True,
        'truncation': True,
        'return_tensors': 'pt',
        'add_special_tokens': True,
    }
    batch = tokenizer.batch_encode_plus([text], **tokenizer_options)

    token_ids = batch['input_ids'].to(device)
    mask = batch['attention_mask'].to(device)

    # Inference only: no autograd graph is needed
    with torch.no_grad():
        model_output = model(token_ids, attention_mask=mask)
        token_vectors = model_output.last_hidden_state

    # Average over the token axis (dim 1) to get one vector per input text
    pooled = token_vectors.mean(dim=1)
    return pooled.cpu()


cpu


### Function to produce all embeddings
- Never tested
- Probably needs optimization
- Must save intermediate results
- Must be able to start from an arbitrary point
- Must provide saving in NumPy format

In [5]:
def create_new_dataframe(df, embed_fn=None):
    """Build one combined feature row per book: title, text embedding, numeric columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'Title' and 'Description' columns. Every column with a
        numeric dtype is appended after the embedding, in column order.
    embed_fn : callable, optional
        Maps a description string to its embedding (an array-like or CPU
        tensor; it is flattened to 1-D). Defaults to the notebook's
        ``produce_text_embedding``.

    Returns
    -------
    numpy.ndarray
        Object-dtype array with one row per row of ``df``, laid out as
        ``[title, *embedding, *numeric values]``.

    Notes
    -----
    A missing (NaN/None) description is embedded as the empty string rather
    than raising inside the tokenizer.
    """
    if embed_fn is None:
        # Resolved lazily so the default embedder is only needed when it is used
        embed_fn = produce_text_embedding

    numerical_cols = df.select_dtypes(include=np.number).columns
    length = df.shape[0]
    combined_data = []

    # Count rows positionally: the index label may be non-numeric or non-contiguous,
    # so it cannot be used to compute progress.
    for position, (_, row) in enumerate(df.iterrows(), start=1):
        text = row['Description']
        if not isinstance(text, str):
            # Missing descriptions arrive as NaN/None; the embedder expects a string
            text = '' if pd.isna(text) else str(text)

        # Flatten whatever size the embedder returns instead of assuming 768
        vector = np.asarray(embed_fn(text)).reshape(-1)
        numerical_values = row[numerical_cols].values

        # Fill an object row explicitly so the title is never coerced to the
        # numeric dtype, nor the numbers to strings
        combined_row = np.empty(1 + vector.size + numerical_values.size, dtype=object)
        combined_row[0] = row['Title']
        combined_row[1:1 + vector.size] = vector
        combined_row[1 + vector.size:] = numerical_values

        combined_data.append(combined_row)
        print(f'Progress: {position / length:.2%}', end='\r')

    return np.array(combined_data, dtype=object)

### Importing numpy vectors
- Loads the NumPy matrix, which must first have been saved from the output of the previous function
- Contains every book in the dataframe
- In each row, the first element is the book name; the remaining elements are the embedding followed by the numeric columns

In [6]:
# Embed every book (slow: one BERT forward pass per row) and persist the result.
# The array has object dtype, so np.save stores it via pickle.
new_df = create_new_dataframe(df=dataset)
np.save(file='books_embeddings_new_dataset.npy', arr=new_df)

Progress: 100.00%

In [7]:
# Reload the saved object array. allow_pickle=True unpickles the file's contents,
# so only load files produced by this notebook, never untrusted ones.
# NOTE(review): this rebinds `dataset` from a DataFrame to an ndarray; the cells
# above will not work again until the CSV is reloaded.
dataset = np.load(file="books_embeddings_new_dataset.npy", allow_pickle=True)

### Get most similar vectors in the dataset


In [8]:
def find_closest_records(record, df, n=5):
    """Return the names of the ``n`` rows of ``df`` nearest to ``record``.

    Nearness is cosine distance (1 - cosine similarity) over every element
    after the first one.

    Parameters
    ----------
    record : numpy.ndarray
        One row: element 0 is the name, the rest are numeric features.
    df : numpy.ndarray
        2-D array of such rows (despite the parameter name, not a DataFrame).
    n : int, optional
        How many names to return. Defaults to 5.

    Returns
    -------
    list
        Names ordered from smallest to largest distance. ``record`` is not
        excluded, so if it is a row of ``df`` it is normally the first hit.
    """
    titles = df[:, 0]

    # Column 0 is the name; everything after it is the feature vector
    features = torch.from_numpy(df[:, 1:].astype(np.float64))
    query = torch.from_numpy(record[1:].astype(np.float64).reshape(1, -1))

    # The single query row broadcasts against every row of the feature matrix
    cosine_distance = 1 - torch.cosine_similarity(query, features)

    nearest = np.argsort(cosine_distance)[:n]
    return list(titles[nearest])

In [13]:
# The 20 nearest neighbours of the book stored at row 1085 (the row itself included)
find_closest_records(record=dataset[1085], df=dataset, n=20)

['Dune',
 'Immortalis',
 'Fortress Draconis',
 "Conquerors' Heritage",
 'Krondor: The Assassins',
 'Hammerfall',
 'Guardians of the Lost',
 'Hammerfall',
 'The Risen Empire',
 "Destiny's Way: Star Wars Legends",
 'Cloak of Deception: Star Wars Legends',
 'The Fifth Sorceress',
 'Lord of Snow and Shadows',
 'Dune: The Machine Crusade',
 'Shards of a Broken Crown',
 'Drowning World',
 'The Scar',
 'Metaplanetary',
 "Lord Valentine's Castle",
 'The Lion of Senet']