In [1]:
import sys
import os
import pandas as pd

# The notebook lives one level below the repository root; expose that root on
# sys.path (once) so project-local packages can be imported from here.
notebook_dir = os.getcwd()
PROJECT_ROOT = os.path.abspath(os.path.join(notebook_dir, ".."))
if not any(entry == PROJECT_ROOT for entry in sys.path):
    sys.path.insert(0, PROJECT_ROOT)

### Load datasets

In [2]:
# Both raw CSV files live in the same directory under the project root.
DATA_PATH = os.path.join(PROJECT_ROOT, "data", "raw_data", "kaggle_second_sem")
PATH_BOOKS, PATH_RATINGS = (
    os.path.join(DATA_PATH, csv_name)
    for csv_name in ("books_data.csv", "books_rating.csv")
)

# Read book metadata first, then the ratings table.
df_books, df_ratings = (pd.read_csv(csv_path) for csv_path in (PATH_BOOKS, PATH_RATINGS))

# Cell output: (rows, columns) of each frame as a quick sanity check.
df_books.shape, df_ratings.shape

((212404, 10), (3000000, 10))

### Produce simple embeddings
Each book embedding consists of:
- book_id 
- author label encoding 
- genre label encoding
- embedding of the description text

In [23]:
# importing libraries
import random
import torch
from transformers import BertTokenizer, BertModel

# Seed Python's and torch's RNGs, report which backend is in use, and
# select the matching torch device.
random_seed = 42
random.seed(random_seed)
torch.manual_seed(random_seed)

cuda_ready = torch.cuda.is_available()
print("gpu" if cuda_ready else "cpu")
if cuda_ready:
    # Seed every visible CUDA device as well.
    torch.cuda.manual_seed_all(random_seed)
device = torch.device('cuda' if cuda_ready else 'cpu')

# Fetch the pretrained BERT tokenizer and encoder, then move the encoder
# onto the selected device.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')
model = model.to(device)


# Text -> fixed-size vector via mask-aware mean pooling of BERT token states
def produce_text_embedding(text):
    """Embed text by mean-pooling the encoder's last hidden layer.

    Uses the module-level ``tokenizer``, ``model`` and ``device`` that are
    bound at call time.

    Parameters
    ----------
    text : str or list/tuple of str
        A single text, or a batch of texts to embed in one forward pass.
        Any value that is not a list/tuple is treated as one text.

    Returns
    -------
    torch.Tensor
        One pooled vector per input text (a single row for a single string),
        located on ``device``.
    """
    texts = list(text) if isinstance(text, (list, tuple)) else [text]

    encoding = tokenizer(
        texts,
        padding=True,
        truncation=True,
        return_tensors='pt',
        add_special_tokens=True,
    )

    input_ids = encoding['input_ids'].to(device)
    attention_mask = encoding['attention_mask'].to(device)

    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)
        word_embeddings = outputs.last_hidden_state

    # Average only over real tokens: padded positions (mask == 0) must not
    # dilute the mean when several texts of different length are batched.
    # For a single text the mask is all ones, so this equals a plain mean.
    mask = attention_mask.unsqueeze(-1).to(word_embeddings.dtype)
    token_sum = (word_embeddings * mask).sum(dim=1)
    token_count = mask.sum(dim=1).clamp(min=1)

    # The result already lives on `device`; no extra transfer is needed.
    return token_sum / token_count

cpu


In [None]:
import pandas as pd
import numpy as np
from transformers import AutoTokenizer, AutoModel
import torch
from sklearn.preprocessing import LabelEncoder

# Select the compute device: GPU when CUDA is available, otherwise CPU.
use_cuda = torch.cuda.is_available()
device = torch.device("cuda" if use_cuda else "cpu")
print(f"Using device: {device}")

# Numeric book id is simply the row index of the books table.
df_books["Id"] = df_books.index
# Missing descriptions get a fixed placeholder text so every row can be embedded.
df_books["Description"] = df_books["description"].fillna("No description")

# Integer-encode authors and categories. The same encoder object is refit for
# each column, so afterwards it only holds the mapping of the last column.
labelEncoder = LabelEncoder()
for source_col, target_col in (("authors", "Author_id"), ("categories", "Genre_id")):
    df_books[target_col] = labelEncoder.fit_transform(df_books[source_col])

# Rebind the module-level tokenizer/model to the Auto* variants of the same
# checkpoint; functions that read these globals will use them from now on.
model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)
model = model.to(device)

def get_book_embedding(row):
    """Build the flat feature vector for a single book row.

    The vector is the concatenation of ``Id``, ``Author_id``, ``Genre_id`` and
    the text embedding of ``Description`` (in that order). The book id is
    printed whenever it is a multiple of 1000, as a coarse progress indicator.

    Parameters
    ----------
    row : mapping-like
        Must provide the keys ``Description``, ``Author_id``, ``Genre_id``
        and ``Id``.

    Returns
    -------
    numpy.ndarray
        One-dimensional array: three id features followed by the embedding.
    """
    book_id = row['Id']
    if book_id % 1000 == 0:
        print(book_id)

    # Drop the leading batch axis and bring the tensor to host memory so
    # numpy can consume it.
    description_vector = produce_text_embedding(row['Description']).squeeze(0).cpu()

    id_features = ([book_id], [row['Author_id']], [row['Genre_id']])
    return np.concatenate([*id_features, description_vector])

# Compute one feature vector per book (row-wise; one model call per row).
book_vectors = df_books.apply(get_book_embedding, axis=1)
df_books['Embedding'] = book_vectors

# Stack the per-book vectors into a 2-D matrix and persist it to disk.
embeddings_matrix = np.vstack(df_books['Embedding'].to_numpy())
np.save("/kaggle/working/primitive_embeddings.npy", embeddings_matrix)

# Show the matrix shape, then its contents.
print(embeddings_matrix.shape, embeddings_matrix, sep="\n")


Using device: cpu


### Generate users' rating sequences

In [4]:
# Collect, per user, the list of titles they rated.
user_id_agg = (
    df_ratings
    .groupby(["User_id"])["Title"]
    .agg(list)
    .reset_index()
    .rename(columns={"Title": "Titles"})
)

# Keep only users with more than 8 rated titles.
titles_per_user = user_id_agg["Titles"].map(len)
user_id_agg_cut = user_id_agg[titles_per_user > 8]

# Cell output: sizes before and after aggregation and filtering.
df_ratings.shape, user_id_agg["User_id"].nunique(), user_id_agg.shape, user_id_agg_cut.shape

((3000000, 10), 1008972, (1008972, 2), (36677, 2))

In [7]:
# Work on a separate frame so the per-user aggregation stays untouched.
# (A deep copy duplicates the frame's data; per the pandas docs, Python
# objects stored in it, such as the title lists, are not copied recursively.)
user_id_agg_copy = user_id_agg.copy(deep=True)
user_id_agg_copy

Unnamed: 0,User_id,Titles
0,A00109803PZJ91RLT7DPN,[This Calder Range (Calder Saga's)]
1,A00117421L76WVWG4UX95,[The Queen of Harlem: A Novel]
2,A0015610VMNR0JC9XVL1,"[The Richest Man in Babylon, The richest man i..."
3,A002258237PFYJV336T05,[Swan Place]
4,A00264602WCXBHHFPLTQ4,[The Berenstain Bears and Too Much Vacation]
...,...,...
1008967,AZZZELE3I0CKD,[The New Complete Portuguese Water Dog (Howell...
1008968,AZZZJY3RMN57G,[River Thieves: A Novel]
1008969,AZZZT14MS21I6,[One More Bridge to Cross: Lowering the Cost o...
1008970,AZZZYCR4NZADZ,"[Out, Out]"


In [26]:
# Lookup table: book title -> numeric book id (for repeated titles the last
# occurrence wins).
title_to_id = {
    title: book_id
    for title, book_id in zip(df_books["Title"], df_books["Id"])
}


def _encode_titles(seq):
    """Replace each title by its book id; titles without a match are kept as-is."""
    return [title_to_id.get(title, title) for title in seq]


user_id_agg_copy["Titles"] = user_id_agg_copy["Titles"].apply(_encode_titles)
