In [1]:
import sys
import os
import pandas as pd

# The notebook runs from a sub-directory of the repository, so the project
# root is the parent of the current working directory.
_cwd_parent = os.path.join(os.getcwd(), "..")
PROJECT_ROOT = os.path.abspath(_cwd_parent)

# Prepend the root only when it is absent, so re-running this cell does not
# stack duplicate entries on sys.path.
if sys.path.count(PROJECT_ROOT) == 0:
    sys.path.insert(0, PROJECT_ROOT)

In [6]:
# Directory holding the raw Kaggle CSV files, resolved relative to PROJECT_ROOT.
DATA_PATH = os.path.join(PROJECT_ROOT, "data", "raw_data", "kaggle_second_sem")

# Full paths of the two source tables: book metadata and per-user ratings.
PATH_BOOKS, PATH_RATINGS = (
    os.path.join(DATA_PATH, csv_name)
    for csv_name in ("books_data.csv", "books_rating.csv")
)

# Load both tables with pandas' default CSV settings.
df_books, df_ratings = (pd.read_csv(csv_path) for csv_path in (PATH_BOOKS, PATH_RATINGS))

In [11]:
import torch
device = torch.accelerator.current_accelerator().type if torch.accelerator.is_available() else "cpu"
print(f"Using {device} device")

Using cpu device


In [18]:
# Shapes of both tables on one line, then the column labels of each table.
print(df_books.shape, df_ratings.shape)
print(df_books.columns, df_ratings.columns, sep="\n")

# Share of distinct reviewer ids among all rating rows
# (a missing id is counted as one distinct value).
df_ratings["User_id"].nunique(dropna=False) / len(df_ratings)

(212404, 10) (3000000, 10)
Index(['Title', 'description', 'authors', 'image', 'previewLink', 'publisher',
       'publishedDate', 'infoLink', 'categories', 'ratingsCount'],
      dtype='object')
Index(['Id', 'Title', 'Price', 'User_id', 'profileName', 'review/helpfulness',
       'review/score', 'review/time', 'review/summary', 'review/text'],
      dtype='object')


0.33632433333333334

In [22]:
# Keep only active reviewers: users with strictly more than this many ratings.
MIN_RATINGS_PER_USER = 50

# Number of rating rows per user id.
user_counts = df_ratings.groupby('User_id').size()
filtered_user_ids = user_counts[user_counts > MIN_RATINGS_PER_USER].index
filtered_df = df_ratings[df_ratings['User_id'].isin(filtered_user_ids)]

# Only the last expression of a cell is rendered, so the number of remaining
# users is printed explicitly instead of being left as a bare expression.
print(f"Users with more than {MIN_RATINGS_PER_USER} ratings: {filtered_df['User_id'].nunique()}")

filtered_df.head()

Unnamed: 0,Id,Title,Price,User_id,profileName,review/helpfulness,review/score,review/time,review/summary,review/text
1,826414346,Dr. Seuss: American Icon,,A30TK6U7DNS82R,Kevin Killian,10/10,5.0,1095724800,Really Enjoyed It,I don't care much for Dr. Seuss but after read...
6,826414346,Dr. Seuss: American Icon,,A14OJS0VWMOSWO,Midwest Book Review,3/4,5.0,1100131200,A memorably excellent survey of Dr. Seuss' man...,Theodor Seuss Giesel was best known as 'Dr. Se...
59,963923080,Rising Sons and Daughters: Life Among Japan's ...,,A3NIQK6ZLYEP1L,Michael Valdivielso,0/0,4.0,1239667200,Almost a day by day view,Steven Wardell went to Japan without reading a...
61,854968350,Muslim Women's Choices: Religious Belief and S...,,ATDE9JYCPI0L1,Alyssa A. Lappen,0/0,2.0,1109808000,Oh dear,I was excited to find a book ostensibly about ...
140,789480662,Eyewitness Travel Guide to Europe,,A281NPSIMI1C2R,"Rebecca of Amazon ""The Rebecca Review""",19/19,5.0,1023235200,The Major Sights in Twenty Countries - Amazing!,"Yes, this is one heavy book. It is more than l..."


In [13]:
# Count missing values in every column of the book metadata table.
df_books.isna().sum(axis="index")

Title                 1
description       68442
authors           31413
image             52075
previewLink       23836
publisher         75886
publishedDate     25305
infoLink          23836
categories        41199
ratingsCount     162652
dtype: int64

In [18]:
# Books with a known ratingsCount, ordered from most to least rated (top rows only).
df_books[df_books["ratingsCount"].notna()].sort_values("ratingsCount", ascending=False).head()

Unnamed: 0,Title,description,authors,image,previewLink,publisher,publishedDate,infoLink,categories,ratingsCount
26834,The Alchemist,"""My heart is afraid that it will have to suffe...",['Paulo Coelho'],http://books.google.com/books/content?id=pTr44...,http://books.google.com/books?id=pTr44Sx6oWQC&...,Harper Collins,2006-04-25,http://books.google.com/books?id=pTr44Sx6oWQC&...,['Fiction'],4895.0
76345,Unbroken,#1 NEW YORK TIMES BESTSELLER • NOW A MAJOR MOT...,['Laura Hillenbrand'],http://books.google.com/books/content?id=1PeLD...,http://books.google.com/books?id=1PeLDQAAQBAJ&...,Random House Trade Paperbacks,2014-07-29,http://books.google.com/books?id=1PeLDQAAQBAJ&...,['Biography & Autobiography'],4572.0
62005,A Mission to Millions; The Story of Ernie Alle...,#1 NEW YORK TIMES BESTSELLER • NOW A MAJOR MOT...,['Laura Hillenbrand'],http://books.google.com/books/content?id=1PeLD...,http://books.google.com/books?id=1PeLDQAAQBAJ&...,Random House Trade Paperbacks,2014-07-29,http://books.google.com/books?id=1PeLDQAAQBAJ&...,['Biography & Autobiography'],4572.0
128081,Blue Like Jazz: Nonreligious Thoughts on Chris...,A popular minister recounts his zealous early ...,['Donald Miller'],http://books.google.com/books/content?id=fNC3f...,http://books.google.nl/books?id=fNC3fdxYgZMC&p...,Thomas Nelson Inc,2012-04,http://books.google.nl/books?id=fNC3fdxYgZMC&d...,['Biography & Autobiography'],4562.0
57822,Eclipse,As the love triangle heats up in the third boo...,['Stephenie Meyer'],http://books.google.com/books/content?id=lw99O...,http://books.google.com/books?id=lw99Oii9R90C&...,"Little, Brown Books for Young Readers",2007-08-07,https://play.google.com/store/books/details?id...,['Young Adult Fiction'],4392.0


In [26]:
# Look up the metadata rows whose title is exactly "War and Peace".
df_books.loc[df_books["Title"].eq("War and Peace")]

Unnamed: 0,Title,description,authors,image,previewLink,publisher,publishedDate,infoLink,categories,ratingsCount
77196,War and Peace,Covering the period from the French invasion u...,['Leo Tolstoy'],http://books.google.com/books/content?id=W5_iD...,http://books.google.com/books?id=W5_iDQAAQBAJ&...,Lulu.com,2016-12-23,http://books.google.com/books?id=W5_iDQAAQBAJ&...,['Fiction'],


In [12]:
import torch
import torch.nn as nn
from transformers import AutoModel, AutoTokenizer

class BookEmbedding(nn.Module):
    """Fuse text, author, category, rating and position features into one book vector.

    Every feature is mapped to ``embed_dim`` and the results are summed
    element-wise. The description embedding has shape ``(batch, embed_dim)``,
    so the other inputs must produce tensors that broadcast against it.

    Args:
        vocab_size: Accepted for interface compatibility; not used by this module.
        author_size: Number of rows in the author embedding table.
        category_size: Number of rows in the category embedding table.
        embed_dim: Width of every projected feature and of the returned embedding.
        max_len: Number of rows in the position embedding table.
        text_model_name: Hugging Face checkpoint used to encode descriptions.
    """

    def __init__(self, vocab_size, author_size, category_size, embed_dim, max_len,
                 text_model_name="sentence-transformers/all-MiniLM-L6-v2"):
        super().__init__()

        # Text encoder: a pretrained transformer plus its matching tokenizer.
        self.text_model = AutoModel.from_pretrained(text_model_name)
        self.tokenizer = AutoTokenizer.from_pretrained(text_model_name)
        # Read the encoder width from its config instead of hard-coding it,
        # so a different checkpoint can be plugged in without editing this layer.
        self.text_proj = nn.Linear(self.text_model.config.hidden_size, embed_dim)

        # Learned lookup tables for author and category ids.
        self.author_embedding = nn.Embedding(author_size, embed_dim)
        self.category_embedding = nn.Embedding(category_size, embed_dim)

        # Rating features: two numbers per book ([mean rating, number of ratings]).
        # No normalisation happens inside this module; scale the inputs beforehand.
        self.rating_proj = nn.Linear(2, embed_dim)

        # Learned positional embeddings.
        self.position_embedding = nn.Embedding(max_len, embed_dim)

    def _encode_descriptions(self, descriptions, device):
        """Tokenize ``descriptions`` and mean-pool the encoder's token states.

        Padding tokens are excluded from the average through the attention
        mask. Returns a tensor of shape ``(batch, encoder_hidden_size)``.
        """
        if isinstance(descriptions, str):
            descriptions = [descriptions]
        # Missing descriptions (None / NaN) are not valid tokenizer input;
        # encode them as empty strings instead of crashing.
        texts = [text if isinstance(text, str) else "" for text in descriptions]

        tokens = self.tokenizer(texts, padding=True, truncation=True, return_tensors="pt")
        # The tokenizer always builds CPU tensors; move them next to the weights.
        tokens = {name: tensor.to(device) for name, tensor in tokens.items()}

        hidden = self.text_model(**tokens).last_hidden_state
        mask = tokens["attention_mask"].unsqueeze(-1).to(hidden.dtype)
        # clamp guards against division by zero for an all-padding row.
        return (hidden * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1e-9)

    def forward(self, book_ids, authors, categories, descriptions, ratings, positions):
        """Return the summed embedding for a batch of books.

        Args:
            book_ids: Accepted for interface compatibility; not used.
            authors: Integer tensor of author indices.
            categories: Integer tensor of category indices.
            descriptions: A string or a sequence of strings; non-string
                entries are treated as empty descriptions.
            ratings: Tensor whose last dimension has size 2; it is cast to the
                projection layer's dtype before use.
            positions: Integer tensor of position indices.

        Returns:
            The element-wise sum of the five feature embeddings.
        """
        # Text embedding.
        target_device = self.text_proj.weight.device
        text_embeds = self.text_proj(self._encode_descriptions(descriptions, target_device))

        # Author and category embeddings.
        author_embeds = self.author_embedding(authors)
        category_embeds = self.category_embedding(categories)

        # Rating features; the cast lets integer counts pass through nn.Linear.
        rating_embeds = self.rating_proj(ratings.to(self.rating_proj.weight.dtype))

        # Positional embeddings.
        pos_embeds = self.position_embedding(positions)

        # Final book embedding.
        book_embeds = text_embeds + author_embeds + category_embeds + rating_embeds + pos_embeds
        return book_embeds


ModuleNotFoundError: No module named 'transformers'