In [10]:
import pandas as pd
from ast import literal_eval
from model.components.preprocessors.data_preprocessor_v2 import DataPreprocessor
from sklearn.metrics.pairwise import cosine_similarity

# Show floats with two decimal places whenever pandas renders a frame.
pd.set_option('display.float_format', '{:.2f}'.format)

"""# Import Data"""
filepath = 'https://raw.githubusercontent.com/malcolmosh/goodbooks-10k/master/books_enriched.csv'
print(f'Importing Data from {filepath}...')

# Enriched book metadata from the goodbooks-10k repo. The first CSV column
# becomes the index, and the "genres" column is parsed from its textual
# Python-literal form into real Python objects.
books_df = pd.read_csv(
    filepath,
    index_col=[0],
    converters={"genres": literal_eval},
)

# Second, unconverted read of the same file (default index, raw "genres" text).
books_ratings = pd.read_csv(filepath)

# NOTE(review): 'Date' below is presumably a typo for 'Data'; the message is
# left unchanged so the emitted output stays the same.
print('Date import complete.\n')

Importing Data from https://raw.githubusercontent.com/malcolmosh/goodbooks-10k/master/books_enriched.csv...
Date import complete.


In [11]:
from transformers import DistilBertTokenizer, TFDistilBertModel
import pandas as pd
import numpy as np
import tensorflow as tf


class FeatureExtractor:
    """Turn book metadata into dense vectors with a pretrained DistilBERT model.

    For every book the author, title, description and genres are joined into
    a single string, passed through the model, and the token embeddings are
    mean-pooled into one vector.
    """

    def __init__(self, model_name="distilbert-base-uncased"):
        """Load the tokenizer and the TensorFlow model.

        Args:
            model_name: Checkpoint name or path handed to ``from_pretrained``
                for both the tokenizer and the model.
        """
        self.tokenizer = DistilBertTokenizer.from_pretrained(model_name)
        self.model = TFDistilBertModel.from_pretrained(model_name)

    def extract_features(self, books_df_processed):
        """Embed every row of ``books_df_processed``.

        Args:
            books_df_processed: Table with ``author``, ``title`` and
                ``description`` columns holding strings and a ``genres``
                column holding an iterable of strings per row.

        Returns:
            A 2-D ``numpy.ndarray`` with one row per input row (in input
            order) and one column per hidden dimension of the model.

        Raises:
            ValueError: If ``books_df_processed`` has no rows.
        """
        document_embeddings = []
        rows = zip(books_df_processed['author'], books_df_processed['title'],
                   books_df_processed['description'], books_df_processed['genres'])
        for author, title, desc, genres in rows:
            # One text per book: author, title, description, then the genres.
            input_text = ' '.join([author, title, desc, ' '.join(genres)])

            # truncation=True cuts over-long texts; since the genres come
            # last, they are the first part to be lost on long descriptions.
            inputs = self.tokenizer(input_text, padding=True, truncation=True, return_tensors="tf")

            # Forward pass through the model.
            outputs = self.model(inputs)

            # Mean-pool over the token axis (axis 1) instead of taking only
            # the [CLS] embedding; the result keeps a leading batch axis of 1.
            pooled_embedding = tf.reduce_mean(outputs.last_hidden_state, axis=1)
            document_embeddings.append(pooled_embedding.numpy())

        if not document_embeddings:
            raise ValueError("books_df_processed has no rows to embed")

        # Stack the per-book (1, hidden) arrays directly. Wrapping the list in
        # another list, as before, produced a 3-D (n, 1, hidden) array, which
        # sklearn's cosine_similarity rejects.
        composite_feature_vector = np.vstack(document_embeddings)

        return composite_feature_vector

In [12]:
"""# Preprocessing"""
print('Performing Preprocessing...')
preprocessor = DataPreprocessor()
books_df_processed = preprocessor.preprocess(books_df)

# Look-up table: title -> row position in books_df_processed.
# Positions (not index labels) are stored because recommend_items uses the
# value to index the similarity matrix and then .iloc, both positional.
indices = pd.Series(range(len(books_df_processed)), index=books_df_processed['title'])
# Keep only the first row for each title. Series.drop_duplicates() would
# de-duplicate the values, not the titles, so repeated titles would survive
# and indices[title] would return several rows instead of one.
indices = indices[~indices.index.duplicated(keep='first')]
print('Preprocessing complete.\n')

Performing Preprocessing...
Preprocessing complete.


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  books_df_subset['author'] = books_df_subset['authors'].apply(lambda x: x[0]).astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  books_df_subset['description'] = books_df_subset['description'].fillna('')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  books_df_subset['title'] = books_df_subse

In [None]:
"""# Feature Extraction"""
print('Performing Feature Extraction...')
# Constructing FeatureExtractor loads the tokenizer and model through
# from_pretrained (default checkpoint: "distilbert-base-uncased").
featureExtractor = FeatureExtractor()
# extract_features runs one model forward pass per row of books_df_processed
# in a Python loop, so expect this call to dominate the notebook's run time.
composite_feature_vector = featureExtractor.extract_features(books_df_processed)
print('Feature Extraction complete.\n')

Performing Feature Extraction...







Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFDistilBertModel: ['vocab_transform.bias', 'vocab_transform.weight', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_projector.bias']
- This IS expected if you are initializing TFDistilBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFDistilBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertModel for predictions without further training.


In [None]:
"""# Similarity Measure"""
print('Generating Similarity Measures...')
# Using Cosine Similarity
# Called with a single argument, so the similarities are computed between the
# rows of composite_feature_vector itself; recommend_items later indexes the
# result by row position.
# NOTE(review): sklearn expects a 2-D (n_samples, n_features) input here;
# confirm the array returned by extract_features has that shape.
cosine_sim = cosine_similarity(composite_feature_vector)
print('Similarity Measure generation complete.\n')

In [None]:
def recommend_items(title, similarity_measure=cosine_sim, fuzzy=False, top_n=10):
    """Return the titles of the items most similar to ``title``.

    Parameters
    ----------
    title : str
        Title to look up; it is lower-cased before the lookup in ``indices``.
    similarity_measure : array-like
        Square pairwise-similarity matrix indexed by row position. The default
        is bound to ``cosine_sim`` once, when this ``def`` is executed, so
        re-run this cell after recomputing ``cosine_sim``.
    fuzzy : bool, default False
        When True and there is no exact match, fall back to the closest
        known title (``difflib.get_close_matches``). Exact matches always win.
    top_n : int, default 10
        Number of recommendations to return.

    Returns
    -------
    pandas.Series
        Up to ``top_n`` titles from ``books_df_processed``, most similar
        first; the looked-up item itself is never included.

    Raises
    ------
    KeyError
        If the title is unknown (and, with ``fuzzy=True``, nothing is close).
    """
    import difflib  # local import: only needed for the optional fuzzy lookup

    # Convert input title to lowercase
    title = title.lower()

    # Optional fuzzy fallback, used only when the exact title is missing.
    if fuzzy and title not in indices.index:
        known_titles = [t for t in indices.index.unique() if isinstance(t, str)]
        closest = difflib.get_close_matches(title, known_titles, n=1)
        if closest:
            title = closest[0]

    # Get the index of the item that matches the title (KeyError if unknown).
    idx = indices[title]
    # A title that occurs several times in ``indices`` yields several
    # entries; use the first so the lookup below selects a single row.
    if np.ndim(idx) > 0:
        idx = np.asarray(idx).ravel()[0]
    idx = int(idx)

    # Similarity of every item to the selected one.
    scores = np.asarray(similarity_measure[idx])

    # Most similar first; the stable sort keeps ties in their original order.
    order = np.argsort(-scores, kind="stable")

    # Drop the item itself explicitly: with ties or duplicated embeddings it
    # is not guaranteed to be the first entry of the ranking.
    order = order[order != idx][:max(int(top_n), 0)]

    # Return the most similar items
    return books_df_processed['title'].iloc[order]
