# Feature Extraction and Processing for GoodReads Dataset
## Pre-processing input data, creating Embeddings, etc.

### Setup

In [1]:
import json
import os
import numpy as np
import pandas as pd
from IPython.display import display
import pprint
import tensorflow as tf

Dataset Directory

In [2]:
DIR = './data/cleaned/'
fn_books = 'goodreads_books_mystery_thriller_crime.pkl'
fn_reviews = 'goodreads_reviews_mystery_thriller_crime.pkl'

In [3]:
df_reviews = pd.read_pickle(os.path.join(DIR, fn_reviews))
df_books = pd.read_pickle(os.path.join(DIR, fn_books))

In [4]:
df_reviews.head()

Unnamed: 0_level_0,user_id,book_id,rating,review_text
review_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
5e212a62bced17b4dbe41150e5bb9037,8842281e1d1347389f2ab93d60773d4d,6392944,3,I haven't read a fun mystery book in a while a...
2ede853b14dc4583f96cf5d120af636f,8842281e1d1347389f2ab93d60773d4d,28684704,3,"A fun, fast paced science fiction thriller. I ..."
8e4d61801907e591018bdc3442a9cf2b,8842281e1d1347389f2ab93d60773d4d,32283133,0,http://www.telegraph.co.uk/culture/10...
022bb6daffa49adc27f6b20b6ebeb37d,8842281e1d1347389f2ab93d60773d4d,17860739,4,An amazing and unique creation: JJ Abrams and ...
0e317947e1fd341f573192111bb2921d,8842281e1d1347389f2ab93d60773d4d,8694005,3,The Name of the Rose is a thrilling Dan Brown-...


In [5]:
df_reviews.dtypes

user_id        object
book_id        uint32
rating          uint8
review_text    object
dtype: object

In [6]:
df_books.head()

Unnamed: 0_level_0,title,text_reviews_count,popular_shelves,average_rating,description,author_id,author_name
book_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
6066814,"b'Crowner Royal (Crowner John Mystery, #13)'",15,"[{'count': '159', 'name': 'to-read'}, {'count'...",3.93,"London, 1196. At the command of Richard the Li...",37778,Bernard Knight
33394837,"b""The House of Memory (Pluto's Snitch #2)""",60,"[{'count': '54', 'name': 'currently-reading'},...",4.33,,242185,Carolyn Haines
29074697,b'The Slaughtered Virgin of Zenopolis (Inspect...,23,"[{'count': '90', 'name': 'to-read'}, {'count':...",3.49,"BATHS, BANKS AND ROMAN INSURRECTION Detective ...",15104629,David Blake
1902202,"b'Dead in the Morning (Patrick Grant, #1)'",8,"[{'count': '51', 'name': 'to-read'}, {'count':...",3.3,"Gerald breezily introduced his wife, Helen, to...",190988,Margaret Yorke
9671977,b'Aristotele e i misteri di Eleusi',3,"[{'count': '48', 'name': 'to-read'}, {'count':...",3.54,"""I misteri di Eleusi"" e il quinto romanzo di A...",337108,Margaret Doody


In [7]:
df_join = pd.merge(df_reviews, df_books, left_on="book_id", right_index=True)
df_join.head()

Unnamed: 0_level_0,user_id,book_id,rating,review_text,title,text_reviews_count,popular_shelves,average_rating,description,author_id,author_name
review_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
0e317947e1fd341f573192111bb2921d,8842281e1d1347389f2ab93d60773d4d,8694005,3,The Name of the Rose is a thrilling Dan Brown-...,b'The Name of the Rose',99,"[{'count': '8209', 'name': 'to-read'}, {'count...",4.11,The year is 1327. Franciscans in a wealthy Ita...,1730,Umberto Eco
4276918357312212384ac6415ceb9159,8842281e1d1347389f2ab93d60773d4d,6652906,3,** spoiler alert ** Hooked me equally as wel...,"b'The Girl Who Played with Fire (Millennium, #2)'",772,"[{'count': '6613', 'name': 'fiction'}, {'count...",4.22,"Part blistering espionage thriller, part rivet...",706255,Stieg Larsson
9ee704921386f88893900829c037abd0,4fdf8e419e36ae2e82bc44376768e280,6652906,4,Don't start these books unless you're ready to...,"b'The Girl Who Played with Fire (Millennium, #2)'",772,"[{'count': '6613', 'name': 'fiction'}, {'count...",4.22,"Part blistering espionage thriller, part rivet...",706255,Stieg Larsson
26e59823f1936fe9030d85262f1477e1,446728d221c1343b92e1e4ff5545a843,6652906,5,Loved it! Not as much as the first but it this...,"b'The Girl Who Played with Fire (Millennium, #2)'",772,"[{'count': '6613', 'name': 'fiction'}, {'count...",4.22,"Part blistering espionage thriller, part rivet...",706255,Stieg Larsson
83370bd38023a2fd928b2b6114c2b210,fe0ad83a30bcd7fbe65ac1670b2b01e1,6652906,5,Once again Larsson does not disappoint. Althou...,"b'The Girl Who Played with Fire (Millennium, #2)'",772,"[{'count': '6613', 'name': 'fiction'}, {'count...",4.22,"Part blistering espionage thriller, part rivet...",706255,Stieg Larsson


In [8]:
df_join.describe()

Unnamed: 0,book_id,rating,text_reviews_count,average_rating,author_id
count,22726.0,22726.0,22726.0,22726.0,22726.0
mean,12494170.0,3.702323,1417.56442,3.861244,2209577.0
std,10616420.0,1.176469,3763.856472,0.282423,3636851.0
min,230.0,0.0,1.0,2.19,130.0
25%,1002539.0,3.0,34.0,3.7,17061.0
50%,11835450.0,4.0,184.0,3.88,147152.0
75%,21169350.0,5.0,879.0,4.05,3413185.0
max,36402840.0,5.0,24868.0,5.0,17320600.0


In [9]:
df_books.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 50000 entries, 6066814 to 23826
Data columns (total 7 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   title               50000 non-null  object 
 1   text_reviews_count  50000 non-null  uint32 
 2   popular_shelves     50000 non-null  object 
 3   average_rating      50000 non-null  float64
 4   description         50000 non-null  object 
 5   author_id           50000 non-null  int64  
 6   author_name         48081 non-null  object 
dtypes: float64(1), int64(1), object(4), uint32(1)
memory usage: 2.9+ MB


### Join Reviews and Book Infos
#### only 4272 book-review pairs are overlapping in total

In [10]:
df_join[100:105]

Unnamed: 0_level_0,user_id,book_id,rating,review_text,title,text_reviews_count,popular_shelves,average_rating,description,author_id,author_name
review_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
3ba2d0f573bd246be03abe6d7e02fe5d,1e946b8f76d5a75414946767cd18cff9,6411961,3,Perfect reading for when you're home sick on a...,"b'The Lost Symbol (Robert Langdon, #3)'",21569,"[{'count': '9279', 'name': 'currently-reading'...",3.66,WHAT IS LOST... WILL BE FOUND In this stunning...,630,Dan Brown
94a7379eb0cdd8ec55b2c943eb59a52b,9059ac97d0f44419021a6c092014e721,6411961,4,WHAT WORKS IN THE LOST SYMBOL The entire fir...,"b'The Lost Symbol (Robert Langdon, #3)'",21569,"[{'count': '9279', 'name': 'currently-reading'...",3.66,WHAT IS LOST... WILL BE FOUND In this stunning...,630,Dan Brown
a901824aeb09e8eff73ad165d7011082,248a173e53445b16a3fea5ef89df81fb,6411961,4,I guess I am a sucker for Dan Brown's books. I...,"b'The Lost Symbol (Robert Langdon, #3)'",21569,"[{'count': '9279', 'name': 'currently-reading'...",3.66,WHAT IS LOST... WILL BE FOUND In this stunning...,630,Dan Brown
dd451f680f846d20376c5a5ca9ef4eab,00ce07379fb4a962964dcfde4e146a84,6411961,4,"Overall, I really liked this book. I think Dan...","b'The Lost Symbol (Robert Langdon, #3)'",21569,"[{'count': '9279', 'name': 'currently-reading'...",3.66,WHAT IS LOST... WILL BE FOUND In this stunning...,630,Dan Brown
d3f2384966f5c9951c0f914b1424e705,86b0f8caad0c89c9b9ee9c5061e7d3db,6411961,2,To be reviwed later.,"b'The Lost Symbol (Robert Langdon, #3)'",21569,"[{'count': '9279', 'name': 'currently-reading'...",3.66,WHAT IS LOST... WILL BE FOUND In this stunning...,630,Dan Brown


In [11]:
df_join.info()

<class 'pandas.core.frame.DataFrame'>
Index: 22726 entries, 0e317947e1fd341f573192111bb2921d to 921812c9edc173c6d12e000723b9e667
Data columns (total 11 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   user_id             22726 non-null  object 
 1   book_id             22726 non-null  uint32 
 2   rating              22726 non-null  uint8  
 3   review_text         22726 non-null  object 
 4   title               22726 non-null  object 
 5   text_reviews_count  22726 non-null  uint32 
 6   popular_shelves     22726 non-null  object 
 7   average_rating      22726 non-null  float64
 8   description         22726 non-null  object 
 9   author_id           22726 non-null  int64  
 10  author_name         22279 non-null  object 
dtypes: float64(1), int64(1), object(6), uint32(2), uint8(1)
memory usage: 1.8+ MB


Reviews + Books joinen ? für Titel usw. ?

In [12]:
book_title_lookup = tf.keras.layers.StringLookup()

In [13]:
book_title_lookup.adapt(df_books['title'])
print(f"Vocabulary: {book_title_lookup.get_vocabulary()[:5]}")

Vocabulary: ['[UNK]', 'The Woman in White', 'The Adventures of Sherlock Holmes', 'Rebecca', 'The Hound of the Baskervilles']


now we can translate raw tokens (title) into embedding ids (here -> 1)

In [14]:
book_title_lookup("The Woman in White")


<tf.Tensor: shape=(), dtype=int64, numpy=1>

you have OOV (out of vocabulary) for unknown tokens. StringLookup can use multiple OOV indices. the more indices, the less likely two different feature values will hash to the same OOV index. To take it to the next level, let's _just_ use hashing

In [15]:
# We set up a large number of bins to reduce the chance of hash collisions.
num_hashing_bins = 200_000

book_title_hashing = tf.keras.layers.Hashing(
    num_bins=num_hashing_bins
)


In [16]:
book_title_hashing(['The Woman in White'])

<tf.Tensor: shape=(1,), dtype=int64, numpy=array([145026], dtype=int64)>

Now let's turn those integers into **Embeddings**
> An embedding layer has two dimensions: the first dimension tells us how many distinct categories we can embed; the second tells us how large the vector representing each of them can be. When creating the embedding layer for movie titles, we are going to set the first value to the size of our title vocabulary (or the number of hashing bins). The second is up to us: the larger it is, the higher the capacity of the model, but the slower it is to fit and serve.


In [17]:
book_title_embedding = tf.keras.layers.Embedding(
    # Let's use the explicit vocabulary lookup. (for whatever reason..)
    input_dim=book_title_lookup.vocabulary_size(),
    output_dim=32
)

String -> Integer -> Embedding

In [18]:
book_title_model = tf.keras.Sequential([tf.keras.Input(shape=(1,), dtype=tf.string), book_title_lookup, book_title_embedding])

In [19]:
book_title_model.predict(['The Woman in White'])

array([[[ 0.01066036,  0.01711277,  0.04086823, -0.04574196,
         -0.00500498,  0.02479979,  0.04201926,  0.0478866 ,
         -0.00592098,  0.00510074, -0.02413275,  0.01563613,
          0.04536356,  0.0133328 ,  0.01701099, -0.04780373,
         -0.03997608,  0.00044893, -0.0154395 , -0.01296217,
         -0.03021021,  0.02254916, -0.02640446,  0.04348096,
          0.01255165,  0.03839214,  0.03369657,  0.00730436,
          0.01088259,  0.00588713,  0.03960041, -0.04956684]]],
      dtype=float32)

Jetzt das gleiche mit User IDs 
--> nicht nötig, da sie schon pure ints sind.

### Normalize Ratings
not sure if necessary, but hey, doesn't hurt.
not sure if min/max rescaling (0 - 1) or normalization is better...

In [20]:
rating_scaling = tf.keras.layers.Rescaling(scale=1/5.)

In [21]:
rating_normalization = tf.keras.layers.Normalization()
rating_normalization.adapt(df_reviews['rating'])
for x in df_reviews['rating'].sample(5):
    print(f"normalized rating: {rating_normalization(x)}, was {x}")

normalized rating: [[1.0962977]], was 5
normalized rating: [[1.0962977]], was 5
normalized rating: [[0.25075173]], was 4
normalized rating: [[-1.4403403]], was 2
normalized rating: [[1.0962977]], was 5


jetzt hab ich halt Kontinuerliche Werte aus Diskreten gemacht. das ist eigentlich dumm.

In [22]:
rating_scaling(df_reviews['rating'].sample())

<tf.Tensor: shape=(1,), dtype=float32, numpy=array([1.], dtype=float32)>

Ich bleib bei den rescaled Ratings. ist dann schön 0 - 1 aber auch nicht kategorisch, weil ja durchaus ein linearer Zusammenhang besteht.

### Processing text features

## TODO: nur englische Bücher verwenden!

In [23]:
description_text = tf.keras.layers.TextVectorization()
description_text.adapt(df_books['description'])


In [24]:
sample = df_books['description'].sample()
sample

book_id
2516539    Storm Kayamas old high school friend, Tanner W...
Name: description, dtype: object

In [25]:
description_text(sample)

<tf.Tensor: shape=(1, 206), dtype=int64, numpy=
array([[  1270, 219574,    118,    330,    307,    142,   5852,   2924,
           207,      3,   3006,    101,   1413,      9,      6,    152,
             6,  60990,      6,    158,    466,     10,   2793,      8,
           610,    220,    820,      5,     10,   9207,    183,      3,
          1420,  10155,  30317,      7,      2,   8588,   6135,    425,
           467,      4,    811,   2052,  79411,      8,    953,      2,
          3046,     11,      3,  60990, 110065,   8023,  25124,   2361,
             3,    396,      7,      2,    630,      5,  11222,  23015,
             2,    183,      5,     19,  25101,  14118,   7719,     23,
           727,    120,      3,    346,  12035,     13,   1270,      4,
         79411,      6,    125,    224,     21,  13593,     13,      2,
          1684,     16,   1270,    133,    103,     11,   5852,     20,
          2960,   1232,     75,     14,    635,     15,    440,     10,
          2098, 

In [26]:
description_text.get_vocabulary()[35:45]


['into', 'will', 'they', 'him', 'have', 'e', 'life', 'all', 'out', 'its']

> To finish the processing, we now need to embed the text. Because each title contains multiple words, we will get multiple embeddings for each title. For use in a donwstream model these are usually compressed into a single embedding. Models like RNNs or Transformers are useful here, but averaging all the words' embeddings together is a good starting point.

## User Model

In [27]:
user_id_lookup = tf.keras.layers.StringLookup()
user_id_lookup.adapt(df_reviews["user_id"])

user_id_embedding = tf.keras.layers.Embedding(user_id_lookup.vocabulary_size(), 32)

user_id_model = tf.keras.Sequential([tf.keras.Input(shape=(1,), dtype=tf.string), user_id_lookup, user_id_embedding])


außer ID hab ich keine Infos zum User. könnte noch die Timestamps hinzufügen, ja. will ich aber erstmal nicht.

In [28]:
for row in df_reviews['user_id'].sample(2):
  print(f"Computed representations: {user_id_model.predict([row])}")


Computed representations: [[[ 0.03676773  0.02399799  0.03393122 -0.02682499 -0.01497364
    0.01548943 -0.0463771  -0.0040454   0.02141673  0.00218928
   -0.01313556 -0.00419117 -0.01833665 -0.04250544 -0.04027412
   -0.03447889  0.00763812  0.01837161 -0.0106344   0.03355536
   -0.00362672 -0.02506605  0.02969397 -0.02095327  0.00200088
   -0.03212874 -0.04628074  0.02463732  0.00325201  0.02798146
   -0.00209218  0.00934591]]]
Computed representations: [[[-0.01933162 -0.00532739  0.04377028  0.01310365 -0.0256615
    0.03164845  0.02509285  0.0153225   0.01659072 -0.04679772
   -0.0037107  -0.04951607  0.00525152  0.00392311 -0.04494044
    0.04446206 -0.04955762 -0.01787345  0.04363413  0.02332539
   -0.0259964  -0.01419444 -0.00015927 -0.02043463 -0.03585851
    0.00241078  0.00375281  0.04275284 -0.03490371  0.03385906
   -0.04698585  0.01285278]]]


## Book Model

In [29]:
class BookModel(tf.keras.Model):

  def __init__(self, title_vocabulary, description_vocabulary, author_vocabulary):
    super().__init__()

    max_tokens = 10_000
    embedding_dim = 32

    self.title_vectorization_layer = tf.keras.layers.TextVectorization(max_tokens=max_tokens)
    self.title_vectorization_layer.adapt(title_vocabulary)
    
    self.title_embedding = tf.keras.Sequential([
      self.title_vectorization_layer,
      tf.keras.layers.Embedding(max_tokens, embedding_dim, name="embedding"),
      # We average the embedding of individual words to get one embedding vector
      # per description.
      tf.keras.layers.GlobalAveragePooling1D(),
    ])

    self.description_text_vectorization_layer = tf.keras.layers.TextVectorization(
      max_tokens=max_tokens)
    self.description_text_vectorization_layer.adapt(description_vocabulary)
    
    self.description_text_embedding = tf.keras.Sequential([
      self.description_text_vectorization_layer,
      tf.keras.layers.Embedding(max_tokens, embedding_dim, name="embedding"),
      # We average the embedding of individual words to get one embedding vector
      # per description.
      tf.keras.layers.GlobalAveragePooling1D(),
    ])

    # author IDs are numerical
    #from sample list of author ids create a continous range
    # train an embedding representation of this range
    self.author_id_lookup = tf.keras.layers.IntegerLookup()
    self.author_id_lookup.adapt(author_vocabulary)

    self.author_id_embedding = tf.keras.Sequential([
      tf.keras.Input(shape=(1,), dtype='int64'), 
      self.author_id_lookup,
      tf.keras.layers.Embedding(self.author_id_lookup.vocabulary_size(), embedding_dim, name="embedding")
      ])


  def call(self, inputs):
    #TODO
    return tf.concat([
        self.title_embedding(inputs["title"]),
        self.description_text_embedding(inputs["description"]),
        self.author_id_embedding(inputs["author_id"]),
    ], axis=1)


In [30]:
book_model = BookModel(
    title_vocabulary = df_books['title'],
    description_vocabulary = df_books['description'],
    author_vocabulary=df_books['author_id'])

In [31]:
sample = df_books[['title', 'description', 'author_id']].sample(1)
sample

Unnamed: 0_level_0,title,description,author_id
book_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
11297506,"b'Murder on the Interstate (Logan & Cafferty, ...","While traveling a northern Arizona highway, Se...",2014086


In [32]:
book_model(sample)

<tf.Tensor: shape=(1, 96), dtype=float32, numpy=
array([[ 1.36308745e-02, -1.00424782e-04,  4.92974836e-03,
        -7.21854623e-03, -4.45580896e-04,  2.48784572e-03,
        -6.33176969e-05, -1.01633072e-02,  2.00869725e-03,
        -2.38350127e-02, -2.04396266e-02, -8.13369639e-03,
        -2.53176522e-02,  9.26260836e-03, -3.75619042e-03,
        -1.51012400e-02, -9.51328035e-03,  2.56669300e-04,
        -2.34544463e-02, -1.81150436e-02,  4.27870173e-03,
         1.14700885e-03,  1.35055976e-02, -3.79976630e-03,
        -1.89826742e-03, -8.42983648e-03,  1.42757725e-02,
         9.74639202e-04, -1.94532294e-02, -6.94482122e-03,
        -2.23167194e-03,  1.12741655e-02,  5.38914511e-03,
        -5.17953187e-03,  7.79309217e-03, -3.75817763e-03,
         3.98984551e-03,  1.26012517e-02,  1.04575492e-02,
         2.63223075e-04, -7.79145584e-03,  6.11317065e-03,
         6.04703731e-04, -1.57060695e-03, -3.40280379e-03,
         4.63121152e-03,  1.84182706e-03, -2.08296068e-03,
       

## Review Model

In [33]:
class ReviewModel(tf.keras.Model):

  def __init__(self, user_ids, review_ids, book_ids):
    super().__init__()

    max_tokens = 10_000
    embedding_dim = 32

    # Book IDs are numerical
    #from sample list of book ids create a continous range
    # train an embedding representation of this range
    self.book_id_lookup = tf.keras.layers.IntegerLookup()
    self.book_id_lookup.adapt(book_ids)

    self.book_id_embedding = tf.keras.Sequential([
      tf.keras.Input(shape=(1,), dtype='int64'), 
      self.book_id_lookup,
      tf.keras.layers.Embedding(self.book_id_lookup.vocabulary_size(), embedding_dim, name="embedding")
      ])

    # Review and User IDs are alphanumerical
    
    self.review_id_vectorization_layer = tf.keras.layers.TextVectorization(
      max_tokens=max_tokens)
    self.review_id_vectorization_layer.adapt(review_ids)

    self.review_id_embedding = tf.keras.Sequential([
      self.review_id_vectorization_layer,
      tf.keras.layers.Embedding(self.review_id_vectorization_layer.vocabulary_size(), embedding_dim, name="embedding"),
      # We average the embedding of individual words to get one embedding vector
      # per description.
      tf.keras.layers.GlobalAveragePooling1D(),
    ])

    self.user_id_vectorization_layer = tf.keras.layers.TextVectorization(
      max_tokens=max_tokens)
    self.user_id_vectorization_layer.adapt(user_ids)

    self.user_id_embedding = tf.keras.Sequential([
      self.user_id_vectorization_layer,
      tf.keras.layers.Embedding(self.user_id_vectorization_layer.vocabulary_size(), embedding_dim, name="embedding"),
      #TODO MASK ZERO??
      # We average the embedding of individual words to get one embedding vector
      # per description.
      tf.keras.layers.GlobalAveragePooling1D(),
    ])


  def call(self, inputs: pd.DataFrame):
    #TODO
    if 'review_id' in inputs.keys():
      review_input = inputs['review_id']
    elif inputs.index.name == 'review_id':
      review_input = inputs.index
    else:
      raise ValueError("'review_id' neither index nor column")
    return tf.concat([
        self.book_id_embedding(inputs["book_id"]),
        self.review_id_embedding(review_input),
        self.user_id_embedding(inputs["user_id"]),
    ], axis=1)


In [34]:
review_model = ReviewModel(
    user_ids = df_reviews['user_id'],
    review_ids = df_reviews.index,
    book_ids=df_reviews['book_id'])



In [35]:
sample = df_reviews.sample(1)
sample

Unnamed: 0_level_0,user_id,book_id,rating,review_text
review_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
152c0a5b68a9271bba1716c9ac867393,659186143b55358ca7f9b26cc6aa0634,18775247,4,i'm pissed i didn't read this sooner lol


In [36]:
review_model(sample)

<tf.Tensor: shape=(1, 96), dtype=float32, numpy=
array([[-0.0028586 , -0.03091831,  0.01607896,  0.04697912,  0.03025583,
         0.01199814, -0.00444038,  0.03963006, -0.03829606, -0.03585865,
        -0.01786268,  0.01868767,  0.02994077, -0.0010972 ,  0.03992815,
        -0.02445447,  0.00245376,  0.04150835, -0.0403934 , -0.04680669,
        -0.03060129,  0.02973701, -0.01777003,  0.00031988,  0.01246957,
         0.01589235,  0.02303443, -0.02682594, -0.00256842, -0.01515793,
        -0.00168462,  0.01027632,  0.04173422,  0.04804875, -0.01320882,
         0.02330193, -0.02479473, -0.02070893, -0.00672935, -0.02630199,
         0.04590357,  0.04963528,  0.0407191 ,  0.03217853, -0.02726812,
         0.00869119,  0.01918832,  0.0481332 , -0.0193388 , -0.02571688,
        -0.02887375,  0.04855379, -0.02993959,  0.00309888, -0.0434199 ,
        -0.00871726,  0.04729148, -0.04802017,  0.02213817,  0.00420422,
         0.04742939,  0.03508562,  0.00080667,  0.00336868, -0.0441296 ,
  

In [37]:
assert False, "Breakpoint"

AssertionError: Breakpoint

In [44]:
type(df_books['title'].iloc[0])

bytes

### Retrieve trained word embeddings and save them to disk

In [38]:
import io
def save_embedding_to_disk(embedding_name: str, embedding_model: tf.keras.Model, vectorization_layer: tf.keras.layers.TextVectorization):
    weights = embedding_model.get_layer('embedding').get_weights()[0]
    vocab = vectorization_layer.get_vocabulary()
    out_dir = "./data/embeddings/"
    out_v = io.open(os.path.join(out_dir, f'{embedding_name}_vectors.tsv'), 'w', encoding='utf-8')
    out_m = io.open(os.path.join(out_dir, f'{embedding_name}_metadata.tsv'), 'w', encoding='utf-8')

    for index, word in enumerate(vocab):
        if index == 0:
            continue  # skip 0, it's padding.
        vec = weights[index]
        out_v.write('\t'.join([str(x) for x in vec]) + "\n")
        out_m.write(str(word) + "\n")
    out_v.close()
    out_m.close()

### BookModel

In [39]:
book_model_embeddings = {
    "title": (book_model.title_embedding, book_model.title_vectorization_layer),
    "description": (book_model.description_text_embedding, book_model.description_text_vectorization_layer),
    "author_id": (book_model.author_id_embedding, book_model.author_id_lookup)
}

for key, (embedding, vectorize) in book_model_embeddings.items():
    try:
        save_embedding_to_disk(key, embedding, vectorize)
    except Exception as e:
        print("error @ ", key, ": ", repr(e))
    

error @  title :  UnicodeDecodeError('utf-8', b'\xc3', 0, 1, 'unexpected end of data')


#### Review Model

In [45]:
review_model_embeddings = {
    "book_id": (review_model.book_id_embedding, review_model.book_id_lookup),
    "review_id": (review_model.review_id_embedding, review_model.review_id_vectorization_layer),
    "user_id": (review_model.user_id_embedding, review_model.user_id_vectorization_layer)
}

for key, (embedding, vectorize) in review_model_embeddings.items():
    try:
        save_embedding_to_disk(key, embedding, vectorize)
    except Exception as e:
        print("error @ ", key, ": ", repr(e))

### View Embeddings in Embedding Projector

In [None]:
%load_ext tensorboard

## TODO: Review ID kann vermutlich weg?
### bzw. brauch ich das ReviewModel *überhaupt*?