# Matrix Factorisation - Intuition

In [1]:
import warnings
# Suppress every warning for the rest of the session to keep cell output uncluttered.
# NOTE(review): this blanket filter also hides deprecation and numerical warnings;
# consider narrowing it to specific categories or modules.
warnings.filterwarnings("ignore")

In [2]:
import numpy as np
import pandas as pd

In [3]:
# Read the ratings, items and users CSVs from the local data/ directory,
# in that order, into three DataFrames.
ratings, items, users = map(
    pd.read_csv,
    ("data/ratings.csv", "data/items.csv", "data/users.csv"),
)

In [4]:
# Preview the first rows of the raw ratings table.
ratings.head()

Unnamed: 0,user_id,movie_id,rating,unix_timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


## Small Data

| movie_id | title                 |
| -------: | :-------------------- |
|      1   | Toy Story (1995)      |
|     71   | Lion King, The (1994) |
|     95   | Aladdin (1992)        |
|     50   | Star Wars (1977)      |
|    176   | Aliens (1986)         |
|     82   | Jurassic Park (1993)  |


In [5]:
from recoflow.datasets import SampleData

In [6]:
# Derive the small sample tables that the rest of the notebook works on.
# NOTE(review): presumably restricted to the six movies listed above — confirm
# against recoflow.datasets.SampleData.
sample_users, sample_items, sample_ratings = SampleData(users, items, ratings)

In [7]:
from recoflow.preprocessing import EncodeUserItem

In [8]:
# Encode the sampled ratings: the result carries USER / ITEM integer codes plus
# RATING / TIMESTAMP columns (see the preview in the next cell), and the call
# also hands back the user/item counts and the two fitted encoders.
(
    interaction,
    n_users,
    n_items,
    user_encoder,
    item_encoder,
) = EncodeUserItem(
    sample_ratings,
    "user_id",
    "movie_id",
    "rating",
    "unix_timestamp",
)

Number of users:  10
Number of items:  6


In [9]:
# Preview the encoded interactions, including the added USER / ITEM code columns.
interaction.head()

Unnamed: 0,user_id,movie_id,RATING,TIMESTAMP,USER,ITEM
1052,2,50,5,888552084,1,1
1090,8,50,5,879362124,4,1
3672,6,95,2,883602133,2,4
4280,1,82,5,878542589,0,3
4596,12,82,4,879959610,6,3


In [10]:
# Observed rating bounds in the sample; both values are passed to the model
# constructor later on.
min_rating, max_rating = interaction["RATING"].min(), interaction["RATING"].max()
(min_rating, max_rating)

(1, 5)

In [11]:
from recoflow.vis import InteractionVis, TrainTestVis

In [12]:
# Visualise the observed user-item ratings of the sample.
InteractionVis(interaction)

In [13]:
from recoflow.preprocessing import RandomSplit, StratifiedSplit, ChronoSplit

In [14]:
# Split the interactions 60% / 40% into train and test sets.
# NOTE(review): no seed is passed or set anywhere visible, so the split (and
# everything downstream) may differ between runs — confirm whether RandomSplit
# fixes its own random state.
train, test = RandomSplit(interaction, [0.6, 0.4])

In [15]:
# Show which interactions landed in the train split and which in the test split.
TrainTestVis(train, test)

## Build Model

In [16]:
from recoflow.models import ExplicitMatrixFactorisationBias

In [17]:
# Two latent factors so the learned embeddings can be drawn directly on a 2-D plot.
n_factors = 2
# NOTE(review): the rating bounds are passed as (max_rating, min_rating) — confirm
# this matches the parameter order of ExplicitMatrixFactorisationBias.
model = ExplicitMatrixFactorisationBias(n_users, n_items, n_factors, max_rating, min_rating)

In [18]:
# Print the model's layer-by-layer summary.
model.summary()

In [19]:
%%time
# Fit on the train split for 100 epochs, evaluating on the test split via
# validation_data. batch_size=1 feeds one rating per step and verbose=0
# silences the training log, so the only cell output is the %%time wall time.
# NOTE(review): shuffle=True with no visible seed means runs are not reproducible.
output = model.fit([train.USER, train.ITEM], train.RATING, shuffle=True, batch_size=1, epochs=100, verbose=0, 
                   validation_data=([test.USER, test.ITEM], test.RATING))

Wall time: 21.9 s


In [20]:
from recoflow.vis import MetricsVis

In [21]:
# Plot the training history recorded by the fit call above.
MetricsVis(output.history)

In [22]:
from recoflow.recommend import UserEmbedding, ItemEmbedding

In [23]:
# Extract the learned embedding matrices from the trained model. Each helper is
# paired with the layer of the same name so variable, helper and layer agree.
# NOTE(review): assumes both helpers simply return the weights of the named
# layer — confirm in recoflow.recommend.
item_embedding = ItemEmbedding(model, "ItemEmbedding")
user_embedding = UserEmbedding(model, "UserEmbedding")

In [24]:
item_embedding

array([[ 0.2530496 , -0.5111764 ],
       [ 0.73075414, -0.55731034],
       [-0.42168757, -0.4707426 ],
       [-0.6180135 , -0.6195485 ],
       [-0.9349068 , -0.90708137],
       [-0.89944446, -0.95990777]], dtype=float32)

In [25]:
user_embedding

array([[ 0.76338285,  0.6608079 ],
       [-0.57110083,  0.72774523],
       [-0.7177329 , -0.38117373],
       [-0.15819004,  0.12245224],
       [ 0.17812128,  0.82140344],
       [-0.15531518,  0.7387702 ],
       [ 0.27006432,  0.54701644],
       [ 0.61769456,  0.59973997],
       [-0.8317416 , -0.3363021 ],
       [ 0.55941665,  0.52651054]], dtype=float32)

In [26]:
from recoflow.vis import EmbeddingVis

In [27]:
# Plot the item embeddings in the 2-factor latent space.
EmbeddingVis(item_embedding, n_factors, "ITEM")

In [28]:
# Plot the user embeddings in the same 2-factor latent space.
EmbeddingVis(user_embedding, n_factors, "USER")

In [29]:
# Combine the user and item embedding plots with `|`.
# NOTE(review): presumably these are Altair charts, where `|` places them side by side.
EmbeddingVis(user_embedding, n_factors, "USER") | EmbeddingVis(item_embedding, n_factors, "ITEM") 

In [30]:
from recoflow.recommend import GetPredictions, GetRankingTopK

In [31]:
# Run the trained model over the interaction table to obtain predicted ratings.
# NOTE(review): the exact set of user-item pairs scored is decided inside
# GetPredictions — confirm whether it covers only observed pairs or all pairs.
predictions = GetPredictions(model, interaction)

[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 237ms/step


In [32]:
# Visualise the predictions with the same view used earlier for the observed ratings.
InteractionVis(predictions)

In [38]:
import altair as alt

In [53]:
def SimilarityVis(item_embedding, user_embedding, return_vis=False):
    """Stack item and user embeddings into one labelled frame for joint plotting.

    Parameters
    ----------
    item_embedding, user_embedding : array-like
        Anything ``pd.DataFrame`` accepts. One row per item / user and one
        column per latent factor is assumed.
    return_vis : bool, default False
        When True, also build the Altair text-label chart over the first two
        factors and return ``(vis, embedding_df_wide)``.

    Returns
    -------
    pandas.DataFrame, or (altair chart, pandas.DataFrame) when ``return_vis``
        The frame has columns ``index``, ``X0`` ... ``X{k-1}`` and ``idx``,
        where ``idx`` is the row position prefixed with "I" for items and "U"
        for users. Item rows come first; each part keeps its own 0-based index.

    Raises
    ------
    ValueError
        If ``return_vis`` is True but fewer than two factors are available.
    """

    def _labelled(embedding, prefix):
        # reset_index() without inplace keeps the row position as a column
        # while leaving the caller's data untouched.
        frame = pd.DataFrame(embedding).reset_index()
        # Name factor columns by position so any embedding width works, not just two.
        frame.columns = ["index"] + [f"X{k}" for k in range(frame.shape[1] - 1)]
        frame["idx"] = [f"{prefix}{row}" for row in frame["index"]]
        return frame

    embedding_df_wide = pd.concat(
        [_labelled(item_embedding, "I"), _labelled(user_embedding, "U")]
    )

    if not return_vis:
        return embedding_df_wide

    if "X1" not in embedding_df_wide.columns:
        raise ValueError("return_vis=True needs at least two embedding factors")

    base = alt.Chart(embedding_df_wide).encode(
        alt.X("X0:Q", axis=alt.Axis(bandPosition=0.5)),
        alt.Y("X1:Q", axis=alt.Axis(bandPosition=0.5)),
    )
    # The zero-size point layer presumably exists only to anchor the axes;
    # the text layer carries the visible I*/U* labels.
    vis = base.mark_point(size=0) + base.mark_text().encode(text="idx")
    return vis, embedding_df_wide

In [54]:
# Build the combined item + user embedding table; as called here the function
# returns the labelled DataFrame, not a chart.
embedding_df_wide = SimilarityVis(item_embedding, user_embedding)

In [55]:
# Preview the combined table; item rows (idx I0, I1, ...) come first.
embedding_df_wide.head()

Unnamed: 0,index,X0,X1,idx
0,0,0.25305,-0.511176,I0
1,1,0.730754,-0.55731,I1
2,2,-0.421688,-0.470743,I2
3,3,-0.618014,-0.619548,I3
4,4,-0.934907,-0.907081,I4


In [56]:
# Plot every row of the combined embedding table as a text label (its `idx`)
# positioned by the two latent factors X0 and X1.
base = alt.Chart(embedding_df_wide).encode(
    alt.X("X0:Q", axis = alt.Axis(bandPosition = 0.5)),
    alt.Y("X1:Q", axis = alt.Axis(bandPosition = 0.5))
)

vis = base.mark_point(size=1) + base.mark_text().encode(text="idx")

# End the cell on the chart itself so the notebook actually renders it.
vis