# **Implementing the Content Based Filtering Algorithm from Scratch**

In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras 
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split

In [2]:
# user content(x_u) -> neural net -> v_u
# movie content(x_m) -> neural net -> v_m
# y is the actual rating

# predicted rating = np.dot(v_u, v_m)

# user input vector x_u represents user's taste (ex: avg rating for each genre)
# movie input vector x_m represents movie's content (ex: year, genre(one hot), avg rating for that genre)

# each user has one vector x_u, but a movie can have several vectors x_m: one for each genre it belongs to.
# the neural network still receives one user vector and one movie vector per training example.
# ex: x_u and x_m1 for genre 1, x_u and x_m2 for genre 2, and so on.

In [3]:
# Load the three comma-separated training arrays in a fixed order:
#   item_train -> movie content rows (x_m)
#   user_train -> user content rows (x_u)
#   y_train    -> ratings (y)
item_train, user_train, y_train = (
    np.loadtxt(csv_path, delimiter=',')
    for csv_path in (
        'content_item_train.csv',
        'content_user_train.csv',
        'content_y_train.csv',
    )
)

In [4]:
# Shapes of the item rows, user rows and ratings, one per line.
print(item_train.shape, user_train.shape, y_train.shape, sep='\n')

(58187, 17)
(58187, 17)
(58187,)


In [5]:
# Column layout of a user row (17 values):
#   user id, rating count, rating average,
#   then the user's average rating for each of the 14 genres.
user_train[0]

array([ 2.    , 16.    ,  4.0625,  3.9   ,  5.    ,  0.    ,  0.    ,
        4.    ,  4.2   ,  4.    ,  4.    ,  0.    ,  3.    ,  4.    ,
        0.    ,  4.25  ,  3.875 ])

In [6]:
# Column layout of an item row (17 values):
#   movie id, year, average rating for the movie,
#   then a one-hot genre indicator for each of the 14 genres.
item_train[0]

array([6.87400000e+03, 2.00300000e+03, 3.96183206e+00, 1.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00])

In [7]:
# Rows are aligned across the three arrays: y_train[0] is the rating that the
# user in user_train[0] gave to the movie in item_train[0] (4.0 here).
y_train[0]

np.float64(4.0)

In [8]:
# Keep independent copies of the unscaled data.
# Plain assignment would only bind a second name to the same array, so any
# in-place change to item_train / user_train would also change the "saved"
# version; `.copy()` allocates separate arrays.
item_train_save = item_train.copy()
user_train_save = user_train.copy()

In [9]:
scalerItem = StandardScaler()
scalerItem.fit(item_train)
item_train = scalerItem.transform(item_train)

scalerUser = StandardScaler()
scalerUser.fit(user_train)
user_train = scalerUser.transform(user_train)

In [10]:
item_train, item_test = train_test_split(item_train, train_size=0.8, shuffle=True, random_state=1)
user_train, user_test = train_test_split(user_train, train_size=0.8, shuffle=True, random_state=1)
y_train, y_test = train_test_split(y_train, train_size=0.8, shuffle=True, random_state=1)   

In [11]:
# Row counts of the training and test portions of the item data, one per line.
print(item_train.shape, item_test.shape, sep='\n')

(46549, 17)
(11638, 17)


In [12]:
# First user row of the training split; the values are standardized, so they no
# longer look like raw ids, counts or ratings.
user_train[0]

array([ 1.11518128, -0.89065377,  0.55107954,  0.67356455,  0.61127079,
        0.57779233,  0.71133592,  0.72105638,  0.46401949,  0.69378025,
        0.19576629,  0.25667377,  0.30812552,  0.54095844,  0.51780862,
        0.76467069,  0.47074832])

In [13]:
# Ratings have an upper and a lower bound, so they are mapped linearly onto [-1, 1]
# with a MinMaxScaler. The scaler expects 2-D input, hence the reshape to a single column.
# The range is learned from the training ratings only and then reused for the test ratings.
scaler = MinMaxScaler(feature_range=(-1, 1))
ynorm_train = scaler.fit_transform(y_train.reshape(-1, 1))
ynorm_test = scaler.transform(y_test.reshape(-1, 1))

In [14]:
# Scaled targets are column vectors: one row per rating, one column.
print(ynorm_train.shape, ynorm_test.shape, sep='\n')

(46549, 1)
(11638, 1)


In [15]:
# Leading bookkeeping columns are not fed to the networks:
#   user rows -> skip user id, rating count, rating average (3 columns)
#   item rows -> skip movie id (1 column)
num_user_features = user_train.shape[-1] - 3
num_item_features = item_train.shape[-1] - 1

In [16]:
# Size of the embedding vectors v_u and v_m produced by the two networks.
num_outputs = 32

# Seed Python's `random`, NumPy and TensorFlow in one call.
# `tf.random.set_seed` alone only seeds TensorFlow ops; Keras 3 draws the default
# seeds of its weight initializers from Python's `random`, so without this the
# initial weights (and therefore the training results) differ between runs.
tf.keras.utils.set_random_seed(1)

# User tower: user features -> 32-dimensional vector v_u.
user_NN = tf.keras.Sequential([
    tf.keras.layers.Dense(256, activation='relu'),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dense(num_outputs, activation='linear')
])

# Item tower: same architecture, separate weights; item features -> v_m.
item_NN = tf.keras.Sequential([
    tf.keras.layers.Dense(256, activation='relu'),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dense(num_outputs, activation='linear')
])

# In Keras the `shape` argument does not include the batch dimension.
input_user = tf.keras.layers.Input(shape=(num_user_features,))
vu = user_NN(input_user)
# Normalise each vector to unit length so the dot product below stays in [-1, 1],
# the same range the target ratings are scaled to.
vu = tf.keras.layers.Lambda(lambda x: tf.linalg.l2_normalize(x, axis=1))(vu)

input_item = tf.keras.layers.Input(shape=(num_item_features,))
vm = item_NN(input_item)
vm = tf.keras.layers.Lambda(lambda x: tf.linalg.l2_normalize(x, axis=1))(vm)

# The dot product measures the similarity between a given user and item and
# serves as the predicted (scaled) rating.
output = tf.keras.layers.Dot(axes=1)([vu, vm])

# The full model takes [user features, item features] and returns the prediction.
model = tf.keras.Model([input_user, input_item], output)
model.summary()

In [17]:
tf.random.set_seed(1)
# Adam with a learning rate of 0.01, minimising the mean squared error between
# the predicted and the scaled target rating.
optimizer = tf.keras.optimizers.Adam(learning_rate=0.01)
cost_function = tf.keras.losses.MeanSquaredError()
model.compile(loss=cost_function, optimizer=optimizer)

In [18]:
tf.random.set_seed(1)
# Train for 30 epochs. The bookkeeping columns are sliced off first:
# the first 3 user columns and the first item column (see the feature counts above).
model.fit(
    x=[user_train[:, 3:], item_train[:, 1:]],
    y=ynorm_train,
    epochs=30,
)

Epoch 1/30
[1m1455/1455[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 465us/step - loss: 0.1246
Epoch 2/30
[1m1455/1455[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 490us/step - loss: 0.1182
Epoch 3/30
[1m1455/1455[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 526us/step - loss: 0.1164
Epoch 4/30
[1m1455/1455[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 456us/step - loss: 0.1148
Epoch 5/30
[1m1455/1455[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 476us/step - loss: 0.1136
Epoch 6/30
[1m1455/1455[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 470us/step - loss: 0.1125
Epoch 7/30
[1m1455/1455[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 486us/step - loss: 0.1116
Epoch 8/30
[1m1455/1455[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 468us/step - loss: 0.1108
Epoch 9/30
[1m1455/1455[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 466us/step - loss: 0.1100
Epoch 10/30
[1m1455/1455[0m [32m━━━━━━━━━━━━━━━━━━━━

<keras.src.callbacks.history.History at 0x11987aa50>

In [19]:
# Returns the compiled loss on the held-out rows, i.e. the mean squared error
# between predicted and actual ratings, measured on the [-1, 1] scaled targets.
model.evaluate(
    x=[user_test[:, 3:], item_test[:, 1:]],
    y=ynorm_test,
)

[1m364/364[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 287us/step - loss: 0.1058


0.10576813668012619

In [20]:
# Hypothetical new user who favors comedy, romance and sci-fi.

# Bookkeeping columns (same leading columns as the training user rows)
new_user_id = 5000
new_rating_count = 3
new_rating_ave = 1.0

# Per-genre average ratings; the favored genres carry the high values
new_action = 1.0
new_adventure = 1
new_animation = 1
new_childrens = 1
new_comedy = 5
new_crime = 1
new_documentary = 1
new_drama = 1
new_fantasy = 1
new_horror = 1
new_mystery = 1
new_romance = 5
new_scifi = 5
new_thriller = 1

# Single row in the column order: id, rating count, rating average, 14 genre averages
user_vec = np.array([
    new_user_id, new_rating_count, new_rating_ave,
    new_action, new_adventure, new_animation, new_childrens,
    new_comedy, new_crime, new_documentary,
    new_drama, new_fantasy, new_horror, new_mystery,
    new_romance, new_scifi, new_thriller,
]).reshape(1, -1)


In [21]:
# Sanity check: a single row with 17 columns, matching the training user rows.
user_vec.shape

(1, 17)

In [22]:
# Set of item vectors to score for the new user. The array is later passed through
# scalerItem, so it is expected to share the 17-column layout of the item training rows.
item_vecs = np.loadtxt('content_item_vecs.csv', delimiter=',')

In [23]:
# Number of candidate item rows and their column count.
item_vecs.shape

(1883, 17)

In [24]:
# Repeat the single user row once per item row so both model inputs have the same length.
user_vecs = np.repeat(user_vec, len(item_vecs), axis=0)

In [25]:
# Should match item_vecs.shape: one copy of the user row per item row.
user_vecs.shape

(1883, 17)

In [26]:
# Reuse the scalers fitted earlier so the new rows are on the same scale as the
# data the model was trained on.
scaled_item_vecs = scalerItem.transform(item_vecs)
scaled_user_vecs = scalerUser.transform(user_vecs)

# Score every (user, item) pair; bookkeeping columns are sliced off as during training.
y_p = model.predict(x=[scaled_user_vecs[:, 3:], scaled_item_vecs[:, 1:]])

# Map the model output back from the scaled target range to the original rating scale.
predictions = scaler.inverse_transform(y_p)

[1m59/59[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 827us/step


In [27]:
# Predicted ratings on the original rating scale, one row per row of item_vecs.
predictions

array([[3.6807437],
       [3.9911597],
       [4.068294 ],
       ...,
       [3.56056  ],
       [3.9111624],
       [3.5531023]], shape=(1883, 1), dtype=float32)

In [28]:
# similar movies have similar item vectors vm. a similarity measure is the squared distance between two item vectors.

In [29]:
def sq_dist(a, b):
    """Return the squared Euclidean distance between two vectors.

    Parameters
    ----------
    a, b : array-like
        Vectors of equal length (NumPy arrays or anything `np.asarray` accepts,
        such as lists).

    Returns
    -------
    NumPy scalar
        Sum of the squared element-wise differences. For inputs with more than
        one dimension the sum runs over the first axis only.
    """
    diff = np.asarray(a) - np.asarray(b)
    # np.sum runs in compiled code instead of iterating element by element in
    # Python; axis=0 keeps the first-axis reduction for N-D input.
    return np.sum(np.square(diff), axis=0)

In [30]:
# Reuse the already trained item_NN in a small stand-alone model that maps
# item features to the unit-length item vector vm.
input_item_m = tf.keras.layers.Input(shape=(num_item_features,))
vm_m = tf.keras.layers.Lambda(
    lambda x: tf.linalg.l2_normalize(x, axis=1)
)(item_NN(input_item_m))
model_m = tf.keras.Model(inputs=input_item_m, outputs=vm_m)
model_m.summary()

In [31]:
# Embed every scaled item row (movie id column sliced off) with the trained item
# network; each row of `vms` is one unit-length item vector.
vms = model_m.predict(scaled_item_vecs[:,1:])

[1m59/59[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 538us/step


In [32]:
# One embedding row per item; the width equals num_outputs.
vms.shape

(1883, 32)

In [33]:
# Number of item embeddings.
len(vms)

1883

In [34]:
# Similarity matrix: distance[i, j] is the squared distance between item vectors
# i and j (the same quantity sq_dist computes for a single pair).
dim = len(vms)
distance = np.zeros((dim, dim))

# One broadcasted subtraction per row fills a whole row at once, replacing
# dim * dim Python-level sq_dist calls with dim vectorised operations.
for i in range(dim):
    distance[i] = np.sum(np.square(vms - vms[i]), axis=1)

# Every item is trivially closest to itself; set the diagonal to infinity so a
# later argmin skips the self-match.
np.fill_diagonal(distance, np.inf)

In [35]:
# Index of the item whose vector is closest to item 0 (the diagonal is infinite,
# so item 0 itself is never selected).
min_idx = distance[0].argmin()
min_idx

np.int64(192)