# Colaborative filtering

In [1]:
!pip install -Uqq fastai

In [2]:
from fastcore.all import *
from fastai.collab import *
from fastai.tabular.all import *
import pandas as pd
from fastdownload import download_url
from fastai.vision.all import *

In [11]:
# Data should already be splited
dataset_path = Path("..\\datasets\\movies\\ratings_small.csv")
train_path = Path("..\\datasets\\movies\\ratings_small_train.csv")
test_path = Path("..\\datasets\\movies\\ratings_small_test.csv")

In [75]:
df_train = pd.read_csv(train_path)
df_test = pd.read_csv(test_path)
column_names = ['userId', 'movieId', 'rating', 'timestamp']
df_train.columns = column_names
df_test.columns = column_names

df_all = pd.concat([df_train, df_test], axis=0)
valid_idx = list(range(len(df_train), len(df_all)))
df_all.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,105,3006,4.0,1085640181
1,423,5291,4.5,1353690457
2,263,4406,3.0,1117842764
3,564,2313,5.0,974713583
4,430,1923,4.5,1111488767


## Tabular learner

In [84]:
dls = TabularDataLoaders.from_df(df_all,
                                 train_path,
                                 procs=[Categorify],
                                 cat_names=['userId', 'movieId'], 
                                 cont_names=['timestamp'], 
                                 y_names='rating',
                                 valid_idx=valid_idx,
                                 bs=64)
# test_dl is the same as validation set
test_dl = dls.test_dl(df_test, with_labels=True)
dls.show_batch()

Unnamed: 0,userId,movieId,timestamp,rating
0,665,3408,1010198000.0,4.0
1,475,1580,1447330000.0,3.0
2,104,106072,1446674000.0,3.5
3,598,4452,1008572000.0,4.0
4,390,21,867076200.0,5.0
5,485,3396,1337749000.0,3.5
6,60,1673,1125829000.0,4.5
7,353,2028,1140389000.0,3.0
8,564,1728,974842900.0,4.0
9,482,1334,949292200.0,4.0


In [85]:
print(len(dls.train_ds))
print(len(dls.valid_ds))

99002
1000


In [86]:
learn = tabular_learner(dls)
learn.fit_one_cycle(5)

epoch,train_loss,valid_loss,time
0,0.88825,0.868549,00:30
1,0.818372,0.783069,00:28
2,0.724087,0.754221,00:28
3,0.635117,0.757278,00:28
4,0.561133,0.77712,00:28


In [87]:
# For some reason, sometimes it starts from begining
# loss is exactly as bad as from the first cycle
learn.fine_tune(3)

epoch,train_loss,valid_loss,time
0,0.761521,0.788739,00:28


epoch,train_loss,valid_loss,time
0,0.657828,0.770664,00:28
1,0.616675,0.770147,00:28
2,0.490362,0.801018,00:28


In [88]:
learn.validate(dl=test_dl)

(#1) [0.8010179996490479]

In [89]:
# Test on some examples
data = [[73, 1097, 1255504951],  # -> 4
       [1, 31, 1260759144]]  # -> 2.5
df = pd.DataFrame(data, columns=['userId', 'movieId', 'timestamp'])
dl = learn.dls.test_dl(df)
rating,probs = learn.get_preds(1, dl)
print(f"Rating: {rating}.")

Rating: tensor([[3.7068],
        [2.5908]]).


## Colaborative learner

In [78]:
# Can't load custom train and valid dataset, so the split is random each time
dls = CollabDataLoaders.from_df(df_all, 
                                valid_pct=0.01,
                                bs=64)
dls.show_batch()

Unnamed: 0,userId,movieId,rating
0,77,3897,3.0
1,529,1953,3.0
2,615,5463,3.5
3,135,18,4.0
4,102,2240,1.0
5,23,2953,3.5
6,522,5956,4.0
7,73,96610,4.0
8,73,6377,4.0
9,212,830,2.0


In [79]:
print(len(dls.train_ds))
print(len(dls.valid_ds))

99002
1000


In [80]:
learn = collab_learner(dls, y_range=(0.5,5.5))
learn.fit_one_cycle(5)

epoch,train_loss,valid_loss,time
0,1.07603,1.004944,00:10
1,0.780467,0.754815,00:09
2,0.697403,0.719355,00:09
3,0.661287,0.715405,00:10
4,0.622536,0.713353,00:10


In [83]:
learn.fine_tune(3)

epoch,train_loss,valid_loss,time
0,0.431251,0.732663,00:11


epoch,train_loss,valid_loss,time
0,0.345681,0.731203,00:10
1,0.310165,0.739009,00:10
2,0.267664,0.742005,00:10


In [81]:
# Test on some examples
data = [[73, 1097],  # -> 4
       [1, 31]]  # -> 2.5
df = pd.DataFrame(data, columns=['userId', 'movieId'])
dl = learn.dls.test_dl(df)
rating, probs = learn.get_preds(1, dl)
print(f"Rating: {rating}.")

Rating: tensor([3.8148, 2.7384]).
