In [52]:
import pandas as pd
import numpy as np
from numpy.linalg import norm

import multiprocessing as mp
import pandas as pd
import numpy as np

import pickle
import datetime

from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error

from funk_svd.dataset import fetch_ml_ratings
from funk_svd.utils import _timer
from funk_svd import SVD


# import read_pickles
# import TrainTest Splitter

from funk_svd.svd import SVD

In [53]:
class TrainTestSplitter:
    """Handles splitting of dataframes into train and test sets."""
    def __init__(self):
        pass

    @classmethod
    def split_by_date(self, df, split_date):
        train = df[df["Date"] < split_date]
        test = df[df["Date"] >= split_date]
        return train, test

    @classmethod
    def split_by_percent(self, df, percent=0.8, random_split=False):
        if random_split:
            df = df.sample(frac=1, random_state=1)
        split_index = int(df.shape[0] * percent)
        train = df.iloc[:split_index]
        test = df.iloc[split_index:]
        return train, test
    
    @classmethod
    def split_by_users(self, df, n_reviews_in_test=10):
        train_list, test_list = zip(*df.groupby('UserID').apply(self.split_user_reviews, n_reviews_in_test))
        # Concatenate the list of DataFrames into a single DataFrame for train and test
        train = pd.concat([x for x in train_list if x is not None])
        test = pd.concat([x for x in test_list if x is not None])
        return train, test


    def split_user_reviews(group, n_reviews_in_test):
        if len(group) > n_reviews_in_test:
            return group.iloc[:-10], group.iloc[-10:]
        else:
            return group, None
        

def read_pickles(path_to_folder):
    df_movies = pd.read_pickle(path_to_folder+"movies.pickle")
    df_users = pd.read_pickle(path_to_folder+"users.pickle")
    df_ratings = pd.read_pickle(path_to_folder+"ratings.pickle")
    return df_movies, df_users, df_ratings

In [54]:
movies, users, ratings = read_pickles("../ucu-recsys-movielens/data/ml-1m-after_eda/")

In [55]:
ratings.columns = ['u_id', 'i_id', 'rating', 'Timestamp', 'Datetime', 'Date']

In [56]:
train, test = TrainTestSplitter.split_by_date(ratings, datetime.datetime.strptime('2000-12-02', '%Y-%m-%d').date())

In [57]:
train_user_to_item = train.pivot(index='u_id', columns='i_id', values='rating')
val_user_to_item = test.pivot(index='u_id', columns='i_id', values='rating')

In [58]:
svd = SVD(lr=0.01, reg=0.1, n_epochs=100, n_factors=5,
          early_stopping=False, shuffle=False, min_rating=1, max_rating=5)

svd.fit(X=train, X_val=test)

Preprocessing data...

Preprocessing data...

Epoch 1/100  | val_loss: 0.93 - val_rmse: 0.96 - val_mae: 0.76 - took 0.0 sec
Epoch 2/100  | val_loss: 0.91 - val_rmse: 0.96 - val_mae: 0.76 - took 0.0 sec
Epoch 3/100  | val_loss: 0.91 - val_rmse: 0.95 - val_mae: 0.75 - took 0.0 sec
Epoch 4/100  | val_loss: 0.91 - val_rmse: 0.95 - val_mae: 0.75 - took 0.0 sec
Epoch 5/100  | val_loss: 0.91 - val_rmse: 0.95 - val_mae: 0.75 - took 0.0 sec
Epoch 6/100  | val_loss: 0.90 - val_rmse: 0.95 - val_mae: 0.75 - took 0.0 sec
Epoch 7/100  | val_loss: 0.90 - val_rmse: 0.95 - val_mae: 0.75 - took 0.0 sec
Epoch 8/100  | val_loss: 0.90 - val_rmse: 0.95 - val_mae: 0.75 - took 0.0 sec
Epoch 9/100  | val_loss: 0.90 - val_rmse: 0.95 - val_mae: 0.75 - took 0.0 sec
Epoch 10/100 | val_loss: 0.90 - val_rmse: 0.95 - val_mae: 0.75 - took 0.0 sec
Epoch 11/100 | val_loss: 0.90 - val_rmse: 0.95 - val_mae: 0.75 - took 0.0 sec
Epoch 12/100 | val_loss: 0.90 - val_rmse: 0.95 - val_mae: 0.75 - took 0.0 sec
Epoch 13/100 | val

<funk_svd.svd.SVD at 0x1fdd91a9550>

In [62]:
pred = svd.predict(test)
mae = mean_absolute_error(test['rating'], pred)
test['pred'] = np.random.normal(loc=3, scale=0.5, size=len(pred))

print(f'Test MAE: {mae:.2f}')

Test MAE: 0.75


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test['pred'] = np.random.normal(loc=3, scale=0.5, size=len(pred))


In [65]:
print(pred[:25])

[4.309891052165428, 3.4280481067763118, 3.9716537576688102, 3.9770967645829276, 3.9226874854193934, 4.2281711962075965, 3.8879647821610734, 4.226429169066795, 3.8099047126852854, 4.147639706751417, 3.8084515357528614, 3.3908693628481004, 3.794871200944771, 4.025607069910902, 3.9684607794006257, 3.895957313931766, 3.637653622312006, 3.681276271308918, 3.7504970699509808, 3.795775112525225, 3.6500331016744783, 4.230715881289319, 3.8127093953783358, 4.43577195254901, 3.3152844028416872]


In [63]:
test.head()

Unnamed: 0,u_id,i_id,rating,Timestamp,Datetime,Date,pred
0,1,1193,5,978300760,2000-12-31 22:12:40,2000-12-31,3.246398
1,1,661,3,978302109,2000-12-31 22:35:09,2000-12-31,3.62657
2,1,914,3,978301968,2000-12-31 22:32:48,2000-12-31,2.660016
3,1,3408,4,978300275,2000-12-31 22:04:35,2000-12-31,3.390802
4,1,2355,5,978824291,2001-01-06 23:38:11,2001-01-06,2.826671


In [61]:
test.to_pickle('df_5_factors.pkl')