In [2]:
from jenga.tasks.movies import PredictMovieRatingsTask

## Our task is to predict future movie ratings based on past ratings and genre information

In [3]:
# The task object hands out the train/test slices and does the scoring in the cells below.
task = PredictMovieRatingsTask()

The training data consists of past user/item interactions and their genre information; the corresponding ratings are returned separately as an array

In [5]:
# Move the task on to its next year. This mutates `task`, so re-running this cell
# advances it again and the data shown below will change.
task.advance_current_year()

# Training examples for the year the task now points at.
# NOTE(review): presumably only the examples that are new for this year, as opposed to
# current_accumulated_train_data() used in the final cell -- confirm against jenga.
train_data = task.current_new_train_data()
train_data

Unnamed: 0,genres,year,user,movie
1119,Action|Sci-Fi|Thriller,2016,9,257
1120,Drama,2016,9,314
1121,Comedy,2016,9,506
1122,Horror|Thriller,2016,9,514
1123,Drama|Fantasy|Romance,2016,9,694
...,...,...,...,...
100811,,2016,609,9238
100814,,2016,609,9268
100815,,2016,609,9274
100820,,2016,609,9307


In [7]:
# Ratings (the labels) that belong to `train_data`; both are passed to fit_baseline_model().
train_ratings = task.current_new_train_ratings()
train_ratings

array([1. , 3.5, 4. , ..., 3.5, 2.5, 4.5], dtype=float32)

We provide a baseline model which computes embeddings of users and items

In [8]:
# Fit the baseline shipped with the task; it logs one line per training epoch.
model = task.fit_baseline_model(train_data, train_ratings)


Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


The test data consists of held-out user/item interactions from the future, whose ratings the model has to predict

In [10]:
# Examples to predict ratings for. score_on_test_ratings() takes only the predictions,
# so the true test ratings are held by `task` rather than returned here.
test_data = task.current_test_data()
test_data

Unnamed: 0,genres,year,user,movie
1434,Adventure|Children|Fantasy,2017,14,0
1436,Comedy|Drama|Romance,2017,14,43
1440,Drama,2017,14,224
1441,Mystery|Thriller,2017,14,254
1442,Action|Sci-Fi|Thriller,2017,14,257
...,...,...,...,...
100831,,2017,609,9416
100832,,2017,609,9443
100833,,2017,609,9444
100834,,2017,609,9445


In [12]:
# Score the baseline on the untouched test data; this value is the reference for the
# corrupted run further down. (The final cell labels this score as RMSE.)
predicted_ratings = model.predict(test_data)
task.score_on_test_ratings(predicted_ratings)

1.2711102

We can check what happens when a large portion of the genre data is missing

In [13]:
from jenga.corruptions.text import MissingValues

# Corruption that blanks the 'genres' column, configured with fraction=0.8 and '' as the
# value written in place of the original genres.
# NOTE(review): if the affected rows are picked at random, results will differ between
# runs because no seed is set in the visible cells -- confirm.
missing_values_corruption = MissingValues(column='genres', fraction=0.8, na_value='')

# NOTE(review): assumes transform() returns a corrupted copy and leaves `test_data`
# unchanged -- confirm against jenga.
corrupted_test_data = missing_values_corruption.transform(test_data)
corrupted_test_data

Unnamed: 0,genres,year,user,movie
1434,Adventure|Children|Fantasy,2017,14,0
1436,,2017,14,43
1440,,2017,14,224
1441,,2017,14,254
1442,,2017,14,257
...,...,...,...,...
100831,,2017,609,9416
100832,,2017,609,9443
100833,,2017,609,9444
100834,,2017,609,9445


In [14]:
# Same model and scoring as before, now on the corrupted test data. Note that this
# reuses the name `predicted_ratings`, so the clean predictions are overwritten.
predicted_ratings = model.predict(corrupted_test_data)
task.score_on_test_ratings(predicted_ratings)

1.2565356

### Here's how to run the evaluation on all the slices of the data

In [15]:
# Re-create the task: the instance used in the cells above has already been advanced.
task = PredictMovieRatingsTask()

# Same corruption settings as in the single-slice example.
missing_values_corruption = MissingValues(column='genres', fraction=0.8, na_value='')

# advance_current_year() moves the task forward and doubles as the loop condition;
# the loop stops as soon as it returns a falsy value.
while task.advance_current_year():
    # Uses the current_accumulated_* accessors, not the current_new_* ones from the
    # single-slice example.
    # NOTE(review): presumably all training data up to the current year -- confirm.
    train_data = task.current_accumulated_train_data()
    train_ratings = task.current_accumulated_train_ratings()
    
    # A new baseline model is fitted in every iteration.
    model = task.fit_baseline_model(train_data, train_ratings)
    
    test_data = task.current_test_data()
    corrupted_test_data = missing_values_corruption.transform(test_data)
    
    # Score on the clean test data first ...
    predicted_ratings = model.predict(test_data)
    rmse_clean = task.score_on_test_ratings(predicted_ratings)

    # ... then on the corrupted version, with the same model.
    predicted_ratings = model.predict(corrupted_test_data)
    rmse_corrupted = task.score_on_test_ratings(predicted_ratings)
    
    print('RMSE on clean test data', rmse_clean)
    print('RMSE on corrupted test data', rmse_corrupted)
    print('')
    

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15
RMSE on clean test data 0.88500464
RMSE on corrupted test data 0.8847652

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15
RMSE on clean test data 1.0500158
RMSE on corrupted test data 1.050098

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15
RMSE on clean test data 1.0796599
RMSE on corrupted test data 1.0797361

