# Search for a subset of data suitable for a test set

We want to find some set of items that were all rated by some subset of users. This would give us a test subset with full feedback.

In [19]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


[autoreload of data_loading.amazon failed: Traceback (most recent call last):
  File "/Users/sbokupripeku/miniforge3/envs/reduction_bandits/lib/python3.10/site-packages/IPython/extensions/autoreload.py", line 257, in check
    superreload(m, reload, self.old_objects)
  File "/Users/sbokupripeku/miniforge3/envs/reduction_bandits/lib/python3.10/site-packages/IPython/extensions/autoreload.py", line 455, in superreload
    module = reload(module)
  File "/Users/sbokupripeku/miniforge3/envs/reduction_bandits/lib/python3.10/importlib/__init__.py", line 169, in reload
    _bootstrap._exec(spec, module)
  File "<frozen importlib._bootstrap>", line 619, in _exec
  File "<frozen importlib._bootstrap_external>", line 883, in exec_module
  File "<frozen importlib._bootstrap>", line 241, in _call_with_frames_removed
  File "/Users/sbokupripeku/git/work/examples/reduction_bandits/data_loading/amazon.py", line 4, in <module>
    from data_loading import RecommenderDataset
ImportError: cannot import n

In [20]:
import os
os.chdir('..')

In [21]:
import datetime
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import time
from surprise import Dataset

from data_loading import get_movielens_data

In [22]:
# Reproducibility
seed = 42
np.random.seed(seed)

In [23]:
PROJECT_DIR = '.'

In [24]:
jokes = Dataset.load_builtin("jester")
ratings = pd.DataFrame(data=jokes.raw_ratings)
ratings.columns = ['user_id', 'item_id', 'rating', 'other']
del ratings['other']  # all_ratings.drop('other', axis=0)

In [25]:
ratings

Unnamed: 0,user_id,item_id,rating
0,1,5,0.219
1,1,7,-9.281
2,1,8,-9.281
3,1,13,-6.781
4,1,15,0.875
...,...,...,...
1761434,63978,57,-8.531
1761435,63978,24,-9.062
1761436,63978,124,-9.031
1761437,63978,58,-8.656


## Check users that all have rated same top-k items

In [26]:
def users_that_rated_all_items(ratings, items):
    user_sets = []
    for item_id in items:
        item_users = set(ratings[ratings.item_id == item_id].user_id)
        user_sets.append(item_users)
    return set.intersection(*user_sets)

In [28]:
def users_that_rated_all_top_k_items(ratings, k):
    items = ratings.groupby('item_id').size().sort_values(ascending=False)
    top_k_items = set(items[:k].index)
    top_k_ratings = ratings[ratings.item_id.isin(top_k_items)]
    top_k_users = users_that_rated_all_items(top_k_ratings, top_k_items)
    return top_k_users

In [29]:
top_k_users = users_that_rated_all_top_k_items(ratings, 140)

In [30]:
top_k_users

{'227', '304', '476', '477', '701'}

In [17]:
users_rated_top_k_items = {
    "k": [],
    "users": [],
}
for k in range(140, 141):
    users_rated_top_k_items['k'].append(k)
    users_rated_top_k_items['users'].append(users_that_rated_all_top_k_items(ratings, k))

In [18]:
pd.DataFrame(users_rated_top_k_items)

Unnamed: 0,k,users
0,140,5


## Check users that each has rated at least k items (may be different)

In [12]:
def users_that_rated_at_least_k_items(ratings, k):
    users = ratings.groupby('user_id').size().sort_values(ascending=False)
    at_least_k_users = users[users > k]
    return len(at_least_k_users)

In [15]:
users_rated_at_least_k_items = {
    "k": [],
    "users": [],
}
for k in range(1, 150, 10):
    users_rated_at_least_k_items['k'].append(k)
    users_rated_at_least_k_items['users'].append(users_that_rated_at_least_k_items(ratings, k))

In [16]:
pd.DataFrame(users_rated_at_least_k_items)

Unnamed: 0,k,users
0,1,57732
1,11,37263
2,21,24164
3,31,17049
4,41,12792
5,51,10089
6,61,8204
7,71,6824
8,81,5751
9,91,4840
