# Search for a subset of data suitable for a test set

We want to find some set of items that were all rated by some subset of users. This would give us a test subset with full feedback.

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os
os.chdir('..')

In [3]:
import datetime
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import time

from data_loading import get_movielens_data

In [4]:
# Reproducibility
seed = 42
np.random.seed(seed)

In [5]:
PROJECT_DIR = '.'

In [6]:
ratings = pd.read_table(f"{PROJECT_DIR}/dataset/movielens/ratings.dat", sep="::",
                           names=["user_id", "item_id", "rating", "timestamp"], engine='python')

In [7]:
ratings

Unnamed: 0,user_id,item_id,rating,timestamp
0,1,122,5.0,838985046
1,1,185,5.0,838983525
2,1,231,5.0,838983392
3,1,292,5.0,838983421
4,1,316,5.0,838983392
...,...,...,...,...
10000049,71567,2107,1.0,912580553
10000050,71567,2126,2.0,912649143
10000051,71567,2294,5.0,912577968
10000052,71567,2338,2.0,912578016


## Check users that all have rated same top-k items

In [8]:
def users_that_rated_all_items(ratings, items):
    user_sets = []
    for item_id in items:
        item_users = set(ratings[ratings.item_id == item_id].user_id)
        user_sets.append(item_users)
    return set.intersection(*user_sets)

In [9]:
def users_that_rated_all_top_k_items(ratings, k):
    items = ratings.groupby('item_id').size().sort_values(ascending=False)
    top_k_items = set(items[:k].index)
    top_k_ratings = ratings[ratings.item_id.isin(top_k_items)]
    top_k_users = users_that_rated_all_items(top_k_ratings, top_k_items)
    return len(top_k_users)

In [15]:
users_rated_top_k_items = {
    "k": [],
    "users": [],
}
for k in range(999, 1000):
    users_rated_top_k_items['k'].append(k)
    users_rated_top_k_items['users'].append(users_that_rated_all_top_k_items(ratings, k))

In [16]:
pd.DataFrame(users_rated_top_k_items)

Unnamed: 0,k,users
0,999,0


## Check users that each has rated at least k items (may be different)

In [12]:
def users_that_rated_at_least_k_items(ratings, k):
    users = ratings.groupby('user_id').size().sort_values(ascending=False)
    at_least_k_users = users[users > k]
    return len(at_least_k_users)

In [13]:
users_rated_at_least_k_items = {
    "k": [],
    "users": [],
}
for k in range(1, 1100, 100):
    users_rated_at_least_k_items['k'].append(k)
    users_rated_at_least_k_items['users'].append(users_that_rated_at_least_k_items(ratings, k))

In [14]:
pd.DataFrame(users_rated_at_least_k_items)

Unnamed: 0,k,users
0,1,69878
1,101,26191
2,201,13473
3,301,8157
4,401,5311
5,501,3640
6,601,2552
7,701,1890
8,801,1412
9,901,1061
