# Search for a subset of data suitable for a test set

We want to find some set of items that were all rated by some subset of users. This would give us a test subset with full feedback.

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os
os.chdir('..')

In [3]:
import datetime
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import time

from data_loading import get_movielens_data

In [4]:
# Reproducibility
seed = 42
np.random.seed(seed)

In [5]:
PROJECT_DIR = '.'

In [6]:
ratings = pd.read_csv(
    f"{PROJECT_DIR}/dataset/amazon/Sports_and_Outdoors.csv", names=["item_id", "user_id", "rating", "timestamp"]
)


In [7]:
ratings.head()

Unnamed: 0,item_id,user_id,rating,timestamp
0,31895,A23K73OVXJ04EG,5.0,1391212800
1,31895,A2681T699HV6H1,4.0,1384905600
2,31895,A374PA18DCGS5Y,1.0,1477008000
3,31895,A14PVW2N5YBWSA,5.0,1476748800
4,31895,A2KWBC44QI2567,1.0,1476662400


In [19]:
ratings.groupby('user_id').size()

user_id
A0000040I1OM9N4SGBD8     2
A0000074RA15UCBH3ON5     1
A000013090ZI3HIT9N5V     3
A0000196KBA0ICH151EG     1
A0000618JRL8NVY0J0AN     1
                        ..
AZZZV0D9D5V05            1
AZZZVAXZQB7JJ           13
AZZZWV7EIR8PG            1
AZZZY1W55XHZR            3
AZZZYAYJQSDOJ            5
Length: 6703391, dtype: int64

In [18]:
ratings.groupby('user_id').size().sort_values(ascending=False)

user_id
A3OXHLG6DIBRW8    433
AN81JUYW2SL24     373
AVU1ILDDYW301     347
A2O489VAPVIH35    338
A8VI7KMUHI7ZH     310
                 ... 
A1Q1HTWEDJICOP      1
A316JTX825MIRP      1
A316JU6IVL0EZ9      1
A316JVZ7N3HPB5      1
A2W4D16OPPSCK0      1
Length: 6703391, dtype: int64

## Check users that all have rated same top-k items

In [8]:
def users_that_rated_all_items(ratings, items):
    user_sets = []
    for item_id in items:
        item_users = set(ratings[ratings.item_id == item_id].user_id)
        user_sets.append(item_users)
    return set.intersection(*user_sets)

In [9]:
def users_that_rated_all_top_k_items(ratings, k):
    items = ratings.groupby('item_id').size().sort_values(ascending=False)
    top_k_items = set(items[:k].index)
    top_k_ratings = ratings[ratings.item_id.isin(top_k_items)]
    top_k_users = users_that_rated_all_items(top_k_ratings, top_k_items)
    return len(top_k_users)

In [10]:
users_rated_top_k_items = {
    "k": [],
    "users": [],
}
for k in range(1, 50):
    users_rated_top_k_items['k'].append(k)
    users_rated_top_k_items['users'].append(users_that_rated_all_top_k_items(ratings, k))

In [11]:
pd.DataFrame(users_rated_top_k_items)

Unnamed: 0,k,users
0,1,9990
1,2,5
2,3,0
3,4,0
4,5,0
5,6,0
6,7,0
7,8,0
8,9,0
9,10,0


## Check users that each has rated at least k items (may be different)

In [12]:
def users_that_rated_at_least_k_items(ratings, k):
    users = ratings.groupby('user_id').size().sort_values(ascending=False)
    at_least_k_users = users[users > k]
    return len(at_least_k_users)

In [15]:
users_rated_at_least_k_items = {
    "k": [],
    "users": [],
}
for k in range(1, 550, 50):
    users_rated_at_least_k_items['k'].append(k)
    users_rated_at_least_k_items['users'].append(users_that_rated_at_least_k_items(ratings, k))

In [16]:
pd.DataFrame(users_rated_at_least_k_items)

Unnamed: 0,k,users
0,1,2299429
1,51,1618
2,101,173
3,151,40
4,201,22
5,251,10
6,301,6
7,351,2
8,401,1
9,451,0
