# Search for a subset of data suitable for a test set

We want to find some set of items that were all rated by some subset of users. This would give us a test subset with full feedback.

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os
os.chdir('..')

In [3]:
import datetime
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import time
from datetime import datetime

from data_loading import get_movielens_data

In [4]:
# Reproducibility
seed = 42
np.random.seed(seed)

In [5]:
PROJECT_DIR = '.'

In [6]:
ratings = pd.read_csv(
    f"{PROJECT_DIR}/dataset/amazon/Books.csv", names=["item_id", "user_id", "rating", "timestamp"]
)


In [7]:
ratings.item_id.unique().shape

(2930451,)

In [8]:
ratings

Unnamed: 0,item_id,user_id,rating,timestamp
0,0001713353,A1C6M8LCIX4M6M,5.0,1123804800
1,0001713353,A1REUF3A1YCPHM,5.0,1112140800
2,0001713353,A1YRBRK2XM5D5,5.0,1081036800
3,0001713353,A1V8ZR5P78P4ZU,5.0,1077321600
4,0001713353,A2ZB06582NXCIV,5.0,1475452800
...,...,...,...,...
51311616,B01HJDS76Y,A3P8PUZFHWFK1E,4.0,1467244800
51311617,B01HJDS76Y,A26Q9T9G9MTX9S,1.0,1467158400
51311618,B01HJEB422,AB9M1MQPBAS2J,5.0,1487030400
51311619,B01HJEB422,A2TO93KMH3DJIK,5.0,1485648000


In [9]:
max_ts = ratings.timestamp.max()

In [10]:
min_ts = ratings.timestamp.min()

In [15]:
max_ts

1538438400

In [16]:
min_ts

832550400

In [18]:
datetime.fromtimestamp(1538438400)

datetime.datetime(2018, 10, 2, 2, 0)

In [17]:
datetime.fromtimestamp(832550400)

datetime.datetime(1996, 5, 20, 2, 0)

In [21]:
YEARS_BEHIND = 5.8
max_ts = ratings.timestamp.max()
start_time = max_ts - 3600 * 24 * 365 * YEARS_BEHIND
end_time = max_ts - 3600 * 24 * 365 * (YEARS_BEHIND-0.5)

In [23]:
datetime.fromtimestamp(start_time), datetime.fromtimestamp(end_time)

(datetime.datetime(2012, 12, 15, 1, 0), datetime.datetime(2013, 6, 15, 14, 0))

In [22]:
new_ratings = ratings[ratings.timestamp > start_time]
new_ratings = new_ratings[new_ratings.timestamp < end_time]

In [14]:
del ratings

In [15]:
datetime.fromtimestamp(1506988800)

datetime.datetime(2017, 10, 3, 2, 0)

In [16]:
new_ratings

Unnamed: 0,item_id,user_id,rating,timestamp
380,0001384198,A310S8GKL783R8,5.0,1523577600
381,0001384198,ABO3HOSXIPOZY,4.0,1523404800
382,0001384198,AU4Z1Q4AX9ILA,5.0,1523318400
383,0001384198,A1TGZVJZK8SMBX,5.0,1523232000
384,0001384198,A3CCP99A7Z7HHT,3.0,1523145600
...,...,...,...,...
51311540,B01HJ4GFPS,ARAE0E3LSLL35,5.0,1523577600
51311541,B01HJ4GFPS,A1QDUNLE8SAQAL,5.0,1523318400
51311575,B01HJ71K4Q,A2FRD6XBB8923L,4.0,1538352000
51311584,B01HJ8KWDU,A2AYU3DW4HHCSJ,5.0,1538006400


In [17]:
N_ARMS = 400

In [18]:
actions = new_ratings.groupby("item_id").size().sort_values(ascending=False)[:N_ARMS]

In [19]:
len(actions)

400

In [20]:
actions = list(actions.index)

In [21]:
top_ratings = new_ratings[new_ratings["item_id"].isin(actions)]

In [22]:
top_ratings = top_ratings.sort_values("timestamp", ascending=1)

In [23]:
top_ratings

Unnamed: 0,item_id,user_id,rating,timestamp
28422930,1503949184,AUOZCP7MJ8Y34,4.0,1522713600
2923910,0091956943,A2H5MLJ4FWPDZH,5.0,1522713600
2923911,0091956943,A13P6FG0HWN8NJ,4.0,1522713600
2923972,0091956943,A35BBREAKV3AT,5.0,1522713600
28422937,1503949184,AF2ZEM2B0Y107,1.0,1522713600
...,...,...,...,...
51309935,B01HB9Q7CW,A3BYZXHJMXPFAG,5.0,1537920000
51309936,B01HB9Q7CW,A3NZRO52W86KF8,5.0,1537920000
51309933,B01HB9Q7CW,A3QLH30O6L0ZTT,5.0,1538006400
51309932,B01HB9Q7CW,A1811AKYX3RPSN,4.0,1538265600


In [24]:
user_stream = top_ratings[["user_id", "timestamp"]]

In [25]:
top_ratings.groupby('user_id').size().sort_values(ascending=False)[:10000]

user_id
A2MPS6SED8DVCW    45
A39YL17T9LO0SP    41
A2R2O6R00PXFC2    40
A1JLU5H1CCENWX    37
A3SJW5TGW16UDT    36
                  ..
A3U5R1T0U6OMG6     2
A14M932494HEP      2
A15UNCBR5LLPQE     2
A15UOLX72HCR05     2
A2Z3LHEZ99BU3E     2
Length: 10000, dtype: int64

In [26]:
top_users = list(top_ratings.groupby('user_id').size().sort_values(ascending=False)[:5000].index)

In [27]:
user_stream = user_stream[user_stream.user_id.isin(top_users)]

In [28]:
user_stream

Unnamed: 0,user_id,timestamp
28422937,AF2ZEM2B0Y107,1522713600
28422980,A3S0CAA3NO7KAJ,1522713600
28422933,A3DGIT33XGIHD9,1522713600
33492086,A3NO3YDQ9U76TQ,1522713600
33492041,A3OGIGTQ6SIHSN,1522713600
...,...,...
37943159,AUH8IK46Z6V9R,1537488000
37943150,A2FHQI74OKE1XI,1537574400
37943151,A3IDANEJLTGHUQ,1537574400
51273672,A15ORM6PIO5VUX,1537660800


In [29]:
# user_stream = user_stream.iloc[-100000:]

In [30]:
# user_stream = top_ratings[["user_id", "timestamp"]].iloc[-100000:]

In [31]:
users = set(user_stream.user_id)

In [32]:
len(users)

5000

In [33]:
top_ratings = top_ratings[top_ratings.user_id.isin(users)]

In [34]:
user_stream

Unnamed: 0,user_id,timestamp
28422937,AF2ZEM2B0Y107,1522713600
28422980,A3S0CAA3NO7KAJ,1522713600
28422933,A3DGIT33XGIHD9,1522713600
33492086,A3NO3YDQ9U76TQ,1522713600
33492041,A3OGIGTQ6SIHSN,1522713600
...,...,...
37943159,AUH8IK46Z6V9R,1537488000
37943150,A2FHQI74OKE1XI,1537574400
37943151,A3IDANEJLTGHUQ,1537574400
51273672,A15ORM6PIO5VUX,1537660800


In [35]:
top_ratings["reward"] = np.where(top_ratings["rating"] >= 0.0, 1, 0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  top_ratings["reward"] = np.where(top_ratings["rating"] >= 0.0, 1, 0)


In [36]:
reward_list = top_ratings[["user_id", "item_id", "reward", "rating"]]

In [37]:
reward_list = reward_list[reward_list['reward'] == 1]

In [38]:
reward_list = reward_list.set_index("user_id")
watched_list_series = (
    reward_list.groupby("user_id")["item_id"].agg(set=set).set
)
user_id_to_watched_list_index = {
    uid: ind for ind, uid in enumerate(watched_list_series.index)
}

In [39]:
len(user_id_to_watched_list_index)

5000

In [40]:
reward_matrix = np.zeros((len(user_id_to_watched_list_index), len(actions)))
for uid, ind in user_id_to_watched_list_index.items():
    watched_list = watched_list_series.iloc[ind]
    watched_indices = [i for i in range(len(actions)) if actions[i] in watched_list]
    # Binary vector of rewards for each arm.
    reward_matrix[ind, watched_indices] = 1

In [41]:
reward_matrix.shape

(5000, 400)

In [42]:
reward_matrix

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [43]:
mean_rew = np.mean(reward_matrix, axis=0)
np.sort(mean_rew)[::-1][:50]

array([0.0754, 0.0714, 0.0678, 0.0664, 0.0562, 0.0492, 0.0484, 0.0482,
       0.047 , 0.0456, 0.0444, 0.044 , 0.0422, 0.0418, 0.0408, 0.0402,
       0.0396, 0.039 , 0.0388, 0.0388, 0.0384, 0.038 , 0.0372, 0.037 ,
       0.0364, 0.036 , 0.036 , 0.0356, 0.0354, 0.0344, 0.0342, 0.034 ,
       0.0338, 0.0336, 0.0332, 0.0332, 0.0332, 0.0332, 0.0324, 0.0324,
       0.0322, 0.0316, 0.0314, 0.0314, 0.0308, 0.0306, 0.0306, 0.0302,
       0.0302, 0.03  ])

In [44]:
exp_rewards = np.zeros((len(user_stream), N_ARMS))
for i, user_id in enumerate(user_stream.user_id):
    ind = user_id_to_watched_list_index[user_id]
    exp_rewards[i, :] = reward_matrix[ind, :]

In [45]:
exp_rewards.shape

(25148, 400)

In [46]:
exp_rewards

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [47]:
mean_rew = np.mean(exp_rewards, axis=0)
np.sort(mean_rew)[::-1][:50]

array([0.12462224, 0.12319071, 0.11332909, 0.10827899, 0.09615079,
       0.09519644, 0.09372515, 0.09066327, 0.08545411, 0.08541435,
       0.08354541, 0.08231271, 0.08179577, 0.0806426 , 0.08004613,
       0.07789884, 0.07475744, 0.07380309, 0.07348497, 0.0723318 ,
       0.07054239, 0.06923016, 0.06903133, 0.06831557, 0.06811675,
       0.06736122, 0.0671624 , 0.06600923, 0.06577064, 0.06473676,
       0.06410052, 0.064021  , 0.06366311, 0.06354382, 0.06247018,
       0.06239065, 0.061317  , 0.05944807, 0.05889136, 0.05821537,
       0.05821537, 0.05773819, 0.05769843, 0.0574996 , 0.05698266,
       0.05698266, 0.05650549, 0.05634643, 0.0555909 , 0.05511373])

In [48]:
mean_rew = exp_rewards.mean(axis=0)

In [49]:
np.sort(mean_rew)[::-1][:50]

array([0.12462224, 0.12319071, 0.11332909, 0.10827899, 0.09615079,
       0.09519644, 0.09372515, 0.09066327, 0.08545411, 0.08541435,
       0.08354541, 0.08231271, 0.08179577, 0.0806426 , 0.08004613,
       0.07789884, 0.07475744, 0.07380309, 0.07348497, 0.0723318 ,
       0.07054239, 0.06923016, 0.06903133, 0.06831557, 0.06811675,
       0.06736122, 0.0671624 , 0.06600923, 0.06577064, 0.06473676,
       0.06410052, 0.064021  , 0.06366311, 0.06354382, 0.06247018,
       0.06239065, 0.061317  , 0.05944807, 0.05889136, 0.05821537,
       0.05821537, 0.05773819, 0.05769843, 0.0574996 , 0.05698266,
       0.05698266, 0.05650549, 0.05634643, 0.0555909 , 0.05511373])

In [50]:
mean_rew.sum()

7.634046445045332

In [53]:
np.arange(0.1, 12, 0.1)

array([ 0.1,  0.2,  0.3,  0.4,  0.5,  0.6,  0.7,  0.8,  0.9,  1. ,  1.1,
        1.2,  1.3,  1.4,  1.5,  1.6,  1.7,  1.8,  1.9,  2. ,  2.1,  2.2,
        2.3,  2.4,  2.5,  2.6,  2.7,  2.8,  2.9,  3. ,  3.1,  3.2,  3.3,
        3.4,  3.5,  3.6,  3.7,  3.8,  3.9,  4. ,  4.1,  4.2,  4.3,  4.4,
        4.5,  4.6,  4.7,  4.8,  4.9,  5. ,  5.1,  5.2,  5.3,  5.4,  5.5,
        5.6,  5.7,  5.8,  5.9,  6. ,  6.1,  6.2,  6.3,  6.4,  6.5,  6.6,
        6.7,  6.8,  6.9,  7. ,  7.1,  7.2,  7.3,  7.4,  7.5,  7.6,  7.7,
        7.8,  7.9,  8. ,  8.1,  8.2,  8.3,  8.4,  8.5,  8.6,  8.7,  8.8,
        8.9,  9. ,  9.1,  9.2,  9.3,  9.4,  9.5,  9.6,  9.7,  9.8,  9.9,
       10. , 10.1, 10.2, 10.3, 10.4, 10.5, 10.6, 10.7, 10.8, 10.9, 11. ,
       11.1, 11.2, 11.3, 11.4, 11.5, 11.6, 11.7, 11.8, 11.9])

In [74]:
ratings = pd.read_csv(
    f"{PROJECT_DIR}/dataset/amazon/Video_Games.csv", names=["item_id", "user_id", "rating", "timestamp"]
)

years_search = [0.5, 1, 1.5, 2, 2.5, 3, 3.5, 4, 4.5, 5, 5.5, 6, 6.5, 7, 7.5, 8]
# years_search = [5.8]
# years_search = np.arange(1, 8, 0.2)


max_sum = 0
best_year = 0
for YEARS_BEHIND in years_search:
    print(f"{YEARS_BEHIND} years behind")
    max_ts = ratings.timestamp.max()
    start_time = max_ts - 3600 * 24 * 365 * YEARS_BEHIND
    end_time = max_ts - 3600 * 24 * 365 * (YEARS_BEHIND-0.5)

    new_ratings = ratings[ratings.timestamp > start_time]
    new_ratings = new_ratings[new_ratings.timestamp < end_time]

    N_ARMS = 400
    actions = new_ratings.groupby("item_id").size().sort_values(ascending=False)[:N_ARMS]
    actions = list(actions.index)
    top_ratings = new_ratings[new_ratings["item_id"].isin(actions)]
    
    top_ratings = top_ratings.sort_values("timestamp", ascending=1)

    user_stream = top_ratings[["user_id", "timestamp"]]

    top_users = list(top_ratings.groupby('user_id').size().sort_values(ascending=False)[:5000].index)
    user_stream = user_stream[user_stream.user_id.isin(top_users)]
    
    # This line makes sure we use the 5k users sorted by timestamp.
    user_stream = user_stream.drop_duplicates(subset=['user_id'])
    
    print(f"Length of user_stream: {len(user_stream)}")
#     if len(user_stream) > 100000:
#         user_stream = user_stream.iloc[-100000:]
#         print("ENOUGH")
    
    users = set(user_stream.user_id)
    top_ratings = top_ratings[top_ratings.user_id.isin(users)]

    top_ratings["reward"] = np.where(top_ratings["rating"] >= 0.0, 1, 0)
    reward_list = top_ratings[["user_id", "item_id", "reward", "rating"]]
    reward_list = reward_list[reward_list['reward'] == 1] 
    reward_list = reward_list.set_index("user_id")
    
    watched_list_series = (
        reward_list.groupby("user_id")["item_id"].agg(set=set).set
    )
    user_id_to_watched_list_index = {
        uid: ind for ind, uid in enumerate(watched_list_series.index)
    }
    reward_matrix = np.zeros((len(user_id_to_watched_list_index), len(actions)))
    for uid, ind in user_id_to_watched_list_index.items():
        watched_list = watched_list_series.iloc[ind]
        watched_indices = [i for i in range(len(actions)) if actions[i] in watched_list]
        # Binary vector of rewards for each arm.
        reward_matrix[ind, watched_indices] = 1

    mean_rew = reward_matrix.mean(axis=0)
    print(np.sort(mean_rew)[::-1][:50])
    print(f"Sum of rewards:\t {mean_rew.sum()}")
    if mean_rew.sum() > max_sum:
        max_sum = mean_rew.sum()
        best_year = YEARS_BEHIND
    print('\n')

0.5 years behind
Length of user_stream: 5000
[0.1644 0.1644 0.1636 0.0684 0.0672 0.0672 0.0668 0.0562 0.0562 0.05
 0.0456 0.0452 0.0292 0.029  0.0268 0.0244 0.0238 0.0238 0.0204 0.0154
 0.0136 0.0136 0.013  0.0118 0.0106 0.0104 0.0104 0.0102 0.009  0.009
 0.0084 0.0082 0.0082 0.0082 0.008  0.008  0.008  0.0074 0.0074 0.0074
 0.0072 0.007  0.0068 0.0068 0.0062 0.0062 0.0062 0.006  0.006  0.006 ]
Sum of rewards:	 2.272


1 years behind
Length of user_stream: 5000
[0.128  0.128  0.1082 0.0754 0.0754 0.0754 0.0754 0.0752 0.0492 0.0492
 0.0492 0.0492 0.0492 0.0492 0.049  0.0486 0.046  0.0452 0.0392 0.0388
 0.0264 0.0262 0.0222 0.0206 0.0206 0.0198 0.0198 0.0198 0.0196 0.0194
 0.0192 0.019  0.0176 0.0162 0.016  0.0152 0.0152 0.0146 0.0144 0.0144
 0.0136 0.013  0.0124 0.0122 0.012  0.0112 0.0108 0.0102 0.01   0.0098]
Sum of rewards:	 2.571


1.5 years behind
Length of user_stream: 5000
[0.075  0.0728 0.071  0.071  0.071  0.071  0.0606 0.0606 0.0606 0.0604
 0.0542 0.0542 0.0466 0.0466 0.0466 0

In [75]:
best_year

2

In [84]:
len(user_stream.user_id.unique())

30000

In [87]:
np.sort(mean_rew)[::-1][:50]

array([0.04733, 0.04656, 0.04648, 0.04082, 0.04082, 0.04082, 0.04075,
       0.04057, 0.04005, 0.03971, 0.03756, 0.03412, 0.02974, 0.02644,
       0.02637, 0.02371, 0.02121, 0.02084, 0.01922, 0.01897, 0.01897,
       0.0189 , 0.01869, 0.01869, 0.01807, 0.01807, 0.01669, 0.01581,
       0.01509, 0.01463, 0.01458, 0.01453, 0.0143 , 0.01429, 0.01413,
       0.01385, 0.0138 , 0.01378, 0.01374, 0.01369, 0.01351, 0.01296,
       0.01258, 0.01256, 0.01249, 0.01241, 0.0124 , 0.01228, 0.0122 ,
       0.01197])

How to sort the users by timestamp?
I have the users sorted by their ratings.

In [8]:
ratings.groupby('user_id').size()

user_id
A000033826RVJH496D4A    3
A00007762BKXYRMOCC0A    1
A0001176G5I54P8WV7J5    1
A0001258YUYGHOWU7Y0C    1
A0001392IVCRENBEIEYS    1
                       ..
AZZZXMJMM84SU           1
AZZZYCR4NZADZ           2
AZZZZCVMZCKUJ           1
AZZZZW74AAX75           1
AZZZZXVAOWWME           2
Length: 15362619, dtype: int64

In [9]:
ratings.groupby('user_id').size().sort_values(ascending=False)

user_id
A2OJW07GQRNJUT    9684
A2F6N60Z96CAJI    9074
A328S9RN3U5M68    7077
AHUT55E980RDR     5842
A1X8VZWTOG8IS6    4437
                  ... 
A389V60VNXAC1I       1
A389V4BWF5QL0J       1
A1WJEWVJJ6ISMD       1
A1WJEX7RQEFKSE       1
A1P1A4OKMU0GXL       1
Length: 15362619, dtype: int64

## Check users that all have rated same top-k items

In [8]:
def users_that_rated_all_items(ratings, items):
    user_sets = []
    for item_id in items:
        item_users = set(ratings[ratings.item_id == item_id].user_id)
        user_sets.append(item_users)
    return set.intersection(*user_sets)

In [9]:
def users_that_rated_all_top_k_items(ratings, k):
    items = ratings.groupby('item_id').size().sort_values(ascending=False)
    top_k_items = set(items[:k].index)
    top_k_ratings = ratings[ratings.item_id.isin(top_k_items)]
    top_k_users = users_that_rated_all_items(top_k_ratings, top_k_items)
    return len(top_k_users)

In [10]:
users_rated_top_k_items = {
    "k": [],
    "users": [],
}
for k in range(1, 50):
    users_rated_top_k_items['k'].append(k)
    users_rated_top_k_items['users'].append(users_that_rated_all_top_k_items(ratings, k))

In [11]:
pd.DataFrame(users_rated_top_k_items)

Unnamed: 0,k,users
0,1,9990
1,2,5
2,3,0
3,4,0
4,5,0
5,6,0
6,7,0
7,8,0
8,9,0
9,10,0


## Check users that each has rated at least k items (may be different)

In [12]:
def users_that_rated_at_least_k_items(ratings, k):
    users = ratings.groupby('user_id').size().sort_values(ascending=False)
    at_least_k_users = users[users > k]
    return len(at_least_k_users)

In [15]:
users_rated_at_least_k_items = {
    "k": [],
    "users": [],
}
for k in range(1, 550, 50):
    users_rated_at_least_k_items['k'].append(k)
    users_rated_at_least_k_items['users'].append(users_that_rated_at_least_k_items(ratings, k))

In [16]:
pd.DataFrame(users_rated_at_least_k_items)

Unnamed: 0,k,users
0,1,2299429
1,51,1618
2,101,173
3,151,40
4,201,22
5,251,10
6,301,6
7,351,2
8,401,1
9,451,0
