In [72]:
import numpy as np
import pandas as pd
from collections import Counter
import operator
import copy
import random
from sklearn import model_selection
raw_games = pd.read_csv("steam-200k.csv", names =["user_id", "game", "behavior", "value", "misc."])
playtime = raw_games[raw_games["behavior"] == "play" ]
purchased = raw_games[raw_games["behavior"] == "purchase" ]

In [190]:
def bag_of_games(data_set):
    user_ids = data_set["user_id"].unique()
    user_bag = {}#.fromkeys(map((lambda x: str(x)),user_ids), {})
    
    
    for user in user_ids:
        
        user_bag[str(user)] = {}
        for row in data_set[data_set["user_id"] == user][["game", "value"]].itertuples():
            user_bag[str(user)][row[1]] = row[2]
        
    for user, gamelist in user_bag.items():
        factor  = 1/sum(gamelist.values())
        for game in gamelist.keys():
            gamelist[game] = factor*gamelist[game]
    
        
    return user_bag

In [192]:
%time playtime_bag = bag_of_games(playtime)

CPU times: user 57.2 s, sys: 718 ms, total: 58 s
Wall time: 1min


In [169]:
len(playtime_bag)

11350

In [197]:
def leave_n_in(bag, n=1, p = .25):
    copy_bag = copy.deepcopy(bag)
    user_ids = list(copy_bag.keys())
    
    N = 0
    targets = []
    for user in bag.keys():
        game_list = bag[user] 
        j = len(game_list)
        N += j
        if j > n:
            targets.append(user)
            
                
    k = int(p*N)
    popped = []
    removed = {}
    i = 0
    h = len(targets)
    while i != k:
        rand_1 = int(h*(random.random()))
        user = targets[rand_1]
        gamelist = copy_bag[user]
        j = len(gamelist) 
        if j <= n:
            pass
        else:
            if user not in popped:
                popped.append(user)
            rand_2 = int(j*random.random())
            games  = list(gamelist.keys())
            selected = games[rand_2]
            removed[(user, selected)] = gamelist.pop(selected)   
            i += 1
    else:
        query_bag = {}
        for user in popped:
            query_bag[user] = copy_bag.pop(user)
        return copy_bag, removed, query_bag

In [209]:
%time train_bag, heldout, query_bag = leave_n_in(playtime_bag)

CPU times: user 1.59 s, sys: 17.5 ms, total: 1.6 s
Wall time: 1.77 s


In [210]:
def baseline(train_bag, query_bag, heldout, user_reg=25, game_reg=25):
    total_bag = copy.deepcopy(train_bag)
    total_bag.update(query_bag)
    n = 0
    total = 0
    for userid in train_bag:
        gamelist = train_bag[userid]
        n += len(list(gamelist.keys()))
        total += sum(gamelist.values())
    else:
        total_avg = total/n
        
    user_baseline = {}
    for userid in total_bag:
        user = total_bag[userid]
        user_baseline[userid] = (sum(user.values()) - len(user)*total_avg)/(len(user)+user_reg)
    
    predictions = {}
    for test_point in heldout.keys():
        test_user = test_point[0]
        test_game = test_point[1]
        bu = user_baseline[test_user]
        n = 0
        bi = 0
        for userid in train_bag:
            user = train_bag[userid]
            if test_game in user.keys():
                bi += user[test_game] - user_baseline[userid] - total_avg
                n += 1
        else:
            bi = bi/(n + game_reg)
        
        
        predictions[test_point] = total_avg + bu + bi
    return predictions

In [211]:
%time predictions = baseline(train_bag, query_bag, heldout)

CPU times: user 40.9 s, sys: 420 ms, total: 41.4 s
Wall time: 45.9 s


In [212]:
def get_metrics(actuals, preds):
    abs_diff = {}
    square_diff = {}
    for point in preds.keys():
        diff = float(actuals[point]) - float(preds[point])
        abs_diff[point] = abs(diff)
        square_diff[point] = diff*diff
    MAE = np.array(list(abs_diff.values())).mean()
    MSE = np.array(list(square_diff.values())).mean()
    RMSE = MSE**.5
    return MAE, RMSE

In [213]:
actuals = heldout
preds = predictions
get_metrics(actuals, preds)

(0.62716647362889011, 0.68496917488601394)

In [203]:
def bag_folder(bag, folds = 4):
    percent = 1/folds
    users = np.array(list(bag.keys()))
    np.random.shuffle(users)
    cutoffs = np.array(range(folds))*percent*len(users)
    first_fold  = {"test_bag": {user : bag[user] for user in users[:int(cutoffs[1])]},
                   "train_bag": {user : bag[user] for user in users[int(cutoffs[1]):]}}
    folded = {1: first_fold}
    for i in range(2,folds):
        split = (int(cutoffs[i-1]),int(cutoffs[i]))
        test_slice = users[split[0]: split[1]]
        train_slice = []
        for user in users:
            if user not in test_slice:
                train_slice.append(user)
        folded[i] = {"test_bag": {user : bag[user] for user in test_slice},
                     "train_bag": {user : bag[user] for user in train_slice}}
    
    last_fold  = {"test_bag": {user : bag[user] for user in users[int(cutoffs[-1]):]},
                  "train_bag": {user : bag[user] for user in users[:int(cutoffs[-1])]}}
    folded[folds] = last_fold
    return folded

In [204]:
folded = bag_folder(playtime_bag, 5)

In [206]:
absolute, square = [], []
for i in folded:
    fold = folded[i]
    test_bag = fold["test_bag"]
    train_bag = fold["train_bag"]
    query_bag, heldout = leave_1_out(test_bag)
    predictions = baseline(train_bag, query_bag, heldout)
    actuals = heldout
    preds = predictions
    MAE, RMSE = get_metrics(actuals, preds)
    absolute.append(MAE)
    square.append(RMSE)


In [207]:
np.array(absolute).mean()

0.35956849481895781

In [208]:
np.array(square).mean()

0.44728913949529447