In [3]:
import numpy as np
import pandas as pd
from collections import Counter
import operator
import copy
import random
from sklearn import model_selection
raw_games = pd.read_csv("steam-200k.csv", names =["user_id", "game", "behavior", "value", "misc."])
playtime = raw_games[raw_games["behavior"] == "play" ]
purchased = raw_games[raw_games["behavior"] == "purchase" ]

In [36]:
def bag_of_games(data_set):
    user_ids = list(Counter(data_set["user_id"]).keys())
    bag = np.empty(len(user_ids), dict)
    user_bag = {}.fromkeys(map((lambda x: str(x)),user_ids), {})
    
    
    for row in data_set.itertuples():
        user_bag[str(row[1])][row[2]] = float(row[4])
        
    i = 0
    for user, gamelist in user_bag.items():
        factor  = 1/sum(gamelist.values())
        for game in gamelist.keys():
            gamelist[game] = factor*gamelist[game]
        
    return user_bag

In [37]:
playtime_bag = bag_of_games(playtime)

In [60]:
def bag_split(bag, test = 0.25):
    users = np.array(list(bag.keys()))
    np.random.shuffle(users)
    splitter = int(test*len(users))
    test_bag = {user : bag[user] for user in users[:splitter]}
    train_bag = {user : bag[user] for user in users[splitter:]}
    return train_bag, test_bag

In [61]:
train_bag, test_bag = bag_split(playtime_bag)

In [125]:
def leave_1_out(bag):
    copy_bag = copy.deepcopy(bag)
    removed = []
    for user in copy_bag:
        gamelist = copy_bag[user]
        games  = list(gamelist.keys())
        rand = int(len(games)*random.random())
        selected = games[rand]
        rating = gamelist[selected]
        removed.append([user, selected, gamelist.pop(selected)])
    else:
        return copy_bag, removed

In [130]:
query_bag, heldout = leave_1_out(test_bag)

In [151]:
def baseline(train_bag, query_bag, heldout, user_reg=25, game_reg=25):
    total_bag = copy.deepcopy(train_bag)
    total_bag.update(query_bag)
    n = 0
    total = 0
    for userid in train_bag:
        gamelist = train_bag[userid]
        n += len(list(gamelist.keys()))
        total += sum(gamelist.values())
    else:
        total_avg = total/n
        
    user_baseline = {}
    for userid in total_bag:
        user = total_bag[userid]
        user_baseline[userid] = (sum(user.values()) - len(user)*total_avg)/(len(user)+user_reg)
    
    predictions = [[]]*len(heldout)
    for i in range(len(heldout)):
        bu = user_baseline[heldout[i][0]]
        game = heldout[i][1]
        n = 0
        bi = 0
        for userid in train_bag:
            user = train_bag[userid]
            if game in user.keys():
                bi += user[game] - user_baseline[userid] - total_avg
                n += 1
        else:
            bi = bi/(n + game_reg)
        
        
        predictions[i] =[heldout[i][0], heldout[i][1], total_avg + bu + bi]
        
    return predictions

In [152]:
predictions = baseline(train_bag, query_bag, heldout)

In [218]:
def get_metrics(actuals, preds):
    diff =np.empty(len(preds), float)
    for n in range(len(preds)):
        diff[n] = float(actuals[n]) - float(preds[n])
    abs_diff = abs(diff)
    square_diff = diff*diff
    MAE = abs_diff.mean()
    MSE = square_diff.mean()
    RMSE = MSE**.5
    return MAE, RMSE

In [219]:
actuals = np.array(heldout)[:, 2]
preds = np.array(predictions)[:, 2]
get_metrics(actuals, preds)

(5.9038507431930445e-05, 5.9349979509493552e-05)

In [198]:
def bag_folder(bag, folds = 4):
    percent = 1/folds
    users = np.array(list(bag.keys()))
    np.random.shuffle(users)
    cutoffs = np.array(range(folds))*percent*len(users)
    first_fold  = {"test_bag": {user : bag[user] for user in users[:int(cutoffs[1])]},
                   "train_bag": {user : bag[user] for user in users[int(cutoffs[1]):]}}
    folded = {1: first_fold}
    for i in range(2,folds):
        split = (int(cutoffs[i-1]),int(cutoffs[i]))
        test_slice = users[split[0]: split[1]]
        train_slice = []
        for user in users:
            if user not in test_slice:
                train_slice.append(user)
        folded[i] = {"test_bag": {user : bag[user] for user in test_slice},
                     "train_bag": {user : bag[user] for user in train_slice}}
    
    last_fold  = {"test_bag": {user : bag[user] for user in users[int(cutoffs[-1]):]},
                  "train_bag": {user : bag[user] for user in users[:int(cutoffs[-1])]}}
    folded[folds] = last_fold
    return folded

In [208]:
folded = bag_folder(playtime_bag, 5)

In [220]:
absolute, square = [], []
for i in folded:
    fold = folded[i]
    test_bag = fold["test_bag"]
    train_bag = fold["train_bag"]
    query_bag, heldout = leave_1_out(test_bag)
    predictions = baseline(train_bag, query_bag, heldout)
    actuals = np.array(heldout)[:, 2]
    preds = np.array(predictions)[:, 2]
    MAE, RMSE = get_metrics(actuals, preds)
    absolute.append(MAE)
    square.append(RMSE)


In [223]:
np.array(absolute).mean()

3.923294797901406e-05

In [224]:
np.array(square).mean()

4.0236311005247149e-05