In [5]:
import numpy as np
import pandas as pd
from collections import Counter
import operator
import copy
import random
from sklearn import model_selection
raw_games = pd.read_csv("steam-200k.csv", names =["user_id", "game", "behavior", "value", "misc."])
playtime = raw_games[raw_games["behavior"] == "play" ]
purchased = raw_games[raw_games["behavior"] == "purchase" ]

In [6]:
def bag_of_games(data_set):
    user_ids = list(Counter(data_set["user_id"]).keys())
    bag = np.empty(len(user_ids), dict)
    user_bag = {}.fromkeys(user_ids, {})
    
    
    for row in data_set.itertuples():
        user_bag[row[1]][row[2]] = row[4]
        
    i = 0
    for key, value in user_bag.items():
        factor  = 1/sum(value.values())
        for key in value.keys():
            value[key] = factor*value[key]
        bag[i] = value
        i+=1
    return bag

In [7]:
playtime_bag = bag_of_games(playtime)

In [8]:
def bag_split(bag, test = 0.25):
    np.random.shuffle(bag)
    splitter = int(test*len(bag))
    test_bag = bag[:splitter]
    train_bag = bag[splitter:]
    return train_bag, test_bag

In [9]:
train_bag, test_bag = bag_split(playtime_bag)

In [10]:
def leave_p_out(bag, percent = .1, make_new_users = True):
    copy_bag = copy.deepcopy(bag)
    deleted = int(percent*len(bag))
    i = 0
    removed = []
    while i != deleted:
        rand=int(len(bag)*random.random())
        if len(copy_bag[rand]) == 1*(not make_new_users) :
            continue
        else:
            selected = copy_bag[rand]
            keys = list(selected.keys())
            deleted_key = keys[int(random.random()*len(keys))]
            removed.append([rand, deleted_key, copy_bag[rand].pop(deleted_key)])
            i+=1
    else:
        return copy_bag, removed

In [11]:
query_bag, missing = leave_p_out(test_bag, .1)

In [12]:
def baseline(train_bag, query_bag, heldout, user_reg=25, game_reg=25):
    stack_query_bag = copy.deepcopy(query_bag)
    total_bag = np.hstack((train_bag, stack_query_bag))
    n = 0
    total = 0
    for user in total_bag:
        n += len(list(user.keys()))
        total += sum(user.values())
    else:
        total_avg = total/n
        
    user_baseline = {}
    games = []
    for userid in range(len(total_bag)):
        user = total_bag[userid]
        user_baseline[userid] = (sum(user.values())/(len(user)+user_reg)) - total_avg
        games.extend(list(user.keys()))
    print()
    games = np.unique(games)
    games_baseline = {}.fromkeys(games, 0)
    for game in games:
        n = 0
        for userid in range(len(total_bag)):
            user = total_bag[userid]
            if game in user.keys():
                games_baseline[game] += user[game] - user_baseline[userid] - total_avg
                n += 1
        else:
            games_baseline[game] = games_baseline[game]/(n + game_reg)
    
    predictions = [[]]*len(heldout)
    for i in range(len(heldout)):
        bu = user_baseline[heldout[i][0]]
        bi = games_baseline[heldout[i][1]]
        predictions[i] =[heldout[i][0], heldout[i][1], total_avg + bu + bi]
        
    return predictions

In [13]:
predictions = baseline(train_bag, query_bag, missing)




In [14]:
def get_metrics(actuals, preds):
    diff =np.empty(len(preds), float)
    for n in range(len(preds)):
        diff[n] = float(actuals[n]) - float(preds[n])
    abs_diff = abs(diff)
    square_diff = diff*diff
    MAE = abs_diff.mean()
    MSE = square_diff.mean()
    return MAE, MSE

In [15]:
actuals = np.array(missing)[:, 2]
preds = np.array(predictions)[:, 2]
get_metrics(actuals, preds)

(8.5451859927558223e-07, 6.7235543146411918e-12)

In [16]:
query_bag1, missing1 = leave_p_out(test_bag, .1, False)
predictions1 = baseline(train_bag, query_bag1, missing1)
actuals1 = np.array(missing1)[:, 2]
preds1 = np.array(predictions1)[:, 2]
get_metrics(actuals1, preds1)




(9.0141362878364116e-07, 6.7146952934019104e-12)