In [1]:
import gzip
from collections import defaultdict
import math
import scipy.optimize
from sklearn import svm
import numpy
import string
import random
import string
from sklearn import linear_model
import statistics
import tensorflow as tf






In [2]:
def assertFloat(x):
    """Assert that ``x`` converts to a built-in ``float``.

    Errors raised by ``float(x)`` itself (e.g. ``ValueError`` for a
    non-numeric string) propagate to the caller.
    """
    assert float is type(float(x))

def assertFloatList(items, N):
    """Assert that ``items`` has exactly ``N`` entries, each convertible to ``float``.

    A length mismatch raises ``AssertionError``; an entry that ``float()``
    rejects raises whatever ``float()`` raises.
    """
    assert N == len(items)
    assert all(float is type(float(entry)) for entry in items)

In [3]:
def readGz(path):
    """Yield one evaluated Python object per line of a gzip-compressed text file.

    Args:
        path: location of the ``.gz`` file, opened in text mode.

    Yields:
        The result of ``eval`` applied to each line, in file order.
    """
    # The context manager closes the handle once the generator is exhausted
    # or closed; previously the file object was never closed explicitly.
    with gzip.open(path, 'rt') as f:
        for l in f:
            # NOTE(review): eval() executes arbitrary code found in the file.
            # Only use this on trusted data; ast.literal_eval would be the
            # safer choice if every line is a plain literal -- confirm first.
            yield eval(l)

In [4]:
def readJSON(path):
    """Yield ``(userID, gameID, record)`` triples from a gzip-compressed file.

    Each line after the first is evaluated into a dict that must contain the
    keys ``'userID'`` and ``'gameID'``.

    Args:
        path: location of the ``.gz`` file, read as UTF-8 text.

    Yields:
        ``(d['userID'], d['gameID'], d)`` for every evaluated line ``d``.
    """
    # The context manager closes the handle once the generator is exhausted
    # or closed; previously the file object was never closed explicitly.
    with gzip.open(path, 'rt', encoding="utf-8") as f:
        # The first line is discarded unconditionally.
        # NOTE(review): presumably a header/placeholder line -- confirm,
        # otherwise the first record is silently dropped.
        f.readline()
        for l in f:
            # NOTE(review): eval() executes arbitrary code found in the file;
            # only use on trusted data.
            d = eval(l)
            yield d['userID'], d['gameID'], d

In [5]:
# Lookup tables filled in one pass over the training file:
#   allHours      - every (userID, gameID, record) triple, in file order
#   usersPerGame  - gameID -> set of userIDs seen with that game
#   gamesPerUser  - userID -> set of gameIDs seen with that user
#   games         - set of every gameID seen
allHours = []
usersPerGame = defaultdict(set)
gamesPerUser = defaultdict(set)
games = set()
for l in readJSON("train.json.gz"):
    allHours.append(l)
    games.add(l[1])
    usersPerGame[l[1]].add(l[0])
    gamesPerUser[l[0]].add(l[1])

In [6]:
# Per-game interaction counts plus user/game look-up sets, all built from
# every record in allHours in a single pass.
gameCountc = defaultdict(int)
totalPlayedc = 0
usersPerGamec = defaultdict(set)
gamesPerUserc = defaultdict(set)

#REFACTOR THIS TO INCLUDE THE ENTIRE DATASET
for u,g,_ in allHours:
    gameCountc[g] += 1
    totalPlayedc += 1
    gamesPerUserc[u].add(g)
    usersPerGamec[g].add(u)

# (count, game) pairs ordered from most to least frequent; the pairs are
# unique (one per game), so a descending sort equals sort-then-reverse.
mostPopularc = sorted(
    ((count, game) for game, count in gameCountc.items()),
    reverse=True,
)
mostPopularc_nums = [pair[0] for pair in mostPopularc]

def Jaccard(s1, s2):
    """Return |s1 & s2| / |s1 | s2|, or 0 when the union is empty."""
    shared = len(s1.intersection(s2))
    combined = len(s1.union(s2))
    # Guard against dividing by zero when both sets are empty
    return shared / combined if combined != 0 else 0


def get_popularity_percentile_c(g):
    """Return ``1 - rank/len(mostPopularc)`` for game ``g``.

    ``rank`` is the zero-based position of ``g`` in ``mostPopularc`` (a list
    of ``(count, game)`` pairs), so the first entry maps to 1. Games that are
    not in the list map to 0.
    """
    for rank, entry in enumerate(mostPopularc):
        if entry[1] == g:
            return 1 - (rank / len(mostPopularc))
    #case where game does not appear, TODO
    return 0

def mean_item_jaccard_c(u,g):
    """Mean Jaccard similarity between game ``g`` and the other games of user ``u``.

    For every game ``gc`` in ``gamesPerUserc[u]`` other than ``g``, the
    similarity is ``Jaccard(usersPerGamec[g], usersPerGamec[gc])``.

    Returns:
        The arithmetic mean of those similarities, or ``0`` when the user has
        no other games. The empty case previously returned ``False``; ``0``
        compares equal to it but is numeric, matching the ``default=0`` used
        by ``max_item_jaccard_c``.
    """
    jlist = [Jaccard(usersPerGamec[g], usersPerGamec[gc]) for gc in list(gamesPerUserc[u]) if g != gc]
    if not jlist:
        return 0
    return sum(jlist) / len(jlist)

def max_item_jaccard_c(u,g):
    """Largest Jaccard similarity between game ``g`` and the other games of user ``u``.

    Returns 0 when ``gamesPerUserc[u]`` holds no game other than ``g``.
    """
    other_games = list(gamesPerUserc[u])
    similarities = (
        Jaccard(usersPerGamec[g], usersPerGamec[gc])
        for gc in other_games
        if gc != g
    )
    return max(similarities, default=0)



In [7]:
def read_pairs_Played(path="pairs_Played.csv"):
    """Read ``userID,gameID`` pairs from a CSV file.

    Args:
        path: CSV file to read; defaults to ``"pairs_Played.csv"``, the path
            that used to be hard-coded.

    Returns:
        A list of ``(userID, gameID, 0)`` tuples, one per data line. Lines
        starting with ``"userID"`` (the header) and blank lines are skipped.

    Raises:
        ValueError: if a data line does not split into exactly two fields.
    """
    test = []
    # `with` guarantees the file is closed; the stray debug print of the
    # (always empty) list has been removed.
    with open(path) as pairs_f:
        for l in pairs_f:
            if l.startswith("userID"):
                continue
            l = l.strip()
            if not l:
                # a trailing blank line would otherwise break the unpacking
                continue
            u,g = l.split(',')
            test.append((u,g,0))
    return test

def write_pairs_Played(predictions, path="predictions_Played.csv"):
    """Write ``(userID, gameID, prediction)`` rows to a CSV file.

    Args:
        predictions: iterable of ``(u, g, n)`` where ``u`` and ``g`` are
            strings and ``n`` is anything ``str()`` can render.
        path: output file; defaults to ``"predictions_Played.csv"``, the path
            that used to be hard-coded.
    """
    # `with` closes (and flushes) the file even if a row fails to format
    with open(path, 'w') as predictions_f:
        predictions_f.write("userID,gameID,prediction\n")
        for u,g,n in predictions:
            predictions_f.write(u + ',' + g + ',' + str(n) + '\n')
    

In [8]:
# Dense integer ids for users and items, assigned in order of first
# appearance in allHours, plus per-user / per-item interaction lists.
userIDs = {}
itemIDs = {}
interactions = []

# Not referenced by any of the cells visible in this notebook
playedSet = set()
userSet = set()
gameSet = set()

for u,i,r in allHours: #change to trainHours if necess
    # setdefault only inserts when the key is new, so ids stay stable
    userIDs.setdefault(u, len(userIDs))
    itemIDs.setdefault(i, len(itemIDs))
    interactions.append((u,i,r))

nUsers = len(userIDs)
nItems = len(itemIDs)

itemsPerUser = defaultdict(list)
usersPerItem = defaultdict(list)
for u,i,r in interactions:
    usersPerItem[i].append(u)
    itemsPerUser[u].append(i)

items = list(itemIDs)

In [28]:
class BPRbatch(tf.keras.Model):
    """Latent-factor model trained with a pairwise (BPR-style) ranking loss.

    The compatibility of user ``u`` with item ``i`` is
    ``betaI[i] + dot(gammaU[u], gammaI[i])``. ``call`` returns the mean
    negative log-sigmoid of the score difference between sampled positive
    and negative items.

    Variable sizes are taken from the module-level ``userIDs`` and
    ``itemIDs`` mappings at construction time.
    """

    def __init__(self, K, lambI, lambG):
        """
        Args:
            K: number of latent dimensions per user/item.
            lambI: L2 coefficient applied to ``betaI``.
            lambG: L2 coefficient applied to ``gammaU`` and ``gammaI``.
        """
        super().__init__()
        # Initialize variables
        self.betaI = tf.Variable(tf.random.normal([len(itemIDs)],stddev=0.001))
        self.gammaU = tf.Variable(tf.random.normal([len(userIDs),K],stddev=0.001))
        self.gammaI = tf.Variable(tf.random.normal([len(itemIDs),K],stddev=0.001))
        # Regularization coefficient
        self.lambI = lambI
        self.lambG = lambG

    # Prediction for a single instance
    def predict(self, u, i):
        """Score one (user index, item index) pair."""
        p = self.betaI[i] + tf.tensordot(self.gammaU[u], self.gammaI[i], 1)
        return p

    # Regularizer
    def reg(self):
        """L2 penalty: ``lambI`` on the item biases, ``lambG`` on both factor matrices."""
        bias_penalty = self.lambI * tf.nn.l2_loss(self.betaI)
        user_penalty = self.lambG * tf.nn.l2_loss(self.gammaU)
        item_penalty = self.lambG * tf.nn.l2_loss(self.gammaI)
        return bias_penalty + user_penalty + item_penalty

    def score(self, sampleU, sampleI):
        """Score a batch of (user index, item index) pairs given as parallel sequences."""
        u = tf.convert_to_tensor(sampleU, dtype=tf.int32)
        i = tf.convert_to_tensor(sampleI, dtype=tf.int32)
        beta_i = tf.nn.embedding_lookup(self.betaI, i)
        gamma_u = tf.nn.embedding_lookup(self.gammaU, u)
        gamma_i = tf.nn.embedding_lookup(self.gammaI, i)
        x_ui = beta_i + tf.reduce_sum(tf.multiply(gamma_u, gamma_i), 1)
        return x_ui

    def call(self, sampleU, sampleI, sampleJ):
        """Mean pairwise loss for users ``sampleU``, positives ``sampleI``, negatives ``sampleJ``."""
        x_ui = self.score(sampleU, sampleI)
        x_uj = self.score(sampleU, sampleJ)
        # log_sigmoid stays finite where log(sigmoid(x)) would hit log(0)
        # for strongly negative differences.
        return -tf.reduce_mean(tf.math.log_sigmoid(x_ui - x_uj))



def trainingStepBPR(model, interactions, Nsamples=150000):
    """Run one optimisation step on a freshly sampled batch of triples.

    For each sample a positive ``(u, i)`` is drawn uniformly from
    ``interactions`` and a negative item ``j`` is redrawn from the
    module-level ``items`` until it is not in ``itemsPerUser[u]``.

    Args:
        model: callable as ``model(sampleU, sampleI, sampleJ)`` and exposing
            ``reg()`` and ``trainable_variables``.
        interactions: sequence of ``(user, item, _)`` triples.
        Nsamples: number of triples per step (default 150000, the value
            that used to be hard-coded).

    Returns:
        The regularised loss for this batch as a NumPy value.

    Uses the module-level ``optimizer``, ``items``, ``itemsPerUser``,
    ``userIDs`` and ``itemIDs``.
    """
    # Sampling is plain Python, so it does not need to run under the tape.
    sampleU, sampleI, sampleJ = [], [], []
    for _ in range(Nsamples):
        u,i,_ = random.choice(interactions) # positive sample
        j = random.choice(items) # negative sample
        # NOTE(review): itemsPerUser maps to lists, so this membership test
        # is linear in the user's history; it also never terminates for a
        # user who has interacted with every item.
        while j in itemsPerUser[u]:
            j = random.choice(items)
        sampleU.append(userIDs[u])
        sampleI.append(itemIDs[i])
        sampleJ.append(itemIDs[j])

    with tf.GradientTape() as tape:
        loss = model(sampleU,sampleI,sampleJ)
        loss += model.reg()
    gradients = tape.gradient(loss, model.trainable_variables)
    # Variables that did not take part in the loss have no gradient; skip them
    optimizer.apply_gradients((grad, var) for
                              (grad, var) in zip(gradients, model.trainable_variables)
                              if grad is not None)
    return loss.numpy()

# Train a 5-factor BPR model for 800 steps with Adam; the learning rate is
# lowered from 0.1 to 0.00001 when step index 400 is reached.
optimizer = tf.keras.optimizers.Adam(0.1)
modelBPR = BPRbatch(5, 0.00001, 0.00001)
for i in range(800):
    if i == 400:
        optimizer.learning_rate = 0.00001
    obj = trainingStepBPR(modelBPR, interactions)

    # report every tenth step
    if (i + 1) % 10 == 0:
        print(f"iteration {i + 1}, objective = {obj!s}")

iteration 10, objective = 0.50948447
iteration 20, objective = 0.47556764
iteration 30, objective = 0.46087337
iteration 40, objective = 0.4517896
iteration 50, objective = 0.44577694
iteration 60, objective = 0.442838
iteration 70, objective = 0.44014913
iteration 80, objective = 0.43844894
iteration 90, objective = 0.4385128
iteration 100, objective = 0.435513
iteration 110, objective = 0.43550178
iteration 120, objective = 0.43578362
iteration 130, objective = 0.43905908
iteration 140, objective = 0.4399476
iteration 150, objective = 0.43526947
iteration 160, objective = 0.44063962
iteration 170, objective = 0.4359154
iteration 180, objective = 0.43638206
iteration 190, objective = 0.43580627
iteration 200, objective = 0.4378103
iteration 210, objective = 0.43456268
iteration 220, objective = 0.43524468
iteration 230, objective = 0.43359262
iteration 240, objective = 0.43716258
iteration 250, objective = 0.4337182
iteration 260, objective = 0.43599638
iteration 270, objective = 0.43

In [29]:
import copy

# Build a balanced training set for the logistic-regression stage: for each
# of 50000 shuffled interactions keep the observed (user, game) pair with
# label 1 and add one game the user has not played with label 0.

# A shallow copy is enough here: the list is only shuffled and read, the
# records themselves are never modified.
all_copy = copy.copy(allHours)
random.shuffle(all_copy)
logr_training = []

# Materialise the candidate pool once instead of rebuilding the tuple on
# every draw of the rejection loop.
game_pool = tuple(games)

for u,g,_ in all_copy[:50000]:
    logr_training.append((u,g,1))
    # rejection-sample a game this user has no record for
    while True:
        zg = random.choice(game_pool)
        if not zg in gamesPerUser[u]:
            break
    logr_training.append((u,zg,0))


In [30]:
def feat_bprp(u,g,modelBPR):
    """Feature vector ``[1, BPR score, popularity percentile]`` for a (user, game) pair.

    Args:
        u: user key looked up in ``userIDs``.
        g: game key looked up in ``itemIDs``.
        modelBPR: model exposing ``predict(user_index, item_index)``.

    Returns:
        A three-element list: constant offset, compatibility score and
        ``get_popularity_percentile_c(g)``. When the user or game has no
        index, the score falls back to the constant 0.6.
    """
    # Only the id look-ups are expected to raise KeyError, so only they sit
    # inside the try block.
    try:
        uid, gid = userIDs[u], itemIDs[g]
    except KeyError:
        # NOTE(review): the submission cell uses 0.22 for the same fallback;
        # confirm which constant is intended.
        score = 0.6
    else:
        score = modelBPR.predict(uid, gid).numpy()
    # The third column is the popularity percentile in both cases. The
    # fallback used to put max_item_jaccard_c(u, g) here, which gave one
    # regression weight two different meanings.
    return [1, score, get_popularity_percentile_c(g)]


def pipeline(reg, XT, XP, training_set):
    """Fit a class-balanced logistic regression and predict labels for ``XP``.

    Args:
        reg: value passed as ``C`` to ``LogisticRegression``.
        XT: training feature rows, aligned with ``training_set``.
        XP: feature rows to predict.
        training_set: iterable of 3-tuples whose last element is the label.

    Returns:
        The predictions of the fitted model on ``XP``.
    """
    labels = [label for _user, _game, label in training_set]
    classifier = linear_model.LogisticRegression(C=reg, class_weight='balanced')
    classifier.fit(XT, labels)
    return classifier.predict(XP)



In [31]:
# Feature rows and labels for the logistic-regression stage, in the same
# order as logr_training.
X_bprp = []
y_bprp = []
for u, g, label in logr_training:
    X_bprp.append(feat_bprp(u, g, modelBPR))
    y_bprp.append(label)


In [32]:
# Fit the final classifier on the BPR features and label every test pair.
test_set = read_pairs_Played()

mod = linear_model.LogisticRegression(C=0.0001, class_weight='balanced')
mod.fit(X_bprp, y_bprp)

test_predictions = []
for u,g,_ in test_set:
    # Pairs whose user or game has no index get a constant score instead
    try:
        score = modelBPR.predict(userIDs[u], itemIDs[g]).numpy()
    except KeyError:
        score = 0.22
    f = [1, score, get_popularity_percentile_c(g)]
    label = mod.predict([f])[0]
    test_predictions.append((u, g, label))

write_pairs_Played(test_predictions)


[]


In [14]:
##################################################
# Hours played prediction                        #
##################################################

In [15]:

# Mean of 'hours_transformed' over every record
globalAverage = sum(r["hours_transformed"] for u,g,r in allHours) / len(allHours)
# (user, game, hours_transformed) triples, same order as allHours
hours = [(u, g, r['hours_transformed']) for u,g,r in allHours]
# (user, game) -> hours_transformed; a later duplicate pair overwrites an earlier one
ug_review = {(u, g): h for u, g, h in hours}

def iterate2(lambU,lambI,betaU,betaI,alpha,hoursList,hyperp):
    """Fit ``r ~ alpha + betaU[u] + betaI[g]`` by repeated closed-form updates.

    Each pass recomputes, in order: ``alpha`` as the mean residual over all
    rows; every user bias from that user's residuals divided by
    ``lambU + n_items``; every item bias from that item's residuals (using
    the new user biases) divided by ``lambI + n_users``.

    All statistics are derived from ``hoursList`` itself. Earlier the bias
    updates read the module-level ``gamesPerUser`` / ``usersPerGame`` /
    ``ug_review`` tables, so fitting on a subset silently used every record.
    When ``hoursList`` holds the full data the result is the same up to
    floating-point summation order.

    Args:
        lambU: regularisation strength for user biases.
        lambI: regularisation strength for item biases.
        betaU: initial user biases (mapping; missing users count as 0).
        betaI: initial item biases (mapping; missing items count as 0).
        alpha: initial offset.
        hoursList: sequence of ``(user, game, value)`` triples.
        hyperp: number of passes to run.

    Returns:
        ``(alpha, betaU, betaI)``. After at least one pass the bias mappings
        are new ``defaultdict`` objects that yield 0 for unseen keys. With an
        empty ``hoursList`` or ``hyperp < 1`` the inputs are returned as given.
    """
    if not hoursList:
        # nothing to fit; also avoids dividing by len(hoursList) == 0
        return alpha, betaU, betaI

    # Index the rows once. A repeated (user, game) pair keeps its last value
    # and counts once in the bias updates, matching how a dict keyed on the
    # pair and per-user/per-game sets would behave.
    rating = {}
    itemsOfUser = defaultdict(list)
    usersOfItem = defaultdict(list)
    for u,g,r in hoursList:
        if (u,g) not in rating:
            itemsOfUser[u].append(g)
            usersOfItem[g].append(u)
        rating[(u,g)] = r

    for _ in range(int(hyperp)):
        # offset: mean residual over every row (duplicates included)
        alpha = sum(r - (betaU.get(u, 0) + betaI.get(g, 0)) for u,g,r in hoursList) / len(hoursList)

        # user biases use the item biases from the previous pass
        betaU_t = defaultdict(lambda: 0)
        for u, played in itemsOfUser.items():
            resid = sum(rating[u,g] - (alpha + betaI.get(g, 0)) for g in played)
            betaU_t[u] = resid / (lambU + len(played))
        betaU = betaU_t

        # item biases use the user biases that were just updated
        betaI_t = defaultdict(lambda: 0)
        for g, players in usersOfItem.items():
            resid = sum(rating[u,g] - (alpha + betaU.get(u, 0)) for u in players)
            betaI_t[g] = resid / (lambI + len(players))
        betaI = betaI_t

    return alpha, betaU, betaI

def MSE(y, ypred):
    """Mean squared difference between paired entries of ``ypred`` and ``y``.

    Pairs are formed with ``zip``, so extra entries in the longer sequence
    are ignored. Empty input raises ``ZeroDivisionError``.
    """
    squared_errors = []
    for predicted, actual in zip(ypred, y):
        squared_errors.append((predicted - actual) ** 2)
    return sum(squared_errors) / len(squared_errors)

In [16]:
# Start from zero biases and the global mean, then run 400 update passes
# with lambU=8 and lambI=3.
betaU = defaultdict(int)
betaI = defaultdict(int)
alpha = globalAverage
alpha, betaU, betaI = iterate2(8, 3, betaU, betaI, alpha, hours, 400)

# Error on the same `hours` records the model was fitted on
fitted = [alpha + betaU[u] + betaI[g] for u,g,_ in hours]
observed = [r for _,_,r in hours]
validMSE = MSE(fitted, observed)
print(validMSE)


2.7905799620497387


In [17]:
def read_pairs_Hours(path="pairs_Hours.csv"):
    """Read ``userID,gameID`` pairs from a CSV file.

    Args:
        path: CSV file to read; defaults to ``"pairs_Hours.csv"``, the path
            that used to be hard-coded.

    Returns:
        A list of ``(userID, gameID, 0)`` tuples, one per data line. Lines
        starting with ``"userID"`` (the header) and blank lines are skipped.

    Raises:
        ValueError: if a data line does not split into exactly two fields.
    """
    test = []
    # `with` guarantees the file is closed; the stray debug print of the
    # (always empty) list has been removed.
    with open(path) as pairs_f:
        for l in pairs_f:
            if l.startswith("userID"):
                continue
            l = l.strip()
            if not l:
                # a trailing blank line would otherwise break the unpacking
                continue
            u,g = l.split(',')
            test.append((u,g,0))
    return test

def write_pairs_Hours(predictions, path="predictions_Hours.csv"):
    """Write ``(userID, gameID, prediction)`` rows to a CSV file.

    Args:
        predictions: iterable of ``(u, g, n)`` where ``u`` and ``g`` are
            strings and ``n`` is anything ``str()`` can render.
        path: output file; defaults to ``"predictions_Hours.csv"``, the path
            that used to be hard-coded.
    """
    # `with` closes (and flushes) the file even if a row fails to format
    with open(path, 'w') as predictions_f:
        predictions_f.write("userID,gameID,prediction\n")
        for u,g,n in predictions:
            predictions_f.write(u + ',' + g + ',' + str(n) + '\n')
    

In [18]:
# Predict alpha + betaU[u] + betaI[g] for every test pair and write the file
test = read_pairs_Hours()
pred_would = []
for u, g, _ in test:
    pred_would.append((u, g, (alpha + betaU[u] + betaI[g])))
write_pairs_Hours(pred_would)

[]
