In [3]:
from google.colab import drive
drive.mount('/content/gdrive/')

Drive already mounted at /content/gdrive/; to attempt to forcibly remount, call drive.mount("/content/gdrive/", force_remount=True).


In [4]:
import numpy as np
import scipy.sparse as sp
import tensorflow as tf
from tensorflow import keras
from keras import backend as K
from keras import initializers
from keras.models import Sequential, Model, load_model, save_model
from keras.layers.core import Dense, Lambda, Activation
from keras.layers import Embedding, Input, Dense, Multiply, Reshape, Flatten, Concatenate
from keras.optimizers import Adam
from keras.regularizers import l2
import pandas as pd

import json, sys, random, os, datetime, math
print(tf.__version__)



2.3.0


In [33]:
from_json_df = pd.read_json('/content/gdrive/My Drive/NCF_video_bundle/video_bundle_data2.json')

In [None]:
from_json_df.head(5)

In [36]:
# traindata = pd.read_csv("/content/gdrive/My Drive/NCF_video_bundle/video_bundle_data.csv") 
traindata = from_json_df[['bundle_name','bundle_id','items']]

In [None]:
traindata.head()

In [39]:
complete_array = []
for index, row in traindata.iterrows():
  for item in row['items']:
    complete_array.append([item['item_id'], item['item_name'], item['genre'], row['bundle_id']])
df = pd.DataFrame(complete_array, columns=['item_id', 'item_name', 'genre', 'bundle_id'])
print(df.shape)
df.head()

(3525, 4)


Unnamed: 0,item_id,item_name,genre,bundle_id
0,326950,Sword of Asumi,"Adventure, Indie, RPG",450
1,331490,Sword of Asumi - Soundtrack,"Adventure, Indie, RPG",450
2,331491,Sword of Asumi - Graphic Novel,"Adventure, Indie, RPG",450
3,331492,Sword of Asumi - Character Creator,"Adventure, Indie, RPG",450
4,348540,Divine Slice of Life,"Adventure, Casual, Indie",450


In [55]:
item_ids = df["item_id"].unique().tolist()
len(item_ids)
item2item_encoded = {x: i for i, x in enumerate(item_ids)}
item_encoded2item = {i: x for i, x in enumerate(item_ids)}
df['itemId'] = df["item_id"].map(item2item_encoded)

df.head()

Unnamed: 0,item_id,item_name,genre,bundle_id,itemId
0,326950,Sword of Asumi,"Adventure, Indie, RPG",450,0
1,331490,Sword of Asumi - Soundtrack,"Adventure, Indie, RPG",450,1
2,331491,Sword of Asumi - Graphic Novel,"Adventure, Indie, RPG",450,2
3,331492,Sword of Asumi - Character Creator,"Adventure, Indie, RPG",450,3
4,348540,Divine Slice of Life,"Adventure, Casual, Indie",450,4


In [56]:
bundle_ids = df["bundle_id"].unique().tolist()
bundle2bundle_encoded = {x: i for i, x in enumerate(bundle_ids)}
bundle_encoded2bundle = {i: x for i, x in enumerate(bundle_ids)}
df["bundleId"] = df["bundle_id"].map(bundle2bundle_encoded)
df.head()

Unnamed: 0,item_id,item_name,genre,bundle_id,itemId,bundleId
0,326950,Sword of Asumi,"Adventure, Indie, RPG",450,0,0
1,331490,Sword of Asumi - Soundtrack,"Adventure, Indie, RPG",450,1,0
2,331491,Sword of Asumi - Graphic Novel,"Adventure, Indie, RPG",450,2,0
3,331492,Sword of Asumi - Character Creator,"Adventure, Indie, RPG",450,3,0
4,348540,Divine Slice of Life,"Adventure, Casual, Indie",450,4,0


In [57]:
# split training and challenge data
msk = np.random.rand(len(df)) < 0.8
train = df[msk]
challenge = df[~msk]
train.head()

Unnamed: 0,item_id,item_name,genre,bundle_id,itemId,bundleId
0,326950,Sword of Asumi,"Adventure, Indie, RPG",450,0,0
1,331490,Sword of Asumi - Soundtrack,"Adventure, Indie, RPG",450,1,0
3,331492,Sword of Asumi - Character Creator,"Adventure, Indie, RPG",450,3,0
4,348540,Divine Slice of Life,"Adventure, Casual, Indie",450,4,0
5,352010,Beach Bounce,"Adventure, Casual, Indie",450,5,0


In [58]:
challenge.head()

Unnamed: 0,item_id,item_name,genre,bundle_id,itemId,bundleId
2,331491,Sword of Asumi - Graphic Novel,"Adventure, Indie, RPG",450,2,0
9,408770,Highschool Possession,"Adventure, Indie",450,9,0
13,396620,Quantum Flux - Soundtrack,"Action, Indie",450,13,0
28,528381,Naruto Shippuden Uncut: The Tailed Beast vs The Tailless Tailed Beast,,1474,28,2
38,528391,Naruto Shippuden Uncut: Infiltrator,,1474,38,2


In [60]:
# %%time
# save data in dok matrix (optimized sparse matrix object)
# create a sparse playlistid x trackindex matrix
# if a playlistid i has song j, mat[i,j]=1
mat = sp.dok_matrix((train.shape[0], len(df['itemId'].unique())), dtype=np.float32)
for bundle_id, item_id in zip(train['bundleId'], train['itemId']):
    # print(f'{bundle_id}, {item_id}')
    # print(mat.shape)
    mat[bundle_id, item_id] = 1.0
# sp.save_npz('steam_train_matrix.npz', mat)

In [70]:
def get_model(num_users, num_items, latent_dim=8, dense_layers=[64, 32, 16, 8],
              reg_layers=[0, 0, 0, 0], reg_mf=0):

    # input layer
    input_user = Input(shape=(1,), dtype='int32', name='user_input')
    input_item = Input(shape=(1,), dtype='int32', name='item_input')

    print(reg_mf)
    
    # embedding layer
    mf_user_embedding = Embedding(input_dim=num_users, output_dim=latent_dim,
                        name='mf_user_embedding',
                        embeddings_initializer='RandomNormal',
                        embeddings_regularizer=l2(reg_mf), input_length=1)
    mf_item_embedding = Embedding(input_dim=num_items, output_dim=latent_dim,
                        name='mf_item_embedding',
                        embeddings_initializer='RandomNormal',
                        embeddings_regularizer=l2(reg_mf), input_length=1)
    mlp_user_embedding = Embedding(input_dim=num_users, output_dim=int(dense_layers[0]/2),
                         name='mlp_user_embedding',
                         embeddings_initializer='RandomNormal',
                         embeddings_regularizer=l2(reg_layers[0]), 
                         input_length=1)
    mlp_item_embedding = Embedding(input_dim=num_items, output_dim=int(dense_layers[0]/2),
                         name='mlp_item_embedding',
                         embeddings_initializer='RandomNormal',
                         embeddings_regularizer=l2(reg_layers[0]), 
                         input_length=1)

    # MF latent vector
    mf_user_latent = Flatten()(mf_user_embedding(input_user))
    mf_item_latent = Flatten()(mf_item_embedding(input_item))
    mf_cat_latent = Multiply()([mf_user_latent, mf_item_latent])

    # MLP latent vector
    mlp_user_latent = Flatten()(mlp_user_embedding(input_user))
    mlp_item_latent = Flatten()(mlp_item_embedding(input_item))
    mlp_cat_latent = Concatenate()([mlp_user_latent, mlp_item_latent])
    
    mlp_vector = mlp_cat_latent
    
    # build dense layer for model
    for i in range(1,len(dense_layers)):
        layer = Dense(dense_layers[i],
                      activity_regularizer=l2(reg_layers[i]),
                      activation='relu',
                      name='layer%d' % i)
        mlp_vector = layer(mlp_vector)

    predict_layer = Concatenate()([mf_cat_latent, mlp_vector])
    result = Dense(1, activation='sigmoid', 
                   kernel_initializer='lecun_uniform',name='result')

    model = Model(inputs=[input_user,input_item], outputs=result(predict_layer))

    return model

# get the training samples
def get_train_samples(train_mat, num_negatives):
    user_input, item_input, labels = [], [], []
    num_user, num_item = train_mat.shape
    for (u, i) in train_mat.keys():
        user_input.append(u)
        item_input.append(i)
        labels.append(1)
        # negative instances
        for t in range(num_negatives):
            j = np.random.randint(num_item)
            while (u, j) in train_mat.keys():
                j = np.random.randint(num_item)
            user_input.append(u)
            item_input.append(j)
            labels.append(0)
    return user_input, item_input, labels

# hyperparameters
loaded = True
verbose = 1
epochs = 15   
batch_size = 256
latent_dim = 8
dense_layers = [64, 32, 16, 8]
reg_layers = [0, 0, 0, 0]
reg_mf = [0]
num_negatives = 4
learning_rate = 0.001
learner = 'adam'
dataset = 'steam_video'

# loading data
if loaded:
    train_mat = mat
else:
    train_mat = sp.load_npz('steam_train_matrix.npz')
    
num_users, num_items = train_mat.shape
print('Done loading data!')

Done loading data!


In [71]:

# get model
model = get_model(num_users, num_items, latent_dim, dense_layers, reg_layers)
model.compile(optimizer=Adam(lr=learning_rate), loss='binary_crossentropy', metrics=['accuracy'])
print(model.summary())
    
# train model
# generate training instances
user_input, item_input, labels = get_train_samples(train_mat, num_negatives)

# training
hist = model.fit([np.array(user_input), np.array(item_input)], np.array(labels), 
                 batch_size=batch_size, epochs=epochs, verbose=verbose, shuffle=True)

0
Model: "functional_3"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
user_input (InputLayer)         [(None, 1)]          0                                            
__________________________________________________________________________________________________
item_input (InputLayer)         [(None, 1)]          0                                            
__________________________________________________________________________________________________
mlp_user_embedding (Embedding)  (None, 1, 32)        89440       user_input[0][0]                 
__________________________________________________________________________________________________
mlp_item_embedding (Embedding)  (None, 1, 32)        89536       item_input[0][0]                 
_____________________________________________________________________________________

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [72]:
model_file = '%s_NCF_%d_%s.h5' % (dataset, latent_dim, str(dense_layers))
model.save(model_file, overwrite=True)

In [73]:
%%time
from keras.models import load_model
from sklearn.cluster import KMeans
from sklearn.cluster import MiniBatchKMeans

# this is a nice rock/oldies playlist
desired_user_id = 500
model_path = 'steam_video_NCF_8_[64, 32, 16, 8].h5'
print('using model: %s' % model_path)
model = load_model(model_path)
print('Loaded model!')

mlp_user_embedding_weights = (next(iter(filter(lambda x: x.name == 'mlp_user_embedding', model.layers))).get_weights())

# get the latent embedding for your desired user
user_latent_matrix = mlp_user_embedding_weights[0]
one_user_vector = user_latent_matrix[desired_user_id,:]
one_user_vector = np.reshape(one_user_vector, (1,32))

print('\nPerforming kmeans to find the nearest users/playlists...')
# get 100 similar users
# kmeans = KMeans(n_clusters=100, random_state=0, verbose=1).fit(user_latent_matrix)
kmeans = MiniBatchKMeans(n_clusters=100, random_state=0, verbose=1).fit(user_latent_matrix)
desired_user_label = kmeans.predict(one_user_vector)
user_label = kmeans.labels_
neighbors = []
for user_id, user_label in enumerate(user_label):
    # print('user_id:{0} user_label:{1}'.format(user_id, user_label))
    if user_label == desired_user_label:
        neighbors.append(user_id)
print('Found {0} neighbor users/playlists.'.format(len(neighbors))) 

using model: steam_video_NCF_8_[64, 32, 16, 8].h5
Loaded model!

Performing kmeans to find the nearest users/playlists...
Init 1/3 with method: k-means++
Inertia for init 1/3: 18.117542
Init 2/3 with method: k-means++
Inertia for init 2/3: 17.414335
Init 3/3 with method: k-means++
Inertia for init 3/3: 18.221548
Minibatch iteration 1/2800: mean batch inertia: 0.081265, ewa inertia: 0.081265 
Minibatch iteration 2/2800: mean batch inertia: 0.083734, ewa inertia: 0.081441 
Minibatch iteration 3/2800: mean batch inertia: 0.078911, ewa inertia: 0.081260 
Minibatch iteration 4/2800: mean batch inertia: 0.086899, ewa inertia: 0.081664 
Minibatch iteration 5/2800: mean batch inertia: 0.084520, ewa inertia: 0.081868 
Minibatch iteration 6/2800: mean batch inertia: 0.074568, ewa inertia: 0.081346 
Minibatch iteration 7/2800: mean batch inertia: 0.082659, ewa inertia: 0.081440 
Minibatch iteration 8/2800: mean batch inertia: 0.077227, ewa inertia: 0.081138 
Minibatch iteration 9/2800: mean batch

In [74]:
# get the tracks in similar users' playlists
games = []
for user_id in neighbors:
    games += list(df[df['bundleId'] == int(user_id)]['itemId'])
print('Found {0} neighbor items from these games.'.format(len(games))) 

users = np.full(len(games), desired_user_id, dtype='int32')
bundles = np.array(games, dtype='int32')

print('\nRanking most likely games using the NeuMF model...')
# and predict tracks for my user
results = model.predict([users,bundles],batch_size=100, verbose=0) 
results = results.tolist()
print('Ranked the games!')

Found 138 neighbor items from these games.

Ranking most likely games using the NeuMF model...
Ranked the games!


In [75]:
results_df = pd.DataFrame(np.nan, index=range(len(results)), columns=['probability','item_name', 'genre'])
print(results_df.shape)

# loop through and get the probability (of being in the bundle according to my model), the game, and the genre  
for i, prob in enumerate(results):
    print('i: {0} prob: {1}'.format(i,prob))
    results_df.loc[i] = [prob[0], df[df['itemId'] == i].iloc[0]['item_name'], df[df['itemId'] == i].iloc[0]['genre']]
results_df = results_df.sort_values(by=['probability'], ascending=False)

results_df.head(20)

(138, 3)
i: 0 prob: [0.15298321843147278]
i: 1 prob: [0.9998869299888611]
i: 2 prob: [0.04202428460121155]
i: 3 prob: [2.4170958567992784e-07]
i: 4 prob: [1.8240148974557613e-11]
i: 5 prob: [0.7810355424880981]
i: 6 prob: [0.9919615387916565]
i: 7 prob: [0.9998305439949036]
i: 8 prob: [0.21919488906860352]
i: 9 prob: [0.9997233748435974]
i: 10 prob: [3.584334265754451e-16]
i: 11 prob: [0.9911948442459106]
i: 12 prob: [0.15384966135025024]
i: 13 prob: [1.2712458407827712e-16]
i: 14 prob: [0.38211238384246826]
i: 15 prob: [1.9510556739987805e-05]
i: 16 prob: [1.9190532839274965e-05]
i: 17 prob: [0.0011650323867797852]
i: 18 prob: [0.38211238384246826]
i: 19 prob: [1.9510556739987805e-05]
i: 20 prob: [1.9190532839274965e-05]
i: 21 prob: [0.009297490119934082]
i: 22 prob: [0.9900289177894592]
i: 23 prob: [0.9999203681945801]
i: 24 prob: [0.9993526935577393]
i: 25 prob: [0.0002168416976928711]
i: 26 prob: [9.693976608105004e-07]
i: 27 prob: [1.012312168313656e-05]
i: 28 prob: [0.99989831447

Unnamed: 0,probability,item_name,genre
86,0.999971,Naruto Shippuden Uncut: Gaara's Bond,
58,0.999953,Naruto Shippuden Uncut: Hero of the Hidden Leaf,
104,0.999948,Hospital Tycoon,Simulation
81,0.999932,18 Wheels of Steel: Across America,Simulation
74,0.999925,RUSH,"Strategy, Indie, Casual"
23,0.99992,Naruto Shippuden Uncut: Racing Lightning,
134,0.999908,GRID 2 - Drift Pack,"Racing, Sports"
118,0.999908,F1 Race Stars - Music Accessory Pack,Racing
28,0.999898,Naruto Shippuden Uncut: The Tailed Beast vs The Tailless Tailed Beast,
1,0.999887,Sword of Asumi - Soundtrack,"Adventure, Indie, RPG"


In [77]:
df[df['bundleId'] == 500].head(20)

Unnamed: 0,item_id,item_name,genre,bundle_id,itemId,bundleId
2860,298520,Orbital Gear,"Action, Indie, Simulation",280,2289,500
2861,358880,Orbital Gear Soundtrack,"Action, Indie, Simulation",280,2290,500
2862,424830,Bell Ringer,"Action, Indie",280,2291,500


In [78]:
#save df to json for inference
df.to_json(r'/content/origin_steam_video_df.json')

In [81]:
# df.to_json(r'/content/origin_steam_video_df.json')
load_df = pd.read_json('/content/origin_steam_video_df.json')
load_df.head()

Unnamed: 0,item_id,item_name,genre,bundle_id,itemId,bundleId
0,326950,Sword of Asumi,"Adventure, Indie, RPG",450,0,0
1,331490,Sword of Asumi - Soundtrack,"Adventure, Indie, RPG",450,1,0
2,331491,Sword of Asumi - Graphic Novel,"Adventure, Indie, RPG",450,2,0
3,331492,Sword of Asumi - Character Creator,"Adventure, Indie, RPG",450,3,0
4,348540,Divine Slice of Life,"Adventure, Casual, Indie",450,4,0


In [82]:
print(keras.__version__)

2.4.0
