# Importing the libraries

In [1]:
import numpy as np
import pandas as pd

from scipy.sparse import csr_matrix
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_similarity

import keras
import tensorflow as tf
from tensorflow.keras.models import Sequential
from keras.utils.np_utils import to_categorical
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

from tensorflow.keras.layers import Embedding, Dense, LSTM, Dropout, Bidirectional,\
 Conv1D, GlobalMaxPooling1D, MaxPooling1D
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

from keras.backend import clear_session

# Loading the data

In [2]:
df_train = pd.read_csv('train.csv')
df_test = pd.read_csv('test.csv')

# Method 2. Collaborative filtering
In this method, we would recommend the next games based on how similar users downloaded games. For example, if user1 and user2 download similar games, e.g., user1 download game1, game2, games3 and user2 game1, game2, we would recommend game3 to user2.

## Data preparation
We convert the format of the dataset into a binary dataset. For every (user_id, game_id), we will add a 1 as downloaded. That means that this user has downloaded this game, and we use this binary data for our recommendation system.

In [11]:
df_train['historical_games'] = df_train.apply(lambda x: x['historical_games'] + ' ' + str(x['next_game']), axis=1)
df_train.drop(['next_game'], axis = 1, inplace=True)
# Since the users are different in the training and the test, we mix the training and the test set
df = pd.concat([df_train, df_test])

df = pd.DataFrame([[i, k] for i, j in df.iloc[:, 0:2].values for k in j.split()], columns=['user_id', 'game_id'])
df['downloaded'] = 1
df.head()

Unnamed: 0,user_id,game_id,downloaded
0,2,3,1
1,2,12,1
2,2,262,1
3,2,6094,1
4,2,283,1


In [13]:
# calculate the game-game cosine similarity
def Get_game_game_similarity(user_ids, game_ids):
    gameUserMatrix = csr_matrix(([1]*len(user_ids), (game_ids, user_ids)))
    similarity = cosine_similarity(gameUserMatrix)
    return similarity, gameUserMatrix

In [17]:
def get_recommendations_from_similarity(similarity_matrix, gameUserMatrix, recom_game_num=5):
    userGameMatrix = csr_matrix(gameUserMatrix.T)
    userGameScores = userGameMatrix.dot(similarity_matrix)
    lst = []
    for user_id in range(userGameScores.shape[0]):
        scores = userGameScores[user_id, :]
        downloaded_games = userGameMatrix.indices[userGameMatrix.indptr[user_id]:                              
        userGameMatrix.indptr[user_id+1]]
                
        scores[downloaded_games] = -1
        top_game_ids = np.argsort(scores)[-recom_game_num:][::-1]
        recommendations = pd.DataFrame(top_game_ids.reshape(1, -1), index=[user_id], columns=['Top%s' % (i+1) for i in range(recom_game_num)])
        lst.append(recommendations)
    return pd.concat(lst)

In [18]:
def get_recommendations(df):
    user_label_encoder = LabelEncoder()
    user_ids = user_label_encoder.fit_transform(df.user_id)
    game_label_encoder = LabelEncoder()
    game_ids = game_label_encoder.fit_transform(df.game_id)

    similarity_matrix, gameUserMatrix = Get_game_game_similarity(user_ids, game_ids)
    recommendations = get_recommendations_from_similarity(similarity_matrix, gameUserMatrix)

    recommendations.index = user_label_encoder.inverse_transform(recommendations.index)
    for i in range(recommendations.shape[1]):
        recommendations.iloc[:, i] = game_label_encoder.inverse_transform(recommendations.iloc[:, i])
    return recommendations

In [19]:
recommendations = get_recommendations(df)
recommendations

Unnamed: 0,Top1,Top2,Top3,Top4,Top5
2,11,1,4,2,16
4,16,11,2,4,50
5,4,1,15,8,5
7,5,43,17,158,35
10,9,13,27,2,1
...,...,...,...,...,...
91418,11,1,2,79,108
91419,4,55,3,46,15
91420,4,15,3,32,5
91421,1,4,3,11,21


In [40]:
final_df = recommendations[recommendations.index.isin(df_test['id'].values)]
final_df.head()

Unnamed: 0,Top1,Top2,Top3,Top4,Top5
5,4,1,15,8,5
13,14,45,6,18,70
20,289,20,2,8,24
25,45,6,23,14,8
32,11,12,4,2,22


In [41]:
game = []
for i in final_df.iloc[:, 0:6].values:
    q = []
    for j in i:
        q.append(str(j))
    b = " ".join(q)
    game.append(b)

final_df['next_games'] = game

In [42]:
final_df.drop(['Top1', 'Top2', 'Top3', 'Top4', 'Top5'], axis=1, inplace=True)
final_df.head()

Unnamed: 0,next_games
5,4 1 15 8 5
13,14 45 6 18 70
20,289 20 2 8 24
25,45 6 23 14 8
32,11 12 4 2 22


In [43]:
final_df.to_csv('prediction.csv')

# Method 2. Deep learning

We implemented LSTM and Bi-LSTM models on two types of training sets: type1 and type2. For each type, we tune the models' hyperparameter for achieving the best possible results. Although accuracy is inappropriate for recommender systems, we use this metric for evaluating the models. The implementation of best models are as follows:<br>
### Type 1
We use the pre_padding to pad each historical games to maximum length (29) and fed them into deep learning models

In [54]:
df_train = pd.read_csv('train.csv')
df_test = pd.read_csv('test.csv')

In [56]:
X = [list(map(int, i.split())) for i in df_train['historical_games']]
y = df_train['next_game']

In [59]:
max_token = max(len(x) for x in X)
X_padded = pad_sequences(X, padding='pre', maxlen=max_token)
X_padded = np.array(X_padded)
y = np.array(y)
pd.DataFrame(X_padded).head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,12,262,6094,283,50,1070,233
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,294,241,1,150,12
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,85,139,144,57,2013
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,7,114,10,5,31,6504
4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,5,221,3,712,159,4,810,94,746,6170,136,17,1160,78,113


In [60]:
X_padded = X_padded.reshape(X_padded.shape[0], X_padded.shape[1], 1) # 29, 1
X_padded.shape

(30588, 29, 1)

In [65]:
X_test = [list(map(int, i.split())) for i in df_test['historical_games']]
X_test = pad_sequences(X_test, padding='pre', maxlen=29)
X_test = X_test.reshape(13073, 29, 1)

#### LSTM

In [77]:
clear_session()
model = Sequential()
model.add(LSTM(256, return_sequences=True, input_shape=(X_padded.shape[1], 1)))
model.add(LSTM(256))
model.add(Dropout(0.2))
model.add(Dense(7737, activation='softmax'))
model.compile(optimizer = 'adam', loss="sparse_categorical_crossentropy", metrics=['accuracy'])
print(model.summary())

my_callbacks = [ModelCheckpoint("./saved_models/checkpoints/best_model", monitor='accuracy', verbose=1, save_best_only=True)]

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm (LSTM)                  (None, 29, 256)           264192    
_________________________________________________________________
lstm_1 (LSTM)                (None, 256)               525312    
_________________________________________________________________
dropout (Dropout)            (None, 256)               0         
_________________________________________________________________
dense (Dense)                (None, 7737)              1988409   
Total params: 2,777,913
Trainable params: 2,777,913
Non-trainable params: 0
_________________________________________________________________
None


In [78]:
model.fit(X_padded, y, epochs=config.epochs, batch_size=config.batch_size, callbacks=my_callbacks) 

wandb.finish()
best_model = tf.keras.models.load_model("./saved_models/checkpoints/best_model")

Epoch 1/10

Epoch 00001: accuracy improved from -inf to 0.01491, saving model to ./saved_models/checkpoints\best_model




INFO:tensorflow:Assets written to: ./saved_models/checkpoints\best_model\assets


INFO:tensorflow:Assets written to: ./saved_models/checkpoints\best_model\assets


Epoch 2/10

Epoch 00002: accuracy improved from 0.01491 to 0.02243, saving model to ./saved_models/checkpoints\best_model




INFO:tensorflow:Assets written to: ./saved_models/checkpoints\best_model\assets


INFO:tensorflow:Assets written to: ./saved_models/checkpoints\best_model\assets


Epoch 3/10

Epoch 00003: accuracy improved from 0.02243 to 0.02968, saving model to ./saved_models/checkpoints\best_model




INFO:tensorflow:Assets written to: ./saved_models/checkpoints\best_model\assets


INFO:tensorflow:Assets written to: ./saved_models/checkpoints\best_model\assets


Epoch 4/10

Epoch 00004: accuracy improved from 0.02968 to 0.03465, saving model to ./saved_models/checkpoints\best_model




INFO:tensorflow:Assets written to: ./saved_models/checkpoints\best_model\assets


INFO:tensorflow:Assets written to: ./saved_models/checkpoints\best_model\assets


Epoch 5/10

Epoch 00005: accuracy improved from 0.03465 to 0.04005, saving model to ./saved_models/checkpoints\best_model




INFO:tensorflow:Assets written to: ./saved_models/checkpoints\best_model\assets


INFO:tensorflow:Assets written to: ./saved_models/checkpoints\best_model\assets


Epoch 6/10

Epoch 00006: accuracy improved from 0.04005 to 0.04185, saving model to ./saved_models/checkpoints\best_model




INFO:tensorflow:Assets written to: ./saved_models/checkpoints\best_model\assets


INFO:tensorflow:Assets written to: ./saved_models/checkpoints\best_model\assets


Epoch 7/10

Epoch 00007: accuracy improved from 0.04185 to 0.04381, saving model to ./saved_models/checkpoints\best_model




INFO:tensorflow:Assets written to: ./saved_models/checkpoints\best_model\assets


INFO:tensorflow:Assets written to: ./saved_models/checkpoints\best_model\assets


Epoch 8/10

Epoch 00008: accuracy improved from 0.04381 to 0.04570, saving model to ./saved_models/checkpoints\best_model




INFO:tensorflow:Assets written to: ./saved_models/checkpoints\best_model\assets


INFO:tensorflow:Assets written to: ./saved_models/checkpoints\best_model\assets


Epoch 9/10

Epoch 00009: accuracy improved from 0.04570 to 0.04852, saving model to ./saved_models/checkpoints\best_model




INFO:tensorflow:Assets written to: ./saved_models/checkpoints\best_model\assets


INFO:tensorflow:Assets written to: ./saved_models/checkpoints\best_model\assets


Epoch 10/10

Epoch 00010: accuracy improved from 0.04852 to 0.05025, saving model to ./saved_models/checkpoints\best_model




INFO:tensorflow:Assets written to: ./saved_models/checkpoints\best_model\assets


INFO:tensorflow:Assets written to: ./saved_models/checkpoints\best_model\assets


<tensorflow.python.keras.callbacks.History at 0x1d816e49f08>

### prediction

In [None]:
best_model = tf.keras.models.load_model("./saved_models/checkpoints/best_model")
prediction = best_model.predict(X_test)

results = []
for pred in prediction:
    pred = list(pred)
    sorted_pred = sorted(pred.copy(), reverse=True)
    result = []
    for i in range(5):
        v = sorted_pred[i]
        index = list(pred).index(v)
        result.append(index)
    results.append(result)

final_output = []
for k in results:
    final_output.append(' '.join(map(str, k)))

df_test['next_games'] = final_output
df_test.drop(['historical_games'], inplace=True, axis=1)
df_test

In [None]:
df_test.to_csv('prediction.csv', index=None)

### Type 2
For augmenting the data and working with same-length historical games, We change the format of the dataset. We split each historical games into 5 segments and fed them into deep learning models.

### Data preparation

In [82]:
df_train = pd.read_csv('train.csv')
df_test = pd.read_csv('test.csv')

df1 = df_train.apply(lambda x: x['historical_games'] + ' ' + str(x['next_game']), axis=1)
df2 = df_test.apply(lambda x: x['historical_games'], axis=1)
df = pd.concat([df1, df2])
# Tokenizing the data
tokenizer  = Tokenizer()
tokenizer.fit_on_texts(df.values)

In [83]:
df_train['tokenized_history'] = df_train.apply(lambda x: tokenizer.texts_to_sequences([x['historical_games']])[0], axis=1).values
df_train['tokenized_next_game'] = df_train.apply(lambda x: tokenizer.texts_to_sequences([str(x['next_game'])])[0][0], axis=1).values
df_train.drop(['historical_games', 'next_game'], axis=1, inplace=True)

df_test['tokenized_history'] = df_test.apply(lambda x: tokenizer.texts_to_sequences([x['historical_games']])[0], axis=1).values
df_test.drop(['historical_games'], axis=1, inplace=True)
df_train.head()

Unnamed: 0,id,tokenized_history,tokenized_next_game
0,2,"[5, 14, 236, 4580, 291, 51, 1142, 207]",126
1,4,"[283, 266, 1, 163, 14]",104
2,7,"[94, 139, 145, 60, 1709]",309
3,10,"[8, 121, 12, 6, 32, 5930]",22
4,18,"[6, 222, 5, 656, 172, 4, 783, 93, 730, 5405, 1...",240


In [84]:
data = [i for i in df_train['tokenized_history']]
next_game = df_train['tokenized_next_game'].values
for i in range(len(data)):
    data[i].append(next_game[i])

In [85]:
# Segmenting the data into 2 part
new_data = []
for i in range(len(data)):
    for j in range(len(data[i])):
        k = j+6
        if k > len(data[i]):
            break
        new_data.append(data[i][j:k])

print('Before: ')
print(data[0])
print('After: ')
print(new_data[:4])

Before: 
[5, 14, 236, 4580, 291, 51, 1142, 207, 126]
After: 
[[5, 14, 236, 4580, 291, 51], [14, 236, 4580, 291, 51, 1142], [236, 4580, 291, 51, 1142, 207], [4580, 291, 51, 1142, 207, 126]]


In [86]:
# Augmenting the dataset
data = [i for i in df_test['tokenized_history']]
user_ids = df_test['id'].values
test_set = []
ids_set = []
training_set = []
flag = True
c=0
for row, ids in zip(data, user_ids):
    flag = True
    c+=1
    if len(row) == 5:
        if flag:
            test_set.append(row)
            flag = False
    else:
        for j in range(len(row)):
            k = j+6
            if k > len(row):
                break
            if row[-1] in row[j:k]:
                test_set.append(row[j+1:k])
                ids_set.append(ids)
            else:
                training_set.append(row[j:k])

# test_set.sort()                
# test_set = list(k for k,_ in itertools.groupby(test_set))

print('Number of new test data: {}'.format(len(test_set)))
print('Number of new training data: {}'.format(len(training_set)))

Number of new test data: 13252
Number of new training data: 48380


In [80]:
test_set = list(df_test.apply(lambda x: x['tokenized_history'][-5:], axis=1))

for i in training_set:
    new_data.append(i)

new_df_train = pd.DataFrame(new_data)
new_df_train.head()

Unnamed: 0,0,1,2,3,4,5
0,5,14,236,4580,291,51
1,14,236,4580,291,51,1142
2,236,4580,291,51,1142,207
3,4580,291,51,1142,207,126
4,283,266,1,163,14,104


In [81]:
new_df_test = pd.DataFrame(test_set)
new_df_test.head()

Unnamed: 0,0,1,2,3,4
0,97,2713,739,206,297
1,244,6268,925,23,191
2,226,6,22,319,112
3,17,2520,735,6,680
4,13,62,1,283,495


### Model training

In [87]:
vocabulary_size = len(tokenizer.word_index) + 1
print('Unique items: %d' % vocabulary_size)

Unique items: 7805


In [88]:
X = new_df_train.iloc[:, 0:5].values
y = new_df_train.iloc[:, 5].values

In [89]:
max_len = max([len(seq) for seq in X])
max_len

5

In [90]:
X = X.reshape(X.shape[0], X.shape[1], 1)
X.shape

(218654, 5, 1)

In [91]:
y = to_categorical(y, num_classes=vocabulary_size)
y

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)

#### LSTM

In [102]:
clear_session()
model = Sequential()
model.add(Embedding(vocabulary_size, 5, input_length=max_len - 1))
model.add(Dropout(0.2))
model.add(LSTM(100))
model.add(Dropout(0.2))
model.add(Dense(vocabulary_size, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

history = model.fit(X, y, validation_split=0.2, verbose=1, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


#### Bi-LSTM

In [41]:
clear_session()
model = Sequential()
model.add(Embedding(vocabulary_size, 5, input_length=max_len - 1))
model.add(Bidirectional(LSTM(10)))
model.add(Dropout(0.2))
model.add(Dense(vocabulary_size, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

history = model.fit(X, y, verbose=1, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


### Prediction

In [None]:
x_test = new_df_test.iloc[:, :].values
prediction = model.predict(x_test)

In [None]:
results = []
for pred in prediction:
    pred = list(pred)
    sorted_pred = sorted(pred.copy(), reverse=True)
    result = []
    for i in range(5):
        v = sorted_pred[i]
        index = list(pred).index(v)
        result.append(tokenizer.word_index[str(index)])
    results.append(result)

final_output = []
for k in results:
    final_output.append(' '.join(map(str, k)))

df_test = pd.read_csv('test.csv')
df_test['next_games'] = final_output
df_test.drop(['historical_games'], inplace=True, axis=1)

In [None]:
df_test.to_csv('prediction.csv', index=None)

# Discussion
Based on the result of the above models, the best result was achieved by the collaborative filtering method. The result of each models on the quera is as follows:

| Model | Quera Score |
| ------ | ------ |
| Collaborative Filtering | 130 |
| Type1 LSTM | 105 |
| Type2 Bi-LSTM | 73 |
