In [39]:
#from math import sqrt

#import fastai
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.metrics import mean_squared_error
from scipy.sparse import csr_matrix # sparse matrix

from numpy.random import MT19937
from numpy.random import RandomState, SeedSequence

from IPython.display import set_matplotlib_formats
# Request both PNG and PDF renderings for inline matplotlib figures.
# NOTE(review): IPython marks `IPython.display.set_matplotlib_formats` as deprecated
# (7.23+) in favour of `matplotlib_inline.backend_inline.set_matplotlib_formats`;
# confirm against the installed IPython version before upgrading.
set_matplotlib_formats('png', 'pdf')
# Presumably the figure resolution to use when plotting; it is not referenced in
# the cells visible here -- TODO confirm where it is consumed.
DPI = 120

In [2]:
# Raw comment dump (not referenced again in the cells visible here).
raw_data = pd.read_csv("old_data/raw_metacritic_game_user_comments.csv")

# Game catalogue; the unnamed leading CSV column is exposed under the name "Id".
game_info_data = pd.read_csv("src/data/game_info.csv").rename(
    {"Unnamed: 0": "Id"}, axis="columns"
)

# Pre-split user rating tables (comment text already stripped, per the file names).
train_data = pd.read_csv("src/data/user_data_train_no_comments.csv")
test_data = pd.read_csv("src/data/user_data_test_no_comments.csv")
valid_data = pd.read_csv("src/data/user_data_validation_no_comments.csv")

In [3]:
# Stack the three splits row-wise into one frame with a fresh 0..N-1 index.
all_data = pd.concat(
    (train_data, test_data, valid_data),
    axis=0,
    ignore_index=True,
)

In [4]:
# Preview the combined ratings table built from the train, test and validation splits.
all_data

Unnamed: 0.1,Unnamed: 0,Userscore,Username,Game_ID
0,47246,9,SergeantSoz,119
1,199743,9,SergeantSoz,1540
2,235823,8,SergeantSoz,2227
3,263595,8,SergeantSoz,2941
4,72338,9,tomcrew10,238
...,...,...,...,...
151992,242135,10,AlexN.,2375
151993,258319,9,AlexN.,2834
151994,17628,9,zenmechanic,33
151995,32734,8,zenmechanic,70


In [5]:
# Give every distinct username a dense integer ID: its position in the sorted
# array of unique usernames taken across all splits.
user_to_id_map = {
    name: idx for idx, name in enumerate(np.unique(all_data["Username"]))
}

In [6]:
def format_data(data, usermap):
    """Return a formatted copy of ``data`` with a numeric ``User_ID`` column.

    Parameters
    ----------
    data : pandas.DataFrame
        Ratings table containing at least the columns ``"Username"`` and
        ``"Unnamed: 0"``, and at least four columns in total (the new column is
        inserted at position 4).
    usermap : mapping
        Maps each username to its numeric ID.

    Returns
    -------
    pandas.DataFrame
        A new frame with ``"User_ID"`` inserted at column position 4 and the
        ``"Unnamed: 0"`` column removed. The input frame is not modified.

    Raises
    ------
    KeyError
        If a username is missing from ``usermap`` or ``"Unnamed: 0"`` is absent.
    """
    # Use the mapping that was passed in rather than a notebook-level global,
    # so the function works with any mapping and on a fresh kernel.
    user_ids = [usermap[name] for name in data["Username"]]

    # Insert into a copy: inserting into the caller's frame would silently change
    # a DataFrame that earlier cells still reference.
    formatted = data.copy()
    formatted.insert(4, "User_ID", user_ids, allow_duplicates=True)
    return formatted.drop(columns=["Unnamed: 0"])

In [7]:
# Add User_ID to (and drop "Unnamed: 0" from) the combined frame and each split.
all_data, train_data, test_data, valid_data = (
    format_data(frame, user_to_id_map)
    for frame in (all_data, train_data, test_data, valid_data)
)

In [8]:
# Preview the validation split after format_data has been applied.
valid_data

Unnamed: 0,Userscore,Username,Game_ID,User_ID
0,3,AlanG.,474,471
1,10,AlanG.,532,471
2,9,AlanG.,532,471
3,9,AlanG.,1525,471
4,10,TonyM.,20,14243
...,...,...,...,...
14913,10,AlexN.,2375,538
14914,9,AlexN.,2834,538
14915,9,zenmechanic,33,22017
14916,8,zenmechanic,70,22017


In [9]:
# Dimensions of the score matrix: one row per distinct username across all
# splits, one column per row of the game catalogue.
n_users = np.unique(all_data["Username"]).size
n_games = game_info_data.shape[0]

In [49]:
# Report the user and game counts, one per line.
print(f"users: {n_users}", f"games: {n_games}", sep="\n")

users: 22072
games: 5000


In [69]:
# data class
def np_rand(size, min_i, max_i, rs):
    """Draw uniform random values in ``[min_i, max_i)``.

    size: shape passed straight to ``rs.random_sample``.
    min_i, max_i: lower (inclusive) and upper (exclusive) bound of the result.
    rs: random source exposing ``random_sample`` (e.g. ``numpy.random.RandomState``).

    Returns the samples from ``rs.random_sample(size)`` rescaled to the range.
    """
    span = max_i - min_i
    unit = rs.random_sample(size)  # uniform on [0, 1)
    return unit * span + min_i
        
class Embeddings:
    """
    Randomly initialised user/game biases and embeddings, plus the sparse
    matrix of observed scores.

    requires transformation of data:
    all values must be non-negative to properly maintain bounds
    TODO experiment with non-negativity

    Attributes
    ----------
    bias_users, bias_games : 1-D arrays of length ``n_users`` / ``n_games``.
    embed_users, embed_games : arrays of shape ``(n_users, dim)`` / ``(n_games, dim)``.
    user_idx, game_idx : the ``User_ID`` / ``Game_ID`` columns of ``data`` as arrays
        (one entry per rating row, duplicates included).
    embed_scores : ``(n_users, n_games)`` int8 CSR matrix of ``Userscore`` values.
    """
    def __init__(self, n_users, n_games, n_range, seed, dim=None, data=None):
        """
        n_users, n_games: number of rows / columns of the score matrix.
        n_range: ``(low, high)`` bounds for the random initial values.
        seed: seed for the MT19937-backed ``RandomState``.
        dim: embedding width; defaults to the notebook-level ``embed_dim``.
        data: ratings frame with ``User_ID``, ``Game_ID`` and ``Userscore``
            columns; defaults to the notebook-level ``all_data``.
        """
        # Explicit arguments remove the dependency on notebook globals (one of
        # which is defined in a later cell). The fallbacks keep the original
        # four-argument call working unchanged.
        if dim is None:
            dim = embed_dim
        if data is None:
            data = all_data

        rs = RandomState(MT19937(SeedSequence(seed)))

        min_n = n_range[0]
        max_n = n_range[1]

        # The draw order is part of the seeded result: biases first, then embeddings.
        self.bias_users = np_rand(n_users, min_n, max_n, rs)
        self.bias_games = np_rand(n_games, min_n, max_n, rs)

        self.embed_users = np_rand((n_users, dim), min_n, max_n, rs)
        self.embed_games = np_rand((n_games, dim), min_n, max_n, rs)

        self.user_idx = data["User_ID"].to_numpy()
        self.game_idx = data["Game_ID"].to_numpy()

        # csr_matrix built from (value, (row, col)) triplets SUMS repeated
        # coordinates, so a user who rated the same game twice would get an
        # out-of-range score (and could overflow int8). Keep only the last
        # rating for each (user, game) pair.
        ratings = data.drop_duplicates(subset=["User_ID", "Game_ID"], keep="last")
        self.embed_scores = csr_matrix(
            (ratings["Userscore"].to_numpy(),
             (ratings["User_ID"].to_numpy(), ratings["Game_ID"].to_numpy())),
            shape=(n_users, n_games), dtype=np.int8)

In [70]:
# hyperparameters
# embed_dim: width (number of columns) of the user and game embedding matrices.
embed_dim = 5
# n_range: (low, high) bounds handed to Embeddings for the random initial values.
n_range = (-1, 11)
# seed: seed handed to Embeddings for its random state.
seed = 69

# The biases and embedding matrices themselves are created inside Embeddings via
# np_rand: random_sample yields values in [0, 1), which np_rand rescales to
# [low, high).

In [71]:
# Build the randomly initialised embeddings and the sparse score matrix.
e = Embeddings(
    n_users=n_users,
    n_games=n_games,
    n_range=n_range,
    seed=seed,
)

In [72]:
# Show the randomly initialised user embedding matrix.
e.embed_users

array([[ 1.39894402,  4.11032696,  7.22297058,  6.37883706, 10.84379725],
       [ 5.45284712,  3.16325952,  5.87090294, -0.35518192,  1.4387136 ],
       [ 3.90054405, -0.78199274,  3.11212927, 10.89188467,  1.46171215],
       ...,
       [-0.38230842,  7.02346593,  3.69044782,  5.52890654, -0.58161614],
       [ 6.13048377,  3.57961797,  1.14988756,  0.5575071 ,  9.60928089],
       [10.28555615,  6.01626618,  3.58731089,  2.73254933,  4.4926395 ]])

In [45]:
# Game_ID column as a NumPy array (used as the column indices of the score matrix).
all_data["Game_ID"].to_numpy()

array([ 119, 1540, 2227, ...,   33,   70,  549])

In [46]:
# User_ID column as a NumPy array (used as the row indices of the score matrix).
all_data["User_ID"].to_numpy()

array([12398, 12398, 12398, ..., 22017, 22017, 22017])

In [59]:
# initializes with proper scores

#embed_scores = csr_matrix((all_data["Userscore"].to_numpy(),
#                          (all_data["User_ID"].to_numpy(), all_data["Game_ID"].to_numpy())),
#                          shape=(n_users, n_games), dtype=np.int8)

In [68]:
# Spot check one stored score. The matrix lives on the Embeddings instance; a bare
# `embed_scores` name is never defined on a fresh kernel. The pair looked up here
# is User_ID 12398 / Game_ID 119, the first entries of the two index arrays.
e.embed_scores[12398, 119]

9

In [34]:
#embed_scores.toarray()

In [63]:
# Show the sparse score matrix summary (shape, dtype, number of stored entries).
e.embed_scores

<22072x5000 sparse matrix of type '<class 'numpy.int8'>'
	with 150678 stored elements in Compressed Sparse Row format>

In [61]:
#class EmbeddingMatrices:
#    def __init__(self, all_data, train_data, test_data, valid_data):
#        pass

# data batches?

def train(e, cycles=1):
    """Placeholder for the training routine; it currently does nothing.

    e: presumably the Embeddings instance to train -- unused so far.
    cycles: presumably the number of training passes -- unused so far.

    Returns None.
    """
    return None