In [None]:
from __future__ import print_function, division
from builtins import range, input

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.utils import shuffle
from scipy.sparse import lil_matrix, csr_matrix, save_npz, load_npz

import keras.backend as K
from keras.models import Model
from keras.layers import Input, Dropout, Dense
from keras.regularizers import l2
from keras.optimizers import SGD

import tensorflow.compat.v1 as tf
tf.disable_v2_behavior() 

from tqdm import tqdm

In [None]:
# Load the raw interaction data.
# The file carries no header row (column names are assigned in the next cell),
# so pass header=None; otherwise the first record is consumed as the header
# and silently lost.
df = pd.read_csv('usersha1-artmbid-artname-plays.tsv', sep='\t', header=None)
# Preview only the first rows instead of rendering the whole frame.
df.head()

In [None]:
# The second column is not used: remove it, then label the remaining three.
df = df.drop(columns=df.columns[1])
df.columns = ['user', 'artist', 'plays']

# Discard every row that has a missing value in any column.
df = df.dropna()
df

In [None]:
# Encode users and artists as integer category codes.
for source_col, id_col in (('user', 'user_id'), ('artist', 'artist_id')):
  df[id_col] = df[source_col].astype("category").cat.codes

# Lookup table (artist_id kept as a string) so that IDs can be mapped
# back to readable artist names later.
item_lookup = (
  df[['artist_id', 'artist']]
  .drop_duplicates()
  .assign(artist_id=lambda frame: frame.artist_id.astype(str))
)

# The raw string columns are no longer needed once the codes exist.
df = df.drop(columns=['user', 'artist'])
df

In [None]:
# Matrix dimensions: category codes are 0-based, so size = largest code + 1.
# Convert to a plain Python int *before* adding 1: cat.codes uses the smallest
# integer dtype that fits the data, so `max() + 1` on the NumPy scalar can
# overflow when the largest code sits at the dtype's upper bound.
N = int(df.user_id.max()) + 1 # number of users
M = int(df.artist_id.max()) + 1 # number of artists


In [None]:
# Display the number of users (the row count of the ratings matrix built below).
N

In [None]:
# Cast the artist count to a plain Python int, then display it.
M = int(M)
M

In [None]:
# Split into train (80%) and test (20%).
# A fixed random_state makes the shuffle -- and therefore the split and the
# matrices saved from it -- reproducible across kernel restarts.
df = shuffle(df, random_state=42)
cutoff = int(0.8*len(df))
df_train = df.iloc[:cutoff]
df_test = df.iloc[cutoff:]


In [None]:
# Sparse N x M matrix that receives the training play counts row by row.
A = lil_matrix((N, M))
print("Calling: update_train")
count = 0
def update_train(row):
  """Write one training interaction into A and report progress periodically."""
  global count
  count += 1
  # progress is reported as a fraction of the training set size (cutoff)
  if not count % 100000:
    print("processed: %.3f" % (float(count)/cutoff))

  A[int(row.user_id), int(row.artist_id)] = row.plays

df_train.apply(update_train, axis=1)


In [None]:
# Convert the training matrix to CSR and persist it to disk.
A = A.tocsr()
save_npz("Atrain.npz", A)
# Boolean mask marking which entries hold a positive play count.
mask = A > 0


In [None]:
# Sparse N x M matrix that receives the held-out (test) play counts.
A_test = lil_matrix((N, M))
print("Calling: update_test")
count = 0
def update_test(row):
  """Write one test interaction into A_test and report progress periodically."""
  global count
  count += 1
  # progress is reported as a fraction of the test set size
  if not count % 100000:
    print("processed: %.3f" % (float(count)/len(df_test)))

  A_test[int(row.user_id), int(row.artist_id)] = row.plays
df_test.apply(update_test, axis=1)


In [None]:
# Convert the test matrix to CSR and persist it to disk.
A_test = A_test.tocsr()
save_npz("Atest.npz", A_test)
# Boolean mask marking which test entries hold a positive play count.
mask_test = A_test > 0

In [None]:
# Training hyperparameters.
batch_size = 128  # users (matrix rows) per mini-batch; was left blank, which is a SyntaxError
epochs = 10
reg = 0.001  # L2 penalty on the dense layers; set to 0 to disable

In [None]:
# Reload the persisted matrices and rebuild float (0.0 / 1.0) masks of the
# positive entries.
A, A_test = load_npz("Atrain.npz"), load_npz("Atest.npz")
mask, mask_test = (A > 0) * 1.0, (A_test > 0) * 1.0


In [None]:
# Keep untouched copies: the training generator shuffles its inputs, while
# the validation generator needs train and test rows in matching order.
A_copy, mask_copy = A.copy(), mask.copy()
A_test_copy, mask_test_copy = A_test.copy(), mask_test.copy()

In [None]:
# Take the dimensions from the loaded matrix and report them.
N = A.shape[0]
M = A.shape[1]
print("N:", N, "M:", M)
full_batches = N // batch_size
print("N // batch_size:", full_batches)

In [None]:
# Global mean over the observed training entries, used to centre the data.
total_plays = A.sum()
n_observed = mask.sum()
mu = total_plays / n_observed
print("mu:", mu)


In [None]:
# Autoencoder with a single hidden layer: input -> dropout -> dense(relu)
# -> dropout -> dense reconstruction of all M columns.
i = Input(shape=(M,))
dropped_input = Dropout(0.7)(i)
# a larger hidden layer appeared to help
hidden = Dense(700, activation='relu', kernel_regularizer=l2(reg))(dropped_input)
dropped_hidden = Dropout(0.5)(hidden)
x = Dense(M, kernel_regularizer=l2(reg))(dropped_hidden)

In [None]:
def custom_loss(y_true, y_pred):
  """Masked mean squared error.

  Entries of y_true equal to 0 are treated as missing and excluded; the
  squared error is averaged over the remaining entries only.

  Args:
    y_true: target tensor; zeros mark entries to ignore.
    y_pred: prediction tensor, same shape as y_true.

  Returns:
    Scalar tensor: sum of squared errors over non-zero targets divided by
    their count. If the batch has no non-zero targets the result is 0
    rather than NaN.
  """
  mask = K.cast(K.not_equal(y_true, 0), dtype='float32')
  diff = y_pred - y_true
  sqdiff = diff * diff * mask
  # K.sum without an axis already reduces over every dimension
  sse = K.sum(sqdiff)
  # clamp the count so a batch with no observed entries gives 0/1, not 0/0
  n = K.maximum(K.sum(mask), 1.0)
  return sse / n

In [None]:
def generator(A, M):
  """Yield shuffled, mean-centred training batches forever.

  Args:
    A: sparse matrix of values, one row per user.
    M: sparse mask with the same shape as A (non-zero where A is observed).

  Yields:
    (inputs, targets) tuples of dense arrays. Both are the same centred
    batch: no extra input noise is applied here.

  Raises:
    ValueError: if A has no rows (no batch could ever be produced).

  Relies on the module-level `batch_size` and `mu`.
  """
  n_rows = A.shape[0]
  if n_rows == 0:
    raise ValueError("generator() needs at least one row to build a batch")
  while True:
    # reshuffle values and mask together at the start of every pass
    A, M = shuffle(A, M)
    # Step by start offset so that no zero-row batch is produced when n_rows
    # is an exact multiple of batch_size.
    for start in range(0, n_rows, batch_size):
      stop = min(start + batch_size, n_rows)
      a = A[start:stop].toarray()
      m = M[start:stop].toarray()
      a = a - mu * m # must keep zeros at zero!
      yield a, a


In [None]:
def test_generator(A, M, A_test, M_test):
  """Yield mean-centred (train input, test target) batches forever.

  Assumes A and A_test hold the same users in the same row order and have
  the same shape; rows are never shuffled here so the two stay aligned.

  Args:
    A: sparse training matrix.
    M: sparse mask for A.
    A_test: sparse test matrix.
    M_test: sparse mask for A_test.

  Yields:
    (a, at) tuples of dense arrays: the centred training rows as model input
    and the centred test rows as targets.

  Raises:
    ValueError: if A has no rows (no batch could ever be produced).

  Relies on the module-level `batch_size` and `mu`.
  """
  n_rows = A.shape[0]
  if n_rows == 0:
    raise ValueError("test_generator() needs at least one row to build a batch")
  while True:
    # Step by start offset so that no zero-row batch is produced when n_rows
    # is an exact multiple of batch_size.
    for start in range(0, n_rows, batch_size):
      stop = min(start + batch_size, n_rows)
      a = A[start:stop].toarray()
      m = M[start:stop].toarray()
      at = A_test[start:stop].toarray()
      mt = M_test[start:stop].toarray()
      # subtract the mean only where a value is observed, so zeros stay zero
      a = a - mu * m
      at = at - mu * mt
      yield a, at


In [None]:
# Wrap the graph in a Model and compile it with the masked MSE.
model = Model(i, x)
model.compile(
  loss=custom_loss,
  # Adam is used here; SGD(lr=0.08, momentum=0.9) was the alternative tried.
  optimizer='adam',
  # Report the masked MSE as a metric too. 'accuracy' is meaningless for this
  # regression target, and the plotting cell reads the 'custom_loss' /
  # 'val_custom_loss' history keys that this metric produces.
  metrics=[custom_loss],
)


In [None]:
# Number of batches drawn per epoch for training and for validation.
train_steps = A.shape[0] // batch_size + 1
val_steps = A_test.shape[0] // batch_size + 1

# Training batches are shuffled; validation batches use the untouched copies.
train_batches = generator(A, mask)
val_batches = test_generator(A_copy, mask_copy, A_test_copy, mask_test_copy)

r = model.fit(
  train_batches,
  validation_data=val_batches,
  epochs=epochs,
  steps_per_epoch=train_steps,
  validation_steps=val_steps,
)
print(r.history.keys())



In [None]:
# Training vs. validation loss per epoch.
for history_key, curve_label in (('loss', "train loss"), ('val_loss', "test loss")):
  plt.plot(r.history[history_key], label=curve_label)
plt.legend()
plt.show()

In [None]:
# Training vs. validation masked MSE per epoch.
for history_key, curve_label in (('custom_loss', "train mse"), ('val_custom_loss', "test mse")):
  plt.plot(r.history[history_key], label=curve_label)
plt.legend()
plt.show()