In [2]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow.keras import layers
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.layers import Activation,BatchNormalization,Input,Embedding,Dot,Dense,Flatten
from tensorflow.keras.callbacks import ModelCheckpoint,LearningRateScheduler,TensorBoard,EarlyStopping

from wordcloud import WordCloud
%matplotlib inline

In [3]:
# Raw CSV exports are expected under ../artifacts/raw (relative to the working directory).
INPUT_DIR = os.path.join("..", "artifacts", "raw")

# Only the user, anime and rating columns are read from the ratings file.
rating_columns = ['user_id', 'anime_id', 'rating']
animelist_path = os.path.join(INPUT_DIR, "animelist.csv")
df = pd.read_csv(animelist_path, low_memory=True, usecols=rating_columns)

In [None]:
# Preview the first rows of df to sanity-check the loaded columns.
df.head()

Unnamed: 0,user_id,anime_id,rating
0,0,67,9
1,0,6702,7
2,0,242,10
3,0,4898,0
4,0,21,10


In [5]:
# Number of rows currently held in df.
len(df)

5000000

In [4]:
# Distinct user ids present in df (the array repr is truncated by NumPy).
df.user_id.unique()

array([    0,     1,     2, ..., 16506, 16507, 16508], shape=(15186,))

In [5]:
# Keep only the rows of users who have more than 400 ratings.
n_ratings = df['user_id'].value_counts()
active_user_ids = n_ratings.index[n_ratings > 400]
keep_row = df['user_id'].isin(active_user_ids)
df = df.loc[keep_row].copy()

In [11]:
# Preview df again.
# NOTE(review): the recorded output for this cell already contains the `user` and
# `anime` columns and ratings scaled to [0, 1], which are only created by cells
# further down — the saved output comes from an out-of-order execution and does
# not reflect df at this point of a top-to-bottom run.
df.head()

Unnamed: 0,user_id,anime_id,rating,user,anime
0,11852,18247,0.0,10889,1636
1,16489,14467,0.6,15166,749
2,13477,271,0.6,12381,5128
3,14531,136,1.0,13354,1004
4,2119,2619,0.7,1951,3147


In [30]:
# Row count of df; the recorded value equals the later train + test sizes
# (3241641 + 1000), i.e. it was taken after the user filter.
len(df)

3242641

In [7]:
# Min-max scale the ratings into [0, 1].
# Series.min()/max() and column arithmetic are vectorised, which avoids a
# Python-level call per row (builtin min/max and .apply iterate row by row).
min_rating = df['rating'].min()
max_rating = df['rating'].max()
rating_span = max_rating - min_rating

# A constant rating column would otherwise divide by zero and produce NaN targets.
if rating_span == 0:
    raise ValueError("Cannot min-max scale 'rating': all ratings are identical.")

df['rating'] = ((df['rating'] - min_rating) / rating_span).astype(np.float64)

# Show the original bounds followed by the bounds after scaling.
print(min_rating)
print(max_rating)
print(df['rating'].min())
print(df['rating'].max())

0
10
0.0
1.0


In [8]:
# Map each raw user id to a contiguous 0-based index (and back), in order of first appearance.
user_id = df['user_id'].unique().tolist()
user2user_decoded = dict(enumerate(user_id))
user2user_endcoded = {raw_id: idx for idx, raw_id in user2user_decoded.items()}
df['user'] = df['user_id'].map(user2user_endcoded)

In [9]:
# Map each raw anime id to a contiguous 0-based index (and back), in order of first appearance.
anime_ids = df["anime_id"].unique().tolist()
anime2anime_decoded = dict(enumerate(anime_ids))
anime2anime_encoded = {raw_id: idx for idx, raw_id in anime2anime_decoded.items()}
df["anime"] = df["anime_id"].map(anime2anime_encoded)

In [10]:
# Shuffle every row with a fixed seed, then renumber the index from 0.
df = (
    df
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

In [None]:
# Features are the encoded (user, anime) index pairs; the target is the rating column.
x = df[['user','anime']]
y = df['rating']

# Hold out the last `test_size` rows; everything before them is training data.
test_size = 1000
split_index = len(df) - test_size
x_train, y_train = x.iloc[:split_index], y.iloc[:split_index]
x_test, y_test = x.iloc[split_index:], y.iloc[split_index:]

In [12]:
# (rows, columns) of the training features; the two columns are `user` and `anime`.
x_train.shape

(3241641, 2)

In [13]:
# The model takes two separate inputs, so each feature frame is split into a
# list of two NumPy arrays: [first column, second column].
x_train_array = [x_train.iloc[:, pos].to_numpy() for pos in (0, 1)]
x_test_array = [x_test.iloc[:, pos].to_numpy() for pos in (0, 1)]


In [14]:
# Confirm the first converted input is a plain NumPy array.
type(x_train_array[0])

numpy.ndarray

In [23]:
#model architecture
# Vocabulary sizes for the two embedding tables (one row per encoded id).
n_users = len(user2user_endcoded)
n_anime = len(anime2anime_encoded)


def RecommenderNet(embedding_size=128):
    """Build and compile the two-input embedding model.

    Each input is a single integer index (shape ``[1]``) that is looked up in
    its own embedding table. The two embeddings are combined with a normalised
    dot product, flattened, and passed through Dense(1) -> BatchNormalization
    -> sigmoid.

    Parameters
    ----------
    embedding_size : int, default 128
        Width of both the user and the anime embedding vectors.

    Returns
    -------
    tensorflow.keras.Model
        Model compiled with binary cross-entropy loss, the 'Adam' optimizer
        and MAE/MSE metrics. Table sizes are read from the module-level
        ``n_users`` and ``n_anime`` at call time.
    """
    user = Input(name="user", shape=[1])
    user_embedding = Embedding(
        name="user_embedding",
        input_dim=n_users,
        output_dim=embedding_size,
    )(user)

    anime = Input(name="anime", shape=[1])
    anime_embedding = Embedding(
        name="anime_embedding",
        input_dim=n_anime,
        output_dim=embedding_size,
    )(anime)

    # normalize=True L2-normalises both vectors before the dot product.
    x = Dot(name="dot_product", normalize=True, axes=2)([user_embedding, anime_embedding])
    x = Flatten()(x)

    x = Dense(1, kernel_initializer='he_normal')(x)
    x = BatchNormalization()(x)
    x = Activation("sigmoid")(x)

    model = Model(inputs=[user, anime], outputs=x)
    model.compile(loss="binary_crossentropy", metrics=["mae", "mse"], optimizer='Adam')
    return model

In [29]:
# Number of encoded users, i.e. the row count of the user embedding table.
n_users

4193

In [26]:
# Build a freshly initialised, compiled model and print its layer summary.
model = RecommenderNet()
model.summary()

In [25]:
# Learning-rate schedule settings: linear warm-up, optional plateau, exponential decay.
start_lr = 0.00001   # rate used at epoch 0
min_lr = 0.00001     # floor the decay phase approaches; was 0.0001, which is above max_lr
max_lr = 0.00005     # peak rate reached once warm-up ends
batch_size = 10000

ramup_epochs = 5     # number of warm-up epochs
sustain_epochs = 0   # epochs to hold max_lr before decaying
exp_decay = 0.8      # per-epoch multiplier applied to the gap above min_lr

# If the floor sat above the peak, the "decay" branch would raise the rate
# after warm-up instead of lowering it.
assert min_lr <= max_lr, "min_lr must not exceed max_lr"


def lrfn(epoch):
    """Return the learning rate for a 0-based epoch index.

    - ``epoch < ramup_epochs``: linear ramp from ``start_lr`` towards ``max_lr``.
    - next ``sustain_epochs`` epochs: constant ``max_lr``.
    - afterwards: the gap ``max_lr - min_lr`` shrinks by a factor of
      ``exp_decay`` per epoch, so the rate falls towards ``min_lr``.
    """
    if epoch < ramup_epochs:
        return (max_lr - start_lr) / ramup_epochs * epoch + start_lr
    elif epoch < ramup_epochs + sustain_epochs:
        return max_lr
    else:
        decay_steps = epoch - ramup_epochs - sustain_epochs
        return (max_lr - min_lr) * exp_decay ** decay_steps + min_lr

In [21]:
# Per-epoch learning rate comes from lrfn (looked up when the callback fires).
lr_callback = LearningRateScheduler(lambda epoch: lrfn(epoch), verbose=0)

# Keep only the weights of the epoch with the lowest validation loss on disk.
checkpoint_filepath = './weights.weights.h5'
model_checkpoint = ModelCheckpoint(
    filepath=checkpoint_filepath,
    save_weights_only=True,
    monitor="val_loss",
    mode="min",
    save_best_only=True,
)

# Stop after 3 epochs without a val_loss improvement and roll back to the best weights.
early_stopping = EarlyStopping(
    patience=3,
    monitor="val_loss",
    mode="min",
    restore_best_weights=True,
)

my_callbacks = [model_checkpoint, lr_callback, early_stopping]

In [36]:
# Pass the targets the same way as the inputs: as plain NumPy arrays rather than
# pandas Series. They are shaped (n, 1) to line up with the model's single-unit output.
# NOTE(review): the recorded run of this cell ended with
# "IndexError: tuple index out of range" around the end of epoch 1. The saved
# traceback is truncated, so the root cause is unconfirmed — re-run on a fresh
# kernel (Restart & Run All) and inspect the full traceback if it recurs.
y_train_array = np.asarray(y_train, dtype="float32").reshape(-1, 1)
y_test_array = np.asarray(y_test, dtype="float32").reshape(-1, 1)

history = model.fit(
    x = x_train_array,
    y = y_train_array,
    batch_size = batch_size,
    epochs  = 20,
    verbose = 1,
    validation_data = (x_test_array, y_test_array),
    callbacks = my_callbacks
)

Epoch 1/20
[1m324/325[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 32ms/step - loss: 0.6964 - mae: 0.3618 - mse: 0.1572

IndexError: tuple index out of range

In [35]:
# Length of the second training input (the anime-index array); looks like a
# leftover debugging check for the failed fit call — TODO confirm and remove.
len(x_train_array[1])

3241641