train

In [None]:
# Installs the project package straight from GitHub at whatever the default branch currently is.
# NOTE(review): presumably this provides the `feature_eng` module imported further down; the
# install is unpinned, so pin a commit or tag if runs must be reproducible.
!pip install git+https://github.com/darecophoenixx/wordroid.sblo.jp

In [None]:
%matplotlib inline
from IPython.display import SVG
from keras.utils.vis_utils import model_to_dot

import os.path
import sys
import re
import itertools
import csv
import datetime
import pickle
import random
from collections import defaultdict, Counter
import gc

import matplotlib.pyplot as plt
from matplotlib.ticker import NullFormatter
import seaborn as sns
import pandas as pd
import numpy as np
import scipy
import gensim
from sklearn.metrics import f1_score, classification_report, confusion_matrix, log_loss
from sklearn.model_selection import train_test_split
import gensim
from keras.preprocessing.sequence import skipgrams
import tensorflow as tf

In [None]:
def hexbin(x, y, color, **kwargs):
    """Draw a hexagonal-bin plot of ``y`` against ``x`` via ``plt.hexbin``.

    The colormap is the seaborn light palette built from ``color``; every
    extra keyword argument is forwarded unchanged to ``plt.hexbin``.
    Returns ``None``.
    """
    plt.hexbin(x, y, cmap=sns.light_palette(color, as_cmap=True), **kwargs)
def scatter(x, y, color, **kwargs):
    """Scatter ``y`` against ``x`` with small point markers via ``plt.scatter``.

    The signature matches what seaborn grid ``map`` calls supply, so ``color``
    and any extra keyword arguments (e.g. ``label``) are now forwarded to
    ``plt.scatter`` instead of being silently dropped. The '.' marker remains
    the default but can be overridden by the caller. Returns ``None``.
    """
    # Only fill in the marker when the caller did not choose one explicitly.
    kwargs.setdefault('marker', '.')
    plt.scatter(x, y, color=color, **kwargs)

# Prepare data

In [None]:
# List the contents of the input directory (bare `ls` relies on IPython's automagic/alias handling).
ls -la ../input

In [None]:
# Directory holding the raw dataset; anime.csv and rating.csv are read from here below.
dir_data_src = '../input/anime-recommendations-database'
os.listdir(dir_data_src)

In [None]:
# Directory with the pre-built inputs (sparse corpus matrix and the two dictionaries) loaded further down.
dir_wd2v_p_src = '../input/anime-recommendations-wd2v-preparation-no-genre'
os.listdir(dir_wd2v_p_src)

In [None]:
# Load the anime table and report its shape.
anime_csv = pd.read_csv(os.path.join(dir_data_src, 'anime.csv'))
print(anime_csv.shape)
# Use the anime_id values as the row index (the anime_id column itself is kept).
anime_csv.index = anime_csv.anime_id.values
anime_csv.head()

In [None]:
# Sorted list of the distinct genre labels found in the comma-separated `genre` column.
# Rows with a missing genre are skipped: stringifying NaN would otherwise add a bogus
# 'nan' label to the list.
genre = sorted({
    label
    for genres in anime_csv.genre.dropna().values
    for label in str(genres).replace(' ', '').split(',')
})

In [None]:
# Lookup table anime_id -> anime name, built in one pass over the two columns
# (same keys, values and insertion order as an explicit row-by-row loop).
anime_csv_dic = dict(zip(anime_csv['anime_id'].values, anime_csv['name'].values))

# Show the size and a few entries instead of echoing the whole mapping into the output.
print(len(anime_csv_dic))
list(anime_csv_dic.items())[:5]

In [None]:
# Load the rating table (the columns used below are user_id and rating) and report its shape.
rating_csv = pd.read_csv(os.path.join(dir_data_src, 'rating.csv'))
print(rating_csv.shape)
rating_csv.head()

In [None]:
# Keep only rows whose rating is not -1.
# NOTE(review): -1 presumably marks "no explicit rating" in this dataset — confirm.
rating_csv2 = rating_csv.loc[rating_csv.rating.values != -1]
# A bare `rating_csv2.shape` in the middle of a cell is never displayed, so print it
# (matching how the other cells report shapes).
print(rating_csv2.shape)
rating_csv2.head(100)

In [None]:
# Map every distinct user id found in rating.csv to its token string 'user_id-<id>'.
user_dic = {
    uid: 'user_id-' + str(uid)
    for uid in np.unique(rating_csv.user_id.values)
}

len(user_dic)

In [None]:
from feature_eng.neg_smpl3 import (
    WordAndDoc2vec,
    MySparseMatrixSimilarity,
    Seq, Seq2, Dic4seq,
    get_sim
)

In [None]:
# Load the prepared inputs: a sparse matrix ('corpus_csr.npz') and two gensim dictionaries.
src_dir = dir_wd2v_p_src
corpus_csr = scipy.sparse.load_npz(os.path.join(src_dir, 'corpus_csr.npz'))
# The 'word_dic' file is bound to dic_user and the 'doc_dic' file to dic_anime.
# NOTE(review): gensim's Dictionary.load is pickle-based — only load files from a trusted source.
dic_user = gensim.corpora.dictionary.Dictionary.load(os.path.join(src_dir, 'word_dic'))
dic_anime = gensim.corpora.dictionary.Dictionary.load(os.path.join(src_dir, 'doc_dic'))

In [None]:
%%time
# Build the project's WordAndDoc2vec wrapper from the sparse corpus and the two dictionaries
# (passed in the order: corpus, dic_user, dic_anime), then echo the object.
wd2v = WordAndDoc2vec(corpus_csr, dic_user, dic_anime)
wd2v

In [None]:
# Sizes of the wrapper's doc_dic and word_dic, in that order.
len(wd2v.doc_dic), len(wd2v.word_dic)

# Train

In [None]:
# Build the model with num_features=128 (presumably the embedding width — confirm).
# NOTE(review): `maxnorm` and `max_num_prod` are passed straight to the library; their
# meaning is not visible in this notebook — check feature_eng.neg_smpl3 before tuning.
num_features = 128
wd2v.make_model(
    num_features=num_features,
    maxnorm=1000, max_num_prod=50
)

In [None]:
# Inspect the column-side weights before training: print the matrix shape and pair-plot
# its first five columns.
# NOTE(review): the name `wgt_prod` suggests item weights, but a later cell looks up an
# 'anime_id-...' token in sim.row_dic, which suggests rows (not columns) hold anime —
# confirm that wgt_prod / wgt_user are not swapped.
wgt_prod = wd2v.wgt_col
print(wgt_prod.shape)
df = pd.DataFrame(wgt_prod[:,:5])
sns.pairplot(df, markers='.')

In [None]:
# Inspect the row-side weights before training: print the matrix shape and pair-plot
# its first five columns.
# NOTE(review): the name `wgt_user` suggests user weights, but a later cell looks up an
# 'anime_id-...' token in sim.row_dic, which suggests rows hold anime — confirm the naming.
wgt_user = wd2v.wgt_row
print(wgt_user.shape)
df = pd.DataFrame(wgt_user[:,:5])
sns.pairplot(df, markers='.')

In [None]:
%%time

from keras.callbacks import ModelCheckpoint, LearningRateScheduler, ReduceLROnPlateau

def lr_schedule(epoch):
    """Return the learning rate for ``epoch`` (and print it).

    Two effects are combined:

    * Stage decay: the base rate 0.02 is halved each time one of four
      consecutive 16-epoch stages has been completed, so from epoch 64
      onwards it stays at 1/16 of the base rate.
    * 4-epoch cycle: the stage rate is used as-is when ``epoch % 4 == 0`` and
      scaled by 1/2, 1/4 and 1/8 for remainders 1, 2 and 3 respectively.
    """
    base_lr = 0.02
    stage_lengths = (16, 16, 16, 16)

    # Count how many stage boundaries this epoch has already reached.
    boundary = 0
    stages_done = 0
    for length in stage_lengths:
        boundary += length
        if epoch >= boundary:
            stages_done += 1
    lr = base_lr / (2 ** stages_done)

    # Position inside the 4-epoch cycle; remainder 0 leaves the rate untouched.
    cycle_scale = {1: 1/2, 2: 1/4, 3: 1/8}
    lr *= cycle_scale.get(epoch % 4, 1)

    print('Learning rate: ', lr)
    return lr

# Hand lr_schedule to Keras so the learning rate is set from it each epoch.
lr_scheduler = LearningRateScheduler(lr_schedule)

callbacks = [lr_scheduler]

# Train for 64 epochs (epoch indices 0..63, so the final `else` branch of lr_schedule is
# never reached). The result is kept in `hst`; its `.history` is read in the next cell.
# NOTE(review): use_multiprocessing/workers are forwarded to the library's train();
# how they are used is not visible here.
hst = wd2v.train(epochs=64, verbose=2,
           use_multiprocessing=True, workers=4,
           callbacks=callbacks)

In [None]:
# Per-epoch logs recorded during training; indexed by "loss", "acc" and "lr" in the plotting cell below.
hst_history = hst.history

In [None]:
# Plot the training curves: loss, accuracy and learning rate per epoch.
epochs = range(len(hst_history["loss"]))

# History key names differ across Keras versions ("acc" vs "accuracy", "lr" vs
# "learning_rate"); use whichever is present instead of failing with a KeyError.
acc_key = next(key for key in ("acc", "accuracy") if key in hst_history)
lr_key = next(key for key in ("lr", "learning_rate") if key in hst_history)

fig, ax = plt.subplots(1, 3, figsize=(20,5))
panels = [
    ("loss", "loss", "Train loss"),
    ("acc", acc_key, "accuracy"),
    ("learning rate", lr_key, "learning rate"),
]
for axis, (title, key, label) in zip(ax, panels):
    axis.set_title(title)
    axis.plot(epochs, hst_history[key], label=label)
    axis.set_xlabel("epoch")
    axis.legend()

In [None]:
# Persist both weight matrices to the working directory
# (np.save appends the .npy suffix: wgt_col.npy / wgt_row.npy).
np.save('wgt_col', wd2v.wgt_col)
np.save('wgt_row', wd2v.wgt_row)

In [None]:
# List the working directory (the weight files are saved here by the previous cell).
os.listdir('.')

In [None]:
# Re-read the column-side weights after training.
wgt_prod = wd2v.wgt_col
# Euclidean (L2) norm of every row of the weight matrix.
print(np.sqrt(np.square(wgt_prod).sum(axis=1)))
print(wgt_prod.shape)
# Keep only the first five columns; the next cell pair-plots this frame.
df = pd.DataFrame(wgt_prod[:,:5])
#df['cls'] = ['ph'] + ['c'+str(ii) for ii in cls_prod]
df.head()

In [None]:
# Pair plot of the five columns kept in `df` (column-side weights), with histograms on the diagonal.
sns.pairplot(df, markers='.', height=3.5, diag_kind='hist')

In [None]:
# Re-read the row-side weights after training.
wgt_user = wd2v.wgt_row
# Euclidean (L2) norm of every row of the weight matrix.
print(np.sqrt(np.square(wgt_user).sum(axis=1)))
print(wgt_user.shape)
# Keep only the first five columns; the next cell pair-plots this frame.
df = pd.DataFrame(wgt_user[:,:5])
#df['cls'] = ['c'+str(ii) for ii in cls_user]
df.head()

In [None]:
# Pair plot of the five columns kept in `df` (row-side weights), with histograms on the diagonal.
sns.pairplot(df, markers='.', height=3.5, diag_kind='hist')

In [None]:
# df1 = pd.DataFrame(wgt_prod)
# df1['cls'] = ['ph'] + ['c'+str(ii) for ii in cls_prod]
# df2 = pd.DataFrame(wgt_user)
# df2['cls'] = ['r'+str(ii) for ii in cls_user]
# df = pd.concat([df1, df2], axis=0)
# df.head()

# sns.pairplot(df, markers=['.']*8+['s']*7, hue='cls', height=3.5, diag_kind='hist')

# Get Similarity

In [None]:
# Similarity helper exposed by the trained wrapper; its type comes from the library (not visible here).
sim = wd2v.sim
sim

In [None]:
# Dictionary for the row side; its token2id mapping is used for the query lookup below.
sim.row_dic

In [None]:
# Name of anime 32281, the item used as the similarity query below.
anime_csv_dic[32281]

In [None]:
# Translate the token 'anime_id-32281' to its id via row_dic.token2id, then pick the
# entry at that position of sim.sim_row.index.
# NOTE(review): what sim.sim_row.index holds is defined by the library — confirm.
query = sim.sim_row.index[sim.row_dic.token2id['anime_id-32281']]
query

In [None]:
# Presumably the 10 best-matching column-side entries for `query` — confirm against the library.
sim.get_sim_bycol(query, num_best=10)

In [None]:
# Presumably the 10 best-matching row-side entries for `query` — confirm against the library.
sim.get_sim_byrow(query, num_best=10)