In [None]:
!pip install git+https://github.com/darecophoenixx/wordroid.sblo.jp

In [None]:
%matplotlib inline
from IPython.display import SVG
from keras.utils.vis_utils import model_to_dot

import os.path
import sys
import re
import itertools
import csv
import datetime
import pickle
import random
from collections import defaultdict, Counter
import gc

import matplotlib.pyplot as plt
from matplotlib.ticker import NullFormatter
import seaborn as sns
import pandas as pd
import numpy as np
import scipy
import gensim
from sklearn.metrics import f1_score, classification_report, confusion_matrix, log_loss
from sklearn.model_selection import train_test_split
import gensim
from keras.preprocessing.sequence import skipgrams
import tensorflow as tf

In [None]:
def hexbin(x, y, color, **kwargs):
    """Draw a hexagonal-bin plot of ``x`` against ``y``.

    The colormap is a light seaborn palette derived from ``color``; any
    further keyword arguments are forwarded to ``plt.hexbin``.
    """
    plt.hexbin(x, y, cmap=sns.light_palette(color, as_cmap=True), **kwargs)
def scatter(x, y, color=None, **kwargs):
    """Draw a scatter plot of ``x`` against ``y`` with small dot markers.

    Parameters
    ----------
    x, y : array-like
        Point coordinates.
    color : optional
        Colour forwarded to ``plt.scatter``. It used to be accepted but
        silently dropped, so every call drew in the default colour.
    **kwargs
        Extra keyword arguments forwarded to ``plt.scatter``. ``marker``
        defaults to ``'.'`` when the caller does not supply one.
    """
    # Keep the original '.' marker as a default without clashing with an explicit one.
    kwargs.setdefault('marker', '.')
    plt.scatter(x, y, color=color, **kwargs)

# Load data
---

In [None]:
ls -la ../input

In [None]:
dir_data_src = '../input/anime-recommendations-database'
os.listdir(dir_data_src)

In [None]:
dir_wd2v_p_src = '../input/anime-recommendations-wd2v-preparation-no-genre'
os.listdir(dir_wd2v_p_src)

In [None]:
#dir_wd2v_feat_src = '../input/anime-recommendations-wd2v-train'
dir_wd2v_feat_src = '../input/fork-of-wd2v-train-omit-no-rated-no-genre'
os.listdir(dir_wd2v_feat_src)

In [None]:
# Read the anime catalogue from the dataset directory.
anime_csv = pd.read_csv(os.path.join(dir_data_src, 'anime.csv'))
print(anime_csv.shape)
# Use the anime_id values as the row index (the column itself is kept) so
# that rows can be selected with anime_csv.loc[<anime_id>].
anime_csv.index = anime_csv.anime_id.values
anime_csv.head()

In [None]:
# Distinct genre tokens found in the catalogue, sorted.
# Every genre cell is stringified, stripped of spaces and split on commas.
# NOTE(review): a missing genre is stringified as well, so it would add the
# token 'nan' — confirm that this is intended.
genre_tokens = set()
for raw_genre in anime_csv.genre.values:
    genre_tokens.update(str(raw_genre).replace(' ', '').split(','))
genre = sorted(genre_tokens)

In [None]:
# Lookup table anime_id -> title, built from the two catalogue columns in row order.
anime_csv_dic = dict(zip(anime_csv.anime_id.values, anime_csv.name.values))

In [None]:
rating_csv = pd.read_csv(os.path.join(dir_data_src, 'rating.csv'))
print(rating_csv.shape)
rating_csv.head()

In [None]:
# Keep only the rows whose rating is not -1.
rating_csv2 = rating_csv.loc[rating_csv.rating.values != -1]
# Print the shape explicitly: as a bare expression in the middle of the cell
# it was evaluated but never displayed.
print(rating_csv2.shape)
rating_csv2.head(10)

In [None]:
# Map every distinct user_id in the ratings table to its 'user_id-<id>' token.
user_dic = {uid: 'user_id-' + str(uid) for uid in np.unique(rating_csv.user_id.values)}

len(user_dic)

# Get similarity
---

In [None]:
from feature_eng.neg_smpl3 import get_sim, WordAndDoc2vec, get_sim

In [None]:
# All preparation artefacts are read from the wd2v preparation directory.
src_dir = dir_wd2v_p_src

# Sparse corpus matrix saved in .npz format.
corpus_csr = scipy.sparse.load_npz(os.path.join(src_dir, 'corpus_csr.npz'))
# The file 'word_dic' is loaded as the user dictionary and 'doc_dic' as the anime dictionary.
dic_user = gensim.corpora.dictionary.Dictionary.load(os.path.join(src_dir, 'word_dic'))
dic_anime = gensim.corpora.dictionary.Dictionary.load(os.path.join(src_dir, 'doc_dic'))

In [None]:
wgt_row = np.load(os.path.join(dir_wd2v_feat_src, 'wgt_row.npy'))
wgt_row.shape

In [None]:
wgt_col = np.load(os.path.join(dir_wd2v_feat_src, 'wgt_col.npy'))
wgt_col.shape

In [None]:
# Build the similarity helper from the row weights + anime dictionary and the
# column weights + user dictionary.
# NOTE(review): rows presumably correspond to anime and columns to users —
# confirm against feature_eng.neg_smpl3.get_sim.
sim = get_sim(wgt_row, dic_anime, wgt_col, dic_user)
sim

# row side

In [None]:
# Plot frame: the first (up to) three columns of the row-side index, named c0, c1, ...
coords = sim.sim_row.index[:, :3]
df = pd.DataFrame(coords, columns=['c' + str(j) for j in range(coords.shape[1])])
print(df.shape)
# Every row starts with the default class label.
df['cls'] = '*'
df.head()

In [None]:
# Add one extra point per 'genre-*' column token, labelled with the genre name.
# The one-row frames are collected first and concatenated once, instead of
# re-copying the whole frame on every match.
genre_rows = []
for ii in sim.col_dic.keys():
    token = sim.col_dic[ii]
    if re.match('^genre-', token):
        d = pd.DataFrame(sim.sim_col.index[[ii], :3])
        d['cls'] = re.sub('^genre-', '', token)
        d.columns = df.columns
        genre_rows.append(d)

# Leave df untouched when no genre token exists.
if genre_rows:
    df = pd.concat([df] + genre_rows, axis=0, ignore_index=True)

In [None]:
df.head()

In [None]:
df.tail()

In [None]:
np.unique(df.cls.values).shape

In [None]:
sns.pairplot(df, markers=['.']+['s']*(np.unique(df.cls.values).shape[0]-1), height=3.5, hue='cls', diag_kind='hist')

In [None]:
ax = sns.lmplot(data=df, x='c0', y='c1', markers=['.'], fit_reg=False, height=8)

# Write the genre name next to every 'genre-*' column token, positioned at
# the first two coordinates of its column-side vector.
for col_id in sim.col_dic.keys():
    token = sim.col_dic[col_id]
    if re.match('^genre-', token) is None:
        continue
    plt.annotate(
        re.sub('^genre-', '', token),
        xy=(sim.sim_col.index[col_id, 0], sim.sim_col.index[col_id, 1]),
    )

In [None]:
plt.figure(figsize=(8,8))
ax = plt.hexbin(df.c0, df.c1, cmap='Reds')

# Overlay the genre name of every 'genre-*' column token at the first two
# coordinates of its column-side vector.
for col_id in sim.col_dic.keys():
    token = sim.col_dic[col_id]
    if re.match('^genre-', token) is None:
        continue
    plt.annotate(
        re.sub('^genre-', '', token),
        xy=(sim.sim_col.index[col_id, 0], sim.sim_col.index[col_id, 1]),
    )

## Gundam

In [None]:
# Fresh plot frame: the first (up to) three columns of the row-side index, named c0, c1, ...
coords = sim.sim_row.index[:, :3]
df = pd.DataFrame(coords, columns=['c' + str(j) for j in range(coords.shape[1])])
print(df.shape)
# Every row starts with the default class label.
df['cls'] = '*'
df.head()

In [None]:
anime_csv.loc[[10581, 10808, 1090]]

In [None]:
# Re-label every row whose title starts with 'Mobile Suit Gundam'.
for ii in sim.row_dic.keys():
    # Row tokens look like 'anime_id-<id>'; strip the prefix to get the catalogue id.
    anime_id = int(re.sub('^anime_id-', '', sim.row_dic[ii]))
    title = anime_csv_dic[anime_id]
    if re.match('^Mobile Suit Gundam', title):
        print(ii, anime_id, title)
        # Set the single label cell directly instead of copying the row,
        # mutating the copy and writing the whole (object-dtype) row back.
        df.loc[ii, 'cls'] = 'Mobile Suit Gundam'
        print(df.loc[ii])

In [None]:
# Pin the class order so that '.' always marks the background rows and 's'
# the Gundam titles, independent of which class happens to appear first in df.
sns.pairplot(df, markers=['.', 's'], height=3.5, hue='cls', hue_order=['*', 'Mobile Suit Gundam'], diag_kind='hist')

# Similar Anime to 'Mobile Suit Gundam'
---

In [None]:
[ee for ee in anime_csv_dic.values() if re.match('^Mobile Suit Gundam', ee)]

In [None]:
anime_csv.query("name=='Mobile Suit Gundam'")

In [None]:
def get_animeid(name):
    """Return the ``'anime_id-<id>'`` token of the anime titled exactly ``name``.

    Parameters
    ----------
    name : str
        Exact title, compared against the ``name`` column of ``anime_csv``.

    Returns
    -------
    str
        ``'anime_id-'`` followed by the ``anime_id`` of the first matching row.

    Raises
    ------
    IndexError
        If no row of ``anime_csv`` carries that title.
    """
    matches = anime_csv.loc[anime_csv['name'] == name, 'anime_id'].values
    if len(matches) == 0:
        # Same exception type as the bare ``[0]`` lookup raised before, but
        # the message now says which title was not found.
        raise IndexError('no anime named {!r} in anime_csv'.format(name))
    return 'anime_id-' + str(matches[0])

get_animeid('Mobile Suit Gundam')

In [None]:
# Row-side vector of the query title.
query = sim.sim_row.index[sim.row_dic.token2id[get_animeid('Mobile Suit Gundam')]]

# Row entries returned for the query (num_best=30); the 'anime_id-' prefix is
# stripped from each token to obtain the catalogue id.
anime_id_list = []
sim_list = []
for token, wgt in sim.get_sim_byrow(query, num_best=30):
    anime_id_list.append(int(re.sub('^anime_id-', '', token)))
    sim_list.append(wgt)

# Catalogue rows in result order, with the similarity score attached.
df_show = anime_csv.loc[anime_id_list].copy()
df_show['similarity'] = sim_list
df_show

# Saenai Heroine no Sodatekata
---

In [None]:
anime_csv.query('name.str.contains("Saenai")', engine='python')

In [None]:
query = sim.sim_row.index[sim.row_dic.token2id[get_animeid('Saenai Heroine no Sodatekata')]]
query

In [None]:
sim.get_sim_bycol(query, num_best=30)

## Anime rated by user_id-72078

In [None]:
user_id = 72078
pd.merge(rating_csv2.query('user_id==@user_id'), anime_csv, on='anime_id')

In [None]:
animeRated = rating_csv2.query('user_id==@user_id').anime_id.values
animeRated

In [None]:
# Fresh plot frame: the first (up to) three columns of the row-side index, named c0, c1, ...
coords = sim.sim_row.index[:, :3]
df = pd.DataFrame(coords, columns=['c' + str(j) for j in range(coords.shape[1])])
print(df.shape)
# Every row starts with the default class label.
df['cls'] = '*'
df.head()

In [None]:
# Mark every row whose anime id was rated by the selected user.
# A set gives constant-time membership tests; testing against the id array
# directly rescans the whole array once per row.
rated_ids = set(animeRated.tolist())
cl = ['rated' if int(re.sub('^anime_id-', '', ee)) in rated_ids else '*' for ee in list(sim.row_dic.values())]
df['cls'] = cl
df.head()

In [None]:
query = sim.sim_col.index[sim.col_dic.token2id['user_id-{}'.format(user_id)]]
query

In [None]:
# One-row description of the user point: the first three components of the
# query vector as c0..c2, plus the class label 'user'.
user_point = {'c' + str(axis): value for axis, value in itertools.islice(enumerate(query), 3)}
user_point['cls'] = 'user'
s = pd.Series(user_point, name='user')
s

In [None]:
# DataFrame.append was removed in pandas 2.0; add the user point as a one-row
# frame via pd.concat instead. The row keeps the Series name ('user') as its
# index label, and infer_objects() turns the transposed object columns back
# into numeric ones so c0..c2 stay plottable.
df = pd.concat([df, s.to_frame().T.infer_objects()])
df.tail()

In [None]:
sns.pairplot(df, markers=['.', 'o', 's'], height=3.5, hue='cls', hue_order=['*','rated','user'], diag_kind='hist')

## Recommended anime

In [None]:
# Row entries returned for the user's vector (num_best=30). No filter on
# animeRated is applied, so titles the user already rated stay in the list.
matches = list(sim.get_sim_byrow(query, num_best=30))
anime_id_list = [int(re.sub('^anime_id-', '', token)) for token, score in matches]
sim_list = [score for token, score in matches]

# Catalogue rows in result order, with the similarity score attached.
df_show = anime_csv.loc[anime_id_list].copy()
df_show['similarity'] = sim_list
df_show

# Neon Genesis Evangelion
---

In [None]:
anime_csv.query('name.str.contains("Evangelion")', engine='python')

In [None]:
query = sim.sim_row.index[sim.row_dic.token2id[get_animeid('Neon Genesis Evangelion')]]
query

In [None]:
sim.get_sim_bycol(query, num_best=30)

## Anime rated by user_id-38534

In [None]:
user_id = 38534
pd.merge(rating_csv2.query('user_id==@user_id'), anime_csv, on='anime_id')

In [None]:
animeRated = rating_csv2.query('user_id==@user_id').anime_id.values
animeRated

In [None]:
# Fresh plot frame: the first (up to) three columns of the row-side index, named c0, c1, ...
coords = sim.sim_row.index[:, :3]
df = pd.DataFrame(coords, columns=['c' + str(j) for j in range(coords.shape[1])])
print(df.shape)
# Every row starts with the default class label.
df['cls'] = '*'
df.head()

In [None]:
# Mark every row whose anime id was rated by the selected user.
# A set gives constant-time membership tests; testing against the id array
# directly rescans the whole array once per row.
rated_ids = set(animeRated.tolist())
cl = ['rated' if int(re.sub('^anime_id-', '', ee)) in rated_ids else '*' for ee in list(sim.row_dic.values())]
df['cls'] = cl
df.head()

In [None]:
query = sim.sim_col.index[sim.col_dic.token2id['user_id-{}'.format(user_id)]]
query

In [None]:
# One-row description of the user point: the first three components of the
# query vector as c0..c2, plus the class label 'user'.
user_point = {'c' + str(axis): value for axis, value in itertools.islice(enumerate(query), 3)}
user_point['cls'] = 'user'
s = pd.Series(user_point, name='user')
s

In [None]:
# DataFrame.append was removed in pandas 2.0; add the user point as a one-row
# frame via pd.concat instead. The row keeps the Series name ('user') as its
# index label, and infer_objects() turns the transposed object columns back
# into numeric ones so c0..c2 stay plottable.
df = pd.concat([df, s.to_frame().T.infer_objects()])
df.tail()

In [None]:
sns.pairplot(df, markers=['.', 'o', 's'], height=3.5, hue='cls', hue_order=['*','rated','user'], diag_kind='hist')

## Recommended anime

In [None]:
# Row entries returned for the user's vector (num_best=30). No filter on
# animeRated is applied, so titles the user already rated stay in the list.
matches = list(sim.get_sim_byrow(query, num_best=30))
anime_id_list = [int(re.sub('^anime_id-', '', token)) for token, score in matches]
sim_list = [score for token, score in matches]

# Catalogue rows in result order, with the similarity score attached.
df_show = anime_csv.loc[anime_id_list].copy()
df_show['similarity'] = sim_list
df_show

# Bishoujo Senshi Sailor Moon
---

In [None]:
anime_csv.query('name.str.contains("Bishoujo Senshi Sailor Moon")', engine='python')

In [None]:
query = sim.sim_row.index[sim.row_dic.token2id[get_animeid('Bishoujo Senshi Sailor Moon')]]
query

In [None]:
sim.get_sim_bycol(query, num_best=30)

## Anime rated by user_id-51745

In [None]:
user_id = 51745
pd.merge(rating_csv2.query('user_id==@user_id'), anime_csv, on='anime_id')

In [None]:
animeRated = rating_csv2.query('user_id==@user_id').anime_id.values
animeRated

In [None]:
# Fresh plot frame: the first (up to) three columns of the row-side index, named c0, c1, ...
coords = sim.sim_row.index[:, :3]
df = pd.DataFrame(coords, columns=['c' + str(j) for j in range(coords.shape[1])])
print(df.shape)
# Every row starts with the default class label.
df['cls'] = '*'

In [None]:
# Mark every row whose anime id was rated by the selected user.
# A set gives constant-time membership tests; testing against the id array
# directly rescans the whole array once per row.
rated_ids = set(animeRated.tolist())
cl = ['rated' if int(re.sub('^anime_id-', '', ee)) in rated_ids else '*' for ee in list(sim.row_dic.values())]
df['cls'] = cl

In [None]:
query = sim.sim_col.index[sim.col_dic.token2id['user_id-{}'.format(user_id)]]
query

In [None]:
# One-row description of the user point: the first three components of the
# query vector as c0..c2, plus the class label 'user'.
user_point = {'c' + str(axis): value for axis, value in itertools.islice(enumerate(query), 3)}
user_point['cls'] = 'user'
s = pd.Series(user_point, name='user')
s

In [None]:
# DataFrame.append was removed in pandas 2.0; add the user point as a one-row
# frame via pd.concat instead. The row keeps the Series name ('user') as its
# index label, and infer_objects() turns the transposed object columns back
# into numeric ones so c0..c2 stay plottable.
df = pd.concat([df, s.to_frame().T.infer_objects()])
df.tail()

In [None]:
sns.pairplot(df, markers=['.', 'o', 's'], height=3.5, hue='cls', hue_order=['*','rated','user'], diag_kind='hist')

## Recommended anime

In [None]:
# Row entries returned for the user's vector (num_best=30). No filter on
# animeRated is applied, so titles the user already rated stay in the list.
matches = list(sim.get_sim_byrow(query, num_best=30))
anime_id_list = [int(re.sub('^anime_id-', '', token)) for token, score in matches]
sim_list = [score for token, score in matches]

# Catalogue rows in result order, with the similarity score attached.
df_show = anime_csv.loc[anime_id_list].copy()
df_show['similarity'] = sim_list
df_show

# Anime rated by user_id=1
---

In [None]:
user_id = 1
pd.merge(rating_csv2.query('user_id==@user_id'), anime_csv, on='anime_id')

In [None]:
animeRated = rating_csv2.query('user_id==@user_id').anime_id.values
animeRated

In [None]:
# Fresh plot frame: the first (up to) three columns of the row-side index, named c0, c1, ...
coords = sim.sim_row.index[:, :3]
df = pd.DataFrame(coords, columns=['c' + str(j) for j in range(coords.shape[1])])
print(df.shape)
# Every row starts with the default class label.
df['cls'] = '*'
df.head()

In [None]:
# Mark every row whose anime id was rated by the selected user.
# A set gives constant-time membership tests; testing against the id array
# directly rescans the whole array once per row.
rated_ids = set(animeRated.tolist())
cl = ['rated' if int(re.sub('^anime_id-', '', ee)) in rated_ids else '*' for ee in list(sim.row_dic.values())]
df['cls'] = cl

In [None]:
query = sim.sim_col.index[sim.col_dic.token2id['user_id-{}'.format(user_id)]]
query

In [None]:
# One-row description of the user point: the first three components of the
# query vector as c0..c2, plus the class label 'user'.
user_point = {'c' + str(axis): value for axis, value in itertools.islice(enumerate(query), 3)}
user_point['cls'] = 'user'
s = pd.Series(user_point, name='user')
s

In [None]:
# DataFrame.append was removed in pandas 2.0; add the user point as a one-row
# frame via pd.concat instead. The row keeps the Series name ('user') as its
# index label, and infer_objects() turns the transposed object columns back
# into numeric ones so c0..c2 stay plottable.
df = pd.concat([df, s.to_frame().T.infer_objects()])
df.tail()

In [None]:
# Pin the class order, as the other per-user pairplots in this notebook do.
# Without hue_order the marker assigned to each class depends on which class
# appears first in df, so '.' could end up on the rated titles.
sns.pairplot(df, markers=['.', 'o', 's'], height=3.5, hue='cls', hue_order=['*','rated','user'], diag_kind='hist')

## Recommended anime for user_id=1

In [None]:
# Row entries returned for the user's vector (num_best=30). No filter on
# animeRated is applied, so titles the user already rated stay in the list.
matches = list(sim.get_sim_byrow(query, num_best=30))
anime_id_list = [int(re.sub('^anime_id-', '', token)) for token, score in matches]
sim_list = [score for token, score in matches]

# Catalogue rows in result order, with the similarity score attached.
df_show = anime_csv.loc[anime_id_list].copy()
df_show['similarity'] = sim_list
df_show

In [None]:
# Score the query against every row entry once (num_best=None) and reuse the
# scores for all three statistics, instead of repeating the scan three times.
all_sims = [pair[1] for pair in sim.get_sim_byrow(query, num_best=None)]
(
    np.min(all_sims),
    np.max(all_sims),
    np.mean(all_sims),
)

In [None]:
# Distribution of the query's similarity scores over all row entries (num_best=None).
# NOTE(review): sns.distplot is deprecated in recent seaborn releases in favour
# of histplot/displot — switch once the installed seaborn version is confirmed.
sns.distplot(list(zip(*sim.get_sim_byrow(query, num_best=None)))[1])

## Similar users to user_id=1

In [None]:
sim.get_sim_bycol(query, num_best=10)