In [55]:
import pymongo
from pymongo import MongoClient

import pandas as pd

import json
import pprint
pp = pprint.PrettyPrinter(indent=2)

from collections import defaultdict

from surprise import NormalPredictor, SVD, NMF, SlopeOne, CoClustering
from surprise import KNNBasic, KNNWithMeans, KNNWithZScore, KNNBaseline
from surprise import Dataset
from surprise import Reader
from surprise import evaluate
from surprise import accuracy
from surprise.model_selection import cross_validate, train_test_split

In [2]:
def db_collections(client: MongoClient):
    """Describe the layout of a MongoDB server as a JSON string.

    Args:
        client: connected client whose databases are enumerated.

    Returns:
        A JSON object (as ``str``) mapping each database name to the list of
        collection names it contains.
    """
    layout = {}
    for db_name in client.list_database_names():
        # Materialise the names so json.dumps always receives a plain list.
        layout[db_name] = list(client[db_name].list_collection_names())
    return json.dumps(layout)

In [3]:
def encode_dict(d: dict, encoding='utf-8'):
    """Return a shallow copy of ``d`` with every string value encoded to bytes.

    Args:
        d: mapping to copy; it is not modified.
        encoding: codec name passed to ``str.encode`` (default ``'utf-8'``).

    Returns:
        A new dict with the same keys, in the same order as ``d``. Values that
        are ``str`` (including ``str`` subclasses) are replaced by their
        encoded ``bytes``; every other value is carried over unchanged.
        Nested containers are not descended into.

    Raises:
        UnicodeEncodeError: if a string cannot be represented in ``encoding``.
    """
    # isinstance (rather than an exact type comparison) so str subclasses are
    # encoded too; a single pass keeps the keys in their original order.
    return {
        key: value.encode(encoding) if isinstance(value, str) else value
        for key, value in d.items()
    }

In [4]:
# MongoClient() with no arguments uses pymongo's default connection settings.
# `client` is reused by the query cells further down.
client = MongoClient()
# Show every database and its collections (as one JSON string) as a sanity
# check that the expected `kitsu` collections are present.
print(db_collections(client))

{"admin": ["system.version"], "config": ["system.sessions"], "kitsu": ["users", "anime", "library"], "local": ["startup_log"], "test": ["library", "anime", "users"]}


In [23]:
# Columns of the ratings table, in the (user, item, rating) order that is later
# handed to Surprise via df[attributes].
attributes = ['user_id', 'media_id', 'rating']

# Only library entries that actually carry a rating are useful for training.
# The projection limits each returned document to the three fields we read
# (and drops Mongo's _id) instead of transferring whole documents.
collection = client.kitsu.library
cursor = collection.find(
    {"rating": {"$ne": None}},
    {**{field: 1 for field in attributes}, '_id': 0},
)

# Drain the cursor once. Indexing by key (rather than .get) keeps the loud
# KeyError if a document is unexpectedly missing one of the fields.
docs = list(cursor)

# All three fields are numeric (see the describe() summary), so there is no
# string value to encode and the documents can be used as returned.
df = pd.DataFrame({field: [doc[field] for doc in docs] for field in attributes})
df.describe()

Unnamed: 0,user_id,media_id,rating
count,27048.0,27048.0,27048.0
mean,137.23621,4379.675281,15.975451
std,87.612007,3182.024317,3.998788
min,2.0,1.0,2.0
25%,61.0,1219.0,14.0
50%,129.0,4603.5,16.0
75%,214.0,6734.0,20.0
max,299.0,41204.0,20.0


In [42]:
collection = client.kitsu.anime

# Every anime document, reduced to its id and slug (Mongo's own _id excluded).
cursor = collection.find({}, {'id': 1, 'slug': 1, '_id': 0})

# Lookup table from anime id to slug. Each document goes through encode_dict
# first, so the slugs are stored as bytes objects rather than str.
id_slug_map = {
    entry['id']: entry['slug']
    for entry in map(encode_dict, cursor)
}

id_slug_map

{214: b'gankutsuou',
 1: b'cowboy-bebop',
 9: b'initial-d-fourth-stage',
 161: b'xenosaga-the-animation',
 17: b'texhnolyze',
 156: b'ultra-maniac-ova',
 22: b'neon-genesis-evangelion-death-rebirth',
 4: b'witch-hunter-robin',
 36: b'arc-the-lad',
 45: b'rozen-maiden',
 178: b'video-girl-ai',
 38: b'beck',
 8: b'hungry-heart-wild-striker',
 5: b'beet-the-vandel-buster',
 15: b'school-rumble',
 33: b'angelic-layer',
 28: b'rurouni-kenshin-ishinshishi-e-no-requiem',
 23: b'neon-genesis-evangelion-the-end-of-evangelion',
 25: b'ghost-in-the-shell',
 11: b'naruto',
 47: b'azumanga-daioh',
 154: b'tsubasa-chronicle',
 168: b'love-hina-christmas-special-silent-eve',
 29: b'akira',
 166: b'love-hina',
 153: b'triangle-heart-sweet-songs-forever',
 152: b'tokyo-underground',
 46: b'rozen-maiden-traumend',
 12: b'one-piece',
 30: b'hack-sign',
 35: b'appleseed-movie',
 85: b'hana-yori-dango',
 3: b'trigun',
 42: b'd-n-angel',
 175: b'speed-grapher',
 160: b'whistle',
 162: b'initial-d-first-stag

In [24]:
# Declare the rating scale for Surprise as 1-20; the describe() summary of `df`
# shows observed ratings between 2 and 20.
reader = Reader(rating_scale=(1, 20))
# load_from_df reads the columns positionally as (user, item, rating), which is
# exactly the order of `attributes`.
data = Dataset.load_from_df(df[attributes], reader)

In [56]:
# One instance of every candidate algorithm, each with its default settings.
algos = [
    make_algo()
    for make_algo in (
        KNNBasic, KNNWithMeans, KNNWithZScore, KNNBaseline,
        NormalPredictor, SVD, NMF, SlopeOne, CoClustering,
    )
]

# 5-fold cross-validation per algorithm; verbose=True makes cross_validate
# print the per-fold RMSE/MAE table, and the blank lines separate the tables.
for algo in algos:
    cross_validate(
        algo,
        data,
        measures=['RMSE', 'MAE'],
        cv=5,
        verbose=True,
        n_jobs=6,
    )
    print('\n')

Evaluating RMSE, MAE of algorithm KNNBasic on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    3.7287  3.7440  3.6794  3.6696  3.8713  3.7386  0.0721  
MAE (testset)     2.6715  2.6583  2.6517  2.6356  2.7793  2.6793  0.0513  
Fit time          0.02    0.02    0.02    0.02    0.02    0.02    0.00    
Test time         0.16    0.16    0.16    0.18    0.16    0.16    0.01    


Evaluating RMSE, MAE of algorithm KNNWithMeans on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    3.3987  3.3061  3.5155  3.4354  3.3446  3.4001  0.0728  
MAE (testset)     2.3542  2.3431  2.4428  2.3629  2.3255  2.3657  0.0405  
Fit time          0.02    0.02    0.03    0.02    0.03    0.02    0.00    
Test time         0.19    0.20    0.23    0.18    0.17    0.19    0.02    


Evaluating RMSE, MAE of algorithm KNNWithZScore on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Me

In [36]:
# Train on every rating in `data` (no hold-out split): this model is used to
# produce recommendations, not to measure accuracy.
trainset = data.build_full_trainset()
# SVD with default hyper-parameters.
algo = SVD()
# To compare recommenders, swap in another algorithm above, e.g. KNNBasic().
# fit() is the last expression, so the cell echoes the fitted algorithm object.
algo.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x253424e0>

In [37]:
# The "anti" test set holds the (user, item) pairs that have no rating in the
# trainset, i.e. the candidates a recommendation can be drawn from.
testset = trainset.build_anti_testset()

In [38]:
# Estimate a rating for every unseen pair. Each prediction unpacks as
# (uid, iid, true_r, est, details), which is what get_top_n consumes.
predictions = algo.test(testset)

In [34]:
def get_top_n(predictions, n=10):
    """Pick the ``n`` best-scoring items for every user.

    Args:
        predictions: iterable of 5-tuples ``(uid, iid, true_r, est, details)``;
            only ``uid``, ``iid`` and ``est`` are used.
        n: maximum number of items kept per user (default 10).

    Returns:
        A ``defaultdict(list)`` mapping each user id to a list of
        ``(iid, est)`` pairs ordered by ``est`` from highest to lowest and
        truncated to ``n`` entries. Items with equal estimates keep the order
        in which they appeared in ``predictions``.
    """
    by_user = defaultdict(list)

    # Bucket the (item, estimate) pairs under the user they belong to.
    for prediction in predictions:
        uid, iid, _true_r, est, _details = prediction
        by_user[uid].append((iid, est))

    # Rank each bucket by estimate, best first, and keep the leading n entries.
    for uid in by_user:
        ranked = sorted(by_user[uid], key=lambda pair: pair[1], reverse=True)
        by_user[uid] = ranked[:n]

    return by_user

In [46]:
# Keep the five highest-estimated unseen titles per user.
top_n = get_top_n(predictions, n=5)

# Print each user's recommendations as slugs, one block per user.
for uid, user_ratings in top_n.items():
    print("\nuser_id: {}".format(uid))
    for iid, _est in user_ratings:
        slug = id_slug_map[iid]
        print("> {}".format(slug))


user_id: 7
> b'steins-gate'
> b'fullmetal-alchemist-brotherhood'
> b'hotarubi-no-mori-e'
> b'steins-gate-egoistic-poriomania'
> b'attack-on-titan'

user_id: 3
> b'darker-than-black'
> b'attack-on-titan'
> b'ef-a-tale-of-memories'
> b'fate-stay-night-unlimited-blade-works'
> b'fate-zero-2nd-season'

user_id: 2
> b'code-geass-lelouch-of-the-rebellion'
> b'toradora'
> b'hellsing-ultimate'
> b'baccano-specials'
> b'the-melancholy-of-haruhi-suzumiya'

user_id: 8
> b'nausicaa-of-the-valley-of-the-wind'
> b'porco-rosso'
> b'rurouni-kenshin-tsuiokuhen'
> b'chihayafuru'
> b'kiki-s-delivery-service'

user_id: 28
> b'steins-gate'
> b'steins-gate-egoistic-poriomania'
> b'fullmetal-alchemist-brotherhood'
> b'nodame-cantabile'
> b'clannad-after-story'

user_id: 19
> b'spirited-away'
> b'howl-s-moving-castle'
> b'code-geass-lelouch-of-the-rebellion-r2'
> b'durarara'
> b'princess-mononoke'

user_id: 39
> b'paprika'
> b'code-geass-lelouch-of-the-rebellion-r2'
> b'durarara'
> b'btooom'
> b'hotarubi-no-

In [47]:
collection = client.kitsu.library

# Titles user 7 has already rated, for comparison with the recommendations
# printed for that user.
cursor = collection.find(
    {"user_id": 7, "rating": {"$ne": None}},
    {'user_id': 1, 'media_id': 1, '_id': 0},
)

# Translate each library entry's media_id into its slug and print it.
for entry in map(encode_dict, cursor):
    print(id_slug_map[entry['media_id']])

b'cowboy-bebop'
b'cowboy-bebop-tengoku-no-tobira'
b'hack-sign'
b'beck'
b'rozen-maiden'
b'shuffle'
b'last-exile'
b'mai-hime'
b'fullmetal-alchemist'
b'gunslinger-girl'
b'macross-zero'
b'spirited-away'
b'samurai-champloo'
b'planetes'
b'fate-stay-night'
b'boogiepop-phantom'
b'porco-rosso'
b'howl-s-moving-castle'
b'karin'
b'kiki-s-delivery-service'
b'laputa-castle-in-the-sky'
b'metropolis'
b'my-neighbor-totoro'
b'the-cat-returns'
b'rec'
b'pale-cocoon'
b'ergo-proxy'
b'air-gear'
b'black-lagoon'
b'kanojo-to-kanojo-no-neko'
b'pom-poko'
b'black-lagoon-the-second-barrage'
b'death-note'
b'code-geass-lelouch-of-the-rebellion'
b'rozen-maiden-ouverture'
b'paprika'
b'darker-than-black'
b'code-geass-lelouch-of-the-rebellion-r2'
b'appleseed-saga-ex-machina'
b'trigun-badlands-rumble'
b'black-lagoon-roberta-s-blood-trail'
b'eden-of-the-east'
b'eden-of-the-east-movie-i-the-king-of-eden'
b'eden-of-the-east-falling-down'
b'angel-beats'
b'eden-of-the-east-movie-ii-paradise-lost'
b'durarara'
b'black-rock-shoot