In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
import pprint as pp

import requests
import json
from difflib import get_close_matches
from difflib import SequenceMatcher

from sklearn.metrics.pairwise import cosine_similarity

from scipy.spatial.distance import cosine

# The bare 'seaborn' style sheet was renamed 'seaborn-v0_8' in Matplotlib 3.6 and the
# old name was removed later, so prefer the new name and fall back on older installs.
plt.style.use('seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn')

In [2]:
# Read every input table with the anime `id` column as its index.
# The files are read in the order listed, one per target name.
(
    genres_item_matrix,
    tag_item_matrix,
    tag_rank_item_matrix,
    title_df,
    adjusted_score_df,
) = (
    pd.read_csv(csv_path, index_col='id')
    for csv_path in (
        'data/genres_item_matrix.csv',
        'data/tag_item_matrix.csv',
        'data/tag_rank_item_matrix.csv',
        'data/title_df.csv',
        'data/adjusted_score_df.csv',
    )
)

In [3]:
# Place the genre columns and the tag columns side by side, aligned on the id index.
binary_parts = [genres_item_matrix, tag_item_matrix]
binary_df = pd.concat(binary_parts, axis='columns')
binary_df.head()

Unnamed: 0_level_0,action,adventure,comedy,drama,ecchi,fantasy,horror,mahou_shoujo,mecha,music,...,witch,work,wrestling,writing,wuxia,yakuza,yandere,youkai,yuri,zombie
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10161,1,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
99726,0,1,1,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
98526,1,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
966,0,0,1,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4876,1,1,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [4]:
# Same layout as binary_df, but built from the tag-rank matrix instead of the tag matrix.
rank_parts = [genres_item_matrix, tag_rank_item_matrix]
rank_df = pd.concat(rank_parts, axis='columns')
rank_df.head()

Unnamed: 0_level_0,action,adventure,comedy,drama,ecchi,fantasy,horror,mahou_shoujo,mecha,music,...,witch,work,wrestling,writing,wuxia,yakuza,yandere,youkai,yuri,zombie
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10161,1,0,0,1,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.44,0.0,0.0,0.0
99726,0,1,1,0,0,1,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
98526,1,0,0,1,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
966,0,0,1,0,1,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4876,1,1,0,1,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [5]:
# rank_df.to_csv('data/content_item_matrix.csv')

In [11]:
def get_anime_id(_user_preferred_title):
    '''Return the id of the show whose "userPreferred" title equals the given title.

    Parameters
    ----------
    _user_preferred_title : str
        Title matched exactly against the ``userPreferred`` column of ``title_df``.

    Returns
    -------
    The ``title_df`` index label of the first matching row.

    Raises
    ------
    IndexError
        If no row of ``title_df`` has that title.
    '''
    matching_ids = title_df[title_df['userPreferred'].isin([_user_preferred_title])].index
    if len(matching_ids) == 0:
        # Same exception type as the bare positional lookup, but with a usable message.
        raise IndexError(
            'no entry in title_df with userPreferred title {!r}'.format(_user_preferred_title)
        )

    return matching_ids.values[0]


def get_top_n_recommendations(anime_id, dataframe, similarity_matrix, n=5):
    '''Return display titles of the ``n`` shows most similar to ``anime_id``.

    Parameters
    ----------
    anime_id
        Index label of the query show in ``dataframe``.
    dataframe : pandas.DataFrame
        Frame whose index gives the row position of each show in ``similarity_matrix``.
    similarity_matrix
        2-D array indexed as ``similarity_matrix[row, :]``; row ``i`` holds the scores of
        show ``i`` against every show.
    n : int, default 5
        Number of titles to return.

    Returns
    -------
    list
        Up to ``n`` titles ordered by ascending score (best match last). The English
        title is used when present, otherwise the "userPreferred" title.
    '''
    positional_idx = dataframe.index.get_loc(anime_id)

    # Rank all shows by score (ascending) and remove the query show explicitly.
    # Slicing off the last element only works when the query has the single highest
    # score in its own row, which ties or weighted scores can break.
    ranked = np.argsort(similarity_matrix[positional_idx, :])
    ranked = ranked[ranked != positional_idx]
    top_n = ranked[max(len(ranked) - n, 0):]

    # NOTE(review): positional lookup assumes title_df rows are in the same order as
    # `dataframe` -- confirm.
    candidates = title_df.iloc[top_n, :]

    return [
        preferred if pd.isna(english) else english
        for english, preferred in zip(candidates['english'], candidates['userPreferred'])
    ]


        
def get_title_from_id(_id):
    '''Return a display title for the show with the given id.

    Looks the id up in the title dataframe and returns its English title.
    If the English title is missing, the "user preferred" title is returned instead.
    '''
    row_mask = title_df.index == _id
    english_title = title_df.loc[row_mask, 'english'].values[0]

    if pd.isna(english_title):
        return title_df.loc[row_mask, 'userPreferred'].values[0]
    return english_title


def get_user_preferred(initial_search):
    '''Search the AniList GraphQL API for an anime matching a query string.

    Parameters
    ----------
    initial_search : str
        Free-text search term sent as the ``$search`` variable.

    Returns
    -------
    The "userPreferred" title of the matched media entry.

    Raises
    ------
    requests.HTTPError
        If the API answers with an error status code.
    requests.Timeout
        If the API does not answer within the timeout.
    LookupError
        If the response contains no matching media entry.
    '''
    query = '''
    query ($search: String) {
      Media (type: ANIME, search: $search) {
        id
        title {
          romaji
          english
          native
          userPreferred
        }
      }
    }
    '''
    variables = {'search': initial_search}

    url = 'https://graphql.anilist.co'
    # Without an explicit timeout, requests waits indefinitely on a stalled connection.
    response = requests.post(url,
                             json={'query': query,
                                   'variables': variables},
                             timeout=10)
    # Fail on HTTP errors here instead of on a confusing key/None lookup below.
    response.raise_for_status()

    media = (response.json().get('data') or {}).get('Media')
    if media is None:
        raise LookupError('AniList returned no anime for search {!r}'.format(initial_search))

    return media['title']['userPreferred']


def view_features(search_term, df):
    '''List the columns of ``df`` that are non-zero for the show(s) titled ``search_term``.

    The title is matched exactly against the "userPreferred" column of the title
    dataframe; a column is listed when any matched row has a non-zero value in it.
    '''
    title_matches = title_df['userPreferred'] == search_term
    matched_rows = df.loc[title_df[title_matches].index, :]
    column_has_value = (matched_rows != 0).any(axis=0)

    return matched_rows.loc[:, column_has_value].columns.tolist()



def view_features_from_id(_id, _df):
    '''Return the set of labels whose value is non-zero in the row of ``_df`` labelled ``_id``.'''
    feature_row = _df.loc[_id, :]
    nonzero_features = feature_row[feature_row.ne(0)]

    return set(nonzero_features.index)



def get_common_tags(list_of_ids):
    '''Return the features shared by every show in ``list_of_ids``.

    Each id is looked up in ``binary_df`` via ``view_features_from_id`` and the
    resulting sets are intersected.

    Parameters
    ----------
    list_of_ids : iterable
        Index labels of the shows to compare.

    Returns
    -------
    set
        Features that are non-zero for all the given shows; an empty set when
        ``list_of_ids`` is empty.
    '''
    tag_sets = [view_features_from_id(_id, binary_df) for _id in list_of_ids]
    if not tag_sets:
        # set.intersection() needs at least one set; no shows means nothing in common.
        return set()

    return set.intersection(*tag_sets)

In [12]:
def get_recommendations(search_term, similarity_matrix=None):
    '''Search for a show by name and return titles of the five most similar shows.

    Parameters
    ----------
    search_term : str
        Free-text query passed to ``get_user_preferred``.
    similarity_matrix : optional
        Matrix passed on to ``get_top_n_recommendations``. When omitted, the
        module-level ``weighted_sim_mat`` is used.

    Returns
    -------
    list
        The result of ``get_top_n_recommendations`` for the matched show with ``n=5``.
    '''
    if similarity_matrix is None:
        # Looked up at call time: a default written in the signature is evaluated once,
        # when the function is defined, which fails if weighted_sim_mat does not exist
        # yet and goes stale if it is rebuilt later.
        similarity_matrix = weighted_sim_mat

    user_preferred_title = get_user_preferred(search_term)
    _id = get_anime_id(user_preferred_title)
    print('Getting recommendations for: ', get_title_from_id(_id))
    rec_list = get_top_n_recommendations(_id,
                                         rank_df,
                                         similarity_matrix,
                                         n=5)
    return rec_list



In [13]:
# Resolve the search string to a dataset id, then rank against the weighted matrix.
wolfs_rain_id = get_anime_id(get_user_preferred("Wolf's Rain"))
get_top_n_recommendations(wolfs_rain_id, rank_df, weighted_sim_mat, n=5)

['Tsubasa: Spring Thunder Chronicles',
 'Attack on Titan Season 2',
 'Nausicaä of the Valley of the Wind',
 "Wolf's Rain OVA",
 'Made in Abyss']

In [14]:
# Same query as the weighted run, ranked against `sim_mat` instead.
# NOTE(review): `sim_mat` is not assigned in any earlier visible cell -- confirm it is
# defined before this cell runs on a fresh kernel.
wolfs_rain_id = get_anime_id(get_user_preferred("Wolf's Rain"))
get_top_n_recommendations(wolfs_rain_id, rank_df, sim_mat, n=5)

['Wan Wan Chuushingura',
 "Wolf's Rain OVA",
 'Galilei Donna',
 'Jyu-Oh-Sei: Planet of the Beast King',
 'Final Fantasy VII: Last Order']

In [15]:
# list of titles to list of ids
# Each search string is resolved to its "userPreferred" title through the AniList API
# (one request per title), and that title is then mapped to a dataset id.
title_list = ["Wolf's Rain", "Ergo Proxy", "Texhnolyze"]
id_list = []
for title in title_list:
    user_preferred_title = get_user_preferred(title)
    id_list.append(get_anime_id(user_preferred_title))
    # Printed only after the id lookup succeeded, so the output lists resolved titles.
    print(user_preferred_title)

Wolf's Rain
Ergo Proxy
Texhnolyze


In [None]:
# Score every show against the average feature vector of the seed shows in id_list.
adjusted_scores = adjusted_score_df['adjusted_score'].round(4)

mean_vector = rank_df.loc[id_list, :].mean(axis=0)

# Compare the mean vector with each row directly (a 1 x N result) instead of appending
# it to rank_df and building the full (N+1) x (N+1) matrix just to read one row.
# This also avoids DataFrame.append (removed in pandas 2.0) and leaves any existing
# `sim_mat` untouched.
mean_sims = cosine_similarity(mean_vector.to_numpy().reshape(1, -1), rank_df.to_numpy())[0]

# NOTE(review): the positional multiply assumes adjusted_score_df rows are in the same
# order as rank_df -- confirm.
sims = mean_sims * adjusted_scores.to_numpy()

# The seed shows themselves must not be recommended back.
for _id in id_list:
    sims[rank_df.index.get_loc(_id)] = 0

In [79]:
n = 15
# argsort is ascending, so reverse it and keep the first n positions (highest score first).
top_positions = sims.argsort()[::-1][:n]
for pos in top_positions:
    print("positional index:", pos)
    # Values of title_df columns 1 and 3 for this row, one per line.
    print(*title_df.iloc[pos, [1, 3]], sep='\n')
    print('-----')

positional index: 215
Neon Genesis Evangelion
Shin Seiki Evangelion
-----
positional index: 1306
nan
Casshern Sins
-----
positional index: 115
Land of the Lustrous
Houseki no Kuni
-----
positional index: 114
Neon Genesis Evangelion: The End of Evangelion
Shin Seiki Evangelion Movie: THE END OF EVANGELION
-----
positional index: 159
Cowboy Bebop: The Movie - Knockin' on Heaven's Door
Cowboy Bebop: Tengoku no Tobira
-----
positional index: 425
nan
Wolf's Rain OVA
-----
positional index: 82
Ghost in the Shell: Stand Alone Complex 2nd GIG
Koukaku Kidoutai: Stand Alone Complex 2nd GIG
-----
positional index: 136
PSYCHO-PASS
PSYCHO-PASS
-----
positional index: 192
Ghost in the Shell
Koukaku Kidoutai
-----
positional index: 354
Akira
Akira
-----
positional index: 24
Made in Abyss
Made in Abyss
-----
positional index: 31
Cowboy Bebop
Cowboy Bebop
-----
positional index: 339
Ghost in the Shell: Stand Alone Complex - The Laughing Man
Koukaku Kidoutai: Stand Alone Complex - The Laughing Man
-----

In [22]:
# Cosine *distance* (1 - cosine similarity) between the mean vector and every show in
# the dataset; smaller values mean more similar.
features = rank_df.to_numpy(dtype=float)
target = mean_vector.to_numpy(dtype=float)
with np.errstate(divide='ignore', invalid='ignore'):
    # All-zero rows have no direction, so their distance stays NaN (0/0).
    distances = 1.0 - (features @ target) / (
        np.linalg.norm(features, axis=1) * np.linalg.norm(target)
    )
dist_series = pd.Series(distances, index=rank_df.index)

  dist = 1.0 - uv / np.sqrt(uu * vv)


In [23]:
# Display the per-id distance series.
dist_series

id
10161     0.438864
99726     0.822804
98526     0.645832
966       0.959022
4876      0.653743
            ...   
99586     0.954598
99916     0.949583
101283    0.710104
101633    0.905234
101089    0.980842
Length: 9832, dtype: float64

In [24]:
# Re-display the seed titles for reference.
title_list

["Wolf's Rain", 'Ergo Proxy', 'Texhnolyze']

In [25]:
# Rank shows by their distance to the average feature vector (smallest first)
# and print the display titles of the 30 closest ids.
for rank, _id in enumerate(dist_series.sort_values().head(30).index, start=1):
    print(rank, get_title_from_id(_id))

1 Texhnolyze
2 Ergo Proxy
3 Wolf's Rain
4 Casshern Sins
5 Neon Genesis Evangelion
6 Wolf's Rain OVA
7 DRAMAtical Murder
8 A Wind Named Amnesia
9 Harmony
10 Cowboy Bebop: The Movie - Knockin' on Heaven's Door
11 The Animatrix
12 Land of the Lustrous
13 Mardock Scramble: The Third Exhaust
14 Sky Blue
15 Ghost in the Shell: Stand Alone Complex - The Laughing Man
16 Jyu-Oh-Sei: Planet of the Beast King
17 King of Thorn
18 Ghost in the Shell
19 Neon Genesis Evangelion: The End of Evangelion
20 Akira
21 Final Fantasy: The Spirits Within
22 Shangri-La
23 Black Bullet
24 Sunday without God
25 Blame! Movie
26 Ghost in the Shell: Stand Alone Complex 2nd GIG
27 Galilei Donna
28 Grey: Digital Target
29 Danganronpa 3: The End of Hope's Peak High School - Hope Arc
30 Goku: Midnight Eye
