In [72]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
import pprint

import requests
import json
from difflib import get_close_matches
from difflib import SequenceMatcher

from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics.pairwise import pairwise_distances
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.decomposition import PCA

from scipy.spatial.distance import cosine

plt.style.use('seaborn')

In [2]:
genres_item_matrix = pd.read_csv('data/genres_item_matrix.csv', index_col='id')
tag_item_matrix = pd.read_csv('data/tag_item_matrix.csv', index_col='id')
tag_rank_item_matrix = pd.read_csv('data/tag_rank_item_matrix.csv', index_col='id')
title_df = pd.read_csv('data/title_df.csv', index_col='id')

In [3]:
binary_df = pd.concat([genres_item_matrix, tag_item_matrix], axis=1)
binary_df.head()

Unnamed: 0_level_0,action,adventure,comedy,drama,ecchi,fantasy,horror,mahou_shoujo,mecha,music,...,witch,work,wrestling,writing,wuxia,yakuza,yandere,youkai,yuri,zombie
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10161,1,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
99726,0,1,1,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
98526,1,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
966,0,0,1,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4876,1,1,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [21]:
def get_anime_id(_user_preferred_title):
    id_list = []
    anime_id = title_df[title_df['userPreferred'].isin([_user_preferred_title])].index
    id_list.append(anime_id)
    
    return id_list[0].values[0]


def get_top_n_recommendations(anime_id, dataframe, similarity_matrix, n=5):
    positional_idx = dataframe.index.get_loc(anime_id)
    
    top_n = np.argsort(similarity_matrix[positional_idx,:])[-n-1:-1]
    recom_titles = []
    for idx, row in title_df.iloc[top_n,:].iterrows():
        if type(row['english']) != float:
            recom_titles.append(row['english'])
        else:
            recom_titles.append(row['userPreferred'])
    
    return recom_titles


        
def get_title_from_id(_id):
    '''Searches the title dataframe based on an anime id and tries to return the english title. 
    If an english title is not available, the "user preferred" is give. '''
    title = None
    if pd.isna(title_df.loc[title_df.index == _id, 'english']).values[0]:
        title = title_df.loc[title_df.index == _id, 'userPreferred'].values[0]
    else:
        title = title_df.loc[title_df.index == _id, 'english'].values[0]
    
    return title


def get_user_preferred(initial_search):
    '''Search the AniList API for a show based on a query.
    
    Returns the user preferred title 
    '''
    query = '''
    query ($search: String) {
      Media (type: ANIME, search: $search) {
        id
        title {
          romaji
          english
          native
          userPreferred
        }
      }
    }
    '''
    variables = {'search': initial_search}

    url = 'https://graphql.anilist.co'
    response = requests.post(url, 
                             json={'query': query, 
                                   'variables': variables})
    user_preferred_title = response.json()['data']['Media']['title']['userPreferred']

    return user_preferred_title


def view_features(search_term, df):
    _id = title_df[title_df['userPreferred'] == search_term].index
    _df = df.loc[_id,:]

    return list(_df.loc[:, (_df != 0).any(axis=0)].columns)



def view_features_from_id(_id, _df):
    single_show = _df.loc[_id,:]
    
    return set(single_show[single_show != 0].index)



def get_common_tags(list_of_ids):
    list_of_tags = [view_features_from_id(_id, binary_df) for _id in list_of_ids]

    return set.intersection(*list_of_tags)

In [13]:
rank_df = pd.concat([genres_item_matrix, tag_rank_item_matrix], axis=1)
rank_df.head()

Unnamed: 0_level_0,action,adventure,comedy,drama,ecchi,fantasy,horror,mahou_shoujo,mecha,music,...,witch,work,wrestling,writing,wuxia,yakuza,yandere,youkai,yuri,zombie
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10161,1,0,0,1,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.44,0.0,0.0,0.0
99726,0,1,1,0,0,1,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
98526,1,0,0,1,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
966,0,0,1,0,1,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4876,1,1,0,1,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [14]:
X = rank_df.to_numpy()

scaler = RobustScaler()
X_scaled = scaler.fit_transform(X)


pca = PCA(n_components=71)
X_pca = pca.fit_transform(X_scaled)

In [15]:
df_rank_pca = pd.DataFrame(X_pca, index=rank_df.index)
df_rank_pca

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,61,62,63,64,65,66,67,68,69,70
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10161,0.902054,-0.308564,0.735503,0.421122,0.017534,0.030490,0.243384,0.231923,-0.359569,0.040510,...,-0.182485,0.021984,0.026300,0.047043,0.162649,-0.045533,0.059506,0.031714,-0.213018,-0.090854
99726,-0.553772,0.919423,-0.563405,0.693138,0.492470,0.506488,0.254710,-0.667565,0.478516,0.084259,...,0.008837,-0.093183,0.105229,-0.174763,0.241618,-0.396269,-0.032945,0.609413,0.396880,-0.088249
98526,0.562031,-0.360142,0.285042,0.437012,-0.341345,-0.349306,0.202698,0.012140,-0.634209,-0.074448,...,0.146506,0.074305,0.103269,-0.032556,0.052025,0.068706,-0.066901,-0.076074,0.044570,0.147230
966,-0.947915,0.433247,0.318086,-0.000632,-0.138365,-0.067705,0.449668,-0.391173,0.295402,-0.160393,...,-0.114551,-0.164202,-0.152159,-0.224784,0.392628,0.201551,-0.078412,-0.015672,-0.128488,-0.108989
4876,0.893532,0.062809,-0.076532,0.672134,0.141467,-0.872544,0.062129,-0.331749,-0.450992,0.038073,...,0.097272,-0.068849,0.138212,0.229516,-0.056098,-0.175754,-0.032315,-0.030032,-0.076957,-0.202100
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99586,-0.492690,-0.461529,0.077596,0.035290,-0.218350,0.116027,0.676679,-0.626119,0.378083,-0.271106,...,-0.022025,0.051464,-0.180136,-0.059431,0.047162,-0.093804,-0.041771,0.092904,-0.082431,0.009839
99916,-0.610720,-0.477351,0.175739,0.605864,-0.083200,0.536619,0.216233,-1.029686,0.625074,-0.133925,...,-0.207554,0.055578,0.098172,-0.230540,-0.256167,0.115844,0.001168,0.105983,0.137175,-0.071298
101283,0.072873,-0.510824,-0.055954,0.054418,-0.351963,-0.368537,-0.174322,0.523298,0.773766,0.113512,...,-0.229846,-0.143228,-0.003926,0.138131,0.136038,-0.042621,0.328266,0.070186,0.024796,0.413676
101633,-0.962752,-0.226620,0.365228,0.500737,0.157103,-0.147362,0.896236,-0.327969,-0.184341,1.099084,...,0.349648,-0.266573,0.187932,-0.312160,0.053678,-0.014947,-0.093748,-0.044092,0.239160,-0.076634


In [16]:
sim_mat = cosine_similarity(df_rank_pca.values)

In [24]:
def get_recommendations(search_term):
    user_preferred_title = get_user_preferred(search_term)
    _id = get_anime_id(user_preferred_title)
    print('Getting recommendations for: ', get_title_from_id(_id))
    rec_list = get_top_n_recommendations(_id, 
                                         df_rank_pca, 
                                         sim_mat, 
                                         n=5)
    return rec_list



In [25]:
get_recommendations("Akira")

Getting recommendations for:  Akira


['Casshern Sins',
 'Tokyo Ghoul Root A',
 'Texhnolyze',
 'Deadman Wonderland',
 'Towanoquon: The Complicity of Dreams']

In [27]:
get_recommendations('Fullmetal')

Getting recommendations for:  Fullmetal Alchemist: Brotherhood


['Magi: The Kingdom of Magic',
 'Plunderer',
 'One Piece',
 'The Heroic Legend of Arslan',
 'Fullmetal Alchemist']

## Recommendations for Sample User Profiles

In [32]:
title_list = ['Tokyo Ghoul', 'Ergo Proxy', 'Fullmetal Alchemist: Brotherhood']
id_list = []
for title in title_list:
    user_preferred_title = get_user_preferred(title)
    _id = get_anime_id(user_preferred_title)
    id_list.append(_id)
    print(user_preferred_title, _id)

Tokyo Ghoul 20605
Ergo Proxy 790
Hagane no Renkinjutsushi: Fullmetal Alchemist 5114


In [33]:
df_rank_pca.loc[id_list,:]

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,61,62,63,64,65,66,67,68,69,70
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
20605,0.73238,-0.349145,0.373274,1.074398,-1.025918,-1.118069,-0.116298,0.741741,0.743977,0.363684,...,0.006369,0.252295,0.258936,-0.073041,-0.058708,0.018604,-0.152504,0.288727,-0.263982,-0.110379
790,0.447823,-0.446803,0.50913,0.001727,0.220747,0.093611,0.02123,0.61172,0.849213,0.196195,...,-0.313142,0.051307,-0.098574,0.104363,0.060498,-0.103056,0.206516,0.025408,-0.224876,0.116717
5114,0.767085,1.263462,-0.333712,1.108493,-0.025172,-0.44529,0.412726,0.245381,-0.500776,0.44334,...,-0.183362,-0.231497,0.012082,0.460317,0.121774,-0.084459,0.058593,0.195755,-0.170341,0.035631


In [71]:
# get the mean vector for a list of anime
mean_vector = df_rank_pca.loc[id_list,:].mean(axis=0)
mean_vector

0     0.649096
1     0.155838
2     0.182898
3     0.728206
4    -0.276781
        ...   
66   -0.056304
67    0.037535
68    0.169963
69   -0.219733
70    0.013990
Length: 71, dtype: float64

In [55]:
# measure the cosine similarity between our mean vector and every item in our dataset
dist_list = []
for anime in df_rank_pca.iterrows():
    dist_list.append(cosine(mean_vector, anime[1]))
dist_series = pd.Series(dist_list, index=df_rank_pca.index)

In [62]:
dist_series

id
10161     0.712965
99726     1.004704
98526     0.787051
966       1.078676
4876      0.609919
            ...   
99586     1.119047
99916     1.101960
101283    0.551344
101633    1.098674
101089    1.235231
Length: 9832, dtype: float64

In [69]:
# get the ids for the 10 items closest to our mean vector
list(dist_series.sort_values()[:10].index)

[20605, 26, 30, 4981, 5114, 790, 16009, 13125, 99147, 47]

In [None]:
# exclude initial titles 


In [70]:
# get the titles for our ids
for _id in list(dist_series.sort_values()[:10].index):
    print(get_title_from_id(_id))

Tokyo Ghoul
Texhnolyze
Neon Genesis Evangelion
Casshern Sins
Fullmetal Alchemist: Brotherhood
Ergo Proxy
Sunday without God
From the New World
Attack on Titan Season 3
Akira
