In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud

import requests
import json
from difflib import get_close_matches
from difflib import SequenceMatcher

from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics.pairwise import pairwise_distances

from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

plt.style.use('seaborn')

In [2]:
genres_item_matrix = pd.read_csv('data/genres_item_matrix.csv', index_col='id')
tag_item_matrix = pd.read_csv('data/tag_item_matrix.csv', index_col='id')
tag_rank_item_matrix = pd.read_csv('data/tag_rank_item_matrix.csv', index_col='id')
title_df = pd.read_csv('data/title_df.csv', index_col='id')

In [None]:
# We'll use this dataframe later for lookup
title_df.head()

In [None]:
genres_item_matrix.head()

In [None]:
tag_item_matrix.head()

In [3]:
df = pd.concat([genres_item_matrix, tag_item_matrix], axis=1)
df.head()

Unnamed: 0_level_0,action,adventure,comedy,drama,ecchi,fantasy,horror,mahou_shoujo,mecha,music,...,witch,work,wrestling,writing,wuxia,yakuza,yandere,youkai,yuri,zombie
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10161,1,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
99726,0,1,1,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
98526,1,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
966,0,0,1,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4876,1,1,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
df.tail()

In [None]:
df.info()

In [None]:
df.describe()

In [None]:
total_per_feature = df.sum(axis=0)
total_per_feature

In [None]:
total_per_show = df.sum(axis=1)
total_per_show

In [None]:
total_per_show.rename('total_features_per_show')

In [None]:
sns.boxplot(total_per_show);

This seems potentially problematic. A majority of the shows don't have very many features. How many are needed to make an acurate recommendation?

In [14]:
def count_for_n_features(series, n):
    print('{0:.1%}'.format(len(series[series <= n]) / len(series)), 
          'or', len(series[series <= n]), f'shows have {n} or fewer features describing them.')


In [None]:
count_for_n_features(total_per_show, 2)

In [None]:
count_for_n_features(total_per_show, 3)

In [None]:
count_for_n_features(total_per_show, 4)

While selecting which tags to keep, many were cut out because of a low "rank" score. Let's go back and keep all tags but normalize(standardize?) them. 

In [None]:
sns.boxplot(total_per_feature);

In [None]:
# the tags below each describe 5 or fewer anime. They can be dropped without losing too much.
total_per_feature[total_per_feature < 6]

## Getting Recommendations

In [4]:
def jaccard_similarity(df):
    return (1 - pairwise_distances(df, metric="jaccard"))

def get_recommended_titles(recommendations_df):
    recom_titles = []
    for idx, row in recommendations_df.iterrows():
        if type(row['english']) != float:
            recom_titles.append(row['english'])
        else:
            recom_titles.append(row['userPreferred'])
    
    return recom_titles


def get_top_n_recommendations(search_term, dataframe, similarity_matrix, n=5):
    search_result = get_close_matches(search_term, 
                                      title_df['userPreferred'], 
                                      cutoff=0.25, 
                                      n=1)
    print('Closest match: ', search_result)
    
    search_result_idx = title_df[title_df['userPreferred'].isin(search_result)].index
    dataframe.loc[search_result_idx, :]

    # get position of anime (id in index)
    positional_idx = dataframe.index.get_loc(int(search_result_idx.values))
    
    # get top n indicies. The top match will always be the initial item.
    top_n = np.argsort(similarity_matrix[positional_idx,:])[-n-1:-1]
    recom_titles = get_recommended_titles(title_df.iloc[top_n,:])
    
    return recom_titles


def view_title_similarity(search_term, recom_titles):
    for title in recom_titles:
        seq_matcher = SequenceMatcher(None, search_term, title)
        print(f'The terms "{search_term}" and "{title}" are ' 
              + '{0:.2%}'.format(seq_matcher.quick_ratio()) + ' alike.')

In [5]:
cos_sim = cosine_similarity(df.values)

jac_sim = jaccard_similarity(df.values)



<b>Recommendations should probably exclude titles from the same franchise.  
For example, try getting recommendations for Ghost in the Shell.  
If you already know you like Ghost in the Shell, it's pretty easy to find others in the same franchise if that's what you want.  </b>

First of all, our rudimentary search engine is failing...

In [None]:
search_term='Ghost in the Shell'
rec_list = get_top_n_recommendations(search_term, df, cos_sim, n=10)
rec_list

In [None]:
search_term= "Wolf's Rain"
n = 5
rec_list = get_top_n_recommendations(search_term, df, cos_sim, n=n)
print('-----')
print(f'Top {n} based on {search_term}: \n{rec_list}')

In [None]:
view_title_similarity(search_term, rec_list)

Why is a Pokemon movie showing up as a recommendation based on Wolf's Rain?

## Understanding Bad Recommendations

<tr>
    <td> <img src="https://upload.wikimedia.org/wikipedia/en/thumb/4/43/Wolf%27s_Rain_Region_2_Volume_1.jpg/220px-Wolf%27s_Rain_Region_2_Volume_1.jpg" alt="Drawing" style="width: 250px;"/> </td>
    <td> <img src="https://upload.wikimedia.org/wikipedia/en/thumb/9/95/Pok%C3%A9mon_Lucario_film_poster.jpg/220px-Pok%C3%A9mon_Lucario_film_poster.jpg" alt="Drawing" style="width: 250px;"/> </td>
    </tr>

In [None]:
wr_id = title_df[title_df['english'] == "Wolf's Rain"].index
wr_df = df.loc[wr_id,:]

In [None]:
# Genres and tags for Wolf's Rain
wr_df.loc[:, (wr_df != 0).any(axis=0)]

In [None]:
pk_id = title_df[title_df['english'] == "Pokémon: Lucario and the Mystery of Mew"].index
pk_df = df.loc[pk_id,:]

In [None]:
# Genres and tags for Pokémon: Lucario and the Mystery of Mew
pk_df.loc[:, (pk_df != 0).any(axis=0)]

Would it be wise to give a heavier weight to the "kids" column?

In [None]:
sum(df['kids'])

In [None]:
plt.hist(df['kids']);

In [None]:
plt.hist(total_per_feature, bins=50)
plt.ylabel('Number of Tags')
plt.xlabel('Total Number of Shows Per Tag');

In [None]:
for tag, count in total_per_feature.sort_values(ascending=False).iteritems():
    print(tag, count)

<b>Would like to penalize features that describe a large number of shows (i.e. comedy) and give a heavier weight to those that describe fewer shows.  
The normalization should be strong enough to not recommend a Pokemon movie when looking at "Wolf's Rain" but not so strong that all recommendations are based on the rarest tag. </b>

## Using Anilist's API search

In [15]:
def get_search_term(initial_search):
    '''Search the AniList API for a show based on a query.
    
    Returns the user preferred title 
    '''
    query = '''
    query ($search: String) {
      Media (type: ANIME, search: $search) {
        id
        title {
          romaji
          english
          native
          userPreferred
        }
      }
    }
    '''
    variables = {'search': initial_search}

    url = 'https://graphql.anilist.co'
    response = requests.post(url, 
                             json={'query': query, 
                                   'variables': variables})
    user_preferred_title = response.json()['data']['Media']['title']['userPreferred']

    return user_preferred_title

In [None]:
initial_search = 'Ghost in the Shell'

In [None]:
# using cosine similarity 
search_term = get_search_term(initial_search)
rec_list = get_top_n_recommendations(search_term, df, cos_sim, n=5)
rec_list

<b> The search is working but maybe we're returning too many from the same franchise.</b>

In [None]:
view_title_similarity('Ghost in the Shell', rec_list)

In [None]:
seq_matcher = SequenceMatcher(None, 
                              "Ghost in the Shell Arise: Alternative Architecture", 
                              "Ghost in the Shell: Stand Alone Complex 2nd GIG")
seq_matcher.quick_ratio()

This is one problem with the recommender. The recommendations are good but too many are not very useful for a user. Future work on the project would include going through and filtering out titles from the same franchise or only including one title per franchise. 

In [None]:
search_term = get_search_term(initial_search)
rec_list = get_top_n_recommendations(search_term, df, cos_sim, n=20)
rec_list

## Comparing Cosine Similarity and Jaccard

We can do Jaccard because our data is still binary

### Samurai Champloo

In [None]:
initial_search = 'Samurai Champloo'

In [None]:
search_term = get_search_term(initial_search)
rec_list = get_top_n_recommendations(search_term, df, cos_sim, n=10)
rec_list

In [None]:
search_term = get_search_term(initial_search)
rec_list = get_top_n_recommendations(search_term, df, jac_sim, n=10)
rec_list

Cosine similarity and jaccard similarity are returning similar results.

In [16]:
def view_features(search_term, df):
    _id = title_df[title_df['userPreferred'] == search_term].index
    _df = df.loc[_id,:]
    return list(_df.loc[:, (_df != 0).any(axis=0)].columns)

In [None]:
champloo_tags = view_features(get_search_term('Samurai Champloo'), df)
champloo_tags

In [None]:
# Blade of the Immortal scored higher with cosine similarity than jaccard similarity
blade_immortal_tags = view_features(get_search_term("Blade of the Immortal"), df)
blade_immortal_tags

In [None]:
# "Ranma ½: One Flew Over the Kuno's Nest" scored higher with jaccard similarity than cosine similarity
ranma_tags = view_features(get_search_term("Ranma ½: One Flew Over the Kuno's Nest"), df)
ranma_tags

Both have a "supernatural" tag while "Samurai Champloo" does not.  
"Action" and "Adventure" are present for all three.  
"Blade of the Immortal" shares "historical", "samurai", and "swordplay"  
"Ranma ½: One Flew Over the Kuno's Nest" shares "comedy", "martial_arts", and "shounen".

In [None]:
temp_categories = total_per_feature[total_per_feature.index.isin(['action', 
                                                                  'adventure', 
                                                                  'supernatural', 
                                                                  'historical', 
                                                                  'samurai', 
                                                                  'swordplay', 
                                                                  'comedy', 
                                                                  'martial_arts', 
                                                                  'shounen'])]
plt.bar(temp_categories.index, temp_categories)
plt.title('Number of Shows Per Genre/Tag');

Not sure what's going on but it appears jaccard similarity may be favoring more common features.  
To Be Continued...

## Diving into EDA

In [None]:
fig, ax = plt.subplots(2, 1, figsize=(12,8), sharex=True)
ax[0].hist(total_per_feature, bins=100)
ax[0].set_title('Number of Shows Per Genre/Tag')
ax[0].set_ylabel('Tag Count')

ax[1].boxplot(total_per_feature, vert=False)
ax[1].set_xlabel('Anime Count');

## Weighing Features

In [None]:
tag_rank_item_matrix.head()

In [6]:
df_w_rank = pd.concat([genres_item_matrix, tag_rank_item_matrix], axis=1)
df_w_rank.head()

Unnamed: 0_level_0,action,adventure,comedy,drama,ecchi,fantasy,horror,mahou_shoujo,mecha,music,...,witch,work,wrestling,writing,wuxia,yakuza,yandere,youkai,yuri,zombie
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10161,1,0,0,1,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.44,0.0,0.0,0.0
99726,0,1,1,0,0,1,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
98526,1,0,0,1,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
966,0,0,1,0,1,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4876,1,1,0,1,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [20]:
cos_sim_rank = cosine_similarity(df_w_rank.values)

MemoryError: Unable to allocate 738. MiB for an array with shape (9832, 9832) and data type float64

In [19]:
cos_sim = cosine_similarity(df.values)

MemoryError: Unable to allocate 738. MiB for an array with shape (9832, 9832) and data type float64

In [None]:
# boolean matrix
search_term= "Wolf's Rain"
rec_list = get_top_n_recommendations(search_term, df, cos_sim, n=5)
rec_list

In [None]:
# Accounting for tag relevance to title
search_term= "Wolf's Rain"
rec_list = get_top_n_recommendations(search_term, df_w_rank, cos_sim_rank, n=5)
rec_list

<b>Using rank got rid of the Pokemon recommendation! That's a good sign.</b>  
'Wan Wan Chuushingura' is still present through. Why?

In [None]:
view_features(get_search_term("Wolf's Rain"), df_w_rank)

In [None]:
view_features(get_search_term('Wan Wan Chuushingura'), df_w_rank)

All of the tags for "Wan Wan Chuushingura" are also tags for "Wolf's Rain" but there are a few missing that are very important. This goes back to a problem noticed earlier. Our data is more sparce than what is ideal. We need more tags for "Wan Wan Chuushingura" for our recommender to see that maybe this isn't a great suggestion based on Wolf's Rain.  
Would it be worth penalizing titles with fewer features?  
Or maybe we just cut out titles with fewer features? 

## PCA

In [7]:
df_w_rank.head()

Unnamed: 0_level_0,action,adventure,comedy,drama,ecchi,fantasy,horror,mahou_shoujo,mecha,music,...,witch,work,wrestling,writing,wuxia,yakuza,yandere,youkai,yuri,zombie
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10161,1,0,0,1,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.44,0.0,0.0,0.0
99726,0,1,1,0,0,1,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
98526,1,0,0,1,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
966,0,0,1,0,1,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4876,1,1,0,1,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [8]:
X = df_w_rank.to_numpy()

scaler = StandardScaler(copy=True, with_mean=True, with_std=True)
X_scaled = scaler.fit_transform(X)

pca = PCA(n_components=3)
X_pca = pca.fit_transform(X_scaled)

In [9]:
X_pca.shape

(9832, 3)

In [10]:
pca.components_.round(2)

array([[ 0.11,  0.02, -0.01,  0.11,  0.02,  0.03,  0.07,  0.03,  0.02,
        -0.06,  0.14,  0.15,  0.06,  0.06,  0.01, -0.03,  0.11,  0.14,
         0.01,  0.09,  0.03, -0.02,  0.05,  0.05,  0.03,  0.04,  0.01,
         0.07,  0.09,  0.01,  0.11,  0.04,  0.  ,  0.02,  0.16,  0.03,
         0.08,  0.01,  0.08,  0.01,  0.01,  0.04,  0.02,  0.02,  0.  ,
         0.01,  0.03,  0.  , -0.  ,  0.07,  0.  ,  0.04,  0.1 ,  0.05,
         0.01,  0.  ,  0.03,  0.07,  0.01,  0.01, -0.  ,  0.03,  0.08,
         0.01,  0.01,  0.05,  0.03,  0.01,  0.03,  0.1 ,  0.14,  0.04,
         0.01,  0.16,  0.05,  0.02,  0.04,  0.04,  0.04,  0.07, -0.01,
         0.01,  0.08,  0.04,  0.07,  0.09,  0.07,  0.  ,  0.1 ,  0.04,
         0.02,  0.08,  0.04,  0.04,  0.11,  0.02,  0.02,  0.05,  0.11,
         0.04,  0.07,  0.02,  0.07,  0.03,  0.04,  0.01, -0.  ,  0.14,
         0.02,  0.01,  0.01, -0.  ,  0.01, -0.01,  0.09,  0.09,  0.02,
         0.  ,  0.04,  0.08,  0.05,  0.06,  0.  ,  0.05,  0.1 , -0.01,
      

In [11]:
df_rank_pca = pd.DataFrame(X_pca, index=df_w_rank.index)
df_rank_pca

Unnamed: 0_level_0,0,1,2
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
10161,3.064172,-1.638542,-0.135898
99726,4.010838,4.321764,-1.877839
98526,-0.184088,-1.843840,-1.670911
966,0.115804,2.917696,0.561511
4876,2.241810,-2.477414,0.014356
...,...,...,...
99586,0.931506,5.403912,-0.445584
99916,1.312347,5.251737,-1.818466
101283,14.640444,-1.356724,8.143539
101633,0.505021,5.313240,-1.733870


In [12]:
sim_mat = cosine_similarity(df_rank_pca.values)

In [17]:
search_term = get_search_term('Space Dandy')
rec_list = get_top_n_recommendations(search_term, df_rank_pca, sim_mat, n=10)
rec_list

Closest match:  ['Space☆Dandy']


['Garo: The Animation',
 'Planet With',
 'The Seven Deadly Sins the Movie: Prisoners of the Sky',
 'Garakowa -Restore the World-',
 'Dragon Ball Super',
 'Gatchaman Crowds',
 'Utawarerumono',
 'Inuyasha - The Final Act',
 'DARLING in the FRANXX',
 'Freezing']

In [18]:
# vs the old code below
search_term = get_search_term('Space Dandy')
rec_list = get_top_n_recommendations(search_term, df_w_rank, cos_sim_rank, n=10)
rec_list

NameError: name 'cos_sim_rank' is not defined