# Anime Planet Recommender System - Content Based & Collaborative Filtering

This notebook uses a classic method (KNN) to recommend anime using 2 different approaches:

- Content Based
- Collaborative Filtering

In the content-based approach, I explore 7 ways to recommend a list of anime based on 1 given anime:

1. Using only the metadata.
2. Using one-hot encoding to embed the tags.
3. Using TF-IDF to embed the tags.
4. Concatenating options 1, 2 and 3.
5. Applying PCA to generate a reduced vector of option 4.
6. Using TF-IDF to embed the synopsis.
7. Applying PCA to generate a reduced vector of option 6.


This notebook is based on the following work: https://www.kaggle.com/benroshan/content-collaborative-anime-recommendation

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelBinarizer, MultiLabelBinarizer, MinMaxScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors
from scipy.sparse import csr_matrix
from sklearn.decomposition import PCA

# Show full cell contents when displaying DataFrames (None disables truncation).
# The fully-qualified option name is used on purpose: pandas resolves abbreviated
# names by pattern matching, which breaks as soon as another option matches too.
pd.set_option("display.max_colwidth", None)

## Content Based

In [None]:
# Load the anime catalogue.
anime_data = pd.read_csv(
    "../input/animeplanet-recommendation-database-2020/anime.csv"
)
print("anime_data.shape:", anime_data.shape)

# Set the synopsis column aside and remove it from the working frame.
Synopsis = anime_data.pop("Synopsis")
anime_data.head(1)

In [None]:
def process_multilabel(series):
    """Split one comma-separated cell into a list of clean labels.

    Despite the parameter name, ``series`` is a single cell value: the
    function is applied element-wise with ``Series.map``.

    Args:
        series: Comma-separated labels, e.g. ``"Action, Comedy"``. Any
            non-string value (such as the NaN pandas uses for a missing
            cell) is treated as "no labels".

    Returns:
        list[str]: The labels in their original order, with surrounding
        whitespace removed and without empty entries or the ``"Unknown"``
        placeholder.
    """
    if not isinstance(series, str):
        return []
    # Strip before comparing so "A, B" and "A,B" produce the same labels and
    # " Unknown" is recognised as the placeholder too.
    labels = (label.strip() for label in series.split(","))
    return [label for label in labels if label and label != "Unknown"]

# Multi-valued text columns become lists of labels.
for multilabel_column in ("Tags", "Content Warning", "Studios"):
    anime_data[multilabel_column] = anime_data[multilabel_column].map(process_multilabel)

# "Unknown" is replaced by 0 so that the columns can be cast to numbers.
anime_data["Rating Score"] = anime_data["Rating Score"].replace("Unknown", 0).astype(float)
for integer_column in (
    "Duration",
    "Finished",
    "Episodes",
    "Number Votes",
    "EndYear",
    "StartYear",
):
    anime_data[integer_column] = anime_data[integer_column].replace("Unknown", 0).astype(int)

anime_data.head(1)

In [None]:
def preprocessing_category(df, column, is_multilabel=False):
    """Replace a categorical column with one indicator column per class.

    Args:
        df: DataFrame holding ``column``. The column is deleted from ``df``
            in place, so pass a copy if the original must stay intact.
        column: Name of the column to encode.
        is_multilabel: ``True`` when every cell holds a collection of labels
            (encoded with ``MultiLabelBinarizer``); ``False`` when every cell
            holds a single label (encoded with ``LabelBinarizer``).

    Returns:
        DataFrame: ``df`` without ``column``, followed by one indicator
        column per class, each named after its class.
    """
    encoder = MultiLabelBinarizer() if is_multilabel else LabelBinarizer()
    encoded = encoder.fit_transform(df[column])
    classes = encoder.classes_

    # LabelBinarizer collapses a two-class column into a single indicator for
    # classes[1]; rebuild the classes[0] indicator so the data has one column
    # per class name (otherwise the DataFrame constructor below raises).
    if not is_multilabel and len(classes) == 2 and encoded.shape[1] == 1:
        encoded = np.hstack([1 - encoded, encoded])

    # Reuse df's index: concat aligns on index labels, so a fresh 0..n-1 index
    # would misalign the rows whenever df is indexed differently.
    category_df = pd.DataFrame(encoded, columns=classes, index=df.index)
    del df[column]
    return pd.concat([df, category_df], axis=1)

# Work on a copy so that anime_data keeps all of its columns.
anime_metadata = anime_data.copy()

# One-hot encode the categorical columns: (column name, cells hold lists?).
for category_column, holds_lists in (
    ("Type", False),
    ("Season", False),
    ("Studios", True),
    ("Content Warning", True),
):
    anime_metadata = preprocessing_category(
        anime_metadata, category_column, is_multilabel=holds_lists
    )

# Keep the tags and the identifying columns aside before dropping them.
Genders = anime_metadata["Tags"]
ID_NAME = anime_metadata[["Anime-PlanetID", "Name", "Alternative Name"]]

# Tags, names, ids and URLs are not part of the metadata feature vector.
anime_metadata = anime_metadata.drop(
    columns=["Tags", "Name", "Alternative Name", "Anime-PlanetID", "Url"]
)

# Rescale the numeric columns with MinMaxScaler, then keep only the raw array.
numeric_columns = ["Rating Score", "Number Votes", "Episodes", "Duration", "StartYear", "EndYear"]
anime_metadata[numeric_columns] = MinMaxScaler().fit_transform(
    anime_metadata[numeric_columns]
)
anime_metadata = anime_metadata.values

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Word-level TF-IDF over 1- to 3-grams; terms must occur in at least 3 documents
# and English stop words are dropped.
tfv = TfidfVectorizer(
    min_df=3,
    max_features=None,
    strip_accents="unicode",
    analyzer="word",
    token_pattern=r"\w{1,}",
    ngram_range=(1, 3),
    stop_words="english",
)

# After the preprocessing cell "Tags" holds lists of labels, so astype(str)
# yields their printed form; the token pattern then keeps only the words.
genres_original = anime_data["Tags"].fillna("").astype(str)
genres_vector_tf_idf = tfv.fit_transform(genres_original)

# One-hot encoding of the same tags: one column per distinct tag.
genres_vector_one_hot = preprocessing_category(
    pd.DataFrame(Genders), "Tags", is_multilabel=True
).values

In [None]:
# Report the size of each feature matrix built so far.
for _label, _matrix in (
    ("anime_metadata", anime_metadata),
    ("genres_vector_tf_idf", genres_vector_tf_idf),
    ("genres_vector_one_hot", genres_vector_one_hot),
):
    print(f"{_label}.shape:", _matrix.shape)

## Recommend with KNN

In [None]:
def get_recommended(vector, query_index, n_neighbors=10, catalog=None):
    """Return the anime closest to one query anime under cosine distance.

    Args:
        vector: Feature matrix (dense array or scipy sparse matrix) with one
            row per anime; row ``i`` must describe ``catalog.iloc[i]``.
        query_index: Row position of the query anime in ``vector``.
        n_neighbors: Number of recommendations to return.
        catalog: DataFrame in which the neighbour positions are looked up.
            Defaults to the module-level ``anime_data``.

    Returns:
        DataFrame: Up to ``n_neighbors`` rows of ``catalog``, nearest first,
        never containing the query row itself.
    """
    if catalog is None:
        catalog = anime_data

    features = csr_matrix(vector)
    # The query is normally returned as its own nearest neighbour, so ask for
    # one extra hit (capped at the number of rows) and drop the query
    # afterwards; otherwise only n_neighbors - 1 recommendations come back.
    k = min(n_neighbors + 1, features.shape[0])
    model_knn = NearestNeighbors(metric='cosine', n_neighbors=k)
    model_knn.fit(features)

    _, indices = model_knn.kneighbors(features[query_index], n_neighbors=k)
    neighbours = [int(i) for i in indices.ravel() if i != query_index]
    return catalog.iloc[neighbours[:n_neighbors]]

In [None]:
# Pick the anime to query by its Anime-Planet id.
# Random alternative: query_index = np.random.choice(anime_metadata.shape[0])
query_index = ID_NAME.loc[ID_NAME["Anime-PlanetID"] == 7639].index[0]

# Show the queried anime without its URL column.
aux = anime_data.iloc[[query_index]].drop(columns=["Url"])
aux

In [None]:
# Option 1: recommend using only the metadata features.
get_recommended(anime_metadata, query_index, n_neighbors=10)

In [None]:
# Option 3: recommend using the TF-IDF embedding of the tags.
get_recommended(genres_vector_tf_idf, query_index, n_neighbors=10)

In [None]:
# Option 2: recommend using the one-hot embedding of the tags.
get_recommended(genres_vector_one_hot, query_index, n_neighbors=10)

In [None]:
# Option 4: place metadata, TF-IDF tag vectors and one-hot tag vectors side by side.
# toarray() is used instead of todense(): todense() returns an np.matrix, which
# can turn the concatenated result into an np.matrix as well, and recent
# scikit-learn estimators (such as PCA) refuse np.matrix input.
all_data = np.concatenate(
    (anime_metadata, genres_vector_tf_idf.toarray(), genres_vector_one_hot),
    axis=1,
)
all_data.shape

In [None]:
# Option 4: recommend using the concatenated feature matrix.
get_recommended(all_data, query_index, n_neighbors=10)

In [None]:
%%time

# Option 5: reduce the concatenated features to 250 principal components.
# np.asarray guarantees PCA receives a plain ndarray even if all_data was built
# as an np.matrix, which recent scikit-learn versions reject.
reduced_all_data = PCA(n_components=250).fit_transform(np.asarray(all_data))
get_recommended(reduced_all_data, query_index, 10)

### Use Synopsis and TF-IDF

In [None]:
# Reload the catalogue with only the columns needed for the synopsis experiment.
usecols = ["Anime-PlanetID", "Name", "Tags", "Synopsis"]
anime_data_2 = pd.read_csv(
    "../input/animeplanet-recommendation-database-2020/anime.csv",
    usecols=usecols,
)
anime_data_2.head()

In [None]:
# Locate the same query anime (Anime-Planet id 7639) in the reloaded frame and show it.
query_index_2 = anime_data_2.loc[anime_data_2["Anime-PlanetID"] == 7639].index[0]
anime_data_2.iloc[[query_index_2]]

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Word-level TF-IDF over 1- to 3-grams; terms must occur in at least 3 documents
# and English stop words are dropped.
tfv = TfidfVectorizer(
    min_df=3,
    max_features=None,
    strip_accents="unicode",
    analyzer="word",
    token_pattern=r"\w{1,}",
    ngram_range=(1, 3),
    stop_words="english",
)

# A missing synopsis becomes an empty document.
synopsis_original = anime_data_2["Synopsis"].fillna("").astype(str)
synopsis_vector_tf_idf = tfv.fit_transform(synopsis_original)
synopsis_vector_tf_idf.shape

In [None]:
def get_recommended_another_df(vector, query_index, n_neighbors=10, catalog=None):
    """Return the anime closest to one query anime under cosine distance.

    Args:
        vector: Feature matrix (dense array or scipy sparse matrix) with one
            row per anime; row ``i`` must describe ``catalog.iloc[i]``.
        query_index: Row position of the query anime in ``vector``.
        n_neighbors: Number of recommendations to return.
        catalog: DataFrame in which the neighbour positions are looked up.
            Defaults to the module-level ``anime_data_2``.

    Returns:
        DataFrame: Up to ``n_neighbors`` rows of ``catalog``, nearest first,
        never containing the query row itself.
    """
    if catalog is None:
        catalog = anime_data_2

    features = csr_matrix(vector)
    # The query is normally returned as its own nearest neighbour, so ask for
    # one extra hit (capped at the number of rows) and drop the query
    # afterwards; otherwise only n_neighbors - 1 recommendations come back.
    k = min(n_neighbors + 1, features.shape[0])
    model_knn = NearestNeighbors(metric='cosine', n_neighbors=k)
    model_knn.fit(features)

    _, indices = model_knn.kneighbors(features[query_index], n_neighbors=k)
    neighbours = [int(i) for i in indices.ravel() if i != query_index]
    return catalog.iloc[neighbours[:n_neighbors]]

In [None]:
# Option 6: recommend using the TF-IDF embedding of the synopsis.
get_recommended_another_df(synopsis_vector_tf_idf, query_index_2, n_neighbors=10)

In [None]:
%%time

# Option 7: reduce the synopsis TF-IDF vectors to 250 principal components.
# toarray() returns a plain ndarray; todense() returns an np.matrix, which
# recent scikit-learn versions reject as estimator input.
reduced_all_data = PCA(n_components=250).fit_transform(synopsis_vector_tf_idf.toarray())
get_recommended_another_df(reduced_all_data, query_index_2, 10)

## Collaborative Filtering

In [None]:
# Load the user ratings.
rating_data = pd.read_csv('../input/animeplanet-recommendation-database-2020/rating_complete.csv')

print("rating_data.shape:", rating_data.shape)
# DataFrame.info() prints its summary itself and returns None, so wrapping it in
# print() only added a stray "None" line to the output.
rating_data.info()
rating_data.head()

In [None]:
# Column index of every user and (kept for reference) first-appearance index of
# every rated anime in the ratings file.
unique_users = {int(x): i for i, x in enumerate(rating_data.user_id.unique())}
unique_items = {int(x): i for i, x in enumerate(rating_data.anime_id.unique())}

print(len(unique_items), len(unique_users))

# Row i of the matrix must describe anime_data.iloc[i]: get_recommended()
# resolves neighbour positions with anime_data.iloc and is queried with a
# catalogue row position. Ordering rows by first appearance in the ratings
# file (unique_items) would make it display unrelated anime.
# NOTE(review): assumes rating_data.anime_id uses the same ids as
# anime_data["Anime-PlanetID"] - confirm against the dataset description.
anime_rows = {int(pid): row for row, pid in enumerate(anime_data["Anime-PlanetID"])}

# As in the original loop, the last rating wins for a repeated (user, anime)
# pair; csr_matrix would otherwise sum duplicates.
ratings = rating_data.drop_duplicates(subset=["user_id", "anime_id"], keep="last")
row_idx = ratings["anime_id"].astype(int).map(anime_rows)
col_idx = ratings["user_id"].astype(int).map(unique_users)

# Ratings for anime that are missing from the catalogue cannot be placed.
known = row_idx.notna().to_numpy()
if not known.all():
    print("ratings skipped (anime_id not in anime.csv):", int((~known).sum()))

# Built directly as a sparse matrix: a dense items x users array is mostly
# zeros, and get_recommended() converts its input to CSR anyway.
# The rating is taken from the third column, as the original loop did.
anime_collabolative_filter = csr_matrix(
    (
        ratings.iloc[:, 2].to_numpy(dtype=float)[known],
        (
            row_idx.to_numpy()[known].astype(int),
            col_idx.to_numpy()[known].astype(int),
        ),
    ),
    shape=(len(anime_data), len(unique_users)),
)

In [None]:
# Item-based collaborative filtering: neighbours of the query anime in rating space.
# NOTE(review): get_recommended() looks neighbours up with anime_data.iloc and
# query_index_2 is a row position in the anime catalogue, so this call is only
# meaningful if row i of anime_collabolative_filter describes catalogue row i -
# confirm against the cell that builds the matrix.
get_recommended(anime_collabolative_filter, query_index_2, n_neighbors=10)