# In this notebook I tried to build a recommendation system that would recommend similar anime based on the user's browsing history

Plan:
* Explore the data
* Check and clean the missing values
* Prepare data for clustering
* Make clusters (using MiniBatchKMeans)
* Display clusters(use t-SNE)
* Find nearest neighbors
* Test our model

Imports:

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.utils import shuffle
from sklearn.cluster import MiniBatchKMeans
from sklearn.metrics import silhouette_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
import plotly as py
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot

%matplotlib inline

init_notebook_mode(connected=True)

Read the data

In [None]:
df_anime = pd.read_csv('../input/anime-recommendations-database/anime.csv')

In [None]:
df_anime

In [None]:
df_rating = pd.read_csv('../input/anime-recommendations-database/rating.csv')

In [None]:
df_rating

Check missing values

In [None]:
# Missing values per column (isnull() is an alias of isna(), so one call is enough).
df_anime.isna().sum()

We have some missing values, so I decided to drop the affected rows. Their number is small, so we lose very little information.

In [None]:
df_anime = df_anime.dropna()

In [None]:
df_anime.isna().sum(), df_anime.isnull().sum()

Check missing values in df_rating

In [None]:
# Missing values per column (isnull() is an alias of isna(), so one call is enough).
df_rating.isna().sum()

The "episodes" column contains the string "Unknown", which cannot be used as a number. We replace it with the median number of episodes.

In [None]:
unknown_index = df_anime[df_anime['episodes']=='Unknown'].index.to_list()

In [None]:
df_anime.loc[unknown_index,'episodes'] = 0
df_anime[df_anime['episodes']==0]

In [None]:
df_anime['episodes'] = df_anime['episodes'].astype('int')

In [None]:
df_anime['episodes'].describe()

In [None]:
df_anime.loc[unknown_index,'episodes'] = df_anime['episodes'].median()

In [None]:
df_anime['episodes'].describe()

# Explore anime dataframe

Explore genre

In [None]:
# Show the anime table after cleaning.
df_anime

Number of unique anime

In [None]:
# dropna=False keeps the count identical to len(unique()).
df_anime['anime_id'].nunique(dropna=False)

Work with genre. make dummy variables with one hot code

In [None]:
# First of all, collect every distinct genre name: each row holds a comma-separated
# list, so split it, trim the whitespace and gather the pieces in a set.
all_genres = {
    piece.strip()
    for genre in df_anime['genre'].to_list()
    for piece in str(genre).split(',')
}
# An empty piece is not a genre.
all_genres.discard('')

Create the dummy variables: for every genre we add a column that is 1 if the genre appears in the anime's "genre" list and 0 otherwise.

In [None]:
# Parse each row's comma-separated genre string once into a set of exact genre names.
genre_sets = [
    {piece.strip() for piece in str(genres).split(',')}
    for genres in df_anime['genre'].to_list()
]

# Exact membership instead of a substring search: with str.find, 'Ai' also matched
# 'Shounen Ai' and 'Shounen' also matched 'Shounen Ai'.
# sorted() gives a reproducible column order (set iteration order is not stable).
for genre_name in sorted(all_genres):
    list_code = [1 if genre_name in names else 0 for names in genre_sets]
    df_anime.loc[:,'genre_%s'%genre_name] = list_code

Now that the dummy variables exist, the "genre" column is no longer needed.

In [None]:
# The one-hot columns replace the raw text column.
df_anime = df_anime.drop(columns=['genre'])

Top count genres. Make plot

In [None]:
count_genres = {genre: df_anime['genre_%s'%genre].sum() for genre in all_genres}

In [None]:
count_genres = {key: value for key, value in sorted(count_genres.items(),key=lambda x: x[1],reverse=True)}

In [None]:
x = list(count_genres.keys())[:10]
y = [count_genres[key] for key in x]

In [None]:
plt.figure(figsize=(10,10))
sns.barplot(x=x,y=y)

Top rating genres. Make plot

In [None]:

rate_genres = [(genre,df_anime[df_anime['genre_%s'%genre]==1]['rating'].mean()) for genre in all_genres]

In [None]:
rate_genres = sorted(rate_genres, key=lambda x: x[1], reverse=True)

In [None]:
len(rate_genres)

In [None]:
plt.figure(figsize=(11,10))
plt.xlabel('Genre')
plt.ylabel('Mean rating')
sns.barplot(x=list(map(lambda x: x[0],rate_genres[:10])), y=list(map(lambda x: x[1],rate_genres[:10])))

Explore type anime. dummy variables make later

In [None]:
# Number of anime per value of the 'type' column.
values = df_anime['type'].value_counts()

Make count plot

In [None]:
labels = values.index.to_list()
values = values.to_list()
values,labels

In [None]:
plt.figure(figsize=(10,10))
plt.pie(values, labels=labels,autopct='%1.1f%%')
_ = plt.legend(labels)

make rating\type plot

In [None]:
rate_type = [(type_name, df_anime[df_anime['type']==type_name]['rating'].mean()) for type_name in df_anime['type'].unique()]

In [None]:
rate_type = sorted(rate_type, key=lambda x: x[1],reverse=True)

In [None]:
plt.figure(figsize=(10,10))
plt.xlabel('type')
plt.ylabel('mean rating')
sns.barplot(x=list(map(lambda x: x[0],rate_type)), y=list(map(lambda x: x[1],rate_type)))

Explore amount of ratings

In [None]:
# Pass the data by keyword: the meaning of the first positional argument of
# violinplot differs between seaborn versions (x in old releases, data in new ones).
sns.violinplot(x=df_anime['rating'])

In [None]:
# Summary statistics of the anime ratings.
df_anime['rating'].describe()

Explore members

In [None]:
# Sort once by community size (largest first), then take both ends of the ranking.
by_members = df_anime.sort_values(by=['members'], ascending=False)
top5 = by_members[:5]
down5 = by_members[-5:]

Build count plot top5 

In [None]:
# Member counts of the five anime with the most members; names are rotated to stay readable.
fig, ax = plt.subplots(figsize=(10,10))
sns.barplot(x=top5['name'], y=top5['members'], ax=ax)
ax.tick_params(axis='x', labelrotation=90)

Make plot of rating top5

In [None]:
# Ratings of the five anime with the most members; names are rotated to stay readable.
fig, ax = plt.subplots(figsize=(10,10))
sns.barplot(x=top5['name'], y=top5['rating'], ax=ax)
ax.tick_params(axis='x', labelrotation=90)

Make plot of down5

In [None]:
# Member counts of the five anime with the fewest members; names are rotated to stay readable.
fig, ax = plt.subplots(figsize=(10,10))
sns.barplot(x=down5['name'], y=down5['members'], ax=ax)
ax.tick_params(axis='x', labelrotation=90)

Make plot of rating down5

In [None]:
# Ratings of the five anime with the fewest members; names are rotated to stay readable.
fig, ax = plt.subplots(figsize=(10,10))
sns.barplot(x=down5['name'], y=down5['rating'], ax=ax)
ax.tick_params(axis='x', labelrotation=90)

# Explore rating dataframe

In [None]:
# Show the user rating table.
df_rating

Explore user_id

In [None]:
# Number of distinct users (dropna=False keeps the count identical to len(unique())).
df_rating['user_id'].nunique(dropna=False)

Explore anime_id

In [None]:
# Number of distinct rated anime (dropna=False keeps the count identical to len(unique())).
df_rating['anime_id'].nunique(dropna=False)

Determine which anime each user liked: compute the user's mean rating, and treat every anime whose rating is at least that mean as "liked".

In [None]:
# Per-user mean rating, broadcast back onto every row of that user.
# NOTE(review): the dataset description reportedly uses rating == -1 for
# "watched but not rated"; if so, those rows pull the mean down and should
# probably be excluded first — confirm against the dataset documentation.
df_rating['mean_rating'] = df_rating.groupby('user_id')['rating'].transform('mean')
df_rating

In [None]:
a = df_rating[df_rating['rating']>=df_rating['mean_rating']].apply(lambda x: 1,axis=1)

In [None]:
index_liked = a.index.to_list()

In [None]:
df_rating_liked = df_rating.iloc[index_liked,:]

In [None]:

df_rating_liked = df_rating_liked.drop(['rating','mean_rating'], axis=1)

In [None]:
df_rating_liked

# Prepare data for clusterize

The idea is that we first clusterize df_anime according to its parameters(without anime_id). Then, for example, we take the first user and his anime which he “liked”. We build the "user centroid" according to his anime. And then we look for the nearest points through the centroids of the clusters

In [None]:
# Lookup table: anime_id -> row label of that anime in df_anime.
anime_index = dict(zip(df_anime['anime_id'], df_anime.index))

In [None]:
df_anime_clusterize = df_anime.drop(['name','anime_id'],axis=1)

In [None]:
df_anime_clusterize = pd.get_dummies(df_anime_clusterize)

Define numerical and categorical columns

In [None]:
# Continuous features (scaled later) versus the already binary indicator columns.
numeric_features = ['episodes', 'rating', 'members']
num_cols = df_anime_clusterize.loc[:, numeric_features]
cat_cols = df_anime_clusterize.drop(columns=numeric_features)

Scale numericals columns

In [None]:
scaler = StandardScaler()

In [None]:
num_cols = pd.DataFrame(scaler.fit_transform(num_cols))

In [None]:
num_cols.columns = ['episodes_scale','rating_scale','members_scale']

In [None]:
df_anime_clusterize = pd.concat([num_cols, cat_cols], axis=1, join='inner')

# Make clusters by MiniBatchKMeans

In [None]:
scores = []
# Indexed by the number of clusters. NaN (not np.empty's uninitialised memory) marks
# the slots for k=0 and k=1, which are never computed; matplotlib skips NaN points.
inertia_list = np.full(11, np.nan)

for i in range(2,11):
    print(i)
    # Fixed seed so that the curves are reproducible between runs.
    kmeans = MiniBatchKMeans(n_clusters=i, batch_size=50, random_state=42)
    kmeans.fit(df_anime_clusterize)
    inertia_list[i] = kmeans.inertia_
    scores.append(silhouette_score(df_anime_clusterize, kmeans.labels_))

In [None]:


# Inertia was only computed for 2..10 clusters; slots 0 and 1 of inertia_list were
# never filled, so they are left out of the plot.
plt.plot(range(2,11),inertia_list[2:11],'-o')
plt.xlabel('Number of cluster')
plt.axvline(x=4, color='blue', linestyle='--')
plt.ylabel('Inertia')
plt.show()



In [None]:


# Silhouette score for each candidate number of clusters; the dashed line marks k=4.
fig, ax = plt.subplots()
ax.plot(range(2, 11), scores)
ax.set_title('Results KMeans')
ax.set_xlabel('n_clusters')
ax.axvline(x=4, color='blue', linestyle='--')
ax.set_ylabel('Silhouette Score')
plt.show()



Based on these results, I decided to use 4 clusters.

In [None]:
# Fixed seed so that the cluster assignment is reproducible between runs.
kmeans =  MiniBatchKMeans(n_clusters=4,batch_size=40,random_state=42)
# Leave out a 'cluster' column from a previous run of this cell, otherwise the old
# labels would be used as a feature.
cluster_features = df_anime_clusterize.drop(columns=['cluster'], errors='ignore')
kmeans = kmeans.fit(cluster_features)
clusters = kmeans.predict(cluster_features)
df_anime_clusterize['cluster'] = clusters
df_anime_clusterize['cluster'].value_counts()

# Display clusters

Pick 4000 rows, to reduce time of a calculation

In [None]:
# Seeded sample so that the figures are reproducible; never ask for more rows than exist.
# reset_index gives the 0..n-1 index the later concat with the t-SNE output relies on,
# and (unlike a round trip through np.array) keeps the column names and dtypes.
n_plot_rows = min(4000, len(df_anime_clusterize))
plot_df = df_anime_clusterize.sample(n_plot_rows, random_state=42).reset_index(drop=True)

We pick this perplexity value because it gives good results in a reasonable calculation time.

In [None]:
# t-SNE perplexity, shared by the 2D and 3D embeddings and shown in the plot titles.
perplexity = 30

create tsne for 2d and 3d plots

In [None]:
tsne_2d = TSNE(n_components=2, perplexity=perplexity)

tsne_3d = TSNE(n_components=3, perplexity=perplexity)

In [None]:
TCs_2d = pd.DataFrame(tsne_2d.fit_transform(plot_df.drop(["cluster"], axis=1)))
TCs_3d = pd.DataFrame(tsne_3d.fit_transform(plot_df.drop(["cluster"], axis=1)))

In [None]:
TCs_2d.columns = ["TC1_2d","TC2_2d"]

TCs_3d.columns = ["TC1_3d","TC2_3d","TC3_3d"]

In [None]:
plot_df = pd.concat([plot_df,TCs_2d,TCs_3d], axis=1, join='inner')

In [None]:
plot_df["1d_y"] = 0

In [None]:
clusters = {}
for cluster_label in plot_df['cluster'].unique():
    clusters[cluster_label] = plot_df[plot_df["cluster"] == cluster_label]

2d plot

In [None]:
# One scatter trace per cluster in the 2D t-SNE space.
data = [
    go.Scatter(
        x = points["TC1_2d"],
        y = points["TC2_2d"],
        mode = "markers",
        name = "Cluster %s"%key,
        text = None,
    )
    for key, points in clusters.items()
]

title = "Visualizing Clusters in Two Dimensions Using T-SNE (perplexity=" + str(perplexity) + ")"

layout = {
    'title': title,
    'xaxis': {'title': 'TC1', 'ticklen': 5, 'zeroline': False},
    'yaxis': {'title': 'TC2', 'ticklen': 5, 'zeroline': False},
}

fig = {'data': data, 'layout': layout}

iplot(fig)

3d plot

In [None]:
# One 3D scatter trace per cluster in the 3D t-SNE space.
data = []
for key in clusters.keys():
    data.append(go.Scatter3d(
                    x = clusters[key]["TC1_3d"],
                    y = clusters[key]["TC2_3d"],
                    z = clusters[key]["TC3_3d"],
                    mode = "markers",
                    name = "Cluster %s"%key,
                    text = None))


title = "Visualizing Clusters in Three Dimensions Using T-SNE (perplexity=" + str(perplexity) + ")"

# Axes of a 3D plot are configured under `scene`; top-level xaxis/yaxis only apply to
# 2D cartesian plots, so the titles were silently ignored and the z axis had none.
layout = dict(title = title,
              scene = dict(
                  xaxis= dict(title= 'TC1',ticklen= 5,zeroline= False),
                  yaxis= dict(title= 'TC2',ticklen= 5,zeroline= False),
                  zaxis= dict(title= 'TC3',ticklen= 5,zeroline= False)
              )
             )
# No plt.figure() here: the plot is rendered by plotly, and a matplotlib figure
# would only add an empty output.
fig = dict(data = data, layout = layout)

iplot(fig)

# find the closest anime to user's liked anime

Build dict with key cluster and a list of values anime in this cluster

In [None]:
# cluster label -> list of anime_id in that cluster.
# Start with one (possibly empty) list per fitted cluster instead of a hard-coded 4.
anime_clusters = {i: [] for i in range(kmeans.n_clusters)}
# Look the ids up by row label: zipping the two frames positionally pairs the wrong
# rows as soon as they differ in length or order.
clustered_ids = df_anime.loc[df_anime_clusterize.index, 'anime_id']
for anime_id, c_pred in zip(clustered_ids, df_anime_clusterize['cluster']):
    anime_clusters.setdefault(c_pred, []).append(anime_id)

Function that find mean vector "user centroid" of their view history

In [None]:
def find_user_centroid(data):
    """Return the mean feature vector of the rows in the most frequent cluster.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a 'cluster' column. The 'user_id' and 'anime_id' columns
        are optional; every other column is treated as a feature.

    Returns
    -------
    pandas.DataFrame
        A single row (index 0) with one column per feature, holding the mean
        over the rows whose cluster equals the most frequent cluster in `data`
        (the smallest label wins a tie). If `data` has no rows, or no cluster
        value at all, the row is filled with NaN instead of raising.
    """
    cluster_modes = data['cluster'].mode()
    if cluster_modes.empty:
        # No rows (or only missing labels): there is no dominant cluster to select.
        dominant = data.iloc[0:0]
    else:
        dominant = data[data['cluster'] == cluster_modes.iloc[0]]

    # Identifier and label columns are not part of the feature space.
    features = dominant.drop(['user_id', 'anime_id', 'cluster'], axis=1, errors='ignore')

    if len(features) == 0:
        return pd.DataFrame([[float('nan')] * features.shape[1]], columns=features.columns)
    return pd.DataFrame(features.mean(axis=0)).T

Create the experiment data. We take only the first 100k rows of df_rating_liked so that the calculation takes less time.

Structure of the experiment data: we group the data by user, use 80% of each user's liked anime to build the recommendations, and use the remaining 20% to check how close our recommendations are to the true values (anime that the user actually liked).

In [None]:
# Work on the first 100000 liked ratings, grouped per user.
data = df_rating_liked.iloc[:100000]
grouped = data.groupby(by='user_id')

In [None]:
# Column-wise accumulators for the per-user train and test splits.
train_data = dict(user_id=[], anime_id=[])
test_data = dict(user_id=[], anime_id=[])
for user, user_rows in grouped:
    # A user with a single liked anime cannot be split.
    if len(user_rows) <= 1:
        continue

    train, test = train_test_split(user_rows['anime_id'], test_size=0.2, random_state=42)

    train_data['user_id'].extend([user] * len(train))
    train_data['anime_id'].extend(train)

    test_data['user_id'].extend([user] * len(test))
    test_data['anime_id'].extend(test)
    
    

In [None]:
len(train_data['user_id']),len(test_data['user_id'])

In [None]:
df_train = pd.DataFrame(train_data)
df_test = pd.DataFrame(test_data)


In [None]:
# Attach each liked anime's features. A plain index join would pair the i-th training
# row with whatever anime has row label i, so the anime_id is first translated into
# its row label (anime_index) and the join is done on that label.
df_train = (
    df_train
    .assign(_anime_row=df_train['anime_id'].map(anime_index))
    .dropna(subset=['_anime_row'])        # anime that were removed during cleaning
    .astype({'_anime_row': 'int64'})
    .join(df_anime_clusterize, on='_anime_row', how='inner')
    .drop(columns='_anime_row')
)

In [None]:
# One row per user: the centroid of the user's dominant cluster plus that cluster's label.
centroid_columns = ['user_id']+list(df_anime_clusterize.columns)

# Collect the rows and concatenate once: DataFrame.append was removed in pandas 2.0
# and re-copied the whole frame on every iteration.
centroid_rows = []
for name,group in df_train.groupby('user_id'):
    user_centroid = find_user_centroid(group)
    user_centroid['user_id'] = name
    user_centroid['cluster'] = group['cluster'].mode()[0]
    centroid_rows.append(user_centroid)

if centroid_rows:
    train_centroids = pd.concat(centroid_rows, ignore_index=True).reindex(columns=centroid_columns)
else:
    train_centroids = pd.DataFrame(columns=centroid_columns)
train_centroids

Once we have the user centroids, the next step is to find the nearest anime. For each user we compute the distance from the user centroid to every anime in the user's dominant cluster, sort the distances in ascending order, and keep the 10 closest anime.

In [None]:
# user_id -> list of (anime_id, distance) for the 10 anime closest to the user centroid.
feature_columns = [column for column in df_anime_clusterize.columns if column != 'cluster']

result = {}
for user_id in train_centroids['user_id'][:10]:
    print('User id %s'%user_id)
    user = train_centroids[train_centroids['user_id']==user_id]
    # Select the features by name so that their order always matches the anime matrix.
    user_vector = user[feature_columns].iloc[0].to_numpy(dtype=float)

    # Candidates are all the anime of the user's dominant cluster.
    candidate_ids = anime_clusters[user['cluster'].iloc[0]]
    candidate_rows = [anime_index[anime_id] for anime_id in candidate_ids]
    candidate_points = df_anime_clusterize.loc[candidate_rows, feature_columns].to_numpy(dtype=float)

    # Euclidean distance to every candidate in one vectorised call instead of one
    # .loc lookup and one DataFrame subtraction per anime.
    distances = np.linalg.norm(candidate_points - user_vector, axis=1)

    result[user_id] = sorted(zip(candidate_ids, distances), key=lambda pair: pair[1])[:10]

When we get our recommendation let's check it

In [None]:
test_data = pd.DataFrame(test_data)

In [None]:
error_recom = {}
for user_id in list(result.keys())[:10]:
    test_centroid = find_user_centroid(test_data[test_data['user_id']==user_id].join(df_anime_clusterize,how='inner'))
    index = list(map(lambda x: anime_index[x[0]],result[user_id]))
    result_centroid = find_user_centroid(df_anime_clusterize.iloc[index,:])
    error_recom[user_id]  = np.linalg.norm(test_centroid-result_centroid)

In [None]:
error_recom