# import python packages


In [1]:
import collections
import itertools
import math
import operator
import random

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy.sparse import csr_matrix

try:
    from sklearn.metrics import jaccard_similarity_score
except ImportError:
    # jaccard_similarity_score was removed in scikit-learn 0.23; jaccard_score is
    # its documented replacement. NOTE: the two are not numerically identical
    # (jaccard_score needs an explicit `average=` for multiclass input), so
    # review any call site before relying on this alias.
    from sklearn.metrics import jaccard_score as jaccard_similarity_score

# import datasets as dataframes:
- There are two datasets:
  - anime.csv (Contains overall anime ratings and other details)
  - rating.csv (Contains user-wise rating for each anime)

In [2]:
# Single place to repoint the notebook at another copy of the data
# (the value below is the original Google Drive location).
DATA_DIR = "/content/drive/MyDrive/Folder0/PoML/CaseStudy"

# anime_df: one row per anime; rating_df: one row per (user, anime) rating.
anime_df = pd.read_csv(f"{DATA_DIR}/anime.csv")
rating_df = pd.read_csv(f"{DATA_DIR}/rating.csv")

# preprocessing of datasets

## getting to know the datasets

In [3]:
anime_df.shape

(12294, 7)

In [4]:
rating_df.shape

(7813737, 3)

In [5]:
anime_df.head(10)

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,793665
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.25,114262
3,9253,Steins;Gate,"Sci-Fi, Thriller",TV,24,9.17,673572
4,9969,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.16,151266
5,32935,Haikyuu!!: Karasuno Koukou VS Shiratorizawa Ga...,"Comedy, Drama, School, Shounen, Sports",TV,10,9.15,93351
6,11061,Hunter x Hunter (2011),"Action, Adventure, Shounen, Super Power",TV,148,9.13,425855
7,820,Ginga Eiyuu Densetsu,"Drama, Military, Sci-Fi, Space",OVA,110,9.11,80679
8,15335,Gintama Movie: Kanketsu-hen - Yorozuya yo Eien...,"Action, Comedy, Historical, Parody, Samurai, S...",Movie,1,9.1,72534
9,15417,Gintama&#039;: Enchousen,"Action, Comedy, Historical, Parody, Samurai, S...",TV,13,9.11,81109


- Description:
(anime_df)
  - anime_id - myanimelist.net's unique id identifying an anime.
  - name - full name of anime.
  - genre - comma separated list of genres for this anime.
  - type - movie, TV, OVA, etc.
  - episodes - how many episodes in this show. (1 if movie).
  - rating - average rating out of 10 for this anime.
  - members - number of community members that are in this anime's "group".


In [6]:
rating_df

Unnamed: 0,user_id,anime_id,rating
0,1,20,-1
1,1,24,-1
2,1,79,-1
3,1,226,-1
4,1,241,-1
...,...,...,...
7813732,73515,16512,7
7813733,73515,17187,9
7813734,73515,22145,10
7813735,73516,790,9


- Description:(rating_df)
  - user_id - non-identifiable, randomly generated user id.
  - anime_id - the anime that this user has rated.
  - rating - rating out of 10 this user has assigned (-1 if the user watched it but didn't assign a rating).

## Preprocessing rating_df dataframe

- This dataset is very large, and each user applies their own criteria when rating. On the whole, though, a user tends to watch similar anime and to give them similar ratings, with a few exceptions.
- For this reason, it is better to compare each rating with that user's mean rating rather than to take each rating at face value.

In [7]:
rating_df.loc[rating_df.rating== -1]

Unnamed: 0,user_id,anime_id,rating
0,1,20,-1
1,1,24,-1
2,1,79,-1
3,1,226,-1
4,1,241,-1
...,...,...,...
7813628,73515,2385,-1
7813629,73515,2386,-1
7813631,73515,2490,-1
7813635,73515,2680,-1


- Remove all rows with rating = -1: the user watched the anime but did not rate it, so these rows carry no rating information.

In [8]:
# Keep only the rows that carry a real score; -1 marks "watched but not rated".
rating_df = rating_df[rating_df.rating != -1]

In [9]:
rating_df.loc[rating_df.rating== -1]

Unnamed: 0,user_id,anime_id,rating


In [10]:
rating_df

Unnamed: 0,user_id,anime_id,rating
47,1,8074,10
81,1,11617,10
83,1,11757,10
101,1,15451,10
153,2,11771,10
...,...,...,...
7813732,73515,16512,7
7813733,73515,17187,9
7813734,73515,22145,10
7813735,73516,790,9


In [11]:
# Per-user averages of the remaining columns, with user_id kept as a regular
# column; the 'rating' average is the per-user mean rating.
mean_rating = rating_df.groupby('user_id', as_index=False).mean()


In [12]:
mean_rating

Unnamed: 0,user_id,anime_id,rating
0,1,11724.750000,10.000000
1,2,11771.000000,10.000000
2,3,12982.380435,7.565217
3,5,10940.472767,4.355120
4,7,15824.320700,7.387755
...,...,...,...
69595,73512,405.916667,8.583333
69596,73513,1361.575758,7.515152
69597,73514,512.000000,10.000000
69598,73515,4774.351955,8.547486


In [13]:
# Expose the per-user average under its own column name, 'mean_rating'.
mean_rating = mean_rating.assign(mean_rating=mean_rating['rating'])

In [14]:
mean_rating

Unnamed: 0,user_id,anime_id,rating,mean_rating
0,1,11724.750000,10.000000,10.000000
1,2,11771.000000,10.000000,10.000000
2,3,12982.380435,7.565217,7.565217
3,5,10940.472767,4.355120,4.355120
4,7,15824.320700,7.387755,7.387755
...,...,...,...,...
69595,73512,405.916667,8.583333,8.583333
69596,73513,1361.575758,7.515152,7.515152
69597,73514,512.000000,10.000000,10.000000
69598,73515,4774.351955,8.547486,8.547486


In [15]:
# Keep just user_id and mean_rating: the averaged anime_id is meaningless and
# 'rating' now duplicates 'mean_rating'.
mean_rating = mean_rating.drop(columns=['anime_id', 'rating'])

In [16]:
mean_rating

Unnamed: 0,user_id,mean_rating
0,1,10.000000
1,2,10.000000
2,3,7.565217
3,5,4.355120
4,7,7.387755
...,...,...
69595,73512,8.583333
69596,73513,7.515152
69597,73514,10.000000
69598,73515,8.547486


In [17]:
# Attach each user's average rating to every one of that user's rating rows.
# One join key is sufficient, and validate= makes pandas raise if mean_rating
# ever holds the same user twice (which would silently duplicate rating rows).
rating_df = pd.merge(
    rating_df,
    mean_rating,
    on='user_id',
    how='inner',
    validate='many_to_one',
)

In [18]:
rating_df

Unnamed: 0,user_id,anime_id,rating,mean_rating
0,1,8074,10,10.000000
1,1,11617,10,10.000000
2,1,11757,10,10.000000
3,1,15451,10,10.000000
4,2,11771,10,10.000000
...,...,...,...,...
6337236,73515,16512,7,8.547486
6337237,73515,17187,9,8.547486
6337238,73515,22145,10,8.547486
6337239,73516,790,9,9.000000


- Here we treat every anime that a user rated at or above his/her own average rating as one of that user's favourites

In [19]:
# Keep a user's favourites only: discard every row whose rating is below
# that user's mean rating.
is_below_user_mean = rating_df['rating'] < rating_df['mean_rating']
rating_df = rating_df.loc[~is_below_user_mean]

In [20]:
rating_df

Unnamed: 0,user_id,anime_id,rating,mean_rating
0,1,8074,10,10.000000
1,1,11617,10,10.000000
2,1,11757,10,10.000000
3,1,15451,10,10.000000
4,2,11771,10,10.000000
...,...,...,...,...
6337232,73515,13331,9,8.547486
6337237,73515,17187,9,8.547486
6337238,73515,22145,10,8.547486
6337239,73516,790,9,9.000000


- Computing the number of times each anime is rated (only the ratings kept above are counted)

In [21]:
# Number of retained rating rows per anime, as a Series named 'times_rated'.
times_rated = (
    rating_df
    .groupby('anime_id')['rating']
    .count()
    .rename('times_rated')
)

In [22]:
times_rated.describe()

count     8573.000000
mean       394.384346
std       1263.827147
min          1.000000
25%          5.000000
50%         28.000000
75%        202.000000
max      27109.000000
Name: times_rated, dtype: float64

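- We can see that an anime is rated around 400 times on average (mean ≈ 394), although the median is only 28, so the counts are heavily skewed towards a small number of very popular titles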

In [23]:
# Add the per-anime count to every rating row, matched on anime_id.
rating_df = pd.merge(rating_df, times_rated, how='inner', on='anime_id')

In [24]:
rating_df

Unnamed: 0,user_id,anime_id,rating,mean_rating,times_rated
0,1,8074,10,10.000000,8800
1,17,8074,7,6.901720,8800
2,27,8074,9,8.569620,8800
3,40,8074,9,8.870370,8800
4,41,8074,10,8.670588,8800
...,...,...,...,...,...
3381052,73188,9777,7,6.998418,1
3381053,73188,11511,7,6.998418,1
3381054,73188,17209,8,6.998418,1
3381055,73188,25291,7,6.998418,1


In [25]:
# Distinct per-anime counts, as a NumPy array in first-seen order.
rate_list = pd.unique(times_rated)

In [26]:
rate_list

array([11108,  4431,  6798, ...,  1858,  1106,   972])

In [27]:
rate_list.mean()

1805.3611842105263

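- From the above, we keep only the most frequently rated anime: those whose `times_rated` is greater than the mean of the distinct `times_rated` values,  
i.e. `times_rated > mean(unique(times_rated))` ≈ 1805. Note that this cut-off lies far above the 75th percentile (202) shown earlier, so it keeps far fewer than half of the titles, despite the `top50` suffix in the variable name.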

In [28]:
# Popularity cut-off: the mean of the distinct per-anime counts (about 1805 on
# this data). It is read from rate_list rather than typed in as a literal, so
# it follows the data if the inputs or the earlier filters change.
min_times_rated = rate_list.mean()
rating_df_top50 = rating_df[rating_df['times_rated'] > min_times_rated]

In [29]:
rating_df.shape

(3381057, 5)

In [30]:
rating_df_top50.shape

(2072252, 5)

In [31]:
rating_df_top50

Unnamed: 0,user_id,anime_id,rating,mean_rating,times_rated
0,1,8074,10,10.000000,8800
1,17,8074,7,6.901720,8800
2,27,8074,9,8.569620,8800
3,40,8074,9,8.870370,8800
4,41,8074,10,8.670588,8800
...,...,...,...,...,...
2971973,73406,79,8,7.872727,2566
2971974,73417,79,8,7.144254,2566
2971975,73440,79,10,7.920000,2566
2971976,73443,79,8,7.921348,2566


## Merging the two datasets anime_df and rating_df

In [32]:
# anime_df has its own 'rating' column (the anime's overall average), so the
# per-user score is renamed to 'user_rating' to keep the two apart once the
# tables are merged.
rating_df_top50 = rating_df_top50.rename(columns={"rating": "user_rating"})


In [33]:
rating_df_top50

Unnamed: 0,user_id,anime_id,user_rating,mean_rating,times_rated
0,1,8074,10,10.000000,8800
1,17,8074,7,6.901720,8800
2,27,8074,9,8.569620,8800
3,40,8074,9,8.870370,8800
4,41,8074,10,8.670588,8800
...,...,...,...,...,...
2971973,73406,79,8,7.872727,2566
2971974,73417,79,8,7.144254,2566
2971975,73440,79,10,7.920000,2566
2971976,73443,79,8,7.921348,2566


In [34]:
# Attach the anime details to every retained rating row.
# Inner join on the single shared key: ratings for anime that are missing
# from anime_df are dropped, as are anime without any retained rating.
df = pd.merge(anime_df, rating_df_top50, on="anime_id", how="inner")


In [35]:
df

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members,user_id,user_rating,mean_rating,times_rated
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630,152,10,7.760563,1858
1,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630,244,10,8.800000,1858
2,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630,271,10,7.372287,1858
3,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630,322,10,8.356322,1858
4,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630,398,10,8.000000,1858
...,...,...,...,...,...,...,...,...,...,...,...
2072247,2476,School Days,"Drama, Harem, Romance, School",TV,12,6.17,279183,73395,9,8.266667,3082
2072248,2476,School Days,"Drama, Harem, Romance, School",TV,12,6.17,279183,73402,10,6.640000,3082
2072249,2476,School Days,"Drama, Harem, Romance, School",TV,12,6.17,279183,73417,8,7.144254,3082
2072250,2476,School Days,"Drama, Harem, Romance, School",TV,12,6.17,279183,73440,8,7.920000,3082


## Check for null values 

In [36]:
df.isnull().sum()

anime_id       0
name           0
genre          0
type           0
episodes       0
rating         0
members        0
user_id        0
user_rating    0
mean_rating    0
times_rated    0
dtype: int64

## More details on the merged dataset


In [37]:
# One line per column: the column name followed by its number of distinct values.
for column_name, distinct_count in df.nunique().items():
  print(column_name, distinct_count)

anime_id 475
name 475
genre 360
type 5
episodes 50
rating 173
members 474
user_id 68127
user_rating 10
mean_rating 19627
times_rated 457


In [38]:
df.type.unique()

array(['Movie', 'TV', 'OVA', 'Special', 'ONA'], dtype=object)

In [39]:
# Every distinct genre label seen so far, in first-seen order.
olg = []


def func(x):
  """Record the genres named in one comma-separated genre string.

  Each label in ``x`` is stripped of surrounding whitespace and appended to
  the module-level list ``olg`` unless it is already present, so ``olg``
  keeps first-seen order without duplicates.

  Parameters
  ----------
  x : str
      Comma-separated genre labels, e.g. ``"Drama, Romance"``. Values that
      are not strings (such as the float NaN pandas uses for a missing
      genre) are ignored instead of raising.

  Returns
  -------
  None
      The function works purely by mutating ``olg``.
  """
  if not isinstance(x, str):
    return
  for label in (part.strip() for part in x.split(',')):
    # Skip blanks produced by empty input or stray commas ("A, , B").
    if label and label not in olg:
      olg.append(label)

In [40]:
# Only the distinct genre strings need to be parsed, not every rating row.
# unique() returns values in order of first appearance, so olg ends up in the
# same order as a row-by-row scan would produce.
for genre_string in df['genre'].unique():
  func(genre_string)

In [41]:
olg

['Drama',
 'Romance',
 'School',
 'Supernatural',
 'Action',
 'Adventure',
 'Fantasy',
 'Magic',
 'Military',
 'Shounen',
 'Sci-Fi',
 'Thriller',
 'Comedy',
 'Historical',
 'Parody',
 'Samurai',
 'Super Power',
 'Slice of Life',
 'Mecha',
 'Sports',
 'Music',
 'Martial Arts',
 'Space',
 'Seinen',
 'Mystery',
 'Vampire',
 'Shoujo',
 'Horror',
 'Police',
 'Psychological',
 'Ecchi',
 'Josei',
 'Game',
 'Demons',
 'Dementia',
 'Harem',
 'Cars',
 'Shounen Ai',
 'Kids',
 'Shoujo Ai']

## Getting the dataset ready for modelling( Recommendation based on user_rating )

- Pivot the df so that we get another dataframe which gives the detailed view on which user rated which anime and the corresponding rating
- The rest will be NaN, which will be filled with 0.0

- Filtering using K Nearest Neighbours:
  - Each anime is represented by the vector of ratings it received from all users (one row of the pivot table).
  - Two anime are considered similar when their rating vectors are close to each other, i.e. when largely the same users rated both of them. For a chosen anime, the algorithm returns the k anime whose rating vectors are nearest to it.

In [42]:
# Rows are anime titles, columns are users, and each cell holds that user's
# rating of the title (NaN where no rating is available).
anime_pivot_df = df.pivot_table(values='user_rating', index='name', columns='user_id')

In [43]:
anime_pivot_df

user_id,1,2,3,5,7,8,9,10,11,12,14,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,36,37,38,39,40,41,42,43,44,45,...,73476,73478,73479,73480,73481,73482,73483,73484,73485,73486,73487,73488,73489,73490,73491,73492,73493,73494,73495,73496,73497,73498,73499,73500,73501,73502,73503,73504,73505,73506,73507,73508,73509,73510,73511,73512,73513,73514,73515,73516
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1
Accel World,,,,,8.0,,,,,,,,,,,,,,,,,,,,,9.0,,,,,,,8.0,,,,,8.0,,,...,,,,,,,,,,,,,,,,,,,,,,,8.0,,,,,,,,,,,,,,,,,
Afro Samurai,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,9.0,,,,,,,,,,,,,,,,,,,9.0,,,,,,,,,,,,,,,,,
Air,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,9.0,,,9.0,,8.0,,,,,,,,10.0,8.0,,,
Air Gear,,,,,,,,,,,,,,,,,,,,,,,,,,,,,9.0,,,,,,,10.0,,8.0,,,...,,,,8.0,,,,,,,,,,,,,,,,,,,,,,,,,,,8.0,,,,,,,,,
Akame ga Kill!,,,8.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,9.0,,,,9.0,,9.0,,,...,,,,8.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Zoku Natsume Yuujinchou,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,8.0,,,,,,,,,,,,,,,,,,7.0,,,,,,,,,,,,,
ef: A Tale of Melodies.,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,8.0,,,,,,,,,,,,,,,,,,,,,,10.0,,,9.0,,,,,,,,,,,,,,
ef: A Tale of Memories.,,,,,,,,,,,,,8.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,8.0,,,,,,,,,,,,,,,,,,,,,,10.0,,,10.0,,,,,,,,,,,,,,
xxxHOLiC,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,9.0,,,,,,,,,,,,10.0,8.0,,,,10.0,,,,,,,,,


In [44]:
# Treat "no rating" as 0 so that every anime row is a fully numeric vector.
anime_pivot_df = anime_pivot_df.fillna(0)

In [45]:
anime_pivot_df

user_id,1,2,3,5,7,8,9,10,11,12,14,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,36,37,38,39,40,41,42,43,44,45,...,73476,73478,73479,73480,73481,73482,73483,73484,73485,73486,73487,73488,73489,73490,73491,73492,73493,73494,73495,73496,73497,73498,73499,73500,73501,73502,73503,73504,73505,73506,73507,73508,73509,73510,73511,73512,73513,73514,73515,73516
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1
Accel World,0.0,0.0,0.0,0.0,8.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9.0,0.0,0.0,0.0,0.0,0.0,0.0,8.0,0.0,0.0,0.0,0.0,8.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Afro Samurai,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,9.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Air,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9.0,0.0,0.0,9.0,0.0,8.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10.0,8.0,0.0,0.0,0.0
Air Gear,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9.0,0.0,0.0,0.0,0.0,0.0,0.0,10.0,0.0,8.0,0.0,0.0,...,0.0,0.0,0.0,8.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Akame ga Kill!,0.0,0.0,8.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9.0,0.0,0.0,0.0,9.0,0.0,9.0,0.0,0.0,...,0.0,0.0,0.0,8.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Zoku Natsume Yuujinchou,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ef: A Tale of Melodies.,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,8.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10.0,0.0,0.0,9.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ef: A Tale of Memories.,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,8.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10.0,0.0,0.0,10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
xxxHOLiC,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10.0,8.0,0.0,0.0,0.0,10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


- Converting the dataframe into a sparse matrix 
- This representation makes it easier for modelling

In [46]:
# Store the mostly-zero pivot table in compressed sparse row form.
anime_mat = csr_matrix(anime_pivot_df.to_numpy())

In [47]:
print(anime_mat)

  (0, 4)	8.0
  (0, 25)	9.0
  (0, 32)	8.0
  (0, 37)	8.0
  (0, 72)	10.0
  (0, 96)	9.0
  (0, 109)	9.0
  (0, 118)	9.0
  (0, 129)	9.0
  (0, 135)	8.0
  (0, 141)	9.0
  (0, 212)	8.0
  (0, 222)	10.0
  (0, 234)	9.0
  (0, 238)	9.0
  (0, 250)	7.0
  (0, 256)	8.0
  (0, 258)	10.0
  (0, 263)	8.0
  (0, 269)	9.0
  (0, 286)	7.0
  (0, 289)	9.0
  (0, 293)	10.0
  (0, 299)	9.0
  (0, 310)	9.0
  :	:
  (474, 67716)	9.0
  (474, 67751)	8.0
  (474, 67752)	9.0
  (474, 67754)	9.0
  (474, 67767)	9.0
  (474, 67768)	8.0
  (474, 67790)	9.0
  (474, 67796)	8.0
  (474, 67846)	10.0
  (474, 67863)	7.0
  (474, 67891)	8.0
  (474, 67904)	9.0
  (474, 67914)	8.0
  (474, 67941)	8.0
  (474, 67969)	8.0
  (474, 67974)	9.0
  (474, 67988)	9.0
  (474, 68024)	10.0
  (474, 68032)	9.0
  (474, 68075)	9.0
  (474, 68086)	8.0
  (474, 68095)	8.0
  (474, 68112)	10.0
  (474, 68113)	8.0
  (474, 68117)	9.0


## ML Model

- One of the easiest approaches to finding similar anime is to use K nearest neighbours
- Here, I will be using cosine similarity (cosine distance) to measure how close two anime are



In [48]:
from sklearn.neighbors import NearestNeighbors

# Brute-force nearest-neighbour index over the anime rating vectors, compared
# by cosine distance. No neighbours are queried at this point: the lookup for
# the chosen title is done later with an explicit n_neighbors, and that is the
# only place where distances/indices are read.
anime_nbrs = NearestNeighbors(metric='cosine', algorithm='brute')
anime_nbrs.fit(anime_mat)

In [49]:
# Row labels of the pivot table, in row order, so that row positions returned
# by the neighbour search can be mapped back to titles.
anime_names = anime_pivot_df.index.tolist()

In [50]:
anime_names

['Accel World',
 'Afro Samurai',
 'Air',
 'Air Gear',
 'Akame ga Kill!',
 'Akatsuki no Yona',
 'Akira',
 'Aldnoah.Zero',
 'Amagami SS',
 'Amagi Brilliant Park',
 'Angel Beats!',
 'Angel Beats!: Another Epilogue',
 'Ano Hi Mita Hana no Namae wo Bokutachi wa Mada Shiranai.',
 'Ano Natsu de Matteru',
 'Another',
 'Ansatsu Kyoushitsu (TV)',
 'Ansatsu Kyoushitsu (TV) 2nd Season',
 'Ao Haru Ride',
 'Ao no Exorcist',
 'Arakawa Under the Bridge',
 'Azumanga Daioh',
 'B Gata H Kei',
 'Baccano!',
 'Baccano! Specials',
 'Baka to Test to Shoukanjuu',
 'Baka to Test to Shoukanjuu Ni!',
 'Bakemonogatari',
 'Bakuman.',
 'Bakuman. 2nd Season',
 'Bakuman. 3rd Season',
 'Barakamon',
 'Beck',
 'Beelzebub',
 'Ben-To',
 'Berserk',
 'Binbougami ga!',
 'Bishoujo Senshi Sailor Moon',
 'Black Bullet',
 'Black Cat',
 'Black Lagoon',
 'Black Lagoon: Roberta&#039;s Blood Trail',
 'Black Lagoon: The Second Barrage',
 'Black★Rock Shooter (OVA)',
 'Black★Rock Shooter (TV)',
 'Bleach',
 'Bleach Movie 1: Memories of N

In [51]:
# Title to get recommendations for. It is hard-coded here; the commented-out
# line below would pick a random title from anime_names instead.
# choice=random.choice(anime_names )
choice='Barakamon'

In [52]:
choice

'Barakamon'

In [53]:
# Details of choice
df.loc[df.name == choice]

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members,user_id,user_rating,mean_rating,times_rated
466271,22789,Barakamon,"Comedy, Slice of Life",TV,12,8.5,225927,5,8,4.355120,4611
466272,22789,Barakamon,"Comedy, Slice of Life",TV,12,8.5,225927,17,8,6.901720,4611
466273,22789,Barakamon,"Comedy, Slice of Life",TV,12,8.5,225927,38,8,6.703557,4611
466274,22789,Barakamon,"Comedy, Slice of Life",TV,12,8.5,225927,93,10,7.557143,4611
466275,22789,Barakamon,"Comedy, Slice of Life",TV,12,8.5,225927,123,10,7.108225,4611
...,...,...,...,...,...,...,...,...,...,...,...
470877,22789,Barakamon,"Comedy, Slice of Life",TV,12,8.5,225927,73343,10,9.342767,4611
470878,22789,Barakamon,"Comedy, Slice of Life",TV,12,8.5,225927,73346,8,6.867568,4611
470879,22789,Barakamon,"Comedy, Slice of Life",TV,12,8.5,225927,73376,9,7.449735,4611
470880,22789,Barakamon,"Comedy, Slice of Life",TV,12,8.5,225927,73380,8,7.941489,4611


In [54]:
choice_index = anime_names.index(choice)

In [55]:
# Number of neighbours to request. The chosen title itself comes back as the
# closest match (distance ~0), so 6 neighbours yield 5 actual recommendations.
num_recom=6

In [56]:
# Rating vector of the chosen title, reshaped to a single-row 2-D array
# because kneighbors takes a batch of query points.
query_vector = anime_pivot_df.iloc[choice_index, :].values.reshape(1, -1)
distances, indices = anime_nbrs.kneighbors(query_vector, n_neighbors=num_recom)

In [57]:
distances

array([[2.22044605e-15, 5.43472121e-01, 6.21175161e-01, 6.22860637e-01,
        6.40064397e-01, 6.44228641e-01]])

In [58]:
indices

array([[ 30, 146, 171, 464, 394,  95]])

In [59]:
# Collapse the single-row 2-D result arrays to 1-D for simple iteration.
indices_flat = indices.flatten()
distances_flat = distances.flatten()

In [60]:
distances_flat

array([2.22044605e-15, 5.43472121e-01, 6.21175161e-01, 6.22860637e-01,
       6.40064397e-01, 6.44228641e-01])

In [61]:
indices_flat

array([ 30, 146, 171, 464, 394,  95])

In [62]:
anlist=[]

In [63]:
# Titles of the returned neighbours, nearest first. The list is rebuilt from
# scratch on every run, so re-executing this cell cannot append duplicates.
anlist = [anime_names[anime_index] for anime_index in indices_flat]

if anlist:
    # Position 0 is taken to be the query title itself; the remaining entries
    # are the recommendations, printed with their cosine distances.
    print("Animes Similar to ",anlist[0],":")
    for anime_name, distance in zip(anlist[1:], distances_flat[1:]):
        print("- ",anime_name,"(with similarity distance = ",distance,")")

Animes Similar to  Barakamon :
-  Gekkan Shoujo Nozaki-kun (with similarity distance =  0.5434721211641667 )
-  Haikyuu!! (with similarity distance =  0.6211751609258811 )
-  Zankyou no Terror (with similarity distance =  0.6228606374984226 )
-  Shigatsu wa Kimi no Uso (with similarity distance =  0.6400643968746094 )
-  Death Parade (with similarity distance =  0.6442286410115066 )


In [64]:
df.loc[df.name==anlist[1]].head(1)

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members,user_id,user_rating,mean_rating,times_rated
917152,23289,Gekkan Shoujo Nozaki-kun,"Comedy, Romance, School",TV,12,8.24,292622,5,5,4.35512,5442


In [65]:
df.loc[df.name==anlist[2]].head(1)

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members,user_id,user_rating,mean_rating,times_rated
281064,20583,Haikyuu!!,"Comedy, Drama, School, Shounen, Sports",TV,25,8.68,284498,3,9,7.565217,5035


In [66]:
df.loc[df.name==anlist[3]].head(1)

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members,user_id,user_rating,mean_rating,times_rated
895723,23283,Zankyou no Terror,"Psychological, Thriller",TV,11,8.26,342893,17,7,6.90172,6206


In [67]:
df.loc[df.name==anlist[4]].head(1)

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members,user_id,user_rating,mean_rating,times_rated
106130,23273,Shigatsu wa Kimi no Uso,"Drama, Music, Romance, School, Shounen",TV,22,8.92,416397,5,8,4.35512,7155


In [68]:
df.loc[df.name==anlist[5]].head(1)

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members,user_id,user_rating,mean_rating,times_rated
765738,28223,Death Parade,"Drama, Game, Mystery, Psychological, Thriller",TV,12,8.33,383914,17,8,6.90172,6649
