## 1. Importing Libraries

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import difflib
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import mean_squared_error
from sklearn.feature_extraction.text import TfidfVectorizer

## 2. Import Data

In [2]:
#Import data --- using 1M rows for evaluation

anime_rating_data = pd.read_csv("C:/Users/PC/Downloads/Dataset/animelist.csv",nrows=5000)
anime_data = pd.read_csv("C:/Users/PC/Downloads/Dataset/anime.csv")

#Rename columns for MAL_ID -> Anime_id

anime_data = anime_data.rename(columns={"MAL_ID": "anime_id"})
anime_contact_data = anime_data[["anime_id", "Name"]]

## 3. Get the Initial Ideas of Data

In [3]:
#Pre-processing - Merging anime_data to anime_contact_data

anime_rating_data = anime_rating_data.merge(anime_contact_data, left_on = 'anime_id', right_on = 'anime_id', how = 'left')
anime_rating_data = anime_rating_data[["user_id", "Name", "anime_id","rating", "watching_status", "watched_episodes"]]

#Display 10 rows of df
anime_rating_data.head(11)

Unnamed: 0,user_id,Name,anime_id,rating,watching_status,watched_episodes
0,0,Basilisk: Kouga Ninpou Chou,67,9,1,1
1,0,Fairy Tail,6702,7,1,4
2,0,Gokusen,242,10,1,4
3,0,Kuroshitsuji,4898,0,1,1
4,0,One Piece,21,10,1,0
5,0,School Rumble,24,9,1,5
6,0,Seto no Hanayome,2104,0,1,4
7,0,Skip Beat!,4722,8,1,4
8,0,Sora no Manimani,6098,6,1,2
9,0,Tokimeki Tonight,3125,9,1,29


In [4]:
anime_rating_data.shape

(5000, 6)

In [5]:
#some df info 

anime_rating_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5000 entries, 0 to 4999
Data columns (total 6 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   user_id           5000 non-null   int64 
 1   Name              5000 non-null   object
 2   anime_id          5000 non-null   int64 
 3   rating            5000 non-null   int64 
 4   watching_status   5000 non-null   int64 
 5   watched_episodes  5000 non-null   int64 
dtypes: int64(5), object(1)
memory usage: 273.4+ KB


In [6]:
# Look how many NA values in the data
count_na = anime_rating_data.isna().sum()
print(count_na)

user_id             0
Name                0
anime_id            0
rating              0
watching_status     0
watched_episodes    0
dtype: int64


## 4. Selecting Features for Recommendation

In [7]:
#In this case we can see the data are dense so we can move forward to recommendation system by selecting relevant features

selected_features = ['user_id','Name','anime_id','rating','watching_status','watched_episodes']
print(selected_features)

['user_id', 'Name', 'anime_id', 'rating', 'watching_status', 'watched_episodes']


In [8]:
# Combining all features - convert any int64 dtype to object(str)
combined_features = anime_rating_data['user_id'].apply(str)+' '+anime_rating_data['Name']+' '+anime_rating_data['rating'].apply(str)+' '+anime_rating_data['watching_status'].apply(str)+' '+anime_rating_data['watched_episodes'].apply(str)

combined_features.shape

(5000,)

In [9]:
# converting text data to feature vectors

vectorizer = TfidfVectorizer()

In [10]:
feature_vectors = vectorizer.fit_transform(combined_features)

In [11]:
print(feature_vectors)

  (0, 578)	0.5042520846199159
  (0, 2342)	0.5042520846199159
  (0, 1796)	0.48702105238830024
  (0, 370)	0.5042520846199159
  (1, 3263)	0.7120930659612248
  (1, 897)	0.7020850841671135
  (2, 10)	0.3905839996612793
  (2, 1101)	0.9205672920588683
  (3, 1849)	1.0
  (4, 2559)	0.7011049834354991
  (4, 2427)	0.6202574904583171
  (4, 10)	0.35175623339503065
  (5, 2782)	0.7783775205377321
  (5, 2863)	0.627796492122672
  (6, 1203)	0.6849607258439752
  (6, 2352)	0.21133055851618437
  (6, 2941)	0.697257627486808
  (7, 378)	0.7071067811865475
  (7, 3118)	0.7071067811865475
  (8, 2035)	0.8037287263571997
  (8, 3145)	0.5570744806235796
  (8, 2352)	0.20901712242350518
  (9, 76)	0.5592541117145474
  (9, 3396)	0.5861888938432613
  (9, 3384)	0.5861888938432613
  :	:
  (4996, 35)	0.19181656585054718
  (4996, 1350)	0.6223574855806339
  (4996, 3714)	0.584594755417584
  (4996, 2424)	0.4546452949419669
  (4996, 21)	0.16560251487353958
  (4997, 2184)	0.654433624472103
  (4997, 3722)	0.654433624472103
  (4997, 

## 5. Test Our Model and Make Some Recommendation

In [12]:
# getting the similarity scores using cosine similarity

similarity = cosine_similarity(feature_vectors)
print(similarity)

[[1.         0.         0.         ... 0.         0.         0.        ]
 [0.         1.         0.         ... 0.         0.         0.        ]
 [0.         0.         1.         ... 0.         0.         0.        ]
 ...
 [0.         0.         0.         ... 1.         0.02326927 0.05493109]
 [0.         0.         0.         ... 0.02326927 1.         0.06024447]
 [0.         0.         0.         ... 0.05493109 0.06024447 1.        ]]


In [13]:
print(similarity.shape)

(5000, 5000)


## 6. Input from Users

In [14]:
# Input from user

anime_name = input(' Enter your favourite anime name : ')

 Enter your favourite anime name :  berserk


In [16]:
# creating a list with all the movie names given in the dataset

list_of_all_titles = anime_rating_data['Name'].tolist()
print(list_of_all_titles)

['Basilisk: Kouga Ninpou Chou', 'Fairy Tail', 'Gokusen', 'Kuroshitsuji', 'One Piece', 'School Rumble', 'Seto no Hanayome', 'Skip Beat!', 'Sora no Manimani', 'Tokimeki Tonight', 'Yu☆Gi☆Oh! Duel Monsters', 'Black Cat (TV)', 'Byousoku 5 Centimeter', 'Daisougen no Chiisana Tenshi: Bush Baby', 'Erementar Gerad', 'Fate/stay night', 'Fullmetal Alchemist', 'Fullmetal Alchemist: The Conqueror of Shamballa', 'Ged Senki', 'Ghost Hunt', 'Hotaru no Haka', 'Howl no Ugoku Shiro', 'Igano Kabamaru', 'Jin-Rou', 'Jungle no Ouja Taa-chan', 'Kaiketsu Zorro', 'Kanojo to Kanojo no Neko', 'Kumo no Mukou, Yakusoku no Basho', 'Legend of Duo', 'Lovely★Complex', 'Mononoke Hime', 'Muka Muka Paradise', 'Mushishi', 'Naruto', 'Naruto Narutimate Hero 3: Tsuini Gekitotsu! Jounin vs. Genin!! Musabetsu Dairansen Taikai Kaisai!!', 'Naruto: Akaki Yotsuba no Clover wo Sagase', 'Naruto: Dai Katsugeki!! Yuki Hime Shinobu Houjou Dattebayo! - Konoha no Sato no Dai Undoukai', 'Neko no Ongaeshi', 'On Your Mark', 'One Piece Movie 

In [17]:
# finding the close match for the movie name given by the user

find_close_match = difflib.get_close_matches(anime_name, list_of_all_titles)
print(find_close_match)

['Berserk']


In [18]:
close_match = find_close_match[0]
print(close_match)

Berserk


In [24]:
# finding the index of the movie with title

index_of_the_anime = anime_rating_data[anime_rating_data.Name == close_match]['user_id'].values[0]
print(index_of_the_anime)

14


In [25]:
# getting a list of similar movies

similarity_score = list(enumerate(similarity[index_of_the_anime]))
print(similarity_score)

[(0, 0.0), (1, 0.0), (2, 0.0), (3, 0.0), (4, 0.0), (5, 0.0), (6, 0.0), (7, 0.0), (8, 0.0), (9, 0.0), (10, 0.0), (11, 0.0), (12, 0.0), (13, 0.0), (14, 1.0000000000000002), (15, 0.0), (16, 0.0), (17, 0.0), (18, 0.0), (19, 0.0), (20, 0.0), (21, 0.0), (22, 0.0), (23, 0.0), (24, 0.0), (25, 0.0), (26, 0.0), (27, 0.0), (28, 0.0), (29, 0.0), (30, 0.0), (31, 0.0), (32, 0.18843517438694107), (33, 0.0), (34, 0.0), (35, 0.0), (36, 0.0), (37, 0.0), (38, 0.0), (39, 0.0), (40, 0.0), (41, 0.0), (42, 0.11793978126695476), (43, 0.0), (44, 0.0), (45, 0.0), (46, 0.0), (47, 0.0), (48, 0.0), (49, 0.0), (50, 0.0), (51, 0.0), (52, 0.0), (53, 0.0), (54, 0.0), (55, 0.0), (56, 0.0), (57, 0.0), (58, 0.0), (59, 0.0), (60, 0.0), (61, 0.0), (62, 0.0), (63, 0.0), (64, 0.0), (65, 0.0), (66, 0.0), (67, 0.0), (68, 0.0), (69, 0.0), (70, 0.0), (71, 0.0), (72, 0.0), (73, 0.0), (74, 0.0), (75, 0.0), (76, 0.0), (77, 0.0), (78, 0.0), (79, 0.0), (80, 0.0), (81, 0.0), (82, 0.0), (83, 0.0), (84, 0.0), (85, 0.0), (86, 0.0), (87, 

In [26]:
# sorting the movies based on their similarity score

sorted_similar_anime = sorted(similarity_score, key = lambda x:x[1], reverse = True) 
print(sorted_similar_anime)

[(14, 1.0000000000000002), (1746, 0.9303025765498621), (3521, 0.8358217537850743), (2005, 0.2176477123490961), (926, 0.21428322073457154), (1088, 0.20936844158391038), (4466, 0.2060880472410133), (3766, 0.1970033954585834), (3789, 0.196934456439479), (2384, 0.19536157183252137), (32, 0.18843517438694107), (3295, 0.18647474313093332), (1085, 0.18564572119546163), (4166, 0.18296288778667943), (4161, 0.17831546134195356), (4086, 0.17206890029110955), (4741, 0.17206890029110955), (2798, 0.16932944636311564), (4548, 0.167844227909404), (1717, 0.1676234778722027), (283, 0.16480630728186332), (1560, 0.16220438703780446), (4095, 0.16216854590298485), (4624, 0.16216854590298485), (267, 0.16129981286299303), (909, 0.16129981286299303), (1609, 0.16129981286299303), (2017, 0.16129981286299303), (3812, 0.15993443248506348), (4920, 0.1596134557932629), (1254, 0.15866977087608575), (2438, 0.1577248835754957), (1101, 0.15768322552195413), (3837, 0.15654575072338242), (1094, 0.15452707726785578), (2298

In [28]:
# print the name of similar Anime based on the index - Top 10

print('Anime suggested for you : \n')

i = 1

for anime in sorted_similar_anime:
  index = anime[0]
  title_from_index = anime_rating_data[anime_rating_data.index==index]['Name'].values[0]
  if (i<11):
    print(i, '.',title_from_index)
    i+=1

Anime suggested for you : 

1 . Erementar Gerad
2 . Erementar Gerad
3 . Erementar Gerad
4 . K-On!!
5 . Nichijou
6 . D.N.Angel
7 . K-On!!
8 . Nichijou
9 . R.O.D: The TV
10 . K-On!!
