## 1. Importing Libraries

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import difflib
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import mean_squared_error
from sklearn.feature_extraction.text import TfidfVectorizer

## 2. Import Data

In [2]:
#Import data --- using 1M rows for evaluation

anime_rating_data = pd.read_csv("C:/Users/PC/Downloads/Dataset/animelist.csv",nrows=1000000)
anime_data = pd.read_csv("C:/Users/PC/Downloads/Dataset/anime.csv")

#Rename columns for MAL_ID -> Anime_id

anime_data = anime_data.rename(columns={"MAL_ID": "anime_id"})
anime_contact_data = anime_data[["anime_id", "Name"]]

## 3. Get the Initial Ideas of Data

In [3]:
#Pre-processing - Merging anime_data to anime_contact_data

anime_rating_data = anime_rating_data.merge(anime_contact_data, left_on = 'anime_id', right_on = 'anime_id', how = 'left')
anime_rating_data = anime_rating_data[["user_id", "Name", "anime_id","rating", "watching_status", "watched_episodes"]]

#Display 10 rows of df
anime_rating_data.head(11)

Unnamed: 0,user_id,Name,anime_id,rating,watching_status,watched_episodes
0,0,Basilisk: Kouga Ninpou Chou,67,9,1,1
1,0,Fairy Tail,6702,7,1,4
2,0,Gokusen,242,10,1,4
3,0,Kuroshitsuji,4898,0,1,1
4,0,One Piece,21,10,1,0
5,0,School Rumble,24,9,1,5
6,0,Seto no Hanayome,2104,0,1,4
7,0,Skip Beat!,4722,8,1,4
8,0,Sora no Manimani,6098,6,1,2
9,0,Tokimeki Tonight,3125,9,1,29


In [4]:
anime_rating_data.shape

(1000000, 6)

In [5]:
#some df info 

anime_rating_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1000000 entries, 0 to 999999
Data columns (total 6 columns):
 #   Column            Non-Null Count    Dtype 
---  ------            --------------    ----- 
 0   user_id           1000000 non-null  int64 
 1   Name              1000000 non-null  object
 2   anime_id          1000000 non-null  int64 
 3   rating            1000000 non-null  int64 
 4   watching_status   1000000 non-null  int64 
 5   watched_episodes  1000000 non-null  int64 
dtypes: int64(5), object(1)
memory usage: 53.4+ MB


In [6]:
# Look how many NA values in the data
count_na = anime_rating_data.isna().sum()
print(count_na)

user_id             0
Name                0
anime_id            0
rating              0
watching_status     0
watched_episodes    0
dtype: int64


## 4. Selecting Features for Recommendation

In [7]:
#In this case we can see the data are dense so we can move forward to recommendation system by selecting relevant features

selected_features = ['user_id','Name','anime_id','rating','watching_status','watched_episodes']
print(selected_features)

['user_id', 'Name', 'anime_id', 'rating', 'watching_status', 'watched_episodes']


In [8]:
# Combining all features - convert any int64 dtype to object(str)
combined_features = anime_rating_data['user_id'].apply(str)+' '+anime_rating_data['Name']+' '+anime_rating_data['rating'].apply(str)+' '+anime_rating_data['watching_status'].apply(str)+' '+anime_rating_data['watched_episodes'].apply(str)

combined_features.shape

(1000000,)

In [9]:
# converting text data to feature vectors

vectorizer = TfidfVectorizer()

In [10]:
feature_vectors = vectorizer.fit_transform(combined_features)

In [12]:
print(feature_vectors)

  (0, 4754)	0.4729974659071348
  (0, 10836)	0.5175597188473618
  (0, 9022)	0.4993172863147588
  (0, 4041)	0.5090064658325409
  (1, 13925)	0.7111868916578028
  (1, 5949)	0.7030029908429358
  (2, 24)	0.32338838193226077
  (2, 6586)	0.9462663232046431
  (3, 9183)	1.0
  (4, 11614)	0.6824033083061536
  (4, 11177)	0.6192409929970358
  (4, 24)	0.38841513539621714
  (5, 12327)	0.8035820857726216
  (5, 12657)	0.5951939443789083
  (6, 6951)	0.6572883503490847
  (6, 10869)	0.20499146749830507
  (6, 12877)	0.7252244636995296
  (7, 4082)	0.703208738736275
  (7, 13427)	0.7109834525253997
  (8, 9741)	0.8253637039187083
  (8, 13509)	0.5292682121990252
  (8, 10869)	0.19659581839204682
  (9, 2004)	0.5459510704317813
  (9, 14394)	0.6476599193410211
  (9, 14334)	0.5314828854945137
  :	:
  (999997, 14884)	0.46387676637935776
  (999997, 15010)	0.20984989311774485
  (999997, 4590)	0.31350889701566165
  (999998, 2530)	0.4305287422456417
  (999998, 3466)	0.45395981505350436
  (999998, 3393)	0.45395981505350436

## 5. Test Our Model and Make Some Recommendation