# Anime Recommender System Using Hybrid Filter
<b>Muhammed Rüşen Birben 150220755</b>

## Data Preparation

### Importing pandas and reading anime csv files

In [23]:
import pandas as pd
import numpy as np

In [3]:
# Load the two raw anime tables: the one that carries the synopsis text and
# the one that carries the full per-title metadata.
anime_synopsis = pd.read_csv("data_raw/data/anime_with_synopsis.csv")
anime_df = pd.read_csv("data_raw/data/anime.csv")



### Getting info for both dfs

In [4]:
# Summarise column names, non-null counts and dtypes of the metadata table.
anime_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17562 entries, 0 to 17561
Data columns (total 35 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   MAL_ID         17562 non-null  int64 
 1   Name           17562 non-null  object
 2   Score          17562 non-null  object
 3   Genres         17562 non-null  object
 4   English name   17562 non-null  object
 5   Japanese name  17562 non-null  object
 6   Type           17562 non-null  object
 7   Episodes       17562 non-null  object
 8   Aired          17562 non-null  object
 9   Premiered      17562 non-null  object
 10  Producers      17562 non-null  object
 11  Licensors      17562 non-null  object
 12  Studios        17562 non-null  object
 13  Source         17562 non-null  object
 14  Duration       17562 non-null  object
 15  Rating         17562 non-null  object
 16  Ranked         17562 non-null  object
 17  Popularity     17562 non-null  int64 
 18  Members        17562 non-n

In [5]:
# Summarise column names, non-null counts and dtypes of the synopsis table.
anime_synopsis.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16214 entries, 0 to 16213
Data columns (total 5 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   MAL_ID     16214 non-null  int64 
 1   Name       16214 non-null  object
 2   Score      16214 non-null  object
 3   Genres     16214 non-null  object
 4   sypnopsis  16206 non-null  object
dtypes: int64(1), object(4)
memory usage: 633.5+ KB


### Dropping duplicates for both

In [6]:
# Remove exact duplicate rows from each table and report the row count
# before and after, so any loss is visible in the cell output.
before = anime_df.shape[0]
anime_df = anime_df.drop_duplicates()
after = anime_df.shape[0]
print(f"Before anime {before}\nAfter anime {after}")

before = anime_synopsis.shape[0]
anime_synopsis = anime_synopsis.drop_duplicates()
after = anime_synopsis.shape[0]
print(f"Before anime_synopsis {before}\nAfter anime_synopsis {after}")

Before df 17562
After df 17562
Before anime_synopsis 16214
After anime_synopsis 16214


### Renaming columns of dfs

In [7]:
print(anime_df.columns)
# Normalise every header to lower snake_case. Names containing spaces (and
# MAL_ID) get an explicit replacement first; the remaining ones only need
# "-" swapped for "_" and lower-casing.
anime_df = (
    anime_df
    .rename(columns={
        "MAL_ID": "anime_id",
        "English name": "english_name",
        "Japanese name": "japanese_name",
        "On-Hold": "on_hold",
        "Plan to Watch": "plan_to_watch",
    })
    .rename(columns=lambda col: col.replace("-", "_").lower())
)
print(anime_df.columns)

Index(['MAL_ID', 'Name', 'Score', 'Genres', 'English name', 'Japanese name',
       'Type', 'Episodes', 'Aired', 'Premiered', 'Producers', 'Licensors',
       'Studios', 'Source', 'Duration', 'Rating', 'Ranked', 'Popularity',
       'Members', 'Favorites', 'Watching', 'Completed', 'On-Hold', 'Dropped',
       'Plan to Watch', 'Score-10', 'Score-9', 'Score-8', 'Score-7', 'Score-6',
       'Score-5', 'Score-4', 'Score-3', 'Score-2', 'Score-1'],
      dtype='object')
Index(['anime_id', 'name', 'score', 'genres', 'english_name', 'japanese_name',
       'type', 'episodes', 'aired', 'premiered', 'producers', 'licensors',
       'studios', 'source', 'duration', 'rating', 'ranked', 'popularity',
       'members', 'favorites', 'watching', 'completed', 'on_hold', 'dropped',
       'plan_to_watch', 'score_10', 'score_9', 'score_8', 'score_7', 'score_6',
       'score_5', 'score_4', 'score_3', 'score_2', 'score_1'],
      dtype='object')


In [8]:
print(anime_synopsis.columns)
print("Feature num: ", len(anime_synopsis.columns))
# Same header normalisation as for the metadata table; this also corrects
# the misspelled "sypnopsis" column that ships with the raw file.
anime_synopsis = (
    anime_synopsis
    .rename(columns={"MAL_ID": "anime_id", "sypnopsis": "synopsis"})
    .rename(columns=lambda col: col.replace("-", "_").lower())
)
print(anime_synopsis.columns)

Index(['MAL_ID', 'Name', 'Score', 'Genres', 'sypnopsis'], dtype='object')
Feature num:  5
Index(['anime_id', 'name', 'score', 'genres', 'synopsis'], dtype='object')


### Filter Adult Content

In [11]:
# Collect the ids of the adult-rated titles first: the same ids are reused
# further down to drop the matching rows from the rating tables.
adult_anime_ids = anime_df[anime_df['rating'] == 'Rx - Hentai']['anime_id']

# .copy() yields an independent frame, so the column assignments made in
# later cells do not trigger pandas' SettingWithCopyWarning.
anime_df = anime_df[anime_df['rating'] != 'Rx - Hentai'].copy()

# Match on anime_id, not on the row index: the two frames were read from
# different files with different row counts (17562 vs 16214), so equal index
# labels do not refer to the same title.
anime_synopsis = anime_synopsis[~anime_synopsis['anime_id'].isin(adult_anime_ids)].copy()

### Fill 'Unknown' With np.nan or Zero

In [24]:
# Columns that hold counts / ranks, and columns that hold real-valued scores.
to_int = ["episodes", "ranked"] + [f"score_{n}" for n in range(10, 0, -1)]
to_float = ["score"]

# Count-like columns: the "Unknown" placeholder becomes 0. The cast goes
# through float before int (presumably because some values are stored as
# decimal-formatted strings -- TODO confirm).
for col in to_int:
    anime_df[col] = (
        anime_df[col]
        .mask(anime_df[col] == "Unknown", 0)
        .astype(float)
        .astype(int)
    )

# Score columns: "Unknown" becomes NaN so that it is treated as missing
# rather than as a real score. Both tables carry a score column.
for col in to_float:
    anime_df[col] = anime_df[col].mask(anime_df[col] == "Unknown", np.nan).astype(float)
    anime_synopsis[col] = (
        anime_synopsis[col]
        .mask(anime_synopsis[col] == "Unknown", np.nan)
        .astype(float)
    )

### Looking for na

In [8]:
# Show only the columns that contain at least one missing value.
# NOTE(review): the recorded output comes from an earlier execution (In [8])
# than the 'Unknown' -> NaN conversion cell (In [24]); re-run to confirm
# that nothing is missing.
anime_df.isna().sum().loc[lambda counts: counts != 0]

Series([], dtype: int64)

In [9]:
# Show only the columns that contain at least one missing value;
# in the recorded run a few titles have no synopsis text.
anime_synopsis.isna().sum().loc[lambda counts: counts != 0]

synopsis    8
dtype: int64

### Saving as csv

In [10]:
# Persist both cleaned tables (without the pandas index column), then drop
# the in-memory frames before the large rating table is loaded.
anime_df.to_csv("data_cleansed/anime_info.csv", index=False)
anime_synopsis.to_csv("data_cleansed/anime_synopsis.csv", index=False)
del anime_synopsis, anime_df

### Reading the rating dataset & dropping the duplicates

In [11]:
# animelist.csv is very large: reading it is slow and, depending on the
# available RAM, may fail outright. Exact duplicate rows are removed
# right after loading.
ratings = pd.read_csv("data_raw/data/animelist.csv").drop_duplicates()

### Looking for info

In [12]:
ratings.info() # column names already look consistent, so no renaming is needed

<class 'pandas.core.frame.DataFrame'>
Int64Index: 109224746 entries, 0 to 109224746
Data columns (total 5 columns):
 #   Column            Dtype
---  ------            -----
 0   user_id           int64
 1   anime_id          int64
 2   rating            int64
 3   watching_status   int64
 4   watched_episodes  int64
dtypes: int64(5)
memory usage: 4.9 GB


In [13]:
# Show only the columns with missing values (none in the recorded run).
ratings.isna().sum().loc[lambda counts: counts != 0]

Series([], dtype: int64)

### Filtering Adult Anime Ratings

In [None]:
# Drop every rating that refers to an adult title; adult_anime_ids is the
# id Series collected in the "Filter Adult Content" cell.
ratings = ratings[~ratings['anime_id'].isin(adult_anime_ids)]

### Saving rating files to csv

In [14]:
# Writing the full watch-list table is slow (about 8 minutes when this was run).
ratings.to_csv("data_cleansed/ratings_watch_inf.csv", index=False)
del ratings

# rating_complete.csv only holds ratings that were given after the anime was
# watched to the end (watching_status == 2). It goes through the same steps:
# drop duplicate rows, drop adult titles, write the cleaned file. The frame
# is never bound to a name, so nothing has to be deleted afterwards.
(
    pd.read_csv('data_raw/data/rating_complete.csv')
    .drop_duplicates()
    .loc[lambda frame: ~frame['anime_id'].isin(adult_anime_ids)]
    .to_csv("data_cleansed/ratings.csv", index=False)
)

### Reading watching status mapping csv

In [15]:
# Small lookup table that maps the numeric watching_status codes to text.
df = pd.read_csv("data_raw/data/watching_status.csv")

In [16]:
print(df) # nothing to clean here; just keep the code -> description mapping in mind.
# not worth copying into the data_cleansed folder
del df

   status         description
0       1  Currently Watching
1       2           Completed
2       3             On Hold
3       4             Dropped
4       6       Plan to Watch


In [17]:
import os
# List the files written to the data_cleansed folder by the cells above.
os.listdir('data_cleansed')

['anime_info.csv',
 'anime_synopsis.csv',
 'ratings.csv',
 'ratings_watch_inf.csv']

<b>anime_info.csv:</b> consists of detailed information about each anime, such as genre, studio, airing year and the number of ratings for each score value.<br>
<b>anime_synopsis.csv:</b> consists of basic information about each anime, such as score and genre. Most importantly, it includes the synopsis of the anime.<br>
<b>ratings.csv:</b> includes all user ratings given for anime that the user actually watched completely (watching_status==2).<br>
<b>ratings_watch_inf.csv:</b> includes all user ratings together with the number of watched episodes and the watching status for each rated anime.<br>

## Data Preprocessing

### Reading Anime and Rating CSVs

In [18]:
# Reload the cleaned tables written in the data-preparation part:
# completed-watch ratings and the anime metadata.
ratings = pd.read_csv('data_cleansed/ratings.csv')
animes = pd.read_csv('data_cleansed/anime_info.csv')

### Selecting Ratings and Animes

In [None]:
# Keep only the users whose number of ratings is strictly above the median
# number of ratings per user (113 when this was written). This ignores
# roughly half of the users and reduces the sparsity of the user-item matrix.
#
# The counts are held in a Series on purpose: masking a DataFrame with a
# boolean DataFrame keeps every row (non-matching cells become NaN), so its
# index would still contain all users and nothing would be filtered.
ratings_per_user = ratings.groupby("user_id").size()
active_user_ids = ratings_per_user[ratings_per_user > ratings_per_user.median()].index
print(f"Before: {len(ratings.index)}")
ratings = ratings[ratings['user_id'].isin(active_user_ids)]
print(f"After: {len(ratings.index)}")
del ratings_per_user, active_user_ids

In [None]:
# Minimum total number of score votes an anime must exceed to be kept.
NUMBER = 200
# Total votes per anime: sum over the score_10 ... score_1 columns, which
# are the trailing columns of the table.
vote_totals = animes.loc[:, "score_10":].sum(axis=1)
print(f"Before: {len(animes.index)}")
animes = animes[vote_totals > NUMBER]
print(f"After: {len(animes.index)}")
del vote_totals


### Creating User-Item Matrix

### Creating Labels for Anime Genres

## Model Building