# Anime Recommendation System

In [502]:
import pandas as pd
import numpy as np
import string
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import re
from matplotlib import pyplot as plt

#### Importing the data

In [503]:
df = pd.read_csv(r'database.csv')

In [504]:
#quick look at the data
df.head()

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,793665
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.25,114262
3,9253,Steins;Gate,"Sci-Fi, Thriller",TV,24,9.17,673572
4,9969,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.16,151266


In [505]:
# Checking the datatypes and null values of each field
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9999 entries, 0 to 9998
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   anime_id  9999 non-null   int64  
 1   name      9999 non-null   object 
 2   genre     9959 non-null   object 
 3   type      9999 non-null   object 
 4   episodes  9999 non-null   object 
 5   rating    9997 non-null   float64
 6   members   9999 non-null   int64  
dtypes: float64(1), int64(2), object(4)
memory usage: 546.9+ KB


In [506]:
# We can drop 'anime_id' as it will not contribute to cosine similarity
df.drop('anime_id', inplace=True, axis=1)

In [507]:
df.shape

(9999, 6)

In [508]:
# The rows with null values must be dropped to avoid confusion while deriving consine similarity matrix
df.dropna(axis=0, inplace=True)

#### Making a list of genres

In [509]:
df['genre'] = df['genre'].map(lambda x: x.lower().split(','))

In [510]:
df.head()

Unnamed: 0,name,genre,type,episodes,rating,members
0,Kimi no Na wa.,"[drama, romance, school, supernatural]",Movie,1,9.37,200630
1,Fullmetal Alchemist: Brotherhood,"[action, adventure, drama, fantasy, magic,...",TV,64,9.26,793665
2,Gintama°,"[action, comedy, historical, parody, samur...",TV,51,9.25,114262
3,Steins;Gate,"[sci-fi, thriller]",TV,24,9.17,673572
4,Gintama&#039;,"[action, comedy, historical, parody, samur...",TV,51,9.16,151266


#### The anime name to be displayed in recommendations must be stripped off of all unnecessary characters

In [511]:
df['anime_name'] = df['name'].map(lambda name: re.sub('[.,@#$%^&*{}°;?!]',' ',name))

#### The 'name' field must have only alphabets and digits before passing it on for the cosine similarity matrix formation

In [512]:
df['name'] = df['name'].map(lambda name: re.sub(' ',''," ".join(re.findall('[a-zA-Z0-9]',name.lower()))))

In [513]:
df.head()

Unnamed: 0,name,genre,type,episodes,rating,members,anime_name
0,kiminonawa,"[drama, romance, school, supernatural]",Movie,1,9.37,200630,Kimi no Na wa
1,fullmetalalchemistbrotherhood,"[action, adventure, drama, fantasy, magic,...",TV,64,9.26,793665,Fullmetal Alchemist: Brotherhood
2,gintama,"[action, comedy, historical, parody, samur...",TV,51,9.25,114262,Gintama
3,steinsgate,"[sci-fi, thriller]",TV,24,9.17,673572,Steins Gate
4,gintama039,"[action, comedy, historical, parody, samur...",TV,51,9.16,151266,Gintama 039


#### Some audience only prefer smaller animes that can be finished quickly, while others prefer binge-watching larger ones. The episodes will be classified into six classes before passing them on to the matrix. 

In [514]:
# Converting object datatype to int
df['episodes'] = df['episodes'].map(lambda x: int(x) if x != 'Unknown' else None)

In [515]:
df['anime_size'] = ''

In [516]:
# new column 'anime_size' has information about the size of the anime based on number of episodes
df['anime_size'] = df['episodes'].map(lambda x: 'tiny' if (0<x<=1) 
                                      else ('small' if (1<x<=50) 
                                            else ('medium' if (50<x<=100) 
                                                  else ('large' if (100<x<=200) 
                                                        else ('verylarge' if (200<x<=500) 
                                                              else 'superlarge')))))

#### Some auidence prefer less popular shows while others only look out for the most popular ones, hence classification based on popularity becomes important. Thus, the members can also be classified into four different classes similarly. 

In [517]:
df['popularity'] = ''

In [518]:
# new column 'popularity' will have information about the fan following of each anime
df['popularity'] = df['members'].map(lambda x: 'low' if (0<x<=1000) 
                                      else ('moderate' if (1000<x<=10000) 
                                            else ('high' if (10000<x<=100000) 
                                                  else 'veryhigh')))

In [519]:
df.head()

Unnamed: 0,name,genre,type,episodes,rating,members,anime_name,anime_size,popularity
0,kiminonawa,"[drama, romance, school, supernatural]",Movie,1.0,9.37,200630,Kimi no Na wa,tiny,veryhigh
1,fullmetalalchemistbrotherhood,"[action, adventure, drama, fantasy, magic,...",TV,64.0,9.26,793665,Fullmetal Alchemist: Brotherhood,medium,veryhigh
2,gintama,"[action, comedy, historical, parody, samur...",TV,51.0,9.25,114262,Gintama,medium,veryhigh
3,steinsgate,"[sci-fi, thriller]",TV,24.0,9.17,673572,Steins Gate,small,veryhigh
4,gintama039,"[action, comedy, historical, parody, samur...",TV,51.0,9.16,151266,Gintama 039,medium,veryhigh


#### We must consider only the data relevant for formulating cosine similarity matrix and drop all other columns.

In [520]:
to_be_dropped = ['episodes','rating','members']

In [521]:
str_data = df.drop(to_be_dropped, axis=1)

In [522]:
str_data.head()

Unnamed: 0,name,genre,type,anime_name,anime_size,popularity
0,kiminonawa,"[drama, romance, school, supernatural]",Movie,Kimi no Na wa,tiny,veryhigh
1,fullmetalalchemistbrotherhood,"[action, adventure, drama, fantasy, magic,...",TV,Fullmetal Alchemist: Brotherhood,medium,veryhigh
2,gintama,"[action, comedy, historical, parody, samur...",TV,Gintama,medium,veryhigh
3,steinsgate,"[sci-fi, thriller]",TV,Steins Gate,small,veryhigh
4,gintama039,"[action, comedy, historical, parody, samur...",TV,Gintama 039,medium,veryhigh


#### Creating new column 'combined' that carries relevant information of the entire row.

In [523]:
str_data['combined'] = ''
columns = str_data.columns
for index, row in str_data.iterrows():
    words = ''
    for col in columns:
        if col != 'genre' and col != 'anime_name':
            words = words + ' ' + row[col]
        elif col != 'anime_name':
            for genre in row[col]:
                words = words + ' ' + genre
    row['combined'] = words

#df = df.drop(columns = [col for col in df.columns if col != 'combined'], inplace=True)

In [524]:
str_data.head()

Unnamed: 0,name,genre,type,anime_name,anime_size,popularity,combined
0,kiminonawa,"[drama, romance, school, supernatural]",Movie,Kimi no Na wa,tiny,veryhigh,kiminonawa drama romance school supernatur...
1,fullmetalalchemistbrotherhood,"[action, adventure, drama, fantasy, magic,...",TV,Fullmetal Alchemist: Brotherhood,medium,veryhigh,fullmetalalchemistbrotherhood action adventu...
2,gintama,"[action, comedy, historical, parody, samur...",TV,Gintama,medium,veryhigh,gintama action comedy historical parody s...
3,steinsgate,"[sci-fi, thriller]",TV,Steins Gate,small,veryhigh,steinsgate sci-fi thriller TV small veryhigh
4,gintama039,"[action, comedy, historical, parody, samur...",TV,Gintama 039,medium,veryhigh,gintama039 action comedy historical parody...


### Using CountVectorizer to convert contents of 'combined' column into vectors 

In [525]:
cv = CountVectorizer()

In [526]:
matrix = cv.fit_transform(str_data['combined'])

In [527]:
matrix

<9957x9977 sparse matrix of type '<class 'numpy.int64'>'
	with 76037 stored elements in Compressed Sparse Row format>

In [528]:
cosine_matrix = cosine_similarity(matrix)

In [529]:
cosine_matrix.shape

(9957, 9957)

In [530]:
# Creating new 'id' column for convenience 
str_data['id'] = [i for i in range(0,str_data.shape[0])]

In [531]:
# this function accepts a anime name as input as returns a list of ten most recommended animes 
def recommend_ten(title):
    anime_id = str_data[str_data.name==title]['id'].values[0]
    sorted_scores = sorted(list(enumerate(cosine_matrix[anime_id])), key=lambda x: x[1], reverse=True)
    sorted_ten = sorted_scores[1:11]
    top_ten = [str_data[anime[0]==str_data['id']]['anime_name'].values[0] for anime in sorted_ten]
    return top_ten

In [532]:
recommendations = recommend_ten('gintama')

In [533]:
recommendations

['Gintama  039 ',
 'Gintama',
 'Gintama  039 : Enchousen',
 'Viewtiful Joe',
 'Genji Tsuushin Agedama',
 'Big X',
 'Gintama Movie: Kanketsu-hen - Yorozuya yo Eien Nare',
 'Gintama: Yorinuki Gintama-san on Theater 2D',
 'Gintama Movie: Shinyaku Benizakura-hen',
 'Gintama: Shinyaku Benizakura-hen']

In [537]:
import pickle
# open a file, where you want to store the data
file = open('anime_recommendation.pkl', 'wb')

# dump information to that file
pickle.dump(cosine_matrix, file)

In [538]:
matrix = open('anime_recommendation.pkl','rb')
#cos_matrix = pickle.load(matrix)

In [539]:
cos_matrix = pickle.load(matrix)

In [540]:
print(cos_matrix)

[[1.         0.21320072 0.10206207 ... 0.14433757 0.         0.2236068 ]
 [0.21320072 1.         0.43519414 ... 0.12309149 0.26967994 0.28603878]
 [0.10206207 0.43519414 1.         ... 0.         0.25819889 0.        ]
 ...
 [0.14433757 0.12309149 0.         ... 1.         0.36514837 0.51639778]
 [0.         0.26967994 0.25819889 ... 0.36514837 1.         0.28284271]
 [0.2236068  0.28603878 0.         ... 0.51639778 0.28284271 1.        ]]


In [542]:
file = open('anime_dataframe.pkl', 'wb')

# dump information to that file
pickle.dump(str_data, file)

In [543]:
data = open('anime_dataframe.pkl', 'rb')

In [544]:
dataframe = pickle.load(data)

In [545]:
dataframe

Unnamed: 0,name,genre,type,anime_name,anime_size,popularity,combined,id
0,kiminonawa,"[drama, romance, school, supernatural]",Movie,Kimi no Na wa,tiny,veryhigh,kiminonawa drama romance school supernatur...,0
1,fullmetalalchemistbrotherhood,"[action, adventure, drama, fantasy, magic,...",TV,Fullmetal Alchemist: Brotherhood,medium,veryhigh,fullmetalalchemistbrotherhood action adventu...,1
2,gintama,"[action, comedy, historical, parody, samur...",TV,Gintama,medium,veryhigh,gintama action comedy historical parody s...,2
3,steinsgate,"[sci-fi, thriller]",TV,Steins Gate,small,veryhigh,steinsgate sci-fi thriller TV small veryhigh,3
4,gintama039,"[action, comedy, historical, parody, samur...",TV,Gintama 039,medium,veryhigh,gintama039 action comedy historical parody...,4
...,...,...,...,...,...,...,...,...
9994,pochacconojacktomamenoki,"[fantasy, kids]",OVA,Pochacco no Jack to Mame no Ki,tiny,low,pochacconojacktomamenoki fantasy kids OVA ti...,9952
9995,pochaccononinjinhatawaoosawagi,"[fantasy, kids]",OVA,Pochacco no Ninjin Hata wa Oosawagi,tiny,low,pochaccononinjinhatawaoosawagi fantasy kids ...,9953
9996,pochacconowakuwakubirthday,"[fantasy, kids]",OVA,Pochacco no Wakuwaku Birthday,tiny,low,pochacconowakuwakubirthday fantasy kids OVA ...,9954
9997,pokapokamorinorascal,[kids],TV,Poka Poka Mori no Rascal,medium,low,pokapokamorinorascal kids TV medium low,9955
