# Importing libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize import RegexpTokenizer
from sklearn import preprocessing
from scipy.sparse import hstack
import pandas_profiling
%matplotlib inline

# Loading Data and Cleaning

In [2]:
path = "data/anime_data.csv"

In [3]:
df = pd.read_csv(path)

In [4]:
df.head()

Unnamed: 0,MAL_ID,Name,Score,Genres,sypnopsis
0,1,Cowboy Bebop,8.78,"Action, Adventure, Comedy, Drama, Sci-Fi, Space","In the year 2071, humanity has colonized sever..."
1,5,Cowboy Bebop: Tengoku no Tobira,8.39,"Action, Drama, Mystery, Sci-Fi, Space","other day, another bounty—such is the life of ..."
2,6,Trigun,8.24,"Action, Sci-Fi, Adventure, Comedy, Drama, Shounen","Vash the Stampede is the man with a $$60,000,0..."
3,7,Witch Hunter Robin,7.27,"Action, Mystery, Police, Supernatural, Drama, ...",ches are individuals with special powers like ...
4,8,Bouken Ou Beet,6.98,"Adventure, Fantasy, Shounen, Supernatural",It is the dark century and the people are suff...


In [5]:
df2 = pd.read_csv("data/anime.csv")

In [6]:
df2.columns

Index(['MAL_ID', 'Name', 'Score', 'Genres', 'English name', 'Japanese name',
       'Type', 'Episodes', 'Aired', 'Premiered', 'Producers', 'Licensors',
       'Studios', 'Source', 'Duration', 'Rating', 'Ranked', 'Popularity',
       'Members', 'Favorites', 'Watching', 'Completed', 'On-Hold', 'Dropped',
       'Plan to Watch', 'Score-10', 'Score-9', 'Score-8', 'Score-7', 'Score-6',
       'Score-5', 'Score-4', 'Score-3', 'Score-2', 'Score-1'],
      dtype='object')

In [7]:
df2.head()

Unnamed: 0,MAL_ID,Name,Score,Genres,English name,Japanese name,Type,Episodes,Aired,Premiered,...,Score-10,Score-9,Score-8,Score-7,Score-6,Score-5,Score-4,Score-3,Score-2,Score-1
0,1,Cowboy Bebop,8.78,"Action, Adventure, Comedy, Drama, Sci-Fi, Space",Cowboy Bebop,カウボーイビバップ,TV,26,"Apr 3, 1998 to Apr 24, 1999",Spring 1998,...,229170.0,182126.0,131625.0,62330.0,20688.0,8904.0,3184.0,1357.0,741.0,1580.0
1,5,Cowboy Bebop: Tengoku no Tobira,8.39,"Action, Drama, Mystery, Sci-Fi, Space",Cowboy Bebop:The Movie,カウボーイビバップ 天国の扉,Movie,1,"Sep 1, 2001",Unknown,...,30043.0,49201.0,49505.0,22632.0,5805.0,1877.0,577.0,221.0,109.0,379.0
2,6,Trigun,8.24,"Action, Sci-Fi, Adventure, Comedy, Drama, Shounen",Trigun,トライガン,TV,26,"Apr 1, 1998 to Sep 30, 1998",Spring 1998,...,50229.0,75651.0,86142.0,49432.0,15376.0,5838.0,1965.0,664.0,316.0,533.0
3,7,Witch Hunter Robin,7.27,"Action, Mystery, Police, Supernatural, Drama, ...",Witch Hunter Robin,Witch Hunter ROBIN (ウイッチハンターロビン),TV,26,"Jul 2, 2002 to Dec 24, 2002",Summer 2002,...,2182.0,4806.0,10128.0,11618.0,5709.0,2920.0,1083.0,353.0,164.0,131.0
4,8,Bouken Ou Beet,6.98,"Adventure, Fantasy, Shounen, Supernatural",Beet the Vandel Buster,冒険王ビィト,TV,52,"Sep 30, 2004 to Sep 29, 2005",Fall 2004,...,312.0,529.0,1242.0,1713.0,1068.0,634.0,265.0,83.0,50.0,27.0


In [8]:
def get_episodes(df, df2):
    episodes = []
    for i in df.Name:
        if df2.loc[df2['Name'] == i].empty:
            pass
        else:
            try:
                episodes.append(int(df2.loc[df2['Name'] == i].Episodes.values[0]) )
            except ValueError:
                episodes.append(0)
            
    
    return episodes

In [9]:
episode_list = get_episodes(df,df2)

In [10]:
episode_list

[26,
 1,
 26,
 26,
 52,
 145,
 24,
 52,
 24,
 74,
 220,
 0,
 178,
 12,
 26,
 24,
 22,
 24,
 69,
 26,
 26,
 1,
 1,
 25,
 1,
 4,
 94,
 1,
 1,
 26,
 5,
 24,
 3,
 26,
 24,
 1,
 26,
 13,
 26,
 26,
 26,
 24,
 26,
 26,
 12,
 12,
 12,
 26,
 24,
 23,
 25,
 24,
 12,
 13,
 26,
 25,
 13,
 13,
 24,
 43,
 12,
 6,
 1,
 13,
 50,
 47,
 1,
 1,
 51,
 49,
 3,
 39,
 50,
 50,
 50,
 49,
 26,
 26,
 26,
 13,
 12,
 26,
 74,
 24,
 13,
 51,
 13,
 3,
 24,
 52,
 52,
 8,
 26,
 26,
 13,
 26,
 7,
 4,
 13,
 26,
 51,
 52,
 52,
 4,
 13,
 13,
 24,
 6,
 50,
 25,
 26,
 49,
 12,
 13,
 75,
 62,
 8,
 8,
 14,
 12,
 44,
 12,
 13,
 26,
 12,
 14,
 12,
 12,
 50,
 3,
 24,
 45,
 64,
 1,
 24,
 26,
 13,
 24,
 13,
 24,
 26,
 26,
 1,
 26,
 26,
 24,
 26,
 12,
 101,
 13,
 25,
 24,
 26,
 4,
 26,
 26,
 1,
 13,
 13,
 26,
 39,
 12,
 26,
 13,
 1,
 24,
 3,
 1,
 1,
 24,
 5,
 12,
 12,
 24,
 24,
 1,
 50,
 6,
 26,
 12,
 26,
 27,
 13,
 3,
 26,
 161,
 1,
 6,
 1,
 26,
 1,
 13,
 153,
 64,
 13,
 6,
 26,
 12,
 27,
 26,
 70,
 78,
 26,
 0,
 26,
 50,
 42,
 2

In [11]:
anime_data = df.copy()

In [12]:
anime_data['Episodes'] = episode_list

In [13]:
anime_data.head()

Unnamed: 0,MAL_ID,Name,Score,Genres,sypnopsis,Episodes
0,1,Cowboy Bebop,8.78,"Action, Adventure, Comedy, Drama, Sci-Fi, Space","In the year 2071, humanity has colonized sever...",26
1,5,Cowboy Bebop: Tengoku no Tobira,8.39,"Action, Drama, Mystery, Sci-Fi, Space","other day, another bounty—such is the life of ...",1
2,6,Trigun,8.24,"Action, Sci-Fi, Adventure, Comedy, Drama, Shounen","Vash the Stampede is the man with a $$60,000,0...",26
3,7,Witch Hunter Robin,7.27,"Action, Mystery, Police, Supernatural, Drama, ...",ches are individuals with special powers like ...,26
4,8,Bouken Ou Beet,6.98,"Adventure, Fantasy, Shounen, Supernatural",It is the dark century and the people are suff...,52


In [14]:
anime_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16214 entries, 0 to 16213
Data columns (total 6 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   MAL_ID     16214 non-null  int64 
 1   Name       16214 non-null  object
 2   Score      16214 non-null  object
 3   Genres     16214 non-null  object
 4   sypnopsis  16206 non-null  object
 5   Episodes   16214 non-null  int64 
dtypes: int64(2), object(4)
memory usage: 760.2+ KB


In [15]:
anime_data.sypnopsis

0        In the year 2071, humanity has colonized sever...
1        other day, another bounty—such is the life of ...
2        Vash the Stampede is the man with a $$60,000,0...
3        ches are individuals with special powers like ...
4        It is the dark century and the people are suff...
                               ...                        
16209    No synopsis information has been added to this...
16210    ko is a typical high school student whose life...
16211            Sequel to Higurashi no Naku Koro ni Gou .
16212                            New Yama no Susume anime.
16213    Solar calendar year 2020: grotesque organisms ...
Name: sypnopsis, Length: 16214, dtype: object

In [16]:
anime_data.drop('sypnopsis', axis=1, inplace=True)

In [17]:
anime_data.columns

Index(['MAL_ID', 'Name', 'Score', 'Genres', 'Episodes'], dtype='object')

In [18]:
def convert_score(row):
#     print(row['Score'])
    try:
        row['Score'] = float(row['Score'])
    except ValueError:
        row['Score'] = 5
    
    return row

anime_data = anime_data.apply(convert_score, axis="columns")

In [19]:
anime_data.head()

Unnamed: 0,MAL_ID,Name,Score,Genres,Episodes
0,1,Cowboy Bebop,8.78,"Action, Adventure, Comedy, Drama, Sci-Fi, Space",26
1,5,Cowboy Bebop: Tengoku no Tobira,8.39,"Action, Drama, Mystery, Sci-Fi, Space",1
2,6,Trigun,8.24,"Action, Sci-Fi, Adventure, Comedy, Drama, Shounen",26
3,7,Witch Hunter Robin,7.27,"Action, Mystery, Police, Supernatural, Drama, ...",26
4,8,Bouken Ou Beet,6.98,"Adventure, Fantasy, Shounen, Supernatural",52


In [20]:
float(anime_data.Score[0])

8.78

In [21]:
anime_data.Score[0]

8.78

In [22]:
anime_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16214 entries, 0 to 16213
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   MAL_ID    16214 non-null  int64  
 1   Name      16214 non-null  object 
 2   Score     16214 non-null  float64
 3   Genres    16214 non-null  object 
 4   Episodes  16214 non-null  int64  
dtypes: float64(1), int64(2), object(2)
memory usage: 633.5+ KB


# Recommender System

In [23]:
anime_data.drop('MAL_ID', axis = 1, inplace=True)

In [24]:
anime_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16214 entries, 0 to 16213
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Name      16214 non-null  object 
 1   Score     16214 non-null  float64
 2   Genres    16214 non-null  object 
 3   Episodes  16214 non-null  int64  
dtypes: float64(1), int64(1), object(2)
memory usage: 506.8+ KB


In [25]:
anime_data.head()

Unnamed: 0,Name,Score,Genres,Episodes
0,Cowboy Bebop,8.78,"Action, Adventure, Comedy, Drama, Sci-Fi, Space",26
1,Cowboy Bebop: Tengoku no Tobira,8.39,"Action, Drama, Mystery, Sci-Fi, Space",1
2,Trigun,8.24,"Action, Sci-Fi, Adventure, Comedy, Drama, Shounen",26
3,Witch Hunter Robin,7.27,"Action, Mystery, Police, Supernatural, Drama, ...",26
4,Bouken Ou Beet,6.98,"Adventure, Fantasy, Shounen, Supernatural",52


In [32]:
def preprocess(df):
    
    s = list(df.select_dtypes(include=['object']).columns)
    
    s.remove("Name")
    
    df['all_text'] = df[s[0]]
    
    token = RegexpTokenizer(r'[a-zA-Z]+')
    
    cv = TfidfVectorizer(lowercase=True, stop_words='english', ngram_range = (1, 1), tokenizer = token.tokenize)
    text_counts = cv.fit_transform(df['all_text'])
    
    ndf = df.select_dtypes(include=['float64', 'int64'])
    
    scaler = preprocessing.MinMaxScaler(feature_range=(0, 1))
    
    ndfmx = pd.DataFrame((scaler.fit_transform(ndf)))
    ndfmx.columns = ndf.columns
    
    scores = ndfmx.Score.values[:, None]
    X_train_dtm = hstack((text_counts, scores))
    episodes = ndfmx.Score.values[:, None]
    X_train_dtm = hstack((X_train_dtm, episodes))
    
    return X_train_dtm

In [33]:
mat = preprocess(anime_data)

In [35]:
mat.shape

(16214, 47)

In [36]:
print(mat)

  (0, 37)	0.57209688011299
  (0, 11)	0.3625941350691579
  (0, 32)	0.3625941350691579
  (0, 8)	0.36071304885146876
  (0, 5)	0.25430507843268535
  (0, 1)	0.34445181159390365
  (0, 0)	0.3103612719931134
  (1, 24)	0.501098184908516
  (1, 37)	0.5478418588756376
  (1, 11)	0.34722133939003297
  (1, 32)	0.34722133939003297
  (1, 8)	0.345420005025125
  (1, 0)	0.29720297747146196
  (2, 35)	0.42687242314306073
  (2, 11)	0.3997851155644119
  (2, 32)	0.3997851155644119
  (2, 8)	0.3977110878894137
  (2, 5)	0.28038893996571906
  (2, 1)	0.3797819489776768
  (2, 0)	0.3421947709298145
  (3, 19)	0.40787861534525915
  (3, 40)	0.3762953502967659
  (3, 26)	0.5681677454680382
  (3, 24)	0.44955370053044996
  (3, 8)	0.30988905203207123
  :	:
  (16189, 46)	0.42915531335149865
  (16190, 46)	0.42915531335149865
  (16191, 46)	0.42915531335149865
  (16192, 46)	0.42915531335149865
  (16193, 46)	0.42915531335149865
  (16194, 46)	0.42915531335149865
  (16195, 46)	0.42915531335149865
  (16196, 46)	0.42915531335149865
 

In [38]:
from sklearn.metrics.pairwise import cosine_similarity

sig = cosine_similarity(mat, mat)

In [44]:
indices = pd.Series(anime_data.index, index=anime_data['Name']).drop_duplicates()

In [45]:
indices

Name
Cowboy Bebop                           0
Cowboy Bebop: Tengoku no Tobira        1
Trigun                                 2
Witch Hunter Robin                     3
Bouken Ou Beet                         4
                                   ...  
Daomu Biji Zhi Qinling Shen Shu    16209
Mieruko-chan                       16210
Higurashi no Naku Koro ni Sotsu    16211
Yama no Susume: Next Summit        16212
Scarlet Nexus                      16213
Length: 16214, dtype: int64

In [47]:
sig

array([[1.        , 0.91839289, 0.90160659, ..., 0.4152807 , 0.5478509 ,
        0.52360195],
       [0.91839289, 1.        , 0.8105232 , ..., 0.50336609, 0.40640889,
        0.51397519],
       [0.90160659, 0.8105232 , 1.        , ..., 0.40272961, 0.55645865,
        0.52833946],
       ...,
       [0.4152807 , 0.50336609, 0.40272961, ..., 1.        , 0.26919206,
        0.26919206],
       [0.5478509 , 0.40640889, 0.55645865, ..., 0.26919206, 1.        ,
        0.26919206],
       [0.52360195, 0.51397519, 0.52833946, ..., 0.26919206, 0.26919206,
        1.        ]])

In [54]:
def give_rec(title, sig=sig):
    
    try:
        idx = indices[title]
    except:
        return "Sorry No recommendations"
    
    sig_scores = list(enumerate(sig[idx]))
    
    sig_scores = sorted(sig_scores, key=lambda x: x[1], reverse = True)
    
    sig_scores = sig_scores[1:11]
    
    movie_indices = [i[0] for i in sig_scores]
    
    print(movie_indices)
    
    return df['Name'].iloc[movie_indices]

In [55]:
give_rec('Cowboy Bebop')

[3044, 1913, 1914, 1300, 2094, 2312, 365, 1301, 2458, 1158]


3044                      Cowboy Bebop: Yose Atsume Blues
1913                              Waga Seishun no Arcadia
1914             Waga Seishun no Arcadia: Mugen Kidou SSX
1300                            Ginga Tetsudou Monogatari
2094    Uchuu Kaizoku Captain Herlock: Arcadia-gou no ...
2312        Ginga Tetsudou Monogatari: Eien e no Bunkiten
365                             Seihou Bukyou Outlaw Star
1301                                   Ginga Tetsudou 999
2458                Moonlight Mile 2nd Season: Touch Down
1158    Sayonara Ginga Tetsudou 999: Andromeda Shuucha...
Name: Name, dtype: object