# Anime Data Exploration and Recommendation System
This notebook explores an anime dataset and builds a recommendation system based on user ratings and anime metadata.

# Required Libraries

In [129]:
# Visualization
import plotly.express as px
import plotly.graph_objects as go  # for 3D plot visualization
import plotly.figure_factory as ff

from plotly.offline import init_notebook_mode, iplot 
init_notebook_mode(connected=True)
from wordcloud import WordCloud
from datetime import datetime, timezone
from pathlib import Path

In [130]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MultiLabelBinarizer, MinMaxScaler
from sklearn.neighbors import NearestNeighbors
from sklearn.decomposition import TruncatedSVD
from scipy.sparse import csr_matrix
import pandas as pd
import numpy as np

# Load Datasets

In [132]:
# Show up to 50 columns when a DataFrame is displayed.
pd.options.display.max_columns = 50

In [133]:
# Single place to change when the CSV files live somewhere else.
DATA_DIR = Path(r'D:\Dataset\myAnimeList2023')

# Anime metadata, user profiles and per-user ratings.
df_anime = pd.read_csv(DATA_DIR / 'anime-dataset-2023.csv')
df_user = pd.read_csv(DATA_DIR / 'users-details-2023.csv')
df_score = pd.read_csv(DATA_DIR / 'users-score-2023.csv')

In [134]:
# Mean of the known scores (rows holding the 'UNKNOWN' placeholder are left out),
# rounded to two decimals.
is_known_score = df_anime['Score'] != 'UNKNOWN'
scores = df_anime['Score'][is_known_score].astype('float')
score_mean = round(scores.mean(), 2)

In [135]:
# Substitute the mean score for the 'UNKNOWN' placeholder, then store the column as float64.
df_anime['Score'] = (
    df_anime['Score']
    .replace('UNKNOWN', score_mean)
    .astype('float64')
)

In [136]:
# An unknown rank becomes NaN so the column can be stored as float64.
df_anime['Rank'] = (
    df_anime['Rank']
    .replace('UNKNOWN', np.nan)
    .astype('float64')
)

# Data Preprocessing, Analyzing and Visualizing
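The `UNKNOWN` placeholders in `Score` and `Rank` have been replaced in the cells above (by the mean score and by NaN, respectively); in the other columns, `UNKNOWN` values are filtered out only where a chart needs it, so that the analysis is not distorted.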

In [138]:
# Column dtypes and non-null counts of the anime metadata table.
df_anime.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24905 entries, 0 to 24904
Data columns (total 24 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   anime_id      24905 non-null  int64  
 1   Name          24905 non-null  object 
 2   English name  24905 non-null  object 
 3   Other name    24905 non-null  object 
 4   Score         24905 non-null  float64
 5   Genres        24905 non-null  object 
 6   Synopsis      24905 non-null  object 
 7   Type          24905 non-null  object 
 8   Episodes      24905 non-null  object 
 9   Aired         24905 non-null  object 
 10  Premiered     24905 non-null  object 
 11  Status        24905 non-null  object 
 12  Producers     24905 non-null  object 
 13  Licensors     24905 non-null  object 
 14  Studios       24905 non-null  object 
 15  Source        24905 non-null  object 
 16  Duration      24905 non-null  object 
 17  Rating        24905 non-null  object 
 18  Rank          20293 non-nu

In [139]:
# Columns, dtypes and memory usage of the user ratings table.
df_score.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24325191 entries, 0 to 24325190
Data columns (total 5 columns):
 #   Column       Dtype 
---  ------       ----- 
 0   user_id      int64 
 1   Username     object
 2   anime_id     int64 
 3   Anime Title  object
 4   rating       int64 
dtypes: int64(3), object(2)
memory usage: 927.9+ MB


In [140]:
# Column dtypes and non-null counts of the user details table.
df_user.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 731290 entries, 0 to 731289
Data columns (total 16 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   Mal ID            731290 non-null  int64  
 1   Username          731289 non-null  object 
 2   Gender            224383 non-null  object 
 3   Birthday          168068 non-null  object 
 4   Location          152805 non-null  object 
 5   Joined            731290 non-null  object 
 6   Days Watched      731282 non-null  float64
 7   Mean Score        731282 non-null  float64
 8   Watching          731282 non-null  float64
 9   Completed         731282 non-null  float64
 10  On Hold           731282 non-null  float64
 11  Dropped           731282 non-null  float64
 12  Plan to Watch     731282 non-null  float64
 13  Total Entries     731282 non-null  float64
 14  Rewatched         731282 non-null  float64
 15  Episodes Watched  731282 non-null  float64
dtypes: float64(10), int6

In [141]:
# Bar chart: number of titles for each value of the 'Type' column.
type_counts = df_anime['Type'].value_counts()

bar_options = dict(
    x=type_counts.index,
    y=type_counts.values,
    color=type_counts.index,
    labels={'x': '(Anime Type)', 'y': '(Count)'},
    title='Anime by Type',
)
fig = px.bar(type_counts, **bar_options)

fig.show()

In [142]:
# Scatter plot of each title's Score against its Members value.
scatter_labels = {'Score': 'Score', 'Members': 'Member'}
fig = px.scatter(
    df_anime,
    x='Score',
    y='Members',
    labels=scatter_labels,
    title='Anime of Score vs. Members of Score',
)

fig.show()

In [143]:
# Genre frequencies: every title contributes one count per genre in its
# comma-separated 'Genres' value; titles marked 'UNKNOWN' are skipped.
known_genres = df_anime.loc[df_anime['Genres'] != "UNKNOWN", 'Genres']
genre_counts = (
    known_genres
    .map(lambda genres: genres.split(', '))
    .explode()
    .value_counts()
)

fig = px.bar(
    genre_counts,
    x=genre_counts.index,
    y=genre_counts.values,
    color=genre_counts.index,
    labels={'x': 'Genre', 'y': 'Number'},
    title='Common Anime Genre',
)

fig.show()

In [144]:
# Donut chart of the ten most frequent genres.
top_10_genres = genre_counts.head(10)

genre_pie = go.Pie(
    labels=top_10_genres.index,
    values=top_10_genres.values,
    hole=0.6,
    hoverinfo='label+percent',
    textinfo='value',
)
fig = go.Figure(data=[genre_pie])

fig.update_layout(
    title='Distribution of Anime Genres',
    legend={'font': {'size': 12}, 'title': 'Genre'},
    annotations=[{'text': 'Genre', 'x': 1, 'y': 1, 'font_size': 22, 'showarrow': False}],
)

fig.show()

In [145]:
# Pairwise correlations between Score, Popularity and Rank, drawn as an annotated heatmap.
corr_columns = ['Score', 'Popularity', 'Rank']
correlation_matrix = df_anime[corr_columns].corr()

fig = ff.create_annotated_heatmap(
    z=correlation_matrix.values,
    x=correlation_matrix.columns.tolist(),
    y=correlation_matrix.index.tolist(),
    colorscale='Viridis',
)
fig.update_layout(title='Correlation Matrix')
fig.show()

In [146]:
# Yearly Anime Distribution
def parse_premiered(premiered):
    """Split a 'Premiered' value into a ``(season, year)`` pair.

    Parameters
    ----------
    premiered : object
        Expected to be a string of two whitespace-separated parts,
        ``'<season> <year>'``, or the placeholder ``'UNKNOWN'``.

    Returns
    -------
    tuple
        ``(season, int(year))`` for a well-formed value; ``(None, None)`` for
        the placeholder, a non-string value, a value that does not have
        exactly two parts, or a year that is not an integer.
    """
    if not isinstance(premiered, str) or premiered == 'UNKNOWN':
        return None, None
    parts = premiered.split()
    if len(parts) != 2:
        return None, None
    season, year = parts
    try:
        return season, int(year)
    except ValueError:
        return None, None


# The parser has its own name, so storing its result in `season_year` does not
# overwrite the function and it stays callable after this cell has run.
season_year = df_anime['Premiered'].map(parse_premiered)
premiered_season = season_year.apply(lambda pair: pair[0])
premiered_Year = season_year.apply(lambda pair: pair[1])
filtered_premiered_year = premiered_Year.dropna()
year_counts = filtered_premiered_year.value_counts()
sorted_years = sorted(year_counts.index)

fig = go.Figure(data=go.Bar(
    x=sorted_years,
    # Explicit label lookup: the years are index labels, not positions.
    y=year_counts.loc[sorted_years],
    marker=dict(color='#000000'),  # Set the color of the bars
))
fig.update_layout(
    title='Number of Anime by Year',
    xaxis_title='Year',
    yaxis_title='Number of Animes',
    title_font=dict(size=30),
    font=dict(size=22, color='#000000')
)

fig.show()

In [147]:
# Ten most frequent values of the 'Studios' column, ignoring the 'UNKNOWN' placeholder.
studiocounts = (
    df_anime['Studios']
    .value_counts()
    .loc[lambda counts: counts.index != 'UNKNOWN']
)
top_10_studios = studiocounts.head(10)

studio_bars = go.Bar(
    x=top_10_studios.index,
    y=top_10_studios.values,
    # Bar colour follows the bar height.
    marker={'color': top_10_studios.values, 'colorscale': 'Greens'},
    text=top_10_studios.values,
    hovertemplate='Studio: %{x}<br>Number of Animes: %{y}<extra></extra>',
)
fig = go.Figure(data=studio_bars)
fig.update_layout(
    title='Top 10 Studio',
    xaxis_title='Studios',
    yaxis_title='Number of Animes',
)

fig.show()


In [148]:
def calculate_age(age, current_year=None):
    """Convert a birthday string into an age in whole calendar years.

    Parameters
    ----------
    age : object
        Birthday value. Only the text before the first ``'-'`` is used, and it
        must parse as the birth year.
    current_year : int, optional
        Year to measure against. Defaults to the current UTC year.

    Returns
    -------
    int or None
        ``current_year - birth_year`` when that lies in ``[10, 60)``; ``None``
        for a non-string value, an unparsable year, or an age outside that range.
    """
    # Missing birthdays arrive as float NaN, not as the string 'NaN'.
    if not isinstance(age, str):
        return None
    try:
        birth = int(age.split('-')[0])
    except ValueError:
        return None
    if current_year is None:
        # Timezone-aware replacement for the deprecated datetime.utcnow().
        current_year = datetime.now(timezone.utc).year
    years = current_year - birth
    return years if 10 <= years < 60 else None

# Ages for the users that have a birthday; calculate_age returns None for the ones it rejects.
Age = df_user['Birthday'].dropna().apply(calculate_age)

fig = px.histogram(
    Age,
    nbins=10,
    title='Distribution of Age',
    labels={'value': 'Age', 'count': 'Count'},
)

fig.update_layout(
    xaxis={'title': 'Age'},
    yaxis={'title': 'Count'},
    bargap=0.1,
)

fig.show()


datetime.datetime.utcnow() is deprecated and scheduled for removal in a future version. Use timezone-aware objects to represent datetimes in UTC: datetime.datetime.now(datetime.UTC).



# Building a Recommendation System

## Cosine Similarity
Each anime is represented by the vector of ratings it received from users; for a selected title we recommend the anime whose rating vectors have the highest cosine similarity to it.

In [151]:
# Preview the user rating records.
df_score

Unnamed: 0,user_id,Username,anime_id,Anime Title,rating
0,1,Xinil,21,One Piece,9
1,1,Xinil,48,.hack//Sign,7
2,1,Xinil,320,A Kite,5
3,1,Xinil,49,Aa! Megami-sama!,8
4,1,Xinil,304,Aa! Megami-sama! Movie,8
...,...,...,...,...,...
24325186,1291087,Oblongata,10611,R-15,3
24325187,1291087,Oblongata,174,Tenjou Tenge,6
24325188,1291097,JuunanaSai,1535,Death Note,9
24325189,1291097,JuunanaSai,226,Elfen Lied,10


In [152]:
# Work on a de-duplicated copy so df_score itself stays untouched.
df = df_score.copy().drop_duplicates()

# 'scoredBy' = number of rating rows that share the row's anime_id.
ratings_per_anime = df.groupby('anime_id')['anime_id'].transform('count')
df['scoredBy'] = ratings_per_anime
df.head()

Unnamed: 0,user_id,Username,anime_id,Anime Title,rating,scoredBy
0,1,Xinil,21,One Piece,9,51317
1,1,Xinil,48,.hack//Sign,7,21292
2,1,Xinil,320,A Kite,5,6234
3,1,Xinil,49,Aa! Megami-sama!,8,8636
4,1,Xinil,304,Aa! Megami-sama! Movie,8,8200


In [153]:
# Number of distinct users in the ratings.
df['user_id'].nunique(dropna=False)

270033

In [154]:
# Number of distinct anime in the ratings.
df['anime_id'].nunique(dropna=False)

16500

In [155]:
# Summary statistics of the per-anime rating counts stored in 'scoredBy'.
df['scoredBy'].describe()

count    2.432519e+07
mean     1.849844e+04
std      2.107010e+04
min      1.000000e+00
25%      4.043000e+03
50%      1.096500e+04
75%      2.443400e+04
max      1.264920e+05
Name: scoredBy, dtype: float64

In [156]:
# Keep ratings of 7 or higher, and only for anime with at least 4100 rating rows.
filtered_df = df.loc[df['scoredBy'].ge(4100) & df['rating'].ge(7)]

In [157]:
# One row per user, one column per anime title, cells hold the rating.
matrix = pd.pivot(
    filtered_df,
    index='user_id',
    columns='Anime Title',
    values='rating',
)
# User/title pairs without a rating become 0 (filled in place).
matrix.fillna(0, inplace=True)

In [158]:
# Preview the user-by-title rating matrix.
matrix

Anime Title,"""Bungaku Shoujo"" Movie","""Oshi no Ko""",.hack//G.U. Trilogy,.hack//Liminality,.hack//Roots,.hack//Sign,.hack//Tasogare no Udewa Densetsu,07-Ghost,11eyes,11eyes: Momoiro Genmutan,3-gatsu no Lion,3-gatsu no Lion 2nd Season,30-sai no Hoken Taiiku,5-toubun no Hanayome,86,86 Part 2,91 Days,A Kite,A-Channel,AKB0048,Aa! Megami-sama!,Aa! Megami-sama! (TV),Aa! Megami-sama! (TV) Specials,Aa! Megami-sama! Movie,Aa! Megami-sama! Sorezore no Tsubasa,...,Yuâ˜†Giâ˜†Oh!,Yu☆Gi☆Oh!,Yu☆Gi☆Oh! Duel Monsters,Yu☆Gi☆Oh! Duel Monsters GX,Yu☆Gi☆Oh!: Hikari no Pyramid,Zan Sayonara Zetsubou Sensei,Zankyou no Terror,Zero no Tsukaima,Zero no Tsukaima F,Zero no Tsukaima: Futatsuki no Kishi,Zero no Tsukaima: Princesses no Rondo,Zero no Tsukaima: Princesses no Rondo - Yuuwaku no Sunahama,Zetman,Zetsuen no Tempest,Zoku Natsume Yuujinchou,Zoku Sayonara Zetsubou Sensei,Zombie-Loan,ef: A Tale of Melodies.,ef: A Tale of Memories.,s.CRY.ed,xxxHOLiC,xxxHOLiC Movie: Manatsu no Yoru no Yume,xxxHOLiC Rou,xxxHOLiC Shunmuki,xxxHOLiC◆Kei
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1
1,0.0,0.0,0.0,0.0,0.0,7.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8.0,0.0,0.0,8.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9.0,0.0,0.0,10.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
20,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
23,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1291057,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,7.0,7.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1291079,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1291085,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,7.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1291087,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,9.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8.0,0.0,0.0,0.0,0.0,0.0,0.0


In [159]:
# Transposed so that each row is one title: entry [i, j] compares the rating
# vectors of titles i and j.
similarity_scores = cosine_similarity(matrix.transpose())

In [160]:
def recommend(anime_name, top_n=4):
    """Return the titles whose rating vectors are most similar to ``anime_name``.

    Reads the module-level ``matrix`` (its columns are the anime titles) and
    ``similarity_scores`` (row and column ``i`` belong to ``matrix.columns[i]``).

    Parameters
    ----------
    anime_name : str
        Title exactly as it appears in ``matrix.columns``.
    top_n : int, optional
        Number of recommendations to return (default 4).

    Returns
    -------
    list of list of str
        One single-element list per recommended title, most similar first.
        The queried title itself is never included.

    Raises
    ------
    IndexError
        If ``anime_name`` is not one of the titles in ``matrix``.
    """
    titles = matrix.columns
    matches = np.flatnonzero(titles == anime_name)
    if matches.size == 0:
        raise IndexError(f"{anime_name!r} is not a title in the rating matrix")
    index = matches[0]

    # Descending similarity; the stable sort keeps tied titles in column order.
    ranked = np.argsort(-similarity_scores[index], kind='stable')
    # Remove the query by its position, not by assuming it is ranked first:
    # another title with an identical similarity could otherwise displace it.
    ranked = ranked[ranked != index][:top_n]

    # The title is read straight from the matrix columns; no scan of the
    # ratings table is needed to look it up.
    return [[titles[i]] for i in ranked]

In [161]:
# Example query: the titles most similar to 'Violet Evergarden'.
recommend('Violet Evergarden')

[['Yakusoku no Neverland'],
 ['Koe no Katachi'],
 ['Kimi no Na wa.'],
 ['Shigatsu wa Kimi no Uso']]