#### c)	Use the Anime Recommendations dataset from below Kaggle link and create an end-to-end project on Jupyter/Colab.

https://www.kaggle.com/datasets/CooperUnion/anime-recommendations-database/data

Code for reference - https://www.kaggle.com/code/benroshan/content-collaborative-anime-recommendation


i.	Download the dataset from above link and load it into your Python environment. \
ii.	Perform the EDA and do the visualizations.\
iii.	Check the distributions/skewness in the variables and do the transformations if required.\
iv.	Create a content based Recommender system 


In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Import Dataset
anime_df = pd.read_csv("anime.csv")

In [3]:
anime_df.head()

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,793665
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.25,114262
3,9253,Steins;Gate,"Sci-Fi, Thriller",TV,24,9.17,673572
4,9969,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.16,151266


In [4]:
# Import Dataset
rating_df = pd.read_csv("rating.csv")

In [5]:
rating_df.head()

Unnamed: 0,user_id,anime_id,rating
0,1,20,-1
1,1,24,-1
2,1,79,-1
3,1,226,-1
4,1,241,-1


In [6]:
# Checking for duplicate values
anime_df.duplicated().sum()

np.int64(0)

In [7]:
# Check missing values
anime_df.isnull().sum().sort_values(ascending=False)

rating      230
genre        62
type         25
name          0
anime_id      0
episodes      0
members       0
dtype: int64

In [8]:
anime_df['genre'] = anime_df['genre'].fillna('')

In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF vectorizer on genres
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(anime_df['genre'])

In [10]:
tfidf_matrix.shape

(12294, 46)

In [11]:
# Cosine similarity matrix
from sklearn.metrics.pairwise import cosine_similarity

cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

In [12]:
# Sigmoid Kernel similarity matrix
from sklearn.metrics.pairwise import sigmoid_kernel

sim_matrix = sigmoid_kernel(tfidf_matrix, tfidf_matrix)

In [13]:
indices = pd.Series(anime_df.index, index=anime_df['name'])

In [14]:
indices

name
Kimi no Na wa.                                            0
Fullmetal Alchemist: Brotherhood                          1
Gintama°                                                  2
Steins;Gate                                               3
Gintama&#039;                                             4
                                                      ...  
Toushindai My Lover: Minami tai Mecha-Minami          12289
Under World                                           12290
Violence Gekiga David no Hoshi                        12291
Violence Gekiga Shin David no Hoshi: Inma Densetsu    12292
Yasuji no Pornorama: Yacchimae!!                      12293
Length: 12294, dtype: int64

In [15]:
def content_based_recommendation(title, top_n=5, sig=cosine_sim):
    if title not in anime_df['name'].values:
        return f"Anime '{title}' not found"

    #Get the index corresponding to original title
    index = indices[title]

    #Get pair-wise similarity score
    sim_scores = list(enumerate(sig[index]))
    #score = list(enumerate(sim_matrix[index])
                 
    #Sort the movie
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)[1:top_n+1]

    #Movies indices
    anime_indices = [i[0] for i in sim_scores]
    
    return anime_df['name'].iloc[anime_indices]


In [16]:
content_based_recommendation('Gintama', 10, sim_matrix)

4                                            Gintama&#039;
8        Gintama Movie: Kanketsu-hen - Yorozuya yo Eien...
9                                 Gintama&#039;: Enchousen
12                                                 Gintama
63             Gintama: Yorinuki Gintama-san on Theater 2D
65                  Gintama Movie: Shinyaku Benizakura-hen
216                       Gintama: Shinyaku Benizakura-hen
306                       Gintama: Jump Festa 2014 Special
10896                                       Gintama (2017)
380      Gintama: Nanigoto mo Saiyo ga Kanjin nano de T...
Name: name, dtype: object

In [None]:
import joblib

joblib.dump(cosine_sim, 'cosine_sim.pkl')
joblib.dump(indices, 'indices.pkl')


['tfidf_matrix.pkl']