In [18]:
import pandas as pd 
import numpy as np
import seaborn as sns 
import matplotlib.pyplot as plt
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
df = pd.read_csv('anime.csv')
df.head()

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,793665
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.25,114262
3,9253,Steins;Gate,"Sci-Fi, Thriller",TV,24,9.17,673572
4,9969,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.16,151266


In [3]:
name = df['name']

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12294 entries, 0 to 12293
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   anime_id  12294 non-null  int64  
 1   name      12294 non-null  object 
 2   genre     12232 non-null  object 
 3   type      12269 non-null  object 
 4   episodes  12294 non-null  object 
 5   rating    12064 non-null  float64
 6   members   12294 non-null  int64  
dtypes: float64(1), int64(2), object(4)
memory usage: 672.5+ KB


In [5]:
## checking duplicated values 
df[df.duplicated()]

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members


In [6]:
## checking null values 
df.isna().sum()

anime_id      0
name          0
genre        62
type         25
episodes      0
rating      230
members       0
dtype: int64

In [8]:
## for genre column we cant fill any value using mode imputation so instead of this we fill with empty string
df['genre'] = df['genre'].fillna('')

In [11]:
## Convert 'rating' to a numeric value and fill missing ratings with the mean rating
df['rating'] = pd.to_numeric(df['rating'], errors='coerce').fillna(df['rating'].mean())

In [14]:
# Create a TF-IDF Vectorizer to convert genres into a matrix of TF-IDF features
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer(stop_words='english')

In [15]:
# Construct the TF-IDF matrix by fitting and transforming the data
tfidf_matrix = tfidf.fit_transform(df['genre'])

In [19]:
# Compute the cosine similarity matrix based on genres
genre_similarity = cosine_similarity(tfidf_matrix, tfidf_matrix)

In [22]:
def get_top_n_similar_anime(anime_id, df, genre_similarity, n=10):
    # Find the index of the anime in the DataFrame
    anime_idx = df[df['anime_id'] == anime_id].index[0]
    
    # Get the similarity scores for all anime
    similarity_scores = list(enumerate(genre_similarity[anime_idx]))
    
    # Sort the anime based on similarity scores in descending order
    similarity_scores = sorted(similarity_scores, key=lambda x: x[1], reverse=True)
    
    # Get the indices of the top N most similar anime
    top_n_indices = [i[0] for i in similarity_scores[1:n+1]]
    
    # Get the names and genres of the top N similar anime
    top_n_anime = df.iloc[top_n_indices][['anime_id', 'name', 'genre', 'rating']]
    
    return top_n_anime


In [23]:
# Example usage
anime_id = 32281  # Example anime_id from your dataset
top_n_similar_anime = get_top_n_similar_anime(anime_id, df, genre_similarity, n=10)
print(f'Top 10 anime similar to anime_id {anime_id}:')
print(top_n_similar_anime)

Top 10 anime similar to anime_id 32281:
      anime_id                                   name  \
5805       547            Wind: A Breath of Heart OVA   
6394       546           Wind: A Breath of Heart (TV)   
1111     14669  Aura: Maryuuin Kouga Saigo no Tatakai   
1201     10067         Angel Beats!: Another Epilogue   
1494     20903                               Harmonie   
878       2787          Shakugan no Shana II (Second)   
986        355                      Shakugan no Shana   
1604      6572                    Shakugan no Shana S   
1959       713                              Air Movie   
4514      2105                           Touka Gettan   

                                                  genre  rating  
5805               Drama, Romance, School, Supernatural    6.35  
6394               Drama, Romance, School, Supernatural    6.14  
1111       Comedy, Drama, Romance, School, Supernatural    7.67  
1201                        Drama, School, Supernatural    7.63  
14