#### Anime Recommendation System using Cosine Similarity

In [72]:
# Import necessary libraries
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [73]:
# Load dataset
df = pd.read_csv("anime.csv")

In [96]:
# Display first 5 rows
df.head()

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members,rating_norm
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630,0.92437
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,793665,0.911164
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.25,114262,0.909964
3,9253,Steins;Gate,"Sci-Fi, Thriller",TV,24,9.17,673572,0.90036
4,9969,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.16,151266,0.89916


##### Data Preprocessing

In [75]:
# Check for missing values
df.isnull().sum()

anime_id      0
name          0
genre        62
type         25
episodes      0
rating      230
members       0
dtype: int64

In [76]:
# Drop rows with missing anime names or genres
df.dropna(subset=['name', 'genre'], inplace=True)


In [77]:
# Fill missing type or episodes with defaults
df['type'] = df['type'].fillna('Unknown')
df['episodes'] = df['episodes'].replace('Unknown', 0).astype(int)

In [78]:
# Reset index
df.reset_index(drop=True, inplace=True)

In [79]:
# Display cleaned dataset
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12232 entries, 0 to 12231
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   anime_id  12232 non-null  int64  
 1   name      12232 non-null  object 
 2   genre     12232 non-null  object 
 3   type      12232 non-null  object 
 4   episodes  12232 non-null  int64  
 5   rating    12017 non-null  float64
 6   members   12232 non-null  int64  
dtypes: float64(1), int64(3), object(3)
memory usage: 669.1+ KB


In [80]:
# check no of rows & columns
df.shape

(12232, 7)

In [81]:
# check for duplicates
df.duplicated().sum()

np.int64(0)

##### Feature Extraction

In [82]:
# Convert genre into TFIDF vectors
tfidf = TfidfVectorizer(stop_words='english')

In [83]:
# Replace NaN with empty string
df['genre'] = df['genre'].fillna('')

In [84]:
# Fit and transform
tfidf_matrix = tfidf.fit_transform(df['genre'])

In [85]:
# Compute cosine similarity matrix
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

 ##### Build the Recommendation Function

In [86]:
# Create a Series to map anime titles to their indices
indices = pd.Series(df.index, index=df['name']).drop_duplicates()

def recommend_anime(name, num_recommendations=5):
    if name not in indices:
        print("Anime not found in dataset.")
        return []
        
    # Get index of the anime 
    idx = indices[name]

    # Get similarity scores for all anime
    sim_scores = list(enumerate(cosine_sim[idx]))
    
    # Sort by similarity score 
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    
    # Get top N similar anime (excluding itself)
    sim_scores = sim_scores[1:num_recommendations+1]
    
    # Get the corresponding anime indices
    anime_indices = [i[0] for i in sim_scores]
    
    # Return top recommended anime
    return df[['name', 'genre', 'type', 'rating']].iloc[anime_indices]

##### Test the Recommender

In [87]:
recommendations = recommend_anime("Mushishi", num_recommendations=5)
print("Recommended Anime similar to 'Mushishi':")
display(recommendations)

Recommended Anime similar to 'Mushishi':


Unnamed: 0,name,genre,type,rating
27,Mushishi Zoku Shou,"Adventure, Fantasy, Historical, Mystery, Seine...",TV,8.8
28,Mushishi,"Adventure, Fantasy, Historical, Mystery, Seine...",TV,8.78
33,Mushishi Zoku Shou: Suzu no Shizuku,"Adventure, Fantasy, Historical, Mystery, Seine...",Movie,8.75
48,Mushishi Special: Hihamukage,"Adventure, Fantasy, Historical, Mystery, Seine...",Special,8.66
85,Mushishi Zoku Shou: Odoro no Michi,"Adventure, Fantasy, Historical, Mystery, Seine...",Special,8.54


##### Add Rating-Based Weighting

In [89]:
# Weighted cosine similarity using rating as a feature
df['rating'] = df['rating'].fillna(df['rating'].mean())

# Normalize rating
df['rating_norm'] = (df['rating'] - df['rating'].min()) / (df['rating'].max() - df['rating'].min())

# Combine genre vectors with rating
from scipy.sparse import hstack

combined_features = hstack([tfidf_matrix, np.array(df['rating_norm'])[:, None]])

cosine_sim_weighted = cosine_similarity(combined_features, combined_features)


In [91]:
#modifying recommend_anime function to use cosine_sim_weighted instead of cosine_sim

# Create a Series to map anime titles to their indices
indices = pd.Series(df.index, index=df['name']).drop_duplicates()

def recommend_anime(name, num_recommendations=5):
    if name not in indices:
        print("Anime not found in dataset.")
        return []
        
    # Get index of the anime
    idx = indices[name]

    # Get similarity scores for all anime - now using cosine_sim_weighted instead of cosine_sim
    sim_scores = list(enumerate(cosine_sim_weighted[idx]))
    
    # Sort by similarity score
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    
    # Get top N similar anime (excluding itself)
    sim_scores = sim_scores[1:num_recommendations+1]
    
    # Get the corresponding anime indices
    anime_indices = [i[0] for i in sim_scores]
    
    # Return top recommended anime
    return df[['name', 'genre', 'type', 'rating']].iloc[anime_indices]


In [92]:
recommendations = recommend_anime("Mushishi", num_recommendations=5)
print("Recommended Anime similar to 'Mushishi':")
display(recommendations)

Recommended Anime similar to 'Mushishi':


Unnamed: 0,name,genre,type,rating
27,Mushishi Zoku Shou,"Adventure, Fantasy, Historical, Mystery, Seine...",TV,8.8
33,Mushishi Zoku Shou: Suzu no Shizuku,"Adventure, Fantasy, Historical, Mystery, Seine...",Movie,8.75
17,Mushishi Zoku Shou 2nd Season,"Adventure, Fantasy, Historical, Mystery, Seine...",TV,8.88
48,Mushishi Special: Hihamukage,"Adventure, Fantasy, Historical, Mystery, Seine...",Special,8.66
85,Mushishi Zoku Shou: Odoro no Michi,"Adventure, Fantasy, Historical, Mystery, Seine...",Special,8.54


#### Interview Questions & Answers

##### 1) Difference between User-Based and Item-Based Collaborative Filtering

| Aspect      | User-Based                                         | Item-Based                                |
| ----------- | -------------------------------------------------- | ----------------------------------------- |
| Definition  | Finds similar users based on their rating patterns | Finds similar items based on user ratings |
| Approach    | “Users who liked X also liked Y”                   | “Items similar to X are Y”                |
| Computation | Compares users                                     | Compares items                            |
| Pros        | Personalized                                       | Scalable for large datasets               |
| Cons        | Cold start problem (new users)                     | Needs enough item interactions            |

In this assignment, we used Item-Based Collaborative Filtering (using genre similarity).


##### What is Collaborative Filtering & How Does It Work?

Collaborative filtering recommends items based on patterns in user behavior:

* It assumes similar users like similar things.
* It uses ratings, preferences, or watch history to find relationships.

Two main types:

1. User-based CF: Similar users are found.
2. Item-based CF: Similar items are found.


In [97]:
### Simple Example Python Code for Collaborative Filtering

import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity

# Example user-item matrix (rows: users, columns: anime)
ratings = pd.DataFrame({
    'Naruto': [5, 4, np.nan, 2],
    'One Piece': [4, np.nan, 5, 3],
    'Bleach': [np.nan, 4, 4, 2],
    'Attack on Titan': [5, 5, 4, 3]
}, index=['User1', 'User2', 'User3', 'User4'])

# Fill NaN with 0
ratings_filled = ratings.fillna(0)

# Compute cosine similarity between items (anime)
item_similarity = cosine_similarity(ratings_filled.T)
item_similarity_df = pd.DataFrame(item_similarity, index=ratings.columns, columns=ratings.columns)

print(item_similarity_df)

                   Naruto  One Piece    Bleach  Attack on Titan
Naruto           1.000000   0.548128  0.496904         0.877876
One Piece        0.548128   1.000000  0.612826         0.800167
Bleach           0.496904   0.612826  1.000000         0.808290
Attack on Titan  0.877876   0.800167  0.808290         1.000000


                                                    Submitted by: Meghana C Varghese