In [16]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns

df = pd.read_csv('anime.csv')
df.head()

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,793665
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.25,114262
3,9253,Steins;Gate,"Sci-Fi, Thriller",TV,24,9.17,673572
4,9969,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.16,151266


In [17]:
# Data Preprocessing

df.info()
df.isnull().sum()

df.dropna(subset=['rating', 'genre', 'type'], inplace=True) # Drop or fill missing values depending on their importan

df['episodes'] = pd.to_numeric(df['episodes'], errors='coerce')
df['episodes'].fillna(0, inplace=True) # Fill missing episodes with 0

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12294 entries, 0 to 12293
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   anime_id  12294 non-null  int64  
 1   name      12294 non-null  object 
 2   genre     12232 non-null  object 
 3   type      12269 non-null  object 
 4   episodes  12294 non-null  object 
 5   rating    12064 non-null  float64
 6   members   12294 non-null  int64  
dtypes: float64(1), int64(2), object(4)
memory usage: 672.5+ KB


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['episodes'].fillna(0, inplace=True) # Fill missing episodes with 0


In [18]:
# Feature Extraction for Similarity

df['genre'] = df['genre'].fillna('') # Fill missing genres with empty string

tfidf = TfidfVectorizer(stop_words='english')
genre_matrix = tfidf.fit_transform(df['genre']) # TF-IDF vectorizer for genres

cosine_sim = cosine_similarity(genre_matrix, genre_matrix) # Create a similarity matrix { cosine similarity }

In [19]:
# Recommendation Function

# Create a mapping from title to index
indices = pd.Series(df.index, index=df['name']).drop_duplicates()

def recommend(title, cosine_sim=cosine_sim, top_n=10):
    idx = indices.get(title)
    if idx is None:
        return f"Anime '{title}' not found."

    sim_scores = list(enumerate(cosine_sim[idx])) # Similarity scores

    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True) # Sort the animes based on similarity scores

    sim_scores = sim_scores[1:top_n+1] # Get the top_n most similar anime (excluding itself)

    anime_indices = [i[0] for i in sim_scores]
    return df[['name', 'genre']].iloc[anime_indices]

In [20]:
recommend("Naruto", top_n=20)

Unnamed: 0,name,genre
615,Naruto: Shippuuden,"Action, Comedy, Martial Arts, Shounen, Super P..."
841,Naruto,"Action, Comedy, Martial Arts, Shounen, Super P..."
1103,Boruto: Naruto the Movie - Naruto ga Hokage ni...,"Action, Comedy, Martial Arts, Shounen, Super P..."
1343,Naruto x UT,"Action, Comedy, Martial Arts, Shounen, Super P..."
1472,Naruto: Shippuuden Movie 4 - The Lost Tower,"Action, Comedy, Martial Arts, Shounen, Super P..."
1573,Naruto: Shippuuden Movie 3 - Hi no Ishi wo Tsu...,"Action, Comedy, Martial Arts, Shounen, Super P..."
2458,Naruto Shippuuden: Sunny Side Battle,"Action, Comedy, Martial Arts, Shounen, Super P..."
2997,Naruto Soyokazeden Movie: Naruto to Mashin to ...,"Action, Comedy, Martial Arts, Shounen, Super P..."
7628,Kyutai Panic Adventure!,"Action, Martial Arts, Shounen, Super Power"
784,Naruto: Shippuuden Movie 6 - Road to Ninja,"Action, Adventure, Martial Arts, Shounen, Supe..."


# **Interview Questions**

## **1. Difference between User-Based and Item-Based Collaborative Filtering**

Collaborative filtering is a recommendation approach that filters items based on preferences from multiple users. It assumes that users who agreed in the past will agree again.

| Aspect      | User-Based                    | Item-Based                          |
| ----------- | ----------------------------- | ----------------------------------- |
| Similarity  | Between users                 | Between items                       |
| Logic       | Users with similar tastes     | Items liked by similar users        |
| Example     | "People like you liked..."    | "People who liked this also liked…" |
| Scalability | Less scalable with many users | More scalable and stable            |


## **2. What is Collaborative Filtering and How Does It Work?**
Collaborative filtering recommends items by using preferences of similar users or similar items.

*Steps:*

- Create a user-item matrix (ratings).

- Compute similarity (user-user or item-item).

- Predict missing ratings.

- Recommend top-rated items.

*Types:*

- User-Based: Finds similar users.

- Item-Based: Finds similar items.

- Model-Based: Uses ML (*e.g., matrix factorization*).