In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv(r"C:\Users\91808\Downloads\anime.csv") 
df

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,793665
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.25,114262
3,9253,Steins;Gate,"Sci-Fi, Thriller",TV,24,9.17,673572
4,9969,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.16,151266
...,...,...,...,...,...,...,...
12289,9316,Toushindai My Lover: Minami tai Mecha-Minami,Hentai,OVA,1,4.15,211
12290,5543,Under World,Hentai,OVA,1,4.28,183
12291,5621,Violence Gekiga David no Hoshi,Hentai,OVA,4,4.88,219
12292,6133,Violence Gekiga Shin David no Hoshi: Inma Dens...,Hentai,OVA,1,4.98,175


In [3]:
df.columns

Index(['anime_id', 'name', 'genre', 'type', 'episodes', 'rating', 'members'], dtype='object')

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12294 entries, 0 to 12293
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   anime_id  12294 non-null  int64  
 1   name      12294 non-null  object 
 2   genre     12232 non-null  object 
 3   type      12269 non-null  object 
 4   episodes  12294 non-null  object 
 5   rating    12064 non-null  float64
 6   members   12294 non-null  int64  
dtypes: float64(1), int64(2), object(4)
memory usage: 672.5+ KB


In [5]:
df.describe()

Unnamed: 0,anime_id,rating,members
count,12294.0,12064.0,12294.0
mean,14058.221653,6.473902,18071.34
std,11455.294701,1.026746,54820.68
min,1.0,1.67,5.0
25%,3484.25,5.88,225.0
50%,10260.5,6.57,1550.0
75%,24794.5,7.18,9437.0
max,34527.0,10.0,1013917.0


In [6]:
df.isnull().sum()

anime_id      0
name          0
genre        62
type         25
episodes      0
rating      230
members       0
dtype: int64

In [15]:
# Drop or fill missing values
df.dropna(subset=['genre', 'rating'], inplace=True)  # Keep rows with essential info
df[ 'rating'].fillna(df[ 'rating'].mean(), inplace=True)
import warnings
warnings.filterwarnings('ignore')

In [16]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# TF-IDF for genres
tfidf = TfidfVectorizer(stop_words='english')
genre_matrix = tfidf.fit_transform(df['genre'])

# Normalize ratings
scaler = MinMaxScaler()
rating_scaled = scaler.fit_transform(df[['rating']])

# Combine features: genre + rating
from scipy.sparse import hstack
feature_matrix = hstack([genre_matrix, rating_scaled])

In [17]:
# Compute cosine similarity matrix
cos_sim_matrix = cosine_similarity(feature_matrix, feature_matrix)

# Recommendation function
def recommend_anime(title, top_n=10):
    if title not in df['name'].values:
        return "Anime not found."

    idx = df[df['name'] == title].index[0]
    sim_scores = list(enumerate(cos_sim_matrix[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    
    sim_scores = sim_scores[1:top_n+1]  # Exclude the anime itself
    anime_indices = [i[0] for i in sim_scores]
    
    return df[['name', 'genre', 'rating']].iloc[anime_indices]

In [18]:
recommend_anime("Naruto", top_n=5)

Unnamed: 0,name,genre,rating
615,Naruto: Shippuuden,"Action, Comedy, Martial Arts, Shounen, Super P...",7.94
1103,Boruto: Naruto the Movie - Naruto ga Hokage ni...,"Action, Comedy, Martial Arts, Shounen, Super P...",7.68
486,Boruto: Naruto the Movie,"Action, Comedy, Martial Arts, Shounen, Super P...",8.03
1343,Naruto x UT,"Action, Comedy, Martial Arts, Shounen, Super P...",7.58
1472,Naruto: Shippuuden Movie 4 - The Lost Tower,"Action, Comedy, Martial Arts, Shounen, Super P...",7.53


In [26]:
from sklearn.metrics import precision_score, recall_score, f1_score

# Simulated evaluation for known similar titles (optional/advanced)
# You can manually define expected relevant animes and check if they are returned

true_relevant = {'Naruto', 'Naruto: Shippuden', 'Bleach', 'One Piece'}
recommended = set(recommend_anime("Naruto", top_n=5)['name'])


In [21]:
intersection = recommended & true_relevant

if len(intersection) == 0:
    precision = 0
    recall = 0
    f1 = 0
else:
    precision = len(intersection) / len(recommended)
    recall = len(intersection) / len(true_relevant)
    f1 = 2 * (precision * recall) / (precision + recall)

print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1-Score: {f1:.2f}")

Precision: 0.00
Recall: 0.00
F1-Score: 0.00


In [None]:
# Interview Questions:
# 1. Can you explain the difference between user-based and item-based collaborative filtering?
# and:
# User-Based Collaborative Filtering	
# Finds similar users to recommend items	
# Recommend items liked by users similar to you	
	
# Item-Based Collaborative Filtering
# Finds similar items to recommend to users
# Recommend items similar to items you liked
    
# 2. What is collaborative filtering, and how does it work?
# ans
# Collaborative Filtering is a recommendation system technique that suggests items based on the past behavior of users, 
# without needing explicit information about the items.
# Assumes that if two users liked the same item, they are likely to agree again in the future.
# Builds a user-item interaction matrix (e.g., ratings, clicks, purchases).
# Predicts missing entries (what a user might like) based on similar users or similar items.