In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Load the anime catalogue from "anime.csv" (relative path, so the file must
# sit in the notebook's working directory). Per the preview rendered below,
# each row is one title with anime_id, name, genre, type, episodes, rating
# and members columns.
df = pd.read_csv("anime.csv")

# Bare last expression: the notebook renders the first five rows as a table.
df.head()


Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,793665
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.25,114262
3,9253,Steins;Gate,"Sci-Fi, Thriller",TV,24,9.17,673572
4,9969,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.16,151266


In [None]:
# Clean the 'episodes' column in one chained pass:
#   1. turn the 'Unknown' placeholder into NaN,
#   2. coerce everything to a numeric dtype (any other non-numeric text -> NaN),
#   3. fill the resulting gaps with the median of the known episode counts.
df['episodes'] = (
    df['episodes']
    .replace('Unknown', np.nan)
    .pipe(pd.to_numeric, errors='coerce')
    .pipe(lambda counts: counts.fillna(counts.median()))
)


In [None]:
# Check missing values (number of nulls per column)
print(df.isnull().sum())

# Genre is the text feature the similarity model is built on, so rows without
# one get an explicit placeholder instead of NaN.
df['genre'] = df['genre'].fillna('Unknown')

# Keep only the columns used further down. The explicit .copy() makes the
# result an independent DataFrame, so later column assignments on `df`
# (e.g. lower-casing 'genre') write to it directly instead of triggering
# pandas' SettingWithCopyWarning on a derived selection.
df = df[['anime_id', 'name', 'genre', 'type', 'episodes']].copy()


anime_id      0
name          0
genre         0
type         25
episodes      0
rating      230
members       0
dtype: int64


In [None]:
# Convert genre to lowercase so the stored genre strings use one consistent
# casing wherever they are displayed or compared later on
df['genre'] = df['genre'].str.lower()

# Initialize TF-IDF Vectorizer
# NOTE(review): with the vectorizer's default tokenisation, multi-word or
# hyphenated genres (e.g. "martial arts", "sci-fi") are presumably split into
# separate tokens, and stop_words='english' would drop words such as "of" --
# confirm this is the intended feature space for genre labels.
tfidf = TfidfVectorizer(stop_words='english')

# Transform genres into numerical vectors (one row per anime, in df row order)
tfidf_matrix = tfidf.fit_transform(df['genre'])

# Dense copy of the TF-IDF matrix.
# NOTE(review): the similarity computation in the next cell is fed
# tfidf_matrix, not tfidf_array, so within the visible notebook this dense
# array is not used -- confirm whether anything else depends on it.
tfidf_array = tfidf_matrix.toarray()


In [None]:
# Compute cosine similarity between all anime.
# Called with a single matrix, so every row is compared with every row; entry
# [i, j] relates the i-th and j-th rows of tfidf_matrix, i.e. the i-th and j-th
# rows of df by position (tfidf_matrix was built from df['genre'] in row order).
cosine_sim = cosine_similarity(tfidf_matrix)


In [None]:
def recommend_anime(title, df, cosine_sim, top_n=5):
    """Return the ``top_n`` anime most similar to ``title``.

    Parameters
    ----------
    title : str
        Exact value to look up in ``df['name']``. If several rows share the
        name, the first one (in row order) is used.
    df : pandas.DataFrame
        Catalogue with at least ``name`` and ``genre`` columns. Its rows must
        be in the same order as the rows/columns of ``cosine_sim``.
    cosine_sim : array-like, shape (len(df), len(df))
        Pairwise similarity scores, addressed by row *position* in ``df``.
    top_n : int, default 5
        Maximum number of recommendations; values <= 0 give an empty result.

    Returns
    -------
    pandas.DataFrame
        The ``name`` and ``genre`` columns of the recommended rows, ordered
        from most to least similar (ties keep ``df`` row order).

    Raises
    ------
    IndexError
        If ``title`` does not occur in ``df['name']``.
    """
    # Work with row positions, not index labels: cosine_sim and df.iloc are
    # both positional, and labels only coincide with positions for a default
    # RangeIndex.
    positions = np.flatnonzero((df['name'] == title).to_numpy())
    if positions.size == 0:
        raise IndexError(f"Anime title not found in df['name']: {title!r}")
    idx = int(positions[0])

    # Similarity of the query anime to every anime (including itself).
    scores = np.asarray(cosine_sim[idx], dtype=float).ravel()

    # Stable descending sort so that ties are broken by row order.
    ranked = np.argsort(-scores, kind='stable')

    # Drop the query explicitly. Slicing off the first entry is not enough:
    # other anime with identical genres also score 1.0 and may sort ahead of
    # the query, which would leave the query itself in the recommendations.
    ranked = ranked[ranked != idx][:max(top_n, 0)]

    # Return the top recommended anime
    return df.iloc[ranked][['name', 'genre']]

# Example Recommendation: look up "Naruto" with the default top_n=5. The bare
# call is the cell's last expression, so the returned DataFrame is rendered.
recommend_anime("Naruto", df, cosine_sim)


Unnamed: 0,name,genre
615,Naruto: Shippuuden,"action, comedy, martial arts, shounen, super p..."
841,Naruto,"action, comedy, martial arts, shounen, super p..."
1103,Boruto: Naruto the Movie - Naruto ga Hokage ni...,"action, comedy, martial arts, shounen, super p..."
1343,Naruto x UT,"action, comedy, martial arts, shounen, super p..."
1472,Naruto: Shippuuden Movie 4 - The Lost Tower,"action, comedy, martial arts, shounen, super p..."


In [None]:
from sklearn.metrics import precision_score, recall_score, f1_score

def evaluate_recommendations(df, cosine_sim, sample_size=100, top_n=5, random_state=None):
    """Score the recommender by genre overlap on a random sample of anime.

    For every sampled anime the top ``top_n`` recommendations are fetched with
    ``recommend_anime`` and each recommendation is compared with the sampled
    anime's genre set (genres are the ``', '``-separated parts of ``genre``):

    * precision = shared genres / genres of the recommendation
    * recall    = shared genres / genres of the sampled anime

    Both are averaged over the recommendations of one anime, an F1 value is
    derived from those two averages, and the three per-anime numbers are then
    averaged over the sample. A recommendation that shares no genre counts as
    0 for precision and recall; leaving such recommendations out would bias
    every metric upwards.

    Note that these are genre-overlap proxies, not precision/recall against
    real user feedback.

    Parameters
    ----------
    df : pandas.DataFrame
        Catalogue with ``name`` and ``genre`` columns (no missing genres).
    cosine_sim : array-like
        Similarity matrix passed through to ``recommend_anime``.
    sample_size : int, default 100
        Number of anime to evaluate; capped at ``len(df)``.
    top_n : int, default 5
        Number of recommendations requested per sampled anime.
    random_state : int, numpy Generator/RandomState or None, default None
        Forwarded to ``DataFrame.sample``; pass a fixed value for
        reproducible metrics. ``None`` draws a fresh random sample each call.

    Returns
    -------
    dict
        ``{"Precision": ..., "Recall": ..., "F1-score": ...}``. Each value is
        NaN when no anime could be evaluated (e.g. an empty ``df``).
    """
    precision_list, recall_list, f1_list = [], [], []

    # DataFrame.sample raises if asked for more rows than exist (without
    # replacement), so cap the request at the catalogue size.
    sample_anime = df.sample(min(sample_size, len(df)), random_state=random_state)

    for _, row in sample_anime.iterrows():
        true_genres = set(row['genre'].split(', '))

        # Get recommended anime
        recommendations = recommend_anime(row['name'], df, cosine_sim, top_n=top_n)

        # Compute genre overlap for every recommendation, zero overlap included.
        precision_vals, recall_vals = [], []
        for rec_genre in recommendations['genre']:
            rec_genres = set(rec_genre.split(', '))
            shared = len(true_genres & rec_genres)
            precision_vals.append(shared / len(rec_genres))
            recall_vals.append(shared / len(true_genres))

        # Nothing was recommended (e.g. top_n <= 0): no score for this anime.
        if not precision_vals:
            continue

        precision = np.mean(precision_vals)
        recall = np.mean(recall_vals)
        # Harmonic mean is undefined when both terms are 0; score that as 0.
        if precision + recall > 0:
            f1 = 2 * precision * recall / (precision + recall)
        else:
            f1 = 0.0

        precision_list.append(precision)
        recall_list.append(recall)
        f1_list.append(f1)

    # np.mean([]) warns and yields NaN; return NaN explicitly and quietly.
    return {
        "Precision": np.mean(precision_list) if precision_list else np.nan,
        "Recall": np.mean(recall_list) if recall_list else np.nan,
        "F1-score": np.mean(f1_list) if f1_list else np.nan
    }

# Run Evaluation with the default sample size. The rows are drawn at random
# inside the function, so the numbers shown below can differ between runs.
evaluate_recommendations(df, cosine_sim)


{'Precision': 0.9616190476190478,
 'Recall': 0.9383238095238094,
 'F1-score': 0.946648777638756}

Interview Questions & Answers
1️⃣ Difference between User-Based and Item-Based Collaborative Filtering
User-Based Collaborative Filtering

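Finds users whose past ratings or viewing history resemble the target user's, then recommends what those similar users liked.
Works well when there are relatively few users compared with items; user-to-user similarities shift as tastes change and become expensive to maintain as the user base grows.
Example: "Users with tastes similar to yours also liked One Piece."
Item-Based Collaborative Filtering

Finds items that tend to be liked by the same users, then recommends items similar to the ones the user already liked.
Works well when there are many more users than items; item-to-item similarities are more stable and can be precomputed.
Example: "Since you liked Naruto, we recommend Bleach."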
2️⃣ What is Collaborative Filtering?
Collaborative Filtering is a recommendation technique that suggests items based on user behavior and preferences.

Types:

User-Based Collaborative Filtering: Recommends items based on the preferences of similar users.
Item-Based Collaborative Filtering: Recommends items similar to those the user has previously liked.
Example:

If User A likes Naruto and User B likes Naruto and One Piece, then One Piece is recommended to User A.







