In [1]:
pip install pandas scikit-learn



In [2]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score
import numpy as np

In [3]:
# Load the dataset
df = pd.read_csv('anime.csv')

In [4]:
# Show basic info
print("Initial shape:", df.shape)
print(df.columns)

Initial shape: (12294, 7)
Index(['anime_id', 'name', 'genre', 'type', 'episodes', 'rating', 'members'], dtype='object')


In [5]:
# Fill missing values
df['genre'] = df['genre'].fillna('')
df['rating'] = df['rating'].fillna(df['rating'].mean())

In [6]:
# Optional: drop duplicates if any
df = df.drop_duplicates(subset='name')

In [7]:
# TF-IDF on genre column
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(df['genre'])

In [8]:
# Cosine similarity matrix
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

In [9]:
# Build index mapping of anime names to DataFrame index
anime_indices = pd.Series(df.index, index=df['name']).drop_duplicates()

In [10]:
# Recommendation function
def recommend_anime(title, top_n=10):
    if title not in anime_indices:
        return f"'{title}' not found in dataset."
    idx = anime_indices[title]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:top_n+1]  # exclude the anime itself

    anime_indices_similar = [i[0] for i in sim_scores]
    return df[['name', 'genre', 'rating']].iloc[anime_indices_similar]

In [11]:
# Simulate ground truth by splitting known similar genres
df['genre_label'] = df['genre'].apply(lambda x: x.split(',')[0] if x else 'Unknown')
train, test = train_test_split(df, test_size=0.2, random_state=42)

In [12]:
# Simple evaluation: check if test anime genre appears in top-N similar ones from train
def evaluate_model(top_n=10):
    y_true = []
    y_pred = []

    for _, row in test.iterrows():
        title = row['name']
        true_genre = row['genre_label']

        if title not in anime_indices:
            continue

        recommendations = recommend_anime(title, top_n=top_n)
        if isinstance(recommendations, str):  # error message
            continue

        predicted_genres = recommendations['genre'].apply(lambda g: g.split(',')[0] if g else 'Unknown').tolist()

        y_true.append(true_genre)
        y_pred.append(predicted_genres[0] if predicted_genres else 'Unknown')

    precision = precision_score(y_true, y_pred, average='micro', zero_division=0)
    recall = recall_score(y_true, y_pred, average='micro', zero_division=0)
    f1 = f1_score(y_true, y_pred, average='micro', zero_division=0)

    print(f"\nEvaluation:\nPrecision: {precision:.4f}\nRecall: {recall:.4f}\nF1-score: {f1:.4f}")

In [13]:
# Run an example
print("\nRecommended Anime for 'Naruto':")
print(recommend_anime('Naruto'))


Recommended Anime for 'Naruto':
                                                   name  \
615                                  Naruto: Shippuuden   
841                                              Naruto   
1103  Boruto: Naruto the Movie - Naruto ga Hokage ni...   
1343                                        Naruto x UT   
1472        Naruto: Shippuuden Movie 4 - The Lost Tower   
1573  Naruto: Shippuuden Movie 3 - Hi no Ishi wo Tsu...   
2458               Naruto Shippuuden: Sunny Side Battle   
2997  Naruto Soyokazeden Movie: Naruto to Mashin to ...   
7628                            Kyutai Panic Adventure!   
784          Naruto: Shippuuden Movie 6 - Road to Ninja   

                                                  genre  rating  
615   Action, Comedy, Martial Arts, Shounen, Super P...    7.94  
841   Action, Comedy, Martial Arts, Shounen, Super P...    7.81  
1103  Action, Comedy, Martial Arts, Shounen, Super P...    7.68  
1343  Action, Comedy, Martial Arts, Shounen, Super P.

# Task
Explain the error in the selected code. If possible, fix the error and incorporate the changes into the existing code. Otherwise, try to diagnose the error.

## Recalculate cosine similarity

### Subtask:
Calculate the cosine similarity matrix using the `train` dataset.


**Reasoning**:
Apply the fitted TF-IDF vectorizer to the training data and calculate the cosine similarity matrix for the training data.



In [14]:
tfidf_matrix_train = tfidf.transform(train['genre'])
cosine_sim = cosine_similarity(tfidf_matrix_train, tfidf_matrix_train)

## Update anime indices

### Subtask:
Rebuild the `anime_indices` mapping using the indices of the `train` dataset.


**Reasoning**:
Create the `anime_indices` mapping using the index and name from the training set and remove duplicates.



In [15]:
anime_indices = pd.Series(train.index, index=train['name']).drop_duplicates()

## Run evaluation

### Subtask:
Execute the `evaluate_model` function using the updated `anime_indices` and `cosine_sim` matrix.


**Reasoning**:
Execute the evaluate_model function with top_n=5 to evaluate the recommendation model using the updated anime_indices and cosine_sim matrix.



## Recalculate cosine similarity

### Subtask:
Calculate the cosine similarity matrix using the `train` dataset.

**Reasoning**:
Apply the fitted TF-IDF vectorizer to the training data and calculate the cosine similarity matrix for the training data.

In [16]:
# Assuming 'tfidf' vectorizer is already fitted on the combined genre data
tfidf_matrix_train = tfidf.transform(train['genre'])
cosine_sim = cosine_similarity(tfidf_matrix_train, tfidf_matrix_train)

## Update Anime Indices

### Subtask:
Rebuild the `anime_indices` mapping using the indices of the `train` dataset.

**Reasoning**:
Recreate the `anime_indices` Series using the 'name' column and the index of the `train` DataFrame.

In [17]:
# Build index mapping of anime names to DataFrame index using the train data
anime_indices = pd.Series(train.index, index=train['name']).drop_duplicates()

In [18]:
# --- Improved Data Preprocessing ---
# Remove duplicates
df.drop_duplicates(inplace=True)

# Handle missing genres with placeholder
df['genre'] = df['genre'].fillna("Unknown")

# Handle 'Unknown' or '-' or invalid ratings
df['rating'] = pd.to_numeric(df['rating'], errors='coerce')
df['rating'] = df['rating'].fillna(df['rating'].mean())

# Clean and convert episode values
df['episodes'] = pd.to_numeric(df['episodes'], errors='coerce')
df['episodes'] = df['episodes'].fillna(df['episodes'].median())

# Optional: Lowercase genre for consistency
df['genre'] = df['genre'].str.lower().str.strip()


## Interview Question Answers

In [None]:
# 1. Can you explain the difference between user-based and item-based collaborative filtering?
# User-Based Collaborative Filtering:

# Recommends items to a user based on the preferences of other similar users.

# It assumes that if two users liked similar items in the past, they will likely agree on other items too.

# Example: If User A and User B both liked "Naruto" and "Bleach", and User A also liked "One Piece", then User B might also like "One Piece".

# Item-Based Collaborative Filtering:

# Recommends items based on the similarity between items, not users.

# It checks which items are often liked together and recommends similar items to the user based on their past likes.

# Example: If users who liked "Naruto" also liked "Bleach", then "Bleach" will be recommended to someone who likes "Naruto".

# Key Differences:

# User-based focuses on user similarity, item-based focuses on item similarity.

# Item-based is often more stable when the number of users is high and constantly changing.

# 2. What is collaborative filtering, and how does it work?
# Collaborative Filtering is a recommendation technique that makes automatic predictions about a user’s interests by collecting preferences from many users (collaboration).

# How It Works:

# It uses a user-item interaction matrix (like ratings or views).

# Finds patterns in the matrix to identify similarities.

# Two main types:

# User-Based: "Users like you also liked..."

# Item-Based: "Items similar to what you liked..."

# Advantages:

# No need for deep item content analysis (works with sparse data).

# Can discover surprising patterns or associations.

# Challenges:

# Cold Start Problem: New users/items have insufficient data.

# Data Sparsity: Not all users rate all items.

# Scalability: Large datasets need efficient computation.