# ---------------------- Recommendation System -----------------------------

In [64]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import precision_score, recall_score, f1_score


## --- 1. Data Preprocessing ---

In [65]:
# Load the dataset
# The path is relative, so 'anime.csv' is looked up in the notebook's working directory.
df = pd.read_csv('anime.csv')
# Preview the first five rows to check that the columns were parsed as expected.
df.head()

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,793665
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.25,114262
3,9253,Steins;Gate,"Sci-Fi, Thriller",TV,24,9.17,673572
4,9969,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.16,151266


In [66]:
# 1. Fill missing 'genre' and 'type' with 'Unknown'
# Assign the filled column back rather than calling fillna(inplace=True) on
# df[col]: that form is chained assignment, which emits a FutureWarning on
# pandas 2.x and leaves df unchanged once Copy-on-Write is enabled.
df['genre'] = df['genre'].fillna('Unknown')
df['type'] = df['type'].fillna('Unknown')

In [67]:
# 2. Fill missing 'rating' with the mean rating
# mean() skips NaN by default, so the average is taken over the known ratings only.
mean_rating = df['rating'].mean()
# Assign back instead of fillna(inplace=True) on df['rating']; the in-place
# form on a selected column is chained assignment and does not update df
# under pandas Copy-on-Write.
df['rating'] = df['rating'].fillna(mean_rating)

In [68]:
# 3. Make 'episodes' numeric
# Values that cannot be parsed as numbers (such as the text 'Unknown') become
# NaN under errors='coerce'; those gaps are set to 0 before the integer cast.
df['episodes'] = (
    pd.to_numeric(df['episodes'], errors='coerce')
    .fillna(0)
    .astype(int)
)


In [69]:
# 4. Keep a single row per anime title (the first occurrence is kept) and
# renumber the rows from 0 so that row labels and row positions coincide.
df = df.drop_duplicates(subset=['name']).reset_index(drop=True)

In [70]:
# Summarise the cleaned frame: row count, per-column non-null counts and dtypes.
print("\nData After Preprocessing:")
df.info()



Data After Preprocessing:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12292 entries, 0 to 12291
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   anime_id  12292 non-null  int64  
 1   name      12292 non-null  object 
 2   genre     12292 non-null  object 
 3   type      12292 non-null  object 
 4   episodes  12292 non-null  int32  
 5   rating    12292 non-null  float64
 6   members   12292 non-null  int64  
dtypes: float64(1), int32(1), int64(2), object(3)
memory usage: 624.3+ KB


## --- 2. Feature Extraction and Scaling ---

In [71]:
# 2.1. Feature Preprocessing
# --- Build the text feature that TF-IDF will consume ---
# Turn the comma-separated genre list into plain space-separated words
# (", " first, then any comma that had no trailing space).
df['genre'] = (
    df['genre']
    .str.replace(', ', ' ')
    .str.replace(',', ' ')
)
# Text feature = genre words followed by the broadcast type
df['features'] = df['genre'] + ' ' + df['type']

# --- Bring 'rating' and 'members' onto a common 0-1 range ---
# Each column gets its own scaler so either one can be inverted independently later.
scaler_rating = MinMaxScaler()
scaler_members = MinMaxScaler()

# Min-max scale the rating column
df['rating_scaled'] = scaler_rating.fit_transform(df[['rating']])

# Min-max scale the community-size column
df['members_scaled'] = scaler_members.fit_transform(df[['members']])

# Show raw and scaled values side by side as a sanity check
print("\nScaled Numerical Features Head:")
print(df[['rating', 'rating_scaled', 'members', 'members_scaled']].head())


Scaled Numerical Features Head:
   rating  rating_scaled  members  members_scaled
0    9.37       0.924370   200630        0.197872
1    9.26       0.911164   793665        0.782770
2    9.25       0.909964   114262        0.112689
3    9.17       0.900360   673572        0.664325
4    9.16       0.899160   151266        0.149186


In [72]:
# 2.2. TF-IDF and Feature Matrix Creation
# Vectorizer for the combined 'features' text (genre + type).
# English stop words are removed, although genre terms rarely are stop words.
# NOTE(review): the default token pattern splits on punctuation, so a genre
# such as 'Sci-Fi' presumably becomes two terms ('sci', 'fi') - confirm that
# this is intended.
tfidf = TfidfVectorizer(stop_words='english')

# Learn the vocabulary and produce one sparse TF-IDF row per anime
tfidf_matrix = tfidf.fit_transform(df['features'])

print(f"\nTF-IDF Matrix Shape: {tfidf_matrix.shape}")

# --- Append the scaled numeric columns to the text features ---
# Densify the sparse matrix so it can be joined with a NumPy array
tfidf_array = tfidf_matrix.toarray()

# Scaled rating and community size, one row per anime
numerical_features = df[['rating_scaled', 'members_scaled']].to_numpy()

# Column-wise join: TF-IDF terms first, then the two numeric columns.
# This is the matrix the cosine similarity is computed on.
feature_matrix = np.concatenate([tfidf_array, numerical_features], axis=1)

print(f"Final Feature Matrix Shape: {feature_matrix.shape}")


TF-IDF Matrix Shape: (12292, 52)
Final Feature Matrix Shape: (12292, 54)


## --- 3. Recommendation System Implementation ---

In [73]:
# --- Task: Calculate Cosine Similarity ---
# Compute the cosine similarity matrix
# The matrix element (i, j) is the similarity between anime i and anime j.
cosine_sim = cosine_similarity(feature_matrix, feature_matrix)

print(f"\nCosine Similarity Matrix Shape: {cosine_sim.shape}")

# Create a Series that maps each anime title to its row index.
# Series.drop_duplicates() compares the VALUES (the row numbers, which are
# already unique), so it can never remove a repeated title. De-duplicate on
# the title index instead, keeping the first occurrence, so that
# indices[title] is always a single row number.
indices = pd.Series(df.index, index=df['name'])
indices = indices[~indices.index.duplicated(keep='first')]

# --- Task: Design the Recommendation Function ---
def get_recommendations(title, cosine_sim_matrix=cosine_sim, df=df, indices=indices, top_n=10, min_score=0.5):
    """
    Recommend the anime most similar to the given title.

    Parameters:
    - title (str): Name of the anime to find neighbours for.
    - cosine_sim_matrix (np.array): Pre-computed pairwise similarity matrix.
    - df (pd.DataFrame): Anime table; its 'name' and 'rating' columns are read.
    - indices (pd.Series): Lookup from anime title to row index.
    - top_n (int): Upper bound on the number of recommendations returned.
    - min_score (float): Candidates scoring below this value are discarded.

    Returns:
    - pd.DataFrame with the columns 'Recommended Anime', 'Similarity Score'
      and 'Rating' (rounded to 2 decimals), ordered from most to least
      similar, or a message string when the title is not present in indices.
    """
    # Unknown title: answer with a message instead of raising
    if title not in indices:
        return f"Anime '{title}' not found in the dataset."

    # Row of the requested anime
    target_row = indices[title]

    # Pair every row number with its similarity to the target, best first
    ranked = list(enumerate(cosine_sim_matrix[target_row]))
    ranked.sort(key=lambda pair: pair[1], reverse=True)

    # Skip the target itself and anything under the threshold
    kept_rows = []
    kept_scores = []
    for row, score in ranked:
        if row != target_row and score >= min_score:
            kept_rows.append(row)
            kept_scores.append(score)

    # Cap the list at top_n entries
    kept_rows = kept_rows[:top_n]
    kept_scores = kept_scores[:top_n]

    # Look up titles and ratings for the surviving rows; the rating Series
    # keeps its row labels, which become the index of the result
    titles = df['name'].iloc[kept_rows].tolist()
    ratings = df['rating'].iloc[kept_rows].round(2)

    return pd.DataFrame({
        'Recommended Anime': titles,
        'Similarity Score': kept_scores,
        'Rating': ratings
    })

# --- Task: Experiment with different threshold values ---
# The same title is queried twice: first with a stricter cut-off and a shorter
# list, then with a looser cut-off and a longer list, so both outputs can be compared.
# The numbers in the printed headers are written by hand; keep them in sync
# with the top_n / min_score arguments passed on the following line.

target_anime = 'Fullmetal Alchemist: Brotherhood' # Example target
print(f"\n--- Recommendations for '{target_anime}' (Top 10, Min Score 0.6) ---")
recommendations_high_thresh = get_recommendations(target_anime, top_n=10, min_score=0.6)
print(recommendations_high_thresh)

print(f"\n--- Recommendations for '{target_anime}' (Top 15, Min Score 0.5) ---")
recommendations_low_thresh = get_recommendations(target_anime, top_n=15, min_score=0.5)
print(recommendations_low_thresh)


Cosine Similarity Matrix Shape: (12292, 12292)

--- Recommendations for 'Fullmetal Alchemist: Brotherhood' (Top 10, Min Score 0.6) ---
                                      Recommended Anime  Similarity Score  \
200                                 Fullmetal Alchemist          0.982810   
288                                          Fairy Tail          0.887223   
268                        Magi: The Labyrinth of Magic          0.860301   
101                          Magi: The Kingdom of Magic          0.844291   
1558      Fullmetal Alchemist: The Sacred Star of Milos          0.839396   
255                                   Fairy Tail (2014)          0.830294   
554          Gate: Jieitai Kanochi nite, Kaku Tatakaeri          0.824667   
402           Fullmetal Alchemist: Brotherhood Specials          0.820312   
795                      Densetsu no Yuusha no Densetsu          0.805627   
555   Gate: Jieitai Kanochi nite, Kaku Tatakaeri 2nd...          0.798184   

      Rating  
2

## --- 4. Evaluation (Conceptual Approach) ---
Evaluating a content-based recommender system without actual user interaction data (like historical purchases or clicks) requires a proxy method. A common approach is to treat the original dataset as "ground truth" and check if the recommendations match similar items in a held-out test set, though this can be complex.

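A simpler, more straightforward method for content-based models, and the one used below, is to treat every anime whose cosine similarity to a test anime is at least a chosen relevance threshold as "relevant", and then check how many of those relevant items appear in that test anime's top-N recommendations. Because the relevant set and the recommendations are both derived from the same similarity matrix, the resulting scores describe the internal consistency of the ranking rather than real user satisfaction.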

In [74]:
#4.1. Dataset Splitting for Evaluation
# Splitting indices for "training" and "testing" the recommendation accuracy
# In content-based, we train on all content features, but test the recommendation function's output.
# Only the row labels are split (80% / 20%); random_state=42 makes the split repeatable.
train_indices, test_indices = train_test_split(df.index, test_size=0.2, random_state=42)
# Report the sizes so the split can be checked against the total row count.
print(f"\nTotal Anime: {len(df)}")
print(f"Train Set Size: {len(train_indices)}")
print(f"Test Set Size: {len(test_indices)}")


Total Anime: 12292
Train Set Size: 9833
Test Set Size: 2459


In [75]:
#4.2. Evaluation Metrics (Precision, Recall, F1-Score)
def evaluate_recommender(test_indices, df, cosine_sim, top_n=10, relevance_threshold=0.9):
    """
    Score top-N recommendations against a similarity-threshold proxy.

    For every row index in test_indices:
    - the "relevant" set is every OTHER item whose similarity to it is
      >= relevance_threshold;
    - the "predicted" set is the top_n most similar OTHER items (ties keep
      their original row order).
    True positives, false positives and false negatives are pooled over all
    test items (micro-average) before the metrics are computed.

    Note: both sets come from the same similarity matrix, so the result
    describes how the top-N cut-off relates to the threshold, not agreement
    with real user preferences.

    Parameters:
    - test_indices (iterable of int): Row positions to evaluate.
    - df (pd.DataFrame): Not used by the computation; kept so existing calls
      keep working.
    - cosine_sim (np.array): Square pairwise similarity matrix.
    - top_n (int): Number of recommendations per item ('k' in Precision@k).
    - relevance_threshold (float): Minimum similarity for an item to count
      as relevant.

    Returns:
    - (precision, recall, f1). A metric whose denominator is zero is 0.
      When there is nothing to score at all, a message is printed and
      (0, 0, 0) is returned.
    """
    sim = np.asarray(cosine_sim)
    true_pos = false_pos = false_neg = 0

    for idx in test_indices:
        row = sim[idx]

        # Ground truth: everything at or above the threshold, except the item itself.
        # The comparison allocates a new array, so the similarity matrix is not modified.
        relevant = row >= relevance_threshold
        relevant[idx] = False

        # Prediction: top_n rows by descending score. A stable sort on the
        # negated scores keeps equal scores in ascending row order.
        order = np.argsort(-row, kind='stable')
        top_rows = order[order != idx][:top_n]
        predicted = np.zeros(row.shape[0], dtype=bool)
        predicted[top_rows] = True

        # Count agreements and disagreements directly instead of building
        # long per-item label lists.
        true_pos += int(np.count_nonzero(relevant & predicted))
        false_pos += int(np.count_nonzero(predicted & ~relevant))
        false_neg += int(np.count_nonzero(relevant & ~predicted))

    if true_pos + false_pos + false_neg == 0:
        print("No relevant recommendations found in the test set above the relevance threshold.")
        return 0, 0, 0

    predicted_total = true_pos + false_pos
    relevant_total = true_pos + false_neg
    precision = true_pos / predicted_total if predicted_total else 0.0
    recall = true_pos / relevant_total if relevant_total else 0.0
    # The denominator is > 0 here because at least one count is non-zero
    f1 = 2 * true_pos / (2 * true_pos + false_pos + false_neg)

    return precision, recall, f1

# --- Run Evaluation ---
# Keep the settings in one place so the printed labels cannot drift away from
# the values that are actually passed to evaluate_recommender.
EVAL_TOP_N = 10                  # 'k' in Precision@k, Recall@k
EVAL_RELEVANCE_THRESHOLD = 0.9   # Minimum similarity counted as 'relevant'

PRECISION, RECALL, F1 = evaluate_recommender(
    test_indices,
    df,
    cosine_sim,
    top_n=EVAL_TOP_N,
    relevance_threshold=EVAL_RELEVANCE_THRESHOLD
)

# evaluate_recommender treats score >= threshold as relevant, so the labels say ">=".
print("\n--- Recommendation System Evaluation ---")
print(f"Precision@{EVAL_TOP_N} (Relevance >= {EVAL_RELEVANCE_THRESHOLD}): {PRECISION:.4f}")
print(f"Recall@{EVAL_TOP_N} (Relevance >= {EVAL_RELEVANCE_THRESHOLD}): {RECALL:.4f}")
print(f"F1-Score@{EVAL_TOP_N} (Relevance >= {EVAL_RELEVANCE_THRESHOLD}): {F1:.4f}")


--- Recommendation System Evaluation ---
Precision@10 (Relevance > 0.9): 0.7332
Recall@10 (Relevance > 0.9): 0.0863
F1-Score@10 (Relevance > 0.9): 0.1545


## 5. Analysis and Improvement

**Analysis**
- **Precision:** Measures the proportion of recommended anime that were actually relevant. A high precision means the top-N list is very accurate.

- **Recall:** Measures the proportion of all relevant anime that were successfully recommended. A high recall means the system didn't miss many highly relevant items.

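- **Performance Trade-off:** The choice of top_n (list size) and relevance_threshold (the similarity cut-off that defines a relevant item in the evaluation) directly affects Precision and Recall. A larger top_n generally increases Recall but may decrease Precision; with top_n fixed at 10, Recall stays low (0.0863 here) whenever an anime has many more than 10 items above the threshold. The min_score argument of get_recommendations only filters the displayed list and is not used by the evaluation.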

**Areas of Improvement**

**1. Feature Weighting:** Currently, all features (TF-IDF terms, normalized rating, normalized members) are concatenated without any explicit weights.

- **Improvement:** Apply explicit weights (e.g., feature_matrix = np.hstack([tfidf_array * 1.5, numerical_features])) to prioritize genres/type over community rating/size, or vice-versa, based on desired system behavior.

**2. Advanced Feature Engineering:**

- Treat Broadcast Type (type) as its own feature (for example, one-hot encoded) instead of only appending df['type'] to the genre text in df['features'], which is how it is currently included.

- Create a binary feature for popular/unpopular (e.g., members > median(members)).

**3. Alternative Similarity Metrics:** While Cosine Similarity is standard, Pearson Correlation or Euclidean Distance could be experimented with, though they are less common for sparse feature vectors like TF-IDF.

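**4. Hybrid Approach:** Integrate Collaborative Filtering (User-User or Item-Item) if user-specific rating data is available (which this dataset usually has in a separate file). Collaborative filtering would improve relevance for popular items, while the content-based component would continue to cover the cold-start cases (new anime or users with no ratings) that collaborative filtering alone cannot handle.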

## Interview Questions

**1. Can you explain the difference between user-based and item-based collaborative filtering?**

The primary difference between User-Based Collaborative Filtering (UBCF) and Item-Based Collaborative Filtering (IBCF) lies in what they compare to generate a recommendation. UBCF works by finding users who have similar historical preferences (ratings, purchases, clicks) to the target user. Once the "neighboring" similar users are identified, the system recommends items that those neighbors liked but the target user hasn't seen yet. This approach can be computationally expensive and unstable because user preferences often change quickly, leading to a large, dynamic similarity matrix. In contrast, IBCF finds items that are frequently rated or consumed together. It builds a matrix of item-to-item similarity. To make a recommendation, the system looks at items the target user has already liked, identifies items similar to those previously liked items, and recommends the most similar and highly-rated ones. IBCF is generally preferred in production systems like Amazon and Netflix because item similarities (e.g., Star Wars is similar to Star Trek) tend to be more stable over time than user preferences, making the similarity matrix smaller and the recommendations faster to compute.

**2. What is Collaborative Filtering, and How Does It Work?**

Collaborative Filtering (CF) is a fundamental technique in recommendation systems that predicts a user's interest in an item by leveraging the opinions and behaviors of other users. The core idea is that if two users agreed in the past (e.g., they both liked the same movie), they are likely to agree again in the future. Essentially, it uses the "wisdom of the crowd" to find relevant suggestions.


It primarily works through three main steps:

 **1. Data Collection:** It starts with a User-Item Interaction Matrix, where rows are users, columns are items, and the cells contain the user's interaction (e.g., a rating from 1 to 5, or a binary value like 'purchased' or 'watched').

 **2. Similarity Calculation:** The system calculates similarity, either between users (User-Based CF) or between items (Item-Based CF), usually using metrics like Cosine Similarity or Pearson Correlation.

 **3. Prediction/Recommendation:**

  - User-Based: To predict user 'A's rating for a new item, the system takes a weighted average of the ratings given to that item by 'A's most similar neighbors (the "collaborators").

  - Item-Based: To recommend an item to user 'A', the system looks at the items 'A' has already liked, finds the items most similar to those, and recommends the highly-rated similar items 'A' hasn't seen yet.

