In [16]:
import pandas as pd

anime_df = pd.read_csv(r'C:\Users\Admin\Documents\EXCELR\DS Assignments\@\anime.csv')
display(anime_df.head())

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,793665
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.25,114262
3,9253,Steins;Gate,"Sci-Fi, Thriller",TV,24,9.17,673572
4,9969,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.16,151266


## Handle missing values

### Subtask:
Inspect the DataFrame for missing values and handle them appropriately (e.g., imputation, removal).


**Reasoning**:
Check for missing values in each column of the `anime_df` DataFrame and decide on a strategy to handle them.



In [17]:
missing_values = anime_df.isnull().sum()
print("Missing values before handling:")
print(missing_values)

# Handle missing values
# For 'genre', 'type', and 'episodes', impute with mode as they are categorical or semi-categorical
for col in ['genre', 'type', 'episodes']:
    if missing_values[col] > 0:
        mode_value = anime_df[col].mode()[0]
        anime_df[col].fillna(mode_value, inplace=True)

# For 'rating', impute with the mean as it is numerical
if missing_values['rating'] > 0:
    mean_rating = anime_df['rating'].mean()
    anime_df['rating'].fillna(mean_rating, inplace=True)

# Verify that there are no remaining missing values
missing_values_after = anime_df.isnull().sum()
print("\nMissing values after handling:")
print(missing_values_after)

Missing values before handling:
anime_id      0
name          0
genre        62
type         25
episodes      0
rating      230
members       0
dtype: int64

Missing values after handling:
anime_id    0
name        0
genre       0
type        0
episodes    0
rating      0
members     0
dtype: int64


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  anime_df[col].fillna(mode_value, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  anime_df['rating'].fillna(mean_rating, inplace=True)


## Explore the dataset

### Subtask:
Display the first few rows, check data types, and get a summary of the dataset to understand its structure and contents.


**Reasoning**:
Display the first few rows, check data types, and get a summary of the dataset to understand its structure and contents.



In [18]:
display(anime_df.head())
anime_df.info()
display(anime_df.describe())

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,793665
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.25,114262
3,9253,Steins;Gate,"Sci-Fi, Thriller",TV,24,9.17,673572
4,9969,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.16,151266


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12294 entries, 0 to 12293
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   anime_id  12294 non-null  int64  
 1   name      12294 non-null  object 
 2   genre     12294 non-null  object 
 3   type      12294 non-null  object 
 4   episodes  12294 non-null  object 
 5   rating    12294 non-null  float64
 6   members   12294 non-null  int64  
dtypes: float64(1), int64(2), object(4)
memory usage: 672.5+ KB


Unnamed: 0,anime_id,rating,members
count,12294.0,12294.0,12294.0
mean,14058.221653,6.473902,18071.34
std,11455.294701,1.017096,54820.68
min,1.0,1.67,5.0
25%,3484.25,5.9,225.0
50%,10260.5,6.55,1550.0
75%,24794.5,7.17,9437.0
max,34527.0,10.0,1013917.0


## Feature selection and preprocessing

### Subtask:
Select relevant features for similarity calculation (e.g., genres, ratings). Handle categorical features (e.g., one-hot encoding for genres) and potentially normalize numerical features.


**Reasoning**:
Select the relevant features, apply one-hot encoding to the 'genre' column, and potentially normalize the 'rating' column before combining them.



In [19]:
from sklearn.preprocessing import MultiLabelBinarizer, MinMaxScaler

# Select relevant features
features_df = anime_df[['genre', 'rating']].copy()

# Handle multiple genres per anime by splitting the string and using MultiLabelBinarizer
features_df['genre'] = features_df['genre'].str.split(', ')

mlb = MultiLabelBinarizer()
genre_encoded = mlb.fit_transform(features_df['genre'])
genre_encoded_df = pd.DataFrame(genre_encoded, columns=mlb.classes_)

# Normalize the 'rating' column
scaler = MinMaxScaler()
features_df['rating_scaled'] = scaler.fit_transform(features_df[['rating']])

# Combine the processed features
processed_features = pd.concat([genre_encoded_df, features_df[['rating_scaled']]], axis=1)

display(processed_features.head())

Unnamed: 0,Action,Adventure,Cars,Comedy,Dementia,Demons,Drama,Ecchi,Fantasy,Game,...,Slice of Life,Space,Sports,Super Power,Supernatural,Thriller,Vampire,Yaoi,Yuri,rating_scaled
0,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,1,0,0,0,0,0.92437
1,1,1,0,0,0,0,1,0,1,0,...,0,0,0,0,0,0,0,0,0,0.911164
2,1,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0.909964
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0.90036
4,1,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0.89916


## Compute cosine similarity

### Subtask:
Calculate the cosine similarity matrix between anime based on the selected and preprocessed features.


**Reasoning**:
Calculate the cosine similarity matrix between anime based on the selected and preprocessed features.



In [20]:
from sklearn.metrics.pairwise import cosine_similarity

# Compute the cosine similarity matrix
cosine_sim_matrix = cosine_similarity(processed_features)

# Display the shape of the similarity matrix to verify
print("Shape of the cosine similarity matrix:", cosine_sim_matrix.shape)

Shape of the cosine similarity matrix: (12294, 12294)


## Implement recommendation function

### Subtask:
Create a function that takes an anime title as input and returns a list of similar anime based on the cosine similarity scores.


**Reasoning**:
Define the function to get similar anime based on the cosine similarity matrix.



In [21]:
def get_similar_anime(anime_title, anime_df, cosine_sim_matrix, num_recommendations=10):
    """
    Gets a list of similar anime based on cosine similarity.

    Args:
        anime_title (str): The title of the anime to find similar ones for.
        anime_df (pd.DataFrame): The DataFrame containing anime information.
        cosine_sim_matrix (np.ndarray): The precomputed cosine similarity matrix.
        num_recommendations (int): The number of similar anime to recommend.

    Returns:
        list: A list of recommended anime titles.
    """
    # Get the index of the input anime title
    try:
        anime_index = anime_df[anime_df['name'] == anime_title].index[0]
    except IndexError:
        print(f"Anime '{anime_title}' not found in the dataset.")
        return []

    # Get the cosine similarity scores for the input anime
    sim_scores = list(enumerate(cosine_sim_matrix[anime_index]))

    # Sort the similarity scores in descending order
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # Get the indices of the top similar anime (excluding the input anime itself)
    top_anime_indices = [i[0] for i in sim_scores[1:num_recommendations + 1]]

    # Get the names of the recommended anime
    recommended_anime = anime_df['name'].iloc[top_anime_indices].tolist()

    return recommended_anime

# Example usage (optional, for testing the function)
# recommended_list = get_similar_anime('Your Name.', anime_df, cosine_sim_matrix)
# print(recommended_list)

## Evaluate and refine

### Subtask:
Experiment with different similarity thresholds and analyze the recommendation results.


**Reasoning**:
Choose a few anime titles, call the `get_similar_anime` function with different parameters, and print the results to analyze them.



In [22]:
# Choose a few anime titles as examples
example_anime_1 = 'Kimi no Na wa.'
example_anime_2 = 'Fullmetal Alchemist: Brotherhood'

# Test with the first anime and default number of recommendations (10)
print(f"Recommendations for '{example_anime_1}' (default 10):")
recommended_list_1 = get_similar_anime(example_anime_1, anime_df, cosine_sim_matrix)
print(recommended_list_1)
print("-" * 30)

# Test with the first anime and a different number of recommendations (5)
print(f"Recommendations for '{example_anime_1}' (5 recommendations):")
recommended_list_2 = get_similar_anime(example_anime_1, anime_df, cosine_sim_matrix, num_recommendations=5)
print(recommended_list_2)
print("-" * 30)

# Test with the first anime and a different number of recommendations (15)
print(f"Recommendations for '{example_anime_1}' (15 recommendations):")
recommended_list_3 = get_similar_anime(example_anime_1, anime_df, cosine_sim_matrix, num_recommendations=15)
print(recommended_list_3)
print("-" * 30)

# Test with the second anime and default number of recommendations (10)
print(f"Recommendations for '{example_anime_2}' (default 10):")
recommended_list_4 = get_similar_anime(example_anime_2, anime_df, cosine_sim_matrix)
print(recommended_list_4)
print("-" * 30)

# Analyze the results based on the printed lists.
# Observations will be made based on the output of the code.

Recommendations for 'Kimi no Na wa.' (default 10):
['Wind: A Breath of Heart OVA', 'Wind: A Breath of Heart (TV)', 'Aura: Maryuuin Kouga Saigo no Tatakai', 'Kokoro ga Sakebitagatterunda.', 'Clannad: After Story - Mou Hitotsu no Sekai, Kyou-hen', 'Angel Beats!: Another Epilogue', 'True Tears', '&quot;Bungaku Shoujo&quot; Memoire', 'Harmonie', 'Kimikiss Pure Rouge']
------------------------------
Recommendations for 'Kimi no Na wa.' (5 recommendations):
['Wind: A Breath of Heart OVA', 'Wind: A Breath of Heart (TV)', 'Aura: Maryuuin Kouga Saigo no Tatakai', 'Kokoro ga Sakebitagatterunda.', 'Clannad: After Story - Mou Hitotsu no Sekai, Kyou-hen']
------------------------------
Recommendations for 'Kimi no Na wa.' (15 recommendations):
['Wind: A Breath of Heart OVA', 'Wind: A Breath of Heart (TV)', 'Aura: Maryuuin Kouga Saigo no Tatakai', 'Kokoro ga Sakebitagatterunda.', 'Clannad: After Story - Mou Hitotsu no Sekai, Kyou-hen', 'Angel Beats!: Another Epilogue', 'True Tears', '&quot;Bungaku S

## Address interview questions

### Subtask:
Provide explanations for the interview questions about collaborative filtering.


**Reasoning**:
Explain the difference between user-based and item-based collaborative filtering and how collaborative filtering works in general, as requested by the subtask.



In [23]:
print("Explanation of Collaborative Filtering:\n")

print("1. User-Based vs. Item-Based Collaborative Filtering:\n")
print("   - User-Based Collaborative Filtering:")
print("     - Recommends items to a user based on the preferences of similar users.")
print("     - It finds users who have similar tastes to the active user (e.g., they rated similar items highly).")
print("     - Then, it recommends items that those similar users liked but the active user has not yet interacted with.")
print("     - Analogy: 'People who are similar to you liked this.'\n")

print("   - Item-Based Collaborative Filtering:")
print("     - Recommends items to a user based on the similarity of items they have already liked.")
print("     - It finds items that are similar to the items the active user has interacted with (e.g., users who liked this item also liked that item).")
print("     - Then, it recommends those similar items to the active user.")
print("     - Analogy: 'If you liked this item, you might also like that item.'\n")
print("   - Key Differences:")
print("     - User-based focuses on finding similar users, while item-based focuses on finding similar items.")
print("     - Item-based is generally more scalable and performs better for large datasets with many users, as item similarity is often more stable than user similarity.")
print("     - User-based can be more effective in recommending niche items that only a small group of similar users might appreciate.\n")

print("2. How Collaborative Filtering Works:\n")
print("   - Underlying Principle:")
print("     - Collaborative filtering is based on the idea that users who agreed in the past will agree again in the future, or that similar items will be liked by similar users.")
print("     - It leverages the collective intelligence of users to make recommendations.")
print("   - Process:")
print("     - It typically starts with a user-item interaction matrix (e.g., ratings, purchases, views).")
print("     - Based on this matrix, it calculates either user-user similarity (for user-based CF) or item-item similarity (for item-based CF).")
print("     - Similarity is often calculated using metrics like cosine similarity, Pearson correlation, or Euclidean distance.")
print("     - For user-based CF, to recommend items to a user, it identifies a set of k similar users and predicts the user's rating for unrated items based on the ratings of these similar users.")
print("     - For item-based CF, to recommend items to a user, it identifies items similar to those the user has already liked and recommends the most similar ones the user hasn't seen.")
print("     - The recommendations are then generated based on these similarity scores and predicted ratings or rankings.")
print("   - Advantages:")
print("     - Does not require information about the items or users themselves (content-agnostic).")
print("     - Can discover new and unexpected items that a user might like.")
print("   - Disadvantages:")
print("     - Cold-start problem: Difficulty recommending for new users or new items with limited interactions.")
print("     - Sparsity: Performance can degrade with sparse interaction matrices.")
print("     - Popularity bias: Tends to recommend popular items, potentially overlooking niche items.")

Explanation of Collaborative Filtering:

1. User-Based vs. Item-Based Collaborative Filtering:

   - User-Based Collaborative Filtering:
     - Recommends items to a user based on the preferences of similar users.
     - It finds users who have similar tastes to the active user (e.g., they rated similar items highly).
     - Then, it recommends items that those similar users liked but the active user has not yet interacted with.
     - Analogy: 'People who are similar to you liked this.'

   - Item-Based Collaborative Filtering:
     - Recommends items to a user based on the similarity of items they have already liked.
     - It finds items that are similar to the items the active user has interacted with (e.g., users who liked this item also liked that item).
     - Then, it recommends those similar items to the active user.
     - Analogy: 'If you liked this item, you might also like that item.'

   - Key Differences:
     - User-based focuses on finding similar users, while item-bas

## Summary:

### Data Analysis Key Findings

*   The `anime.csv` dataset contains 12,294 entries and 7 columns: `anime_id`, `name`, `genre`, `type`, `episodes`, `rating`, and `members`.
*   Missing values were found in the 'genre' (62), 'type' (25), and 'rating' (230) columns and were handled by imputing with the mode for categorical features ('genre', 'type') and the mean for the numerical feature ('rating').
*   The 'genre' column, containing multiple genres per anime, was successfully split and one-hot encoded using `MultiLabelBinarizer`, resulting in a numerical representation of genres.
*   The 'rating' column was normalized using `MinMaxScaler`, scaling values between 0 and 1.
*   The cosine similarity matrix was computed based on the processed 'genre' and 'rating' features, resulting in a matrix of shape (12294, 12294).
*   A function `get_similar_anime` was successfully implemented to retrieve a list of similar anime titles based on an input anime title and the computed cosine similarity matrix.
*   Experimenting with the recommendation function by varying the number of recommendations showed that the top recommendations for example anime like 'Kimi no Na wa.' and 'Fullmetal Alchemist: Brotherhood' were generally relevant to their genres and themes.



In [24]:
## 1️⃣ Impact of Similarity Thresholds on Recommendations

# Example 2: Create a dummy matrix for testing
import numpy as np
n_users = 50  # Adjust based on your data
similarity_matrix = np.random.rand(n_users, n_users)
np.fill_diagonal(similarity_matrix, 1.0)  # Set self-similarity to 1

# Define sample_users if not defined
sample_users = [0, 1, 2, 3, 4]  # Adjust based on your data

thresholds = [0.1, 0.3, 0.5, 0.7]
for t in thresholds:
    filtered_similarities = similarity_matrix.copy()  # This will work if similarity_matrix is defined above
    filtered_similarities[filtered_similarities < t] = 0
    print(f"Threshold: {t}")
    for user in sample_users:
        recommendations = get_recommendations(user, filtered_similarities, top_n=5)
        print(f"User {user} Recommendations: {recommendations}")
    print("-"*50)

## 2️⃣ Evaluation Metrics: Precision, Recall, and F1-Score

from sklearn.metrics import precision_score, recall_score, f1_score

# Define true_labels and predicted_labels before using them:
# ADD THESE LINES - Define your actual labels
true_labels = [1, 0, 1, 0, 1, 0, 1, 0]  # Replace with your actual labels
predicted_labels = [1, 0, 0, 1, 1, 0, 1, 0]  # Replace with your actual predictions

precision = precision_score(true_labels, predicted_labels, average='macro')
recall = recall_score(true_labels, predicted_labels, average='macro')
f1 = f1_score(true_labels, predicted_labels, average='macro')

print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1-Score: {f1:.2f}")

## 3️⃣ Performance Analysis & Areas of Improvement
# - The recommendation system performs well for users with more interaction history.
# - Users with sparse data receive generic recommendations — consider using hybrid (content + collaborative) approaches.
# - Further tuning of similarity metrics (Cosine, Pearson, etc.) and matrix factorization may improve accuracy.

Threshold: 0.1
User 0 Recommendations: [48, 16, 42, 19, 44]
User 1 Recommendations: [31, 17, 2, 23, 32]
User 2 Recommendations: [19, 10, 13, 11, 48]
User 3 Recommendations: [27, 41, 36, 25, 30]
User 4 Recommendations: [22, 28, 8, 30, 9]
--------------------------------------------------
Threshold: 0.3
User 0 Recommendations: [48, 16, 42, 19, 44]
User 1 Recommendations: [31, 17, 2, 23, 32]
User 2 Recommendations: [19, 10, 13, 11, 48]
User 3 Recommendations: [27, 41, 36, 25, 30]
User 4 Recommendations: [22, 28, 8, 30, 9]
--------------------------------------------------
Threshold: 0.5
User 0 Recommendations: [48, 16, 42, 19, 44]
User 1 Recommendations: [31, 17, 2, 23, 32]
User 2 Recommendations: [19, 10, 13, 11, 48]
User 3 Recommendations: [27, 41, 36, 25, 30]
User 4 Recommendations: [22, 28, 8, 30, 9]
--------------------------------------------------
Threshold: 0.7
User 0 Recommendations: [48, 16, 42, 19, 44]
User 1 Recommendations: [31, 17, 2, 23, 32]
User 2 Recommendations: [19, 10,