In [1]:
import pandas as pd
import numpy as np
# Load dataset
df = pd.read_csv("D:\\Data Science\\assignments\\anime.csv")
df

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,793665
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.25,114262
3,9253,Steins;Gate,"Sci-Fi, Thriller",TV,24,9.17,673572
4,9969,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.16,151266
...,...,...,...,...,...,...,...
12289,9316,Toushindai My Lover: Minami tai Mecha-Minami,Hentai,OVA,1,4.15,211
12290,5543,Under World,Hentai,OVA,1,4.28,183
12291,5621,Violence Gekiga David no Hoshi,Hentai,OVA,4,4.88,219
12292,6133,Violence Gekiga Shin David no Hoshi: Inma Dens...,Hentai,OVA,1,4.98,175


In [2]:
# Verify missing values
print("\nMissing Values:\n", df.isna().sum())


Missing Values:
 anime_id      0
name          0
genre        62
type         25
episodes      0
rating      230
members       0
dtype: int64


In [3]:
# Handle missing values
df['genre'] = df['genre'].fillna('Unknown')
df['type'] = df['type'].fillna('Unknown')
df['rating'] = df['rating'].fillna(df['rating'].mean())

In [4]:
# Check if any "unknown" values exist in episodes column

df['episodes'].str.lower().isin(['unknown']).any()

True

In [5]:
# Count how many "unknown" entries are present

df['episodes'].str.lower().value_counts().get('unknown', 0)

340

In [6]:
# Clean and convert 'episodes'

df['episodes'] = df['episodes'].replace(['Unknown', 'unknown', 'None'], np.nan)
df['episodes'] = df['episodes'].astype(float)
median_eps = df['episodes'].median()
df['episodes'] = df['episodes'].fillna(median_eps).astype(int)

In [7]:
# Check for duplicates
duplicates = df[df.duplicated(subset='name', keep=False)]
duplicates

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
10140,22399,Saru Kani Gassen,Kids,OVA,1,5.23,62
10141,30059,Saru Kani Gassen,Drama,Movie,1,4.75,76
10193,33193,Shi Wan Ge Leng Xiaohua,"Comedy, Parody",ONA,12,6.67,114
10194,33195,Shi Wan Ge Leng Xiaohua,"Action, Adventure, Comedy, Fantasy, Parody",Movie,1,7.07,110


In [8]:
# Display final info

print("Cleaned Dataset Info:")
print(df.info())
print("\nDuplicate Anime Titles:")
print(duplicates[['name', 'type', 'episodes', 'rating']])


Cleaned Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12294 entries, 0 to 12293
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   anime_id  12294 non-null  int64  
 1   name      12294 non-null  object 
 2   genre     12294 non-null  object 
 3   type      12294 non-null  object 
 4   episodes  12294 non-null  int32  
 5   rating    12294 non-null  float64
 6   members   12294 non-null  int64  
dtypes: float64(1), int32(1), int64(2), object(3)
memory usage: 624.4+ KB
None

Duplicate Anime Titles:
                          name   type  episodes  rating
10140         Saru Kani Gassen    OVA         1    5.23
10141         Saru Kani Gassen  Movie         1    4.75
10193  Shi Wan Ge Leng Xiaohua    ONA        12    6.67
10194  Shi Wan Ge Leng Xiaohua  Movie         1    7.07


In [9]:
# Verify missing values
print("\nRemaining Missing Values:\n", df.isna().sum())


Remaining Missing Values:
 anime_id    0
name        0
genre       0
type        0
episodes    0
rating      0
members     0
dtype: int64


In [10]:
# Feature Extraction:

# Select Relevant Features
anime_features = df[['name', 'genre', 'type', 'rating', 'members']].copy()
anime_features.head()


Unnamed: 0,name,genre,type,rating,members
0,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,9.37,200630
1,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,9.26,793665
2,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,9.25,114262
3,Steins;Gate,"Sci-Fi, Thriller",TV,9.17,673572
4,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",TV,9.16,151266


In [11]:
# Convert Categorical Features to Numeric (Encoding)

from sklearn.preprocessing import MultiLabelBinarizer

# Split genre strings into lists
anime_features['genre'] = anime_features['genre'].apply(lambda x: x.split(', ') if isinstance(x, str) else [])

mlb = MultiLabelBinarizer()
genre_encoded = mlb.fit_transform(anime_features['genre'])

# Convert to DataFrame
genre_df = pd.DataFrame(genre_encoded, columns=mlb.classes_, index=anime_features.index)


In [12]:
genre_df

Unnamed: 0,Action,Adventure,Cars,Comedy,Dementia,Demons,Drama,Ecchi,Fantasy,Game,...,Slice of Life,Space,Sports,Super Power,Supernatural,Thriller,Unknown,Vampire,Yaoi,Yuri
0,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,1,0,0,0,0,0
1,1,1,0,0,0,0,1,0,1,0,...,0,0,0,0,0,0,0,0,0,0
2,1,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
4,1,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12289,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12290,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12291,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12292,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [13]:
# Combine Encoded Genres with Numeric Features

from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
numeric_scaled = scaler.fit_transform(anime_features[['rating', 'members']])

numeric_df = pd.DataFrame(numeric_scaled, columns=['rating', 'members'], index=anime_features.index)

# Combine everything
anime_final = pd.concat([genre_df, numeric_df], axis=1)


In [14]:
print("Final feature shape:", anime_final.shape)
anime_final.head()


Final feature shape: (12294, 46)


Unnamed: 0,Action,Adventure,Cars,Comedy,Dementia,Demons,Drama,Ecchi,Fantasy,Game,...,Sports,Super Power,Supernatural,Thriller,Unknown,Vampire,Yaoi,Yuri,rating,members
0,0,0,0,0,0,0,1,0,0,0,...,0,0,1,0,0,0,0,0,0.92437,0.197872
1,1,1,0,0,0,0,1,0,1,0,...,0,0,0,0,0,0,0,0,0.911164,0.78277
2,1,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0.909964,0.112689
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0.90036,0.664325
4,1,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0.89916,0.149186


In [15]:
# Recommendation System

from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Compute cosine similarity
cosine_sim = cosine_similarity(anime_final)

# Convert it into a DataFrame for easy lookup
cosine_sim_df = pd.DataFrame(cosine_sim, index=anime_features['name'], columns=anime_features['name'])

# Display sample similarity matrix
cosine_sim_df.head()


name,Kimi no Na wa.,Fullmetal Alchemist: Brotherhood,Gintama°,Steins;Gate,Gintama&#039;,Haikyuu!!: Karasuno Koukou VS Shiratorizawa Gakuen Koukou,Hunter x Hunter (2011),Ginga Eiyuu Densetsu,Gintama Movie: Kanketsu-hen - Yorozuya yo Eien Nare,Gintama&#039;: Enchousen,...,Super Erotic Anime,Taimanin Asagi 3,Teleclub no Himitsu,Tenshi no Habataki Jun,The Satisfaction,Toushindai My Lover: Minami tai Mecha-Minami,Under World,Violence Gekiga David no Hoshi,Violence Gekiga Shin David no Hoshi: Inma Densetsu,Yasuji no Pornorama: Yacchimae!!
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Kimi no Na wa.,1.0,0.310704,0.139393,0.24158,0.139035,0.533946,0.184555,0.379767,0.135738,0.136158,...,0.132291,0.379653,0.141599,0.127127,0.128855,0.11925,0.124952,0.150272,0.154319,0.17306
Fullmetal Alchemist: Brotherhood,0.310704,1.0,0.358559,0.255806,0.361091,0.412498,0.639303,0.451598,0.353487,0.35436,...,0.099299,0.099132,0.106289,0.095439,0.096729,0.08953,0.093803,0.11281,0.115839,0.129897
Gintama°,0.139393,0.358559,1.0,0.375115,0.999908,0.41875,0.458125,0.296822,0.999874,0.999916,...,0.10288,0.102671,0.110119,0.098862,0.100207,0.092736,0.097171,0.116861,0.12001,0.134586
Steins;Gate,0.24158,0.255806,0.375115,1.0,0.378231,0.199987,0.269742,0.469828,0.367429,0.368678,...,0.158095,0.157821,0.169224,0.151946,0.154002,0.142537,0.149342,0.179603,0.184428,0.206812
Gintama&#039;,0.139035,0.361091,0.999908,0.378231,1.0,0.418077,0.459322,0.295911,0.999613,0.999693,...,0.101725,0.10152,0.108882,0.097753,0.099082,0.091696,0.096081,0.11555,0.118663,0.133075


In [16]:
# Define Recommendation Function

def recommend_anime(title, n=5, threshold=0.5):
    # Check if title exists
    if title not in cosine_sim_df.index:
        return f"'{title}' not found in the dataset."
    
    # Get similarity scores for the given anime
    sim_scores = cosine_sim_df[title].sort_values(ascending=False)
    
    # Filter by threshold and skip the first one (itself)
    recommendations = sim_scores[(sim_scores < 0.999) & (sim_scores > threshold)].head(n)
    
    # Display recommendations
    return pd.DataFrame({
        'Recommended Anime': recommendations.index,
        'Similarity Score': recommendations.values
    })


In [17]:
# Try Recommendations

recommend_anime("Naruto", n=5, threshold=0.4)


Unnamed: 0,Recommended Anime,Similarity Score
0,Naruto: Shippuuden,0.998222
1,Naruto: Shippuuden Movie 4 - The Lost Tower,0.970475
2,Naruto: Shippuuden Movie 3 - Hi no Ishi wo Tsu...,0.970356
3,Boruto: Naruto the Movie,0.969445
4,Naruto x UT,0.963999


In [18]:
recommend_anime("Major: World Series", n=5, threshold=0.4)


Unnamed: 0,Recommended Anime,Similarity Score
0,&quot;Eiji&quot;,0.989868
1,Hajime no Ippo: New Challenger,0.887277
2,Hajime no Ippo: Rising,0.887205
3,Major S5,0.886943
4,Hajime no Ippo,0.886304


In [19]:
# Experiment with Different Threshold Values

"""
-> The threshold controls how "close" two anime must be to be considered similar:
-> Higher threshold (0.6–0.8) → fewer but more relevant recommendations.
-> Lower threshold (0.3–0.4) → more recommendations, but some may be less related.
"""
recommend_anime("One Piece", n=5, threshold=0.3)


Unnamed: 0,Recommended Anime,Similarity Score
0,One Piece: Episode of Nami - Koukaishi no Nami...,0.985695
1,One Piece: Episode of Merry - Mou Hitori no Na...,0.985254
2,One Piece: Episode of Sabo - 3 Kyoudai no Kizu...,0.984455
3,One Piece Film: Strong World,0.923249
4,One Piece Film: Z,0.922733


In [20]:
recommend_anime("Death Note", n=5, threshold=0.6)


Unnamed: 0,Recommended Anime,Similarity Score
0,Death Note Rewrite,0.935424
1,Higurashi no Naku Koro ni Kai,0.871985
2,Mousou Dairinin,0.867343
3,Higurashi no Naku Koro ni,0.808046
4,Mirai Nikki (TV),0.772146


In [21]:
recommend_anime("Digimon Adventure", n=5, threshold=0.7)


Unnamed: 0,Recommended Anime,Similarity Score
0,Pokemon Advanced Generation,0.998302
1,Pokemon Diamond &amp; Pearl,0.997931
2,Pokemon XY,0.997845
3,Pokemon XY&amp;Z,0.997582
4,Pokemon XY: Mega Evolution,0.997382


In [23]:
# Evaluation:

from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import precision_score, recall_score, f1_score

# Split dataset into train (80%) and test (20%)
train_df, test_df = train_test_split(anime_final, test_size=0.2, random_state=42)

# Build cosine similarity matrix using all numeric columns
train_cosine_sim = cosine_similarity(train_df)
train_cosine_df = pd.DataFrame(train_cosine_sim)


In [24]:
# Define recommendation function

def recommend_anime(index, cosine_sim_df, top_n=5, threshold=0.5):
    
    if index >= len(cosine_sim_df):
        return []

    sim_scores = cosine_sim_df.iloc[index]  # Get similarity scores for that anime
    sim_scores = sim_scores[sim_scores >= threshold].sort_values(ascending=False)

    # Drop itself (similarity = 1)
    sim_scores = sim_scores.drop(index, errors='ignore')

    # Return top N recommended indices
    return sim_scores.head(top_n).index.tolist()

In [25]:
# Define evaluation function

def evaluate_recommendations(train_df, test_df, cosine_sim_df, threshold=0.5):
    
    y_true, y_pred = [], []

    # Limit test size for faster evaluation (optional)
    for i in range(min(100, len(test_df))):  # Evaluate only first 100 test samples
        recs = recommend_anime(i % len(train_df), cosine_sim_df, top_n=5, threshold=threshold)
        true_vector = set(test_df.iloc[i][test_df.iloc[i] == 1].index)

        for rec_idx in recs:
            if rec_idx < len(train_df):
                rec_vector = set(train_df.iloc[rec_idx][train_df.iloc[rec_idx] == 1].index)
                overlap = len(true_vector.intersection(rec_vector)) > 0
                y_true.append(1)
                y_pred.append(1 if overlap else 0)

    precision = precision_score(y_true, y_pred, zero_division=0)
    recall = recall_score(y_true, y_pred, zero_division=0)
    f1 = f1_score(y_true, y_pred, zero_division=0)

    return precision, recall, f1

In [28]:
# Run evaluation

precision, recall, f1 = evaluate_recommendations(train_df, test_df, train_cosine_df, threshold=0.5)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)

Precision: 1.0
Recall: 0.438
F1 Score: 0.6091794158553546


In [None]:
"""

Interpretation:

-> Precision = 1.0 means that all the anime the system recommended were actually relevant — i.e., no wrong 
   recommendations were made.

-> Recall = 0.438 means the system missed several relevant anime that it should have recommended — it's being 
   too selective.

-> F1 Score = 0.609 shows a moderate overall balance between precision and recall.

-> So, the system is highly precise but not very comprehensive — it recommends fewer items but with high confidence.


Reason for Current Performance:

-> The model uses item-based collaborative filtering (cosine similarity between anime features).

-> Since we only considered genre and rating, the similarity is limited — it doesn’t capture more subtle 
   relationships.

-> The dataset does not contain user-specific ratings — we only have average ratings per anime, not per user. Hence, 
   the system cannot learn individual preferences.

-> Threshold-based filtering may be too strict, reducing recall.


Areas of Improvement:

-> To enhance recommendation quality:

a. Include More Features - Add members, popularity, or episodes to the feature set for better similarity computation.

b. Use User-Based Collaborative Filtering - If user anime rating data is available, compare users preferences to 
   personalize recommendations.

c. Incorporate Content-Based Features - Extract keywords from anime descriptions or synopses using NLP to capture
   deeper semantic similarities.

d. Hybrid Approach - Combine content based and collaborative filtering for more robust and personalized results.

"""

In [None]:
"""

Interview Questions:

1. Can you explain the difference between user-based and item-based collaborative filtering?

User-Based Collaborative Filtering (UBCF):

-> It finds users who have similar preferences and recommends items they liked.
-> Measures similarity between users (using metrics like cosine similarity or Pearson correlation).
-> For example, if User A and User B both like "Naruto" and User B also likes "One Piece," recommend "One Piece" 
   to User A.
-> Less scalable when number of users is very large (since user preferences keep changing).
-> Useful for small systems or when user preferences are distinct and well-known.

Item-Based Collaborative Filtering (IBCF):

-> Finds items that are similar to the ones a user already liked.
-> Measures similarity between items based on user ratings or interactions.
-> For example, if "Naruto" and "One Piece" are often liked together, recommend "One Piece" to a user who 
   liked "Naruto".
-> More stable and scalable because item relationships remain fairly consistent.
-> Preferred in large systems (like Amazon, Netflix) due to better performance and stability.


2. What is collaborative filtering, and how does it work?

-> Collaborative Filtering (CF) is a popular recommendation technique that predicts what a user might like based 
   on past behaviors and preferences of similar users.
-> It assumes that if two users reacted similarly in the past, they will react similarly in the future.

How It Works:

a. Collect user-item interaction data:
-> For example, ratings given by users to different anime shows.

b. Find similarities:
-> Compute similarity between users (user-based) or items (item-based) using cosine similarity, correlation, etc.

c. Generate predictions:
-> If User A hasn’t watched “Attack on Titan,” predict their rating by looking at ratings from similar users or
   similar anime.

d. Recommend items:
-> Suggest the top-rated or most similar anime shows to the user.

Example:

-> If many users who liked "Naruto" also liked "One Piece",
   then the system will recommend "One Piece" to a new user who liked "Naruto".


Advantages:

-> Doesn't require item metadata (like genre, type, etc.).
-> Learns from real user preferences and behavior.

Disadvantages:

-> Struggles with new items or users.
-> Requires a lot of user-item interaction data to perform well.

"""