In [1]:
# Import Libraries
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MinMaxScaler



In [3]:
# Load Dataset
# NOTE(review): this is an absolute, machine-specific path, so the notebook only
# re-runs where that file exists; consider a path relative to a data directory.
df = pd.read_csv(r"D:\documents\anime.csv")

# Step 3: Explore Dataset
print("Dataset Shape:", df.shape)
print(df.head())
# DataFrame.info() writes its report straight to stdout and returns None, so it
# must not be wrapped in print() (doing so appended a stray "None" line).
df.info()
# Per-column count of missing values, used to decide the imputation below.
print(df.isnull().sum())

Dataset Shape: (12294, 7)
   anime_id                              name  \
0     32281                    Kimi no Na wa.   
1      5114  Fullmetal Alchemist: Brotherhood   
2     28977                          Gintama°   
3      9253                       Steins;Gate   
4      9969                     Gintama&#039;   

                                               genre   type episodes  rating  \
0               Drama, Romance, School, Supernatural  Movie        1    9.37   
1  Action, Adventure, Drama, Fantasy, Magic, Mili...     TV       64    9.26   
2  Action, Comedy, Historical, Parody, Samurai, S...     TV       51    9.25   
3                                   Sci-Fi, Thriller     TV       24    9.17   
4  Action, Comedy, Historical, Parody, Samurai, S...     TV       51    9.16   

   members  
0   200630  
1   793665  
2   114262  
3   673572  
4   151266  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12294 entries, 0 to 12293
Data columns (total 7 columns):
 #   Column 

In [4]:
# Handle Missing Values
# Missing 'genre' becomes an empty string so text vectorization never sees NaN.
df['genre'] = df['genre'].fillna('')
# Missing 'type' likewise, so string concatenation with it cannot produce NaN.
df['type'] = df['type'].fillna('')
# Missing 'rating' is imputed with the column mean.
df['rating'] = df['rating'].fillna(df['rating'].mean())
# 'episodes' can hold a non-numeric placeholder ('Unknown'). Coerce anything
# unparseable, as well as genuinely missing values, to NaN and fill with 0
# before the integer cast, so the cast cannot fail on NaN or on a placeholder
# other than the literal 'Unknown'.
df['episodes'] = pd.to_numeric(df['episodes'], errors='coerce').fillna(0).astype(int)


In [5]:
#  Feature Extraction

def _split_genres(text):
    """Split a comma-separated genre string into trimmed tokens, dropping empties.

    An empty genre string yields an empty list, so titles without any genre do
    not share a bogus '' token (which would make them all look identical).
    """
    return [token.strip() for token in text.split(',') if token.strip()]


# Combined genre + broadcast-type text, kept as a convenience column.
# It is NOT what gets vectorized below: the similarity uses genre only.
# fillna('') keeps a missing type from turning the whole string into NaN.
df['features'] = df['genre'] + ' ' + df['type'].fillna('')

# Convert genres into a bag-of-genres count matrix.
# token_pattern=None tells CountVectorizer the default regex is intentionally
# unused because a custom tokenizer is supplied (silences the sklearn warning).
vectorizer = CountVectorizer(tokenizer=_split_genres, token_pattern=None)
feature_matrix = vectorizer.fit_transform(df['genre'])

# Normalize numerical features (rating & episodes) into [0, 1] so neither
# column dominates purely because of its scale.
scaler = MinMaxScaler()
df[['rating_scaled', 'episodes_scaled']] = scaler.fit_transform(df[['rating', 'episodes']])

# Plain array of the scaled numeric columns, consumed by the similarity step.
numerical_features = df[['rating_scaled', 'episodes_scaled']].values




In [6]:
# Step 6: Compute Cosine Similarity

# Relative contribution of each similarity component to the blended score.
# Tune these two values to shift emphasis between genre overlap and the
# numeric (rating / episodes) profile.
TEXT_WEIGHT = 0.7
NUMERIC_WEIGHT = 0.3

# NOTE(review): each call below returns a dense n x n array; with the 12294
# rows reported when the dataset was loaded that is roughly 1.2 GB per float64
# matrix, and three of them are alive at the end of this cell.

# Text-based (genre) similarity
text_similarity = cosine_similarity(feature_matrix)

# Similarity of the scaled numeric features
numerical_similarity = cosine_similarity(numerical_features)

# Weighted combination of the two components
cosine_sim = TEXT_WEIGHT * text_similarity + NUMERIC_WEIGHT * numerical_similarity


In [7]:
# Step 7: Recommendation Function

def recommend_anime(title, df=df, cosine_sim=cosine_sim, top_n=10):
    """Return the names of the ``top_n`` anime most similar to ``title``.

    Parameters
    ----------
    title : str
        Exact value to look up in ``df['name']``.
    df : pandas.DataFrame
        Frame with a ``name`` column whose row order matches ``cosine_sim``.
        The default is bound when this function is defined, so re-run this
        cell after rebuilding ``df``.
    cosine_sim : 2-D array-like
        Pairwise similarity matrix indexed by row position. The default is
        bound at definition time as well.
    top_n : int
        Maximum number of recommendations; values below 1 give an empty result.

    Returns
    -------
    numpy.ndarray or str
        Array of recommended names ordered from most to least similar, or the
        message ``"<title> not found in dataset."`` when the title is absent.
    """
    # Locate the title by row *position*: cosine_sim and .iloc are positional,
    # so an index label would be wrong for any frame without a default index.
    positions = np.flatnonzero((df['name'] == title).to_numpy())
    if positions.size == 0:
        return f"{title} not found in dataset."
    idx = int(positions[0])

    scores = np.asarray(cosine_sim[idx])

    # Stable descending sort: tied scores keep ascending row order.
    ranked = np.argsort(-scores, kind='stable')

    # Drop the target explicitly. Skipping "the first element" is only correct
    # when nothing ties with (or beats) the self-similarity score; otherwise a
    # real neighbour is discarded and the target recommends itself.
    ranked = ranked[ranked != idx]

    top = ranked[:max(int(top_n), 0)]
    return df['name'].iloc[top].values


In [8]:
# Step 8: Test the Recommendation System

# Example: Recommend anime similar to "Naruto"
recommendations = recommend_anime("Naruto", top_n=10)
if isinstance(recommendations, str):
    # recommend_anime reports an unknown title as a message string; print it
    # as-is rather than enumerating it one character at a time.
    print(recommendations)
else:
    print("Recommended Anime for 'Naruto':")
    for i, anime in enumerate(recommendations, 1):
        print(f"{i}. {anime}")


Recommended Anime for 'Naruto':
1. Naruto Soyokazeden Movie: Naruto to Mashin to Mitsu no Onegai Dattebayo!!
2. Naruto Shippuuden: Sunny Side Battle
3. Naruto: Shippuuden Movie 3 - Hi no Ishi wo Tsugu Mono
4. Naruto: Shippuuden Movie 4 - The Lost Tower
5. Naruto x UT
6. Boruto: Naruto the Movie - Naruto ga Hokage ni Natta Hi
7. Boruto: Naruto the Movie
8. Naruto: Shippuuden
9. Katekyo Hitman Reborn!
10. Battle Spirits: Ryuuko no Ken


In [9]:
# Step 9: Analyze Recommendations
# Show the similarity scores behind the top recommendations for 'Naruto'.

# Row position (not index label) of the target: cosine_sim and .iloc are
# both positional.
target_idx = int(np.flatnonzero((df['name'] == 'Naruto').to_numpy())[0])

# Every (row, score) pair, highest score first; ties keep ascending row order.
sim_scores = sorted(enumerate(cosine_sim[target_idx]), key=lambda pair: pair[1], reverse=True)

# Exclude the target by position instead of assuming it sorted into slot 0,
# which fails whenever another title ties with the self-similarity score.
top_matches = [(i, score) for i, score in sim_scores if i != target_idx][:10]

print("Top 10 Similarity Scores:")
for i, score in top_matches:
    print(f"{df['name'].iloc[i]}: {score:.4f}")


Top 10 Similarity Scores:
Naruto Soyokazeden Movie: Naruto to Mashin to Mitsu no Onegai Dattebayo!!: 0.9961
Naruto Shippuuden: Sunny Side Battle: 0.9961
Naruto: Shippuuden Movie 3 - Hi no Ishi wo Tsugu Mono: 0.9961
Naruto: Shippuuden Movie 4 - The Lost Tower: 0.9961
Naruto x UT: 0.9961
Boruto: Naruto the Movie - Naruto ga Hokage ni Natta Hi: 0.9961
Boruto: Naruto the Movie: 0.9961
Naruto: Shippuuden: 0.9960
Katekyo Hitman Reborn!: 0.9260
Battle Spirits: Ryuuko no Ken: 0.9222


In [None]:
# Interview Questions:
# 1. Can you explain the difference between user-based and item-based collaborative filtering?

# User-Based vs Item-Based Collaborative Filtering
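# User-Based: Finds users whose past ratings resemble yours and recommends what they liked. (“People like you liked this.”)
# Item-Based: Finds items that were rated similarly, by the same users, to items you liked and recommends those. (“Items like this are recommended.”)
# Key Difference: User-based compares users to each other; item-based compares items to each other. Both rely on rating patterns, not item attributes.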

# 2. What is collaborative filtering, and how does it work?

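# Definition: Collaborative filtering predicts what a user will like from the
# recorded preferences (ratings, views, purchases) of many users.
# How it works:
# 1. Collect user-item interactions (e.g. a user-by-item ratings matrix).
# 2. Measure similarity between users, or between items, from those interactions.
# 3. Recommend the items favoured by the most similar users, or the items most
#    similar to ones the user already liked.
# Note: the recommender built in this notebook compares item attributes
# (genre, rating, episodes), so it is content-based rather than collaborative.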

