In [2]:
import os

import pandas as pd

# 1. Load the dataset
# The CSV location can be overridden through the ANIME_CSV environment variable so the
# notebook also runs on other machines; when the variable is not set, the original
# local path is used unchanged.
DATA_PATH = os.environ.get(
    "ANIME_CSV",
    r"C:\Users\Raiyana Reyaz\OneDrive\Desktop\data science assignments\Recommendation system\anime.csv",
)
df = pd.read_csv(DATA_PATH)

# Data Preprocessing:

In [3]:
# Count the missing entries in every column before any imputation is applied
missing_counts = df.isnull().sum()
print("Missing values:\n", missing_counts)

Missing values:
 anime_id      0
name          0
genre        62
type         25
episodes      0
rating      230
members       0
dtype: int64


In [4]:
# One replacement value per column that has gaps:
#   genre  (categorical) -> the literal label "Unknown"
#   type   (categorical) -> the most frequent value
#   rating (numerical)   -> the median
fill_values = {
    'genre': "Unknown",
    'type': df['type'].mode()[0],
    'rating': df['rating'].median(),
}

# Apply all three replacements in a single pass
df = df.fillna(value=fill_values)

In [5]:
#  BASIC EXPLORATION
# Each entry pairs a heading with the summary object printed under it.
exploration_report = [
    ("\nAfter cleaning missing values:\n", df.isnull().sum()),
    ("\nDataset shape:", df.shape),
    ("\nColumn types:\n", df.dtypes),
    ("\nFirst 5 rows:\n", df.head()),
    ("\nSummary statistics:\n", df.describe()),
]

for heading, payload in exploration_report:
    print(heading, payload)


After cleaning missing values:
 anime_id    0
name        0
genre       0
type        0
episodes    0
rating      0
members     0
dtype: int64

Dataset shape: (12294, 7)

Column types:
 anime_id      int64
name         object
genre        object
type         object
episodes     object
rating      float64
members       int64
dtype: object

First 5 rows:
    anime_id                              name  \
0     32281                    Kimi no Na wa.   
1      5114  Fullmetal Alchemist: Brotherhood   
2     28977                          Gintama°   
3      9253                       Steins;Gate   
4      9969                     Gintama&#039;   

                                               genre   type episodes  rating  \
0               Drama, Romance, School, Supernatural  Movie        1    9.37   
1  Action, Adventure, Drama, Fantasy, Magic, Mili...     TV       64    9.26   
2  Action, Comedy, Historical, Parody, Samurai, S...     TV       51    9.25   
3                           

# Feature Extraction:

In [9]:
from sklearn.preprocessing import MultiLabelBinarizer, MinMaxScaler
# 1. Build genre indicator features
# The comma-separated "genre" text is turned into one 0/1-style column per genre label.
print(df.columns)

# Split each genre string into a list of labels. This is done once; the previous
# version computed the same column twice and discarded the first result.
df['genre_split'] = df['genre'].str.split(', ')
print(df[['genre', 'genre_split']].head())

# One row per (anime, genre) pair -> indicator columns -> collapse back to one row
# per anime by taking the per-index maximum of every indicator.
df_exploded = df.explode('genre_split')
dummies = pd.get_dummies(df_exploded['genre_split'])
genre_ohe = dummies.groupby(df_exploded.index).max()

# Attach the indicator columns to the original frame (aligned on the row index)
df_final = df.join(genre_ohe)

print(df_final.head())

Index(['anime_id', 'name', 'genre', 'type', 'episodes', 'rating', 'members',
       'genre_split'],
      dtype='object')
                                               genre  \
0               Drama, Romance, School, Supernatural   
1  Action, Adventure, Drama, Fantasy, Magic, Mili...   
2  Action, Comedy, Historical, Parody, Samurai, S...   
3                                   Sci-Fi, Thriller   
4  Action, Comedy, Historical, Parody, Samurai, S...   

                                         genre_split  
0             [Drama, Romance, School, Supernatural]  
1  [Action, Adventure, Drama, Fantasy, Magic, Mil...  
2  [Action, Comedy, Historical, Parody, Samurai, ...  
3                                 [Sci-Fi, Thriller]  
4  [Action, Comedy, Historical, Parody, Samurai, ...  
   anime_id                              name  \
0     32281                    Kimi no Na wa.   
1      5114  Fullmetal Alchemist: Brotherhood   
2     28977                          Gintama°   
3      9253    

In [10]:
# Cast the genre indicator columns to integers before attaching them to the catalogue
genre_flags = genre_ohe.astype(int)
df_final = df.join(genre_flags)

print(df_final.head())

   anime_id                              name  \
0     32281                    Kimi no Na wa.   
1      5114  Fullmetal Alchemist: Brotherhood   
2     28977                          Gintama°   
3      9253                       Steins;Gate   
4      9969                     Gintama&#039;   

                                               genre   type episodes  rating  \
0               Drama, Romance, School, Supernatural  Movie        1    9.37   
1  Action, Adventure, Drama, Fantasy, Magic, Mili...     TV       64    9.26   
2  Action, Comedy, Historical, Parody, Samurai, S...     TV       51    9.25   
3                                   Sci-Fi, Thriller     TV       24    9.17   
4  Action, Comedy, Historical, Parody, Samurai, S...     TV       51    9.16   

   members                                        genre_split  Action  \
0   200630             [Drama, Romance, School, Supernatural]       0   
1   793665  [Action, Adventure, Drama, Fantasy, Magic, Mil...       1   
2   1

In [11]:
# 'episodes' is an object column; entries that cannot be parsed as numbers become NaN
# and are then replaced by 0.
episodes_numeric = pd.to_numeric(df_final['episodes'], errors='coerce')
df_final['episodes'] = episodes_numeric.fillna(0)


In [12]:
from sklearn.preprocessing import MinMaxScaler

# Numeric columns that get rescaled with min-max scaling
num_cols = ['rating', 'members', 'episodes']

# Fit the scaler on those columns and write the rescaled values back in place
scaler = MinMaxScaler()
scaled_values = scaler.fit_transform(df_final[num_cols])
df_final[num_cols] = scaled_values

print(df_final[num_cols].head())

     rating   members  episodes
0  0.924370  0.197872  0.000550
1  0.911164  0.782770  0.035204
2  0.909964  0.112689  0.028053
3  0.900360  0.664325  0.013201
4  0.899160  0.149186  0.028053


# Recommendation System:

In [15]:
#Step 1: Build the cosine similarity matrix
from sklearn.metrics.pairwise import cosine_similarity

# Identifier and raw-text columns are left out; every remaining column is a feature
non_feature_cols = {'anime_id', 'name', 'genre', 'type', 'genre_split'}
feature_cols = [col for col in df_final.columns if col not in non_feature_cols]

# Pairwise cosine similarity between all rows of the feature matrix
cosine_sim = cosine_similarity(df_final[feature_cols])

# Label both axes with the anime titles so scores can be looked up by name
titles = df_final['name']
similarity_df = pd.DataFrame(cosine_sim, index=titles, columns=titles)

def recommend_anime(anime_name, n=5, similarity=None):
    """
    Recommend the top-n anime most similar to ``anime_name`` by cosine similarity.

    Parameters
    ----------
    anime_name : str
        Title to look up; must be a label of the similarity matrix.
    n : int, default 5
        Maximum number of recommendations to return. Values below 1 give an
        empty result.
    similarity : pandas.DataFrame, optional
        Square similarity matrix whose index and columns are anime titles.
        When omitted, the notebook-level ``similarity_df`` is used.

    Returns
    -------
    pandas.Series or str
        Similarity scores indexed by title, sorted from most to least similar,
        with the queried title itself removed. If the title is unknown, an
        error message string is returned instead.
    """
    sim = similarity_df if similarity is None else similarity

    if anime_name not in sim.index:
        return f"Error: '{anime_name}' not found in the dataset."

    scores = sim[anime_name]
    # When several rows share the same title, column selection yields a DataFrame
    # (one column per duplicate); use the first matching column.
    if isinstance(scores, pd.DataFrame):
        scores = scores.iloc[:, 0]

    # Remove the queried title by label rather than by position: after sorting,
    # another title with an equal score may come first, so dropping "row 0"
    # could discard a real recommendation and keep the query itself.
    others = scores[scores.index != anime_name]

    return others.sort_values(ascending=False).head(max(n, 0))

    

In [16]:
# Top five titles closest to "Kimi no Na wa." (displayed as the cell result)
recommend_anime(anime_name="Kimi no Na wa.", n=5)

name
Wind: A Breath of Heart OVA                              0.983501
Wind: A Breath of Heart (TV)                             0.981817
Aura: Maryuuin Kouga Saigo no Tatakai                    0.898629
Clannad: After Story - Mou Hitotsu no Sekai, Kyou-hen    0.889102
Kokoro ga Sakebitagatterunda.                            0.888324
Name: Kimi no Na wa., dtype: float64

In [17]:
#Recommend Anime Using Similarity Score Threshold
def recommend_with_threshold(anime_name, threshold=0.7, similarity=None):
    """
    Recommend anime whose cosine similarity to ``anime_name`` is at least ``threshold``.

    Parameters
    ----------
    anime_name : str
        Title to look up; must be a label of the similarity matrix.
    threshold : float, default 0.7
        Minimum similarity score (inclusive) a title needs to be returned.
    similarity : pandas.DataFrame, optional
        Square similarity matrix whose index and columns are anime titles.
        When omitted, the notebook-level ``similarity_df`` is used.

    Returns
    -------
    pandas.Series or str
        Matching scores indexed by title, sorted from most to least similar and
        excluding the queried title. A message string is returned instead when
        the title is unknown or when nothing reaches the threshold.
    """
    sim = similarity_df if similarity is None else similarity

    # Validate
    if anime_name not in sim.index:
        return f"Error: '{anime_name}' not found in dataset."

    # Get similarity scores
    scores = sim[anime_name]
    # When several rows share the same title, column selection yields a DataFrame
    # (one column per duplicate); use the first matching column so the filtering
    # and sorting below operate on a Series.
    if isinstance(scores, pd.DataFrame):
        scores = scores.iloc[:, 0]

    # Filter by threshold & exclude itself
    keep = (scores >= threshold) & (scores.index != anime_name)
    filtered = scores[keep]

    # Sort high → low
    filtered = filtered.sort_values(ascending=False)

    if filtered.empty:
        return f"No anime found above threshold {threshold}."

    return filtered

In [18]:
# Every title scoring at least 0.75 against "Kimi no Na wa." (displayed as the cell result)
recommend_with_threshold(anime_name="Kimi no Na wa.", threshold=0.75)

name
Wind: A Breath of Heart OVA                              0.983501
Wind: A Breath of Heart (TV)                             0.981817
Aura: Maryuuin Kouga Saigo no Tatakai                    0.898629
Clannad: After Story - Mou Hitotsu no Sekai, Kyou-hen    0.889102
Kokoro ga Sakebitagatterunda.                            0.888324
                                                           ...   
Tooi Sekai                                               0.751881
Winter Sonata Episode 0                                  0.750934
VitaminX Addiction                                       0.750726
Yotsunoha                                                0.750297
Shin Dousei Jidai: Hawaiian Breeze                       0.750029
Name: Kimi no Na wa., Length: 92, dtype: float64

In [19]:
# Experimenting With Threshold Values
thresholds = [0.9, 0.8, 0.7, 0.6]

for t in thresholds:
    result = recommend_with_threshold("Kimi no Na wa.", threshold=t)
    print(f"\n--- Threshold {t} ---")
    # recommend_with_threshold returns a plain message string (not a Series) when the
    # title is unknown or nothing reaches the threshold; a string has no .head(),
    # so print the message as-is in that case.
    if isinstance(result, str):
        print(result)
    else:
        print(result.head())


--- Threshold 0.9 ---
name
Wind: A Breath of Heart OVA     0.983501
Wind: A Breath of Heart (TV)    0.981817
Name: Kimi no Na wa., dtype: float64

--- Threshold 0.8 ---
name
Wind: A Breath of Heart OVA                              0.983501
Wind: A Breath of Heart (TV)                             0.981817
Aura: Maryuuin Kouga Saigo no Tatakai                    0.898629
Clannad: After Story - Mou Hitotsu no Sekai, Kyou-hen    0.889102
Kokoro ga Sakebitagatterunda.                            0.888324
Name: Kimi no Na wa., dtype: float64

--- Threshold 0.7 ---
name
Wind: A Breath of Heart OVA                              0.983501
Wind: A Breath of Heart (TV)                             0.981817
Aura: Maryuuin Kouga Saigo no Tatakai                    0.898629
Clannad: After Story - Mou Hitotsu no Sekai, Kyou-hen    0.889102
Kokoro ga Sakebitagatterunda.                            0.888324
Name: Kimi no Na wa., dtype: float64

--- Threshold 0.6 ---
name
Wind: A Breath of Heart OVA        

# Analyze the performance of the recommendation system and identify areas of improvement.

Overall Performance Evaluation
Strengths:
- System works technically (generates valid similarity).
- Genre-based similarity is successfully captured.
- Cosine similarity efficiently retrieves close vector neighbors.
- Threshold method is functional and produces interpretable results.

Weaknesses:
- Genre OHE dominates similarity → unrealistic recommendations
- Popularity and rating do not influence results enough
- Obscure anime overshadow well-known ones
- Semantic similarity (storyline, themes) is not captured
- No learning from user preferences (no collaborative filtering)

The system successfully calculates content-based similarity, but the recommendations are heavily biased toward exact genre matches, leading to unrealistic or obscure results. Cosine similarity using One-Hot Encoded genres fails to capture story similarity, popularity, and semantic meaning.

### Areas of Improvement:
1. Use TF-IDF instead of OHE for genres -> This reduces the weight of common genres (Action, Comedy) and increases the weight of rare genres (Tragedy, Supernatural).
2. Weight features explicitly so that genre no longer dominates the similarity (e.g., down-weight the genre block relative to rating and popularity)
3. Combine content + ratings (Hybrid Recommender)

# Interview Questions:
1. Can you explain the difference between user-based and item-based collaborative filtering?
2. What is collaborative filtering, and how does it work?


### Difference between User-Based and Item-Based Collaborative Filtering
#### User-Based Collaborative Filtering (UBCF):
- Focus: Find similar users based on their preferences/ratings.
- Steps:
   1. Compare users using similarity metrics (cosine, Pearson, etc.)
   2. Recommend items that similar users liked.
- Pros: Works well when the user base is small relative to the number of items
- Cons:
   1. Similar users might be difficult to find when the user base is huge
   2. Real-time recommendations become slow

#### Item-Based Collaborative Filtering (IBCF)
- Focus: Find similar items based on how users rated them.
- Steps:
    1. Compute similarity between items (anime–anime similarity)
    2. Recommend items similar to what user has already liked.
- Pros:
     1. More stable because items do not change as frequently
     2. Faster for large datasets
- Cons: Needs good historical rating data

### What is Collaborative Filtering, and How Does It Work?
Collaborative Filtering is a recommendation technique that predicts a user’s interests by using patterns from many users.
#### How it Works:
- Collect ratings or interactions (likes, views, purchases, etc.)
- Identify patterns such as:
   1. Users who like similar items
   2. Items that are liked together
- Use similarity measures (cosine similarity, Pearson correlation, etc.)
- Recommend items based on behavior of users with similar preferences.