In [1]:
# 1. Import Required Libraries
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score

# 2. Load Dataset
df = pd.read_csv('C:\\Users\\Sindu\\Desktop\\Assignment ExcelR\\Recommendation System\\Recommendation System\\anime.csv')

# 3. Explore Dataset
print(df.head())
print(df.info())
print(df.isnull().sum())

# 4. Handle Missing Values
df['genre'] = df['genre'].fillna('Unknown')
df['type'] = df['type'].fillna('Unknown')
df['rating'] = df['rating'].fillna(df['rating'].mean())  # fill missing ratings with mean

   anime_id                              name  \
0     32281                    Kimi no Na wa.   
1      5114  Fullmetal Alchemist: Brotherhood   
2     28977                          Gintama°   
3      9253                       Steins;Gate   
4      9969                     Gintama&#039;   

                                               genre   type episodes  rating  \
0               Drama, Romance, School, Supernatural  Movie        1    9.37   
1  Action, Adventure, Drama, Fantasy, Magic, Mili...     TV       64    9.26   
2  Action, Comedy, Historical, Parody, Samurai, S...     TV       51    9.25   
3                                   Sci-Fi, Thriller     TV       24    9.17   
4  Action, Comedy, Historical, Parody, Samurai, S...     TV       51    9.16   

   members  
0   200630  
1   793665  
2   114262  
3   673572  
4   151266  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12294 entries, 0 to 12293
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  

In [5]:
# 5. Feature Extraction
# Use "genre" + "type" as text features
df['features'] = df['genre'] + " " + df['type']

# Show a sample of extracted features
print(df[['name','features']].head())

# Convert text (genre+type) to numerical vectors using TF-IDF
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(df['features'])

print("TF-IDF Matrix Shape:", tfidf_matrix.shape)  # (rows, features)

# Normalize numerical features (rating + members)
scaler = MinMaxScaler()
num_features = scaler.fit_transform(df[['rating','members']].fillna(0))

print("Numeric Features Shape:", num_features.shape)

# Combine text + numeric features
final_features = hstack([tfidf_matrix, num_features])

print("Final Feature Matrix Shape:", final_features.shape)


                               name  \
0                    Kimi no Na wa.   
1  Fullmetal Alchemist: Brotherhood   
2                          Gintama°   
3                       Steins;Gate   
4                     Gintama&#039;   

                                            features  
0         Drama, Romance, School, Supernatural Movie  
1  Action, Adventure, Drama, Fantasy, Magic, Mili...  
2  Action, Comedy, Historical, Parody, Samurai, S...  
3                                Sci-Fi, Thriller TV  
4  Action, Comedy, Historical, Parody, Samurai, S...  
TF-IDF Matrix Shape: (12294, 52)
Numeric Features Shape: (12294, 2)
Final Feature Matrix Shape: (12294, 54)


In [9]:
# 6. Compute Cosine Similarity

# Check the shape of cosine similarity matrix
print("Cosine Similarity Matrix Shape:", cosine_sim.shape)

# Show first 5x5 similarity values
print(cosine_sim[:5, :5])


Cosine Similarity Matrix Shape: (12294, 12294)
[[1.         0.52502301 0.46247873 0.46668313 0.46225335]
 [0.52502301 1.         0.53788793 0.60287706 0.54820695]
 [0.46247873 0.53788793 1.         0.57155151 0.99960911]
 [0.46668313 0.60287706 0.57155151 1.         0.58025853]
 [0.46225335 0.54820695 0.99960911 0.58025853 1.        ]]


In [11]:
# 7. Recommendation Function
indices = pd.Series(df.index, index=df['name']).drop_duplicates()

def recommend_anime(title, top_n=5, threshold=0.2):
    if title not in indices:
        return f"Anime '{title}' not found in dataset."
    
    idx = indices[title]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    
    # filter by threshold
    sim_scores = [s for s in sim_scores if s[1] >= threshold]
    
    sim_scores = sim_scores[1:top_n+1]  # skip itself
    anime_indices = [i[0] for i in sim_scores]
    return df['name'].iloc[anime_indices]

# Example: Get recommendations
print("Recommendations for 'Naruto':")
print(recommend_anime("Naruto", top_n=10, threshold=0.3))

Recommendations for 'Naruto':
615                                    Naruto: Shippuuden
206                                         Dragon Ball Z
346                                           Dragon Ball
588                                       Dragon Ball Kai
1930                                    Dragon Ball Super
1472          Naruto: Shippuuden Movie 4 - The Lost Tower
1573    Naruto: Shippuuden Movie 3 - Hi no Ishi wo Tsu...
486                              Boruto: Naruto the Movie
2342                               Kurokami The Animation
1796                                       Rekka no Honoo
Name: name, dtype: object


In [13]:
# 8. Evaluation (Simple Approach)
# Split into train/test on ratings
train, test = train_test_split(df, test_size=0.2, random_state=42)

# Example evaluation using precision/recall
# (In real cases, we need user-anime interaction matrix, but here we check similarity coverage)
y_true = []
y_pred = []

for anime in test['name'].sample(50, random_state=42):  # sample 50 test items
    recs = recommend_anime(anime, top_n=5, threshold=0.3)
    if isinstance(recs, str):  # skip if anime not found
        continue
    y_true.append(1)  # assume true relevant
    y_pred.append(1 if len(recs) > 0 else 0)

precision = precision_score(y_true, y_pred)
recall = recall_score(y_true, y_pred)
f1 = f1_score(y_true, y_pred)

print("Evaluation Metrics:")
print("Precision:", precision)
print("Recall:", recall)
print("F1-Score:", f1)

Evaluation Metrics:
Precision: 1.0
Recall: 1.0
F1-Score: 1.0


In [None]:
Interview Questions:

1: Difference between User-Based and Item-Based Collaborative Filtering

* User-Based Collaborative Filtering (UBCF): Finds similar users based on their ratings or preferences.
                                             Recommends items liked by similar users.
- Example: If User A and User B have watched many of the same anime, and User B liked "Naruto Shippuden", then User A will get it recommended.
- Challenge: Works poorly when the number of users is huge (scalability).

* Item-Based Collaborative Filtering (IBCF): Finds similar items (anime, movies, books) based on how users have rated them.
                                             Recommends items similar to what the user already liked.
- Example: If "Naruto" and "Bleach" are rated similarly by many users, then if a user liked "Naruto", they will be recommended "Bleach".
- Advantage: More stable and scalable than user-based, since items are usually fewer and more consistent than users.

* Key Difference: UBCF: looks for similar users.
                  IBCF: looks for similar items.

In [None]:
2: What is Collaborative Filtering, and How Does it Work?

* Collaborative Filtering (CF) is a recommendation technique that makes predictions based on the collective preferences of users.
  The idea: “People who agreed in the past will likely agree in the future.”

* How it works: Build a user–item interaction matrix (e.g., users vs. anime ratings).
                Measure similarity (cosine similarity, Pearson correlation, etc.) either between users or between items.
                Recommend items that similar users liked (user-based) or items similar to what the user liked (item-based).

* Types of CF: User-based CF → recommends items liked by similar users.
               Item-based CF → recommends items similar to those the user liked.

* Advantages: No need for deep knowledge of items (genres, features).
              Learns purely from user interactions.

* Limitations: Cold Start Problem → new users or items have no history.
               Sparsity → when user–item matrix has too many missing values.
               Scalability → with millions of users/items, similarity computation becomes expensive.