# Assignment 11: Recommendation System

In [1]:
# Import libraries
import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer, MinMaxScaler
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score
import numpy as np

In [2]:
# Load dataset
df = pd.read_csv("anime.csv")

In [3]:
df.head()

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,793665
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.25,114262
3,9253,Steins;Gate,"Sci-Fi, Thriller",TV,24,9.17,673572
4,9969,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.16,151266


In [4]:
df.isnull().sum()

anime_id      0
name          0
genre        62
type         25
episodes      0
rating      230
members       0
dtype: int64

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12294 entries, 0 to 12293
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   anime_id  12294 non-null  int64  
 1   name      12294 non-null  object 
 2   genre     12232 non-null  object 
 3   type      12269 non-null  object 
 4   episodes  12294 non-null  object 
 5   rating    12064 non-null  float64
 6   members   12294 non-null  int64  
dtypes: float64(1), int64(2), object(4)
memory usage: 672.5+ KB


In [6]:
df.describe()

Unnamed: 0,anime_id,rating,members
count,12294.0,12064.0,12294.0
mean,14058.221653,6.473902,18071.34
std,11455.294701,1.026746,54820.68
min,1.0,1.67,5.0
25%,3484.25,5.88,225.0
50%,10260.5,6.57,1550.0
75%,24794.5,7.18,9437.0
max,34527.0,10.0,1013917.0


In [7]:
df.shape

(12294, 7)

## Step 1: Processing

In [8]:
df['genre'] = df['genre'].fillna('Unknown')
df['rating'] = df['rating'].fillna(df['rating'].mean())
df['episodes'] = pd.to_numeric(df['episodes'], errors='coerce')
df['episodes'] = df['episodes'].fillna(df['episodes'].median())

## Step 2: Feature Extraction

In [9]:
df['genre_list'] = df['genre'].apply(lambda x: [i.strip() for i in x.split(',')])

In [10]:
mlb = MultiLabelBinarizer()
genre_encoded = mlb.fit_transform(df['genre_list'])
genre_df = pd.DataFrame(genre_encoded, columns=mlb.classes_, index=df.index)

In [11]:
genre_df

Unnamed: 0,Action,Adventure,Cars,Comedy,Dementia,Demons,Drama,Ecchi,Fantasy,Game,...,Slice of Life,Space,Sports,Super Power,Supernatural,Thriller,Unknown,Vampire,Yaoi,Yuri
0,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,1,0,0,0,0,0
1,1,1,0,0,0,0,1,0,1,0,...,0,0,0,0,0,0,0,0,0,0
2,1,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
4,1,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12289,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12290,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12291,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12292,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [12]:
scaler = MinMaxScaler()
num_features = scaler.fit_transform(df[['episodes', 'rating', 'members']])
num_df = pd.DataFrame(num_features, columns=['episodes_norm', 'rating_norm', 'members_norm'], index=df.index)

In [13]:
num_df

Unnamed: 0,episodes_norm,rating_norm,members_norm
0,0.000000,0.924370,0.197872
1,0.034673,0.911164,0.782770
2,0.027518,0.909964,0.112689
3,0.012658,0.900360,0.664325
4,0.027518,0.899160,0.149186
...,...,...,...
12289,0.000000,0.297719,0.000203
12290,0.000000,0.313325,0.000176
12291,0.001651,0.385354,0.000211
12292,0.000000,0.397359,0.000168


In [14]:
# Final feature matrix
feature_matrix = pd.concat([genre_df, num_df], axis=1)

In [15]:
feature_matrix

Unnamed: 0,Action,Adventure,Cars,Comedy,Dementia,Demons,Drama,Ecchi,Fantasy,Game,...,Super Power,Supernatural,Thriller,Unknown,Vampire,Yaoi,Yuri,episodes_norm,rating_norm,members_norm
0,0,0,0,0,0,0,1,0,0,0,...,0,1,0,0,0,0,0,0.000000,0.924370,0.197872
1,1,1,0,0,0,0,1,0,1,0,...,0,0,0,0,0,0,0,0.034673,0.911164,0.782770
2,1,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0.027518,0.909964,0.112689
3,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0.012658,0.900360,0.664325
4,1,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0.027518,0.899160,0.149186
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12289,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0.000000,0.297719,0.000203
12290,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0.000000,0.313325,0.000176
12291,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0.001651,0.385354,0.000211
12292,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0.000000,0.397359,0.000168


## Step 3: Recommendation System

In [16]:
cosine_sim = cosine_similarity(feature_matrix, feature_matrix)

In [17]:
cosine_sim

array([[1.        , 0.31068191, 0.13938585, ..., 0.15027137, 0.15431875,
        0.17306034],
       [0.31068191, 1.        , 0.35863381, ..., 0.11282056, 0.11583098,
        0.12988786],
       [0.13938585, 0.35863381, 1.        , ..., 0.11687054, 0.12000412,
        0.1345798 ],
       ...,
       [0.15027137, 0.11282056, 0.11687054, ..., 1.        , 0.99994463,
        0.99824866],
       [0.15431875, 0.11583098, 0.12000412, ..., 0.99994463, 1.        ,
        0.99881138],
       [0.17306034, 0.12988786, 0.1345798 , ..., 0.99824866, 0.99881138,
        1.        ]])

In [18]:
def recommend_anime(title, top_n=5, threshold=0.5):
    if title not in df['name'].values:
        return "Anime not found!"
    idx = df.index[df['name'] == title][0]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = [s for s in sim_scores if s[1] >= threshold]
    sim_scores = sim_scores[1:top_n+1]  # exclude itself
    recommendations = df.iloc[[i[0] for i in sim_scores]][['name','genre','type','rating']]
    return recommendations

In [19]:
# Example recommendation
print("Recommendations for 'Steins;Gate':")
print(recommend_anime("Steins;Gate", top_n=5))

Recommendations for 'Steins;Gate':
                                                    name  \
59            Steins;Gate Movie: Fuka Ryouiki no Déjà vu   
126                Steins;Gate: Oukoubakko no Poriomania   
196    Steins;Gate: Kyoukaimenjou no Missing Link - D...   
10898                                      Steins;Gate 0   
5126                                       Under the Dog   

                          genre     type    rating  
59             Sci-Fi, Thriller    Movie  8.610000  
126            Sci-Fi, Thriller  Special  8.460000  
196            Sci-Fi, Thriller  Special  8.340000  
10898          Sci-Fi, Thriller      NaN  6.473902  
5126   Action, Sci-Fi, Thriller      OVA  6.550000  


## Step 4: Evaluation

In [20]:
# (Toy evaluation since true user preferences are not included in dataset)
X_train, X_test = train_test_split(feature_matrix, test_size=0.2, random_state=42)

In [21]:
# Fake "ground truth": assume most similar anime are relevant
y_true = []
y_pred = []

In [22]:
for i in range(len(X_test)):
    sims = cosine_similarity([X_test.iloc[i]], X_train)[0]
    top_idx = np.argsort(sims)[::-1][:5]
    y_true.extend([1]*5)  # pretend top 5 are true
    y_pred.extend([1 if sims[j] > 0.5 else 0 for j in top_idx])


In [23]:
print("Precision:", precision_score(y_true, y_pred))
print("Recall:", recall_score(y_true, y_pred))
print("F1-score:", f1_score(y_true, y_pred))


Precision: 1.0
Recall: 1.0
F1-score: 1.0


## Interview Question:

**1. Difference between user-based and item-based collaborative filtering?**



User-based CF → Finds similar users based on ratings and recommends items those users liked.
Item-based CF → Finds similar items based on ratings and recommends similar items to what a user has rated/liked.
Item-based is more scalable in large datasets.

**2. What is collaborative filtering, and how does it work?**

Collaborative filtering is a recommendation technique that uses past user-item interactions.
It assumes: “Users who agreed in the past will agree in the future.
Works by either comparing users (user-based) or comparing items (item-based) using similarity metrics like cosine similarity
or Pearson correlation.