In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
data = pd.read_csv('anime.csv')
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12294 entries, 0 to 12293
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   anime_id  12294 non-null  int64  
 1   name      12294 non-null  object 
 2   genre     12232 non-null  object 
 3   type      12269 non-null  object 
 4   episodes  12294 non-null  object 
 5   rating    12064 non-null  float64
 6   members   12294 non-null  int64  
dtypes: float64(1), int64(2), object(4)
memory usage: 672.5+ KB


In [3]:
data.isnull().sum()

anime_id      0
name          0
genre        62
type         25
episodes      0
rating      230
members       0
dtype: int64

In [4]:
genere_dummies = data['genre'].str.get_dummies(sep=',')
genere_dummies

Unnamed: 0,Adventure,Cars,Comedy,Dementia,Demons,Drama,Ecchi,Fantasy,Game,Harem,...,Shoujo,Shounen,Slice of Life,Space,Sports,Super Power,Supernatural,Thriller,Vampire,Yaoi
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,0,0,0,0,1,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12289,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12290,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12291,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12292,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [5]:
anime = pd.concat([data, genere_dummies], axis=1)
anime.head()

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members,Adventure,Cars,Comedy,...,Shoujo,Shounen,Slice of Life,Space,Sports,Super Power,Supernatural,Thriller,Vampire,Yaoi
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,793665,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.25,114262,0,0,1,...,0,0,0,0,0,0,0,0,0,0
3,9253,Steins;Gate,"Sci-Fi, Thriller",TV,24,9.17,673572,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,9969,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.16,151266,0,0,1,...,0,0,0,0,0,0,0,0,0,0


In [6]:
from sklearn.preprocessing import MinMaxScaler

In [7]:
numerical_features = ['rating', 'members']
anime[numerical_features] = (anime[numerical_features] - anime[numerical_features].mean()) / anime[numerical_features].std()

In [8]:
features = ['Action', 'Adventure', 'Cars']
features

['Action', 'Adventure', 'Cars']

In [9]:
cosine_sim = cosine_similarity(anime[features], anime[features])
cosine_sim

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 1., 1., ..., 0., 0., 0.],
       [0., 1., 1., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [10]:
similarity = pd.DataFrame(cosine_sim, index=anime['name'], columns=anime['name'])
similarity

name,Kimi no Na wa.,Fullmetal Alchemist: Brotherhood,Gintama°,Steins;Gate,Gintama&#039;,Haikyuu!!: Karasuno Koukou VS Shiratorizawa Gakuen Koukou,Hunter x Hunter (2011),Ginga Eiyuu Densetsu,Gintama Movie: Kanketsu-hen - Yorozuya yo Eien Nare,Gintama&#039;: Enchousen,...,Super Erotic Anime,Taimanin Asagi 3,Teleclub no Himitsu,Tenshi no Habataki Jun,The Satisfaction,Toushindai My Lover: Minami tai Mecha-Minami,Under World,Violence Gekiga David no Hoshi,Violence Gekiga Shin David no Hoshi: Inma Densetsu,Yasuji no Pornorama: Yacchimae!!
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Kimi no Na wa.,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Fullmetal Alchemist: Brotherhood,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Gintama°,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Steins;Gate,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Gintama&#039;,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Toushindai My Lover: Minami tai Mecha-Minami,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Under World,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Violence Gekiga David no Hoshi,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Violence Gekiga Shin David no Hoshi: Inma Densetsu,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [11]:
def recommend_anime(target_title, top_n=5):
    similarity_scores = similarity[target_title]
    similarity_scores = similarity_scores.sort_values(ascending=False)
    return similarity_scores

similarity_scores = recommend_anime('Death Note') # Example target title
similarity_scores_sorted = similarity_scores.sort_values(ascending=False)
similarity_scores_sorted

name
Code Geass: Hangyaku no Lelouch R2             0.0
Yasuji no Pornorama: Yacchimae!!               0.0
Kimi no Na wa.                                 0.0
Fullmetal Alchemist: Brotherhood               0.0
Gintama°                                       0.0
                                              ... 
Hataraku Otona no Renai Jijou The Animation    0.0
Hi Gekiga Ukiyoe Senya Ichiya                  0.0
Hitorijime My Hero                             0.0
Hokenshitsu de Aimashou                        0.0
Idol Kyousei Sousa                             0.0
Name: Death Note, Length: 12294, dtype: float64

In [12]:
top_n = 5 
recommended_anime = similarity_scores_sorted.index[1:top_n+1]
recommended_anime

Index(['Yasuji no Pornorama: Yacchimae!!', 'Kimi no Na wa.',
       'Fullmetal Alchemist: Brotherhood', 'Gintama°', 'Steins;Gate'],
      dtype='object', name='name')

In [13]:
target_anime = 'Death Note'
recommendations = recommend_anime(target_anime)
recommendations

name
Yasuji no Pornorama: Yacchimae!!      0.0
Kimi no Na wa.                        0.0
Fullmetal Alchemist: Brotherhood      0.0
Gintama°                              0.0
Steins;Gate                           0.0
                                     ... 
Gintama&#039;: Enchousen              0.0
Clannad: After Story                  0.0
Koe no Katachi                        0.0
Gintama                               0.0
Code Geass: Hangyaku no Lelouch R2    0.0
Name: Death Note, Length: 12294, dtype: float64

In [14]:
from sklearn.model_selection import train_test_split

In [15]:
anime['rating_binary'] = (anime['rating'] >= 7).astype(int)
anime.head()

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members,Adventure,Cars,Comedy,...,Shounen,Slice of Life,Space,Sports,Super Power,Supernatural,Thriller,Vampire,Yaoi,rating_binary
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,2.820656,3.330106,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,2.713522,14.147831,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,2.703782,1.754642,0,0,1,...,0,0,0,0,0,0,0,0,0,0
3,9253,Steins;Gate,"Sci-Fi, Thriller",TV,24,2.625866,11.957179,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,9969,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,2.616127,2.429643,0,0,1,...,0,0,0,0,0,0,0,0,0,0


In [16]:
train_data, test_data = train_test_split(anime, test_size=0.2, random_state=42)
train_data.head()
test_data.head()

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members,Adventure,Cars,Comedy,...,Shounen,Slice of Life,Space,Sports,Super Power,Supernatural,Thriller,Vampire,Yaoi,rating_binary
6329,17209,Suzy&#039;s Zoo: Daisuki! Witzy - Happy Birthday,Kids,Special,1,-0.295985,-0.326762,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2167,173,Tactics,"Comedy, Drama, Fantasy, Mystery, Shounen, Supe...",TV,25,0.843537,0.169401,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2882,3616,Kamen no Maid Guy,"Action, Comedy, Ecchi, Super Power",TV,12,0.648747,0.176752,0,0,1,...,0,0,0,0,0,0,0,0,0,0
4700,18799,Take Your Way,"Action, Music, Seinen, Supernatural",Music,1,0.181251,-0.304344,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7258,18831,Rinkaku,"Dementia, Horror, Music",Music,1,-0.851137,-0.31859,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [17]:
def evaluate_recommendations(test_data, similarity_df, top_n=5):
    precision = []
    recall = []

    for _,test_anime in test_data.iterrows():
        target_title = test_anime['name']
        recommended_anime = recommend_anime(target_title, top_n)
        relevent_recommendations = [1 if anime in recommended_anime and anime.loc[anime['name'] == target_title, 'rating_binary'].values[0] == 1 else 0 for anime in recommended_anime]
        precision.append(sum(relevent_recommendations) / len(recommended_anime) if len(recommended_anime) > 0 else 0)
        if len(test_data[test_data['rating_binary'] == 1]) > 0:
            recall.append(sum(relevent_recommendations) / len(test_data[test_data['rating_binary'] == 1]))
        else:
            recall.append(0)
    print(f"Precision: {np.mean(precision)}")
    print(f"Recall: {np.mean(recall)}")
    print(f"Recall: {np.mean(recall)}")
    return np.mean(precision), np.mean(recall)

In [None]:
evaluate_recommendations(test_data, similarity)

In [None]:
def recommend_anime(target_title, top_n=5, similarity_df=similarity_df):# Using the global similarity_df as default
    similarity_scores = similarity_df[target_title]
    similarity_scores = similarity_scores.sort_values(ascending=False)

    recommend_anime = similarity_scores.index[1:top_n+1]
    return recommend_anime

In [None]:
relevent_recommendations = [1 if anime in recommended_anime and anime.loc[anime['name'] == target_anime, 'rating_binary'].values[0] == 1 else 0 for anime in recommended_anime]
precision = sum(relevent_recommendations) / len(recommended_anime) if len(recommended_anime) > 0 else 0 
if len(test_data[test_data['rating_binary'] == 1]) > 0:
    recall = sum(relevent_recommendations) / len(test_data[test_data['rating_binary'] == 1])
else:
    recall = 0 

In [None]:
precision

In [None]:
recall

# Inetrview Questions

Can you explain the difference between user-based and item-based collaborative filtering?

User-Based Collaborative Filtering:

This method finds users who are similar to you based on what they like. It recommends things that those similar users have liked. Works well when there are many users and lots of interactions (like ratings or purchases) to compare. Item-Based Collaborative Filtering:

This method finds items that are similar to the ones you’ve already liked or used. It recommends similar items to you. Works well when there are many items but not as many user interactions with each item.

Overall, user-based collaborative filtering focuses on similar users, while item-based collaborative filtering focuses on similar items. Each approach has its strengths and is suitable for different scenarios.

What is collaborative filtering, and how does it work?

Collaborative filtering is a technique used in recommender systems to generate personalized recommendations by analyzing user interactions with items.Collaborative filtering is a method used to suggest things to you based on what other people with similar preferences have liked. It works in two ways:

It compares you with other users to see what they’ve liked (user-based).
Or, it compares the items you’ve liked with other items that are similar (item-based).
This technique is widely used in online stores or streaming platforms to give you personalized suggestions without needing to know much about you personally—it just looks at the patterns in behavior.