In [46]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 

In [47]:
df = pd.read_csv('anime.csv')
df.head()

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,793665
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.25,114262
3,9253,Steins;Gate,"Sci-Fi, Thriller",TV,24,9.17,673572
4,9969,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.16,151266


In [48]:
df.shape

(12294, 7)

In [49]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12294 entries, 0 to 12293
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   anime_id  12294 non-null  int64  
 1   name      12294 non-null  object 
 2   genre     12232 non-null  object 
 3   type      12269 non-null  object 
 4   episodes  12294 non-null  object 
 5   rating    12064 non-null  float64
 6   members   12294 non-null  int64  
dtypes: float64(1), int64(2), object(4)
memory usage: 672.5+ KB


In [50]:
len(df.name.unique())

12292

In [51]:
len(df.genre.unique())

3265

In [52]:
df.type.unique()

array(['Movie', 'TV', 'OVA', 'Special', 'Music', 'ONA', nan], dtype=object)

### Data preprocessing

In [53]:
### Handle missing value

In [54]:
df.isnull().sum()

anime_id      0
name          0
genre        62
type         25
episodes      0
rating      230
members       0
dtype: int64

In [55]:
rat_mean = df['rating'].mean()
rat_mean

np.float64(6.473901690981432)

In [56]:
df['rating'] = df['rating'].fillna(rat_mean)

In [57]:
df['type'] = df['type'].fillna(df['type'].mode()[0])

In [58]:
df['genre'] = df['genre'].fillna(df['genre'].mode()[0])

In [59]:
df.isnull().sum()

anime_id    0
name        0
genre       0
type        0
episodes    0
rating      0
members     0
dtype: int64

In [60]:
df.duplicated().sum()

np.int64(0)

In [61]:
df.groupby('name')['rating'].mean().sort_values(ascending = False)[:6]

name
Taka no Tsume 8: Yoshida-kun no X-Files    10.00
Spoon-hime no Swing Kitchen                 9.60
Mogura no Motoro                            9.50
Kimi no Na wa.                              9.37
Kahei no Umi                                9.33
Fullmetal Alchemist: Brotherhood            9.26
Name: rating, dtype: float64

In [62]:
features = df[['genre', 'type', 'rating', 'members']]
features.head()

Unnamed: 0,genre,type,rating,members
0,"Drama, Romance, School, Supernatural",Movie,9.37,200630
1,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,9.26,793665
2,"Action, Comedy, Historical, Parody, Samurai, S...",TV,9.25,114262
3,"Sci-Fi, Thriller",TV,9.17,673572
4,"Action, Comedy, Historical, Parody, Samurai, S...",TV,9.16,151266


### Feature Extraction 

In [63]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import OneHotEncoder

In [64]:
tfidf = TfidfVectorizer(stop_words="english")
genre_matrix = tfidf.fit_transform(features['genre'])

In [65]:
ohe = OneHotEncoder()
type_matrix = ohe.fit_transform(features[['type']])

In [66]:
genre_df = pd.DataFrame(genre_matrix.toarray(), index=df.index)
type_df = pd.DataFrame(type_matrix.toarray(), index=df.index)

In [67]:
from sklearn.preprocessing import MinMaxScaler

In [68]:
scaler = MinMaxScaler()
num_features = scaler.fit_transform(features[['rating', 'members']])
num_df = pd.DataFrame(num_features, index=df.index, columns=['rating', 'members'])

In [69]:
from scipy.sparse import hstack

In [70]:
final_features = pd.concat([genre_df, type_df, num_df], axis=1)

In [71]:
print("Final feature shape:", final_features.shape)
final_features.head()

Final feature shape: (12294, 54)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,44,45,0.1,1.1,2.1,3.1,4.1,5.1,rating,members
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.440247,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.92437,0.197872
1,0.294649,0.317607,0.0,0.0,0.0,0.0,0.0,0.0,0.335834,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.911164,0.78277
2,0.250631,0.0,0.0,0.0,0.0,0.200766,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.909964,0.112689
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.90036,0.664325
4,0.250631,0.0,0.0,0.0,0.0,0.200766,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.89916,0.149186


### Cosine similarities

In [73]:
from sklearn.metrics.pairwise import cosine_similarity

In [74]:
similarity_matrix = cosine_similarity(final_features)

In [83]:
df1=df.pivot_table(index='name',columns='anime_id',values='rating')
df1.fillna(0,axis=1,inplace = True)
df1.head()

anime_id,1,5,6,7,8,15,16,17,18,19,...,34498,34501,34502,34503,34506,34514,34519,34522,34525,34527
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
&quot;0&quot;,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"&quot;Aesop&quot; no Ohanashi yori: Ushi to Kaeru, Yokubatta Inu",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
&quot;Bungaku Shoujo&quot; Kyou no Oyatsu: Hatsukoi,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
&quot;Bungaku Shoujo&quot; Memoire,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
&quot;Bungaku Shoujo&quot; Movie,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [84]:
cos_sim = cosine_similarity(df1)

In [85]:
cos_sim.shape

(12292, 12292)

In [86]:
cos_sim[1]

array([0., 1., 0., ..., 0., 0., 0.])

In [97]:
def recommended_movie(similar_movie):
    if similar_movie in df1.index:
        index = np.where(similar_movie == df1.index)[0][0]
        similar = sorted(list(enumerate(cos_sim[index])),reverse = True , key = lambda x: x[1])[1:6]
        print('Recommended movie of',similar_movie)
        for i in similar:
            print(df1.index[i[0]])
        else:
            print('Movie is not in the list')
recommended_movie('Hatsukoi')

Recommended movie of Hatsukoi
&quot;0&quot;
&quot;Aesop&quot; no Ohanashi yori: Ushi to Kaeru, Yokubatta Inu
&quot;Bungaku Shoujo&quot; Kyou no Oyatsu: Hatsukoi
&quot;Bungaku Shoujo&quot; Memoire
&quot;Bungaku Shoujo&quot; Movie
Movie is not in the list


In [98]:
similar = sorted(list(enumerate(cos_sim[5])),reverse = True , key = lambda x: x[1])[1:6]
for i in similar:
    print(df1.index[i[0]])

&quot;0&quot;
&quot;Aesop&quot; no Ohanashi yori: Ushi to Kaeru, Yokubatta Inu
&quot;Bungaku Shoujo&quot; Kyou no Oyatsu: Hatsukoi
&quot;Bungaku Shoujo&quot; Memoire
&quot;Bungaku Shoujo&quot; Movie


In [113]:
def recommend_anime(anime_name, threshold=0.3, top_n=10):
    if anime_name not in df['name'].values:
        idx = df[df['name'] == anime_name].index[0]
        sim_scores = list(enumerate(similarity_matrix[idx]))
        sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
        filtered = [(i, score) for i, score in sim_scores if score >= threshold and i != idx]
        top_recommendations = filtered[:top_n]
        recommendations = pd.DataFrame({
        'Anime': df.loc[[i for i, _ in top_recommendations], 'name'].values,
        'Similarity Score': [score for _, score in top_recommendations]
    })
        return recommendations

In [117]:
print(recommend_anime("Hatsukoi", threshold=0.4, top_n=5))

None


### Interview Question 

In [None]:
### User Based Collaborative Filtering
Approach: Find users who are similar to the target user and recommend items that those similar users liked.