In [11]:
import pandas as pd
import numpy as np
# Load the dataset
file_path = "/content/anime.csv"
anime_df = pd.read_csv(file_path)

# Display basic information.
# DataFrame.info() writes its report straight to stdout and returns None,
# so it is called on its own: wrapping it in print() only appends a stray
# "None" line after the report.
anime_df.info()

# Show the first few rows
print(anime_df.head())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12294 entries, 0 to 12293
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   anime_id  12294 non-null  int64  
 1   name      12294 non-null  object 
 2   genre     12232 non-null  object 
 3   type      12269 non-null  object 
 4   episodes  12294 non-null  object 
 5   rating    12064 non-null  float64
 6   members   12294 non-null  int64  
dtypes: float64(1), int64(2), object(4)
memory usage: 672.5+ KB
None
   anime_id                              name  \
0     32281                    Kimi no Na wa.   
1      5114  Fullmetal Alchemist: Brotherhood   
2     28977                          Gintama°   
3      9253                       Steins;Gate   
4      9969                     Gintama&#039;   

                                               genre   type episodes  rating  \
0               Drama, Romance, School, Supernatural  Movie        1    9.37   
1  Action, Advent

In [4]:
# Report how many values are missing in each column before cleaning.
print(anime_df.isnull().sum())

# Fill missing values (if any): ratings get the column mean, genres get a
# placeholder label. Reassigning (rather than inplace=True) keeps the step
# explicit and safe to re-run.
anime_df = anime_df.fillna({
    'rating': anime_df['rating'].mean(),
    'genre': 'Unknown',
})


def _to_genre_list(value):
    """Return the genres of one row as a list of lowercase labels.

    Strings are lowercased and split on ', '. Lists are passed through
    untouched so that re-running this cell does not wipe genres that were
    already split. Anything else becomes an empty list.
    """
    if isinstance(value, list):
        return value
    if isinstance(value, str):
        return value.lower().split(', ')
    return []


# Convert genre to lowercase and split into lists
anime_df['genre'] = anime_df['genre'].apply(_to_genre_list)

# View dataset after cleaning
print(anime_df.head())

anime_id      0
name          0
genre        62
type         25
episodes      0
rating      230
members       0
dtype: int64
   anime_id                              name  \
0     32281                    Kimi no Na wa.   
1      5114  Fullmetal Alchemist: Brotherhood   
2     28977                          Gintama°   
3      9253                       Steins;Gate   
4      9969                     Gintama&#039;   

                                               genre   type episodes  rating  \
0             [drama, romance, school, supernatural]  Movie        1    9.37   
1  [action, adventure, drama, fantasy, magic, mil...     TV       64    9.26   
2  [action, comedy, historical, parody, samurai, ...     TV       51    9.25   
3                                 [sci-fi, thriller]     TV       24    9.17   
4  [action, comedy, historical, parody, samurai, ...     TV       51    9.16   

   members  
0   200630  
1   793665  
2   114262  
3   673572  
4   151266  


In [8]:
# Preview the first ten rows of the working frame.
# NOTE(review): the `rating` values shown here depend on execution order --
# the MinMaxScaler cell overwrites `rating` in place, so running this cell
# after it shows scaled values instead of the raw scores.
anime_df.head(10)

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
0,32281,Kimi no Na wa.,"[drama, romance, school, supernatural]",Movie,1,0.92437,200630
1,5114,Fullmetal Alchemist: Brotherhood,"[action, adventure, drama, fantasy, magic, mil...",TV,64,0.911164,793665
2,28977,Gintama°,"[action, comedy, historical, parody, samurai, ...",TV,51,0.909964,114262
3,9253,Steins;Gate,"[sci-fi, thriller]",TV,24,0.90036,673572
4,9969,Gintama&#039;,"[action, comedy, historical, parody, samurai, ...",TV,51,0.89916,151266
5,32935,Haikyuu!!: Karasuno Koukou VS Shiratorizawa Ga...,"[comedy, drama, school, shounen, sports]",TV,10,0.897959,93351
6,11061,Hunter x Hunter (2011),"[action, adventure, shounen, super power]",TV,148,0.895558,425855
7,820,Ginga Eiyuu Densetsu,"[drama, military, sci-fi, space]",OVA,110,0.893157,80679
8,15335,Gintama Movie: Kanketsu-hen - Yorozuya yo Eien...,"[action, comedy, historical, parody, samurai, ...",Movie,1,0.891957,72534
9,15417,Gintama&#039;: Enchousen,"[action, comedy, historical, parody, samurai, ...",TV,13,0.893157,81109


In [7]:
# List the column labels of the working frame.
anime_df.columns

Index(['anime_id', 'name', 'genre', 'type', 'episodes', 'rating', 'members'], dtype='object')

In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MinMaxScaler

# Convert genres into numerical representation using TF-IDF Vectorization.
# Every row of `genre` is already a list of labels, so the analyzer simply
# returns that list and each label becomes exactly one term. The default
# word tokenizer would instead break labels apart ("sci-fi" -> "sci" + "fi",
# "super power" -> "super" + "power"), letting unrelated genres share tokens.
tfidf = TfidfVectorizer(analyzer=lambda genres: genres)
genre_matrix = tfidf.fit_transform(anime_df['genre'])

# Normalize the rating column to the [0, 1] range.
# NOTE: this overwrites `rating` in place, so later cells (and any re-run of
# earlier display cells) see the scaled values rather than the raw scores.
scaler = MinMaxScaler()
anime_df[['rating']] = scaler.fit_transform(anime_df[['rating']])

# Combine all feature vectors: genre TF-IDF columns followed by one
# scaled-rating column.
from scipy.sparse import hstack
features = hstack([genre_matrix, anime_df[['rating']].values])

print("Feature extraction complete.")


Feature extraction complete.


In [9]:
from sklearn.metrics.pairwise import cosine_similarity

# Pairwise cosine similarity between every pair of rows in `features`.
# With a single argument, cosine_similarity compares the rows against
# themselves, which yields a square (n_rows x n_rows) matrix.
cosine_sim = cosine_similarity(features)

# Function to get anime recommendations
def recommend_anime(anime_title, top_n=10):
    """Return the names of the titles most similar to ``anime_title``.

    Reads the module-level ``anime_df`` (for names) and ``cosine_sim``
    (row ``i`` holds the similarities of the ``i``-th row of ``anime_df``
    to every other row).

    Parameters
    ----------
    anime_title : str
        Exact value to look up in ``anime_df['name']``. If several rows
        share the name, the first one is used.
    top_n : int, default 10
        Maximum number of recommendations. Values below 1 give an empty list.

    Returns
    -------
    list of str
        Up to ``top_n`` names ordered by decreasing similarity; ties keep
        dataset order. The queried row itself is never included.
    str
        The message ``"Anime not found in dataset."`` when the title is absent.
    """
    # Work with row *positions*, not index labels: cosine_sim is addressed
    # positionally, and the two only coincide for a default RangeIndex.
    matches = np.flatnonzero((anime_df['name'] == anime_title).to_numpy())
    if matches.size == 0:
        return "Anime not found in dataset."
    anime_pos = int(matches[0])

    # Rank every row by similarity, highest first. A stable sort keeps
    # equal scores in dataset order.
    scores = np.asarray(cosine_sim[anime_pos], dtype=float)
    ranked = np.argsort(-scores, kind='stable')

    # Drop the query row explicitly. Slicing off the first entry is not
    # enough: another row with identical features also scores 1.0 and may
    # sort ahead of the query, which would then recommend itself.
    ranked = ranked[ranked != anime_pos]

    limit = max(int(top_n), 0)
    return anime_df['name'].iloc[ranked[:limit]].tolist()

# Example query: print the five titles ranked most similar to "Naruto".
print(recommend_anime("Naruto", top_n=5))


['Naruto: Shippuuden', 'Boruto: Naruto the Movie - Naruto ga Hokage ni Natta Hi', 'Boruto: Naruto the Movie', 'Naruto x UT', 'Naruto: Shippuuden Movie 4 - The Lost Tower']


In [12]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score

# Dummy target labels (for evaluation)
# NOTE: these labels are random placeholders, so the metrics printed below
# do not measure recommendation quality. The generator is seeded so the
# numbers are at least reproducible across runs.
rng = np.random.default_rng(42)
y = rng.integers(0, 2, size=len(anime_df))  # Replace with actual labels if available
X_train, X_test, y_train, y_test = train_test_split(features, y, test_size=0.2, random_state=42)

# Compute similarity for test data.
# One vectorized call gives a (n_test x n_train) matrix; this replaces a
# per-row loop that re-densified both matrices on every iteration.
SIMILARITY_THRESHOLD = 0.7  # Adjust threshold
similarity_scores = cosine_similarity(X_test, X_train)

# A test item is predicted positive when its best match in the training
# split exceeds the threshold.
y_pred = (similarity_scores.max(axis=1) > SIMILARITY_THRESHOLD).astype(int).tolist()

# Calculate evaluation metrics
print("Precision:", precision_score(y_test, y_pred))
print("Recall:", recall_score(y_test, y_pred))
print("F1-score:", f1_score(y_test, y_pred))


Precision: 0.49816998779991867
Recall: 1.0
F1-score: 0.6650380021715526


In [13]:
# Free-text study notes kept as a bare string expression; the cell only echoes
# the string back as its output.
# NOTE(review): this reads better as a Markdown cell. Also note that the text
# describes collaborative filtering, whereas the recommender built in this
# notebook compares item features (genre TF-IDF + rating), i.e. it is
# content-based.
"""Interview Questions & Answers
1. Difference between User-Based and Item-Based Collaborative Filtering
i) User-Based Collaborative Filtering: Recommends items based on users with similar preferences. It finds users who have rated similar items in the past and suggests what they liked.
ii) Item-Based Collaborative Filtering: Recommends items similar to what the user has already liked. It calculates similarities between items instead of users.

2. What is Collaborative Filtering and How Does It Work?
Collaborative Filtering is a recommendation technique that relies on past user behavior rather than explicit features of items.
 It works by:
 Collecting user-item interactions (ratings, clicks, purchases).
 Finding similarities between users or items.
 Making predictions based on similar users/items."""

'Interview Questions & Answers\n1. Difference between User-Based and Item-Based Collaborative Filtering\ni) User-Based Collaborative Filtering: Recommends items based on users with similar preferences. It finds users who have rated similar items in the past and suggests what they liked.\nii) Item-Based Collaborative Filtering: Recommends items similar to what the user has already liked. It calculates similarities between items instead of users.\n\n2. What is Collaborative Filtering and How Does It Work?\nCollaborative Filtering is a recommendation technique that relies on past user behavior rather than explicit features of items.\n It works by:\n Collecting user-item interactions (ratings, clicks, purchases).\n Finding similarities between users or items.\n Making predictions based on similar users/items.'