In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
import numpy as np

In [2]:
 #Load the dataset
anime_df = pd.read_csv('anime.csv')
anime_df.head()

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,793665
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.25,114262
3,9253,Steins;Gate,"Sci-Fi, Thriller",TV,24,9.17,673572
4,9969,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.16,151266


In [3]:
anime_df.isnull().sum()

Unnamed: 0,0
anime_id,0
name,0
genre,62
type,25
episodes,0
rating,230
members,0


In [4]:
# Impute missing values in 'genre' with the mode (most frequent genre)
anime_df['genre'].fillna(anime_df['genre'].mode()[0], inplace=True)
# Impute missing values in 'type' with the mode (most frequent type)
anime_df['type'].fillna(anime_df['type'].mode()[0], inplace=True)
# Impute missing values in 'rating' with the mean rating
anime_df['rating'].fillna(anime_df['rating'].mean(), inplace=True)
# Check if any missing values remain
anime_df.isnull().sum()


Unnamed: 0,0
anime_id,0
name,0
genre,0
type,0
episodes,0
rating,0
members,0


In [None]:
anime_df.describe()

Unnamed: 0,anime_id,rating,members
count,12294.0,12294.0,12294.0
mean,14058.221653,6.473902,18071.34
std,11455.294701,1.017096,54820.68
min,1.0,1.67,5.0
25%,3484.25,5.9,225.0
50%,10260.5,6.55,1550.0
75%,24794.5,7.17,9437.0
max,34527.0,10.0,1013917.0


In [6]:
# Count the number of occurrences of each genre
genre_counts = anime_df['genre'].value_counts()

# Identify rare genres that appear fewer than 2 times
rare_genres = genre_counts[genre_counts < 2].index

# Group rare genres into 'Other'
anime_df['genre'] = anime_df['genre'].apply(lambda x: 'Other' if x in rare_genres else x)

# Proceed with the stratified split using the modified dataset
train_df, test_df = train_test_split(
    anime_df, test_size=0.2, random_state=42, stratify=anime_df['genre']
)

# Reset indices
train_df = train_df.reset_index(drop=True)
test_df = test_df.reset_index(drop=True)

In [7]:
# Convert 'genre' column to numerical representations using TF-IDF
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix_train = tfidf.fit_transform(train_df['genre'])

# Normalize the 'rating' column
scaler = MinMaxScaler()
train_df['rating_norm'] = scaler.fit_transform(train_df[['rating']])

# Combine the features into a single matrix
features_matrix_train = np.hstack([tfidf_matrix_train.toarray(), train_df[['rating_norm']].values])


In [8]:
# Compute the cosine similarity matrix
cosine_sim_train = cosine_similarity(features_matrix_train, features_matrix_train)


In [9]:
# Debugging the cosine similarity matrix
print("Cosine similarity matrix shape:", cosine_sim_train.shape)
print("Sample cosine similarity values:")
print(cosine_sim_train[:5, :5])  # Display a small part of the matrix


Cosine similarity matrix shape: (9835, 9835)
Sample cosine similarity values:
[[1.         0.43022802 0.5805908  0.4801796  0.53319033]
 [0.43022802 1.         0.37130342 0.20658672 0.47187681]
 [0.5805908  0.37130342 1.         0.42411822 0.3095654 ]
 [0.4801796  0.20658672 0.42411822 1.         0.58727018]
 [0.53319033 0.47187681 0.3095654  0.58727018 1.        ]]


In [10]:
# Verify the genre transformation
print(anime_df['genre'].value_counts())

# Check the first few rows of the processed data
print(anime_df.head())


genre
Other                                                        1973
Hentai                                                        885
Comedy                                                        523
Music                                                         301
Kids                                                          199
                                                             ... 
Action, Mecha, Romance, Sci-Fi, Super Power, Supernatural       2
Action, Demons, Horror, Supernatural                            2
Demons, Horror, Sci-Fi, Supernatural                            2
Magic, Music, Romance, School                                   2
Comedy, Hentai, Historical                                      2
Name: count, Length: 1292, dtype: int64
   anime_id                              name  \
0     32281                    Kimi no Na wa.   
1      5114  Fullmetal Alchemist: Brotherhood   
2     28977                          Gintama°   
3      9253                     

In [11]:
def recommend_anime_train(anime_title, df=train_df, cosine_sim=cosine_sim_train, top_n=10):
    if anime_title not in df['name'].values:
        return []

    indices = df[df['name'] == anime_title].index
    if len(indices) == 0 or indices[0] >= len(cosine_sim):
        return []

    idx = indices[0]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # Skip the first one since it is the anime itself
    sim_scores = sim_scores[1:top_n+1]

    anime_indices = [i[0] for i in sim_scores if i[1] > 0]
    return df.iloc[anime_indices]['name'].tolist() if anime_indices else []


In [12]:
# Example with  anime
recommendations = recommend_anime_train('One Piece', top_n=10)
recommendations

['One Piece: Episode of Nami - Koukaishi no Namida to Nakama no Kizuna',
 'One Piece: Episode of Sabo - 3 Kyoudai no Kizuna Kiseki no Saikai to Uketsugareru Ishi',
 'One Piece Film: Strong World Episode 0',
 'One Piece: Episode of Luffy - Hand Island no Bouken',
 'One Piece Movie 4: Dead End no Bouken',
 'One Piece Movie 9: Episode of Chopper Plus - Fuyu ni Saku, Kiseki no Sakura',
 'One Piece: Adventure of Nebulandia',
 'One Piece: Umi no Heso no Daibouken-hen',
 'One Piece Movie 5: Norowareta Seiken',
 'One Piece: Oounabara ni Hirake! Dekkai Dekkai Chichi no Yume!']

In [14]:
def evaluate_recommendation_system_v2(df, top_n=20):
    precision_list = []
    recall_list = []
    f1_list = []

    # Create a list of all anime titles
    all_anime_titles = df['name'].tolist()

    for target_anime in all_anime_titles:
        actual_anime_list = recommend_anime_train(target_anime, top_n=top_n)
        if not actual_anime_list:
            continue

        actual_anime_set = set(actual_anime_list)
        target_anime_set = set([target_anime])

        precision = len(actual_anime_set.intersection(target_anime_set)) / len(actual_anime_set) if actual_anime_set else 0
        recall = len(actual_anime_set.intersection(target_anime_set)) / len(target_anime_set) if target_anime_set else 0
        f1 = 2 * (precision * recall) / (precision + recall) if precision + recall > 0 else 0

        precision_list.append(precision)
        recall_list.append(recall)
        f1_list.append(f1)

    avg_precision = np.mean(precision_list) if precision_list else 0
    avg_recall = np.mean(recall_list) if recall_list else 0
    avg_f1 = np.mean(f1_list) if f1_list else 0

    return avg_precision, avg_recall, avg_f1

# Evaluate the system
precision, recall, f1 = evaluate_recommendation_system_v2(anime_df, top_n=10)
print(f'Precision: {precision:.4f}, Recall: {recall:.4f}, F1-Score: {f1:.4f}')


Precision: 0.0116, Recall: 0.1158, F1-Score: 0.0211


To improve the performance of  recommendation system, consider these suggestions:

Feature Enhancement: Include additional features beyond genre and rating. For example, use descriptions, tags, and user reviews to enrich the feature set. Leveraging embeddings from models like Word2Vec or BERT can capture more nuanced semantic similarities.

Advanced Similarity Metrics: Experiment with different similarity measures or distance metrics. Besides cosine similarity, metrics like Jaccard similarity or Euclidean distance might offer better performance depending on the data distribution.

Hybrid Models: Combine collaborative filtering with content-based filtering. Collaborative filtering uses user interactions, while content-based filtering leverages item features, providing a more comprehensive recommendation approach.

Increase Recommendation Diversity: Ensure that the recommendation system provides a diverse set of recommendations. You might implement diversity strategies to avoid recommending very similar items repeatedly.

Cross-Validation: Apply cross-validation techniques to evaluate the recommendation system on different subsets of data. This provides a more reliable estimate of performance and helps in identifying overfitting or underfitting.

Data Quality: Clean and preprocess the data thoroughly. Ensure there are no inconsistencies or errors that could negatively impact the recommendations.

1) User-Based Collaborative Filtering: Focus: Finds users with similar tastes to the target user. Recommendation: Suggests items liked by those similar users. Pros: Good for diverse item sets, captures individual preferences. Cons: Can struggle with new users (cold start), computationally expensive with large user bases.

Item-Based Collaborative Filtering: Focus: Calculates similarity between items based on user ratings. Recommendation: If a user likes item A, suggest similar item B. Pros: Handles new users better, pre-computable similarity matrix for efficiency. Cons: Less personalized, may not capture nuanced user preferences as well.

2) Collaborative filtering is a technique used in recommendation systems. Analyzing user behavior: This includes ratings, purchases, or interactions with items. Identifying similar users or items: Patterns in behavior are used to find users with similar tastes or items with similar characteristics. Generating recommendations: Based on the preferences of similar users, the system suggests items they might like. For example, if User x and User y both enjoy several of the same movies, the system might recommend a movie that User x liked to User y, assuming they have similar tastes.

User-based collaborative filtering recommends items by finding users with similar preferences and suggesting items that these similar users liked. It focuses on identifying user "neighborhoods" and assumes that if users agree on past items, they'll likely agree in the future.

 In contrast, item-based collaborative filtering recommends items by analyzing the similarity between items, suggesting items that are similar to those the user has already liked or interacted with. It focuses on item "neighborhoods," assuming that if items have been liked together by many users, they will likely be liked together in the future.

  Item-based filtering is often more scalable, as item similarity is generally more stable over time.
  
  User-based filtering can suffer from the "cold start" problem, especially with new users, while item-based filtering can better handle new user scenarios but struggles with new items.


Collaborative filtering is a technique used in recommendation systems to predict a user's interests by analyzing preferences and behaviors of similar users or items. It works by leveraging the collective knowledge of a large group of users, assuming that if users agreed on certain items in the past, they'll likely agree on other items in the future.

There are two main types: user-based collaborative filtering, which recommends items based on similarities between users, and item-based collaborative filtering, which recommends items based on similarities between items. The system builds a matrix of user-item interactions (like ratings or clicks) and uses this data to find patterns, generating personalized recommendations without needing explicit content information about the items themselves.