In [2]:
# Import Necessary Libraries
import pandas as pd
import numpy as np
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import normalize
from annoy import AnnoyIndex
from scipy.sparse import csr_matrix
import warnings

# Suppress warnings for cleaner output
warnings.filterwarnings('ignore')

# ----------------------------
# Step 1: Load user_filtered.csv
# ----------------------------

# Define data types for user_filtered.csv
user_filtered_dtypes = {
    'user_id': 'int32',
    'anime_id': 'int32',
    'rating': 'float32'
}

# Load user_filtered.csv
try:
    user_ratings = pd.read_csv(
        'user-filtered.csv',
        dtype=user_filtered_dtypes
    )
    print("Successfully loaded user_filtered.csv.")
except MemoryError:
    print("MemoryError: Unable to load user_filtered.csv.")
except Exception as e:
    print(f"An error occurred while loading user_filtered.csv: {e}")

# ----------------------------
# Step 2: Load anime_filtered.csv
# ----------------------------

# Define data types for anime_filtered.csv
anime_filtered_dtypes = {
    'anime_id': 'int32',
    'Name': 'object',
    'sypnopsis': 'object'
}

# Load only necessary columns to save memory
try:
    anime_details = pd.read_csv(
        'anime-filtered.csv',
        usecols=['anime_id', 'Name', 'sypnopsis'],
        dtype=anime_filtered_dtypes
    )
    print("Successfully loaded anime_filtered.csv with selected columns.")
except MemoryError:
    print("MemoryError: Unable to load anime_filtered.csv even after optimization.")
except Exception as e:
    print(f"An error occurred while loading anime_filtered.csv: {e}")

# ----------------------------
# Step 3: Handle Duplicates in anime_details
# ----------------------------

# Check for duplicate anime_id in anime_details
duplicate_anime_ids = anime_details[anime_details.duplicated('anime_id', keep=False)]
if not duplicate_anime_ids.empty:
    print(f"Found {duplicate_anime_ids.shape[0]} duplicate anime_id entries in anime_details.")
    # Remove duplicates by keeping the first occurrence
    anime_details = anime_details.drop_duplicates(subset='anime_id', keep='first')
    print("Duplicates removed from anime_details.")
else:
    print("No duplicate anime_id entries found in anime_details.")

# ----------------------------
# Step 4: Filter Anime with Minimum Ratings
# ----------------------------

# Define the minimum number of ratings required
min_ratings = 50

# Calculate the number of ratings per anime
anime_rating_counts = user_ratings['anime_id'].value_counts()

# Identify anime_ids that meet the minimum rating threshold
popular_anime = anime_rating_counts[anime_rating_counts >= min_ratings].index.tolist()

# Filter user_ratings to include only popular anime
user_ratings_filtered = user_ratings[user_ratings['anime_id'].isin(popular_anime)]

print(f"Number of anime after filtering: {len(popular_anime)}")
print(f"Number of user ratings after filtering: {user_ratings_filtered.shape[0]}")

# ----------------------------
# Step 5: Merge Filtered DataFrames
# ----------------------------

# Merge the filtered user ratings with anime details
ratings_with_anime_filtered = pd.merge(
    user_ratings_filtered,
    anime_details,
    on='anime_id',
    how='left'
)

print("Successfully merged filtered user_ratings with anime_details.")
display(ratings_with_anime_filtered.head())

# ----------------------------
# Step 6: Create Mappings
# ----------------------------

# Create unique lists of anime_ids
anime_ids = ratings_with_anime_filtered['anime_id'].unique()

# Create mappings from anime_id to index and vice versa
anime_id_to_index = {anime_id: index for index, anime_id in enumerate(anime_ids)}
index_to_anime_id = {index: anime_id for anime_id, index in anime_id_to_index.items()}

# Create unique lists of user_ids
user_ids = ratings_with_anime_filtered['user_id'].unique()

# Create mapping from user_id to index
user_id_to_index = {user_id: index for index, user_id in enumerate(user_ids)}

# Map anime_ids and user_ids to indices
anime_indices = ratings_with_anime_filtered['anime_id'].map(anime_id_to_index)
user_indices = ratings_with_anime_filtered['user_id'].map(user_id_to_index)
ratings = ratings_with_anime_filtered['rating'].values

# ----------------------------
# Step 7: Construct Sparse Item-User Matrix
# ----------------------------

# Create the sparse matrix (anime-user matrix)
anime_user_matrix_sparse = csr_matrix((ratings, (anime_indices, user_indices)), 
                                      shape=(len(anime_ids), len(user_ids)))

print(f"Sparse Item-User Matrix Shape: {anime_user_matrix_sparse.shape}")

# ----------------------------
# Step 8: Dimensionality Reduction with Truncated SVD
# ----------------------------

# Define the number of latent factors (dimensions)
n_factors = 100  # Adjust based on your system's capacity and desired accuracy

# Initialize Truncated SVD
svd = TruncatedSVD(n_components=n_factors, random_state=42)

# Fit and transform the sparse item-user matrix to obtain item embeddings
item_embeddings = svd.fit_transform(anime_user_matrix_sparse)

print(f"Item Embeddings Shape: {item_embeddings.shape}")

# ----------------------------
# Step 9: Normalize Embeddings
# ----------------------------

# Normalize embeddings for cosine similarity
item_embeddings_normalized = normalize(item_embeddings, norm='l2', axis=1)

# ----------------------------
# Step 10: Build the Annoy Index
# ----------------------------

# Define the number of trees for Annoy
n_trees = 10  # More trees give higher accuracy but take longer to build

# Initialize Annoy Index
annoy_index = AnnoyIndex(n_factors, 'angular')  # 'angular' is suitable for cosine similarity

# Add item embeddings to Annoy index
for i in range(item_embeddings_normalized.shape[0]):
    annoy_index.add_item(i, item_embeddings_normalized[i])

# Build the index
annoy_index.build(n_trees)

print("Annoy index built successfully.")

# ----------------------------
# Step 11: Define the Recommendation Function
# ----------------------------

def recommend_anime_annoy(anime_title, anime_details, annoy_index, 
                         anime_id_to_index, index_to_anime_id, 
                         item_embeddings_normalized, n_neighbors=5, n_recommendations=5):
    """
    Recommend anime based on a given anime title using Annoy.
    
    Parameters:
    - anime_title (str): The title of the anime to base recommendations on.
    - anime_details (DataFrame): The anime details dataset.
    - annoy_index (AnnoyIndex): The Annoy index containing item embeddings.
    - anime_id_to_index (dict): Mapping from anime_id to matrix index.
    - index_to_anime_id (dict): Mapping from matrix index to anime_id.
    - item_embeddings_normalized (ndarray): The normalized dense item embeddings.
    - n_neighbors (int): Number of similar anime to consider.
    - n_recommendations (int): Number of anime to recommend.
    
    Returns:
    - recommendations (DataFrame): Recommended anime with synopsis.
    """
    # Find the anime_id based on the title
    anime_subset = anime_details[anime_details['Name'].str.lower() == anime_title.lower()]
    
    if anime_subset.empty:
        print("Anime title not found in the dataset.")
        return
    
    anime_id = anime_subset['anime_id'].values[0]
    anime_idx = anime_id_to_index.get(anime_id, None)
    
    if anime_idx is None:
        print("Anime index not found.")
        return
    
    # Find similar anime indices using Annoy
    similar_anime_indices = annoy_index.get_nns_by_item(anime_idx, n_neighbors + 1)[1:]  # Exclude the anime itself
    
    # Map indices back to anime_ids
    similar_anime_ids = [index_to_anime_id[idx] for idx in similar_anime_indices]
    
    # Fetch anime details for similar anime
    recommended_anime = anime_details[anime_details['anime_id'].isin(similar_anime_ids)][['Name', 'sypnopsis']]
    
    return recommended_anime.reset_index(drop=True)


Successfully loaded user_filtered.csv.
Successfully loaded anime_filtered.csv with selected columns.
No duplicate anime_id entries found in anime_details.
Number of anime after filtering: 16272
Number of user ratings after filtering: 109179072
Successfully merged filtered user_ratings with anime_details.


Unnamed: 0,user_id,anime_id,rating,Name,sypnopsis
0,0,67,9.0,Basilisk: Kouga Ninpou Chou,"For centuries, the Iga and Kouga ninja clans h..."
1,0,6702,7.0,Fairy Tail,"In the mystical land of Fiore, magic exists as..."
2,0,242,10.0,Gokusen,"Kumiko Yamaguchi is smart, enthusiastic, and r..."
3,0,4898,0.0,Kuroshitsuji,"Young Ciel Phantomhive is known as ""the Queen'..."
4,0,21,10.0,One Piece,"Gol D. Roger was known as the ""Pirate King,"" t..."


Sparse Item-User Matrix Shape: (16272, 325770)
Item Embeddings Shape: (16272, 100)
Annoy index built successfully.


In [3]:
# Example Usage
favorite_anime_title = "Toriko"  # Replace with the desired anime title

recommended_anime = recommend_anime_annoy(
    anime_title=favorite_anime_title,
    anime_details=anime_details,
    annoy_index=annoy_index,
    anime_id_to_index=anime_id_to_index,
    index_to_anime_id=index_to_anime_id,
    item_embeddings_normalized=item_embeddings_normalized,
    n_neighbors=5,
    n_recommendations=5
)

print(f"Top 5 Recommendations based on '{favorite_anime_title}':\n")
display(recommended_anime)


Top 5 Recommendations based on 'Toriko':



Unnamed: 0,Name,sypnopsis
0,Eyeshield 21,Sena is like any other shy kid starting high s...
1,Grappler Baki (TV),"Ever since he was born, Baki Hanma has always ..."
2,Katekyo Hitman Reborn!,There is no putting it lightly—Tsunayoshi Sawa...
3,Cardfight!! Vanguard,Cardfight!! Vanguard features a world where th...
4,Kingdom,"China’s Warring States period, a raging dragon..."


In [4]:
# Import Necessary Libraries
import pandas as pd
import numpy as np
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import normalize
from annoy import AnnoyIndex
from scipy.sparse import csr_matrix
import warnings
warnings.filterwarnings('ignore')

# ----------------------------
# Step 1: Load user_filtered.csv
# ----------------------------

# Define data types for user_filtered.csv
user_filtered_dtypes = {
    'user_id': 'int32',
    'anime_id': 'int32',
    'rating': 'float32'
}

# Load user_filtered.csv
try:
    user_ratings = pd.read_csv(
        'user-filtered.csv',
        dtype=user_filtered_dtypes
    )
    print("Successfully loaded user_filtered.csv.")
except MemoryError:
    print("MemoryError: Unable to load user_filtered.csv.")
except Exception as e:
    print(f"An error occurred while loading user_filtered.csv: {e}")

# ----------------------------
# Step 2: Load anime_filtered.csv
# ----------------------------

# Define data types for anime_filtered.csv
anime_filtered_dtypes = {
    'anime_id': 'int32',
    'Name': 'object',
    'sypnopsis': 'object'
}

# Load only necessary columns to save memory
try:
    anime_details = pd.read_csv(
        'anime-filtered.csv',
        usecols=['anime_id', 'Name', 'sypnopsis'],
        dtype=anime_filtered_dtypes
    )
    print("Successfully loaded anime_filtered.csv with selected columns.")
except MemoryError:
    print("MemoryError: Unable to load anime_filtered.csv even after optimization.")
except Exception as e:
    print(f"An error occurred while loading anime_filtered.csv: {e}")

# ----------------------------
# Step 3: Handle Duplicates in anime_details
# ----------------------------

# Check for duplicate anime_id in anime_details
duplicate_anime_ids = anime_details[anime_details.duplicated('anime_id', keep=False)]
if not duplicate_anime_ids.empty:
    print(f"Found {duplicate_anime_ids.shape[0]} duplicate anime_id entries in anime_details.")
    # Remove duplicates by keeping the first occurrence
    anime_details = anime_details.drop_duplicates(subset='anime_id', keep='first')
    print("Duplicates removed from anime_details.")
else:
    print("No duplicate anime_id entries found in anime_details.")

# ----------------------------
# Step 4: Filter Anime with Minimum Ratings
# ----------------------------

# Define the minimum number of ratings required
min_ratings = 50

# Calculate the number of ratings per anime
anime_rating_counts = user_ratings['anime_id'].value_counts()

# Identify anime_ids that meet the minimum rating threshold
popular_anime = anime_rating_counts[anime_rating_counts >= min_ratings].index.tolist()

# Filter user_ratings to include only popular anime
user_ratings_filtered = user_ratings[user_ratings['anime_id'].isin(popular_anime)]

print(f"Number of anime after filtering: {len(popular_anime)}")
print(f"Number of user ratings after filtering: {user_ratings_filtered.shape[0]}")

# ----------------------------
# Step 5: Merge Filtered DataFrames
# ----------------------------

# Merge the filtered user ratings with anime details
ratings_with_anime_filtered = pd.merge(
    user_ratings_filtered,
    anime_details,
    on='anime_id',
    how='left'
)

print("Successfully merged filtered user_ratings with anime_details.")
display(ratings_with_anime_filtered.head())

# ----------------------------
# Step 6: Create Mappings
# ----------------------------

# Create unique lists of user_ids and anime_ids
user_ids = ratings_with_anime_filtered['user_id'].unique()
anime_ids = ratings_with_anime_filtered['anime_id'].unique()

# Create mappings from ids to indices
user_id_to_index = {user_id: index for index, user_id in enumerate(user_ids)}
anime_id_to_index = {anime_id: index for index, anime_id in enumerate(anime_ids)}
index_to_anime_id = {index: anime_id for anime_id, index in anime_id_to_index.items()}

# Map user_ids and anime_ids to indices
user_indices = ratings_with_anime_filtered['user_id'].map(user_id_to_index)
anime_indices = ratings_with_anime_filtered['anime_id'].map(anime_id_to_index)
ratings = ratings_with_anime_filtered['rating'].values

# ----------------------------
# Step 7: Construct Sparse User-Item Matrix
# ----------------------------

# Create the sparse matrix
user_item_matrix_sparse = csr_matrix((ratings, (user_indices, anime_indices)), 
                                     shape=(len(user_ids), len(anime_ids)))

print(f"Sparse User-Item Matrix Shape: {user_item_matrix_sparse.shape}")

# ----------------------------
# Step 8: Dimensionality Reduction with Truncated SVD
# ----------------------------

# Define the number of latent factors (dimensions)
n_factors = 100  # Adjust based on your system's capacity and desired accuracy

# Initialize Truncated SVD
svd = TruncatedSVD(n_components=n_factors, random_state=42)

# Fit and transform the sparse user-item matrix to obtain user embeddings
user_embeddings = svd.fit_transform(user_item_matrix_sparse)

print(f"User Embeddings Shape: {user_embeddings.shape}")

# ----------------------------
# Step 9: Normalize Embeddings
# ----------------------------

# Normalize embeddings for cosine similarity
user_embeddings_normalized = normalize(user_embeddings, norm='l2', axis=1)

# ----------------------------
# Step 10: Build the Annoy Index
# ----------------------------

# Define the number of trees for Annoy
n_trees = 10  # More trees give higher accuracy but take longer to build

# Initialize Annoy Index
annoy_index = AnnoyIndex(n_factors, 'angular')  # 'angular' is suitable for cosine similarity

# Add user embeddings to Annoy index
for i in range(user_embeddings_normalized.shape[0]):
    annoy_index.add_item(i, user_embeddings_normalized[i])

# Build the index
annoy_index.build(n_trees)

print("Annoy index built successfully.")

# ----------------------------
# Step 11: Define the Recommendation Function
# ----------------------------

def recommend_anime_annoy(user_id, user_item_matrix, anime_details, annoy_index, 
                         user_id_to_index, anime_id_to_index, 
                         index_to_anime_id, user_embeddings_normalized, n_neighbors=5, n_recommendations=5):
    """
    Recommend anime to a user based on Annoy.

    Parameters:
    - user_id (int): The ID of the user to make recommendations for.
    - user_item_matrix (csr_matrix): The user-item rating matrix.
    - anime_details (DataFrame): The anime details dataset.
    - annoy_index (AnnoyIndex): The Annoy index containing user embeddings.
    - user_id_to_index (dict): Mapping from user_id to matrix index.
    - anime_id_to_index (dict): Mapping from anime_id to matrix index.
    - index_to_anime_id (dict): Mapping from matrix index to anime_id.
    - user_embeddings_normalized (ndarray): The normalized dense user embeddings.
    - n_neighbors (int): Number of similar users to consider.
    - n_recommendations (int): Number of anime to recommend.

    Returns:
    - recommendations (DataFrame): Recommended anime with synopsis.
    """
    if user_id not in user_id_to_index:
        print("User ID not found in the dataset.")
        return

    # Get the index of the user
    user_idx = user_id_to_index[user_id]

    # Find similar users
    similar_users_indices = annoy_index.get_nns_by_item(user_idx, n_neighbors + 1)[1:]  # Exclude the user itself

    # Aggregate ratings from similar users
    similar_users_ratings = user_item_matrix[similar_users_indices]

    # Compute the mean ratings for each anime
    mean_ratings = similar_users_ratings.mean(axis=0).A1  # Convert to 1D array

    # Convert to a Series for easier manipulation
    mean_ratings_series = pd.Series(mean_ratings, index=index_to_anime_id.keys())

    # Get anime_ids already rated by the user
    user_ratings = user_item_matrix[user_idx]
    user_rated_anime = user_ratings.nonzero()[1]
    user_rated_anime_ids = [index_to_anime_id[idx] for idx in user_rated_anime]

    # Exclude anime already rated by the user
    recommendations = mean_ratings_series.drop(user_rated_anime_ids, errors='ignore')

    # Get top N recommendations
    top_recommendations = recommendations.sort_values(ascending=False).head(n_recommendations).index.tolist()

    # Fetch anime details
    recommended_anime = anime_details[anime_details['anime_id'].isin(top_recommendations)][['Name', 'sypnopsis']]

    return recommended_anime.reset_index(drop=True)

# ----------------------------
# Step 12: Generate Recommendations for a Specific User
# ----------------------------

# Example Usage
user_id_input = 1  # Replace with the desired user_id

recommended_anime_annoy = recommend_anime_annoy(
    user_id=user_id_input,
    user_item_matrix=user_item_matrix_sparse,
    anime_details=anime_details,
    annoy_index=annoy_index,
    user_id_to_index=user_id_to_index,
    anime_id_to_index=anime_id_to_index,
    index_to_anime_id=index_to_anime_id,
    user_embeddings_normalized=user_embeddings_normalized,
    n_neighbors=5,
    n_recommendations=5
)

print(f"Top 5 Recommendations for User ID {user_id_input}:\n")
display(recommended_anime_annoy)


Successfully loaded user_filtered.csv.
Successfully loaded anime_filtered.csv with selected columns.
No duplicate anime_id entries found in anime_details.
Number of anime after filtering: 16272
Number of user ratings after filtering: 109179072
Successfully merged filtered user_ratings with anime_details.


Unnamed: 0,user_id,anime_id,rating,Name,sypnopsis
0,0,67,9.0,Basilisk: Kouga Ninpou Chou,"For centuries, the Iga and Kouga ninja clans h..."
1,0,6702,7.0,Fairy Tail,"In the mystical land of Fiore, magic exists as..."
2,0,242,10.0,Gokusen,"Kumiko Yamaguchi is smart, enthusiastic, and r..."
3,0,4898,0.0,Kuroshitsuji,"Young Ciel Phantomhive is known as ""the Queen'..."
4,0,21,10.0,One Piece,"Gol D. Roger was known as the ""Pirate King,"" t..."


Sparse User-Item Matrix Shape: (325770, 16272)
User Embeddings Shape: (325770, 100)
Annoy index built successfully.
Top 5 Recommendations for User ID 1:



Unnamed: 0,Name,sypnopsis
0,DearS,"One year ago, a UFO containing 150 aliens cras..."
1,El Hazard: The Wanderers,High school science-whiz Makoto Mizuhara is wo...
2,Fruits Basket,fter the accident in which she lost her mother...
3,Gensoumaden Saiyuuki,"any years ago, humans and demons lived in harm..."
4,Sakura Taisen,Sakura travels to the capital with aspirations...


 With Confidence scores

In [2]:
import pandas as pd
import numpy as np
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import normalize
from annoy import AnnoyIndex
from scipy.sparse import csr_matrix
from collections import defaultdict
import warnings

# Suppress warnings for cleaner output
warnings.filterwarnings('ignore')


In [4]:
import pandas as pd

# Load user ratings
user_ratings = pd.read_csv('user-filtered.csv')  # Columns: user_id, anime_id, rating

# Load anime details
anime_details = pd.read_csv('anime-filtered.csv')  # Columns: anime_id, Name, sypnopsis


In [5]:
from sklearn.model_selection import train_test_split

# Define test size
test_size = 0.2  # 20% for testing

# Perform the split
train, test = train_test_split(user_ratings, test_size=test_size, random_state=42)

print(f"Training set size: {train.shape[0]}")
print(f"Test set size: {test.shape[0]}")


Training set size: 87379797
Test set size: 21844950


In [6]:
# Retain only those anime_ids in the test set that are present in the training set
test = test[test['anime_id'].isin(train['anime_id'].unique())]

print(f"Adjusted Test set size: {test.shape[0]}")


Adjusted Test set size: 21844950


In [7]:
# Find users present in both train and test
common_users = set(train['user_id']).intersection(set(test['user_id']))

# Retain only interactions from common users
train = train[train['user_id'].isin(common_users)]
test = test[test['user_id'].isin(common_users)]

print(f"Number of common users: {len(common_users)}")
print(f"Training set size after user filtering: {train.shape[0]}")
print(f"Test set size after user filtering: {test.shape[0]}")


Number of common users: 318057
Training set size after user filtering: 87353702
Test set size after user filtering: 21844119


In [12]:
pip install matplotlib


Collecting matplotlib
  Downloading matplotlib-3.9.2-cp39-cp39-win_amd64.whl.metadata (11 kB)
Collecting contourpy>=1.0.1 (from matplotlib)
  Downloading contourpy-1.3.0-cp39-cp39-win_amd64.whl.metadata (5.4 kB)
Collecting cycler>=0.10 (from matplotlib)
  Using cached cycler-0.12.1-py3-none-any.whl.metadata (3.8 kB)
Collecting fonttools>=4.22.0 (from matplotlib)
  Downloading fonttools-4.54.1-cp39-cp39-win_amd64.whl.metadata (167 kB)
Collecting kiwisolver>=1.3.1 (from matplotlib)
  Downloading kiwisolver-1.4.7-cp39-cp39-win_amd64.whl.metadata (6.4 kB)
Collecting pillow>=8 (from matplotlib)
  Downloading pillow-10.4.0-cp39-cp39-win_amd64.whl.metadata (9.3 kB)
Collecting pyparsing>=2.3.1 (from matplotlib)
  Using cached pyparsing-3.1.4-py3-none-any.whl.metadata (5.1 kB)
Collecting importlib-resources>=3.2.0 (from matplotlib)
  Downloading importlib_resources-6.4.5-py3-none-any.whl.metadata (4.0 kB)
Downloading matplotlib-3.9.2-cp39-cp39-win_amd64.whl (7.8 MB)
   -------------------------

In [15]:
# Define relevant threshold
relevant_threshold = 4
test_relevant = test[test['rating'] >= relevant_threshold]

In [16]:
# Create unique lists of anime_ids and user_ids
anime_ids = train['anime_id'].unique()
user_ids = train['user_id'].unique()

# Create mappings from anime_id to index and vice versa
anime_id_to_index = {anime_id: idx for idx, anime_id in enumerate(anime_ids)}
index_to_anime_id = {idx: anime_id for anime_id, idx in anime_id_to_index.items()}

# Create mapping from user_id to index
user_id_to_index = {user_id: idx for idx, user_id in enumerate(user_ids)}

# Map anime_ids and user_ids to indices
anime_indices_train = train['anime_id'].map(anime_id_to_index)
user_indices_train = train['user_id'].map(user_id_to_index)
ratings_train = train['rating'].values

# Create the sparse matrix (anime-user matrix)
num_anime = len(anime_ids)
num_users = len(user_ids)
anime_user_matrix_sparse = csr_matrix((ratings_train, (anime_indices_train, user_indices_train)),
                                      shape=(num_anime, num_users))


In [17]:
# Define the number of latent factors (dimensions)
n_factors = 100  # Adjust based on your system's capacity and desired accuracy

# Initialize Truncated SVD
svd = TruncatedSVD(n_components=n_factors, random_state=42)

# Fit and transform the sparse anime-user matrix to obtain item embeddings
item_embeddings = svd.fit_transform(anime_user_matrix_sparse)

# Normalize embeddings for cosine similarity
item_embeddings_normalized = normalize(item_embeddings, norm='l2', axis=1)


In [18]:
# Define the number of trees for Annoy
n_trees = 10  # More trees give higher accuracy but take longer to build

# Initialize Annoy Index
annoy_index = AnnoyIndex(n_factors, 'angular')  # 'angular' is suitable for cosine similarity

# Add item embeddings to Annoy index
for i in range(item_embeddings_normalized.shape[0]):
    annoy_index.add_item(i, item_embeddings_normalized[i])

# Build the index
annoy_index.build(n_trees)

print("Annoy index built successfully.")


Annoy index built successfully.


In [19]:
def recommend_anime_with_confidence(anime_title, anime_details, annoy_index, 
                                    anime_id_to_index, index_to_anime_id, 
                                    item_embeddings_normalized, n_neighbors=5, n_recommendations=5):
    """
    Recommend anime based on a given anime title using Annoy, along with confidence scores.
    
    Parameters:
    - anime_title (str): The title of the anime to base recommendations on.
    - anime_details (DataFrame): The anime details dataset.
    - annoy_index (AnnoyIndex): The Annoy index containing item embeddings.
    - anime_id_to_index (dict): Mapping from anime_id to matrix index.
    - index_to_anime_id (dict): Mapping from matrix index to anime_id.
    - item_embeddings_normalized (ndarray): The normalized dense item embeddings.
    - n_neighbors (int): Number of similar anime to consider.
    - n_recommendations (int): Number of anime to recommend.
    
    Returns:
    - recommendations (DataFrame): Recommended anime with synopsis and confidence scores.
    """
    # Find the anime_id based on the title
    anime_subset = anime_details[anime_details['Name'].str.lower() == anime_title.lower()]
    
    if anime_subset.empty:
        print("Anime title not found in the dataset.")
        return pd.DataFrame()
    
    anime_id = anime_subset['anime_id'].values[0]
    anime_idx = anime_id_to_index.get(anime_id, None)
    
    if anime_idx is None:
        print("Anime index not found.")
        return pd.DataFrame()
    
    # Find similar anime indices using Annoy
    similar_anime_indices, distances = annoy_index.get_nns_by_item(
        anime_idx, 
        n_neighbors + 1,  # +1 to exclude the anime itself
        include_distances=True
    )
    
    # Exclude the anime itself
    similar_anime_indices = similar_anime_indices[1:]
    distances = distances[1:]
    
    # Convert angular distances back to cosine similarity
    # Cosine similarity = 1 - (distance^2 / 2)
    cosine_similarities = 1 - (np.array(distances)**2) / 2
    
    # Map indices back to anime_ids
    similar_anime_ids = [index_to_anime_id[idx] for idx in similar_anime_indices]
    
    # Fetch anime details for similar anime
    recommended_anime = anime_details[anime_details['anime_id'].isin(similar_anime_ids)].copy()
    
    # Add confidence scores
    recommended_anime['Confidence_Score'] = cosine_similarities
    
    # Sort by confidence score descending
    recommended_anime = recommended_anime.sort_values(by='Confidence_Score', ascending=False)
    
    # Select top N recommendations
    recommended_anime = recommended_anime.head(n_recommendations)
    
    return recommended_anime[['Name', 'sypnopsis', 'Confidence_Score']].reset_index(drop=True)


In [20]:
# Example Usage
favorite_anime_title = "Toriko"  # Replace with an existing title in your dataset

recommended_anime = recommend_anime_with_confidence(
    anime_title=favorite_anime_title,
    anime_details=anime_details,
    annoy_index=annoy_index,
    anime_id_to_index=anime_id_to_index,
    index_to_anime_id=index_to_anime_id,
    item_embeddings_normalized=item_embeddings_normalized,
    n_neighbors=5,
    n_recommendations=5
)

print(f"Top 5 Recommendations based on '{favorite_anime_title}':\n")
print(recommended_anime)


Top 5 Recommendations based on 'Toriko':

                        Name  \
0               Eyeshield 21   
1     Katekyo Hitman Reborn!   
2                  Beelzebub   
3  Phi Brain: Kami no Puzzle   
4                    Kingdom   

                                           sypnopsis  Confidence_Score  
0  Sena is like any other shy kid starting high s...          0.801258  
1  There is no putting it lightly—Tsunayoshi Sawa...          0.794274  
2  Ishiyama High is a school populated entirely b...          0.788396  
3  Kaito Daimon would be a completely average hig...          0.781528  
4  China’s Warring States period, a raging dragon...          0.773314  
