In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

# Load the dataset
anime_df = pd.read_csv('anime.csv')

# Fill missing values in the genre column with an empty string
anime_df['genre'] = anime_df['genre'].fillna('')

# Define a function to clean the genres
def clean_genres(genres):
    if isinstance(genres, str):
        return genres.replace(", ", " ")
    else:
        return ""

# Apply the function to clean the genres
anime_df['cleaned_genre'] = anime_df['genre'].apply(clean_genres)

# Split the data into training and test sets
train_df, test_df = train_test_split(anime_df, test_size=0.2, random_state=42)

# Create the TfidfVectorizer instance to capture genre importance better
tfidf_vectorizer = TfidfVectorizer()

# Fit and transform the cleaned genres using TF-IDF
tfidf_matrix = tfidf_vectorizer.fit_transform(anime_df['cleaned_genre'])

# Compute the cosine similarity matrix using TF-IDF
cosine_sim_tfidf = cosine_similarity(tfidf_matrix, tfidf_matrix)

# Function to get recommendations based on anime name using TF-IDF cosine similarity
def get_recommendations_tfidf(anime_name, cosine_sim=cosine_sim_tfidf, num_recommendations=10):
    if anime_name not in anime_df['name'].values:
        return []

    idx = anime_df[anime_df['name'] == anime_name].index[0]
    sim_scores = list(enumerate(cosine_sim_tfidf[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:num_recommendations+1]
    anime_indices = [i[0] for i in sim_scores]
    return anime_df['name'].iloc[anime_indices]

# Evaluation metric
def precision_at_k(recommended_list, relevant_list, k):
    recommended_set = set(recommended_list[:k])
    relevant_set = set(relevant_list)
    common_items = recommended_set.intersection(relevant_set)
    return len(common_items) / k

# Relevant list for 'Naruto' for evaluation purposes
relevant_animes = ['One Piece', 'Attack on Titan', 'My Hero Academia', 'Dragon Ball', 'Fairy Tail']

# Evaluate the precision on the test set using TF-IDF
def evaluate_precision_tfidf(test_df, k=5):
    precisions = []

    for anime_name in test_df['name'].unique():
        if anime_name in anime_df['name'].values:
            relevant_animes = test_df[test_df['name'] == anime_name]['name'].values
            recommended_animes = get_recommendations_tfidf(anime_name, num_recommendations=k)
            
            if not recommended_animes.empty and len(relevant_animes) > 0:
                precision = precision_at_k(recommended_animes, relevant_animes, k)
                precisions.append(precision)
            else:
                print(f"No recommendations or relevant animes for {anime_name}")

    return np.mean(precisions) if precisions else 0.0

# Calculate the mean precision at k for the test set using TF-IDF
k=5
mean_precision_tfidf = evaluate_precision_tfidf(test_df)
print(f'Mean Precision at {k} using TF-IDF: {mean_precision_tfidf:.2f}')


Mean Precision at 5 using TF-IDF: 0.06


In [2]:
anime_df.columns

Index(['anime_id', 'name', 'genre', 'type', 'episodes', 'rating', 'members',
       'cleaned_genre'],
      dtype='object')

In [3]:
import pandas as pd

# Load the dataset
anime_df = pd.read_csv('anime.csv')

# Display the first few rows of the dataset
print(anime_df.head())


   anime_id                              name  \
0     32281                    Kimi no Na wa.   
1      5114  Fullmetal Alchemist: Brotherhood   
2     28977                          Gintama°   
3      9253                       Steins;Gate   
4      9969                     Gintama&#039;   

                                               genre   type episodes  rating  \
0               Drama, Romance, School, Supernatural  Movie        1    9.37   
1  Action, Adventure, Drama, Fantasy, Magic, Mili...     TV       64    9.26   
2  Action, Comedy, Historical, Parody, Samurai, S...     TV       51    9.25   
3                                   Sci-Fi, Thriller     TV       24    9.17   
4  Action, Comedy, Historical, Parody, Samurai, S...     TV       51    9.16   

   members  
0   200630  
1   793665  
2   114262  
3   673572  
4   151266  


In [4]:
# Summary statistics
print(anime_df.describe())

# Data types of columns
print(anime_df.dtypes)


           anime_id        rating       members
count  12294.000000  12064.000000  1.229400e+04
mean   14058.221653      6.473902  1.807134e+04
std    11455.294701      1.026746  5.482068e+04
min        1.000000      1.670000  5.000000e+00
25%     3484.250000      5.880000  2.250000e+02
50%    10260.500000      6.570000  1.550000e+03
75%    24794.500000      7.180000  9.437000e+03
max    34527.000000     10.000000  1.013917e+06
anime_id      int64
name         object
genre        object
type         object
episodes     object
rating      float64
members       int64
dtype: object


In [5]:
# Define a function to clean the genres
def clean_genres(genres):
    if isinstance(genres, str):
        return genres.replace(", ", " ")
    else:
        return ""

# Apply the function to clean the genres
anime_df['cleaned_genre'] = anime_df['genre'].apply(clean_genres)


In [6]:
# Check for missing values in the genre column
print(anime_df['genre'].isna().sum())

# Fill missing values in the genre column with an empty string before applying the function
anime_df['genre'] = anime_df['genre'].fillna('')

# Apply the function to clean the genres
anime_df['cleaned_genre'] = anime_df['genre'].apply(clean_genres)


62


In [7]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Create the CountVectorizer instance
vectorizer = CountVectorizer()

# Fit and transform the cleaned genres
genre_matrix = vectorizer.fit_transform(anime_df['cleaned_genre'])

# Compute the cosine similarity matrix
cosine_sim = cosine_similarity(genre_matrix, genre_matrix)


In [8]:
# Function to get recommendations based on anime name
def get_recommendations(anime_name, cosine_sim=cosine_sim):
    # Get the index of the anime that matches the name
    idx = anime_df[anime_df['name'] == anime_name].index[0]

    # Get the pairwise similarity scores of all animes with that anime
    sim_scores = list(enumerate(cosine_sim[idx]))

    # Sort the animes based on the similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # Get the scores of the 5 most similar animes
    sim_scores = sim_scores[1:6]

    # Get the anime indices
    anime_indices = [i[0] for i in sim_scores]

    # Return the top 5 most similar animes
    return anime_df['name'].iloc[anime_indices]


In [9]:
# Function to get recommendations based on anime name
def get_recommendations(anime_name, cosine_sim=cosine_sim):
    # Check if the anime name exists in the dataset
    if anime_name not in anime_df['name'].values:
        return f"Anime '{anime_name}' not found in the dataset."

    # Get the index of the anime that matches the name
    idx = anime_df[anime_df['name'] == anime_name].index[0]

    # Get the pairwise similarity scores of all animes with that anime
    sim_scores = list(enumerate(cosine_sim[idx]))

    # Sort the animes based on the similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # Get the scores of the 5 most similar animes
    sim_scores = sim_scores[1:6]

    # Get the anime indices
    anime_indices = [i[0] for i in sim_scores]

    # Return the top 5 most similar animes
    return anime_df['name'].iloc[anime_indices]

# Test the recommendation system
print("Recommendations for 'Naruto':\n", get_recommendations('Naruto'))


Recommendations for 'Naruto':
 615                                    Naruto: Shippuuden
841                                                Naruto
1103    Boruto: Naruto the Movie - Naruto ga Hokage ni...
1343                                          Naruto x UT
1472          Naruto: Shippuuden Movie 4 - The Lost Tower
Name: name, dtype: object


In [10]:
# Test the recommendation system
print("Recommendations for 'Naruto':\n", get_recommendations('Naruto'))


Recommendations for 'Naruto':
 615                                    Naruto: Shippuuden
841                                                Naruto
1103    Boruto: Naruto the Movie - Naruto ga Hokage ni...
1343                                          Naruto x UT
1472          Naruto: Shippuuden Movie 4 - The Lost Tower
Name: name, dtype: object


In [11]:
def precision_at_k(recommended_list, relevant_list, k):
    recommended_set = set(recommended_list[:k])
    relevant_set = set(relevant_list)
    common_items = recommended_set.intersection(relevant_set)
    return len(common_items) / k


In [12]:
# Example relevant list for 'Naruto'
relevant_animes = ['One Piece', 'Attack on Titan', 'My Hero Academia']

# Get recommendations
recommended_animes = get_recommendations('Naruto')

# Calculate precision at k
k = 5
precision = precision_at_k(recommended_animes, relevant_animes, k)
print(f'Precision at {k}: {precision:.2f}')


Precision at 5: 0.00


In [13]:
# Function to get recommendations based on anime name
def get_recommendations(anime_name, cosine_sim=cosine_sim):
    # Check if the anime name exists in the dataset
    if anime_name not in anime_df['name'].values:
        return f"Anime '{anime_name}' not found in the dataset."

    # Get the index of the anime that matches the name
    idx = anime_df[anime_df['name'] == anime_name].index[0]

    # Get the pairwise similarity scores of all animes with that anime
    sim_scores = list(enumerate(cosine_sim[idx]))

    # Sort the animes based on the similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # Get the scores of the 5 most similar animes
    sim_scores = sim_scores[1:6]

    # Get the anime indices
    anime_indices = [i[0] for i in sim_scores]

    # Return the top 5 most similar animes
    return anime_df['name'].iloc[anime_indices]

# Evaluation metric
def precision_at_k(recommended_list, relevant_list, k):
    recommended_set = set(recommended_list[:k])
    relevant_set = set(relevant_list)
    common_items = recommended_set.intersection(relevant_set)
    return len(common_items) / k

# Relevant list for 'Naruto' for evaluation purposes
relevant_animes = ['One Piece', 'Attack on Titan', 'My Hero Academia']

# Get recommendations
recommended_animes = get_recommendations('Naruto')

# Calculate precision at k
k = 5
precision = precision_at_k(recommended_animes, relevant_animes, k)
print(f'Precision at {k}: {precision:.2f}')


Precision at 5: 0.00


In [14]:
# Print cleaned genres
print(anime_df[['name', 'cleaned_genre']])

# Check the similarity scores for 'Naruto'
naruto_index = anime_df[anime_df['name'] == 'Naruto'].index[0]
naruto_sim_scores = list(enumerate(cosine_sim[naruto_index]))
naruto_sim_scores = sorted(naruto_sim_scores, key=lambda x: x[1], reverse=True)
print(naruto_sim_scores[:10])  # Print top 10 similarity scores


                                                    name  \
0                                         Kimi no Na wa.   
1                       Fullmetal Alchemist: Brotherhood   
2                                               Gintama°   
3                                            Steins;Gate   
4                                          Gintama&#039;   
...                                                  ...   
12289       Toushindai My Lover: Minami tai Mecha-Minami   
12290                                        Under World   
12291                     Violence Gekiga David no Hoshi   
12292  Violence Gekiga Shin David no Hoshi: Inma Dens...   
12293                   Yasuji no Pornorama: Yacchimae!!   

                                           cleaned_genre  
0                      Drama Romance School Supernatural  
1      Action Adventure Drama Fantasy Magic Military ...  
2      Action Comedy Historical Parody Samurai Sci-Fi...  
3                                        Sc

In [15]:
# Function to get recommendations based on anime name
def get_recommendations(anime_name, cosine_sim=cosine_sim, num_recommendations=10):
    if anime_name not in anime_df['name'].values:
        return f"Anime '{anime_name}' not found in the dataset."

    idx = anime_df[anime_df['name'] == anime_name].index[0]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:num_recommendations+1]
    anime_indices = [i[0] for i in sim_scores]
    return anime_df['name'].iloc[anime_indices]

# Get more recommendations
num_recommendations = 10
recommended_animes = get_recommendations('Naruto', num_recommendations=num_recommendations)
print(recommended_animes)


615                                    Naruto: Shippuuden
841                                                Naruto
1103    Boruto: Naruto the Movie - Naruto ga Hokage ni...
1343                                          Naruto x UT
1472          Naruto: Shippuuden Movie 4 - The Lost Tower
1573    Naruto: Shippuuden Movie 3 - Hi no Ishi wo Tsu...
2458                 Naruto Shippuuden: Sunny Side Battle
2997    Naruto Soyokazeden Movie: Naruto to Mashin to ...
7628                              Kyutai Panic Adventure!
206                                         Dragon Ball Z
Name: name, dtype: object


In [16]:
# Calculate precision at k with more recommendations
precision = precision_at_k(recommended_animes, relevant_animes, k)
print(f'Precision at {k}: {precision:.2f}')


Precision at 5: 0.00


In [17]:
# Define a function to clean the genres
def clean_genres(genres):
    if isinstance(genres, str):
        return genres.replace(", ", " ")
    else:
        return ""

# Fill missing values in the genre column with an empty string
anime_df['genre'] = anime_df['genre'].fillna('')

# Apply the function to clean the genres
anime_df['cleaned_genre'] = anime_df['genre'].apply(clean_genres)

# Create the CountVectorizer instance
vectorizer = CountVectorizer()

# Fit and transform the cleaned genres
genre_matrix = vectorizer.fit_transform(anime_df['cleaned_genre'])

# Compute the cosine similarity matrix
cosine_sim = cosine_similarity(genre_matrix, genre_matrix)

# Function to get recommendations based on anime name
def get_recommendations(anime_name, cosine_sim=cosine_sim, num_recommendations=10):
    if anime_name not in anime_df['name'].values:
        return f"Anime '{anime_name}' not found in the dataset."

    idx = anime_df[anime_df['name'] == anime_name].index[0]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:num_recommendations+1]
    anime_indices = [i[0] for i in sim_scores]
    return anime_df['name'].iloc[anime_indices]

# Evaluation metric
def precision_at_k(recommended_list, relevant_list, k):
    recommended_set = set(recommended_list[:k])
    relevant_set = set(relevant_list)
    common_items = recommended_set.intersection(relevant_set)
    return len(common_items) / k

# Relevant list for 'Naruto' for evaluation purposes
relevant_animes = ['One Piece', 'Attack on Titan', 'My Hero Academia']

# Get more recommendations
num_recommendations = 10
recommended_animes = get_recommendations('Naruto', num_recommendations=num_recommendations)
print(f"Recommendations for 'Naruto':\n{recommended_animes}")

# Calculate precision at k with more recommendations
k = 5
precision = precision_at_k(recommended_animes, relevant_animes, k)
print(f'Precision at {k}: {precision:.2f}')

# Print cleaned genres and similarity scores for sanity check
print(anime_df[['name', 'cleaned_genre']])
naruto_index = anime_df[anime_df['name'] == 'Naruto'].index[0]
naruto_sim_scores = list(enumerate(cosine_sim[naruto_index]))
naruto_sim_scores = sorted(naruto_sim_scores, key=lambda x: x[1], reverse=True)
print(naruto_sim_scores[:10])


Recommendations for 'Naruto':
615                                    Naruto: Shippuuden
841                                                Naruto
1103    Boruto: Naruto the Movie - Naruto ga Hokage ni...
1343                                          Naruto x UT
1472          Naruto: Shippuuden Movie 4 - The Lost Tower
1573    Naruto: Shippuuden Movie 3 - Hi no Ishi wo Tsu...
2458                 Naruto Shippuuden: Sunny Side Battle
2997    Naruto Soyokazeden Movie: Naruto to Mashin to ...
7628                              Kyutai Panic Adventure!
206                                         Dragon Ball Z
Name: name, dtype: object
Precision at 5: 0.00
                                                    name  \
0                                         Kimi no Na wa.   
1                       Fullmetal Alchemist: Brotherhood   
2                                               Gintama°   
3                                            Steins;Gate   
4                                          

In [25]:
recommended_animes = get_recommendations(' Drama Romance School Supernatural', num_recommendations=10)
print(recommended_animes)

Anime ' Drama Romance School Supernatural' not found in the dataset.


In [20]:
# 4. Evaluate Precision at k
def precision_at_k(recommended_list, relevant_list, k):
    recommended_set = set(recommended_list[:k])
    relevant_set = set(relevant_list)
    common_items = recommended_set.intersection(relevant_set)
    return len(common_items) / k


In [21]:
relevant_animes = ['One Piece', 'Attack on Titan', 'My Hero Academia']
precision = precision_at_k(recommended_animes, relevant_animes, k=5)
print(f'Precision at 5: {precision:.2f}')

Precision at 5: 0.00
