# Recommendation System

In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.preprocessing import MultiLabelBinarizer, StandardScaler

In [2]:
# Load the dataset
# NOTE(review): the path below is absolute and points into a user-specific
# temporary unzip folder, so it will likely not resolve on another machine.
ANIME_CSV_PATH = r"C:\Users\J G TECH\AppData\Local\Temp\11357d20-ab15-46f9-b371-16a0499f9559_Recommendation System.zip.559\Recommendation System\anime.csv"
anime_df = pd.read_csv(ANIME_CSV_PATH)
print(anime_df)

       anime_id                                               name  \
0         32281                                     Kimi no Na wa.   
1          5114                   Fullmetal Alchemist: Brotherhood   
2         28977                                           Gintama°   
3          9253                                        Steins;Gate   
4          9969                                      Gintama&#039;   
...         ...                                                ...   
12289      9316       Toushindai My Lover: Minami tai Mecha-Minami   
12290      5543                                        Under World   
12291      5621                     Violence Gekiga David no Hoshi   
12292      6133  Violence Gekiga Shin David no Hoshi: Inma Dens...   
12293     26081                   Yasuji no Pornorama: Yacchimae!!   

                                                   genre   type episodes  \
0                   Drama, Romance, School, Supernatural  Movie        1   
1      

In [3]:
# Data Preprocessing

# Fill missing values: a placeholder label for the text columns, the column
# mean for the numeric rating.
# The result is assigned back to the column instead of calling
# `anime_df[col].fillna(..., inplace=True)`: that form mutates a temporary
# column selection (chained assignment), which emits a FutureWarning in
# pandas 2.x and leaves `anime_df` unchanged once Copy-on-Write is active.
anime_df['genre'] = anime_df['genre'].fillna('Unknown')
anime_df['type'] = anime_df['type'].fillna('Unknown')
anime_df['rating'] = anime_df['rating'].fillna(anime_df['rating'].mean())

In [4]:
def _split_genres(value):
    """Return `value` as-is when it is already a list, else its string form split on ', '."""
    if isinstance(value, list):
        return value
    return str(value).split(', ')


# Turn every genre entry into a list of labels; entries that are already lists
# pass through unchanged, so re-running this cell does not split twice.
anime_df['genre'] = anime_df['genre'].apply(_split_genres)

# Binarize the genre lists into a 0/1 indicator matrix (multi-label binarization)
mlb = MultiLabelBinarizer()
genre_encoded = mlb.fit_transform(anime_df['genre'])
print(anime_df['genre'])
print(mlb)
print(genre_encoded)

0                   [Drama, Romance, School, Supernatural]
1        [Action, Adventure, Drama, Fantasy, Magic, Mil...
2        [Action, Comedy, Historical, Parody, Samurai, ...
3                                       [Sci-Fi, Thriller]
4        [Action, Comedy, Historical, Parody, Samurai, ...
                               ...                        
12289                                             [Hentai]
12290                                             [Hentai]
12291                                             [Hentai]
12292                                             [Hentai]
12293                                             [Hentai]
Name: genre, Length: 12294, dtype: object
MultiLabelBinarizer()
[[0 0 0 ... 0 0 0]
 [1 1 0 ... 0 0 0]
 [1 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


In [5]:
# Standardize the numeric columns. Despite its name, `rating_scaled` holds two
# columns: the scaled rating followed by the scaled members count.
numeric_columns = ['rating', 'members']
scaler = StandardScaler()
rating_scaled = scaler.fit_transform(anime_df[numeric_columns])
print(rating_scaled)
print(scaler)

[[ 2.84753513  3.33024137]
 [ 2.73937967 14.14840622]
 [ 2.72954735  1.75471335]
 ...
 [-1.56717438 -0.32566298]
 [-1.46885123 -0.32646563]
 [-0.9969001  -0.32706762]]
StandardScaler()


In [6]:
# Build the feature matrix: the genre indicator columns first, then the two
# scaled numeric columns appended on the right.
feature_blocks = [genre_encoded, rating_scaled]
features = np.hstack(feature_blocks)
print(features)

[[ 0.          0.          0.         ...  0.          2.84753513
   3.33024137]
 [ 1.          1.          0.         ...  0.          2.73937967
  14.14840622]
 [ 1.          0.          0.         ...  0.          2.72954735
   1.75471335]
 ...
 [ 0.          0.          0.         ...  0.         -1.56717438
  -0.32566298]
 [ 0.          0.          0.         ...  0.         -1.46885123
  -0.32646563]
 [ 0.          0.          0.         ...  0.         -0.9969001
  -0.32706762]]


In [7]:
def recommend_anime(anime_name, anime_df, features, threshold=0.5):
    """Return the titles whose feature vectors are cosine-similar to a given title.

    Parameters
    ----------
    anime_name : value compared for exact equality against ``anime_df['name']``.
        If several rows match, the first one is used as the query.
    anime_df : DataFrame with at least ``name``, ``genre`` and ``rating`` columns.
        Row ``i`` (by position) must correspond to ``features[i]``.
    features : 2-D array with one feature row per row of ``anime_df``.
    threshold : only rows with similarity strictly greater than this are returned.

    Returns
    -------
    DataFrame with columns ``name``, ``genre``, ``rating``, ordered from most to
    least similar; the query row itself is excluded.

    Raises
    ------
    ValueError
        If no row of ``anime_df`` has the requested name.
    """
    # Locate the query row by POSITION, not by index label: `features` is a
    # plain array addressed positionally, and the result is selected with
    # `.iloc`. Labels and positions differ whenever the frame does not carry
    # the default 0..n-1 index (e.g. a shuffled subset of the original frame).
    name_matches = np.where((anime_df['name'] == anime_name).to_numpy())[0]
    if name_matches.size == 0:
        raise ValueError(f"Anime '{anime_name}' not found in the dataset.")
    anime_pos = name_matches[0]

    # Cosine similarity between the query row and every row (itself included)
    similarity_scores = cosine_similarity([features[anime_pos]], features)[0]

    # Keep positions above the threshold, dropping the query row itself
    similar_positions = np.where(similarity_scores > threshold)[0]
    similar_positions = similar_positions[similar_positions != anime_pos]

    # Order the remaining positions by descending similarity
    ranking = np.argsort(similarity_scores[similar_positions])[::-1]
    sorted_positions = similar_positions[ranking]

    return anime_df.iloc[sorted_positions][['name', 'genre', 'rating']]

In [8]:
# Example: list the titles similar to one query title, using a 0.6 similarity cut-off
query_title = 'Kimi no Na wa.'
recommendations = recommend_anime(query_title, anime_df, features, threshold=0.6)
print(recommendations)

                                                   name  \
504   Clannad: After Story - Mou Hitotsu no Sekai, K...   
60                                   Hotarubi no Mori e   
1201                     Angel Beats!: Another Epilogue   
219   Yahari Ore no Seishun Love Comedy wa Machigatt...   
337                                        Kanon (2006)   
...                                                 ...   
770                         One Punch Man: Road to Hero   
3192                                        Zombie-Loan   
3560  Naruto Movie 3: Dai Koufun! Mikazuki Jima no A...   
9                              Gintama&#039;: Enchousen   
869                               Inu x Boku SS Special   

                                                  genre  rating  
504                            [Drama, Romance, School]    8.02  
60               [Drama, Romance, Shoujo, Supernatural]    8.61  
1201                      [Drama, School, Supernatural]    7.63  
219                    [Com

In [15]:

# A recommender is not scored like an ordinary classifier, so a hold-out split
# is simulated: 20% of the rows (with their feature vectors) become the test
# set, using a fixed random_state so the split is repeatable.
split_parts = train_test_split(anime_df, features, test_size=0.2, random_state=42)
train_df, test_df, train_features, test_features = split_parts

In [16]:
def evaluate_recommendations(test_df, test_features, train_df, train_features, threshold=0.5):
    """Score whether each test title appears among its own training-set recommendations.

    For every name in ``test_df['name']`` the true label is fixed to 1 and the
    predicted label is 1 only if ``recommend_anime`` (run against ``train_df`` /
    ``train_features``) returns a row with that same name. This is a deliberate
    simplification, not a standard ranking metric.

    Parameters
    ----------
    test_df : DataFrame with a ``name`` column; the titles to evaluate.
    test_features : accepted for signature compatibility; not used here.
    train_df, train_features : passed through to ``recommend_anime``.
    threshold : similarity cut-off passed through to ``recommend_anime``.

    Returns
    -------
    (precision, recall, f1) as computed by the sklearn metric functions, or
    ``(0.0, 0.0, 0.0)`` when ``test_df`` has no rows.
    """
    y_true = []
    y_pred = []

    # Iterate over the names directly; no positional or label lookups are needed.
    for anime_name in test_df['name']:
        # Consider the anime in the test set as the ground truth relevant item
        y_true.append(1)

        try:
            # Generate recommendations from the training set
            recommendations = recommend_anime(anime_name, train_df, train_features, threshold=threshold)
        except (ValueError, IndexError):
            # The title could not be looked up in the training data (a failed
            # lookup surfaces as IndexError or ValueError depending on how
            # recommend_anime reports it). Count it as "not recommended"
            # instead of aborting the whole evaluation.
            y_pred.append(0)
            continue

        # Check if the anime was recommended (this is a simplification)
        was_recommended = bool((recommendations['name'] == anime_name).any())
        y_pred.append(1 if was_recommended else 0)

    if not y_true:
        # Nothing to score: avoid handing empty label lists to the metrics.
        return 0.0, 0.0, 0.0

    precision = precision_score(y_true, y_pred, average='binary')
    recall = recall_score(y_true, y_pred, average='binary')
    f1 = f1_score(y_true, y_pred, average='binary')

    return precision, recall, f1

In [None]:
# Scratch check of the lookup step: print the index label of one title.
# `anime_name` is assigned here because no earlier cell defines it at notebook
# scope, so on a fresh kernel this cell would otherwise stop with a NameError.
# The title is the one used in the recommendation example.
anime_name = 'Kimi no Na wa.'
anime_idx = anime_df[anime_df['name'] == anime_name].index[0]
print(anime_idx)

In [18]:
def recommend_anime(anime_name, anime_df, features, threshold=0.5):
    """Return the titles whose feature vectors are cosine-similar to a given title.

    Parameters
    ----------
    anime_name : value compared for exact equality against ``anime_df['name']``.
        If several rows match, the first one is used as the query.
    anime_df : DataFrame with at least ``name``, ``genre`` and ``rating`` columns.
        Row ``i`` (by position) must correspond to ``features[i]``.
    features : 2-D array with one feature row per row of ``anime_df``.
    threshold : only rows with similarity strictly greater than this are returned.

    Returns
    -------
    DataFrame with columns ``name``, ``genre``, ``rating``, ordered from most to
    least similar; the query row itself is excluded.

    Raises
    ------
    ValueError
        If no row of ``anime_df`` has the requested name.
    """
    # Find the rows carrying the requested name
    matching_anime = anime_df[anime_df['name'] == anime_name]

    if matching_anime.empty:
        raise ValueError(f"Anime '{anime_name}' not found in the dataset.")

    # Use the row's POSITION rather than its index label: `features` is a plain
    # array addressed positionally and the result is selected with `.iloc`.
    # Labels and positions differ whenever the frame does not carry the default
    # 0..n-1 index (e.g. a shuffled subset of the original frame).
    anime_pos = np.where((anime_df['name'] == anime_name).to_numpy())[0][0]

    # Compute cosine similarity between the target anime and all others
    similarity_scores = cosine_similarity([features[anime_pos]], features)[0]

    # Keep positions above the threshold, dropping the query row itself
    similar_positions = np.where(similarity_scores > threshold)[0]
    similar_positions = similar_positions[similar_positions != anime_pos]

    # Order the remaining positions by descending similarity
    ranking = np.argsort(similarity_scores[similar_positions])[::-1]
    sorted_positions = similar_positions[ranking]

    # Without this return the function would yield None to every caller
    return anime_df.iloc[sorted_positions][['name', 'genre', 'rating']]

In [None]:
from sklearn.metrics import precision_score, recall_score, f1_score

def evaluate_recommendations(test_df, test_features, train_df, train_features, threshold=0.6):
    """Score whether each test title appears among its own training-set recommendations.

    For every name in ``test_df['name']`` the true label is fixed to 1 and the
    predicted label is 1 only if ``recommend_anime`` (run against ``train_df`` /
    ``train_features``) returns a row with that same name. This is a deliberate
    simplification, not a standard ranking metric.

    Parameters
    ----------
    test_df : DataFrame with a ``name`` column; the titles to evaluate.
    test_features : accepted for signature compatibility; not used here.
    train_df, train_features : passed through to ``recommend_anime``.
    threshold : similarity cut-off passed through to ``recommend_anime``.

    Returns
    -------
    (precision, recall, f1) as computed by the sklearn metric functions, or
    ``(0.0, 0.0, 0.0)`` when ``test_df`` has no rows.
    """
    y_true = []
    y_pred = []

    # Iterate over the names themselves. The labels yielded by `iterrows()` are
    # index labels, not positions, so feeding them to `test_df.iloc[...]` picks
    # the wrong row (or raises IndexError) on a shuffled split.
    for anime_name in test_df['name']:
        # True label (we assume the anime is relevant to itself)
        y_true.append(1)

        try:
            # Generate recommendations from the training set
            recommendations = recommend_anime(anime_name, train_df, train_features, threshold=threshold)
        except ValueError:
            # If the anime is not found in the dataset, we consider it not recommended
            y_pred.append(0)
            continue

        # Check if the anime was recommended (this is a simplification)
        is_recommended = bool((recommendations['name'] == anime_name).any())
        y_pred.append(1 if is_recommended else 0)

    if not y_true:
        # Nothing to score: avoid handing empty label lists to the metrics.
        return 0.0, 0.0, 0.0

    # Calculate precision, recall, and F1-score
    precision = precision_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)

    return precision, recall, f1