In [78]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import precision_score, recall_score, f1_score

# Load the dataset
anime_df = pd.read_csv(r"C:\Users\LENOVO\python datascience basics (hydrabad ds)\ASSIGNMENT DATA SCIENCE\Recommendation System\anime.csv")  # Ensure the path is correct

# Drop rows with missing genres
anime = anime_df.dropna(subset=['genre']).reset_index(drop=True)

# Split into training and testing datasets
train_df, test_df = train_test_split(anime, test_size=0.2, random_state=42)

# Vectorize genres using CountVectorizer
vectorizer = CountVectorizer(tokenizer=lambda x: x.split(', '))
train_genres_matrix = vectorizer.fit_transform(train_df['genre'])

# Evaluation Function
def evaluate_recommendations(train_df, test_df, train_matrix, top_k=5):
    y_true, y_pred = [], []

    for _, row in test_df.iterrows():
        test_genre = row['genre']
        if pd.isna(test_genre):
            continue
        
        # Vectorize test genre
        test_vector = vectorizer.transform([test_genre])
        
        # Compute cosine similarity
        similarity_scores = cosine_similarity(test_vector, train_matrix).flatten()
        top_indices = similarity_scores.argsort()[::-1][:top_k]
        
        # Compare with recommended genres
        test_genre_set = set(test_genre.split(', '))
        for idx in top_indices:
            rec_genre_set = set(train_df.iloc[idx]['genre'].split(', '))
            y_pred.append(1 if test_genre_set.intersection(rec_genre_set) else 0)
        
        y_true.extend([1] * top_k)

    if not y_pred:
        print("No recommendations to evaluate.")
        return 0, 0, 0

    precision = precision_score(y_true, y_pred, zero_division=0)
    recall = recall_score(y_true, y_pred, zero_division=0)
    f1 = f1_score(y_true, y_pred, zero_division=0)

    print(f"Precision: {precision:.3f}")
    print(f"Recall: {recall:.3f}")
    print(f"F1 Score: {f1:.3f}")
    return precision, recall, f1

# Run evaluation
evaluate_recommendations(train_df, test_df, train_genres_matrix, top_k=5)




Precision: 1.000
Recall: 1.000
F1 Score: 1.000


(1.0, 1.0, 1.0)

In [79]:
def recommend_animes(anime_title, df, genre_matrix, top_k=10, threshold=0.3):
    if anime_title not in df['name'].values:
        print("Anime not found in dataset.")
        return pd.DataFrame()

    idx = df[df['name'] == anime_title].index[0]
    cosine_similarities = cosine_similarity(genre_matrix[idx], genre_matrix).flatten()

    # Filter based on threshold and exclude itself
    filtered_indices = [i for i, score in enumerate(cosine_similarities) 
                        if score >= threshold and i != idx]

    # Sort and pick top_k
    top_indices = sorted(filtered_indices, key=lambda i: cosine_similarities[i], reverse=True)[:top_k]

    recommendations = df.iloc[top_indices][['name', 'genre']]
    recommendations['Similarity Score'] = [cosine_similarities[i] for i in top_indices]
    return recommendations.reset_index(drop=True)


In [80]:
print("Threshold = 0.2")
print(recommend_animes("Naruto", train_df, train_genres_matrix, top_k=10, threshold=0.2))

print("\nThreshold = 0.4")
print(recommend_animes("Naruto", train_df, train_genres_matrix, top_k=10, threshold=0.4))

print("\nThreshold = 0.6")
print(recommend_animes("Naruto", train_df, train_genres_matrix, top_k=10, threshold=0.6))


Threshold = 0.2
                                                name  \
0                       Danshi Koukousei no Nichijou   
1                                Seitokai Yakuindomo   
2              Danshi Koukousei no Nichijou Specials   
3                             Gin no Saji 2nd Season   
4                                        Gin no Saji   
5                               Seitokai Yakuindomo*   
6                                           Amanchu!   
7  Watashi ga Motenai no wa Dou Kangaetemo Omaera...   
8                                              Aiura   
9                            Seitokai Yakuindomo OVA   

                                    genre  Similarity Score  
0  Comedy, School, Shounen, Slice of Life               1.0  
1  Comedy, School, Shounen, Slice of Life               1.0  
2  Comedy, School, Shounen, Slice of Life               1.0  
3  Comedy, School, Shounen, Slice of Life               1.0  
4  Comedy, School, Shounen, Slice of Life               1

### Hybrid Recommendation

In [81]:
# Ensure 'genre' column exists and has no missing values
anime_df =anime_df.dropna(subset=['genre'])
anime_df['genre'] = anime_df['genre'].astype(str)

# Now apply TF-IDF vectorization
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(anime_df['genre'])



In [82]:
from sklearn.preprocessing import MinMaxScaler

#from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
anime_df['rating'] = anime_df['rating'].fillna(0)
anime_df['rating_normalized'] = scaler.fit_transform(anime_df[['rating']])



In [83]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Ensure no null genre values and convert to string
anime_df =anime_df.dropna(subset=['genre'])
anime_df['genre'] = anime_df['genre'].astype(str)

# Vectorize genre text
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(anime_df['genre'])


In [84]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

def hybrid_recommendation(anime_title, top_n=10):
    if anime_title not in anime_df['name'].values:
        return f"'{anime_title}' not found in anime list."

    # Correct index selection
    idx = anime_df[anime_df['name'] == anime_title].index[0]

    # Compute genre similarity
    genre_similarity = cosine_similarity(tfidf_matrix[idx], tfidf_matrix)[0]

    # Combine similarity and normalized rating
    combined_score = 0.7 * genre_similarity + 0.3 * anime_df['rating_normalized'].values

    # Get top N recommendations
    top_indices = combined_score.argsort()[::-1][1:top_n+1]
    return anime_df.iloc[top_indices][['name', 'genre', 'rating']]


In [85]:
print(anime_df['name'].head(20))  # Preview titles to choose from

# Run recommendation
recommendations = hybrid_recommendation("Naruto")
print(recommendations)


0                                        Kimi no Na wa.
1                      Fullmetal Alchemist: Brotherhood
2                                              Gintama°
3                                           Steins;Gate
4                                         Gintama&#039;
5     Haikyuu!!: Karasuno Koukou VS Shiratorizawa Ga...
6                                Hunter x Hunter (2011)
7                                  Ginga Eiyuu Densetsu
8     Gintama Movie: Kanketsu-hen - Yorozuya yo Eien...
9                              Gintama&#039;: Enchousen
10                                 Clannad: After Story
11                                       Koe no Katachi
12                                              Gintama
13                   Code Geass: Hangyaku no Lelouch R2
14                              Haikyuu!! Second Season
15                        Sen to Chihiro no Kamikakushi
16                              Shigatsu wa Kimi no Uso
17                        Mushishi Zoku Shou 2nd

In [86]:
from sklearn.feature_extraction.text import TfidfVectorizer


In [87]:
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf_vectorizer.fit_transform(anime_df['genre'].fillna(''))
anime_df['rating_normalized'] = (
    anime_df['rating'] - anime_df['rating'].min()
) / (anime_df['rating'].max() - anime_df['rating'].min())


In [88]:
def hybrid_recommendation(anime_title, top_n=10):
    idx = anime_df[anime_df['name'] == anime_title].index[0]
    genre_similarity = cosine_similarity(tfidf_matrix[idx], tfidf_matrix)[0]

    combined_score = 0.7 * genre_similarity + 0.3 * anime_df['rating_normalized'].values
    top_indices = combined_score.argsort()[::-1][1:top_n+1]

    return anime_df.iloc[top_indices][['name', 'genre', 'rating']]


In [89]:
# Example: Check available titles
print(anime_df['name'].head(20))

# Pick one title from above
recommendations = hybrid_recommendation("Naruto")
print(recommendations)


0                                        Kimi no Na wa.
1                      Fullmetal Alchemist: Brotherhood
2                                              Gintama°
3                                           Steins;Gate
4                                         Gintama&#039;
5     Haikyuu!!: Karasuno Koukou VS Shiratorizawa Ga...
6                                Hunter x Hunter (2011)
7                                  Ginga Eiyuu Densetsu
8     Gintama Movie: Kanketsu-hen - Yorozuya yo Eien...
9                              Gintama&#039;: Enchousen
10                                 Clannad: After Story
11                                       Koe no Katachi
12                                              Gintama
13                   Code Geass: Hangyaku no Lelouch R2
14                              Haikyuu!! Second Season
15                        Sen to Chihiro no Kamikakushi
16                              Shigatsu wa Kimi no Uso
17                        Mushishi Zoku Shou 2nd

In [90]:
# Example: Check available titles
print(anime_df['name'].head(20))

# Pick one title from above
recommendations = hybrid_recommendation("Naruto")
print(recommendations)


0                                        Kimi no Na wa.
1                      Fullmetal Alchemist: Brotherhood
2                                              Gintama°
3                                           Steins;Gate
4                                         Gintama&#039;
5     Haikyuu!!: Karasuno Koukou VS Shiratorizawa Ga...
6                                Hunter x Hunter (2011)
7                                  Ginga Eiyuu Densetsu
8     Gintama Movie: Kanketsu-hen - Yorozuya yo Eien...
9                              Gintama&#039;: Enchousen
10                                 Clannad: After Story
11                                       Koe no Katachi
12                                              Gintama
13                   Code Geass: Hangyaku no Lelouch R2
14                              Haikyuu!! Second Season
15                        Sen to Chihiro no Kamikakushi
16                              Shigatsu wa Kimi no Uso
17                        Mushishi Zoku Shou 2nd

### Handle Cold Start Problem

In [91]:
###For New Users (no history)
def top_popular_anime(top_n=10):
    return anime_df.sort_values(by='rating', ascending=False)[['name', 'genre', 'rating']].head(top_n)

print(top_popular_anime())


                                          name  \
10464  Taka no Tsume 8: Yoshida-kun no X-Files   
10400              Spoon-hime no Swing Kitchen   
9595                          Mogura no Motoro   
0                               Kimi no Na wa.   
9078                              Kahei no Umi   
1             Fullmetal Alchemist: Brotherhood   
10786          Yakusoku: Africa Mizu to Midori   
2                                     Gintama°   
3                                  Steins;Gate   
4                                Gintama&#039;   

                                                   genre  rating  
10464                                     Comedy, Parody   10.00  
10400                                    Adventure, Kids    9.60  
9595                                       Slice of Life    9.50  
0                   Drama, Romance, School, Supernatural    9.37  
9078                                          Historical    9.33  
1      Action, Adventure, Drama, Fantasy, Magic

In [92]:
### For New Items (no ratings yet)
def content_based_recommendation(anime_title, top_n=10):
    idx = anime_df[anime_df['name'] == anime_title].index[0]
    sim_scores = cosine_similarity(tfidf_matrix[idx], tfidf_matrix).flatten()
    top_indices = sim_scores.argsort()[::-1][1:top_n+1]
    return anime_df.iloc[top_indices][['name', 'genre', 'rating']]

print(content_based_recommendation("Naruto"))


                                                   name  \
615                                  Naruto: Shippuuden   
2997  Naruto Soyokazeden Movie: Naruto to Mashin to ...   
2458               Naruto Shippuuden: Sunny Side Battle   
1343                                        Naruto x UT   
1472        Naruto: Shippuuden Movie 4 - The Lost Tower   
1103  Boruto: Naruto the Movie - Naruto ga Hokage ni...   
1573  Naruto: Shippuuden Movie 3 - Hi no Ishi wo Tsu...   
486                            Boruto: Naruto the Movie   
7628                            Kyutai Panic Adventure!   
2416  Naruto: Honoo no Chuunin Shiken! Naruto vs. Ko...   

                                                  genre  rating  
615   Action, Comedy, Martial Arts, Shounen, Super P...    7.94  
2997  Action, Comedy, Martial Arts, Shounen, Super P...    7.11  
2458  Action, Comedy, Martial Arts, Shounen, Super P...    7.26  
1343  Action, Comedy, Martial Arts, Shounen, Super P...    7.58  
1472  Action, Comedy

### 1. Can you explain the difference between user-based and item-based collaborative filtering?

User-based: Recommends items to a user based on preferences of similar users.

Item-based: Recommends items similar to those the user has liked in the past.
User-user similarity vs. Item-item similarity.

### 2. What is collaborative filtering, and how does it work?
Collaborative filtering is a recommendation technique that predicts a user’s interests by collecting preferences from many users.
It works by finding similarities between users or items using their ratings or interactions.

