In [4]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score


In [5]:
df = pd.read_csv(r"C:\Users\LENOVO THINKPAD 13\Downloads\Recommendation System (1)\anime.csv")

In [6]:
df.head()

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,793665
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.25,114262
3,9253,Steins;Gate,"Sci-Fi, Thriller",TV,24,9.17,673572
4,9969,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.16,151266


In [7]:
df.dropna(inplace = True)

In [8]:
df.drop_duplicates(inplace = True)

In [9]:
df["genre"] = df["genre"].fillna("")

In [10]:
df["episodes"] = pd.to_numeric(df["episodes"], errors="coerce")
df["episodes"] = df["episodes"].fillna(df["episodes"].median())


In [11]:
df["metadata"] = (
    df["genre"].astype(str) + " " +
    df["type"].astype(str) + " " +
   df["episodes"].astype(str) + " " +
    df["rating"].astype(str) + " " +
df["members"].astype(str)
)

In [12]:
# TF-IDF Vectorization on metadata
tfidf = TfidfVectorizer(stop_words="english")
tfidf_matrix = tfidf.fit_transform(df["metadata"])

print("TF-IDF Matrix Shape:", tfidf_matrix.shape)


TF-IDF Matrix Shape: (12017, 6668)


In [13]:
5. 
    #Cosine_Similarity
# -------------------------------
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)


In [14]:
# 6. Recommendation Function
# -------------------------------
indices = pd.Series(df.index, index=df["name"]).drop_duplicates()


In [15]:
def recommend_df(title, top_n=10, threshold=0.2):
    # check if title exists
    if title not in indices:
        return "df not found in dataset!"
    
    # index of the given title
    idx = indices[title]
    
    # all similarity scores for this item
    sim_scores = list(enumerate(cosine_sim[idx]))
    
    # sort by similarity score (descending)
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    
    # filter using threshold
    sim_scores = [(i, score) for i, score in sim_scores if score >= threshold]
    
    # skip the first item (itself) and limit to top_n
    sim_scores = sim_scores[1: top_n+1]
    
    # get indices of the similar items
    df_indices = [i[0] for i in sim_scores]
    
    # return selected columns from df
    return df.iloc[df_indices][["name", "genre", "type", "rating"]]


In [16]:
7. #TestRecommendation
print("\nRecommended df for 'Naruto':")
print(recommend_df("Naruto", top_n=5, threshold=0.25))



Recommended df for 'Naruto':
                                                   name  \
7867                                    Iron Virgin Jun   
4067                     Ikkitousen: Extravaganza Epoch   
1930                                  Dragon Ball Super   
1573  Naruto: Shippuuden Movie 3 - Hi no Ishi wo Tsu...   
615                                  Naruto: Shippuuden   

                                                  genre   type  rating  
7867  Action, Comedy, Fantasy, Martial Arts, Super P...    OVA    4.81  
4067  Action, Ecchi, Martial Arts, School, Seinen, S...    OVA    6.81  
1930  Action, Adventure, Comedy, Fantasy, Martial Ar...     TV    7.40  
1573  Action, Comedy, Martial Arts, Shounen, Super P...  Movie    7.50  
615   Action, Comedy, Martial Arts, Shounen, Super P...     TV    7.94  


In [20]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score



# 2. Split
train, test = train_test_split(df, test_size=0.2, random_state=42)

# 3. Build TF-IDF on the TRAIN set (or on full df if you prefer)
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(train['metadata'])   # or df['metadata'] if you want full

# 4. Compute cosine similarity
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

# 5. Build indices mapping (train set only here)
indices = pd.Series(train.index, index=train['name']).drop_duplicates()

# 6. Safe recommend_df
def recommend_df(title, top_n=5, threshold=0.25):
    """
    Return top_n recommendations for a given title.
    Skip if title not in indices.
    """
    if title not in indices:
        return pd.DataFrame(columns=['name'])
    
    idx = indices[title]
    if idx >= cosine_sim.shape[0]:  # safety check
        return pd.DataFrame(columns=['name'])
    
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = [s for s in sim_scores if s[0] != idx and s[1] >= threshold]

    top_items = sim_scores[:top_n]
    anime_indices = [i[0] for i in top_items]

    return train.iloc[anime_indices][['name', 'genre']]

# 7. Evaluate precision/recall
y_true = []
y_pred = []

for title in test["name"].sample(20, random_state=42):
    if title not in indices:
        continue
    
    recs = recommend_df(title, top_n=5, threshold=0.25)
    if recs.empty:
        continue
    
    true_genre = set(train.loc[indices[title], "genre"].split(", "))

    # positives = recommended items
    for rec_title in recs["name"].values:
        rec_genre = set(train.loc[indices[rec_title], "genre"].split(", "))
        y_true.append(1 if len(true_genre & rec_genre) > 0 else 0)
        y_pred.append(1)  # predicted relevant

    # negatives = random items not recommended
    nonrecs = train.loc[~train["name"].isin(recs["name"].values)].sample(
        min(5, len(train)), random_state=42
    )
    for non_title in nonrecs["name"].values:
        rec_genre = set(train.loc[indices[non_title], "genre"].split(", "))
        y_true.append(1 if len(true_genre & rec_genre) > 0 else 0)
        y_pred.append(0)  # predicted not relevant

precision = precision_score(y_true, y_pred, zero_division=0)
recall = recall_score(y_true, y_pred, zero_division=0)
f1 = f1_score(y_true, y_pred, zero_division=0)

print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)


Precision: 0.0
Recall: 0.0
F1 Score: 0.0


1️⃣ Difference between User-Based and Item-Based Collaborative Filtering

| Feature                    | User-Based Collaborative Filtering                                                                                                 | Item-Based Collaborative Filtering                                                                      |
| -------------------------- | ---------------------------------------------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------- |
| **Definition**             | Finds users who behave similarly to the target user and recommends the items those similar users liked.                            | Finds items similar to the ones the target user has already interacted with and recommends those items. |
| **Similarity Computation** | Similarity is calculated between **users** based on their item ratings or interactions.                                            | Similarity is calculated between **items** based on how users have rated or interacted with them.       |
| **Recommendation Logic**   | “People like you liked these items.”                                                                                               | “Items like the ones you liked may interest you.”                                                       |
| **Strengths**              | Good when user patterns are stable and the number of users is not extremely large.                                                 | More scalable and stable because items change less frequently than users.                               |
| **Example**                | If User A and User B both rated similar movies highly, recommend to User A the movies that User B liked but User A hasn’t watched. | If a user liked *Movie X*, recommend *Movie Y* if most users who liked X also liked Y.                  |


2️⃣ What is Collaborative Filtering and How It Works

Definition:
Collaborative Filtering (CF) is a technique used in recommender systems to predict a user’s interests by collecting and analyzing information from many users’ behaviors (ratings, clicks, purchases). It does not need explicit information about the content of items (like genres or descriptions).

How it works (steps):

Create a User–Item Interaction Matrix
Rows represent users, columns represent items. Cells contain ratings or implicit feedback (clicks, purchases).

Compute Similarities

In user-based CF: find users with similar behavior patterns.

In item-based CF: find items that are commonly liked or purchased together.

Predict Missing Preferences
Estimate how likely the user is to like an unseen item based on ratings of similar users or similar items.

Generate Recommendations
Rank items by the predicted score and present the top-N items to the user.

Example:
Netflix or Amazon recommend content using collaborative filtering:

“Users who watched Inception also watched Interstellar” (item-based).

“People with viewing habits like yours watched these shows” (user-based)