In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score

In [2]:
# Load the dataset
df = pd.read_csv('anime.csv')

In [3]:

# Drop any rows with missing values
df.dropna(inplace=True)

In [4]:
# Print the shape of the DataFrame before dropping rows
print("Before dropping rows:", df.shape)

# Drop any rows with missing values
df.dropna(inplace=True)

# Print the shape of the DataFrame after dropping rows
print("After dropping rows:", df.shape)


Before dropping rows: (12017, 7)
After dropping rows: (12017, 7)


In [5]:
# Define features to use for recommendation
features = ['genre', 'type', 'rating']

In [6]:

# Print the list of features
print(features)

['genre', 'type', 'rating']


In [7]:
# Create a TF-IDF vectorizer for genre and type
vectorizer = TfidfVectorizer()

In [8]:
# Fit the vectorizer to the genre and type columns and transform the data
genre_type_vectors = vectorizer.fit_transform(df['genre'] + ' ' + df['type'])

# Print the shape of the resulting matrix
print(genre_type_vectors.shape)

(12017, 52)


In [11]:
# Normalize the rating column
df['rating'] = df['rating'] / df['rating'].max()
print(df)

       anime_id                                               name  \
0         32281                                     Kimi no Na wa.   
1          5114                   Fullmetal Alchemist: Brotherhood   
2         28977                                           Gintama°   
3          9253                                        Steins;Gate   
4          9969                                      Gintama&#039;   
...         ...                                                ...   
12289      9316       Toushindai My Lover: Minami tai Mecha-Minami   
12290      5543                                        Under World   
12291      5621                     Violence Gekiga David no Hoshi   
12292      6133  Violence Gekiga Shin David no Hoshi: Inma Dens...   
12293     26081                   Yasuji no Pornorama: Yacchimae!!   

                                                   genre   type episodes  \
0                   Drama, Romance, School, Supernatural  Movie        1   
1      

In [15]:
# Add the rating column to the vectorized features
features_vectors = pd.concat([pd.DataFrame(genre_type_vectors.toarray()), df[['rating']]], axis=1)
print(features_vectors)

             0         1    2    3    4         5    6    7         8    9  \
0      0.00000  0.000000  0.0  0.0  0.0  0.000000  0.0  0.0  0.405026  0.0   
1      0.28540  0.306009  0.0  0.0  0.0  0.000000  0.0  0.0  0.324302  0.0   
2      0.24559  0.000000  0.0  0.0  0.0  0.195601  0.0  0.0  0.000000  0.0   
3      0.00000  0.000000  0.0  0.0  0.0  0.000000  0.0  0.0  0.000000  0.0   
4      0.24559  0.000000  0.0  0.0  0.0  0.195601  0.0  0.0  0.000000  0.0   
...        ...       ...  ...  ...  ...       ...  ...  ...       ...  ...   
12289      NaN       NaN  NaN  NaN  NaN       NaN  NaN  NaN       NaN  NaN   
12290      NaN       NaN  NaN  NaN  NaN       NaN  NaN  NaN       NaN  NaN   
12291      NaN       NaN  NaN  NaN  NaN       NaN  NaN  NaN       NaN  NaN   
12292      NaN       NaN  NaN  NaN  NaN       NaN  NaN  NaN       NaN  NaN   
12293      NaN       NaN  NaN  NaN  NaN       NaN  NaN  NaN       NaN  NaN   

       ...   43   44   45       46        47        48   49   5

In [21]:
import numpy as np

# Get the shape of the features_vectors array
num_samples, num_features = features_vectors.shape

# Define the test size (e.g., 20% of the data)
test_size = 0.2

# Calculate the number of test samples
num_test_samples = int(num_samples * test_size)

# Split the data into training and testing sets
train_features = features_vectors[:-num_test_samples]
test_features = features_vectors[-num_test_samples:]

train_labels = df['rating'].iloc[:-num_test_samples]
test_labels = df['rating'].iloc[-num_test_samples:]

In [22]:
print("Training Features Shape:", train_features.shape)

Training Features Shape: (9400, 53)


In [23]:
print("Testing Features Shape:", test_features.shape)
print("Training Labels Shape:", train_labels.shape)
print("Testing Labels Shape:", test_labels.shape)

Testing Features Shape: (2350, 53)
Training Labels Shape: (9667,)
Testing Labels Shape: (2350,)


In [26]:
print("Training Features:\n", train_features)
print("Testing Features:\n", test_features)
print("Training Labels:\n", train_labels)
print("Testing Labels:\n", test_labels)

Training Features:
             0         1    2    3    4         5    6    7         8    9  \
0     0.00000  0.000000  0.0  0.0  0.0  0.000000  0.0  0.0  0.405026  0.0   
1     0.28540  0.306009  0.0  0.0  0.0  0.000000  0.0  0.0  0.324302  0.0   
2     0.24559  0.000000  0.0  0.0  0.0  0.195601  0.0  0.0  0.000000  0.0   
3     0.00000  0.000000  0.0  0.0  0.0  0.000000  0.0  0.0  0.000000  0.0   
4     0.24559  0.000000  0.0  0.0  0.0  0.195601  0.0  0.0  0.000000  0.0   
...       ...       ...  ...  ...  ...       ...  ...  ...       ...  ...   
9395  0.00000  0.526659  0.0  0.0  0.0  0.391209  0.0  0.0  0.000000  0.0   
9396  0.00000  0.487885  0.0  0.0  0.0  0.362407  0.0  0.0  0.000000  0.0   
9397  0.00000  0.483826  0.0  0.0  0.0  0.359392  0.0  0.0  0.000000  0.0   
9398  0.00000  0.579018  0.0  0.0  0.0  0.000000  0.0  0.0  0.000000  0.0   
9399  0.00000  0.572267  0.0  0.0  0.0  0.000000  0.0  0.0  0.000000  0.0   

      ...   43   44   45       46        47        48  

In [27]:

# Define a function to compute cosine similarity
def compute_cosine_similarity(features):
    similarity = cosine_similarity(features)
    return similarity


In [29]:
# Compute the cosine similarity between the features
similarity = compute_cosine_similarity(features_vectors)

# Print the similarity matrix
print(similarity)

[[1.         0.5348909  0.46429286 ... 0.33948911 0.33948911 0.49153054]
 [0.5348909  1.         0.58193489 ... 0.33734932 0.33734932 0.33734932]
 [0.46429286 0.58193489 1.         ... 0.33715304 0.33715304 0.33715304]
 ...
 [0.33948911 0.33734932 0.33715304 ... 1.         1.         0.73349897]
 [0.33948911 0.33734932 0.33715304 ... 1.         1.         0.73349897]
 [0.49153054 0.33734932 0.33715304 ... 0.73349897 0.73349897 1.        ]]


In [30]:
similarity = compute_cosine_similarity(features_vectors)
print(similarity)

[[1.         0.5348909  0.46429286 ... 0.33948911 0.33948911 0.49153054]
 [0.5348909  1.         0.58193489 ... 0.33734932 0.33734932 0.33734932]
 [0.46429286 0.58193489 1.         ... 0.33715304 0.33715304 0.33715304]
 ...
 [0.33948911 0.33734932 0.33715304 ... 1.         1.         0.73349897]
 [0.33948911 0.33734932 0.33715304 ... 1.         1.         0.73349897]
 [0.49153054 0.33734932 0.33715304 ... 0.73349897 0.73349897 1.        ]]


In [34]:
num_recommendations = 10
top_indices = similarity.argsort()[-num_recommendations-1:-1]

In [36]:
num_recommendations = 10
top_indices = similarity.argsort()[-num_recommendations-1:-1]
print(top_indices)

[[ 8037  8036 10045 ... 11740 11741 11739]
 [ 8037  8036 10045 ... 11740 11741 11739]
 [ 8037  8036 10045 ... 11740 11741 11739]
 ...
 [ 8037  8036 10045 ... 11744 11745 11746]
 [ 8037  8036 10045 ... 11746 11748 11747]
 [ 8037  8036 10045 ... 11746 11748 11747]]


In [39]:
def get_top_ids(df, similarity, top_indices):
    return df['id'].iloc[top_indices]

In [47]:
def get_top_ids(df, similarity, top_indices):
    return df.iloc[:, 0].iloc[top_indices]
# Or, more simply:
def get_top_ids(df, similarity, top_indices):
    return df.iloc[top_indices, 0]

In [52]:
# Return the IDs of the recommended anime
def get_top_ids(df, similarity, top_indices):
    top_indices = top_indices.flatten()  # flatten the array
    return df.iloc[top_indices, 0]

top_ids = get_top_ids(df, similarity, top_indices)
print("Top IDs:", top_ids)


Top IDs: 8058      3287
8057     13405
10087    14781
8289     33863
9854     29435
         ...  
12011     3526
12012     6015
12013     2374
12015     2479
12014     3040
Name: anime_id, Length: 117500, dtype: int64


In [54]:
# Test the recommendation function
def recommend_anime(target_anime_id, num_recommendations):
    # Your recommendation logic goes here
    # For example:
    recommended_anime_ids = [1, 2, 3, 4, 5]  # Replace with your actual recommendation logic
    return recommended_anime_ids

target_anime_id = 123  # Replace with a valid anime ID
num_recommendations = 5
recommended_anime_ids = recommend_anime(target_anime_id, num_recommendations)
print(recommended_anime_ids)

[1, 2, 3, 4, 5]


In [55]:
# Evaluate the recommendation system using precision, recall, and F1-score
def evaluate_recommendation_system(recommended_anime_ids, actual_anime_ids):
    precision = precision_score(actual_anime_ids, recommended_anime_ids)
    recall = recall_score(actual_anime_ids, recommended_anime_ids)
    f1 = f1_score(actual_anime_ids, recommended_anime_ids)
    return precision, recall, f1

In [56]:
from sklearn.metrics import precision_score, recall_score, f1_score

# Evaluate the recommendation system using precision, recall, and F1-score
def evaluate_recommendation_system(recommended_anime_ids, actual_anime_ids):
    precision = precision_score(actual_anime_ids, recommended_anime_ids, average='macro')
    recall = recall_score(actual_anime_ids, recommended_anime_ids, average='macro')
    f1 = f1_score(actual_anime_ids, recommended_anime_ids, average='macro')
    return precision, recall, f1

In [58]:
recommended_anime_ids = [1, 2, 3, 4, 5]
actual_anime_ids = [1, 2, 3, 6, 7]

precision, recall, f1 = evaluate_recommendation_system(recommended_anime_ids, actual_anime_ids)
print("Precision:", precision)
print("Recall:", recall)
print("F1-score:",f1)

Precision: 0.42857142857142855
Recall: 0.42857142857142855
F1-score: 0.42857142857142855


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [60]:
# Test the evaluation function
actual_anime_ids = df['anime_id'].iloc[:num_recommendations]

In [61]:
recommended_anime_ids = df.iloc[:, 0].iloc[:num_recommendations]
actual_anime_ids = df.iloc[:, 0].iloc[:num_recommendations]

print("Recommended anime IDs:", recommended_anime_ids)
print("Actual anime IDs:", actual_anime_ids)

precision, recall, f1 = evaluate_recommendation_system(recommended_anime_ids, actual_anime_ids)

print("Precision:", precision)
print("Recall:", recall)
print("F1-score:", f1)

Recommended anime IDs: 0    32281
1     5114
2    28977
3     9253
4     9969
Name: anime_id, dtype: int64
Actual anime IDs: 0    32281
1     5114
2    28977
3     9253
4     9969
Name: anime_id, dtype: int64
Precision: 1.0
Recall: 1.0
F1-score: 1.0


In [None]:
Interview questions

1.Can you explain the difference between user-based and item-based collaborative filtering?

User-based collaborative filtering (UBCF) and item-based collaborative filtering (IBCF) are two approaches used in recommender systems.

User-Based Collaborative Filtering (UBCF)

- UBCF focuses on finding similar users to the active user.
- It calculates the similarity between users based on their past behavior (e.g., ratings, purchases).
- Once similar users are identified, the system recommends items that these similar users have liked or rated highly.

Item-Based Collaborative Filtering (IBCF)

- IBCF focuses on finding similar items to the ones the active user has liked or interacted with.
- It calculates the similarity between items based on their past interactions with users (e.g., ratings, purchases).
- Once similar items are identified, the system recommends these items to the active user.

Key Differences

- Focus: UBCF focuses on users, while IBCF focuses on items.
- Similarity Calculation: UBCF calculates similarity between users, while IBCF calculates similarity between items.
- Recommendation Approach: UBCF recommends items liked by similar users, while IBCF recommends items similar to the ones the user has liked.
    
2.What is collaborative filtering, and how does it work?

Collaborative filtering (CF) is a technique used in recommender systems to predict the interests of a user by collecting preferences from many users (collaborating). The underlying assumption of CF is that if a person A has the same opinion as a person B on an issue, A is more likely to have B's opinion on a different issue.

How Collaborative Filtering Works

1. Data Collection: Gather user-item interaction data (e.g., ratings, clicks, purchases).
2. Matrix Construction: Create a user-item interaction matrix, where rows represent users, columns represent items, and cell values represent the interaction (e.g., rating).
3. Similarity Calculation: Calculate the similarity between users (UBCF) or items (IBCF) using metrics such as cosine similarity, Pearson correlation, or Jaccard similarity.
4. Neighborhood Formation: Identify the most similar users (UBCF) or items (IBCF) to the active user or item.
5. Prediction: Generate predictions for the active user by aggregating the preferences of similar users (UBCF) or the attributes of similar items (IBCF).
6. Recommendation: Provide personalized recommendations to the user based on the predicted preferences.

Collaborative filtering is a powerful technique for building recommender systems, as it can effectively capture complex user preferences and item relationships.