In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer
from datasets import load_dataset

In [2]:
# -------------------------------
# Step 1: Load Dataset from Hugging Face
# -------------------------------
dataset = load_dataset("McAuley-Lab/Amazon-Reviews-2023", "raw_meta_Movies_and_TV", trust_remote_code=True, split='full')

In [3]:
# Convert dataset to Pandas DataFrame
df = pd.DataFrame(dataset)

df_original_backup = df.copy()

In [12]:
# df = df_original_backup.copy()

In [13]:
dataset[0].keys()

dict_keys(['main_category', 'title', 'average_rating', 'rating_number', 'features', 'description', 'price', 'images', 'videos', 'store', 'categories', 'details', 'parent_asin', 'bought_together', 'subtitle', 'author'])

In [14]:
print(f"Length of original dataset: {len(df)}")

# Retain relevant columns and drop missing descriptions
df = df[['title', 'description', 'average_rating', 'rating_number', 'price', 'categories']]

# Drop rows where description is an empty list
df = df[df['description'].apply(lambda x: isinstance(x, list) and len(x) > 0)]

# Reset index
df.reset_index(drop=True, inplace=True)
print(f"Length of filtered dataset, after removing blank descriptions: {len(df)}")

Length of original dataset: 748224
Length of filtered dataset, after removing blank descriptions: 347683


In [15]:
# sample the df for easier management
df = df.sample(10000).reset_index(drop=True)

In [16]:
# -------------------------------
# Step 2: Traditional Recommendation Engine (TF-IDF + K-Means)
# -------------------------------

# 2.1 TF–IDF Vectorization of product descriptions
tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)
tfidf_matrix = tfidf_vectorizer.fit_transform(df['description'].apply(lambda x: ' '.join(x)))

# 2.2 Clustering with K-Means
num_clusters = 10  # Adjust based on dataset size
kmeans = KMeans(n_clusters=num_clusters, random_state=42)
df['cluster'] = kmeans.fit_predict(tfidf_matrix)

def traditional_recommendations(title, top_n=5):
    """
    Given a movie title, recommend top_n similar movies based on TF-IDF clustering.
    """
    # Find the cluster of the given movie
    product_cluster = df.loc[df['title'] == title, 'cluster'].values[0]
    
    # Filter products in the same cluster (excluding the queried product)
    cluster_products = df[(df['cluster'] == product_cluster) & (df['title'] != title)]
    
    # Return the top_n recommendations
    return cluster_products.head(top_n)[['title', 'description', 'average_rating', 'price']]

In [17]:
# -------------------------------
# Step 3: SBERT-Based Recommendation Engine
# -------------------------------

# 3.1 Load SBERT model
sbert_model = SentenceTransformer('all-MiniLM-L6-v2')

# 3.2 Compute SBERT embeddings for product descriptions
df['sbert_embedding'] = df['description'].apply(lambda x: sbert_model.encode(' '.join(x)))

# Stack embeddings for similarity computation
embeddings = np.vstack(df['sbert_embedding'].values)

In [18]:
def sbert_recommendations(title, top_n=5):
    """
    Given a movie title, recommend top_n similar movies using SBERT embeddings.
    """
    # Find the embedding for the queried movie
    idx = df.index[df['title'] == title][0]
    query_embedding = embeddings[idx].reshape(1, -1)
    
    # Compute cosine similarity between the query movie and all others
    sim_scores = cosine_similarity(query_embedding, embeddings).flatten()
    
    # Exclude the movie itself
    sim_scores[idx] = -np.inf
    
    # Get indices of top_n similar movies
    top_indices = sim_scores.argsort()[-top_n:][::-1]
    recommendations = df.iloc[top_indices]
    return recommendations[['title', 'description', 'average_rating', 'price']]

In [27]:
# -------------------------------
# Step 4: Test & Compare Recommendations
# -------------------------------

# Select a test movie
test_movie = df['title'].iloc[100]
display(f"Test Movie: {test_movie}")
display(df.loc[df['title'] == test_movie, ['title', 'description', 'average_rating', 'price']])

# Traditional recommendations
display("\nTraditional (TF-IDF + K-Means) Recommendations:")
display(traditional_recommendations(test_movie, top_n=5))

# SBERT-based recommendations
display("\nSBERT-Based Recommendations:")
display(sbert_recommendations(test_movie, top_n=5))

'Test Movie: Attack on Titan: Season 3 - Part 2 [Blu-ray]'

Unnamed: 0,title,description,average_rating,price
100,Attack on Titan: Season 3 - Part 2 [Blu-ray],"[Once again, Eren faces the Titans in his home...",4.9,27.88


'\nTraditional (TF-IDF + K-Means) Recommendations:'

Unnamed: 0,title,description,average_rating,price
24,Farscape - The Best of Season One,"[Product Description, Farscape: The Best of Se...",4.5,7.1
51,Strike Back: The Complete Fifth Season (DVD + ...,[A fresh team of fearless commandos are recrui...,4.6,47.0
52,I Spy - Season 2 [DVD],[Robert Culp and Bill Cosby star as internatio...,4.4,28.5
85,Game of Thrones: The Complete Seasons 1-7 (DVD),[Game of Thrones: The Complete Seasons 1-7 (DV...,4.8,
109,The Flintstones: The Complete Series [DVD],[All 166 Episodes of a primetime tv classic on...,4.7,149.86


'\nSBERT-Based Recommendations:'

Unnamed: 0,title,description,average_rating,price
4252,Blue Exorcist: The Movie [DVD],"[When his adopted father, Shiro Fujimoto, was ...",4.2,7.65
5380,Insurgent [DVD],"[Insurgent, , the next gripping action-adventu...",4.7,8.97
5726,Teen Titans: Volume 2 - Switched,[Teen Titans: Switched - Season 1 Vol. 2 (DVD)],4.7,6.63
51,Strike Back: The Complete Fifth Season (DVD + ...,[A fresh team of fearless commandos are recrui...,4.6,47.0
2825,Transformers Animated: Season 3,"[Following the events of the season 2 finale, ...",4.9,10.46
