In [1]:
!pip install scikit-surprise


Collecting scikit-surprise
  Downloading scikit_surprise-1.1.4.tar.gz (154 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/154.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m153.6/154.4 kB[0m [31m5.4 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m154.4/154.4 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Building wheels for collected packages: scikit-surprise
  Building wheel for scikit-surprise (pyproject.toml) ... [?25l[?25hdone
  Created wheel for scikit-surprise: filename=scikit_surprise-1.1.4-cp311-cp311-linux_x86_64.whl size=2505169 sha256=573bb17d3ceb9fe7312006a688d48f66ec118e16db34765e3a25eb0df049deb3
  Stored in directory: /root/.cache/pip/wheels/2a/8f/6e/7e28991

In [2]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from surprise import SVD
from surprise import Dataset
from surprise.model_selection import cross_validate
from surprise import Reader


In [4]:
# Load the ratings table. UserId and ProductId are read explicitly as strings
# so that numeric-looking IDs (e.g. '0205616461') keep their leading zeros
# instead of depending on pandas' type inference.
data = pd.read_csv('ratings_Beauty.csv', dtype={'UserId': str, 'ProductId': str})

# 1. Content-Based Filtering
def content_based_filtering(user_recent_product_id, top_n=10):
    """Return the products most similar to a given product (TF-IDF + cosine).

    The catalog is the set of distinct ``ProductId`` values in the module-level
    ``data`` frame. No product names or categories are available, so each
    product's ID string is used as its only text feature.

    Args:
        user_recent_product_id: ProductId to find neighbours for.
        top_n: Maximum number of product IDs to return (default 10).

    Returns:
        A list of up to ``top_n`` ProductId values ordered from most to least
        similar. The query product itself is never part of the result.

    Raises:
        IndexError: If ``user_recent_product_id`` does not occur in ``data``.
    """
    top_n = max(int(top_n), 0)

    # reset_index makes row labels coincide with row positions, so a position
    # found in `products` addresses the same row of the TF-IDF matrix.
    products = data[['ProductId']].drop_duplicates().reset_index(drop=True)

    # Placeholder feature text: the product ID itself.
    products['combined_features'] = products['ProductId']

    # Convert text data into TF-IDF features
    tfidf = TfidfVectorizer(stop_words='english')
    tfidf_matrix = tfidf.fit_transform(products['combined_features'])

    # Locate the query product by position (not by DataFrame label).
    matches = np.flatnonzero(
        (products['ProductId'] == user_recent_product_id).to_numpy()
    )
    if matches.size == 0:
        raise IndexError(
            f"ProductId {user_recent_product_id!r} not found in the product catalog"
        )
    product_pos = int(matches[0])

    # Only the query row is needed; computing the full n x n matrix would cost
    # memory quadratic in the number of products.
    sim_row = cosine_similarity(tfidf_matrix[product_pos], tfidf_matrix).ravel()

    # Stable descending order: ties keep catalog order.
    ranked = np.argsort(-sim_row, kind='stable')

    # Drop the query product explicitly rather than assuming it sorts first.
    ranked = ranked[ranked != product_pos][:top_n]

    return products['ProductId'].iloc[ranked].tolist()

In [5]:
# 2. Collaborative Filtering (Using Surprise Library)
def collaborative_filtering(user_id, top_n=10):
    """Recommend products for a user with a Surprise SVD model.

    Reads the ``UserId``, ``ProductId`` and ``Rating`` columns of the
    module-level ``data`` frame. Every call runs a 5-fold cross-validation
    (whose RMSE/MAE table is printed because ``verbose=True``), then fits the
    model on all ratings and ranks products by predicted rating.

    Args:
        user_id: UserId to produce recommendations for.
        top_n: Maximum number of product IDs to return (default 10).

    Returns:
        A list of up to ``top_n`` ProductId values with the highest predicted
        rating, excluding products the user has already rated.
    """
    top_n = max(int(top_n), 0)

    # Create a subset of the dataset for collaborative filtering
    ratings_data = data[['UserId', 'ProductId', 'Rating']]

    # Prepare the data for Surprise
    reader = Reader(rating_scale=(1, 5))
    surprise_data = Dataset.load_from_df(ratings_data, reader)

    # The final model is trained on every rating; there is no hold-out split.
    trainset = surprise_data.build_full_trainset()

    # Cross-validation here is diagnostic only (it prints error metrics);
    # the model is fitted again on the full trainset right after.
    model = SVD()
    cross_validate(model, surprise_data, cv=5, verbose=True)
    model.fit(trainset)

    # Products this user has already rated are not useful as recommendations.
    already_rated = set(
        ratings_data.loc[ratings_data['UserId'] == user_id, 'ProductId']
    )

    # Score every remaining product for the user.
    predictions = [
        (product_id, model.predict(user_id, product_id).est)
        for product_id in ratings_data['ProductId'].unique()
        if product_id not in already_rated
    ]

    # Highest predicted rating first; the sort is stable, so ties keep the
    # order in which products first appear in the data.
    predictions.sort(key=lambda pair: pair[1], reverse=True)

    return [product_id for product_id, _ in predictions[:top_n]]

# Demo: run both recommenders for one sample product and one sample user.
user_id = "A39HTATAQ9V7YF"  # sample user ID
user_recent_product_id = "0205616461"  # sample product ID

content_recs = content_based_filtering(user_recent_product_id)
print("Content-Based Recommendations:", content_recs)

collab_recs = collaborative_filtering(user_id)
print("Collaborative Filtering Recommendations:", collab_recs)


Content-Based Recommendations: ['0558925278', '0733001998', '0737104473', '0762451459', '1304139212', '1304139220', '130414089X', '130414643X', '1304146537', '130414674X']
Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    1.2778  1.2635  1.2576  1.2854  1.2708  1.2710  0.0099  
MAE (testset)     1.0039  0.9954  0.9909  1.0079  1.0006  0.9997  0.0060  
Fit time          3.35    2.51    2.04    2.03    1.90    2.37    0.53    
Test time         0.16    0.12    0.10    0.11    0.28    0.15    0.07    
Collaborative Filtering Recommendations: ['B00018TMV4', 'B00021C1LI', 'B00009YJSJ', 'B0000Y3D4G', 'B00029RD72', 'B000052YN6', 'B00025WYK2', 'B000026BTH', 'B0001EKRSU', 'B00008J2XQ']
