In [2]:
!pip install pandas
!pip install scikit-surprise
!pip install notebook

Collecting fqdn (from jsonschema[format-nongpl]>=4.18.0->jupyter-events>=0.9.0->jupyter-server<3,>=2.4.0->notebook)
  Downloading fqdn-1.5.1-py3-none-any.whl.metadata (1.4 kB)
Collecting isoduration (from jsonschema[format-nongpl]>=4.18.0->jupyter-events>=0.9.0->jupyter-server<3,>=2.4.0->notebook)
  Downloading isoduration-20.11.0-py3-none-any.whl.metadata (5.7 kB)
Collecting uri-template (from jsonschema[format-nongpl]>=4.18.0->jupyter-events>=0.9.0->jupyter-server<3,>=2.4.0->notebook)
  Downloading uri_template-1.3.0-py3-none-any.whl.metadata (8.8 kB)
Collecting webcolors>=24.6.0 (from jsonschema[format-nongpl]>=4.18.0->jupyter-events>=0.9.0->jupyter-server<3,>=2.4.0->notebook)
  Downloading webcolors-25.10.0-py3-none-any.whl.metadata (2.2 kB)
Downloading webcolors-25.10.0-py3-none-any.whl (14 kB)
Downloading fqdn-1.5.1-py3-none-any.whl (9.1 kB)
Downloading isoduration-20.11.0-py3-none-any.whl (11 kB)
Downloading uri_template-1.3.0-py3-none-any.whl (11 kB)
Installing collected packag

In [3]:
import pandas as pd
from surprise import Dataset, Reader, KNNBasic
from surprise.model_selection import train_test_split
from collections import defaultdict

print("Libraries imported successfully.")

Libraries imported successfully.


In [15]:
# Path of the transactions CSV, relative to the notebook's working directory.
# Kept in one place so the read call and the error message cannot disagree.
DATA_PATH = 'Downloads/data.csv'

try:
    df = pd.read_csv(DATA_PATH, encoding='ISO-8859-1')
except FileNotFoundError:
    # Report the path that was actually tried.
    print(f"Error: '{DATA_PATH}' not found.")
    print("Please make sure the dataset is in the correct directory.")

    # Downstream cells test `df is not None` before doing any work.
    df = None
else:
    print("Dataset loaded successfully.")
    print("-----------------------------------")
    print("Data Head:")
    print(df.head())
    print("\n-----------------------------------")
    print("Data Info:")
    df.info()

Dataset loaded successfully.
-----------------------------------
Data Head:
  InvoiceNo StockCode                          Description  Quantity  \
0    536365    85123A   WHITE HANGING HEART T-LIGHT HOLDER         6   
1    536365     71053                  WHITE METAL LANTERN         6   
2    536365    84406B       CREAM CUPID HEARTS COAT HANGER         8   
3    536365    84029G  KNITTED UNION FLAG HOT WATER BOTTLE         6   
4    536365    84029E       RED WOOLLY HOTTIE WHITE HEART.         6   

      InvoiceDate  UnitPrice  CustomerID         Country  
0  12/1/2010 8:26       2.55     17850.0  United Kingdom  
1  12/1/2010 8:26       3.39     17850.0  United Kingdom  
2  12/1/2010 8:26       2.75     17850.0  United Kingdom  
3  12/1/2010 8:26       3.39     17850.0  United Kingdom  
4  12/1/2010 8:26       3.39     17850.0  United Kingdom  

-----------------------------------
Data Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 541909 entries, 0 to 541908
Data column

In [8]:
if df is not None:
    # Minimum number of rows an item / a customer must appear in to be kept.
    min_item_interactions = 5
    min_user_interactions = 5

    # Drop rows without a CustomerID, drop returns (Quantity <= 0) and cast the
    # ID to int. `.assign` returns a new frame, so no value is written into a
    # filtered slice (which is what triggers pandas' SettingWithCopyWarning).
    df_cleaned = (
        df.dropna(subset=['CustomerID'])
        .loc[lambda frame: frame['Quantity'] > 0]
        .assign(CustomerID=lambda frame: frame['CustomerID'].astype(int))
    )

    # Filter items: keep StockCodes that occur in at least
    # `min_item_interactions` rows.
    item_counts = df_cleaned['StockCode'].value_counts()
    items_to_keep = item_counts[item_counts >= min_item_interactions].index
    df_filtered = df_cleaned[df_cleaned['StockCode'].isin(items_to_keep)]

    # Filter users: keep customers that occur in at least
    # `min_user_interactions` of the remaining rows.
    user_counts = df_filtered['CustomerID'].value_counts()
    users_to_keep = user_counts[user_counts >= min_user_interactions].index
    df_final = df_filtered[df_filtered['CustomerID'].isin(users_to_keep)]

    print("\n-----------------------------------")
    print("Data cleaned and filtered.")
    print(f"Original shape: {df.shape}")
    print(f"Shape after cleaning: {df_final.shape}")

    # Create the data in the format required by Surprise: (user, item, rating),
    # using Quantity as the implicit 'rating'. `rename` builds a new frame
    # instead of overwriting `.columns` on a selection of `df_final`.
    # NOTE(review): rows are not aggregated here, so the same (customer, item)
    # pair may appear more than once -- confirm whether that is intended.
    data_for_surprise = df_final[['CustomerID', 'StockCode', 'Quantity']].rename(
        columns={'CustomerID': 'userID', 'StockCode': 'itemID', 'Quantity': 'rating'}
    )


-----------------------------------
Data cleaned and filtered.
Original shape: (541909, 8)
Shape after cleaning: (396370, 8)


In [9]:
if 'data_for_surprise' in locals():
    # The Reader must be told the rating scale explicitly. Quantity can be
    # large, so the bounds are taken from the observed minimum and maximum.
    quantity_ratings = data_for_surprise['rating']
    reader = Reader(
        rating_scale=(quantity_ratings.min(), quantity_ratings.max())
    )

    # Wrap the (userID, itemID, rating) dataframe as a Surprise dataset.
    data = Dataset.load_from_df(data_for_surprise, reader)

    # Hold out 20% of the rows for testing; the seed keeps the split fixed.
    trainset, testset = train_test_split(
        data,
        test_size=0.2,
        random_state=42,
    )

    print("\n-----------------------------------")
    print("Data loaded into Surprise and split into training and testing sets.")


-----------------------------------
Data loaded into Surprise and split into training and testing sets.


In [10]:
if 'trainset' in locals():
    # Similarity settings: cosine similarity computed between users.
    sim_options_user = dict(name='cosine', user_based=True)

    # Build the KNN model with those settings and fit it on the training set.
    algo_user_based = KNNBasic(sim_options=sim_options_user)
    algo_user_based.fit(trainset)

    print("\nUser-Based Collaborative Filtering model trained.")


Computing the cosine similarity matrix...
Done computing similarity matrix.

User-Based Collaborative Filtering model trained.


In [11]:
if 'trainset' in locals():
    # Similarity settings: cosine similarity computed between items.
    sim_options_item = dict(name='cosine', user_based=False)

    # Build the KNN model with those settings and fit it on the training set.
    algo_item_based = KNNBasic(sim_options=sim_options_item)
    algo_item_based.fit(trainset)

    print("Item-Based Collaborative Filtering model trained.")

Computing the cosine similarity matrix...
Done computing similarity matrix.
Item-Based Collaborative Filtering model trained.


In [12]:
def precision_recall_at_k(predictions, k=10, threshold=1):
    """
    Return the precision@k and recall@k averaged over all users.

    Parameters
    ----------
    predictions : iterable of 5-tuples
        Items unpack as (uid, iid, true_r, est, details); only `uid`,
        `true_r` and `est` are used.
    k : int
        Number of highest-estimated items considered per user.
    threshold : number
        An item is 'relevant' when its true rating is >= `threshold` and
        'recommended' when its estimated rating is >= `threshold`.

    Returns
    -------
    (float, float)
        Mean precision@k and mean recall@k over the users present in
        `predictions`. A user with no recommended (resp. relevant) items
        contributes 0 to the precision (resp. recall) average. Returns
        (0.0, 0.0) when `predictions` is empty.
    """
    # Group (estimated, true) rating pairs by user.
    user_est_true = defaultdict(list)
    for uid, _, true_r, est, _ in predictions:
        user_est_true[uid].append((est, true_r))

    # No users at all: avoid dividing by zero when averaging below.
    if not user_est_true:
        return 0.0, 0.0

    precisions = {}
    recalls = {}
    for uid, user_ratings in user_est_true.items():
        # Highest estimated rating first; the top k are the recommendations.
        user_ratings.sort(key=lambda pair: pair[0], reverse=True)
        top_k = user_ratings[:k]

        # Relevant items are counted over ALL of the user's items, not just top k.
        n_rel = sum((true_r >= threshold) for (_, true_r) in user_ratings)

        # Recommended items within the top k.
        n_rec_k = sum((est >= threshold) for (est, _) in top_k)

        # Items in the top k that are both relevant and recommended.
        n_rel_and_rec_k = sum(
            ((true_r >= threshold) and (est >= threshold))
            for (est, true_r) in top_k
        )

        # Precision@k: proportion of recommended items that are relevant.
        precisions[uid] = n_rel_and_rec_k / n_rec_k if n_rec_k != 0 else 0

        # Recall@k: proportion of relevant items that are recommended.
        recalls[uid] = n_rel_and_rec_k / n_rel if n_rel != 0 else 0

    n_users = len(user_est_true)
    return sum(precisions.values()) / n_users, sum(recalls.values()) / n_users

In [16]:
# Both models are used below, so both must exist before evaluating (the
# original guard only checked the user-based one).
if ('testset' in locals()
        and 'algo_user_based' in locals()
        and 'algo_item_based' in locals()):
    k = 10

    # Making predictions on the test set
    predictions_user = algo_user_based.test(testset)
    predictions_item = algo_item_based.test(testset)

    # NOTE(review): the default threshold of precision_recall_at_k is 1 and the
    # cleaning step keeps only Quantity > 0, so presumably every test item counts
    # as relevant; that would explain a Precision@k of exactly 1.0 and identical
    # recall for both models. Confirm and consider passing a higher `threshold`.

    # Calculating Precision@k and Recall@k for User-Based Model
    precision_user, recall_user = precision_recall_at_k(predictions_user, k=k)
    print("Evaluating models with k =", k)
    print(f"User-Based CF - Precision@{k}: {precision_user:.4f}")
    print(f"User-Based CF - Recall@{k}:    {recall_user:.4f}")

    # Calculating Precision@k and Recall@k for Item-Based Model
    precision_item, recall_item = precision_recall_at_k(predictions_item, k=k)
    print(f"Item-Based CF - Precision@{k}: {precision_item:.4f}")
    print(f"Item-Based CF - Recall@{k}:    {recall_item:.4f}")

Evaluating models with k = 10
User-Based CF - Precision@10: 1.0000
User-Based CF - Recall@10:    0.7496
Item-Based CF - Precision@10: 1.0000
Item-Based CF - Recall@10:    0.7496


In [14]:
def get_top_n_recommendations(algo, user_id, n=5):
    """
    Get the top N item recommendations for a specific user.

    Reads the notebook-level frames `data_for_surprise` (columns userID,
    itemID) and `df_final` (columns StockCode, Description).

    Parameters
    ----------
    algo : object
        Model exposing `predict(user_id, item_id)`; the returned prediction
        must have `.iid` and `.est` attributes.
    user_id : hashable
        The user to recommend for, as stored in `data_for_surprise['userID']`.
    n : int
        Maximum number of recommendations to return.

    Returns
    -------
    list of (item_id, description, estimated_rating)
        At most `n` tuples, highest estimated rating first, restricted to
        items the user has no row for in `data_for_surprise`.
    """
    # 1. All known item IDs.
    all_item_ids = data_for_surprise['itemID'].unique()

    # 2. Items the user already purchased. A set gives constant-time membership
    #    tests instead of scanning an array once per candidate item.
    items_purchased_by_user = set(
        data_for_surprise.loc[data_for_surprise['userID'] == user_id, 'itemID']
    )

    # 3. Predict a rating for every item the user has not purchased yet.
    items_to_predict = [
        item for item in all_item_ids if item not in items_purchased_by_user
    ]
    predictions = [algo.predict(user_id, item_id) for item_id in items_to_predict]

    # 4. Highest estimated rating first, then keep the top N.
    predictions.sort(key=lambda pred: pred.est, reverse=True)
    top_n = predictions[:n]

    # 5. Map StockCode -> Description for readability. Deduplicating on
    #    StockCode alone keeps exactly one (the first seen) description per
    #    item, so the lookup below always yields a single value even when a
    #    code appears with several description spellings.
    item_descriptions = (
        df_final[['StockCode', 'Description']]
        .drop_duplicates(subset='StockCode')
        .set_index('StockCode')['Description']
    )

    return [
        (pred.iid, item_descriptions.loc[pred.iid], pred.est)
        for pred in top_n
    ]


if 'data_for_surprise' in locals() and 'algo_item_based' in locals():
    # Take the first customer of the filtered index as the demo user.
    sample_user_id = users_to_keep[0]

    print(f"\nGenerating Top 5 Recommendations for Sample User: {sample_user_id}")

    # Recommendations come from the item-based model.
    top_5_recs = get_top_n_recommendations(
        algo_item_based,
        user_id=sample_user_id,
        n=5,
    )

    print("\nTop 5 Product Recommendations:")
    for i, recommendation in enumerate(top_5_recs):
        item_id, description, est_rating = recommendation
        print(f"{i+1}. Item ID (StockCode): {item_id}")
        print(f"   Description: {description}")
        print(f"   Predicted Score: {est_rating:.2f}\n")


Generating Top 5 Recommendations for Sample User: 17841

Top 5 Product Recommendations:
1. Item ID (StockCode): 20898
   Description: VINTAGE NOTEBOOK TRAVELOGUE
   Predicted Score: 7.62

2. Item ID (StockCode): 22262
   Description: FELT EGG COSY CHICKEN
   Predicted Score: 6.80

3. Item ID (StockCode): 22304
   Description: COFFEE MUG BLUE PAISLEY DESIGN
   Predicted Score: 6.47

4. Item ID (StockCode): 22398
   Description: MAGNETS PACK OF 4 SWALLOWS
   Predicted Score: 6.15

5. Item ID (StockCode): 21788
   Description: KIDS RAIN MAC BLUE
   Predicted Score: 6.10

