# 🛒 Online Retail Recommendation System
This project builds a product recommendation system using online retail transaction data and machine learning techniques (item-based collaborative filtering).

In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse import csr_matrix
import warnings
warnings.filterwarnings('ignore')

# Step 2: Load Dataset (CSV File)

In [21]:
# Read the raw transaction export; the CSV is expected to sit in the
# notebook's working directory.
df = pd.read_csv(filepath_or_buffer="OnlineRetail.csv")
print(" Data Loaded Successfully")
# Leave the frame as the cell's final expression so Jupyter renders it.
df

 Data Loaded Successfully


Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,12/1/2010 8:26,2.55,17850.0,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,12/1/2010 8:26,3.39,17850.0,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,12/1/2010 8:26,2.75,17850.0,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,12/1/2010 8:26,3.39,17850.0,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,12/1/2010 8:26,3.39,17850.0,United Kingdom
...,...,...,...,...,...,...,...,...
541904,581587,22613,PACK OF 20 SPACEBOY NAPKINS,12,12/9/2011 12:50,0.85,12680.0,France
541905,581587,22899,CHILDREN'S APRON DOLLY GIRL,6,12/9/2011 12:50,2.10,12680.0,France
541906,581587,23254,CHILDRENS CUTLERY DOLLY GIRL,4,12/9/2011 12:50,4.15,12680.0,France
541907,581587,23255,CHILDRENS CUTLERY CIRCUS PARADE,4,12/9/2011 12:50,4.15,12680.0,France


In [22]:
# Structural overview: dimensions, column labels, and the first rows.
print(df.shape, df.columns, df.head(), sep="\n")

# Per-column count of missing values.
print("\nNull values in each column:", df.isna().sum(), sep="\n")

(541909, 8)
Index(['InvoiceNo', 'StockCode', 'Description', 'Quantity', 'InvoiceDate',
       'UnitPrice', 'CustomerID', 'Country'],
      dtype='object')
  InvoiceNo StockCode                          Description  Quantity  \
0    536365    85123A   WHITE HANGING HEART T-LIGHT HOLDER         6   
1    536365     71053                  WHITE METAL LANTERN         6   
2    536365    84406B       CREAM CUPID HEARTS COAT HANGER         8   
3    536365    84029G  KNITTED UNION FLAG HOT WATER BOTTLE         6   
4    536365    84029E       RED WOOLLY HOTTIE WHITE HEART.         6   

      InvoiceDate  UnitPrice  CustomerID         Country  
0  12/1/2010 8:26       2.55     17850.0  United Kingdom  
1  12/1/2010 8:26       3.39     17850.0  United Kingdom  
2  12/1/2010 8:26       2.75     17850.0  United Kingdom  
3  12/1/2010 8:26       3.39     17850.0  United Kingdom  
4  12/1/2010 8:26       3.39     17850.0  United Kingdom  

Null values in each column:
InvoiceNo           0
StockCo

# Step 4: Data Cleaning

In [23]:
# Clean the transactions in a single chain. Every step returns a new frame
# instead of mutating a filtered slice in place, which avoids pandas'
# SettingWithCopyWarning and keeps the cell safe to re-run.
df = (
    df
    # Rows without a customer ID or a product description are dropped.
    .dropna(subset=['CustomerID', 'Description'])
    # Keep only rows with a strictly positive quantity.
    .loc[lambda frame: frame['Quantity'] > 0]
    # Trim product names BEFORE de-duplicating so rows that differ only by
    # stray leading/trailing whitespace are recognised as duplicates.
    .assign(Description=lambda frame: frame['Description'].str.strip())
    # Remove rows that are identical across every column.
    .drop_duplicates()
)

print("Data Cleaned")

Data Cleaned


In [14]:
# Count the missing values in each column of the current frame.
df.isna().sum()

InvoiceNo      0
StockCode      0
Description    0
Quantity       0
InvoiceDate    0
UnitPrice      0
CustomerID     0
Country        0
dtype: int64

# Step 5: Optional - Filter Frequent Products

In [24]:
# Number of rows recorded for each product description.
popular_products = df['Description'].value_counts()
# Look up every row's product count and retain rows whose product appears
# in more than 50 rows.
df = df.loc[df['Description'].map(popular_products).gt(50)]

print("Rare products removed (optional step)")

Rare products removed (optional step)


# Step 6: Create Customer-Product Matrix

In [25]:
# One row per customer, one column per product; each cell holds the summed
# quantity for that customer/product pair.
basket = pd.pivot_table(
    df,
    values='Quantity',
    index='CustomerID',
    columns='Description',
    aggfunc='sum',
)
# Pairs with no matching rows come back as NaN; store them as 0.
basket = basket.fillna(0)

print("Customer-Product Matrix Created")

Customer-Product Matrix Created


# Step 7: Convert to Sparse Matrix & Compute Similarity

In [26]:
# Store the customer-product matrix in compressed sparse row format.
sparse_matrix = csr_matrix(basket)

# Transpose so each row is a product, then score every pair of products.
item_similarity = cosine_similarity(sparse_matrix.transpose())

# Label both axes with the product names so scores can be looked up by name.
item_similarity_df = pd.DataFrame(
    data=item_similarity,
    columns=basket.columns,
    index=basket.columns,
)

print("Item Similarity Matrix Ready")

Item Similarity Matrix Ready


# Step 8: Recommend Function

In [27]:
def recommend_items(product_name, num_recommendations=5, similarity_df=None):
    """Print the products most similar to ``product_name``.

    Parameters
    ----------
    product_name : str
        Product description to look up; surrounding whitespace is ignored.
    num_recommendations : int, default 5
        Maximum number of similar products to print. Values below 1 print
        no recommendations.
    similarity_df : pandas.DataFrame, optional
        Item-item similarity table whose index and columns are product
        names. Defaults to the notebook-level ``item_similarity_df``.

    Returns
    -------
    str or None
        ``"Product not found in dataset."`` when the product is not a column
        of the similarity table; otherwise ``None`` after printing the list.
    """
    if similarity_df is None:
        # Fall back to the similarity table defined at notebook level.
        similarity_df = item_similarity_df

    product_name = product_name.strip()
    if product_name not in similarity_df.columns:
        return "Product not found in dataset."

    print(f"\n Recommendations for: {product_name}\n")

    # Exclude the queried product by label rather than assuming it sorts
    # first: another product can tie with it at the top score, in which case
    # skipping position 0 would list the queried product as its own
    # recommendation and hide a genuine neighbour.
    candidate_scores = similarity_df[product_name].drop(
        labels=product_name, errors="ignore"
    )
    # A stable sort keeps tied scores in the table's existing order, so the
    # listing is the same on every call.
    ranked_scores = candidate_scores.sort_values(ascending=False, kind="mergesort")
    # Clamp at zero so a negative request prints nothing instead of slicing
    # from the end of the ranking.
    similar_items = ranked_scores.head(max(num_recommendations, 0))

    for i, (item, score) in enumerate(similar_items.items(), 1):
        print(f"{i}. {item} (Similarity Score: {score:.2f})")

# Step 9: Test the Recommender

In [28]:
# Swap in any product description that exists in the dataset.
test_product = "CHILDREN'S APRON DOLLY GIRL"
recommend_items(product_name=test_product, num_recommendations=5)


 Recommendations for: CHILDREN'S APRON DOLLY GIRL

1. 5 HOOK HANGER RED MAGIC TOADSTOOL (Similarity Score: 0.95)
2. FOLDING BUTTERFLY MIRROR RED (Similarity Score: 0.93)
3. FOOD CONTAINER SET 3 LOVE HEART (Similarity Score: 0.93)
4. MAGNETS PACK OF 4 CHILDHOOD MEMORY (Similarity Score: 0.93)
5. 36 FOIL HEART CAKE CASES (Similarity Score: 0.93)
