## 3. Recommendation System

We build an item-based collaborative filtering recommender that suggests product categories to customers, using the cosine similarity between categories' purchase patterns.

In [None]:
# Import libraries for recommendation systems
from sklearn.metrics.pairwise import cosine_similarity

# Prepare data for recommendation system
# Import additional libraries for recommendation systems
from scipy.sparse import csr_matrix
# Load the raw tables. `customers` is read here but is not referenced by the
# rest of the visible notebook code.
orders = pd.read_csv('data/olist_orders_dataset.csv')
order_items = pd.read_csv('data/olist_order_items_dataset.csv')
customers = pd.read_csv('data/olist_customers_dataset.csv')
products = pd.read_csv('data/olist_products_dataset.csv')

# Attach the buying customer and the product category to every order line.
# NOTE(review): `customer_id` may identify an order rather than a person in
# this dataset -- TODO confirm whether `customer_unique_id` (from `customers`)
# is the key that was intended here.
user_item_data = (
    order_items
    .merge(orders[['order_id', 'customer_id']], on='order_id')
    .merge(products[['product_id', 'product_category_name']], on='product_id')
)

# One row per (customer, category) pair with the number of order lines.
purchase_counts = (
    user_item_data
    .groupby(['customer_id', 'product_category_name'])
    .size()
    .reset_index(name='purchase_count')
)

# Customers as rows, categories as columns; pairs never bought become 0.
user_item_matrix = purchase_counts.pivot(
    index='customer_id', columns='product_category_name', values='purchase_count'
).fillna(0)

# Sparse copy of the same values, used as input to the similarity computation.
sparse_user_item = csr_matrix(user_item_matrix.values)

# Transpose so that each category (not each customer) is one vector, then
# compare every pair of category vectors.
item_similarity = cosine_similarity(sparse_user_item.T)

# Label both axes of the similarity matrix with the category names.
item_similarity_df = pd.DataFrame(
    item_similarity, index=user_item_matrix.columns, columns=user_item_matrix.columns
)

# Generate recommendations
# Function to get top N similar items
def get_similar_categories(category_name, n=5):
    """Return the categories most similar to ``category_name``.

    Reads the module-level ``item_similarity_df`` and ranks every other
    category by its similarity score to ``category_name``, highest first.

    Args:
        category_name: Label of the category to look up.
        n: Maximum number of similar categories to return.

    Returns:
        A pandas Series of similarity scores indexed by category label, in
        descending order. An empty float Series when ``category_name`` is not
        a label of ``item_similarity_df``.
    """
    if category_name not in item_similarity_df.index:
        # Explicit dtype: a bare pd.Series() warns on pandas 1.x and produces
        # an object-dtype Series on pandas 2.x, unlike the float result below.
        return pd.Series(dtype=float)

    similar_categories = item_similarity_df[category_name].sort_values(ascending=False)
    # A category always matches itself, so it is not a useful suggestion.
    similar_categories = similar_categories.drop(category_name, errors='ignore')
    return similar_categories.head(n)

# Function to recommend products for a customer
def recommend_for_customer(customer_id, n_recommendations=5):
    """Recommend product categories the customer has not bought yet.

    Each candidate category is scored by summing its similarity to every
    category the customer bought, weighted by the purchase count. Reads the
    module-level ``user_item_matrix`` and ``item_similarity_df``.

    Args:
        customer_id: Row label of the customer in ``user_item_matrix``.
        n_recommendations: Maximum number of categories to return.

    Returns:
        A pandas Series of scores indexed by category label, in descending
        order, excluding categories already purchased. An empty float Series
        when ``customer_id`` is not in ``user_item_matrix``.
    """
    if customer_id not in user_item_matrix.index:
        # Explicit dtype: a bare pd.Series() warns on pandas 1.x and produces
        # an object-dtype Series on pandas 2.x.
        return pd.Series(dtype=float)

    # The customer's purchase counts per category.
    customer_purchases = user_item_matrix.loc[customer_id]

    # score[i] = sum over c of similarity[i, c] * count[c]. Categories with a
    # zero count contribute nothing, so no explicit filtering is required.
    # Relies on item_similarity_df's columns carrying the same labels as
    # user_item_matrix's columns.
    recommendation_scores = item_similarity_df.dot(customer_purchases)

    # Do not recommend what the customer already bought.
    purchased_categories = customer_purchases[customer_purchases > 0].index
    recommendation_scores = recommendation_scores.drop(purchased_categories, errors='ignore')

    return recommendation_scores.sort_values(ascending=False).head(n_recommendations)

In [None]:
# Import additional libraries for recommendation systems
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse import csr_matrix

# Prepare data for the recommender (collaborative filtering on customer x
# category interactions). `customers` is read here but is not referenced by
# the rest of the visible notebook code.
orders = pd.read_csv('data/olist_orders_dataset.csv')
order_items = pd.read_csv('data/olist_order_items_dataset.csv')
customers = pd.read_csv('data/olist_customers_dataset.csv')
products = pd.read_csv('data/olist_products_dataset.csv')

# Attach the buying customer and the product category to every order line.
# NOTE(review): `customer_id` may identify an order rather than a person in
# this dataset -- TODO confirm whether `customer_unique_id` (from `customers`)
# is the key that was intended here.
user_item_data = (
    order_items
    .merge(orders[['order_id', 'customer_id']], on='order_id')
    .merge(products[['product_id', 'product_category_name']], on='product_id')
)

# One row per (customer, category) pair with the number of order lines.
purchase_counts = (
    user_item_data
    .groupby(['customer_id', 'product_category_name'])
    .size()
    .reset_index(name='purchase_count')
)

# Customers as rows, categories as columns; pairs never bought become 0.
user_item_matrix = purchase_counts.pivot(
    index='customer_id', columns='product_category_name', values='purchase_count'
).fillna(0)

print("User-item matrix shape:", user_item_matrix.shape)
print("\nSample of the user-item matrix:")
display(user_item_matrix.head())

# Sparse copy of the same values, used as input to the similarity computation.
sparse_user_item = csr_matrix(user_item_matrix.values)

# Transpose so that each category (not each customer) is one vector, then
# compare every pair of category vectors.
item_similarity = cosine_similarity(sparse_user_item.T)

# Label both axes of the similarity matrix with the category names.
item_similarity_df = pd.DataFrame(
    item_similarity, index=user_item_matrix.columns, columns=user_item_matrix.columns
)

print("\nItem similarity matrix shape:", item_similarity_df.shape)
print("\nSample of the item similarity matrix:")
display(item_similarity_df.iloc[:5, :5])

# Function to get top N similar items
def get_similar_categories(category_name, n=5):
    """Return the categories most similar to ``category_name``.

    Reads the module-level ``item_similarity_df`` and ranks every other
    category by its similarity score to ``category_name``, highest first.

    Args:
        category_name: Label of the category to look up.
        n: Maximum number of similar categories to return.

    Returns:
        A pandas Series of similarity scores indexed by category label, in
        descending order. An empty float Series when ``category_name`` is not
        a label of ``item_similarity_df``.
    """
    if category_name not in item_similarity_df.index:
        # Explicit dtype: a bare pd.Series() warns on pandas 1.x and produces
        # an object-dtype Series on pandas 2.x, unlike the float result below.
        return pd.Series(dtype=float)

    similar_categories = item_similarity_df[category_name].sort_values(ascending=False)
    # A category always matches itself, so it is not a useful suggestion.
    similar_categories = similar_categories.drop(category_name, errors='ignore')
    return similar_categories.head(n)

# Test the similarity lookup with the category that has the most products in
# the catalogue. The full counts Series is kept in `product_category_counts`
# (so the name holds what it says) and is reused for the heatmap below rather
# than being recomputed.
product_category_counts = products['product_category_name'].value_counts()
popular_category = product_category_counts.index[0]
print(f"\nTop 5 categories similar to '{popular_category}':")
similar_to_popular = get_similar_categories(popular_category)
display(similar_to_popular)

# Visualize the similarity between the ten largest categories.
# Only categories that are labels of the similarity matrix can be selected
# with .loc, so catalogue categories missing from it are skipped instead of
# raising a KeyError.
top_categories = [
    category
    for category in product_category_counts.index
    if category in item_similarity_df.index
][:10]

# Create a subset of the similarity matrix for visualization
top_similarity = item_similarity_df.loc[top_categories, top_categories]

# Plot the heatmap
plt.figure(figsize=(12, 10))
sns.heatmap(top_similarity, annot=True, cmap='viridis', fmt='.2f')
plt.title('Product Category Similarity Matrix')
plt.tight_layout()
plt.show()

# Function to recommend products for a customer
def recommend_for_customer(customer_id, n_recommendations=5):
    """Recommend product categories the customer has not bought yet.

    Each candidate category is scored by summing its similarity to every
    category the customer bought, weighted by the purchase count. Reads the
    module-level ``user_item_matrix`` and ``item_similarity_df``.

    Args:
        customer_id: Row label of the customer in ``user_item_matrix``.
        n_recommendations: Maximum number of categories to return.

    Returns:
        A pandas Series of scores indexed by category label, in descending
        order, excluding categories already purchased. An empty float Series
        when ``customer_id`` is not in ``user_item_matrix``.
    """
    if customer_id not in user_item_matrix.index:
        # Explicit dtype: a bare pd.Series() warns on pandas 1.x and produces
        # an object-dtype Series on pandas 2.x.
        return pd.Series(dtype=float)

    # The customer's purchase counts per category.
    customer_purchases = user_item_matrix.loc[customer_id]

    # score[i] = sum over c of similarity[i, c] * count[c]. Categories with a
    # zero count contribute nothing, so no explicit filtering is required.
    # Relies on item_similarity_df's columns carrying the same labels as
    # user_item_matrix's columns.
    recommendation_scores = item_similarity_df.dot(customer_purchases)

    # Do not recommend what the customer already bought.
    purchased_categories = customer_purchases[customer_purchases > 0].index
    recommendation_scores = recommendation_scores.drop(purchased_categories, errors='ignore')

    return recommendation_scores.sort_values(ascending=False).head(n_recommendations)

# Try the recommender on one customer drawn at random. The row position comes
# from NumPy's global random state, so the pick changes between runs unless
# that state was seeded beforehand.
sample_customer = user_item_matrix.index[
    np.random.randint(0, len(user_item_matrix))
]
print(f"\nRecommendations for customer {sample_customer}:")
customer_recommendations = recommend_for_customer(sample_customer)
display(customer_recommendations)

# For context, list what this customer actually bought (largest counts first).
print(f"\nPurchase history for customer {sample_customer}:")
customer_history = (
    user_item_matrix
    .loc[sample_customer]
    .loc[lambda counts: counts > 0]
    .sort_values(ascending=False)
)
display(customer_history)

# Evaluate the recommendation system
# We'll use a simple hold-out validation approach

# Function to evaluate recommendations
def evaluate_recommendations(test_size=0.2, random_state=None):
    """Estimate the top-5 hit rate with a leave-one-category-out check.

    For every customer who bought from at least two distinct categories, one
    purchased category is hidden at random, recommendations are generated
    from the remaining purchases, and a hit is counted when the hidden
    category appears among the top 5. Scoring mirrors
    ``recommend_for_customer``: similarities are weighted by purchase count
    and already-purchased categories are removed from the candidates.

    Reads the module-level ``user_item_matrix`` and ``item_similarity_df``.

    NOTE(review): ``item_similarity_df`` is used as-is. If it was computed
    from the full interaction matrix (including the purchases hidden here),
    the reported hit rate is optimistic.

    Args:
        test_size: Currently unused; kept so existing callers keep working.
        random_state: Optional seed for choosing the held-out category. When
            ``None`` (the default), NumPy's global random state is used.

    Returns:
        The hit rate as a percentage (float), or ``None`` when no customer
        has purchases in at least two categories.
    """
    rng = np.random if random_state is None else np.random.RandomState(random_state)

    # At least two distinct categories also implies at least two purchases,
    # so one filter is sufficient.
    distinct_categories = (user_item_matrix > 0).sum(axis=1)
    valid_customers = user_item_matrix[distinct_categories >= 2]

    if len(valid_customers) == 0:
        print("Not enough data for evaluation")
        return None

    print(f"Evaluating recommendations for {len(valid_customers)} customers")

    hits = 0
    for _, profile in valid_customers.iterrows():
        purchased = profile[profile > 0].index.tolist()

        # Hold out one purchased category at random; the rest is the
        # "known" history used to generate recommendations.
        rng.shuffle(purchased)
        held_out, training = purchased[0], purchased[1:]

        # The customer's profile as it would look without the hidden purchase.
        temp_profile = profile.copy()
        temp_profile[held_out] = 0

        # Count-weighted similarity scores, same formula as
        # recommend_for_customer; known purchases are not candidates.
        recommendation_scores = item_similarity_df.dot(temp_profile).drop(training)

        top_recommendations = (
            recommendation_scores.sort_values(ascending=False).head(5).index.tolist()
        )
        if held_out in top_recommendations:
            hits += 1

    hit_rate = hits / len(valid_customers) * 100
    print(f"Hit Rate: {hit_rate:.2f}%")

    return hit_rate

# Run the hold-out evaluation with its default arguments. `hit_rate` receives
# whatever evaluate_recommendations() returns (it returns early without a
# value when it finds no customers to evaluate).
hit_rate = evaluate_recommendations()

# Marks the end of the recommendation section in the notebook output.
print("\nRecommendation system complete.")