# Market Basket Analysis - Association Rule Lift

## Definition
Lift measures how much more likely a customer is to buy **Item B** given that they bought **Item A**, compared with how likely customers are to buy B overall (its baseline purchase rate).

## Formula
$$\text{Lift} = \frac{P(A \cap B)}{P(A) \times P(B)}$$

Where:
- P(A ∩ B) = Probability of buying both A and B together
- P(A) = Probability of buying A
- P(B) = Probability of buying B

## Interpretation
- **Lift = 1.0**: Items are independent (no relationship)
- **Lift > 1.0**: Items are positively correlated (bought together more than random)
- **Lift < 1.0**: Items are negatively correlated (bought together less than random)

## Use Cases
- Product placement in stores
- Cross-selling recommendations
- Bundle creation
- Promotional strategies

---

## Setup and Imports

In [None]:
# Install required packages (uncomment if needed)
# !pip install pandas numpy matplotlib seaborn mlxtend

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from mlxtend.frequent_patterns import apriori, association_rules
from mlxtend.preprocessing import TransactionEncoder
import warnings
warnings.filterwarnings('ignore')

# Apply a global plotting theme used by every figure created later on.
# NOTE(review): the 'seaborn-v0_8-*' style names presumably need a recent
# matplotlib release — confirm against the installed version.
plt.style.use('seaborn-v0_8-darkgrid')
# Make seaborn's "husl" palette the default color cycle.
sns.set_palette("husl")

# Reaching this line means none of the statements above raised.
print("✓ All packages imported successfully!")

## Load Data

**For real data**: Download from https://www.kaggle.com/datasets/heeraldedhia/groceries-dataset

Expected format: List of transactions where each transaction is a list of items

In [None]:
def load_groceries_data(csv_path=None):
    """
    Load groceries transaction data.

    Parameters:
    -----------
    csv_path : str or path-like, optional
        Path to a CSV file with one purchased item per row and the columns
        'Member_number', 'Date' and 'itemDescription'.
        NOTE(review): this is presumably the layout of the Kaggle groceries
        dataset — verify the column names against your file.
        When omitted, a small built-in sample is returned instead.

    Returns:
    --------
    list of lists
        One inner list of item names per transaction.
    """
    if csv_path is not None:
        df = pd.read_csv(csv_path)
        # A basket is one member's purchases on one date. Grouping by
        # 'Member_number' alone would merge all of a member's shopping trips
        # into a single oversized transaction and inflate co-occurrence counts.
        baskets = df.groupby(['Member_number', 'Date'])['itemDescription'].apply(list)
        return baskets.tolist()

    # Sample data for demonstration
    transactions = [
        ['milk', 'bread', 'butter'],
        ['beer', 'diapers', 'bread'],
        ['milk', 'bread', 'butter', 'cheese'],
        ['beer', 'diapers'],
        ['milk', 'bread', 'butter', 'eggs'],
        ['beer', 'diapers', 'chips'],
        ['milk', 'cheese'],
        ['bread', 'butter', 'eggs'],
        ['beer', 'diapers', 'bread', 'chips'],
        ['milk', 'bread', 'cheese', 'eggs'],
        ['coffee', 'sugar', 'milk'],
        ['wine', 'cheese', 'crackers'],
        ['beer', 'chips', 'salsa'],
        ['pasta', 'tomato sauce', 'cheese'],
        ['chicken', 'rice', 'vegetables'],
        ['milk', 'bread', 'eggs'],
        ['beer', 'diapers', 'wipes'],
        ['wine', 'cheese'],
        ['coffee', 'creamer', 'sugar'],
        ['bread', 'butter', 'jam'],
    ]

    return transactions

# Fetch the transactions and preview the first five baskets
transactions = load_groceries_data()
print(f"Loaded {len(transactions)} transactions")
print("\nSample transactions:")
for position, basket in enumerate(transactions[:5], start=1):
    print(f"  {position}. {basket}")

## Perform Market Basket Analysis

In [None]:
def perform_market_basket_analysis(transactions, min_support=0.15, min_threshold=1.0):
    """
    Perform market basket analysis and calculate lift.

    Prints a progress report (item frequencies, frequent itemsets and the
    strongest rules) while it runs.

    Parameters:
    -----------
    transactions : list of lists
        Each inner list contains items in a transaction
    min_support : float (0-1)
        Minimum support threshold for frequent itemsets
        Lower values find more patterns but may be less meaningful
    min_threshold : float
        Minimum lift threshold for association rules
        Rules whose lift is at or above this value are kept, so the default
        of 1.0 also keeps rules sitting exactly at the independence baseline

    Returns:
    --------
    tuple (rules, frequent_itemsets)
        rules : DataFrame of association rules sorted by lift (descending),
            or None when no rule could be generated
        frequent_itemsets : DataFrame of frequent itemsets, or None when there
            were no transactions or no itemset reached min_support
    """
    # Nothing can be mined from an empty input; stop before encoding so the
    # caller gets the same "no result" signal as for the other empty outcomes.
    if len(transactions) == 0:
        print("\n⚠ No transactions provided. Nothing to analyze.")
        return None, None

    # Transform to one-hot encoded DataFrame
    te = TransactionEncoder()
    te_ary = te.fit(transactions).transform(transactions)
    df = pd.DataFrame(te_ary, columns=te.columns_)

    print("=" * 80)
    print("MARKET BASKET ANALYSIS RESULTS")
    print("=" * 80)
    print(f"\nTotal Transactions: {len(transactions)}")
    print(f"Unique Items: {len(df.columns)}")

    # Item frequencies: column sums of the one-hot table count the
    # transactions containing each item
    print(f"\nTop 10 Most Frequent Items:")
    item_freq = df.sum().sort_values(ascending=False)
    print(item_freq.head(10).to_string())

    # Generate frequent itemsets
    print(f"\n\nFinding frequent itemsets (min_support={min_support})...")
    frequent_itemsets = apriori(df, min_support=min_support, use_colnames=True)
    print(f"Found {len(frequent_itemsets)} frequent itemsets")

    if len(frequent_itemsets) == 0:
        print("\n⚠ No frequent itemsets found. Try lowering min_support.")
        return None, None

    print("\nTop 15 Frequent Itemsets:")
    print(frequent_itemsets.sort_values('support', ascending=False).head(15).to_string())

    # Generate association rules
    # NOTE(review): certain mlxtend releases reportedly require an extra
    # `num_itemsets` argument here — confirm against the installed version.
    print(f"\n\nGenerating association rules (min_lift={min_threshold})...")
    rules = association_rules(frequent_itemsets, metric="lift", min_threshold=min_threshold)
    rules = rules.sort_values('lift', ascending=False)

    print(f"Found {len(rules)} association rules")

    if len(rules) == 0:
        print("\n⚠ No rules found. Try lowering min_threshold or min_support.")
        return None, frequent_itemsets

    print("\nTop 20 Association Rules by Lift:")
    display_cols = ['antecedents', 'consequents', 'support', 'confidence', 'lift']
    print(rules[display_cols].head(20).to_string(index=False))

    return rules, frequent_itemsets

# Perform analysis; `rules` is reused by the cells that follow and is None
# when no association rule could be generated.
rules, frequent_itemsets = perform_market_basket_analysis(
    transactions, 
    min_support=0.15,  # Adjust based on your data size
    min_threshold=1.0   # Lift cutoff; 1.0 is the independence baseline
)

## Visualize Results

In [None]:
def create_visualizations(rules, output_path='market_basket_lift_analysis.png'):
    """
    Create a 2x2 dashboard of market basket analysis charts.

    Panels: lift histogram, support vs. confidence (colored by lift), the
    highest-lift rules as a bar chart, and confidence vs. lift. The figure is
    written to `output_path` and then shown.

    Parameters:
    -----------
    rules : DataFrame or None
        Association rules with 'antecedents', 'consequents', 'support',
        'confidence' and 'lift' columns. None or an empty table prints a
        notice and returns without plotting.
    output_path : str
        File the figure is saved to.
    """

    if rules is None or len(rules) == 0:
        print("No rules to visualize")
        return

    fig, axes = plt.subplots(2, 2, figsize=(16, 12))

    # 1. Lift Distribution
    axes[0, 0].hist(rules['lift'], bins=30, edgecolor='black', alpha=0.7, color='steelblue')
    axes[0, 0].axvline(x=1, color='red', linestyle='--', linewidth=2,
                       label='Lift = 1 (Independence)', alpha=0.8)
    axes[0, 0].set_xlabel('Lift', fontsize=12, fontweight='bold')
    axes[0, 0].set_ylabel('Frequency', fontsize=12, fontweight='bold')
    axes[0, 0].set_title('Distribution of Lift Values', fontsize=14, fontweight='bold')
    axes[0, 0].legend(fontsize=10)
    axes[0, 0].grid(True, alpha=0.3)

    # 2. Support vs Confidence (colored by Lift)
    scatter = axes[0, 1].scatter(rules['support'], rules['confidence'],
                                 c=rules['lift'], s=100, alpha=0.6,
                                 cmap='viridis', edgecolors='black', linewidths=1)
    axes[0, 1].set_xlabel('Support', fontsize=12, fontweight='bold')
    axes[0, 1].set_ylabel('Confidence', fontsize=12, fontweight='bold')
    axes[0, 1].set_title('Support vs Confidence (colored by Lift)',
                         fontsize=14, fontweight='bold')
    cbar = plt.colorbar(scatter, ax=axes[0, 1])
    cbar.set_label('Lift', fontsize=11, fontweight='bold')
    axes[0, 1].grid(True, alpha=0.3)

    # 3. Top Rules by Lift
    # nlargest returns every row when fewer than 10 rules exist.
    top_rules = rules.nlargest(10, 'lift')
    # Label each bar with the complete itemsets. Taking only the first element
    # would truncate multi-item rules and, because set order is arbitrary,
    # could show a different item on each run. Sorting keeps labels stable.
    rule_labels = [
        ', '.join(sorted(map(str, ant))) + ' → ' + ', '.join(sorted(map(str, cons)))
        for ant, cons in zip(top_rules['antecedents'], top_rules['consequents'])
    ]

    y_pos = np.arange(len(top_rules))
    axes[1, 0].barh(y_pos, top_rules['lift'], color='coral', edgecolor='black', alpha=0.7)
    axes[1, 0].set_yticks(y_pos)
    axes[1, 0].set_yticklabels(rule_labels, fontsize=10)
    axes[1, 0].set_xlabel('Lift', fontsize=12, fontweight='bold')
    # Report the number of bars actually drawn rather than a fixed "10".
    axes[1, 0].set_title(f'Top {len(top_rules)} Association Rules by Lift',
                         fontsize=14, fontweight='bold')
    axes[1, 0].axvline(x=1, color='red', linestyle='--', linewidth=2, alpha=0.7)
    axes[1, 0].grid(True, alpha=0.3, axis='x')
    # Strongest rule at the top of the chart.
    axes[1, 0].invert_yaxis()

    # Add value labels just right of each bar
    for bar_index, lift_value in enumerate(top_rules['lift']):
        axes[1, 0].text(lift_value + 0.05, bar_index, f"{lift_value:.2f}",
                        va='center', fontsize=9, fontweight='bold')

    # 4. Lift vs Confidence
    axes[1, 1].scatter(rules['confidence'], rules['lift'], s=100, alpha=0.6,
                       c='darkgreen', edgecolors='black', linewidths=1)
    axes[1, 1].axhline(y=1, color='red', linestyle='--', linewidth=2,
                       label='Lift = 1 (No relationship)', alpha=0.7)
    axes[1, 1].set_xlabel('Confidence', fontsize=12, fontweight='bold')
    axes[1, 1].set_ylabel('Lift', fontsize=12, fontweight='bold')
    axes[1, 1].set_title('Confidence vs Lift', fontsize=14, fontweight='bold')
    axes[1, 1].legend(fontsize=10)
    axes[1, 1].grid(True, alpha=0.3)

    plt.tight_layout()
    fig.savefig(output_path, dpi=300, bbox_inches='tight')
    plt.show()

    print(f"\n✓ Visualizations saved as '{output_path}'")

# Create visualizations
# Skipped when the analysis produced None instead of a rules table.
if rules is not None:
    create_visualizations(rules)

## Interpretation and Business Insights

In [None]:
def print_business_insights(rules):
    """
    Print detailed interpretation and business recommendations.

    Explains how to read lift, walks through the highest-lift rule in detail,
    prints recommendations based on it, and lists the five highest-lift rules.

    Parameters:
    -----------
    rules : DataFrame or None
        Association rules with 'antecedents', 'consequents', 'support',
        'confidence' and 'lift' columns. None or an empty table prints a
        notice and returns.
    """

    if rules is None or len(rules) == 0:
        print("No rules to interpret")
        return

    def describe(itemset):
        # Show every item of the set, in sorted order. Using only the first
        # element would misreport multi-item rules, and set iteration order
        # is arbitrary.
        return ', '.join(sorted(map(str, itemset)))

    # Rank by lift here rather than trusting the caller's row order. The stable
    # sort leaves an already-sorted table exactly as it was, ties included.
    ranked = rules.sort_values('lift', ascending=False, kind='stable')

    print("\n" + "=" * 80)
    print("UNDERSTANDING LIFT IN MARKET BASKET ANALYSIS")
    print("=" * 80)

    print("""
Lift measures the strength of association between items:

• Lift = 1.0: Items are independent (no relationship)
  - Buying Item A doesn't affect likelihood of buying Item B

• Lift > 1.0: Positive correlation (items bought together)
  - Lift = 2.0 means customers are 2x more likely to buy B when they buy A
  - Lift = 3.5 means customers are 3.5x more likely

• Lift < 1.0: Negative correlation (items bought apart)
  - Customers who buy A are less likely to buy B
    """)

    print("\n" + "=" * 80)
    print("DETAILED EXAMPLE - TOP RULE")
    print("=" * 80)

    top_rule = ranked.iloc[0]
    ant = describe(top_rule['antecedents'])
    cons = describe(top_rule['consequents'])
    lift = top_rule['lift']
    conf = top_rule['confidence']
    supp = top_rule['support']

    print(f"\nRule: {ant} → {cons}")
    print(f"\nMetrics:")
    print(f"  • Lift:       {lift:.2f}")
    print(f"  • Confidence: {conf:.2%}")
    print(f"  • Support:    {supp:.2%}")

    print(f"\nInterpretation:")
    print(f"  Customers who buy '{ant}' are {lift:.2f}x more likely to buy '{cons}'")
    print(f"  compared to customers in general.")
    print(f"\n  Specifically:")
    print(f"  • {conf:.1%} of customers who buy '{ant}' also buy '{cons}'")
    print(f"  • This pattern appears in {supp:.1%} of all transactions")

    print("\n" + "=" * 80)
    print("BUSINESS RECOMMENDATIONS")
    print("=" * 80)

    print(f"""
Based on the top rule ({ant} → {cons}):

1. PRODUCT PLACEMENT
   • Place '{cons}' near '{ant}' in store layout
   • Ensure both items are easily accessible together

2. CROSS-SELLING STRATEGY
   • When customer adds '{ant}' to cart, recommend '{cons}'
   • Train staff to suggest '{cons}' when '{ant}' is purchased

3. BUNDLE OFFERS
   • Create bundle: '{ant}' + '{cons}' at discounted price
   • Expected lift: {lift:.2f}x higher purchase rate

4. PROMOTIONAL STRATEGY
   • Discount '{ant}' to drive traffic → increase '{cons}' sales
   • Run "Buy '{ant}', get discount on '{cons}'" promotion

5. INVENTORY MANAGEMENT
   • Maintain proportional stock levels
   • If '{ant}' sells well, ensure '{cons}' is in stock
    """)

    # Additional high-lift rules
    print("\n" + "=" * 80)
    print("OTHER HIGH-LIFT OPPORTUNITIES")
    print("=" * 80)

    top_5 = ranked.head(5)
    columns = zip(top_5['antecedents'], top_5['consequents'],
                  top_5['lift'], top_5['confidence'], top_5['support'])
    for i, (lhs, rhs, rule_lift, rule_conf, rule_supp) in enumerate(columns, 1):
        print(f"\n{i}. {describe(lhs)} → {describe(rhs)}")
        print(f"   Lift: {rule_lift:.2f}x | Confidence: {rule_conf:.1%} | Support: {rule_supp:.1%}")

# Print insights
# Skipped when the analysis produced None instead of a rules table.
if rules is not None:
    print_business_insights(rules)

## Export Rules to CSV

In [None]:
# Export the rules table to CSV with itemsets rendered as readable text.
if rules is not None and len(rules) > 0:
    # Select relevant columns; copy so the in-memory `rules` keeps its
    # set-valued columns for the later cells.
    export_cols = ['antecedents', 'consequents', 'support', 'confidence', 'lift']
    export_df = rules[export_cols].copy()

    # Render each itemset as a comma-separated string. Items are sorted because
    # set iteration order is arbitrary; joining it directly can order the same
    # itemset differently from one run to the next.
    export_df['antecedents'] = export_df['antecedents'].apply(
        lambda items: ', '.join(sorted(map(str, items)))
    )
    export_df['consequents'] = export_df['consequents'].apply(
        lambda items: ', '.join(sorted(map(str, items)))
    )

    # Save to CSV
    export_df.to_csv('market_basket_rules.csv', index=False)
    print("\n✓ Rules exported to 'market_basket_rules.csv'")
    print(f"  Total rules exported: {len(export_df)}")
else:
    print("\nNo rules to export")

## Advanced: Filter Rules by Criteria

In [None]:
# Slice the rules table three ways: high impact, frequent & strong, and rules
# that mention one chosen product.
if rules is not None and len(rules) > 0:
    print("\n" + "=" * 80)
    print("FILTERING RULES BY CRITERIA")
    print("=" * 80)

    # Shared by all three sections below, so it must be defined before any of
    # them rather than inside one section's conditional branch (which would
    # raise NameError whenever that branch is skipped).
    display_cols = ['antecedents', 'consequents', 'lift', 'confidence', 'support']

    # High-impact rules: High lift AND high confidence
    high_impact = rules[(rules['lift'] > 2.0) & (rules['confidence'] > 0.5)]
    print(f"\nHigh-Impact Rules (Lift > 2.0 AND Confidence > 50%):")
    print(f"Found {len(high_impact)} rules")
    if len(high_impact) > 0:
        print(high_impact[display_cols].head(10).to_string(index=False))

    # Frequent AND strong rules
    frequent_strong = rules[(rules['support'] > 0.1) & (rules['lift'] > 1.5)]
    print(f"\n\nFrequent & Strong Rules (Support > 10% AND Lift > 1.5):")
    print(f"Found {len(frequent_strong)} rules")
    if len(frequent_strong) > 0:
        print(frequent_strong[display_cols].head(10).to_string(index=False))

    # Rules for specific product
    target_product = 'milk'  # Change this to any product in your data
    # Item names are lower-cased for the comparison, so the target must be
    # lower-cased too; otherwise a value such as 'Milk' could never match.
    target_key = target_product.lower()
    mentions_target = pd.Series(
        [
            any(str(item).lower() == target_key for item in (*ant, *cons))
            for ant, cons in zip(rules['antecedents'], rules['consequents'])
        ],
        index=rules.index,
        dtype=bool,
    )
    product_rules = rules[mentions_target]
    print(f"\n\nRules involving '{target_product}':")
    print(f"Found {len(product_rules)} rules")
    if len(product_rules) > 0:
        print(product_rules[display_cols].head(10).to_string(index=False))
else:
    print("\nNo rules available for filtering")

## Summary Statistics

In [None]:
# Print descriptive statistics for lift, confidence and support, plus a count
# of rules per lift-strength category.
if rules is not None and len(rules) > 0:
    print("\n" + "=" * 80)
    print("SUMMARY STATISTICS")
    print("=" * 80)

    lift_values = rules['lift']
    confidence_values = rules['confidence']
    support_values = rules['support']

    print(f"\nTotal Rules Found: {len(rules)}")
    print(f"\nLift Statistics:")
    print(f"  Mean:   {lift_values.mean():.2f}")
    print(f"  Median: {lift_values.median():.2f}")
    print(f"  Min:    {lift_values.min():.2f}")
    print(f"  Max:    {lift_values.max():.2f}")
    print(f"  Std:    {lift_values.std():.2f}")

    print(f"\nConfidence Statistics:")
    print(f"  Mean:   {confidence_values.mean():.2%}")
    print(f"  Median: {confidence_values.median():.2%}")
    print(f"  Min:    {confidence_values.min():.2%}")
    print(f"  Max:    {confidence_values.max():.2%}")

    print(f"\nSupport Statistics:")
    print(f"  Mean:   {support_values.mean():.2%}")
    print(f"  Median: {support_values.median():.2%}")
    print(f"  Min:    {support_values.min():.2%}")
    print(f"  Max:    {support_values.max():.2%}")

    # The categories partition the whole lift range so the counts always add
    # up to the total. The last bucket catches rules at or below the
    # independence baseline, which would otherwise be counted nowhere.
    very_strong = int((lift_values > 3.0).sum())
    strong = int(((lift_values > 2.0) & (lift_values <= 3.0)).sum())
    moderate = int(((lift_values > 1.5) & (lift_values <= 2.0)).sum())
    weak = int(((lift_values > 1.0) & (lift_values <= 1.5)).sum())
    not_positive = int((lift_values <= 1.0).sum())

    print(f"\nRules by Lift Category:")
    print(f"  Very Strong (Lift > 3.0):     {very_strong} rules")
    print(f"  Strong (2.0 < Lift ≤ 3.0):    {strong} rules")
    print(f"  Moderate (1.5 < Lift ≤ 2.0):  {moderate} rules")
    print(f"  Weak (1.0 < Lift ≤ 1.5):      {weak} rules")
    print(f"  Not positive (Lift ≤ 1.0):    {not_positive} rules")
else:
    print("\nNo rules available for summary statistics")