In [2]:
pip install apache-airflow pandas mlxtend plotly sqlalchemy

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.0 -> 25.1.1
[notice] To update, run: C:\Users\sarum\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip


In [3]:
import json
import os
import sqlite3
from datetime import datetime, timedelta
from io import StringIO

import pandas as pd
import plotly.express as px
from airflow import DAG
from airflow.operators.email import EmailOperator
from airflow.operators.python import PythonOperator
from mlxtend.frequent_patterns import apriori, association_rules
from mlxtend.preprocessing import TransactionEncoder
from sqlalchemy import create_engine

# Defaults passed to the DAG via ``default_args``: one retry after a
# five-minute wait, no e-mail on failure or retry, and no dependency on the
# previous run.
default_args = dict(
    owner='airflow',
    depends_on_past=False,
    start_date=datetime(2023, 1, 1),
    email_on_failure=False,
    email_on_retry=False,
    retries=1,
    retry_delay=timedelta(minutes=5),
)

def extract_transaction_data():
    """Return the built-in sample transactions serialised as a JSON string.

    Ten hard-coded rows (transaction id, user id, product id, category,
    amount, timestamp) are loaded into a DataFrame, the ``timestamp`` column
    is parsed into datetimes, and the frame is dumped with
    ``DataFrame.to_json()``.

    Returns:
        str: JSON produced by ``DataFrame.to_json()`` with default settings.
    """
    purchase_times = [
        '2023-11-01 09:15:23', '2023-11-01 11:30:45',
        '2023-11-02 14:20:10', '2023-11-02 16:45:30',
        '2023-11-03 10:10:00', '2023-11-03 12:30:15',
        '2023-11-04 15:45:20', '2023-11-04 18:00:00',
        '2023-11-05 10:30:00', '2023-11-05 13:45:00',
    ]
    sample = pd.DataFrame({
        'transaction_id': list(range(1001, 1011)),
        'user_id': [101, 102, 101, 103, 102, 104, 101, 103, 105, 106],
        'product_id': ['P1', 'P2', 'P3', 'P1', 'P4', 'P2', 'P4', 'P3', 'P1', 'P5'],
        'category': [
            'Electronics', 'Clothing', 'Home', 'Electronics', 'Clothing',
            'Clothing', 'Clothing', 'Home', 'Electronics', 'Books',
        ],
        'amount': [120.50, 35.99, 89.99, 120.50, 45.75, 35.99, 45.75, 89.99, 120.50, 15.99],
        'timestamp': purchase_times,
    })
    # Parse the timestamp strings before serialising.
    sample = sample.assign(timestamp=pd.to_datetime(sample['timestamp']))
    return sample.to_json()

def prepare_affinity_data(**kwargs):
    """One-hot encode the extracted transactions for affinity analysis.

    Pulls the JSON returned by the ``extract_transaction_data`` task from
    XCom, collects the ``product_id`` values of each ``transaction_id`` into a
    basket, one-hot encodes the baskets with ``TransactionEncoder``, writes the
    encoded table to ``/tmp/affinity_analysis/encoded_transactions.csv`` and
    returns it as JSON.

    Args:
        **kwargs: Airflow task context; only ``kwargs['ti']`` is used, to
            pull the upstream XCom value.

    Returns:
        str: The encoded basket table serialised with ``DataFrame.to_json()``.
    """
    ti = kwargs['ti']
    data = ti.xcom_pull(task_ids='extract_transaction_data')
    # Passing a literal JSON string to read_json is deprecated in recent
    # pandas releases; a file-like wrapper works on old and new versions.
    df = pd.read_json(StringIO(data))

    # Group products by transaction.
    # NOTE(review): in the bundled sample data every transaction_id is unique,
    # so each basket holds a single product — confirm the grouping key
    # against the real data source.
    transactions = df.groupby('transaction_id')['product_id'].apply(list).values.tolist()

    # One-hot encode transactions
    te = TransactionEncoder()
    te_ary = te.fit(transactions).transform(transactions)
    basket_df = pd.DataFrame(te_ary, columns=te.columns_)

    # Save prepared data for visualization
    os.makedirs('/tmp/affinity_analysis', exist_ok=True)
    basket_df.to_csv('/tmp/affinity_analysis/encoded_transactions.csv', index=False)

    return basket_df.to_json()

def calculate_association_rules(**kwargs):
    """Mine association rules from the encoded baskets and persist them.

    Pulls the one-hot basket table from the ``prepare_affinity_data`` task,
    finds frequent itemsets with ``apriori`` (``min_support=0.2``), derives
    rules with ``association_rules`` using a confidence threshold of 0.7, and
    adds three derived columns:

    * ``combo_frequency``: ``support`` multiplied by the number of baskets
    * ``expected_confidence``: copy of ``consequent support``
    * ``lift_ratio``: ``confidence / expected_confidence``

    The rules are written to the ``advanced_product_affinity`` table of
    ``/tmp/ecommerce_analytics.db`` (replacing any existing table) and to
    ``/tmp/affinity_analysis/association_rules.csv``.

    Args:
        **kwargs: Airflow task context; only ``kwargs['ti']`` is used, to
            pull the upstream XCom value.

    Returns:
        str: The rules table serialised with ``DataFrame.to_json()``.
    """
    ti = kwargs['ti']
    basket_json = ti.xcom_pull(task_ids='prepare_affinity_data')
    # Passing a literal JSON string to read_json is deprecated in recent
    # pandas releases; a file-like wrapper works on old and new versions.
    basket_df = pd.read_json(StringIO(basket_json))

    # Find frequent itemsets
    frequent_itemsets = apriori(basket_df, min_support=0.2, use_colnames=True)

    # Generate comprehensive association rules
    rules = association_rules(
        frequent_itemsets,
        metric="confidence",
        min_threshold=0.7,
        support_only=False
    )

    # Add additional metrics
    rules['combo_frequency'] = rules['support'] * len(basket_df)
    rules['expected_confidence'] = rules['consequent support']
    rules['lift_ratio'] = rules['confidence'] / rules['expected_confidence']

    def _as_text(itemset):
        # Sorted so the stored text does not depend on set iteration order.
        return ', '.join(sorted(map(str, itemset)))

    # The antecedents/consequents cells are set-like objects, which sqlite3
    # cannot bind as parameters; store them as comma-separated text in the
    # database copy only, leaving the returned frame untouched.
    rules_for_sql = rules.assign(
        antecedents=rules['antecedents'].map(_as_text),
        consequents=rules['consequents'].map(_as_text),
    )

    # Save to SQLite; always release the connection, even if the write fails.
    conn = sqlite3.connect('/tmp/ecommerce_analytics.db')
    try:
        rules_for_sql.to_sql('advanced_product_affinity', conn, if_exists='replace', index=False)
    finally:
        conn.close()

    # Save for visualization. The directory is created here as well so this
    # task does not depend on an earlier task having made it.
    os.makedirs('/tmp/affinity_analysis', exist_ok=True)
    rules.to_csv('/tmp/affinity_analysis/association_rules.csv', index=False)
    return rules.to_json()

def generate_visualizations(**kwargs):
    """Create interactive visualizations of product affinities.

    Pulls the rules JSON from the ``calculate_association_rules`` task,
    renders a support-vs-confidence scatter plot and a top-10-by-lift bar
    chart as HTML files under ``/tmp/affinity_analysis``, and then calls
    ``create_affinity_network`` to draw the network graph.

    Args:
        **kwargs: Airflow task context; only ``kwargs['ti']`` is used, to
            pull the upstream XCom value.

    Returns:
        str: The output directory, ``'/tmp/affinity_analysis'``.
    """
    output_dir = '/tmp/affinity_analysis'

    ti = kwargs['ti']
    rules_json = ti.xcom_pull(task_ids='calculate_association_rules')
    # Passing a literal JSON string to read_json is deprecated in recent
    # pandas releases; a file-like wrapper works on old and new versions.
    rules = pd.read_json(StringIO(rules_json))

    # Build display labels on a separate frame. `rules` itself is left
    # untouched because create_affinity_network joins the itemsets again and
    # would split already-joined strings into single characters.
    antecedent_labels = [', '.join(map(str, items)) for items in rules['antecedents']]
    consequent_labels = [', '.join(map(str, items)) for items in rules['consequents']]
    plot_df = rules.assign(
        antecedents=antecedent_labels,
        consequents=consequent_labels,
        # Built element-wise so an empty rule set does not depend on
        # Series + str arithmetic.
        rule=[f"{lhs} → {rhs}" for lhs, rhs in zip(antecedent_labels, consequent_labels)],
    )

    # Create visualizations
    fig1 = px.scatter(
        plot_df,
        x='support',
        y='confidence',
        size='lift',
        color='lift_ratio',
        hover_data=['rule', 'combo_frequency'],
        title='Product Affinity Rules - Support vs Confidence'
    )

    fig2 = px.bar(
        plot_df.sort_values('lift', ascending=False).head(10),
        x='rule',
        y='lift',
        color='confidence',
        title='Top 10 Product Affinities by Lift Score'
    )

    # Save visualizations. The directory is created here as well so this task
    # does not depend on an earlier task having made it.
    os.makedirs(output_dir, exist_ok=True)
    fig1.write_html('/tmp/affinity_analysis/support_vs_confidence.html')
    fig2.write_html('/tmp/affinity_analysis/top_affinities.html')

    # Create network graph from the rules with their original itemset values.
    create_affinity_network(rules)

    return output_dir

def create_affinity_network(rules):
    """Generate network graph of product relationships.

    Adds one directed edge per rule, from the antecedent label to the
    consequent label, with the rule's ``lift`` stored as the edge weight, and
    saves the drawing to ``/tmp/affinity_analysis/affinity_network.png``.

    Args:
        rules: DataFrame with ``antecedents``, ``consequents`` and ``lift``
            columns. The itemset cells may be collections of items or strings
            that have already been joined for display.
    """
    import networkx as nx
    import matplotlib.pyplot as plt

    def _label(itemset):
        # A string is already a finished label; iterating over it would
        # split it into single characters.
        if isinstance(itemset, str):
            return itemset
        return ', '.join(map(str, itemset))

    G = nx.DiGraph()

    # Add nodes and edges
    for _, row in rules.iterrows():
        G.add_edge(_label(row['antecedents']), _label(row['consequents']), weight=row['lift'])

    # Make sure the output directory exists even when this function is called
    # on its own.
    os.makedirs('/tmp/affinity_analysis', exist_ok=True)

    fig = plt.figure(figsize=(12, 8))
    try:
        # Draw graph
        pos = nx.spring_layout(G, k=0.5)
        nx.draw_networkx_nodes(G, pos, node_size=2000, alpha=0.8)
        nx.draw_networkx_edges(G, pos, width=1.0, alpha=0.5,
                               edge_color='gray',
                               arrowsize=20)
        nx.draw_networkx_labels(G, pos, font_size=10, font_weight='bold')

        plt.title('Product Affinity Network')
        plt.savefig('/tmp/affinity_analysis/affinity_network.png')
    finally:
        # Release the figure even if layout, drawing or saving fails.
        plt.close(fig)

# Weekly pipeline: extract -> prepare -> mine rules -> visualise -> e-mail.
# NOTE(review): newer Airflow releases replace `schedule_interval` with
# `schedule` — confirm against the Airflow version this DAG is deployed on.
with DAG(
    dag_id='advanced_product_affinity_analysis',
    description='Advanced analysis of frequently bought together products',
    default_args=default_args,
    schedule_interval='@weekly',
    catchup=False,
) as dag:

    # The task ids below are the ones the callables pass to `xcom_pull`.
    extract_data = PythonOperator(
        python_callable=extract_transaction_data,
        task_id='extract_transaction_data',
    )
    prepare_data = PythonOperator(
        python_callable=prepare_affinity_data,
        task_id='prepare_affinity_data',
    )
    calculate_rules = PythonOperator(
        python_callable=calculate_association_rules,
        task_id='calculate_association_rules',
    )
    generate_viz = PythonOperator(
        python_callable=generate_visualizations,
        task_id='generate_visualizations',
    )

    # Sends the files written by generate_visualizations as attachments.
    email_results = EmailOperator(
        task_id='email_results',
        to='analytics-team@company.com',
        subject='Weekly Product Affinity Analysis',
        files=[
            '/tmp/affinity_analysis/support_vs_confidence.html',
            '/tmp/affinity_analysis/top_affinities.html',
            '/tmp/affinity_analysis/affinity_network.png',
        ],
        html_content="""<h1>Product Affinity Analysis Report</h1>
                      <p>Attached are this week's product affinity insights.</p>""",
    )

    # Strictly linear dependency chain, wired one link at a time.
    extract_data >> prepare_data
    prepare_data >> calculate_rules
    calculate_rules >> generate_viz
    generate_viz >> email_results

OSError while attempting to symlink the latest log directory
