In [1]:
# Use the explicit %pip magic so the packages are installed into the
# environment of the running kernel rather than relying on automagic.
%pip install apache-airflow-providers-postgres pandas scikit-learn mlxtend

Collecting apache-airflow-providers-postgres
  Downloading apache_airflow_providers_postgres-6.2.1-py3-none-any.whl.metadata (5.6 kB)
Collecting asyncpg>=0.30.0 (from apache-airflow-providers-postgres)
  Downloading asyncpg-0.30.0-cp311-cp311-win_amd64.whl.metadata (5.2 kB)
Downloading apache_airflow_providers_postgres-6.2.1-py3-none-any.whl (18 kB)
Downloading asyncpg-0.30.0-cp311-cp311-win_amd64.whl (629 kB)
   ---------------------------------------- 0.0/629.4 kB ? eta -:--:--
   -- ------------------------------------ 41.0/629.4 kB 991.0 kB/s eta 0:00:01
   ----------- ---------------------------- 184.3/629.4 kB 1.9 MB/s eta 0:00:01
   ------------- -------------------------- 215.0/629.4 kB 1.9 MB/s eta 0:00:01
   -------------- ------------------------- 235.5/629.4 kB 1.2 MB/s eta 0:00:01
   -------------------- ------------------- 317.4/629.4 kB 1.4 MB/s eta 0:00:01
   ------------------------ --------------- 378.9/629.4 kB 1.4 MB/s eta 0:00:01
   ------------------------ -------


[notice] A new release of pip is available: 24.0 -> 25.1.1
[notice] To update, run: C:\Users\sarum\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip


In [5]:
from datetime import datetime, timedelta
from io import StringIO

import numpy as np
import pandas as pd
from airflow import DAG
from airflow.operators.python import PythonOperator
from airflow.providers.postgres.hooks.postgres import PostgresHook
from mlxtend.frequent_patterns import apriori, association_rules
from sklearn.cluster import KMeans

# Arguments handed to the DAG as `default_args`: ownership, the start date,
# e-mail alerts switched off, and a single retry after a five-minute pause.
default_args = dict(
    owner='airflow',
    depends_on_past=False,
    start_date=datetime(2023, 1, 1),
    email_on_failure=False,
    email_on_retry=False,
    retries=1,
    retry_delay=timedelta(minutes=5),
)

def extract_transaction_data():
    """Return a fixed set of sample transactions serialised as JSON.

    Stands in for a real extraction step: eight hard-coded transactions are
    loaded into a DataFrame, the ``timestamp`` column is parsed to datetimes,
    and the frame is returned via ``DataFrame.to_json()``.

    Returns:
        str: JSON representation of the transactions DataFrame.
    """
    # In production, this would connect to your actual data source
    column_names = ['transaction_id', 'user_id', 'product_id', 'amount', 'timestamp']
    sample_rows = [
        (1001, 101, 'P1', 120.50, '2023-11-01 09:15:23'),
        (1002, 102, 'P2', 35.99, '2023-11-01 11:30:45'),
        (1003, 101, 'P3', 89.99, '2023-11-02 14:20:10'),
        (1004, 103, 'P1', 120.50, '2023-11-02 16:45:30'),
        (1005, 102, 'P4', 45.75, '2023-11-03 10:10:00'),
        (1006, 104, 'P2', 35.99, '2023-11-03 12:30:15'),
        (1007, 101, 'P4', 45.75, '2023-11-04 15:45:20'),
        (1008, 103, 'P3', 89.99, '2023-11-04 18:00:00'),
    ]
    transactions = pd.DataFrame(sample_rows, columns=column_names)
    transactions = transactions.assign(
        timestamp=pd.to_datetime(transactions['timestamp'])
    )
    return transactions.to_json()

def calculate_clv(**kwargs):
    """Calculate Customer Lifetime Value metrics and spend-based segments.

    Pulls the JSON-serialised transactions published to XCom by the
    ``extract_transaction_data`` task, aggregates them per ``user_id``
    (total spend, purchase count, average purchase value), clusters the
    customers with K-Means, and replaces the ``customer_clv`` table with the
    result.

    Args:
        **kwargs: Airflow task context; ``kwargs['ti']`` must be the task
            instance, which is used for ``xcom_pull``.

    Raises:
        ValueError: If the upstream task has not published any data to XCom.
    """
    ti = kwargs['ti']
    data = ti.xcom_pull(task_ids='extract_transaction_data')
    if data is None:
        raise ValueError(
            "No XCom data found for task 'extract_transaction_data'"
        )
    # Wrap the payload in a buffer: passing a literal JSON string to
    # read_json is deprecated since pandas 2.1.
    df = pd.read_json(StringIO(data))

    # Calculate CLV (simplified version)
    clv = df.groupby('user_id').agg(
        total_spend=('amount', 'sum'),
        purchase_count=('transaction_id', 'count'),
        avg_purchase_value=('amount', 'mean')
    ).reset_index()

    # Add customer segmentation
    features = clv[['total_spend', 'purchase_count']]
    segment_names = {1: ['Low'], 2: ['Low', 'High'], 3: ['Low', 'Medium', 'High']}
    # K-Means cannot form more clusters than there are distinct points.
    n_clusters = min(3, len(features.drop_duplicates()))
    if n_clusters:
        # Fixed seed and explicit n_init keep the segmentation reproducible.
        kmeans = KMeans(n_clusters=n_clusters, n_init=10, random_state=42)
        cluster_ids = kmeans.fit_predict(features)
        # Cluster ids are arbitrary, so rank the clusters by the total_spend
        # of their centroid before attaching Low/Medium/High names.
        spend_order = np.argsort(kmeans.cluster_centers_[:, 0])
        id_to_name = {
            int(cluster_id): segment_names[n_clusters][rank]
            for rank, cluster_id in enumerate(spend_order)
        }
        clv['segment'] = pd.Series(cluster_ids, index=clv.index).map(id_to_name)
    else:
        clv['segment'] = pd.Series(dtype='object')

    # Save to database. pandas' to_sql needs a SQLAlchemy connectable; a raw
    # DBAPI connection from hook.get_conn() is only supported for sqlite3.
    hook = PostgresHook(postgres_conn_id='postgres_conn')
    engine = hook.get_sqlalchemy_engine()
    try:
        clv.to_sql('customer_clv', engine, if_exists='replace', index=False)
    finally:
        # Release pooled connections even when the write fails.
        engine.dispose()

def analyze_time_trends(**kwargs):
    """Analyze hourly/daily purchase patterns.

    Pulls the JSON-serialised transactions published to XCom by the
    ``extract_transaction_data`` task, derives the hour and weekday name of
    each transaction, aggregates transaction count and total sales per hour
    and per weekday, and replaces the ``hourly_trends`` and ``daily_trends``
    tables with the results.

    Args:
        **kwargs: Airflow task context; ``kwargs['ti']`` must be the task
            instance, which is used for ``xcom_pull``.

    Raises:
        ValueError: If the upstream task has not published any data to XCom.
    """
    ti = kwargs['ti']
    data = ti.xcom_pull(task_ids='extract_transaction_data')
    if data is None:
        raise ValueError(
            "No XCom data found for task 'extract_transaction_data'"
        )
    # Wrap the payload in a buffer: passing a literal JSON string to
    # read_json is deprecated since pandas 2.1.
    df = pd.read_json(StringIO(data))

    # Extract time features
    df['hour'] = df['timestamp'].dt.hour
    df['day_of_week'] = df['timestamp'].dt.day_name()

    # Time-based analysis
    hourly_trends = df.groupby('hour').agg(
        transaction_count=('transaction_id', 'count'),
        total_sales=('amount', 'sum')
    ).reset_index()

    daily_trends = df.groupby('day_of_week').agg(
        transaction_count=('transaction_id', 'count'),
        total_sales=('amount', 'sum')
    ).reset_index()

    # Save to database. pandas' to_sql needs a SQLAlchemy connectable; a raw
    # DBAPI connection from hook.get_conn() is only supported for sqlite3.
    hook = PostgresHook(postgres_conn_id='postgres_conn')
    engine = hook.get_sqlalchemy_engine()
    try:
        hourly_trends.to_sql('hourly_trends', engine, if_exists='replace', index=False)
        daily_trends.to_sql('daily_trends', engine, if_exists='replace', index=False)
    finally:
        # Release pooled connections even when a write fails.
        engine.dispose()

def analyze_product_affinity(**kwargs):
    """Identify frequently bought together products.

    Pulls the JSON-serialised transactions published to XCom by the
    ``extract_transaction_data`` task, builds a transaction x product boolean
    basket matrix, mines frequent itemsets with Apriori (``min_support=0.1``),
    derives association rules with ``lift >= 1``, and replaces the
    ``product_affinity_rules`` table with the rules sorted by confidence.

    Args:
        **kwargs: Airflow task context; ``kwargs['ti']`` must be the task
            instance, which is used for ``xcom_pull``.

    Raises:
        ValueError: If the upstream task has not published any data to XCom.
    """
    ti = kwargs['ti']
    data = ti.xcom_pull(task_ids='extract_transaction_data')
    if data is None:
        raise ValueError(
            "No XCom data found for task 'extract_transaction_data'"
        )
    # Wrap the payload in a buffer: passing a literal JSON string to
    # read_json is deprecated since pandas 2.1.
    df = pd.read_json(StringIO(data))

    # Prepare data for association rules: one row per transaction, one
    # column per product, cells hold the summed amount (0 when absent).
    basket = (
        df.groupby(['transaction_id', 'product_id'])['amount']
        .sum()
        .unstack(fill_value=0)
    )

    # Convert to a boolean matrix with a vectorised comparison
    # (DataFrame.applymap is deprecated).
    basket = basket > 0

    # Find frequent itemsets
    frequent_itemsets = apriori(basket, min_support=0.1, use_colnames=True)

    # Generate association rules
    rules = association_rules(frequent_itemsets, metric="lift", min_threshold=1)
    rules = rules.sort_values('confidence', ascending=False)

    # The antecedents/consequents columns hold frozensets, which the database
    # driver cannot store; serialise them as sorted, comma-separated strings.
    for column in ('antecedents', 'consequents'):
        rules[column] = rules[column].apply(
            lambda items: ', '.join(sorted(map(str, items)))
        )

    # Save to database. pandas' to_sql needs a SQLAlchemy connectable; a raw
    # DBAPI connection from hook.get_conn() is only supported for sqlite3.
    hook = PostgresHook(postgres_conn_id='postgres_conn')
    engine = hook.get_sqlalchemy_engine()
    try:
        rules.to_sql('product_affinity_rules', engine, if_exists='replace', index=False)
    finally:
        # Release pooled connections even when the write fails.
        engine.dispose()

# Daily DAG: one extraction task fans out to three independent analyses,
# each of which reads the extracted transactions from XCom.
with DAG(
    'ecommerce_behavior_analytics',
    default_args=default_args,
    description='ETL pipeline for e-commerce customer behavior analytics',
    # `schedule` replaces the deprecated `schedule_interval` argument.
    schedule='@daily',
    catchup=False,
) as dag:

    extract_data = PythonOperator(
        task_id='extract_transaction_data',
        python_callable=extract_transaction_data
    )

    clv_analysis = PythonOperator(
        task_id='calculate_clv',
        python_callable=calculate_clv
    )

    time_analysis = PythonOperator(
        task_id='analyze_time_trends',
        python_callable=analyze_time_trends
    )

    product_analysis = PythonOperator(
        task_id='analyze_product_affinity',
        python_callable=analyze_product_affinity
    )

    # The analyses pull the extraction result from XCom, so they must not
    # start until the extraction task has finished.
    extract_data >> [clv_analysis, time_analysis, product_analysis]
