# Vendor Payment Delay Prediction - Data Exploration

## Project Overview

This notebook explores vendor payment data to understand patterns and factors that influence payment delays. The goal is to build predictive models that can classify whether upcoming vendor payments will be "On-time" or "Delayed."

### Problem Statement
- Late payments to vendors can disrupt supply chains and damage relationships
- Delays may stem from cash flow issues, operational inefficiencies, or external factors
- Early identification of potential delays enables proactive management

### Business Impact
- Early alerts for finance teams
- Prioritization of payments to critical vendors  
- Better vendor relationship management
- Improved cash flow planning

In [None]:
# Import Required Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import warnings
warnings.filterwarnings('ignore')

# Set style for better visualizations.
# Matplotlib 3.6 renamed its bundled seaborn style sheets to 'seaborn-v0_8*';
# older releases only ship the legacy 'seaborn' name. Use whichever one this
# installation provides so the notebook does not fail in its first cell; if
# neither exists the Matplotlib default style is kept.
for _style_name in ('seaborn-v0_8', 'seaborn'):
    if _style_name in plt.style.available:
        plt.style.use(_style_name)
        break

# Colour palette applied to the seaborn/matplotlib charts drawn afterwards
sns.set_palette("husl")

print("Libraries imported successfully!")

In [None]:
# Generate Sample Data
import sys
import os

# Extend the import path so `data_generator` can be resolved; the location is
# relative to the notebook's working directory.
sys.path.append('../src')

from data_generator import VendorPaymentDataGenerator

# Build the synthetic vendor payment table that the remaining cells analyse.
# A fixed random_state is passed to the generator (presumably for
# reproducibility -- confirm in data_generator).
print("Generating synthetic vendor payment data...")
generator = VendorPaymentDataGenerator(n_records=10000, random_state=42)
df = generator.generate_vendor_data()

# Quick sanity summary: row count, table shape and share of delayed payments
record_total = len(df)
overall_delay_rate = df['is_delayed'].mean()
summary_lines = [
    f"Generated {record_total} records",
    f"Data shape: {df.shape}",
    f"Delay rate: {overall_delay_rate:.2%}",
]
print("\n".join(summary_lines))

In [None]:
# Basic Dataset Information
# Prints the table dimensions, column dtypes, a preview of the first rows and
# a per-column report of missing values.
print("=== DATASET OVERVIEW ===")
print(f"Dataset shape: {df.shape}")
print(f"Total records: {len(df):,}")
print(f"Number of features: {len(df.columns)}")

print("\n=== COLUMN INFORMATION ===")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() -- that would append a stray "None" line.
df.info()

print("\n=== FIRST FEW ROWS ===")
display(df.head())

print("\n=== MISSING VALUES ===")
missing_values = df.isnull().sum()  # number of null cells per column
total_missing = missing_values.sum()
print(f"Total missing values: {total_missing}")
if total_missing > 0:
    print("\nMissing values by column:")
    # Only list the columns that actually contain nulls
    print(missing_values[missing_values > 0])
else:
    print("No missing values found!")

In [None]:
# Target Variable Analysis
# Summarises the on-time / delayed split of `is_delayed` (printed counts, bar
# chart and pie chart) and basic statistics of `days_difference`.
print("=== TARGET VARIABLE ANALYSIS ===")

# Payment delay distribution.
# value_counts() orders its result by frequency (most common first), so it is
# re-sorted by label to guarantee the order 0 (on-time), 1 (delayed). The
# positional tick labels, pie labels and colours below depend on that order;
# without the sort they are swapped whenever delayed payments outnumber
# on-time ones.
delay_counts = df['is_delayed'].value_counts().sort_index()
delay_percentages = df['is_delayed'].value_counts(normalize=True).sort_index() * 100

print(f"On-time payments: {delay_counts[0]:,} ({delay_percentages[0]:.1f}%)")
print(f"Delayed payments: {delay_counts[1]:,} ({delay_percentages[1]:.1f}%)")

# Visualize target distribution
fig, axes = plt.subplots(1, 2, figsize=(15, 5))

# Count plot (bars follow the 0, 1 label order established above)
delay_counts.plot(kind='bar', ax=axes[0], color=['lightgreen', 'lightcoral'])
axes[0].set_title('Payment Status Distribution')
axes[0].set_xlabel('Payment Status')
axes[0].set_ylabel('Count')
axes[0].set_xticklabels(['On-time', 'Delayed'], rotation=0)

# Pie chart (wedges follow the same 0, 1 label order)
axes[1].pie(delay_counts.values, labels=['On-time', 'Delayed'],
            autopct='%1.1f%%', colors=['lightgreen', 'lightcoral'])
axes[1].set_title('Payment Status Percentage')

plt.tight_layout()
plt.show()

# Days difference analysis.
# The printed labels treat the minimum as "early" and the maximum as "late";
# NOTE(review): presumably days_difference = actual - due date -- confirm
# against the data generator.
days_diff = df['days_difference']
print(f"\n=== PAYMENT TIMING ANALYSIS ===")
print(f"Average days difference: {days_diff.mean():.2f}")
print(f"Median days difference: {days_diff.median():.2f}")
print(f"Standard deviation: {days_diff.std():.2f}")
print(f"Min days (early): {days_diff.min()}")
print(f"Max days (late): {days_diff.max()}")

In [None]:
# Vendor Category Analysis
# Reports invoice volume per vendor category, then a per-category summary of
# delay share, mean invoice amount and mean payment terms, with four bar charts.
print("=== VENDOR CATEGORY ANALYSIS ===")

# Invoice volume per category (value_counts lists the most frequent first)
category_counts = df['vendor_category'].value_counts()
print("Vendor category distribution:")
print(category_counts)

# Per-category aggregates, rounded to three decimals
category_summary = df.groupby('vendor_category').agg({
    'is_delayed': ['count', 'sum', 'mean'],
    'invoice_amount': 'mean',
    'payment_terms': 'mean'
}).round(3)

# Flatten the two-level column header produced by the aggregation
category_summary.columns = [
    'Total_Invoices', 'Delayed_Count', 'Delay_Rate', 'Avg_Amount', 'Avg_Terms'
]

# Categories with the highest delay share come first
delay_by_category = category_summary.sort_values('Delay_Rate', ascending=False)

print("\nDelay analysis by vendor category:")
display(delay_by_category)

# Visualizations: one bar chart per metric on a 2x2 grid
fig, axes = plt.subplots(2, 2, figsize=(16, 12))

# (series to plot, target axis, bar colour, panel title, y-axis label)
category_panels = [
    (category_counts, axes[0, 0], 'skyblue',
     'Number of Invoices by Vendor Category', 'Count'),
    (delay_by_category['Delay_Rate'], axes[0, 1], 'lightcoral',
     'Delay Rate by Vendor Category', 'Delay Rate'),
    (delay_by_category['Avg_Amount'], axes[1, 0], 'lightgreen',
     'Average Invoice Amount by Category', 'Average Amount ($)'),
    (delay_by_category['Avg_Terms'], axes[1, 1], 'orange',
     'Average Payment Terms by Category', 'Average Payment Terms (days)'),
]

for panel_data, panel_ax, bar_color, panel_title, y_label in category_panels:
    panel_data.plot(kind='bar', ax=panel_ax, color=bar_color)
    panel_ax.set_title(panel_title)
    panel_ax.set_xlabel('Vendor Category')
    panel_ax.set_ylabel(y_label)
    panel_ax.tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.show()

In [None]:
# Invoice Amount Analysis
# Describes invoice amounts overall and per payment status, buckets them into
# size categories, and plots how amount relates to delays.
print("=== INVOICE AMOUNT ANALYSIS ===")

# Basic statistics
print("Invoice amount statistics:")
print(df['invoice_amount'].describe())

# Amount distribution by delay status
print("\nAmount statistics by payment status:")
amount_by_status = df.groupby('is_delayed')['invoice_amount'].describe()
# Relabel by value instead of by position: a positional assignment raises when
# one of the two statuses is absent from the data.
amount_by_status = amount_by_status.rename(index={0: 'On-time', 1: 'Delayed'})
display(amount_by_status)

# Create amount categories for analysis.
# Buckets are right-closed: (0, 10k], (10k, 50k], (50k, 100k], (100k, inf)
df['amount_category'] = pd.cut(df['invoice_amount'],
                               bins=[0, 10000, 50000, 100000, np.inf],
                               labels=['Small', 'Medium', 'Large', 'XLarge'])

# Delay rate by amount category.
# observed=False is spelled out so every bucket (even an empty one) stays in
# the table regardless of the pandas version's default for categorical keys.
delay_by_amount = df.groupby('amount_category', observed=False).agg({
    'is_delayed': ['count', 'mean'],
    'invoice_amount': ['min', 'max', 'mean']
}).round(3)

delay_by_amount.columns = ['Count', 'Delay_Rate', 'Min_Amount', 'Max_Amount', 'Avg_Amount']
print("\nDelay rate by amount category:")
display(delay_by_amount)

# Visualizations
fig, axes = plt.subplots(2, 2, figsize=(16, 12))

# Amount distribution (x-axis clipped at $200k to keep the bulk readable)
axes[0,0].hist(df['invoice_amount'], bins=50, alpha=0.7, color='skyblue', edgecolor='black')
axes[0,0].set_title('Distribution of Invoice Amounts')
axes[0,0].set_xlabel('Invoice Amount ($)')
axes[0,0].set_ylabel('Frequency')
axes[0,0].set_xlim(0, 200000)

# Amount by delay status (box plot)
df.boxplot(column='invoice_amount', by='is_delayed', ax=axes[0,1])
# DataFrame.boxplot(by=...) adds a figure-level "Boxplot grouped by ..."
# super-title; clear it so it does not sit above all four panels.
fig.suptitle('')
axes[0,1].set_title('Invoice Amount by Payment Status')
axes[0,1].set_xlabel('Payment Status (0=On-time, 1=Delayed)')
axes[0,1].set_ylabel('Invoice Amount ($)')

# Delay rate by amount category
delay_by_amount['Delay_Rate'].plot(kind='bar', ax=axes[1,0], color='lightcoral')
axes[1,0].set_title('Delay Rate by Amount Category')
axes[1,0].set_xlabel('Amount Category')
axes[1,0].set_ylabel('Delay Rate')
axes[1,0].tick_params(axis='x', rotation=45)

# Scatter plot: Amount vs Days Difference, coloured by delay flag
scatter = axes[1,1].scatter(df['invoice_amount'], df['days_difference'],
                            c=df['is_delayed'], cmap='RdYlBu', alpha=0.6)
axes[1,1].set_title('Invoice Amount vs Payment Delay')
axes[1,1].set_xlabel('Invoice Amount ($)')
axes[1,1].set_ylabel('Days Difference')
axes[1,1].set_xlim(0, 200000)
plt.colorbar(scatter, ax=axes[1,1], label='Delayed (1) / On-time (0)')

plt.tight_layout()
plt.show()

In [None]:
# Temporal Patterns Analysis
# Derives calendar features from the invoice date and compares delay rates by
# month, quarter, weekday and month-end period.
print("=== TEMPORAL PATTERNS ANALYSIS ===")

# Extract temporal features.
# pd.to_datetime leaves datetime64 data unchanged and parses date strings, so
# the .dt accessor works whether or not the column was already converted.
invoice_dates = pd.to_datetime(df['invoice_date'])
df['invoice_month'] = invoice_dates.dt.month
df['invoice_quarter'] = invoice_dates.dt.quarter
df['invoice_year'] = invoice_dates.dt.year
df['invoice_day_of_week'] = invoice_dates.dt.dayofweek  # 0 = Monday ... 6 = Sunday
# "Month end" here means any day after the 25th of the month
df['is_month_end'] = (invoice_dates.dt.day > 25).astype(int)

# Monthly patterns
monthly_delays = df.groupby('invoice_month').agg({
    'is_delayed': ['count', 'mean'],
    'invoice_amount': 'mean'
}).round(3)
monthly_delays.columns = ['Invoice_Count', 'Delay_Rate', 'Avg_Amount']

print("Monthly delay patterns:")
display(monthly_delays)

# Quarterly patterns.
# NOTE(review): this groups on the existing 'quarter' column (the summary cell
# compares it against 'Q4' / 'Q2'), not on the numeric 'invoice_quarter'
# derived above -- confirm both describe the same calendar quarter.
quarterly_delays = df.groupby('quarter').agg({
    'is_delayed': ['count', 'mean'],
    # share of invoices in the quarter whose cash flow level is 'Low'
    'cash_flow_level': lambda x: (x == 'Low').mean()
}).round(3)
quarterly_delays.columns = ['Invoice_Count', 'Delay_Rate', 'Low_Cash_Flow_Rate']

print("\nQuarterly patterns:")
display(quarterly_delays)

# Visualizations
fig, axes = plt.subplots(2, 3, figsize=(18, 12))

# Monthly delay rate
monthly_delays['Delay_Rate'].plot(kind='line', marker='o', ax=axes[0,0], color='red')
axes[0,0].set_title('Delay Rate by Month')
axes[0,0].set_xlabel('Month')
axes[0,0].set_ylabel('Delay Rate')
axes[0,0].grid(True, alpha=0.3)

# Quarterly delay rate
quarterly_delays['Delay_Rate'].plot(kind='bar', ax=axes[0,1], color='orange')
axes[0,1].set_title('Delay Rate by Quarter')
axes[0,1].set_xlabel('Quarter')
axes[0,1].set_ylabel('Delay Rate')
axes[0,1].tick_params(axis='x', rotation=0)

# Day of week patterns.
# Weekday numbers are mapped to names by value: assigning the seven labels
# positionally raises whenever some weekday (e.g. Sat/Sun) has no invoices.
dow_labels = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']
dow_delays = df.groupby('invoice_day_of_week')['is_delayed'].mean()
dow_delays = dow_delays.rename(index=dict(enumerate(dow_labels)))
dow_delays.plot(kind='bar', ax=axes[0,2], color='green')
axes[0,2].set_title('Delay Rate by Day of Week')
axes[0,2].set_xlabel('Day of Week')
axes[0,2].set_ylabel('Delay Rate')
axes[0,2].tick_params(axis='x', rotation=45)

# Monthly invoice volume
monthly_delays['Invoice_Count'].plot(kind='bar', ax=axes[1,0], color='skyblue')
axes[1,0].set_title('Invoice Volume by Month')
axes[1,0].set_xlabel('Month')
axes[1,0].set_ylabel('Invoice Count')

# Cash flow vs delay rate by quarter (two bars side by side per quarter)
x = np.arange(len(quarterly_delays))
width = 0.35
axes[1,1].bar(x - width/2, quarterly_delays['Delay_Rate'], width, label='Delay Rate', color='red', alpha=0.7)
axes[1,1].bar(x + width/2, quarterly_delays['Low_Cash_Flow_Rate'], width, label='Low Cash Flow Rate', color='blue', alpha=0.7)
axes[1,1].set_title('Delay Rate vs Cash Flow by Quarter')
axes[1,1].set_xlabel('Quarter')
axes[1,1].set_xticks(x)
axes[1,1].set_xticklabels(quarterly_delays.index)
axes[1,1].legend()

# Month-end effect (flags relabelled by value, for the same reason as above)
month_end_effect = df.groupby('is_month_end')['is_delayed'].mean()
month_end_effect = month_end_effect.rename(index={0: 'Regular Days', 1: 'Month End'})
month_end_effect.plot(kind='bar', ax=axes[1,2], color='purple')
axes[1,2].set_title('Month-End Effect on Delays')
axes[1,2].set_xlabel('Period')
axes[1,2].set_ylabel('Delay Rate')
axes[1,2].tick_params(axis='x', rotation=0)

plt.tight_layout()
plt.show()

In [None]:
# Cash Flow and Financial Analysis
# Compares delay rates across cash flow levels, an economic-stress flag and
# dispute status, then plots each relationship.
print("=== CASH FLOW & FINANCIAL ANALYSIS ===")

# Cash flow level analysis
cash_flow_analysis = df.groupby('cash_flow_level').agg({
    'is_delayed': ['count', 'mean'],
    'invoice_amount': 'mean',
    'month_cash_available': 'mean'
}).round(3)
cash_flow_analysis.columns = ['Count', 'Delay_Rate', 'Avg_Invoice', 'Avg_Cash_Available']

print("Delay rate by cash flow level:")
display(cash_flow_analysis)

# Economic indicator impact.
# One constant drives both the stress flag and the threshold line drawn on the
# histogram below, so the two cannot drift apart.
ECONOMIC_STRESS_THRESHOLD = 85
df['economic_stress'] = (df['economic_indicator'] < ECONOMIC_STRESS_THRESHOLD).astype(int)
economic_impact = df.groupby('economic_stress')['is_delayed'].mean()
# Relabel by value: a positional assignment raises when no row (or every row)
# falls below the threshold and only one group exists.
economic_impact = economic_impact.rename(index={0: 'Normal Economy', 1: 'Economic Stress'})

print(f"\nEconomic impact on delays:")
print(economic_impact)

# Dispute analysis
dispute_impact = df.groupby('has_dispute').agg({
    'is_delayed': ['count', 'mean'],
    'days_difference': 'mean'
}).round(3)
dispute_impact.columns = ['Count', 'Delay_Rate', 'Avg_Days_Diff']
# Same value-based relabelling as for the economic-stress groups
dispute_impact = dispute_impact.rename(index={0: 'No Dispute', 1: 'Has Dispute'})

print(f"\nDispute impact on payments:")
display(dispute_impact)

# Visualizations
fig, axes = plt.subplots(2, 3, figsize=(18, 12))

# Cash flow level impact
cash_flow_analysis['Delay_Rate'].plot(kind='bar', ax=axes[0,0], color='lightblue')
axes[0,0].set_title('Delay Rate by Cash Flow Level')
axes[0,0].set_xlabel('Cash Flow Level')
axes[0,0].set_ylabel('Delay Rate')
axes[0,0].tick_params(axis='x', rotation=0)

# Economic stress impact
economic_impact.plot(kind='bar', ax=axes[0,1], color='orange')
axes[0,1].set_title('Delay Rate by Economic Condition')
axes[0,1].set_xlabel('Economic Condition')
axes[0,1].set_ylabel('Delay Rate')
axes[0,1].tick_params(axis='x', rotation=45)

# Dispute impact
dispute_impact['Delay_Rate'].plot(kind='bar', ax=axes[0,2], color='red')
axes[0,2].set_title('Delay Rate by Dispute Status')
axes[0,2].set_xlabel('Dispute Status')
axes[0,2].set_ylabel('Delay Rate')
axes[0,2].tick_params(axis='x', rotation=0)

# Cash available vs delays (scatter), coloured by invoice amount
scatter = axes[1,0].scatter(df['month_cash_available'], df['is_delayed'],
                            alpha=0.6, c=df['invoice_amount'], cmap='viridis')
axes[1,0].set_title('Cash Available vs Payment Delays')
axes[1,0].set_xlabel('Monthly Cash Available ($)')
axes[1,0].set_ylabel('Is Delayed')
plt.colorbar(scatter, ax=axes[1,0], label='Invoice Amount')

# Economic indicator distribution with the stress threshold marked
df['economic_indicator'].hist(bins=30, ax=axes[1,1], alpha=0.7, color='green')
axes[1,1].axvline(ECONOMIC_STRESS_THRESHOLD, color='red', linestyle='--', label='Stress Threshold')
axes[1,1].set_title('Economic Indicator Distribution')
axes[1,1].set_xlabel('Economic Indicator')
axes[1,1].set_ylabel('Frequency')
axes[1,1].legend()

# Cash flow vs amount relationship.
# Levels are mapped to 1..3 so they can be placed on a numeric axis; any level
# outside this mapping becomes NaN.
cash_flow_mapping = {'Low': 1, 'Medium': 2, 'High': 3}
df['cash_flow_numeric'] = df['cash_flow_level'].map(cash_flow_mapping)
axes[1,2].scatter(df['cash_flow_numeric'], df['invoice_amount'],
                  c=df['is_delayed'], cmap='RdYlBu', alpha=0.6)
axes[1,2].set_title('Cash Flow vs Invoice Amount')
axes[1,2].set_xlabel('Cash Flow Level (1=Low, 2=Medium, 3=High)')
axes[1,2].set_ylabel('Invoice Amount ($)')
axes[1,2].set_xticks([1, 2, 3])
axes[1,2].set_xticklabels(['Low', 'Medium', 'High'])

plt.tight_layout()
plt.show()

In [None]:
# Vendor Relationship Analysis
# Relates the current delay rate to a vendor's past delays, relationship
# length, payment frequency and payment terms.
print("=== VENDOR RELATIONSHIP ANALYSIS ===")

# Past delays impact
past_delays_impact = df.groupby('vendor_past_delays').agg({
    'is_delayed': ['count', 'mean'],
    'invoice_amount': 'mean'
}).round(3)
past_delays_impact.columns = ['Count', 'Current_Delay_Rate', 'Avg_Amount']

print("Impact of past delays on current payments:")
display(past_delays_impact.head(10))

# Relationship duration analysis.
# include_lowest=True keeps vendors with exactly 0 years in the 'New' bucket;
# with right-closed bins starting at 0 they would otherwise become NaN and
# drop out of the grouped table.
df['relationship_category'] = pd.cut(df['vendor_relationship_years'],
                                     bins=[0, 1, 3, 5, np.inf],
                                     labels=['New', 'Developing', 'Established', 'Long_term'],
                                     include_lowest=True)

# observed=False is spelled out so every bucket (even an empty one) stays in
# the table regardless of the pandas version's default for categorical keys.
relationship_analysis = df.groupby('relationship_category', observed=False).agg({
    'is_delayed': ['count', 'mean'],
    'vendor_past_delays': 'mean',
    'payment_frequency': 'mean'
}).round(3)
relationship_analysis.columns = ['Count', 'Delay_Rate', 'Avg_Past_Delays', 'Avg_Frequency']

print("\nDelay patterns by relationship duration:")
display(relationship_analysis)

# Payment frequency analysis
frequency_analysis = df.groupby('payment_frequency').agg({
    'is_delayed': ['count', 'mean'],
    'invoice_amount': 'mean'
}).round(3)
frequency_analysis.columns = ['Count', 'Delay_Rate', 'Avg_Amount']

print("\nDelay patterns by payment frequency:")
display(frequency_analysis.head(10))

# Visualizations
fig, axes = plt.subplots(2, 3, figsize=(18, 12))

# Past delays vs current delays (x-axis limited to 0-10 past delays)
past_delays_impact['Current_Delay_Rate'].plot(kind='line', marker='o', ax=axes[0,0], color='red')
axes[0,0].set_title('Current Delay Rate vs Past Delays')
axes[0,0].set_xlabel('Number of Past Delays')
axes[0,0].set_ylabel('Current Delay Rate')
axes[0,0].grid(True, alpha=0.3)
axes[0,0].set_xlim(0, 10)

# Relationship duration impact
relationship_analysis['Delay_Rate'].plot(kind='bar', ax=axes[0,1], color='green')
axes[0,1].set_title('Delay Rate by Relationship Duration')
axes[0,1].set_xlabel('Relationship Category')
axes[0,1].set_ylabel('Delay Rate')
axes[0,1].tick_params(axis='x', rotation=45)

# Payment frequency impact (first ten frequency values only)
frequency_analysis.head(10)['Delay_Rate'].plot(kind='bar', ax=axes[0,2], color='blue')
axes[0,2].set_title('Delay Rate by Payment Frequency')
axes[0,2].set_xlabel('Monthly Payment Frequency')
axes[0,2].set_ylabel('Delay Rate')

# Relationship years distribution
df['vendor_relationship_years'].hist(bins=30, ax=axes[1,0], alpha=0.7, color='orange')
axes[1,0].set_title('Distribution of Vendor Relationship Duration')
axes[1,0].set_xlabel('Relationship Years')
axes[1,0].set_ylabel('Frequency')

# Scatter: Relationship years vs past delays, coloured by current delay flag
scatter = axes[1,1].scatter(df['vendor_relationship_years'], df['vendor_past_delays'],
                            c=df['is_delayed'], cmap='RdYlBu', alpha=0.6)
axes[1,1].set_title('Relationship Duration vs Past Delays')
axes[1,1].set_xlabel('Relationship Years')
axes[1,1].set_ylabel('Past Delays')
plt.colorbar(scatter, ax=axes[1,1], label='Currently Delayed')

# Payment terms vs delay rate
terms_analysis = df.groupby('payment_terms')['is_delayed'].mean()
terms_analysis.plot(kind='line', marker='o', ax=axes[1,2], color='purple')
axes[1,2].set_title('Delay Rate by Payment Terms')
axes[1,2].set_xlabel('Payment Terms (days)')
axes[1,2].set_ylabel('Delay Rate')
axes[1,2].grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

In [None]:
# Correlation Analysis
# Builds a correlation matrix over the numeric columns, ranks features by
# absolute correlation with `is_delayed`, and reports delay rates for a few
# hand-picked high-risk feature combinations.
print("=== CORRELATION ANALYSIS ===")

# Select numerical features for correlation.
# Several of these ('invoice_month', 'invoice_quarter', 'is_month_end',
# 'economic_stress') are derived columns added by earlier cells.
# NOTE(review): 'days_difference' presumably determines 'is_delayed' directly
# (target leakage) -- confirm before reading its correlation as predictive.
numerical_features = [
    'payment_terms', 'invoice_amount', 'vendor_past_delays',
    'vendor_relationship_years', 'payment_frequency', 'month_cash_available',
    'economic_indicator', 'has_dispute', 'is_delayed', 'days_difference',
    'invoice_month', 'invoice_quarter', 'is_month_end', 'economic_stress'
]

# Calculate correlation matrix
correlation_matrix = df[numerical_features].corr()

print("Top correlations with payment delays:")
delay_correlations = correlation_matrix['is_delayed'].abs().sort_values(ascending=False)
# The target's correlation with itself is always 1.0; leave it out of the
# ranking so it does not take up one of the ten displayed slots.
feature_correlations = delay_correlations.drop('is_delayed')
display(feature_correlations.head(10))

# Visualizations
fig, axes = plt.subplots(2, 2, figsize=(16, 12))

# Correlation heatmap
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', center=0,
            fmt='.2f', ax=axes[0,0], cbar_kws={'shrink': 0.8})
axes[0,0].set_title('Feature Correlation Matrix')

# Feature importance for delays (correlation)
feature_correlations.plot(kind='barh', ax=axes[0,1], color='lightcoral')
axes[0,1].set_title('Feature Correlation with Payment Delays')
axes[0,1].set_xlabel('Absolute Correlation')

# Features highlighted in the two scatter plots below (this list is kept for
# reference only; the plots name their columns explicitly)
top_features = ['is_delayed', 'vendor_past_delays', 'has_dispute', 'economic_indicator', 'month_cash_available']
# Create scatter plots for top feature pairs
axes[1,0].scatter(df['vendor_past_delays'], df['has_dispute'],
                  c=df['is_delayed'], cmap='RdYlBu', alpha=0.6)
axes[1,0].set_xlabel('Vendor Past Delays')
axes[1,0].set_ylabel('Has Dispute')
axes[1,0].set_title('Past Delays vs Disputes')

axes[1,1].scatter(df['economic_indicator'], df['month_cash_available'],
                  c=df['is_delayed'], cmap='RdYlBu', alpha=0.6)
axes[1,1].set_xlabel('Economic Indicator')
axes[1,1].set_ylabel('Monthly Cash Available')
axes[1,1].set_title('Economic vs Cash Flow Factors')

plt.tight_layout()
plt.show()

# Feature interaction analysis
print("\n=== FEATURE INTERACTIONS ===")

# High-risk combinations: boolean row masks, paired by position with the
# display names in condition_names
high_risk_conditions = [
    (df['vendor_past_delays'] > 2) & (df['has_dispute'] == 1),
    (df['cash_flow_level'] == 'Low') & (df['economic_stress'] == 1),
    (df['invoice_amount'] > 100000) & (df['cash_flow_level'] == 'Low'),
    (df['payment_terms'] > 45) & (df['vendor_past_delays'] > 1)
]

condition_names = [
    'Past Delays + Dispute',
    'Low Cash + Economic Stress', 
    'Large Amount + Low Cash',
    'Long Terms + Past Delays'
]

print("Delay rates for high-risk combinations:")
for condition, name in zip(high_risk_conditions, condition_names):
    count = condition.sum()  # number of rows matching the combination
    if count > 0:
        delay_rate = df.loc[condition, 'is_delayed'].mean()
        print(f"{name}: {delay_rate:.2%} delay rate ({count} cases)")
    else:
        # Avoid reporting a NaN rate for an empty selection
        print(f"{name}: No cases found")

In [None]:
# Key Insights and Summary
# Prints headline delay rates computed from `df`, followed by a fixed list of
# takeaways and modeling recommendations.
# NOTE(review): the "Class balance", "HIGH-RISK FACTORS" and "MODELING
# RECOMMENDATIONS" lines are static text, not derived from the data -- re-check
# them if the dataset changes.
print("=" * 60)
print("KEY INSIGHTS FROM DATA EXPLORATION")
print("=" * 60)

print("\n🎯 TARGET VARIABLE INSIGHTS:")
print(f"• Overall delay rate: {df['is_delayed'].mean():.1%}")
print(f"• Class balance: Reasonably balanced for ML modeling")

print("\n📊 VENDOR CATEGORY INSIGHTS:")
# Sorted descending, so the first entry is the worst category and the last the best
category_insights = df.groupby('vendor_category')['is_delayed'].mean().sort_values(ascending=False)
print(f"• Highest delay rate: {category_insights.index[0]} ({category_insights.iloc[0]:.1%})")
print(f"• Lowest delay rate: {category_insights.index[-1]} ({category_insights.iloc[-1]:.1%})")

print("\n💰 AMOUNT INSIGHTS:")
large_amounts = df['invoice_amount'] > 100000
print(f"• Large invoices (>$100k) delay rate: {df[large_amounts]['is_delayed'].mean():.1%}")
print(f"• Small invoices (<$10k) delay rate: {df[df['invoice_amount'] < 10000]['is_delayed'].mean():.1%}")

print("\n📅 TEMPORAL INSIGHTS:")
# Relies on the 'quarter' column holding 'Q2' / 'Q4' style labels; a label that
# is absent yields a NaN rate
q4_delays = df[df['quarter'] == 'Q4']['is_delayed'].mean()
q2_delays = df[df['quarter'] == 'Q2']['is_delayed'].mean()
print(f"• Q4 delay rate: {q4_delays:.1%}")
print(f"• Q2 delay rate: {q2_delays:.1%}")

print("\n💳 CASH FLOW INSIGHTS:")
cash_flow_insights = df.groupby('cash_flow_level')['is_delayed'].mean()
print(f"• Low cash flow delay rate: {cash_flow_insights['Low']:.1%}")
print(f"• High cash flow delay rate: {cash_flow_insights['High']:.1%}")

print("\n🤝 VENDOR RELATIONSHIP INSIGHTS:")
high_past_delays = df['vendor_past_delays'] > 3
print(f"• Vendors with >3 past delays: {df[high_past_delays]['is_delayed'].mean():.1%} current delay rate")

# Stored under its own name: reusing `dispute_impact` would overwrite the
# dispute summary table built in the cash-flow cell with a single float and
# break any re-run of that cell's plot.
dispute_delay_rate = df[df['has_dispute'] == 1]['is_delayed'].mean()
print(f"• Invoices with disputes: {dispute_delay_rate:.1%} delay rate")

print("\n⚠️ HIGH-RISK FACTORS:")
print("• Past payment delays are strong predictors")
print("• Disputes significantly increase delay probability")
print("• Low cash flow periods show higher delays")
print("• Certain vendor categories are more prone to delays")
print("• Large invoice amounts correlate with delays")

print("\n📈 MODELING RECOMMENDATIONS:")
print("• Use vendor past performance as key feature")
print("• Include cash flow and economic indicators")
print("• Consider seasonal/temporal features")
print("• Encode vendor categories appropriately")
print("• Handle potential class imbalance if needed")
print("• Feature engineering for amount categories")

print("\n🔍 FEATURE IMPORTANCE PREVIEW:")
# Hand-picked shortlist, printed as a numbered list
important_features = [
    'vendor_past_delays', 'has_dispute', 'cash_flow_level',
    'invoice_amount', 'vendor_category', 'economic_indicator',
    'payment_terms', 'vendor_relationship_years'
]
print("Most promising features for modeling:")
for i, feature in enumerate(important_features, 1):
    print(f"{i:2d}. {feature}")

print("\n" + "=" * 60)
print("READY FOR DATA PREPROCESSING AND MODELING!")
print("=" * 60)