In [1]:
import pandas as pd
import numpy as np

def align_dataframes(df1, df2, df1_name="df1", df2_name="df2", 
                    drop_na=True, verbose=True):
    """
    Restrict two DataFrames to the index labels they have in common.

    The function is intended for datetime-indexed frames; the overlap is
    computed with ``Index.intersection`` on the two indices.

    Parameters:
    -----------
    df1, df2 : pandas.DataFrame
        Frames to align. Neither input is modified; copies are returned.
    df1_name, df2_name : str
        Labels used for the frames in diagnostic output and error messages.
    drop_na : bool, default=True
        If True, rows containing any NaN are dropped from each aligned frame,
        and both frames are then restricted to the rows that survive in both.
    verbose : bool, default=True
        If True, print shapes, date ranges and row counts at each stage.

    Returns:
    --------
    tuple : (df1_aligned, df2_aligned)
        The two frames restricted to their shared index labels.

    Raises:
    -------
    ValueError :
        - if the two indices share no labels,
        - if no rows remain after alignment and NaN removal,
        - if the two resulting indices are not equal.
    """
    # Pair each frame with its label so the diagnostics can be emitted in a loop.
    labelled_inputs = ((df1_name, df1), (df2_name, df2))

    if verbose:
        for label, frame in labelled_inputs:
            print(f"{label} shape: {frame.shape}")
        for label, frame in labelled_inputs:
            print(f"{label} date range: {frame.index.min()} to {frame.index.max()}")

    # Labels present in both indices
    shared_index = df1.index.intersection(df2.index)

    if verbose:
        print(f"Common dates: {len(shared_index)}")

    # Guard clause: nothing to align
    if not len(shared_index):
        if verbose:
            print(f"ERROR: No common dates between {df1_name} and {df2_name}!")
            for label, frame in labelled_inputs:
                print(f"{label} sample dates:", frame.index[:5].tolist())
        raise ValueError(f"No common dates found between {df1_name} and {df2_name}")

    # Copy so later cleaning never touches the caller's frames
    left = df1.loc[shared_index].copy()
    right = df2.loc[shared_index].copy()

    if verbose:
        print(f"After date alignment - {df1_name}: {left.shape}, {df2_name}: {right.shape}")

    if drop_na:
        rows_before = len(left)

        # Clean each side independently ...
        left, right = left.dropna(), right.dropna()

        # ... then keep only the rows that survived on both sides
        survivors = left.index.intersection(right.index)
        left, right = left.loc[survivors], right.loc[survivors]

        if verbose:
            print(f"After cleaning NaN - {df1_name}: {left.shape}, {df2_name}: {right.shape}")
            print(f"Removed {rows_before - len(left)} rows due to NaN values")

    # Sanity checks on the result
    if min(len(left), len(right)) == 0:
        raise ValueError("No data remaining after alignment and cleaning!")

    if not left.index.equals(right.index):
        raise ValueError("Final indices are not equal - alignment failed!")

    if verbose:
        print(f"✓ Alignment successful! Final shape: {left.shape[0]} rows")
        print(f"Date range: {left.index.min()} to {left.index.max()}")

    return left, right


# Example usage (uncomment to test):
# X_aligned, Y_aligned = align_dataframes(X, Y, "X (factors)", "Y (assets)")

# For silent operation:
# X_aligned, Y_aligned = align_dataframes(X, Y, verbose=False)

# To keep NaN values:
# X_aligned, Y_aligned = align_dataframes(X, Y, drop_na=False)

In [2]:
# Example: build two synthetic frames whose daily date ranges only partly overlap

# Two daily calendars; they share 2021-06-01 through 2023-12-31
dates1 = pd.date_range(start='2020-01-01', end='2023-12-31', freq='D')
dates2 = pd.date_range(start='2021-06-01', end='2024-06-30', freq='D')

# Sample DataFrame 1: random-walk "prices", one column per ticker.
# The tuple order fixes the order of the random draws after seeding.
np.random.seed(42)
df_stocks = pd.DataFrame(
    {
        ticker: np.random.randn(len(dates1)).cumsum() + start_level
        for ticker, start_level in (('AAPL', 100), ('MSFT', 200), ('GOOGL', 150))
    },
    index=dates1,
)

# Sample DataFrame 2: scaled noise standing in for market factors (different date range)
np.random.seed(123)
df_factors = pd.DataFrame(
    {
        factor: np.random.randn(len(dates2)) * scale
        for factor, scale in (
            ('Market_Factor', 0.02),
            ('Size_Factor', 0.01),
            ('Value_Factor', 0.015),
        )
    },
    index=dates2,
)

# Knock out a few values to simulate real-world gaps
df_stocks.iloc[100:105, df_stocks.columns.get_loc('AAPL')] = np.nan            # missing AAPL data
df_factors.iloc[50:55, df_factors.columns.get_loc('Size_Factor')] = np.nan     # missing Size_Factor data

# Quick report of what was built
print("Sample DataFrames created:")
print(f"Stocks: {df_stocks.shape}, date range: {df_stocks.index.min()} to {df_stocks.index.max()}")
print(f"Factors: {df_factors.shape}, date range: {df_factors.index.min()} to {df_factors.index.max()}")
print(f"Stocks NaN count: {df_stocks.isnull().sum().sum()}")
print(f"Factors NaN count: {df_factors.isnull().sum().sum()}")

Sample DataFrames created:
Stocks: (1461, 3), date range: 2020-01-01 00:00:00 to 2023-12-31 00:00:00
Factors: (1126, 3), date range: 2021-06-01 00:00:00 to 2024-06-30 00:00:00
Stocks NaN count: 5
Factors NaN count: 5


# DataFrame Alignment Function

This notebook contains a generic function `align_dataframes()` for aligning two DataFrames with datetime indices.

## Key Features:
- **Automatic alignment**: Finds common dates between two DataFrames
- **Flexible options**: Control NaN handling and verbosity
- **Error handling**: Clear error messages for troubleshooting
- **Validation**: Ensures final alignment is correct

## Common Use Cases:
- Aligning stock prices with market factors
- Synchronizing economic indicators with financial data
- Preparing data for regression analysis
- Portfolio optimization with multiple data sources

## Usage Examples:
```python
# Basic usage with verbose output
df1_aligned, df2_aligned = align_dataframes(df1, df2, "Stock Prices", "Market Factors")

# Silent operation
df1_aligned, df2_aligned = align_dataframes(df1, df2, verbose=False)

# Keep NaN values (no cleaning)
df1_aligned, df2_aligned = align_dataframes(df1, df2, drop_na=False)
```

In [3]:
# The three examples below catch only ValueError: that is the exception
# align_dataframes raises for alignment problems. Any other exception
# (e.g. a NameError from stale kernel state) indicates a real bug and is
# allowed to surface instead of being printed and ignored.

# Example 1: Basic alignment with verbose output
print("=== Example 1: Basic Alignment ===")
try:
    stocks_aligned, factors_aligned = align_dataframes(
        df_stocks, df_factors,
        "Stock Prices", "Market Factors"
    )
except ValueError as e:
    print(f"✗ Error: {e}")
    print()
else:
    print("✓ Alignment successful!")
    print(f"Final aligned data shape: {stocks_aligned.shape}")
    print()

# Example 2: Silent operation
print("=== Example 2: Silent Operation ===")
try:
    stocks_silent, factors_silent = align_dataframes(
        df_stocks, df_factors,
        verbose=False
    )
except ValueError as e:
    print(f"✗ Error: {e}")
    print()
else:
    print(f"✓ Silent alignment completed: {stocks_silent.shape[0]} rows")
    print()

# Example 3: Keep NaN values (rows with missing data are retained)
print("=== Example 3: Keep NaN Values ===")
try:
    stocks_with_nan, factors_with_nan = align_dataframes(
        df_stocks, df_factors,
        "Stocks", "Factors",
        drop_na=False
    )
except ValueError as e:
    print(f"✗ Error: {e}")
else:
    print(f"✓ Alignment with NaN: {stocks_with_nan.shape[0]} rows")
    print(f"Stocks NaN count: {stocks_with_nan.isnull().sum().sum()}")
    print(f"Factors NaN count: {factors_with_nan.isnull().sum().sum()}")

=== Example 1: Basic Alignment ===
Stock Prices shape: (1461, 3)
Market Factors shape: (1126, 3)
Stock Prices date range: 2020-01-01 00:00:00 to 2023-12-31 00:00:00
Market Factors date range: 2021-06-01 00:00:00 to 2024-06-30 00:00:00
Common dates: 944
After date alignment - Stock Prices: (944, 3), Market Factors: (944, 3)
After cleaning NaN - Stock Prices: (939, 3), Market Factors: (939, 3)
Removed 5 rows due to NaN values
✓ Alignment successful! Final shape: 939 rows
Date range: 2021-06-01 00:00:00 to 2023-12-31 00:00:00
✓ Alignment successful!
Final aligned data shape: (939, 3)

=== Example 2: Silent Operation ===
✓ Silent alignment completed: 939 rows

=== Example 3: Keep NaN Values ===
Stocks shape: (1461, 3)
Factors shape: (1126, 3)
Stocks date range: 2020-01-01 00:00:00 to 2023-12-31 00:00:00
Factors date range: 2021-06-01 00:00:00 to 2024-06-30 00:00:00
Common dates: 944
After date alignment - Stocks: (944, 3), Factors: (944, 3)
✓ Alignment successful! Final shape: 944 rows
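Date range: 2021-06-01 00:00:00 to 2023-12-31 00:00:00
✓ Alignment with NaN: 944 rows
Stocks NaN count: 0
Factors NaN count: 5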

In [4]:
# Example 4: Error scenarios and additional utility functions

def quick_align(df1, df2):
    """
    Align two DataFrames silently.

    Calls align_dataframes with verbose=False. If it raises ValueError, the
    message is printed and (None, None) is returned instead of propagating
    the error; otherwise the aligned pair is returned unchanged.
    """
    try:
        aligned_pair = align_dataframes(df1, df2, verbose=False)
    except ValueError as err:
        print(f"Alignment failed: {err}")
        return None, None
    return aligned_pair

def alignment_summary(df1, df2, df1_name="df1", df2_name="df2"):
    """
    Report how much the indices of two DataFrames overlap, without aligning them.

    Parameters:
    -----------
    df1, df2 : pandas.DataFrame
        Frames whose indices are compared. Neither is modified.
    df1_name, df2_name : str
        Labels used for the frames in the printed report.

    Returns:
    --------
    dict with keys:
        'df1_shape', 'df2_shape'           : shape tuples of the inputs
        'df1_date_range', 'df2_date_range' : (index.min(), index.max()) tuples
        'common_dates'                     : number of shared index labels
        'overlap_percentage'               : shared labels as a percentage of the
                                             longer frame's row count (0.0 when
                                             both frames are empty)
        'can_align'                        : True if at least one label is shared

    The same information is also printed.
    """
    common_dates = df1.index.intersection(df2.index)

    # Two empty frames would otherwise divide by zero; they trivially have no overlap.
    longest_len = max(len(df1), len(df2))
    overlap_pct = len(common_dates) / longest_len * 100 if longest_len else 0.0
    
    summary = {
        'df1_shape': df1.shape,
        'df2_shape': df2.shape,
        'df1_date_range': (df1.index.min(), df1.index.max()),
        'df2_date_range': (df2.index.min(), df2.index.max()),
        'common_dates': len(common_dates),
        'overlap_percentage': overlap_pct,
        'can_align': len(common_dates) > 0
    }
    
    print(f"=== Alignment Summary: {df1_name} vs {df2_name} ===")
    print(f"{df1_name}: {summary['df1_shape']} | {summary['df1_date_range'][0]} to {summary['df1_date_range'][1]}")
    print(f"{df2_name}: {summary['df2_shape']} | {summary['df2_date_range'][0]} to {summary['df2_date_range'][1]}")
    print(f"Common dates: {summary['common_dates']} ({summary['overlap_percentage']:.1f}% overlap)")
    print(f"Can align: {'✓ Yes' if summary['can_align'] else '✗ No'}")
    
    return summary

# Exercise the helper functions defined above
print("=== Testing Utility Functions ===")
summary = alignment_summary(df_stocks, df_factors, "Stocks", "Factors")
print()

# quick_align returns (None, None) on failure, so a None check on the first item is enough
print("=== Quick Align Test ===")
quick_stocks, quick_factors = quick_align(df_stocks, df_factors)
if quick_stocks is None:
    print("✗ Quick alignment failed")
else:
    print(f"✓ Quick alignment successful: {quick_stocks.shape}")
print()

# Failure case: a frame whose dates lie entirely outside the stock data's range
print("=== Error Scenario: No Overlap ===")
dates_future = pd.date_range(start='2030-01-01', end='2030-12-31', freq='D')
df_future = pd.DataFrame(
    data={'Future_Data': range(len(dates_future))},
    index=dates_future,
)

try:
    # The summary only reports the lack of overlap; the alignment call is what raises.
    alignment_summary(df_stocks, df_future, "Stocks", "Future Data")
    future_aligned = align_dataframes(df_stocks, df_future, verbose=False)
except ValueError as e:
    print(f"Expected error caught: {e}")

=== Testing Utility Functions ===
=== Alignment Summary: Stocks vs Factors ===
Stocks: (1461, 3) | 2020-01-01 00:00:00 to 2023-12-31 00:00:00
Factors: (1126, 3) | 2021-06-01 00:00:00 to 2024-06-30 00:00:00
Common dates: 944 (64.6% overlap)
Can align: ✓ Yes

=== Quick Align Test ===
✓ Quick alignment successful: (939, 3)

=== Error Scenario: No Overlap ===
=== Alignment Summary: Stocks vs Future Data ===
Stocks: (1461, 3) | 2020-01-01 00:00:00 to 2023-12-31 00:00:00
Future Data: (365, 1) | 2030-01-01 00:00:00 to 2030-12-31 00:00:00
Common dates: 0 (0.0% overlap)
Can align: ✗ No
Expected error caught: No common dates found between df1 and df2
