# SVD Time Series Imputation - Augmented Demo

This notebook demonstrates the key functionality of the SVD time series imputer package, including:

1. Generating synthetic time series with correlated and uncorrelated components
2. Basic imputation with automatic rank estimation
3. Data augmentation techniques for improved imputation

The package uses Singular Value Decomposition (SVD) to identify low-dimensional patterns in multivariate time series and impute missing values based on these patterns.

In [None]:
import os
import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Make the parent of the current working directory importable, so that the
# svd_imputer package can be imported without being installed
repo_root = os.path.abspath(os.pardir)
if sys.path.count(repo_root) == 0:
    sys.path.insert(0, repo_root)

from svd_imputer import Imputer
from svd_imputer.preprocessing import (
    create_derivative_augmented_matrix,
    create_symmetric_augmented_matrix
)

# Set random seed for reproducibility.
# This seeds NumPy's global (legacy) random state only; the synthetic data
# generator defined below builds its own np.random.default_rng(seed) instead.
np.random.seed(42)
# Use matplotlib's 'default' style sheet for the figures in this notebook
plt.style.use('default')

## 1. Generate Synthetic Time Series Data

We create a synthetic dataset with:
- 3 correlated time series (Series A, B, C) with seasonal patterns and trends
- 1 uncorrelated time series (Series D) with random walk behavior
- Missing values randomly distributed across all series except one reference series

In [None]:
def generate_synthetic_timeseries(n_periods=200, seed=42, missing_fraction=0.25, gap=(50, 80)):
    """
    Generate synthetic time series data with correlated and uncorrelated components.

    Series A, B and C share a 30-day sinusoidal seasonal component and a linear
    declining trend (C uses a phase-shifted, smaller-amplitude seasonal term and
    80% of the trend). Series D is an independent Gaussian random walk.

    Args:
        n_periods: Number of daily observations, starting at 2020-01-01.
        seed: Seed for the NumPy random generator (makes the output reproducible).
        missing_fraction: Fraction (between 0 and 1) of rows blanked at random,
            drawn independently for each of Series B, C and D. Series A is
            always left complete so it can act as a reference.
        gap: Optional ``(start, stop)`` positional row range (stop exclusive)
            that is additionally blanked in Series B to create one contiguous
            hole. Pass ``None`` to skip it.

    Returns:
        df_with_missing: DataFrame with missing values
        df_complete: Complete DataFrame (ground truth)

    Raises:
        ValueError: If ``missing_fraction`` is outside ``[0, 1]``.
    """
    if not 0.0 <= missing_fraction <= 1.0:
        raise ValueError("missing_fraction must be between 0 and 1")

    rng = np.random.default_rng(seed)
    dates = pd.date_range('2020-01-01', periods=n_periods, freq='D')
    t = np.arange(n_periods)

    # Components shared by the correlated series (A, B, C)
    seasonal_component = 2 * np.sin(2 * np.pi * t / 30)  # 30-day cycle
    trend_component = -0.01 * t  # Slight declining trend

    # NOTE: the order of the rng draws below determines the generated values
    # for a given seed, so keep A, B, C, D (and then the masks) in this order.

    # Series A: Base series with seasonal pattern
    series_a = 10 + seasonal_component + trend_component + rng.normal(0, 0.3, n_periods)

    # Series B: Correlated with A, different offset and noise
    series_b = 15 + seasonal_component + trend_component + rng.normal(0, 0.4, n_periods)

    # Series C: Also correlated, with a phase shift and smaller amplitude
    seasonal_shifted = 1.5 * np.sin(2 * np.pi * t / 30 + np.pi / 4)
    series_c = 8 + seasonal_shifted + 0.8 * trend_component + rng.normal(0, 0.35, n_periods)

    # Series D: Uncorrelated random walk
    series_d = np.cumsum(rng.normal(0, 0.5, n_periods)) + 5

    # Create complete DataFrame
    df_complete = pd.DataFrame({
        'Series_A': series_a,
        'Series_B': series_b,
        'Series_C': series_c,
        'Series_D': series_d,
    }, index=dates)

    # Create version with missing values
    df_with_missing = df_complete.copy()

    # Randomly blank the same number of rows in B, C and D (A stays complete).
    # The count does not depend on the column, so it is computed once.
    n_missing = int(missing_fraction * n_periods)
    if n_missing > 0:
        for col in ['Series_B', 'Series_C', 'Series_D']:
            missing_indices = rng.choice(n_periods, size=n_missing, replace=False)
            df_with_missing.iloc[missing_indices, df_with_missing.columns.get_loc(col)] = np.nan

    # Additionally blank one contiguous stretch of Series B
    if gap is not None:
        gap_start, gap_stop = gap
        df_with_missing.iloc[gap_start:gap_stop, df_with_missing.columns.get_loc('Series_B')] = np.nan

    return df_with_missing, df_complete

# Build the demo dataset: the frame with gaps plus its complete ground truth
df_missing, df_truth = generate_synthetic_timeseries(n_periods=200)

# Count the NaNs once and reuse the per-series counts for the grand total
missing_per_series = df_missing.isna().sum()
print("Dataset shape:", df_missing.shape)
print("\nMissing values per series:")
print(missing_per_series)
print(f"\nTotal missing values: {missing_per_series.sum()}")

# Cell output: preview of the first ten rows
df_missing.head(10)

In [None]:
def plot_timeseries_comparison(df_observed, df_truth, df_imputed=None, title="Time Series Data", oe=None):
    """
    Plot observed vs truth vs imputed time series.

    One subplot row is drawn per column of ``df_observed``, so the function
    works for any number of series.

    Args:
        df_observed: DataFrame with NaN where values are missing. Its columns
            decide which series are plotted.
        df_truth: Complete DataFrame with the same columns; drawn as a line.
        df_imputed: Optional imputed DataFrame. Only its values at positions
            that are missing in ``df_observed`` are drawn (as 'x' markers).
        title: Figure title.
        oe: Optional ensemble drawn as faint gray lines. It is indexed as
            ``oe[:, :, i]`` for the i-th column, so it is assumed to be shaped
            (n_members, n_timesteps, n_series), e.g. (50, 200, 4) -- confirm
            against the caller. It is only drawn when ``df_imputed`` is given.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    columns = list(df_observed.columns)
    # Always create at least one row so an empty frame still yields a figure.
    n_rows = max(len(columns), 1)
    # 2.5 inches per row reproduces the 12x10 figure for four series.
    # squeeze=False keeps `axes` two-dimensional even for a single row.
    fig, axes = plt.subplots(n_rows, 1, figsize=(12, 2.5 * n_rows),
                             sharex=True, squeeze=False)
    axes = axes[:, 0]

    colors = {'observed': 'black', 'truth': 'red', 'imputed': 'blue'}

    for i, col in enumerate(columns):
        ax = axes[i]

        # Plot truth (complete data)
        ax.plot(df_truth.index, df_truth[col], color=colors['truth'],
                alpha=0.7, linewidth=1, label='True values')

        # Plot observed (non-missing) values
        missing_mask = df_observed[col].isna()
        observed_mask = ~missing_mask
        ax.scatter(df_observed.index[observed_mask], df_observed[col][observed_mask],
                   color=colors['observed'], s=8, alpha=0.8, label='Observed', zorder=5)

        # Plot imputed values if available
        if df_imputed is not None:
            if missing_mask.any():
                ax.scatter(df_observed.index[missing_mask], df_imputed[col][missing_mask],
                           color=colors['imputed'], s=12, alpha=0.9, label='Imputed',
                           marker='x', zorder=6)
            if oe is not None:
                # One faint line per ensemble member for this series
                ensemble = oe[:, :, i]
                for j in range(ensemble.shape[0]):
                    ax.plot(df_observed.index, ensemble[j, :], color='gray', alpha=0.1)

        ax.set_ylabel(col)
        ax.grid(True, alpha=0.3)

        # A single legend on the top panel is enough; all panels share the encoding
        if i == 0:
            ax.legend(loc='upper right')

    axes[-1].set_xlabel('Date')
    plt.suptitle(title, fontsize=14)
    plt.tight_layout()
    plt.show()

# Show the ground truth together with the surviving observations (nothing imputed yet)
plot_timeseries_comparison(
    df_observed=df_missing,
    df_truth=df_truth,
    title="Synthetic Time Series with Missing Values",
)

## 2. Analyze Correlations

Let's examine the correlation structure to understand the relationships between series.

In [None]:
# Calculate the correlation matrix from the complete ground-truth series
# (df_truth), not from the observed data with gaps
correlation_matrix = df_truth.corr()

print("Correlation Matrix (True Values):")
print(correlation_matrix.round(3))

# Visualize correlation matrix
fig, ax = plt.subplots(figsize=(8, 6))
im = ax.imshow(correlation_matrix, cmap='RdBu_r', vmin=-1, vmax=1)

# Annotate every cell with its coefficient. imshow draws matrix row i along
# the y-axis and column j along the x-axis, hence text(j, i, ...).
for (i, j), value in np.ndenumerate(correlation_matrix.to_numpy()):
    ax.text(j, i, f'{value:.2f}',
            ha="center", va="center", color="black", fontweight='bold')

# Label both axes with the series names
tick_positions = range(len(correlation_matrix))
ax.set_xticks(tick_positions)
ax.set_yticks(tick_positions)
ax.set_xticklabels(correlation_matrix.columns)
ax.set_yticklabels(correlation_matrix.columns)
plt.colorbar(im, ax=ax, label='Correlation')
plt.title('Correlation Matrix of Time Series')
plt.tight_layout()
plt.show()

print("\nKey observations:")
print("- Series A, B, C are moderately correlated (share seasonal patterns)")
print("- Series D is uncorrelated with others (random walk behavior)")

## 3. Basic SVD Imputation

First, we apply the SVD imputer with default settings using automatic rank estimation based on percentage of explained variance.

In [None]:
# Initialize imputer with automatic rank estimation.
# No explicit rank is passed here; a variance_threshold is given instead
# (presumably the rank is chosen as the smallest one reaching that share of
# explained variance -- confirm against the svd_imputer documentation).
imputer_basic = Imputer(
    data=df_missing,
    variance_threshold=0.95,  # Capture 95% of variance
    verbose=True
)

# Fit and transform. The data was handed to the constructor, so no arguments
# are passed here; the returned frame is plotted and scored in later cells.
df_imputed_basic = imputer_basic.fit_transform()

# rank_ and shape_ are read back from the imputer after fit_transform()
print(f"\nEstimated rank: {imputer_basic.rank_}")
print(f"Data shape: {imputer_basic.shape_}")

In [None]:
#residuals_df, stats_dict = imputer_basic.calculate_reconstruction_residuals(
#    return_stats=True
#)

In [None]:
# Overlay the baseline imputation on the truth and the observed points
plot_timeseries_comparison(
    df_observed=df_missing,
    df_truth=df_truth,
    df_imputed=df_imputed_basic,
    title=f"Basic SVD Imputation (Rank = {imputer_basic.rank_})",
)

In [None]:
def calculate_imputation_metrics(df_truth, df_observed, df_imputed):
    """
    Calculate imputation performance metrics.

    Only the positions that are missing (NaN) in ``df_observed`` are scored,
    comparing ``df_imputed`` against ``df_truth`` at those positions. Columns
    without any missing value are left out of the result.

    Args:
        df_truth: Complete ground-truth DataFrame.
        df_observed: DataFrame with NaN marking the positions that were imputed.
        df_imputed: DataFrame holding the imputed values.
            All three frames are accessed with ``df_observed``'s column names
            and its per-column missing-value mask.

    Returns:
        dict mapping each scored column name to a dict with keys 'RMSE',
        'MAE', 'Correlation' and 'N_missing'. 'Correlation' is NaN when it is
        undefined, i.e. with fewer than two missing positions or when either
        the true or the imputed values are constant.
    """
    metrics = {}

    for col in df_observed.columns:
        # Get missing positions
        missing_mask = df_observed[col].isna()
        n_missing = int(missing_mask.sum())

        if n_missing == 0:
            continue

        # Calculate metrics for missing positions only
        true_vals = df_truth[col][missing_mask]
        imputed_vals = df_imputed[col][missing_mask]
        errors = true_vals - imputed_vals

        rmse = np.sqrt(np.mean(errors ** 2))
        mae = np.mean(np.abs(errors))

        # Correlation between true and imputed values. np.corrcoef divides by
        # the standard deviations, so a single point or a constant series
        # would only produce RuntimeWarnings and NaN; report NaN directly.
        true_arr = np.asarray(true_vals, dtype=float)
        imputed_arr = np.asarray(imputed_vals, dtype=float)
        is_degenerate = (
            n_missing < 2
            or bool(np.all(true_arr == true_arr[0]))
            or bool(np.all(imputed_arr == imputed_arr[0]))
        )
        if is_degenerate:
            correlation = np.nan
        else:
            correlation = np.corrcoef(true_arr, imputed_arr)[0, 1]

        metrics[col] = {
            'RMSE': rmse,
            'MAE': mae,
            'Correlation': correlation,
            'N_missing': n_missing,
        }

    return metrics

# Score the baseline imputation at the positions that were missing
metrics_basic = calculate_imputation_metrics(df_truth, df_missing, df_imputed_basic)

print("Basic Imputation Performance:")
print("-" * 50)
# One block per series; the trailing empty string yields the blank separator line
for series_name, scores in metrics_basic.items():
    print(
        f"{series_name}:",
        f"  RMSE: {scores['RMSE']:.3f}",
        f"  MAE:  {scores['MAE']:.3f}",
        f"  Corr: {scores['Correlation']:.3f}",
        f"  N_missing: {scores['N_missing']}",
        "",
        sep="\n",
    )

## 4. Automatic Rank Estimation

That is OK, I guess, but not amazing. Note that a single component (rank = 1) is being calculated as explaining 95% of the variance. This might be too simplistic. Let's try estimating the optimal rank for reconstructing randomly missing data (using the data that we have). We will randomly drop observed values from the dataset and choose the rank that minimizes the MAE/RMSE on those held-out values.

In [None]:
# For optimal performance, the rank can be selected through cross-validation.

# Optimize rank using cross-validation.
# Same input data as the basic imputer, but with rank="auto" instead of a
# variance threshold.
imputer_optimized = Imputer(
    data=df_missing,
    rank="auto",  # This triggers rank optimization
    tol=1e-3,  # presumably a convergence tolerance -- confirm in the svd_imputer docs
    verbose=True
)

df_imputed_optimized = imputer_optimized.fit_transform()

print(f"\nOptimized rank: {imputer_optimized.rank_}")

# Get optimization details.
# The result is used like a dict with an 'optimal_score' entry and a
# 'results_df' table; the truthiness check skips the report when it is empty/None.
opt_results = imputer_optimized.get_optimization_results()
if opt_results:
    print(f"Optimal CV score: {opt_results['optimal_score']:.4f}")
    print("\nRank performance summary:")
    # Only the first rows of the per-rank RMSE summary are shown
    print(opt_results['results_df'][['rank', 'mean_rmse', 'std_rmse']].head())

Hmm... it seems the optimal rank is still 1. This tells us that we might simply not have enough information content in the dataset to do any better. Using a higher rank adds too much noise back in, and we can't use fewer than one component to explain the rest of the data... so what can we do? We can synthetically augment our dataset to implicitly extract information components from our time series!

## 4. Data Augmentation for Enhanced Imputation

The package provides data augmentation methods that can improve imputation by incorporating temporal structure:

1. **Derivative augmentation**: Adds first and second differences
2. **Symmetric lag augmentation**: Includes past and future values around each time point

These methods expand the feature space and can help SVD capture more complex temporal patterns.

### 4.1 Derivative Augmentation

This method augments the data with first and second differences, helping capture trend and acceleration patterns.

In [None]:
# Build the derivative-augmented matrix from the data with gaps
df_augmented_deriv = create_derivative_augmented_matrix(df_missing)

# Report how the augmentation changed the shape of the data
for label, frame in (
    ("Original data shape:", df_missing),
    ("Augmented data shape:", df_augmented_deriv),
):
    print(label, frame.shape)
print("\nAugmented columns:")
print(df_augmented_deriv.columns.tolist())

# Cell output: the first five augmented rows
print("\nFirst 5 rows of augmented data:")
df_augmented_deriv.head(5)

In [None]:
# Impute on the derivative-augmented matrix, letting the imputer optimize the rank
imputer_deriv = Imputer(data=df_augmented_deriv, rank="auto", tol=1e-3, verbose=True)
df_imputed_deriv_aug = imputer_deriv.fit_transform()

print(f"\nEstimated rank with derivative augmentation: {imputer_deriv.rank_}")

# Keep only the four original series from the augmented result
original_cols = ['Series_A', 'Series_B', 'Series_C', 'Series_D']
df_imputed_deriv = df_imputed_deriv_aug[original_cols].copy()

# Restrict the observed and truth frames to the rows present in the imputed
# result (derivative augmentation loses first 2 rows)
kept_rows = df_imputed_deriv.index
df_missing_aligned = df_missing.loc[kept_rows]
df_truth_aligned = df_truth.loc[kept_rows]

In [None]:
# Overlay the derivative-augmented imputation on the aligned truth and observations
plot_timeseries_comparison(
    df_observed=df_missing_aligned,
    df_truth=df_truth_aligned,
    df_imputed=df_imputed_deriv,
    title=f"Derivative Augmentation (Rank = {imputer_deriv.rank_})",
)

Doing a lot better for Series B (which only has an offset + noise). Not much difference for Series C (which also has a phase shift, i.e. a lag). And Series D is terrible (as expected; no correlation).

### 4.2 Symmetric Lag Augmentation

This method includes past and future values around each time point, which can be particularly effective for interpolating gaps in the middle of series.

In [None]:
# Symmetric lag augmentation with window=2 (includes t-2, t-1, t, t+1, t+2)
window_size = 2
df_augmented_lag = create_symmetric_augmented_matrix(df_missing, window=window_size)

# Each variable contributes the centre point plus `window_size` points on either side
points_per_variable = 2 * window_size + 1

print("Original data shape:", df_missing.shape)
print("Lag-augmented data shape:", df_augmented_lag.shape)
print(f"\nWindow size: {window_size} (includes {points_per_variable} time points per variable)")

# Column naming, shown for the first ten columns only
print("\nAugmented columns (first 10):")
print(df_augmented_lag.columns[:10].tolist())

# Cell output: top-left corner (5 rows x 8 columns) of the augmented matrix
print("\nFirst 5 rows of lag-augmented data (showing subset of columns):")
df_augmented_lag.head(5).iloc[:, :8]

In [None]:
# Impute on the lag-augmented matrix, letting the imputer optimize the rank
imputer_lag = Imputer(data=df_augmented_lag, rank="auto", tol=1e-3, verbose=True)
df_imputed_lag_aug = imputer_lag.fit_transform()

print(f"\nEstimated rank with lag augmentation: {imputer_lag.rank_}")

# Columns carrying this marker are the un-shifted (lag+0) copies of the series
lag_tag = '_lag+0'
lag0_cols = [name for name in df_imputed_lag_aug.columns if lag_tag in name]
df_imputed_lag = df_imputed_lag_aug[lag0_cols].copy()

# Drop the marker so the columns carry the original series names again
df_imputed_lag.columns = [name.replace(lag_tag, '') for name in df_imputed_lag.columns]

# Restrict the observed and truth frames to the rows present in the imputed
# result (lag augmentation loses first and last 'window' rows)
kept_rows = df_imputed_lag.index
df_missing_lag_aligned = df_missing.loc[kept_rows]
df_truth_lag_aligned = df_truth.loc[kept_rows]

In [None]:
# Overlay the lag-augmented imputation on the aligned truth and observations
plot_timeseries_comparison(
    df_observed=df_missing_lag_aligned,
    df_truth=df_truth_lag_aligned,
    df_imputed=df_imputed_lag,
    title=f"Symmetric Lag Augmentation (Rank = {imputer_lag.rank_})",
)

Wow... even Series D is doing well. How? Because we now have information about what happens before and after the missing data to inform the imputation. Sweet as!

In [None]:
# Multiple Imputation with Uncertainty (Rubin's Rules): call fit_transform on
# the lag imputer again, this time asking for an uncertainty frame as well
df_imputed, df_uncertainty = imputer_lag.fit_transform(
    return_uncertainty=True,
    n_imputations=10,
)

# Keep only the lag+0 columns of both frames and strip the '_lag+0' marker
# so they line up with the original series names
cols = [name for name in df_imputed.columns if 'lag+0' in name]
df_imputed = df_imputed[cols]
df_uncertainty = df_uncertainty[cols]
df_imputed.columns = [name.replace('_lag+0', '') for name in df_imputed.columns]
df_uncertainty.columns = [name.replace('_lag+0', '') for name in df_uncertainty.columns]

# One panel per series
fig, axs = plt.subplots(4, 1, figsize=(7, 6), sharex=True)

for row, ax in enumerate(axs):
    col = df_missing.columns[row]
    # Observed input points on top, ground truth underneath
    ax.plot(df_missing.index, df_missing[col], '.', label='Input', alpha=0.5, zorder=100)
    ax.plot(df_truth.index, df_truth[col], c='k', label='Truth', alpha=1, zorder=1)
    # Imputed series
    ax.plot(df_imputed.index, df_imputed[col], c='r', linestyle='--', label='Imputed')
    # 95% band: df_uncertainty is square-rooted before scaling by 1.96, i.e. it
    # is treated as a variance here (confirm against the imputer documentation)
    half_width = 1.96 * np.sqrt(df_uncertainty[col])
    lower = df_imputed[col] - half_width
    upper = df_imputed[col] + half_width
    ax.fill_between(df_imputed.index, lower, upper, color='0.5', alpha=0.3, label='95% CI', zorder=0)
    ax.set_title(col)

# The legend is attached to the bottom panel only
axs[-1].legend(loc='lower left', fontsize=8)
fig.tight_layout()

## 5. Performance Comparison

Let's compare the performance of different approaches.

In [None]:
# Score the two augmented approaches on their own (row-aligned) frames
metrics_deriv = calculate_imputation_metrics(df_truth_aligned, df_missing_aligned, df_imputed_deriv)
metrics_lag = calculate_imputation_metrics(df_truth_lag_aligned, df_missing_lag_aligned, df_imputed_lag)

# Create comparison table
comparison_data = []

# Per-method metrics and the rank each imputer ended up with, keyed by display name
methods = {
    'Basic SVD': metrics_basic,
    'Derivative Aug': metrics_deriv,
    'Lag Aug': metrics_lag,
}

ranks = {
    'Basic SVD': imputer_basic.rank_,
    'Derivative Aug': imputer_deriv.rank_,
    'Lag Aug': imputer_lag.rank_,
}

rule = "-" * 80

print("Performance Comparison")
print("=" * 80)

# Header row
print(f"{'Method':<15} {'Rank':<6} {'Series':<10} {'RMSE':<8} {'MAE':<8} {'Correlation':<12}")
print(rule)

for method_name, method_metrics in methods.items():
    # Method name and rank appear on the method's first row only; the
    # following rows start with blank padding of the same width
    lead = f"{method_name:<15} {ranks[method_name]:<6}"
    for series, scores in method_metrics.items():
        print(f"{lead} {series:<10} {scores['RMSE']:<8.3f} {scores['MAE']:<8.3f} {scores['Correlation']:<12.3f}")
        lead = f"{'':>22}"
    print(rule)

# Unweighted mean of each metric over the series scored for a method
print("\nAverage Performance Across Series:")
print("-" * 50)
for method_name, method_metrics in methods.items():
    per_series = list(method_metrics.values())
    avg_rmse, avg_mae, avg_corr = (
        np.mean([scores[key] for scores in per_series])
        for key in ('RMSE', 'MAE', 'Correlation')
    )
    print(f"{method_name:<15} RMSE: {avg_rmse:.3f}  MAE: {avg_mae:.3f}  Corr: {avg_corr:.3f}")

## Summary

This notebook demonstrated the key features of the SVD time series imputer:

1. **Basic Usage**: Simple imputation with automatic rank estimation
2. **Data Augmentation**: Enhanced imputation using derivative and lag augmentation
3. **Performance Evaluation**: Quantitative comparison of different approaches
4. **Rank Optimization**: Automated rank selection through cross-validation