# svd_imputer how-to: quick start

This short tutorial shows how to impute multivariate time series with missing values using the `svd_imputer` package. We'll:
- Generate synthetic data for 10 sites with seasonality, trend, and interannual variability.
- Introduce complex missing value patterns (random, continuous blocks).
- Try 3 configurations of the `Imputer`:
  1) Rank selected from an explained-variance threshold (`variance_threshold`)
  2) Fixed rank (`rank=4`)
  3) Automatic rank estimation (`rank='auto'`)
- Estimate uncertainty using Monte Carlo iterations.

In [None]:
# Imports
import os, sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Put the parent of the working directory (the repo root when the notebook is
# run from its own folder) at the front of the import path, once.
repo_root = os.path.abspath(os.pardir)
if sys.path.count(repo_root) == 0:
    sys.path.insert(0, repo_root)

from svd_imputer import Imputer
from sklearn.preprocessing import StandardScaler

np.random.seed(42)

## Generate Synthetic Data
We generate synthetic data for 10 sites. Each site combines a shared base component with a site-specific offset, two sinusoidal cycles (a shorter "seasonal" cycle and a longer "interannual" one), a linear trend, and noise. We then introduce missing values: a large continuous gap at the end of the record for sites 4–6, plus randomly scattered missing points in every site.

In [None]:
# Helper: create synthetic data with ~15% scattered NaNs per column
# (sites 4-6 additionally lose the last ~30% of the record)
def create_synthetic_data(n=180, n_sites=10, seed=42):
    """Build a synthetic multi-site daily series and a copy with gaps.

    Every site is the sum of a constant offset, a 30-step sinusoid, a
    90-step sinusoid, a linear trend, site-specific Gaussian noise and a
    Gaussian "base" series shared by all sites.

    Missing values are then written into a copy of the data:

    * columns 4-6 (when present) lose their last ``int(0.30 * n)`` rows;
    * if there are more than 6 columns, every column additionally loses
      ``int(0.30 * n) // 2`` randomly chosen rows taken from the part of
      the record that precedes the trailing gap.

    Parameters
    ----------
    n : int
        Number of daily time steps (index starts at 2020-01-01).
    n_sites : int
        Number of columns, named ``Site_1`` ... ``Site_<n_sites>``.
    seed : int
        Seed for ``numpy.random.default_rng``; the same seed always yields
        the same data and the same gaps.

    Returns
    -------
    (df, df_truth) : tuple of pandas.DataFrame
        ``df`` contains the NaN gaps, ``df_truth`` is the complete series.
    """
    rng = np.random.default_rng(seed)
    idx = pd.date_range('2020-01-01', periods=n, freq='D')
    t = np.arange(n)

    # Held constant across sites (per-site randomization is switched off).
    phase = 0
    trend = 0.01

    # Shared component
    base = rng.normal(0, 0.1, size=n)

    data_list = []
    columns = []
    for i in range(n_sites):
        # NOTE: keep the order of these draws; it fixes the random stream
        # and therefore the data produced for a given seed.
        amp = rng.uniform(1.5, 2.5)
        offset = rng.uniform(5, 15)

        # Parameters of the longer ("interannual") cycle
        phase_2 = rng.uniform(0, 2*np.pi)
        amp_2 = rng.uniform(1.0, 2.0)

        # Signal: offset + short cycle + long cycle + trend
        signal = (offset +
                  amp*np.sin(2*np.pi*t/30 + phase) +
                  amp_2*np.sin(2*np.pi*t/90 + phase_2) +
                  trend*t)

        noise = rng.normal(0, 0.05, size=n)

        data_list.append(signal + noise + base)
        columns.append(f'Site_{i+1}')

    data = np.stack(data_list, axis=1)
    df = pd.DataFrame(data, index=idx, columns=columns)
    df_truth = df.copy()

    # Length of the trailing gap; half of it is the number of random gaps.
    m = int(0.30 * n)

    # Continuous gap at the end of the record for sites 4-6.
    # Slice from ``n - m`` instead of ``-m``: when m == 0 the slice ``-0:``
    # would select (and blank) every row of the column.
    gap_start = n - m
    for col in columns[3:6]:
        df.iloc[gap_start:, df.columns.get_loc(col)] = np.nan

    # Scattered gaps in every column (only for data sets with > 6 sites),
    # drawn from the rows before the trailing gap so the two never overlap.
    if len(columns) > 6:
        for col in columns:
            miss_idx = rng.choice(n - m - 1, size=m // 2, replace=False)
            if miss_idx.size:
                df.loc[idx[miss_idx], col] = np.nan

    return df, df_truth

# Helper: plot imputation results

def plot_imputation(df_original, df_truth, df_imputed=None, title='', columns=None, bounds=None):
    """Plot truth, observed points and (optionally) the imputed series.

    One subplot is drawn per column, stacked vertically with a shared x axis,
    and the figure is displayed with ``plt.show()``.

    Parameters
    ----------
    df_original : pandas.DataFrame
        Data containing NaNs; its NaN positions define the "missing" mask.
    df_truth : pandas.DataFrame
        Complete reference data, drawn as red points.
    df_imputed : pandas.DataFrame, optional
        Imputed data, drawn as a line. Nothing is drawn when ``None``.
    title : str
        Title placed on the first subplot.
    columns : str or sequence of str, optional
        Columns to plot. ``None`` or an empty sequence plots every column of
        ``df_original``. A single column name is accepted as well.
    bounds : tuple, optional
        ``(lower, upper)`` pair of 2-D array-likes indexed as
        ``[row position, column position of df_imputed]``. Only used when
        ``df_imputed`` is given; the band is drawn at the missing rows.
    """
    # ``columns or ...`` raises for a pandas Index / NumPy array (ambiguous
    # truth value) and iterates a bare string character by character, so
    # normalise the argument explicitly.
    if columns is None:
        cols = df_original.columns.tolist()
    elif isinstance(columns, str):
        cols = [columns]
    else:
        cols = list(columns) or df_original.columns.tolist()

    # squeeze=False always returns a 2-D array of axes, so the single-panel
    # case needs no special handling.
    fig, axes = plt.subplots(len(cols), 1, figsize=(7, 2*len(cols)),
                             sharex=True, squeeze=False)
    axes = axes[:, 0]

    if bounds is not None:
        # Accept DataFrames as well as arrays for positional indexing below.
        lb, ub = (np.asarray(b) for b in bounds)

    for ax, col in zip(axes, cols):
        missing_mask = df_original[col].isna().to_numpy()

        # Ground truth at every time step
        ax.scatter(df_truth.index, df_truth.loc[:, col],
                   s=18, color="#FC0404", alpha=0.9, label='Truth', zorder=3)
        # Observed (non-missing) points, drawn over the truth
        ax.scatter(df_original.index[~missing_mask], df_original.loc[~missing_mask, col],
                   s=18, color="#000000", alpha=0.9, label='Observed', zorder=3)

        if df_imputed is not None:
            # Imputed line
            ax.plot(df_imputed.index, df_imputed[col], color='#2b6cb0', lw=2, label='Imputed')

            # Optional band at the missing rows. Note that the band joins
            # consecutive missing rows, including across observed stretches.
            if bounds is not None:
                colidx = df_imputed.columns.get_loc(col)
                rowidx = np.where(missing_mask)[0]
                if len(rowidx) > 0:
                    ax.fill_between(df_imputed.index[missing_mask],
                                    lb[rowidx, colidx],
                                    ub[rowidx, colidx],
                                    color='#2b6cb0', alpha=0.8, label='CI')
        ax.set_ylabel(col)
        ax.grid(True, alpha=0.25)

    axes[0].set_title(title)
    axes[-1].set_xlabel('Date')
    # De-duplicate legend handles
    handles, labels = axes[0].get_legend_handles_labels()
    by_label = dict(zip(labels, handles))
    axes[0].legend(by_label.values(), by_label.keys(), ncol=2, loc='upper left')
    plt.tight_layout()
    plt.show()

In [None]:
# Build the gappy data set and its complete counterpart (3 x 180 daily steps)
df, df_truth = create_synthetic_data(seed=42, n=3 * 180)
# Quick sanity checks: overall size and number of NaNs in each column
print("Shape:", df.shape)
print("Missing per column:\n", df.isna().sum())
# Last expression of the cell, so the notebook renders the first rows
df.head()

In [None]:
# Truth vs. observed points only: no imputed series, no title, all columns
plot_imputation(df, df_truth)

## Example 1: Automatic rank (variance_threshold=0.90)
We initialize the imputer with `variance_threshold=0.90`, meaning we want to retain enough singular values to explain 90% of the variance.

In [None]:
# Rank is chosen so the retained singular values explain 90% of the variance.
# Per the package author's notes: the constructor validates and preprocesses
# the DataFrame once, fit() computes and caches the SVD components, and
# transform() reuses the cached data and components.
imputer = Imputer(df, variance_threshold=0.90, verbose=True)
imputer.fit()
results = imputer.transform()

plot_imputation(df, df_truth, results, title='Rank from variance threshold')

## Example 2: Fixed rank (rank=4)
Here we force the rank to be 4. This is useful if you have prior knowledge about the dimensionality of the signal.

In [None]:
# Force a rank-4 reconstruction.
# Per the package author's notes: the constructor validates and preprocesses
# the DataFrame once, fit() computes and caches the SVD components, and
# transform() reuses the cached data and components.
imputer = Imputer(df, rank=4, verbose=True)
imputer.fit()
results = imputer.transform()

plot_imputation(df, df_truth, results, title='Fixed rank = 4')

## Example 3: Automatic rank estimation
We can also let the imputer estimate the optimal rank automatically using `rank='auto'`.

In [None]:
# Let the imputer pick the rank itself (rank='auto', tol=1e-3).
# Per the package author's notes: the constructor validates and preprocesses
# the DataFrame once, fit() computes and caches the SVD components, and
# transform() reuses the cached data and components.
imputer = Imputer(df, rank='auto', tol=1e-3, verbose=True)
imputer.fit()
results = imputer.transform()

plot_imputation(df, df_truth, results, title='rank auto')

## Estimate Uncertainty
We can estimate the uncertainty of the imputation by performing multiple imputations (Monte Carlo). This gives us a distribution of imputed values for each missing point, from which we can calculate confidence intervals.

In [None]:
# Multiple imputation (Rubin's rules): 10 imputations with the imputer
# configured in the previous example, returning the imputed values together
# with their uncertainty.
df_imputed, df_uncertainty = imputer.fit_transform(n_imputations=10, return_uncertainty=True)

In [None]:
# plotting function for uncertainty
def plot_unc(df, df_truth, df_imputed, df_uncertainty):
    """Draw one panel per column with data, truth, imputation and a 95% band.

    The band is ``imputed ± 1.96 * sqrt(uncertainty)``, i.e. the values in
    ``df_uncertainty`` are treated as variances. Panels are stacked vertically
    and share the x axis. Nothing is returned.
    """
    panel_count = df.shape[1]
    fig, panels = plt.subplots(panel_count, 1, figsize=(7, 2*panel_count), sharex=True)

    # With a single panel, subplots() hands back a bare Axes; wrap it so the
    # loop below works unchanged.
    panels = [panels] if panel_count == 1 else panels

    for position, panel in enumerate(panels):
        name = df.columns[position]

        # Observed points (NaNs leave gaps) and the complete reference series
        panel.plot(df.index, df[name], '.', label='Original', alpha=0.5, zorder=100)
        panel.plot(df_truth.index, df_truth[name], c='r', label='Truth', alpha=0.5, zorder=100)

        # Imputed series
        imputed = df_imputed[name]
        panel.plot(df_imputed.index, imputed, '--', label='Imputed')

        # 95% band around the imputed series
        sigma = np.sqrt(df_uncertainty[name])
        lower = imputed - 1.96 * sigma
        upper = imputed + 1.96 * sigma
        panel.fill_between(df_imputed.index, lower, upper,
                           color='0.5', alpha=0.3, label='95% CI', zorder=0)

        panel.set_title(name)
        panel.legend(loc='upper left', fontsize=8)

    fig.tight_layout()

# Show observed data, truth, imputation and the 95% band for every site
plot_unc(df=df, df_truth=df_truth, df_imputed=df_imputed, df_uncertainty=df_uncertainty)