# svd_imputer how-to: quick start

This short tutorial shows how to impute multivariate time series with missing values using the `svd_imputer` package. We'll:
- Generate synthetic data with missing values (~30% randomly scattered gaps in `Site_A` and a contiguous gap covering the last ~30% of `Site_B`)
- Try 3 configurations of the `Imputer`:
  1) Automatic rank via `variance_threshold`
  2) Fixed rank
  3) Automatic rank through cross-validation
- Plot observed vs. imputed results

In [None]:
# Imports
import os, sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Put the parent directory (the repo root when the notebook runs from a
# sub-folder) at the front of sys.path so `svd_imputer` can be imported
# without installing it.
repo_root = os.path.abspath(os.pardir)
if repo_root not in sys.path:
    sys.path.insert(0, repo_root)

from svd_imputer import Imputer
from sklearn.preprocessing import StandardScaler

# Seed NumPy's legacy global RNG for reproducibility. Note that
# create_synthetic_data() below draws from its own np.random.default_rng(seed),
# so this call only matters for code that uses np.random.* directly
# (possibly library internals -- not visible from this notebook).
np.random.seed(42)

In [None]:
# Helper: create synthetic data with gaps in Site_A (scattered) and Site_B (trailing block)
def create_synthetic_data(n=180, seed=42, missing_frac=0.30):
    """Build a synthetic three-site daily series and a copy with missing values.

    Each site is a 30-day sinusoid plus a linear trend, with independent
    Gaussian noise per site and one shared Gaussian noise component that
    correlates the sites. Site_B and Site_C use the same deterministic signal.

    Parameters
    ----------
    n : int
        Number of daily samples, starting at 2020-01-01.
    seed : int
        Seed for the local ``np.random.default_rng`` generator.
    missing_frac : float
        Fraction of rows (0..1) set to NaN in Site_A (randomly scattered) and
        in Site_B (one contiguous block at the end). Site_C is left complete.

    Returns
    -------
    (df, df_truth) : tuple of pandas.DataFrame
        ``df`` contains the injected NaNs; ``df_truth`` is the complete data.

    Raises
    ------
    ValueError
        If ``missing_frac`` is outside [0, 1].
    """
    if not 0.0 <= missing_frac <= 1.0:
        raise ValueError(f"missing_frac must be in [0, 1], got {missing_frac!r}")

    rng = np.random.default_rng(seed)
    idx = pd.date_range('2020-01-01', periods=n, freq='D')
    t = np.arange(n)

    # Smooth seasonal signals + trends
    a = 3 + 2*np.sin(2*np.pi*t/30) - 0.015*t
    b = 12 + 2*np.sin(2*np.pi*t/30) - 0.01*t
    c = 12 + 2*np.sin(2*np.pi*t/30) - 0.01*t

    # Add noise with cross-correlation (draw order kept so a given seed
    # reproduces the same data as before)
    eps = rng.normal(0, 0.1, size=(n, 3))
    base = rng.normal(0, 0.1, size=n)  # shared component
    data = np.stack([a, b, c], axis=1) + eps + base[:, None]

    df = pd.DataFrame(data, index=idx, columns=['Site_A', 'Site_B', 'Site_C'])
    df_truth = df.copy()

    m = int(missing_frac * n)
    if m > 0:
        # Randomly scattered missing values for Site_A
        miss_idx = rng.choice(n, size=m, replace=False)
        df.iloc[miss_idx, df.columns.get_loc("Site_A")] = np.nan

        # Contiguous missing block at the end of Site_B. Slice from n - m
        # rather than -m: a slice starting at -0 would select every row.
        df.iloc[n - m:, df.columns.get_loc("Site_B")] = np.nan

    return df, df_truth

# Helper: plot imputation results

def plot_imputation(df_original,df_truth, df_imputed=None, title='', columns=None,bounds=None):
    """Plot truth, observed and (optionally) imputed values, one panel per column.

    Parameters
    ----------
    df_original : pandas.DataFrame
        Data containing NaNs; its non-missing values are drawn as "Observed".
    df_truth : pandas.DataFrame
        Complete data, drawn as "Truth" on the same index.
    df_imputed : pandas.DataFrame, optional
        Imputed data drawn as a line. Indexed with the missing-value mask of
        ``df_original``, so it is expected to share that index.
    title : str
        Title placed on the top panel.
    columns : sequence of column labels or a single label, optional
        Columns to plot. ``None`` or an empty sequence plots every column of
        ``df_original``.
    bounds : pair of array-likes, optional
        ``(lower, upper)`` arrays indexed as ``[row, column]`` in the row and
        column order of ``df_imputed``. When given, a band is shaded through
        the rows where ``df_original`` is NaN.
    """
    # `columns or ...` would raise on a pandas Index / NumPy array (ambiguous
    # truth value), so test for None / emptiness explicitly.
    if columns is None or len(columns) == 0:
        cols = df_original.columns.tolist()
    elif isinstance(columns, str):
        cols = [columns]
    else:
        cols = list(columns)

    # squeeze=False always returns a 2-D array of Axes, so the single-panel
    # case needs no special handling.
    fig, axes = plt.subplots(len(cols), 1, figsize=(12, 3.2*len(cols)),
                             sharex=True, squeeze=False)
    axes = axes[:, 0]

    if bounds is not None:
        # Accept ndarrays as well as DataFrames / nested lists.
        lb, ub = (np.asarray(b) for b in bounds)

    for ax, col in zip(axes, cols):
        missing_mask = df_original[col].isna()
        # Ground-truth scatter (all rows)
        ax.scatter(df_truth.index, df_truth.loc[:,col],
                   s=18, color="#FC0404", alpha=0.9, label='Truth', zorder=3)
        # Observed scatter (rows that are not missing), drawn over the truth
        ax.scatter(df_original.index[~missing_mask], df_original.loc[~missing_mask, col],
                   s=18, color="#000000", alpha=0.9, label='Observed', zorder=3)

        if df_imputed is not None:
            # Imputed line
            ax.plot(df_imputed.index, df_imputed[col], color='#2b6cb0', lw=2, label='Imputed')

            # Optional band through the rows that were missing
            if bounds is not None:
                colidx = df_imputed.columns.get_loc(col)
                rowidx = np.where(missing_mask)[0]
                if len(rowidx) > 0:
                    ax.fill_between(df_imputed.index[missing_mask],
                                    lb[rowidx,colidx],
                                    ub[rowidx,colidx],
                                    color='#2b6cb0', alpha=0.08, label='CI')
        ax.set_ylabel(col)
        ax.grid(True, alpha=0.25)
    axes[0].set_title(title)
    axes[-1].set_xlabel('Date')
    # De-duplicate legend handles (legend is built from the top panel only)
    handles, labels = axes[0].get_legend_handles_labels()
    by_label = dict(zip(labels, handles))
    axes[0].legend(by_label.values(), by_label.keys(), ncol=2, loc='upper left')
    plt.tight_layout()
    plt.show()

In [None]:
# Generate data and preview.
# n=180*3 requests 540 daily rows; `df` carries the injected NaNs while
# `df_truth` is the complete copy returned alongside it.
df,df_truth = create_synthetic_data(n=180*3, seed=42)
print("Shape:", df.shape)
# Number of NaNs in each column
print("Missing per column:\n", df.isna().sum())
# Last expression of the cell: displays the first rows of the gappy frame
df.head()

In [None]:
# Display the complete (gap-free) frame returned by create_synthetic_data
df_truth

In [None]:
# Baseline view before any imputation: truth vs. observed points only
# (df_imputed, title and columns are left at their defaults).
plot_imputation(df, df_truth)

## Example 1 — Automatic rank (variance_threshold=0.95)

In [None]:
# Construction runs validate_dataframe() + preprocessing ONCE
imputer = Imputer(df, variance_threshold=0.95, verbose=True)
# Pure computation; the SVD components are cached on the imputer
imputer.fit()
# Uses the cached data + SVD components
results = imputer.transform()

plot_imputation(
    df,
    df_truth,
    results,
    title='Automatic rank (variance_threshold=0.95)',
)

## Example 2 — Fixed rank (rank=1)

In [None]:
# Construction runs validate_dataframe() + preprocessing ONCE
imputer = Imputer(df, rank=1, verbose=True)
# Pure computation; the SVD components are cached on the imputer
imputer.fit()
# Uses the cached data + SVD components
results = imputer.transform()

plot_imputation(
    df,
    df_truth,
    results,
    title='Fixed rank = 1',
)

## Example 3 — Automatic rank via cross-validation (rank='auto')

In [None]:
# rank='auto' asks the imputer to pick the rank itself (the tutorial intro
# describes this as selection through cross-validation), so the plot title
# must not claim a fixed rank.
imputer = Imputer(df,
                  rank='auto',
                  verbose=True)      # validate_dataframe() + preprocessing ONCE
imputer.fit()                        # Pure computation, cached SVD components
results = imputer.transform()        # Uses cached data + SVD components

plot_imputation(df,df_truth, results, title="Automatic rank (rank='auto')")

In [None]:
# Show the `rank_` attribute of the fitted imputer -- presumably the rank
# selected during fit() when rank='auto' was requested (confirm in the
# svd_imputer docs).
imputer.rank_

## With uncertainty

In [None]:
# Run the imputer's uncertainty estimation. frac=0.1 is presumably the
# fraction of observed values perturbed/held out per run -- TODO confirm
# against the svd_imputer documentation.
optres = imputer.estimate_uncertainty(frac=0.1)
# Stack the per-run imputations into one array; the following cells index it
# as [run, row, column].
mcres = np.array(optres['raw_imputed'])
# Last expression of the cell: displays the array shape
mcres.shape

In [None]:
# Envelope of all uncertainty runs: element-wise min/max over the run axis.
# The title is taken from the fitted imputer so it stays correct when the
# rank was chosen automatically instead of being fixed to 1.
plot_imputation(df, df_truth, results,
                title=f'Rank={imputer.rank_}',
                bounds=(mcres.min(axis=0), mcres.max(axis=0)))

In [None]:
# One panel per series: observed points (red), imputed line (blue) and every
# individual uncertainty run (grey) underneath.
fig,axs = plt.subplots(3,1, figsize=(7,6),sharex=True)

for e,ax in enumerate(axs):
    ax.plot(df.index,df.iloc[:,e],'r.',zorder=100)
    ax.plot(results.index,results.iloc[:,e],'b-',zorder=1)
    # Plain loop instead of a list comprehension used only for its side effects
    for run in mcres:
        ax.plot(df.index, run[:,e],c="0.5",alpha=0.3,zorder=0)
fig.tight_layout()