In [1]:
import numpy as np
import pandas as pd
from pprint import pprint
from skforecast.datasets import fetch_dataset
from skforecast.drift_detection import PopulationDriftDetector, PopulationDriftDetector
import nannyml as nml
import matplotlib.pyplot as plt

In [2]:
data = fetch_dataset('bike_sharing', verbose=False)
data.head(3)

data_train = data.iloc[: len(data)//2].copy()
data_new  = data.iloc[len(data)//2 :].copy()
print(f'Train: {data_train.shape}, Test: {data_new.shape}')
data_train['weather'] = data_train['weather'].astype('category')
data_new['weather'] = pd.Categorical(data_new['weather'], categories=data_train['weather'].cat.categories)

Train: (8772, 11), Test: (8772, 11)


In [8]:
from matplotlib import pyplot as plt
from skforecast.plot import set_dark_theme
set_dark_theme()
import seaborn as sns
import pandas as pd
import os
from PIL import Image
import numpy as np

try:
    import imageio
except ImportError:
    print("imageio not installed. Install with: pip install imageio")
    raise

feature = 'temp'
ref_data = data_train[feature]
distances = []
fps = 1  # frames per second for video
output_width = 900  # width for resized frames
frames = []

# Generate chunk starts every 2 months starting from January 2011
chunk_starts = pd.date_range(start='2011-01-01', end=data_train.index.max(), freq='1MS')

for i, chunk_start in enumerate(chunk_starts[:-1]):  # Exclude the last one to avoid going beyond data
    chunk_end = min(chunk_start + pd.DateOffset(months=1) - pd.Timedelta(hours=1), data_train.index.max())
    data_chunk = data_train.loc[chunk_start:chunk_end, feature]
    
    # Plot time series (spanning first row)
    fig = plt.figure(figsize=(12, 8))
    ax_ts = plt.subplot2grid((2, 2), (0, 0), colspan=2)
    data_train.loc[:, 'temp'].plot(ax=ax_ts, label='Reference data')
    data_chunk.plot(ax=ax_ts, label=f'Chunk {i}', color='red')
    ax_ts.set_xlabel('')
    ax_ts.set_ylabel('')
    ax_ts.set_title('Distance-Based framework for temporal drift detection',
                     fontsize=18, pad=25, fontweight='semibold')
    ax_ts.legend(loc='upper left')
    
    # Compare distributions using kdeplot (second row, first column)
    ax_kde = plt.subplot2grid((2, 2), (1, 0))
    chunk_data = data_new.loc[chunk_start:chunk_end, feature]
    sns.kdeplot(ref_data, label='Reference data', color='blue', ax=ax_kde)
    sns.kdeplot(data_chunk, label=f'Chunk {i}', color='red', ax=ax_kde)
    ax_kde.set_title(f'Distribution Comparison for Chunk {i}')
    ax_kde.set_xlabel('')
    #ax_kde.legend(loc='upper left')

    # Histogram of calculated distances (second row, second column)
    from scipy.stats import ks_2samp
    ks_statistic, p_value = ks_2samp(ref_data, data_chunk)
    distances.append(ks_statistic)
    
    # Empirical distribution of distances
    ax_hist = plt.subplot2grid((2, 2), (1, 1))
    sns.kdeplot(distances, ax=ax_hist, fill=True)
    # add rug plot
    sns.rugplot(distances, ax=ax_hist, color='white')
    ax_hist.set_title('Empirical Distribution of KS Distances')
    ax_hist.set_xlabel('KS Statistic')
    ax_hist.set_ylabel('')

    # If it is the last chunk, plot final histogram with highlighted quantiles
    if i == len(chunk_starts) - 2:
        quantile_95 = np.quantile(distances, 0.95)
        ax_hist.axvline(quantile_95, color='white', linestyle='--', label='95th Percentile')

        # Add annotation with arrow
        ax_hist.annotate(
            '95th Percentile',
            xy=(quantile_95, ax_hist.get_ylim()[1] * 0.8),       # Point to the line
            xytext=(quantile_95 + (ax_hist.get_xlim()[1] - ax_hist.get_xlim()[0]) * 0.05, 
                    ax_hist.get_ylim()[1] * 0.9),                 # Place text slightly to the right/top
            arrowprops=dict(facecolor='white', shrink=0.05, width=1, headwidth=6),
            color='white',
            fontsize=10,
            ha='left',
            va='center'
        )
    
    fig.tight_layout()
    #plt.show();

    # Save frame
    filename = f"temp_{chunk_start}.png"
    fig.savefig(filename, dpi=120, bbox_inches='tight')
    plt.close(fig)

    img = Image.open(filename)
    img = img.resize((output_width, int(output_width * img.height / img.width)))
    frames.append(np.array(img))
    os.remove(filename)

# Save as GIF (slower and loop infinitely)
gif_filename = 'population_drift_detection.gif'
# Duplicate the last frame to extend its display time
frames.append(frames[-1])
frames.append(frames[-1])
frames.append(frames[-1])

imageio.mimsave(
    gif_filename,
    frames,
    format='GIF',
    duration=900,  # 900 milliseconds per frame
    loop=0
)
print(f'GIF saved as {gif_filename}')

# Save as MP4 video
mp4_filename = 'population_drift_detection.mp4'
imageio.mimsave(
    mp4_filename,
    frames,
    fps=fps,  # frame rate
    codec='libx264',  # high-quality H.264 codec
    quality=10,       # 10 = best quality, 0 = worst
    ffmpeg_params=[
        '-crf', '17',         # lower CRF = higher quality (range: 0–51)
        '-pix_fmt', 'yuv420p',# ensures wide compatibility
        '-preset', 'slow'     # better compression efficiency (options: ultrafast → placebo)
    ]
)
print(f'High-quality MP4 saved as {mp4_filename}')

  sns.kdeplot(distances, ax=ax_hist, fill=True)


GIF saved as population_drift_detection.gif


Multiple -pix_fmt options specified for stream 0, only the last option '-pix_fmt yuv420p' will be used.


High-quality MP4 saved as population_drift_detection.mp4
