In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.signal import savgol_filter


def simulate_learning_curve(
    days: int = 60,
    base: float = 30,
    scale: float = 20,
    noise_std: float = 5,
    random_seed: int = 42
) -> pd.DataFrame:
    """
    Simulate a learning curve: a logarithmic trend plus Gaussian noise.

    The noise-free score on day ``d`` (``d = 1 .. days``) is
    ``base + scale * log(1 + d)``. The observed score adds zero-mean normal
    noise with standard deviation ``noise_std``.

    Args:
        days: Number of consecutive days to simulate, starting at day 1.
        base: Constant offset of the noise-free curve.
        scale: Multiplier applied to ``log(1 + day)``.
        noise_std: Standard deviation of the additive Gaussian noise.
        random_seed: Seed for the noise generator; the same seed always
            reproduces the same noise sequence.

    Returns:
        A DataFrame with one row per day and the columns ``day``,
        ``true_score`` and ``observed_score``.
    """
    # Use a private generator instead of np.random.seed(): it produces the
    # same legacy random stream for a given seed, but does not reseed NumPy's
    # process-wide RNG as a hidden side effect of calling this function.
    rng = np.random.RandomState(random_seed)

    day = np.arange(1, days + 1)
    true_score = base + scale * np.log1p(day)
    noise = rng.normal(loc=0, scale=noise_std, size=days)
    observed_score = true_score + noise
    return pd.DataFrame({
        "day": day,
        "true_score": true_score,
        "observed_score": observed_score,
    })


def save_simulated_data(
    df: pd.DataFrame,
    filepath: str = "../data/simulated_learning.csv"
) -> None:
    """
    Save simulated data to CSV, creating the target folder when missing.

    Args:
        df: The data to write; the index is not written to the file.
        filepath: Destination of the CSV file. Missing parent directories
            are created before writing.
    """
    import os  # local import keeps this function self-contained

    # to_csv() refuses to write into a directory that does not exist, so
    # create it up front. Non-path targets (e.g. an open buffer) are passed
    # through untouched.
    if isinstance(filepath, (str, os.PathLike)):
        parent_dir = os.path.dirname(os.fspath(filepath))
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)

    df.to_csv(filepath, index=False)
    print(f"Data saved to {filepath}")


def plot_learning_curve(
    df: pd.DataFrame,
    window_length: int = 11,
    polyorder: int = 3
) -> None:
    """
    Draw the noisy observations, a Savitzky-Golay smoothed version of them,
    and the underlying true curve on one figure, then show it.

    ``df`` must provide the columns ``day``, ``observed_score`` and
    ``true_score``; ``window_length`` and ``polyorder`` are forwarded to
    ``savgol_filter``.
    """
    smoothed = savgol_filter(
        df["observed_score"], window_length=window_length, polyorder=polyorder
    )

    plt.figure(figsize=(10, 5))

    x_values = df['day']
    # One (y-values, line style) entry per curve, in drawing order.
    curves = (
        (df['observed_score'], {"label": 'Observed (Noisy)', "alpha": 0.6}),
        (smoothed, {"label": f'Smoothed (window={window_length})', "color": 'orange'}),
        (df['true_score'], {"label": 'True Score', "linestyle": '--', "color": 'green'}),
    )
    for y_values, line_style in curves:
        plt.plot(x_values, y_values, **line_style)

    # Axis decoration; the legend is added after all curves are drawn.
    plt.xlabel('Day')
    plt.ylabel('Performance Score')
    plt.title('Simulated Learning Curve')
    plt.legend()
    plt.grid(True)

    plt.tight_layout()
    plt.show()


if __name__ == "__main__":
    # Script entry point: simulate with the default settings, write the
    # result to the default CSV location, then display the comparison plot.
    df = simulate_learning_curve()
    save_simulated_data(df)
    plot_learning_curve(df)
```python
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.signal import savgol_filter

# --- Simulate learning data ---
# Days 1..60; the hidden curve is 30 + 20 * log(1 + day), observed with
# Gaussian noise (standard deviation 5) drawn from a fixed seed.
np.random.seed(42)
days = np.arange(1, 61)
noise = np.random.normal(loc=0, scale=5, size=days.size)
true_learning = 30 + 20 * np.log1p(days)
observed_scores = true_learning + noise

# --- Save to CSV ---
df = pd.DataFrame(
    dict(day=days, true_score=true_learning, observed_score=observed_scores)
)
df.to_csv('../data/simulated_learning.csv', index=False)

# --- Plot ---
# Smooth the noisy observations with a Savitzky-Golay filter
# (window of 11 samples, cubic polynomial).
smoothed_scores = savgol_filter(df['observed_score'], window_length=11, polyorder=3)

plt.figure(figsize=(10, 5))
plt.plot(df['day'], df['observed_score'], alpha=0.6, label='Observed (Noisy)')
plt.plot(df['day'], smoothed_scores, color='orange', label='Smoothed (Savitzky-Golay)')
plt.plot(df['day'], df['true_score'], color='green', linestyle='--', label='True Score (Hidden)')
plt.title('Simulated Learning Curve')
plt.xlabel('Day')
plt.ylabel('Performance Score')
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()