# MLOps Plan: KROM Bank Indonesia Stock - Notebook
# Notebook ini berisi rencana MLOps, EDA cepat, baseline modeling, dan langkah produksi.
# Dataset: files ditemukan di folder `dataset/` (mis: `BBSI.JK.csv`).


In [None]:
## Persiapan lingkungan & install library

# Perbarui pip dan install dependencies yang dibutuhkan. Jalankan cell ini di venv proyek (lihat `configure_python_environment`).

import sys
import subprocess

def run(cmd):
    """Echo a command line and execute it, failing on a non-zero exit status.

    Args:
        cmd: Argument vector as a sequence of strings (no shell is involved).

    Raises:
        subprocess.CalledProcessError: If the command exits with a non-zero
            status.
    """
    command_line = ' '.join(cmd)
    print('>', command_line)
    subprocess.check_call(cmd)

# Upgrade pip first; a failure is reported but does not stop the notebook.
try:
    run([sys.executable, '-m', 'pip', 'install', '--upgrade', 'pip'])
except Exception as err:
    print('Gagal update pip:', err)

# Third-party packages used by the later cells, installed in one pip call.
packages = [
    'pandas',
    'numpy',
    'matplotlib',
    'seaborn',
    'scikit-learn',
    'statsmodels',
    'prophet>=1.1',
    'tensorflow',
    'mlflow',
    'optuna',
    'joblib',
    'pandera',
    'lightgbm',
]
try:
    # Best effort: when the install fails, the notebook continues so that an
    # environment with the packages already present can still run.
    run([sys.executable, '-m', 'pip', 'install', *packages])
except Exception as err:
    print('Instalasi paket gagal (lanjutkan jika sudah terpasang):', err)


In [None]:
# Imports umum untuk notebook
import os
from pathlib import Path
import glob
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import mean_squared_error, mean_absolute_error
from math import sqrt
from IPython.display import display
import warnings

# Plot styling shared by every chart drawn in this notebook.
sns.set_style('whitegrid')

# Project directory layout, relative to the current working directory.
ROOT = Path('.')
DATA_DIR = ROOT / 'dataset'
MODELS_DIR = ROOT / 'models'
ARTIFACTS_DIR = ROOT / 'artifacts'

# Create any missing directory so that later cells can write into it.
for directory in (DATA_DIR, MODELS_DIR, ARTIFACTS_DIR):
    directory.mkdir(parents=True, exist_ok=True)

print('DATA_DIR:', DATA_DIR.resolve())
# Only the first ten directory entries are listed.
print('Found files:', list(DATA_DIR.glob('*'))[0:10])

In [None]:
# Helper: cari CSV utama (prioritaskan daily CSV)
from glob import glob
# Collect every CSV directly inside the dataset folder, in sorted order.
csv_candidates = glob(str(DATA_DIR / '*.csv'))
csv_candidates.sort()
print('CSV candidates:', csv_candidates)

# Preference: a daily '<ticker>.JK*.csv' file; otherwise the first CSV found.
def get_csv_path(candidates=None):
    """Choose the CSV file to load from a list of candidate paths.

    A candidate is preferred when its *file name* ends with ``.csv``,
    contains ``.JK`` and does not contain ``_monthly`` or ``_weekly``.
    The checks look at the file name only, so a parent directory whose name
    happens to contain one of those tokens no longer disqualifies a file.

    Args:
        candidates: Iterable of path strings to choose from. Defaults to the
            module-level ``csv_candidates`` list.

    Returns:
        A ``Path`` to the first preferred candidate, else a ``Path`` to the
        first candidate of any kind, else ``None`` when there are none.
    """
    if candidates is None:
        candidates = csv_candidates
    candidates = list(candidates)
    for p in candidates:
        name = os.path.basename(p)
        is_resampled = '_monthly' in name or '_weekly' in name
        if name.endswith('.csv') and '.JK' in name and not is_resampled:
            return Path(p)
    # Fallback: the first CSV, whatever it is called.
    return Path(candidates[0]) if candidates else None

CSV_PATH = get_csv_path()
print('Using CSV:', CSV_PATH)

# Load the CSV when one was found and still exists on disk.
if CSV_PATH is not None and CSV_PATH.exists():
    df = pd.read_csv(CSV_PATH)
    print('Loaded rows,cols:', df.shape)
    display(df.head())
    # DataFrame.info() prints its report itself and returns None, so it is
    # called directly; wrapping it in display() only rendered a stray "None".
    df.info()
else:
    print('CSV not found in dataset/, pastikan sudah diunduh.')


In [None]:
# Keep an untouched snapshot of the loaded data, then run a quick EDA pass.
RAW_PATH = DATA_DIR / 'raw.csv'
if 'df' in globals():
    df.to_csv(RAW_PATH, index=False)
    print('Saved raw snapshot to', RAW_PATH)

    # Basic EDA: missing values per column and summary statistics.
    print('\nMissing per column:\n', df.isna().sum())
    display(df.describe())

    # Candidate date columns: names containing "date" or "tanggal",
    # compared case-insensitively.
    date_cols = [
        c for c in df.columns
        if any(token in c.lower() for token in ('date', 'tanggal'))
    ]
    print('Date columns candidates:', date_cols)
    if not date_cols:
        warnings.warn('Tidak menemukan kolom tanggal otomatis; pastikan file memiliki kolom tanggal.')
    else:
        # The first candidate is parsed in place; unparseable values become NaT.
        date_col = date_cols[0]
        df[date_col] = pd.to_datetime(df[date_col], errors='coerce')
        if df[date_col].notna().any():
            print('Date range:', df[date_col].min(), 'to', df[date_col].max())
        else:
            warnings.warn(f"Kolom '{date_col}' tidak dapat diparse ke datetime (semua NaT). Periksa kolom lain atau format tanggal.")

In [None]:
# Preprocessing: index by date, sort, resample to daily frequency, forward-fill.
# This cell creates `df_daily`, which the feature-engineering cell consumes.
if 'df' in globals():
    if isinstance(df.index, pd.DatetimeIndex):
        # The cell has already run: the date column is now the index. Detecting
        # a date column again would fall back to the first remaining column
        # and parse its values as timestamps, so only re-sort here.
        df = df.loc[df.index.notna()].sort_index()
    else:
        # Detect the date column the same way the EDA cell does; fall back to
        # the first column when no name matches.
        possible_dates = [c for c in df.columns if 'date' in c.lower() or 'tanggal' in c.lower()]
        date_col = possible_dates[0] if possible_dates else df.columns[0]
        df[date_col] = pd.to_datetime(df[date_col], errors='coerce')
        if df[date_col].isna().any():
            warnings.warn('Beberapa tanggal tidak ter-parse; baris akan di-drop sebelum resample.')
            df = df.loc[df[date_col].notna()].copy()
        df = df.sort_values(date_col).set_index(date_col)
    numeric_cols = df.select_dtypes(include=['number']).columns.tolist()
    if not numeric_cols:
        warnings.warn('Tidak ada kolom numerik; feature engineering akan dibatalkan.')
    else:
        daily_source = df[numeric_cols]
        if daily_source.index.has_duplicates:
            # Upsampling reindexes onto the daily grid, which presumably
            # rejects repeated timestamps (to be confirmed against pandas).
            # Only the resample input is de-duplicated; `df` keeps all rows.
            warnings.warn('Tanggal duplikat ditemukan; hanya baris terakhir per tanggal yang dipakai.')
            daily_source = daily_source[~daily_source.index.duplicated(keep='last')]
        # Resample to calendar days and forward-fill the missing timestamps.
        df_daily = daily_source.resample('D').ffill()
        print('df_daily shape:', df_daily.shape)
else:
    warnings.warn('`df` tidak ditemukan; pastikan CSV sudah dimuat.')

In [None]:
# Feature engineering: daily return, lagged closes and rolling means.
# Creates `df_feat` and `close_col`, which the modelling cells consume.
if 'df_daily' in globals():
    df_feat = df_daily.copy()
    # Use the first known spelling of the close column that is present.
    close_col = next(
        (c for c in ['Close', 'close', 'Adj Close', 'adj_close', 'Close*'] if c in df_feat.columns),
        None,
    )
    if close_col is None:
        # Fallback: the first numeric column.
        close_col = df_feat.select_dtypes('number').columns[0]
        warnings.warn(f"Tidak menemukan kolom 'Close' eksplisit. Menggunakan '{close_col}' sebagai target.")
    # The longest rolling window below needs 21 rows to produce one value.
    min_required = 21
    if len(df_feat) < min_required:
        raise ValueError(f"Data terlalu sedikit untuk feature engineering (butuh >= {min_required}, punya {len(df_feat)})")
    # fill_method=None: do not pad missing closes before computing the change
    # (relying on the implicit padding default is deprecated in recent pandas).
    df_feat['return'] = df_feat[close_col].pct_change(fill_method=None)
    for lag in [1, 2, 3, 5, 7]:
        df_feat[f'lag_{lag}'] = df_feat[close_col].shift(lag)
    df_feat['sma_7'] = df_feat[close_col].rolling(7).mean()
    df_feat['sma_21'] = df_feat[close_col].rolling(21).mean()
    # NOTE(review): `return`, `sma_7` and `sma_21` are computed from the same
    # row's close; a model predicting that row's close must lag them first.
    # A zero previous close makes the return +/-inf. dropna() keeps inf, and
    # sklearn estimators are expected to reject it, so map it to NaN first.
    df_feat = df_feat.replace([np.inf, -np.inf], np.nan).dropna().copy()
    if df_feat.empty:
        warnings.warn('Tidak ada baris tersisa setelah membuang nilai kosong; periksa kolom yang seluruhnya kosong.')
    display(df_feat[[close_col, 'return', 'sma_7', 'sma_21']].head())
else:
    warnings.warn('`df_daily` tidak tersedia; jalankan sel preprocessing terlebih dahulu.')

In [None]:
# Train/val/test split (time-series aware) and baseline
from sklearn.model_selection import TimeSeriesSplit

# safe metric functions
def rmse(a, b):
    """Return the root of ``mean_squared_error(a, b)``.

    Args:
        a: Observed values.
        b: Predicted values.
    """
    mse = mean_squared_error(a, b)
    return sqrt(mse)

def mape(a, b):
    """Return the mean absolute percentage error of ``b`` against ``a``.

    Positions where the observed value is zero are left out so the division
    is always defined.

    Args:
        a: Observed values (anything convertible to a float array).
        b: Predicted values, aligned with ``a``.

    Returns:
        The error as a percentage, or NaN when no observed value is non-zero.
    """
    actual = np.array(a, dtype=float)
    predicted = np.array(b, dtype=float)
    nonzero = actual != 0
    if not nonzero.any():
        return np.nan
    denom = actual[nonzero]
    pct_errors = np.abs((denom - predicted[nonzero]) / denom)
    return np.mean(pct_errors) * 100

def compute_metrics(y_true, y_pred):
    """Score predictions with RMSE, MAE and MAPE.

    Args:
        y_true: Observed values.
        y_pred: Predicted values, aligned with ``y_true``.

    Returns:
        A dict with the keys ``'rmse'``, ``'mae'`` and ``'mape'``.
    """
    scorers = (
        ('rmse', rmse),
        ('mae', mean_absolute_error),
        ('mape', mape),
    )
    return {label: scorer(y_true, y_pred) for label, scorer in scorers}

if 'df_feat' in globals():
    # Every non-target column describes the same day as the target (same-day
    # prices, the return and the rolling means all include today's close;
    # `lag_1` times (1 + `return`) reproduces it exactly). Shifting the feature
    # rows down by one day means the close of day t is predicted only from
    # information available at t-1. The first row has no predecessor and is
    # dropped from both X and y.
    y = df_feat[close_col].iloc[1:]
    X = df_feat.drop(columns=[close_col]).shift(1).iloc[1:]
    n = len(X)
    # Chronological 70 / 15 / 15 split; no shuffling.
    train_end = int(n*0.7)
    val_end = train_end + int(n*0.15)
    X_train, y_train = X.iloc[:train_end], y.iloc[:train_end]
    X_val, y_val = X.iloc[train_end:val_end], y.iloc[train_end:val_end]
    X_test, y_test = X.iloc[val_end:], y.iloc[val_end:]
    print('Train/Val/Test sizes:', len(X_train), len(X_val), len(X_test))

# Baseline: naive forecast (predict t+1 = t)
if 'df_feat' in globals() and 'y_test' in globals() and len(y_test) > 0:
    # Take the previous day's close from the full series, so the first test
    # point is predicted from the last pre-test day rather than back-filled
    # with its own target value.
    naive_pred = df_feat[close_col].shift(1).reindex(y_test.index)
    print('Naive metrics:', compute_metrics(y_test.values, naive_pred.values))
else:
    print('Tidak ada data test untuk mengevaluasi baseline.')

In [None]:
# RandomForest baseline + save model
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
import joblib

# Stays None when the model is not trained, so later cells can tell.
rf_rmse = None

# Both splits must be non-empty: fitting needs training rows and scoring needs
# test rows.
if 'X_train' in globals() and len(X_train) > 0 and len(X_test) > 0:
    pipe = Pipeline([
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler()),
        ('model', RandomForestRegressor(n_estimators=100, random_state=42))
    ])
    pipe.fit(X_train, y_train)
    preds = pipe.predict(X_test)
    metrics = compute_metrics(y_test.values, preds)
    rf_rmse = metrics['rmse']
    print('RF metrics:', metrics)
    # Persist the whole pipeline so that imputation and scaling travel with
    # the model.
    model_path = MODELS_DIR / 'rf_baseline.pkl'
    joblib.dump(pipe, model_path)
    print('Saved model to', model_path)
else:
    warnings.warn('Tidak cukup data untuk melatih RandomForest baseline.')

In [None]:
# Prophet example (fallback if not installed) and quick LSTM skeleton
try:
    from prophet import Prophet
    prophet_available = True
except Exception as e:
    prophet_available = False
    print('Prophet not available:', e)

if prophet_available and 'df' in globals():
    if 'close_col' not in globals() or close_col not in df.columns:
        warnings.warn('Kolom target tidak ditemukan pada dataframe untuk Prophet.')
    else:
        # Build the two-column frame Prophet expects: `ds` (dates), `y` (target).
        if isinstance(df.index, pd.DatetimeIndex):
            ds_values = pd.Series(df.index)
        else:
            # Preprocessing has not run: look for a date-like column, falling
            # back to the first column.
            date_like = [c for c in df.columns if 'date' in str(c).lower() or 'tanggal' in str(c).lower()]
            ds_values = pd.to_datetime(df[date_like[0] if date_like else df.columns[0]], errors='coerce')
        pdf = pd.DataFrame({
            'ds': ds_values.reset_index(drop=True),
            'y': df[close_col].reset_index(drop=True),
        })
        pdf = pdf.dropna().sort_values('ds')
        train_size = int(len(pdf)*0.7)
        p_train = pdf.iloc[:train_size]
        p_test = pdf.iloc[train_size:]
        if len(p_train) < 2 or p_test.empty:
            warnings.warn('Data tidak cukup untuk melatih dan mengevaluasi Prophet.')
        else:
            try:
                m = Prophet()
                m.fit(p_train)
                # Predict on the actual hold-out dates. Generating consecutive
                # calendar days and taking the tail misaligns with `p_test`
                # whenever the data has date gaps.
                fcst = m.predict(p_test[['ds']])
                fc_pred = fcst['yhat']
                print('Prophet RMSE:', rmse(p_test['y'].values, fc_pred.values))
            except Exception as e:
                warnings.warn(f'Prophet run failed: {e}')

# LSTM skeleton (tensorflow)
try:
    import tensorflow as tf
    tf_available = True
except Exception as e:
    tf_available = False
    print('TensorFlow not available:', e)

if tf_available and 'df_feat' in globals() and len(df_feat) > 50:
    # Windowed dataset: the previous `window` closes predict the next close.
    values = df_feat[close_col].values
    window = 21
    Xw = np.array([values[i-window:i] for i in range(window, len(values))])[..., np.newaxis]
    yw = np.array(values[window:])
    # Chronological split: first 70% of the windows train, the rest is held out.
    split = int(0.7*len(Xw))
    Xtr, ytr = Xw[:split], yw[:split]
    Xte, yte = Xw[split:], yw[split:]
    model = tf.keras.Sequential([
        tf.keras.layers.Input(shape=(window,1)),
        tf.keras.layers.LSTM(32),
        tf.keras.layers.Dense(1)
    ])
    model.compile(optimizer='adam', loss='mse')
    # NOTE: training an LSTM can be slow; this is only a short smoke run.
    history = model.fit(Xtr, ytr, epochs=5, validation_split=0.1)
    # Score the hold-out windows, which were previously built but never used.
    if len(Xte) > 0:
        lstm_pred = model.predict(Xte, verbose=0).ravel()
        print('LSTM hold-out RMSE:', float(np.sqrt(np.mean((yte - lstm_pred) ** 2))))
    # Keras 3 only accepts save paths ending in `.keras` or `.h5`; the
    # extension-less path used before is rejected there.
    lstm_path = MODELS_DIR / 'lstm_baseline.keras'
    model.save(str(lstm_path))
    print('LSTM trained and saved')
elif tf_available:
    warnings.warn('Data tidak cukup besar untuk LSTM (butuh >50 baris fitur).')

In [None]:
# MLflow quick setup & logging example (local file store)
try:
    import mlflow
    # ARTIFACTS_DIR is relative, and 'file://' + a relative path is not a valid
    # file URI (its first segment is read as a host name). Resolve to an
    # absolute path and let pathlib build the URI.
    tracking_dir = (ARTIFACTS_DIR / 'mlruns').resolve()
    mlflow.set_tracking_uri(tracking_dir.as_uri())
    print('MLflow tracking URI:', mlflow.get_tracking_uri())
    with mlflow.start_run(run_name='rf_baseline'):
        mlflow.log_param('model', 'RandomForest')
        if 'rf_rmse' in globals() and rf_rmse is not None:
            mlflow.log_metric('rf_rmse', float(rf_rmse))
        else:
            # No placeholder metric is logged: a sentinel such as -1 would
            # rank as the best RMSE in the tracking store.
            warnings.warn('rf_rmse tidak tersedia; metrik tidak dilog.')
        if 'model_path' in globals() and Path(model_path).exists():
            mlflow.log_artifact(str(model_path))
        else:
            warnings.warn('model_path tidak ada; tidak ada artefak model yang dilog.')
except Exception as e:
    # Best effort: tracking problems must not stop the notebook.
    print('MLflow not available or failed:', e)


## Next steps & checklist

- Validasi schema dengan `pandera`.
- Tambahkan unit tests (`pytest`) untuk fungsi preprocessing & feature engineering.
- Tambahkan CI (GitHub Actions) untuk lint, tests, build docker image.
- Tambahkan Dockerfile dan FastAPI endpoint untuk `predict`.
- Setup monitoring: drift detection dan retrain triggers.

---

**Catatan:** jalankan notebook cell per cell di environment yang benar; beberapa paket (Prophet, TensorFlow) mungkin membutuhkan dependencies OS tambahan.


In [None]:
# CNN model skeleton (Conv1D) + save preprocessing artifacts
# This cell does NOT run automatically; run manually after EDA & preprocessing cells.
import joblib
try:
    import tensorflow as tf
    from tensorflow.keras import layers, models
    tf_available = True
except Exception as e:
    tf_available = False
    print('TensorFlow not available:', e)

if 'df_feat' in globals():
    # Resolve the target column: reuse `close_col` when the feature cell set
    # it, otherwise infer it from the column names.
    if 'close_col' in globals():
        close_col_local = close_col
    else:
        possible = [c for c in df_feat.columns if 'close' in c.lower() or 'adj' in c.lower()]
        close_col_local = possible[0] if possible else df_feat.select_dtypes('number').columns[0]
    feature_cols = [c for c in df_feat.columns if c != close_col_local]

    # Save the preprocessed dataframe. Parquet needs an optional engine
    # (pyarrow or fastparquet) that this notebook does not install explicitly,
    # so a missing engine is reported instead of aborting the cell.
    ARTIFACTS_DIR.mkdir(parents=True, exist_ok=True)
    preproc_path = ARTIFACTS_DIR / 'preprocessed.parquet'
    try:
        df_feat.to_parquet(preproc_path)
        print('Saved preprocessed data to', preproc_path)
    except ImportError as e:
        print('Could not save preprocessed parquet (install pyarrow or fastparquet):', e)

    # Save the feature list, one column name per line.
    (ARTIFACTS_DIR / 'feature_list.txt').write_text('\n'.join(feature_cols))
    print('Saved feature list to', ARTIFACTS_DIR / 'feature_list.txt')

    # Sliding windows: the previous `window` feature rows predict the next close.
    window = 21
    values_X = df_feat[feature_cols].values
    values_y = df_feat[close_col_local].values
    if len(values_y) <= window or not feature_cols:
        # Without at least one window the 3-D shape below does not exist.
        print(f'Not enough data for a {window}-step window (rows: {len(values_y)}, features: {len(feature_cols)}); skipping scaler and CNN.')
    else:
        Xs = np.stack([values_X[i-window:i] for i in range(window, len(values_y))])
        ys = np.array(values_y[window:])
        print('Prepared CNN dataset shapes', Xs.shape, ys.shape)

        n_samples, n_steps, n_features = Xs.shape
        split = int(0.7 * n_samples)

        # Fit the scaler on the training windows only, so hold-out statistics
        # do not leak into the preprocessing. With too few windows to split,
        # fall back to all of them.
        from sklearn.preprocessing import StandardScaler
        scaler = StandardScaler()
        fit_windows = Xs[:split] if split > 0 else Xs
        scaler.fit(fit_windows.reshape(-1, n_features))
        joblib.dump(scaler, ARTIFACTS_DIR / 'scaler.joblib')
        print('Saved scaler to', ARTIFACTS_DIR / 'scaler.joblib')

        # Optionally train a small Conv1D model when TensorFlow is available.
        if not tf_available:
            print('Skipping CNN training (TensorFlow not available).')
        elif split == 0:
            print('Skipping CNN training (not enough windows for a train/test split).')
        else:
            # The scaler works on 2-D rows: flatten, transform, restore shape.
            Xs_scaled = scaler.transform(Xs.reshape(-1, n_features)).reshape(n_samples, n_steps, n_features)
            Xtr, Xte = Xs_scaled[:split], Xs_scaled[split:]
            ytr, yte = ys[:split], ys[split:]

            model = models.Sequential([
                layers.Input(shape=(n_steps, n_features)),
                layers.Conv1D(32, kernel_size=3, activation='relu'),
                layers.MaxPooling1D(pool_size=2),
                layers.Conv1D(64, kernel_size=3, activation='relu'),
                layers.GlobalAveragePooling1D(),
                layers.Dense(32, activation='relu'),
                layers.Dense(1)
            ])
            model.compile(optimizer='adam', loss='mse')
            # NOTE: epochs are kept small; this is only a smoke test.
            model.fit(Xtr, ytr, epochs=5, validation_data=(Xte, yte))
            # Keras 3 only accepts save paths ending in `.keras` or `.h5`.
            cnn_path = MODELS_DIR / 'cnn_conv1d.keras'
            model.save(str(cnn_path))
            print('Saved CNN model to', cnn_path)
else:
    print('df_feat not found; run preprocessing/feature engineering cells first.')
