In [32]:
import os
import typing

# Import your dependencies
import joblib
import pandas as pd
import scipy
import sklearn.metrics
import warnings
from arch.univariate.base import ConvergenceWarning
from sklearn.exceptions import DataConversionWarning
# Suppress scikit-learn's DataConversionWarning for the rest of this kernel session.
# NOTE(review): presumably triggered by passing y as a single-column DataFrame
# to the estimators further down — confirm before relying on this filter.
warnings.filterwarnings("ignore", category=DataConversionWarning)


# Load the training features and labels; both paths are relative, so the
# parquet files are read from the current working directory.
X_train = pd.read_parquet("X_train.parquet")
y_train = pd.read_parquet("y_train.parquet")

In [60]:
import numpy as np
import pandas as pd
from scipy import stats
from multiprocessing import Pool, cpu_count
import numpy as np, pandas as pd
from scipy import stats
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.preprocessing import RobustScaler
from sklearn.pipeline import Pipeline
from scipy.stats import median_abs_deviation
from scipy.stats import wasserstein_distance
from arch import arch_model  
from statsmodels.tsa.stattools import acf, pacf
from statsmodels.stats.diagnostic import acorr_ljungbox
import lightgbm as lgb
from catboost import CatBoostClassifier
from scipy.stats import linregress

import numpy as np

def count_local_extrema(x):
    """
    Return how many strict local maxima and minima the sequence x contains.

    Only interior points are examined; each one is compared with its two
    immediate neighbours using strict inequalities, so flat stretches do not
    count. Inputs with fewer than three elements have no interior points and
    give 0.
    """
    values = np.asarray(x)
    if len(values) < 3:
        return 0

    left, centre, right = values[:-2], values[1:-1], values[2:]
    is_peak = (centre > left) & (centre > right)
    is_trough = (centre < left) & (centre < right)
    # A point can never be both a strict peak and a strict trough, so the
    # size of the union equals the sum of the two individual counts.
    return (is_peak | is_trough).sum()

def rolling_std_features(x, windows=[5,10,20]):
    """
    Summarise the rolling standard deviation of x for several window sizes.

    For every window size w in `windows`, two entries are produced:
    'rollstd_mean_w{w}' (mean of the rolling std) and 'rollstd_std_w{w}'
    (std of the rolling std). Both are NaN when x holds fewer than w values.

    Returns a dict mapping those feature names to scalars.
    """
    series = pd.Series(x)
    n_obs = len(series)
    features = {}
    for width in windows:
        mean_key = f'rollstd_mean_w{width}'
        std_key = f'rollstd_std_w{width}'
        if n_obs < width:
            # Not enough observations to fill even a single window.
            features[mean_key] = np.nan
            features[std_key] = np.nan
            continue
        # The first width-1 positions of a rolling window are NaN; drop them.
        window_stds = series.rolling(window=width).std().dropna()
        features[mean_key] = window_stds.mean()
        features[std_key] = window_stds.std()
    return features

def _segment_stats(x):
    """Summarise one segment (1D array of values) as a dict of scalar statistics.

    The same ten keys are returned for every input so that callers always
    build rows with an identical column set; an empty segment yields NaN for
    every key.
    """
    keys = ('mean', 'std', 'median', 'skew', 'kurt',
            'q25', 'q75', 'q10', 'q90', 'outliers')
    if len(x) == 0:
        return dict.fromkeys(keys, np.nan)
    return dict(
        mean=np.mean(x), std=np.std(x, ddof=1),
        median=np.median(x),
        skew=stats.skew(x), kurt=stats.kurtosis(x),
        q25=np.percentile(x, 25), q75=np.percentile(x, 75),
        q10=np.percentile(x, 10), q90=np.percentile(x, 90),
        # Share of points further than 3 std from the mean. The threshold
        # uses np.std's default (ddof=0), unlike the 'std' entry (ddof=1).
        outliers=(np.abs(x - np.mean(x)) > 3 * np.std(x)).sum() / len(x),
    )


def extract_features(df):
    """Build one row of before/after comparison features per id.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a multiindex (id, time) and columns ['value', 'period'].
        Rows with period == 0 form the "before" segment of an id, rows with
        period == 1 the "after" segment.

    Returns
    -------
    pandas.DataFrame
        Indexed by 'id', in order of first appearance of each id. Columns:
        'b_*' / 'a_*' summary statistics and rolling-std features for the
        before / after segment, 'mean_diff' (before mean minus after mean),
        'std_ratio' (after std over before std), 'tstat' and 'tp' (Welch
        t-test statistic and p-value) and 'ks' (two-sample KS statistic).
        An empty input yields an empty frame whose index is named 'id'.
    """
    feats = []
    # A single groupby pass replaces a per-id df.loc lookup; sort=False keeps
    # the ids in order of first appearance, as index.unique() would.
    for i, s in df.groupby(level=0, sort=False):
        before = s.loc[s['period'] == 0, 'value'].values
        after = s.loc[s['period'] == 1, 'value'].values

        b = _segment_stats(before)
        a = _segment_stats(after)

        # Rolling std features
        b_rollstd = rolling_std_features(before)
        a_rollstd = rolling_std_features(after)

        row = {'id': i}
        row.update({f'b_{k}': v for k, v in b.items()})
        row.update({f'a_{k}': v for k, v in a.items()})
        row.update({f'b_{k}': v for k, v in b_rollstd.items()})
        row.update({f'a_{k}': v for k, v in a_rollstd.items()})

        # differences & statistical tests
        row['mean_diff'] = b['mean'] - a['mean']
        # The small epsilon guards against division by a zero "before" std.
        row['std_ratio'] = a['std'] / (b['std'] + 1e-9)

        # t-test and ks are best-effort: degenerate segments give NaN.
        # Catch Exception rather than using a bare except so that
        # KeyboardInterrupt / SystemExit still propagate.
        try:
            tstat, tp = stats.ttest_ind(before, after, equal_var=False, nan_policy='omit')
        except Exception:
            tstat, tp = np.nan, np.nan
        row['tstat'] = tstat
        row['tp'] = tp

        try:
            # Only the KS statistic is kept as a feature; the p-value is discarded.
            ks, _ = stats.ks_2samp(before, after)
        except Exception:
            ks = np.nan
        row['ks'] = ks

        feats.append(row)

    if not feats:
        # With no rows there is no 'id' column for set_index to use.
        return pd.DataFrame(index=pd.Index([], name='id'))
    return pd.DataFrame(feats).set_index('id')

# Build the feature matrix from a subset of the training data. `.loc[:5000]`
# is a label-based slice, so the end label 5000 is included.
# NOTE(review): presumably this slices on the id level of the index — confirm.
X = extract_features(X_train.loc[:5000])

# Pick the labels for exactly the ids present in X and cast them to int.
y = y_train.loc[X.index].astype(int)

# Scaling followed by a CatBoost classifier, wrapped in one pipeline so the
# scaler is re-fitted inside every cross-validation fold.
model = Pipeline([
    ('scaler', RobustScaler()),          # optional but recommended for robustness
    ('catboost', CatBoostClassifier(verbose=0))  # silent training
])

# 5-fold stratified CV with a fixed shuffle seed; folds are scored by ROC AUC
# and evaluated in parallel (n_jobs=-1).
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scores = cross_val_score(model, X, y, cv=cv, scoring='roc_auc', n_jobs=-1)

# Report the mean and standard deviation of the per-fold scores.
print(f"CatBoost ROC AUC CV mean: {scores.mean():.4f} ± {scores.std():.4f}")


CatBoost ROC AUC CV mean: 0.6862 ± 0.0169
