In [1]:

import pandas as pd
import numpy as np
from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from itertools import product
from sklearn.metrics import confusion_matrix, f1_score, precision_score, recall_score,mean_absolute_error, median_absolute_error,roc_auc_score
from sklearn.svm import OneClassSVM
from pyod.models.knn import KNN
from sklearn.model_selection import train_test_split


# Show every column when displaying wide DataFrames (None removes the column limit).
pd.set_option('display.max_columns', None)

## DATA PREP

In [None]:
# Load the dataset straight from the project's GitHub repository (zipped CSV);
# the first CSV column is used as the initial index.
df = pd.read_csv('https://raw.githubusercontent.com/sebastiansossah/TFM/main/data/df.zip', index_col=0)

In [None]:
# Parse the timestamps (unparseable values are coerced to NaT), then index
# the frame by time and put the rows in chronological order.
df = (
    df.assign(Timestamp=pd.to_datetime(df['Timestamp'], errors='coerce'))
      .set_index('Timestamp')
      .sort_index()
)

In [None]:
# Sanity check: (rows, columns) of the time-indexed frame.
df.shape

(116166, 99)

In [None]:
def create_sliding_windows(df, window_size, stride):
    """
    Build sliding windows per block and split them into normal vs. anomalous sets.

    Each ``Block_ID`` group is windowed independently (blocks are visited in
    ascending ``Block_ID`` order), so a window never spans two blocks. A window
    goes to the *test* split when any of its rows has ``Anomaly_Event > 0``;
    otherwise it goes to the *train* split.

    Args:
        df: DataFrame with ``Block_ID``, ``Anomaly_Event`` and ``Anomaly``
            columns. Its index values are recorded as each window's
            start/end time.
        window_size: number of consecutive rows per window (must be >= 1).
        stride: step, in rows, between the starts of consecutive windows
            (must be >= 1).

    Returns:
        windows_train: list of DataFrame windows without any anomaly event.
        windows_test: list of DataFrame windows touching an anomaly event.
        metadata_train: list of dicts, one per train window.
        metadata_test: list of dicts, one per test window.
        Each metadata dict maps a window back to its block, time span, event
        id and position (``window_idx``) inside its own split.
        If a required column is missing, a message is printed and
        ``(None, None, None, None)`` is returned.

    Raises:
        ValueError: if ``window_size`` or ``stride`` is smaller than 1.
    """
    # 'Anomaly' is read below as well, so it has to be part of the guard;
    # otherwise a missing column surfaces as a KeyError deep inside the loop.
    required_columns = ('Block_ID', 'Anomaly_Event', 'Anomaly')
    if any(col not in df.columns for col in required_columns):
        print("Necesitas Block_ID, Anomaly_Event y Anomaly")
        return None, None, None, None

    if window_size < 1 or stride < 1:
        raise ValueError(
            f"window_size and stride must be >= 1 "
            f"(got window_size={window_size}, stride={stride})"
        )

    windows_train, windows_test = [], []
    metadata_train, metadata_test = [], []

    # A single groupby pass replaces re-scanning the whole frame once per block.
    for block_id, block_data in df.groupby('Block_ID', sort=True):
        last_start = len(block_data) - window_size
        for i in range(0, last_start + 1, stride):
            window = block_data.iloc[i:i + window_size]
            max_event = window['Anomaly_Event'].max()

            meta = {
                'block_id': block_id,
                'start_time': window.index[0],
                'end_time': window.index[-1],
                'start_idx_in_block': i,
                'anomaly_event': int(max_event),
                'has_anomaly': bool(window['Anomaly'].any()),
                'n_anomaly_records': int(window['Anomaly'].sum())
            }

            if max_event > 0:
                target_windows, target_meta, split = windows_test, metadata_test, 'test'
            else:
                target_windows, target_meta, split = windows_train, metadata_train, 'train'

            # window_idx is the position of the window inside its own split.
            meta['window_idx'] = len(target_windows)
            meta['split'] = split
            target_windows.append(window)
            target_meta.append(meta)

    return windows_train, windows_test, metadata_train, metadata_test

In [None]:
def prepare_data_model_with_pca(
        windows_train,
        windows_test,
        pca_components=None
    ):
    """
    Turn lists of window DataFrames into flat, model-ready matrices.

    Continuous sensor columns are standardised with a ``StandardScaler``
    fitted on the training windows only; binary actuator columns are passed
    through unscaled. Every window is then flattened into a single row and,
    optionally, projected with a PCA fitted on the training rows.

    Args:
        windows_train: sequence of window DataFrames used to fit the scaler
            (and the PCA, when requested).
        windows_test: sequence of window DataFrames transformed with the
            already fitted scaler (and PCA).
        pca_components: forwarded to ``PCA(n_components=...)``; ``None``
            skips the PCA step.

    Returns:
        With PCA:    ``(X_train, X_test, scaler, pca, all_features)``.
        Without PCA: ``(X_train, X_test, scaler, all_features)``.
        The two tuples have different lengths, so callers must unpack
        according to whether ``pca_components`` was given.
    """
    sensor_cols = [
        'FIT101', 'LIT101',
        'AIT201', 'AIT202', 'AIT203', 'FIT201',
        'AIT301', 'AIT302', 'AIT303', 'DPIT301', 'FIT301', 'LIT301',
        'AIT401', 'AIT402', 'FIT401', 'LIT401',
        'AIT501', 'AIT502', 'AIT503', 'AIT504',
        'FIT501', 'FIT502', 'FIT503', 'FIT504',
        'PIT501', 'PIT502', 'PIT503',
        'FIT601', 'FIT602', 'LIT601', 'LIT602'
    ]
    actuator_cols = [
        'MV101', 'P101', 'P102',
        'MV201', 'P201', 'P202', 'P203', 'P204', 'P205', 'P206',
        'MV301', 'MV302', 'MV303', 'MV304', 'P301', 'P302',
        'P401', 'P402', 'P403', 'P404', 'UV401',
        'MV501', 'MV502', 'MV503', 'MV504', 'P501', 'P502',
        'P601', 'P602', 'P603'
    ]
    all_features = sensor_cols + actuator_cols
    n_sensors = len(sensor_cols)

    # Stack the windows into (n_windows, window_size, n_features) arrays.
    train_cube = np.array([win[all_features].values for win in windows_train])
    test_cube = np.array([win[all_features].values for win in windows_test])

    # Sensors occupy the leading columns, actuators the trailing ones.
    train_sensors = train_cube[:, :, :n_sensors]
    train_actuators = train_cube[:, :, n_sensors:]
    test_sensors = test_cube[:, :, :n_sensors]
    test_actuators = test_cube[:, :, n_sensors:]

    n_train, window_size, _ = train_sensors.shape
    n_test = test_sensors.shape[0]

    # The scaler works on 2-D data: collapse (window, time) into rows,
    # scale per sensor, then restore the 3-D layout.
    train_rows = train_sensors.reshape(-1, n_sensors)
    test_rows = test_sensors.reshape(-1, n_sensors)

    scaler = StandardScaler()
    train_sensors = scaler.fit_transform(train_rows).reshape(n_train, window_size, n_sensors)
    test_sensors = scaler.transform(test_rows).reshape(n_test, window_size, n_sensors)

    X_train_final = np.concatenate([train_sensors, train_actuators], axis=2)
    X_test_final = np.concatenate([test_sensors, test_actuators], axis=2)

    if pca_components is None:
        # No projection requested: one flattened row per window.
        flat_train = X_train_final.reshape(X_train_final.shape[0], -1)
        flat_test = X_test_final.reshape(X_test_final.shape[0], -1)
        return flat_train, flat_test, scaler, all_features

    print("\nAplicando PCA...")

    n_features_total = X_train_final.shape[2]
    flat_width = window_size * n_features_total

    flat_train = X_train_final.reshape(X_train_final.shape[0], flat_width)
    flat_test = X_test_final.reshape(X_test_final.shape[0], flat_width)

    # Fit the projection on training rows only, then apply it to the test rows.
    pca = PCA(n_components=pca_components)
    X_train_pca = pca.fit_transform(flat_train)
    X_test_pca = pca.transform(flat_test)

    print("Dimensión original:", flat_width)
    print("Dimensión PCA:", X_train_pca.shape[1])
    print("Varianza explicada acumulada:", pca.explained_variance_ratio_.sum())

    return X_train_pca, X_test_pca, scaler, pca, all_features


In [None]:
def get_f1(model, X_train, X_test, y_test):
    """
    Fit a detector on normal data and score it with F1 on a held-out set.

    Args:
        model: unfitted estimator exposing ``fit`` and ``predict`` whose
            ``predict`` marks anomalies with -1 (the IsolationForest /
            OneClassSVM convention).
        X_train: training matrix (normal data only).
        X_test: evaluation matrix.
        y_test: ground-truth labels for ``X_test`` (0 = normal, 1 = anomaly).

    Returns:
        The F1 score of ``y_test`` against the model's predictions on
        ``X_test``; 0 when the score is undefined (``zero_division=0``).
    """
    # Training never sees the evaluation data.
    model.fit(X_train)
    raw_labels = model.predict(X_test)

    # Re-encode the estimator output: -1 (anomaly) -> 1, anything else -> 0.
    y_pred = [int(label == -1) for label in raw_labels]

    return f1_score(y_test, y_pred, zero_division=0)

In [None]:
# Search space for the One-Class SVM sweep.
EXPERIMENT_CONFIG = {
    # Windowing and dimensionality-reduction settings.
    'strides': [5, 15, 30],
    'pca': [10, 20, 61],
    'window_size': [60],
    # OneClassSVM hyper-parameters.
    'kernel': ['rbf', 'sigmoid'],
    'nu': [0.001, 0.005, 0.01, 0.05],
    'gamma': ['scale', 'auto', 0.001, 0.01, 0.1],
}

In [None]:

# Grid search: for every (stride, PCA size, window size) rebuild the dataset,
# then fit one One-Class SVM per (kernel, nu, gamma) combination and record
# its F1 score on the test windows.
results_svm = []
for stride, pca_param, window_size in product(
        EXPERIMENT_CONFIG['strides'],
        EXPERIMENT_CONFIG['pca'],
        EXPERIMENT_CONFIG['window_size']):
    print(window_size)
    windows_train, windows_test, metadata_train, metadata_test = create_sliding_windows(
        df,
        window_size=window_size,
        stride=stride)

    # pca_components is never None here, so the 5-item return form applies.
    X_train, X_test, scaler, pca, all_features = prepare_data_model_with_pca(
        windows_train,
        windows_test,
        pca_components=pca_param)

    # Ground truth for the test windows: 1 when the window holds at least one
    # record flagged in the `Anomaly` column, 0 otherwise.
    # NOTE(review): assumes `has_anomaly` is the intended evaluation label — confirm.
    y_test = [int(meta['has_anomaly']) for meta in metadata_test]

    for kernel, nu, gamma in product(
            EXPERIMENT_CONFIG["kernel"],
            EXPERIMENT_CONFIG["nu"],
            EXPERIMENT_CONFIG["gamma"]):

        ocsvm = OneClassSVM(kernel=kernel, nu=nu, gamma=gamma)
        # get_f1 takes the labels positionally; it has no `score` keyword.
        f1 = get_f1(ocsvm, X_train, X_test, y_test)
        results_svm.append({'kernel': kernel, 'nu': nu, 'gamma': gamma, 'stride': stride,
                            'pca': pca_param, 'f1': f1})



60


KeyboardInterrupt: 

In [None]:
# One row per evaluated configuration, best F1 first.
df_resultados = (
    pd.DataFrame(results_svm)
    .sort_values(by='f1', ascending=False)
)

In [None]:
# Inspect only the runs that used 10 PCA components.
df_resultados.loc[df_resultados['pca'].eq(10)]

Unnamed: 0,kernel,nu,gamma,stride,pca,f1
19,rbf,0.050,0.1,5,10,0.902475
16,rbf,0.050,auto,5,10,0.902475
14,rbf,0.010,0.1,5,10,0.891351
11,rbf,0.010,auto,5,10,0.891351
9,rbf,0.005,0.1,5,10,0.879954
...,...,...,...,...,...,...
33,sigmoid,0.010,0.01,5,10,0.000000
143,sigmoid,0.001,0.01,15,10,0.000000
153,sigmoid,0.010,0.01,15,10,0.000000
146,sigmoid,0.005,auto,15,10,0.000000


In [None]:
# Keep, for every (pca, stride) pair, the row with the highest F1.
# Despite the variable name, the grouping uses both keys, not PCA alone.
best_by_pca = df_resultados.loc[df_resultados.groupby(["pca", "stride"])["f1"].idxmax()]

In [None]:
# Best configuration found for each (pca, stride) combination.
best_by_pca

Unnamed: 0,kernel,nu,gamma,stride,pca,f1
19,rbf,0.05,0.1,5,10,0.902475
133,rbf,0.01,0.01,15,10,0.857917
258,rbf,0.05,0.01,30,10,0.832526
53,rbf,0.01,0.01,5,20,0.920629
178,rbf,0.05,0.01,15,20,0.885895
298,rbf,0.05,0.01,30,20,0.840163
93,rbf,0.01,0.01,5,61,0.888169
218,rbf,0.05,0.01,15,61,0.832626
338,rbf,0.05,0.01,30,61,0.816061
