In [13]:
import pandas as pd
import h5py #archivos h5
from tsfresh import extract_features, select_features #extrae features en series temporales

In [14]:
# Read the dataset stored under key 'x' of test.h5 into memory.
with h5py.File('test.h5', 'r') as f:
    x_h5_test= f['x'][:]

# Keep only index 0 along the second axis and wrap the resulting 2-D slice in a DataFrame.
# NOTE(review): presumably the array is (samples, channels, time) and this picks the
# first channel — confirm against the file's layout.
x_h5_test_df = pd.DataFrame(x_h5_test[:,0,:])

# NOTE(review): printing the whole frame yields a wide, truncated dump; .head()/.shape may be enough.
print(x_h5_test_df)

        0         1         2         3         4         5         6      \
0    0.029938  0.061432  0.072388  0.081329  0.078033  0.089691  0.071564   
1    0.000275  0.001160  0.001556  0.001740  0.002380  0.003693  0.004883   
2   -0.002899 -0.003510  0.000458  0.002991  0.005920  0.007355  0.008728   
3    0.008636  0.018158  0.015717  0.015991  0.015442  0.015564  0.014679   
4   -0.011475 -0.022217 -0.017456 -0.018890 -0.019714 -0.021301 -0.025055   
..        ...       ...       ...       ...       ...       ...       ...   
200 -0.051788 -0.099670 -0.074982 -0.069702 -0.053253 -0.044342 -0.030121   
201 -0.034637 -0.058685 -0.029419 -0.077911 -0.086914 -0.061798 -0.052765   
202 -0.006989 -0.013123 -0.010986 -0.012695 -0.012299 -0.011810 -0.012512   
203  0.022736  0.047821  0.042450  0.042908  0.037659  0.035675  0.031708   
204  0.013214  0.028412  0.026215  0.026337  0.024078  0.023010  0.019562   

        7         8         9      ...     18520     18521     18522  \
0  

In [15]:
import numpy as np
def parser_time_series(ds):
    """Flatten a wide frame into long format with columns id, time, value.

    Each cell of ``ds`` becomes one output row, visited row by row:
    ``id`` is the positional row number of the cell, ``time`` its positional
    column number, and ``value`` the cell content. The index and column
    labels of ``ds`` are not used.

    Args:
        ds: Two-dimensional DataFrame (one series per row).

    Returns:
        DataFrame with ``ds.shape[0] * ds.shape[1]`` rows and the columns
        ``id``, ``time`` and ``value``.
    """
    series_count, step_count = ds.shape
    row_positions = np.arange(series_count)
    col_positions = np.arange(step_count)
    long_columns = {
        # Every row number is repeated once per column ...
        'id': row_positions.repeat(step_count),
        # ... while the column numbers cycle once per row.
        'time': np.tile(col_positions, series_count),
        'value': ds.values.flatten(),
    }
    return pd.DataFrame(long_columns)


In [16]:
import os
import glob
import pandas as pd
import numpy as np
import h5py
import psutil
from pathlib import Path
from tsfresh import extract_features
from tsfresh.feature_extraction.settings import EfficientFCParameters
from tsfresh.utilities.dataframe_functions import impute
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix

# Request a single thread from OpenMP, MKL and OpenBLAS.
# NOTE(review): numpy/pandas are imported before this point; libraries that read
# these variables only when they are loaded may ignore them in this process — confirm.
os.environ.update(
    OMP_NUM_THREADS="1",
    MKL_NUM_THREADS="1",
    OPENBLAS_NUM_THREADS="1",
)

# Assume a parquet engine is importable until both candidates have failed.
_PARQUET_ENGINE_AVAILABLE = True
try:
    import pyarrow
except ImportError:
    # pyarrow is missing: try the alternative engine before giving up.
    try:
        import fastparquet
    except ImportError:
        _PARQUET_ENGINE_AVAILABLE = False
        print("Warning: ni pyarrow ni fastparquet disponibles")

# Extract autoajustada por batches
def extract_features_batch(x_df: pd.DataFrame, target_ram_gb: float = 8.0, sample_ids: int = 50, output_dir: str = "features_blocks", n_jobs: int = None, memory_margin: float = 0.8) -> pd.DataFrame:
    """Extract tsfresh features in blocks of ids, spilling each block to disk.

    A small sample of ids is processed first to estimate the memory taken by
    one row of extracted features. That estimate and the RAM budget decide how
    many ids go into each block. Every block is written to ``output_dir``;
    afterwards only the files written by *this call* are read back, in block
    order, and concatenated.

    Args:
        x_df: Long-format frame with an ``id`` column and a ``time`` column
            (used for sorting); remaining columns are featurized by tsfresh.
        target_ram_gb: RAM budget in GiB, scaled by ``memory_margin``.
        sample_ids: Number of ids used for the memory estimate. At least one
            id is always sampled.
        output_dir: Directory receiving the ``block_NNNN`` files; created if
            missing. Files already present there are ignored (but a file with
            the same block number is overwritten).
        n_jobs: Value passed to tsfresh as ``n_jobs`` for the block
            extraction. ``None`` selects ``cpu_count() - 1`` (minimum 1); an
            explicit ``0`` is passed through unchanged.
        memory_margin: Fraction of the budget and of the currently available
            RAM that may be used.

    Returns:
        The per-block feature frames concatenated in block order with a fresh
        0..n-1 index (any index carried by the block frames is discarded).

    Raises:
        ValueError: If ``x_df`` contains no ids.
    """
    from multiprocessing import cpu_count

    # Only None means "choose automatically"; `n_jobs or ...` would silently
    # replace an explicit 0 as well.
    if n_jobs is None:
        n_jobs = max(1, cpu_count() - 1)

    unique_ids = x_df['id'].unique()
    num_ids = len(unique_ids)
    if num_ids == 0:
        # Without ids the per-id estimate below would divide by zero.
        raise ValueError("x_df contains no ids; nothing to extract")

    avail_bytes = psutil.virtual_memory().available
    target_bytes = target_ram_gb * (1024 ** 3) * memory_margin
    max_use_bytes = min(avail_bytes * memory_margin, target_bytes)

    # Always sample at least one id so the per-id estimate is defined even for
    # sample_ids <= 0.
    samp = unique_ids[:max(1, min(sample_ids, num_ids))]
    samp_df = x_df[x_df['id'].isin(samp)]
    sample_feats = extract_features(
        samp_df,
        column_id='id',
        column_sort='time',
        default_fc_parameters=EfficientFCParameters(),
        n_jobs=1
    )
    # NOTE(review): this measures the size of the resulting feature frame, not
    # the working memory used while extracting, so the estimate may be optimistic.
    mem_usage = sample_feats.memory_usage(deep=True).sum()
    per_id = mem_usage / len(samp)

    if per_id > 0:
        block_size = max(1, int(max_use_bytes / per_id))
    else:
        # No measurable footprint: nothing to limit, use a single block.
        block_size = num_ids
    block_size = min(block_size, num_ids)
    print(f"Usable RAM estimada: {max_use_bytes/1e9:.2f} GB, memoria/ID: {per_id/1e6:.2f} MB -> block_size = {block_size}")

    os.makedirs(output_dir, exist_ok=True)

    # Remember exactly which files this call produced. Globbing the directory
    # would also pick up stale blocks from earlier runs and returns paths in
    # arbitrary order.
    written_paths = []
    for block_no, start in enumerate(range(0, num_ids, block_size)):
        batch = unique_ids[start: start + block_size]
        block_df = x_df[x_df['id'].isin(batch)]
        feats = extract_features(
            block_df,
            column_id='id',
            column_sort='time',
            default_fc_parameters=EfficientFCParameters(),
            n_jobs=n_jobs,
            # max(1, n_jobs) keeps the division valid when n_jobs == 0.
            chunksize=max(1, block_size // max(1, n_jobs))
        )
        fname_base = os.path.join(output_dir, f"block_{block_no:04d}")
        if _PARQUET_ENGINE_AVAILABLE:
            block_path = fname_base + ".parquet"
            feats.to_parquet(block_path)
        else:
            block_path = fname_base + ".csv"
            feats.to_csv(block_path, index=False)
        written_paths.append(block_path)
        print(f"Guardado bloque {block_no + 1} con {len(batch)} IDs")

    all_frames = [
        pd.read_parquet(block_path) if block_path.endswith('.parquet') else pd.read_csv(block_path)
        for block_path in written_paths
    ]

    all_feats = pd.concat(all_frames, ignore_index=True)
    return all_feats

In [17]:
# Reshape the wide test frame into the long layout (columns id, time, value) built by parser_time_series.
X_test_parser = parser_time_series(x_h5_test_df)

In [18]:
# Inspect the long-format frame.
print(X_test_parser)

          id   time     value
0          0      0  0.029938
1          0      1  0.061432
2          0      2  0.072388
3          0      3  0.081329
4          0      4  0.078033
...      ...    ...       ...
3798645  204  18525 -0.023773
3798646  204  18526 -0.022858
3798647  204  18527 -0.022125
3798648  204  18528 -0.020447
3798649  204  18529 -0.019043

[3798650 rows x 3 columns]


In [26]:
# Run the batched feature extraction on the long-format test data with default settings,
# then show the resulting frame and its shape.
X_test_features = extract_features_batch(X_test_parser)
print(X_test_features)
print(X_test_features.shape)

Feature Extraction: 100%|██████████| 50/50 [01:25<00:00,  1.71s/it]

Usable RAM estimada: 3.83 GB, memoria/ID: 0.01 MB -> block_size = 205



Feature Extraction: 100%|██████████| 10/10 [01:21<00:00,  8.15s/it]

Guardado bloque 1 con 205 IDs
     value__variance_larger_than_standard_deviation  value__has_duplicate_max  \
0                                               0.0                       0.0   
1                                               0.0                       0.0   
2                                               0.0                       0.0   
3                                               0.0                       0.0   
4                                               0.0                       0.0   
..                                              ...                       ...   
200                                             0.0                       0.0   
201                                             0.0                       0.0   
202                                             0.0                       0.0   
203                                             0.0                       1.0   
204                                             0.0                       0.0  




In [None]:
# Original note: "do NOT standardize".
# NOTE(review): the cell nevertheless fits a StandardScaler on the test features —
# presumably it is superseded by the later standardization cell; confirm and remove if dead.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_test_features)
# No column names are passed here, so the frame gets default integer column labels.
X_scaled_test_df = pd.DataFrame(X_scaled)

print("Features estandarizadas", X_scaled_test_df.shape)

Features estandarizadas (205, 777)


  updated_mean = (last_sum + new_sum) / updated_sample_count
  T = new_sum / new_sample_count
  new_unnormalized_variance -= correction**2 / new_sample_count


In [None]:
# Original note: "do NOT standardize".
# NOTE(review): the cell nevertheless fits a StandardScaler on the test features and
# repeats the surrounding scaling cells — presumably dead code; confirm and remove.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_test_features)

# Rebuild a DataFrame that keeps the column names of X_test_features.
X_scaled_df = pd.DataFrame(X_scaled, columns=X_test_features.columns)


  updated_mean = (last_sum + new_sum) / updated_sample_count
  T = new_sum / new_sample_count
  new_unnormalized_variance -= correction**2 / new_sample_count


In [37]:
# Standardize the extracted test features.
# NOTE(review): the scaler is fitted on the test features themselves; if the training
# features were standardized with their own scaler, the test set should presumably be
# transformed with that fitted scaler instead — confirm.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_test_features)

# Rebuild a DataFrame that keeps the column names of X_test_features.
X_scaled_test_df_full = pd.DataFrame(X_scaled, columns=X_test_features.columns)

# Load the table whose columns define the feature subset to keep.
# NOTE(review): presumably the 40 features selected on the training set — verify the file's contents.
X_train = pd.read_csv("features_40.csv")

# Keep only the columns named in X_train, in X_train's column order
# (raises KeyError if any of them is missing from the test features).
X_scaled_test_df = X_scaled_test_df_full[X_train.columns]

print("Features estandarizadas y filtradas:", X_scaled_test_df.shape)


Features estandarizadas y filtradas: (205, 40)


  updated_mean = (last_sum + new_sum) / updated_sample_count
  T = new_sum / new_sample_count
  new_unnormalized_variance -= correction**2 / new_sample_count


In [38]:
# Write the scaled, column-filtered test features to CSV without the index column.
X_scaled_test_df.to_csv('x_test_40.csv', index=False)