In [1]:
import os
import numpy as np
import pandas as pd
import tqdm
import pickle

In [2]:
# Load Training Data (X)
import zipfile


def load_padded_series(archive_path, target_len):
    """Read every ``.pkl`` member of a ZIP archive into one fixed-width array.

    Each pickle is indexed with the key ``"v"`` to obtain its value sequence.
    Sequences with at least ``target_len`` items are cut to their first
    ``target_len`` values; shorter ones are right-padded with zeros.

    Parameters
    ----------
    archive_path : str
        Path of the ZIP archive to read.
    target_len : int
        Number of values kept per series.

    Returns
    -------
    tuple
        ``(X, members)`` where ``X`` is the stacked array with one row per
        ``.pkl`` member and ``members`` lists the archive member names in the
        same order as the rows of ``X``.

    Raises
    ------
    ValueError
        If the archive contains no ``.pkl`` member.
    """
    rows = []
    with zipfile.ZipFile(archive_path, 'r') as archive:
        # Only the pickled series are of interest; ignore any other member.
        members = [name for name in archive.namelist() if name.endswith('.pkl')]
        if not members:
            raise ValueError(f"No .pkl files found in {archive_path}")
        for member in members:
            with archive.open(member) as handle:
                # NOTE(security): pickle.load can execute arbitrary code.
                # Only use this on archives from a trusted source.
                values = pickle.load(handle)["v"]
            if len(values) >= target_len:
                values = values[:target_len]
            else:
                values = np.pad(values, (0, target_len - len(values)),
                                mode='constant', constant_values=0)
            rows.append(values)
    return np.stack(rows, axis=0), members


# Path to your ZIP archive
zip_path = r"/Users/inan/Documents/GitHub/DIEF_BTS/data/train_X_v0.1.0.zip"
seq_len = 336

# X_train is a 2D array of shape (num_samples, 336). X_train_files records
# which archive member produced each row, so the row order stays traceable.
X_train, X_train_files = load_padded_series(zip_path, seq_len)

In [3]:
# Sanity check: (num_samples, seq_len). Shown as the cell's result rather than
# through print(), which is the usual notebook idiom.
X_train.shape

(31839, 336)


In [4]:
# Training labels: the first CSV column becomes the index, and the remaining
# columns are the targets that have to be predicted for the test set.
y_train_path = "/Users/inan/Documents/GitHub/DIEF_BTS/data/train_y_v0.1.0.csv"
df_train_y = pd.read_csv(filepath_or_buffer=y_train_path, index_col=0)

In [5]:
# Recode the labels: -1 -> 0 and 0 -> 1 (1 is left unchanged).
# The mapping is not idempotent: applying it a second time would turn the
# zeros produced from -1 into ones. The guard makes re-running this cell a
# no-op once no -1 is left in the frame.
if (df_train_y == -1).to_numpy().any():
    df_train_y = df_train_y.replace({0: 1, -1: 0})

In [6]:
# Preview the first five label rows after recoding
df_train_y.head(n=5)

Unnamed: 0_level_0,Active_Power_Sensor,Air_Flow_Sensor,Air_Flow_Setpoint,Air_Temperature_Sensor,Air_Temperature_Setpoint,Alarm,Angle_Sensor,Average_Zone_Air_Temperature_Sensor,Chilled_Water_Differential_Temperature_Sensor,Chilled_Water_Return_Temperature_Sensor,...,Warmest_Zone_Air_Temperature_Sensor,Water_Flow_Sensor,Water_Temperature_Sensor,Water_Temperature_Setpoint,Wind_Direction_Sensor,Wind_Speed_Sensor,Zone_Air_Dewpoint_Sensor,Zone_Air_Humidity_Sensor,Zone_Air_Humidity_Setpoint,Zone_Air_Temperature_Sensor
filename,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
train_X0.pkl,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
train_X1.pkl,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
train_X2.pkl,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
train_X3.pkl,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
train_X4.pkl,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [7]:
# X_train rows follow the order of the .pkl members inside the training ZIP,
# while df_train_y rows follow the order of the label CSV. Nothing guarantees
# that the two agree, so select the label rows by filename to make sure that
# y_train[i] is the label vector of X_train[i].
# NOTE: `zip_path` must still point at the training archive when this runs.
with zipfile.ZipFile(zip_path, 'r') as z:
    # ZIP member names always use '/'; keep only the bare file name so it
    # matches the index of df_train_y (e.g. "train_X0.pkl").
    train_files = [name.rsplit('/', 1)[-1] for name in z.namelist() if name.endswith('.pkl')]

# .loc raises KeyError if a training file has no label row.
y_train = df_train_y.loc[train_files].to_numpy()
assert len(y_train) == len(X_train), "number of label rows differs from number of samples"

In [8]:
from sklearn.linear_model import RidgeClassifier, LogisticRegression, RidgeClassifierCV
from sklearn.multiclass import OneVsRestClassifier
from sktime.transformations.panel.rocket import (
    MiniRocket,
    MiniRocketMultivariate,
    MiniRocketMultivariateVariable,
    Rocket
)
from sklearn.preprocessing import StandardScaler

In [9]:
# Insert a singleton channel axis: (num_samples, seq_len) -> (num_samples, 1, seq_len).
# Using -1 for the last axis keeps this cell re-runnable: an array that is
# already (num_samples, 1, seq_len) is left unchanged instead of raising.
X_train = np.reshape(X_train, (X_train.shape[0], 1, -1))

# by default, MiniRocket uses ~10_000 kernels; the fixed seed makes the
# fitted transform reproducible across kernel restarts.
minirocket = MiniRocket(random_state=42)
minirocket.fit(X_train)
X_train_transform = minirocket.transform(X_train)

# with_mean=False: features are scaled to unit variance without centering.
# The scaler is only constructed here; it is fitted in the next cell.
scaler = StandardScaler(with_mean=False)

In [10]:
# Fit the scaler on the MiniRocket features and scale them in one step
X_train_scaled_transform = scaler.fit_transform(X_train_transform)

# Multi-label setup: One-vs-Rest wraps RidgeClassifier as its base estimator
# and is fitted on the scaled features against all target columns.
classifier = OneVsRestClassifier(estimator=RidgeClassifier())
classifier.fit(X=X_train_scaled_transform, y=y_train)

: 

: 

# Prediction on Test Data

In [None]:
# Load Testing Data (X)
import zipfile

# Location of the test ZIP archive
zip_path = r"/Users/inan/Documents/GitHub/DIEF_BTS/data/test_X_v0.1.0.zip"
seq_len = 336
X_test_list = []

with zipfile.ZipFile(zip_path, 'r') as z:
    # Only the pickled series are of interest; skip any other archive member
    file_list = z.namelist()
    pkl_files = [name for name in file_list if name.endswith('.pkl')]
    for pickle_file_name in pkl_files:
        with z.open(pickle_file_name) as f:
            data = pickle.load(f)["v"]
        missing = seq_len - len(data)
        if missing > 0:
            # Too short: right-pad with zeros up to seq_len values
            data = np.pad(data, (0, missing), mode='constant', constant_values=0)
        else:
            # Long enough: keep only the first seq_len values
            data = data[:seq_len]
        X_test_list.append(data)

# Combine the per-file rows into one 2D array of shape (num_samples, 336)
X_test = np.stack(X_test_list, axis=0)

In [None]:
# Insert a singleton channel axis: (num_samples, seq_len) -> (num_samples, 1, seq_len).
# Using -1 for the last axis keeps this cell re-runnable: an array that is
# already (num_samples, 1, seq_len) is left unchanged instead of raising.
X_test = np.reshape(X_test, (X_test.shape[0], 1, -1))

# Reuse the transformer and scaler fitted on the training data (transform only,
# no re-fitting), then predict all target columns at once.
X_test_transform = minirocket.transform(X_test)
X_test_scaled_transform = scaler.transform(X_test_transform)
predictions = classifier.predict(X_test_scaled_transform)

In [None]:
# Ensure that the data files (ZIP of test inputs and the train_y file) are placed in the `data` folder.
zip_file_path = r"/Users/inan/Documents/GitHub/DIEF_BTS/data/test_X_v0.1.0.zip"

# Target columns in TRAINING order. The classifier was fitted on
# df_train_y.to_numpy(), so column i of `predictions` belongs to
# target_columns[i]. Labelling the predictions with a sorted list instead
# would mislabel them whenever the CSV columns are not already sorted.
expected_columns = list(df_train_y.columns)
target_columns = [col for col in expected_columns if col not in ['filename']]
# Alphabetical order is used only for the column layout of the submission.
filtered_columns = sorted(target_columns)

with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
    # Get the complete list of files inside the ZIP.
    file_list = zip_ref.namelist()

# Filter the list to only include '.pkl' files and remove the leading directory prefix.
# Same filter and order as used when loading X_test, so row i of `predictions`
# corresponds to filenames[i].
filenames = [f.replace("test_X/", "") for f in file_list if f.endswith('.pkl')]
if len(filenames) != len(predictions):
    raise ValueError(
        f"Got {len(predictions)} prediction rows for {len(filenames)} test files"
    )

# -----------------------------------------------------------
# CREATE THE SUBMISSION DATAFRAME
# -----------------------------------------------------------
# Construct a DataFrame with the same structure as the expected submission.
# It must contain a 'filename' column plus one column for each target.
# Columns are labelled in training order first and only then rearranged.
df_sample = pd.DataFrame(predictions, columns=target_columns).reindex(columns=filtered_columns)
df_sample.insert(0, 'filename', filenames)  # Insert 'filename' as the first column.

In [None]:
# Cast every target column to float16; the 'filename' column is not touched
df_sample = df_sample.astype({col: np.float16 for col in filtered_columns})

In [None]:
# Sanity check of the submission frame: prints the row count plus the
# non-null count and dtype of each column.
df_sample.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 315720 entries, 0 to 315719
Data columns (total 95 columns):
 #   Column                                               Non-Null Count   Dtype  
---  ------                                               --------------   -----  
 0   filename                                             315720 non-null  object 
 1   Active_Power_Sensor                                  315720 non-null  float16
 2   Air_Flow_Sensor                                      315720 non-null  float16
 3   Air_Flow_Setpoint                                    315720 non-null  float16
 4   Air_Temperature_Sensor                               315720 non-null  float16
 5   Air_Temperature_Setpoint                             315720 non-null  float16
 6   Alarm                                                315720 non-null  float16
 7   Angle_Sensor                                         315720 non-null  float16
 8   Average_Zone_Air_Temperature_Sensor                  3

In [None]:
# -----------------------------------------------------------
# WRITE THE SUBMISSION TO DISK
# -----------------------------------------------------------
# The frame is stored as a gzip-compressed CSV; the row index is left out.
submission_file_path = '../data/mr_submission_1.csv.gz'
df_sample.to_csv(path_or_buf=submission_file_path, compression='gzip', index=False)