In [1]:
import polars as pl
import os

# Locations of the three PSM dataset files, all inside one directory
dataset_dir = "../../datasets/PSM"
train_path, test_path, test_label_path = (
    os.path.join(dataset_dir, filename)
    for filename in ("train.csv", "test.csv", "test_label.csv")
)

# Load the training split
train_df = pl.read_csv(train_path)

# Work out the new header, one output name per input column:
#   "timestamp_(min)"  -> "timestamp"
#   "feature_<X>"      -> "value_<X>"
#   anything else      -> kept as is
old_cols = train_df.columns
new_cols = [
    "timestamp" if name == "timestamp_(min)"
    else name.replace("feature_", "value_") if name.startswith("feature_")
    else name
    for name in old_cols
]

# rename() takes an {old name: new name} mapping
train_df = train_df.rename(dict(zip(old_cols, new_cols)))

# One print call, two lines of output: shape first, then the column list
print(
    f"Train data shape: {train_df.shape}",
    f"Columns: {train_df.columns}",
    sep="\n",
)
train_df.head()

Train data shape: (132481, 26)
Columns: ['timestamp', 'value_0', 'value_1', 'value_2', 'value_3', 'value_4', 'value_5', 'value_6', 'value_7', 'value_8', 'value_9', 'value_10', 'value_11', 'value_12', 'value_13', 'value_14', 'value_15', 'value_16', 'value_17', 'value_18', 'value_19', 'value_20', 'value_21', 'value_22', 'value_23', 'value_24']


timestamp,value_0,value_1,value_2,value_3,value_4,value_5,value_6,value_7,value_8,value_9,value_10,value_11,value_12,value_13,value_14,value_15,value_16,value_17,value_18,value_19,value_20,value_21,value_22,value_23,value_24
f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
0.0,0.732689,0.761748,0.606848,0.488746,0.42431,0.403609,0.519318,0.398792,0.451453,0.447077,0.463336,0.487324,0.151929,0.138458,0.201467,0.318797,0.451856,0.5715,0.469717,0.609883,0.008432,0.0,0.481838,0.006536,0.138249
1.0,0.732799,0.761855,0.607133,0.488781,0.432008,0.410256,0.511364,0.402568,0.455657,0.449474,0.459267,0.494656,0.151487,0.138011,0.20211,0.321463,0.456123,0.562226,0.466533,0.629812,0.008432,0.0,0.477218,0.006536,0.115207
2.0,0.732938,0.761594,0.606895,0.488791,0.418858,0.407724,0.488636,0.396526,0.456104,0.451282,0.471587,0.490333,0.15367,0.140763,0.203354,0.347219,0.456692,0.572002,0.487845,0.643598,0.006745,0.0,0.492623,0.008715,0.092166
3.0,0.732893,0.761656,0.606478,0.488802,0.417896,0.404242,0.5,0.405589,0.46002,0.456628,0.47691,0.480858,0.153426,0.141215,0.201345,0.361904,0.460532,0.563354,0.479512,0.64469,0.008432,0.0,0.457064,0.008715,0.142857
4.0,0.732788,0.761573,0.606777,0.4888,0.421103,0.407407,0.511364,0.399547,0.458507,0.454611,0.451032,0.458795,0.153335,0.139718,0.203097,0.359767,0.458825,0.563354,0.448298,0.629948,0.006745,0.0,0.472223,0.006536,0.170507


In [2]:
# Load the test split
test_df = pl.read_csv(test_path)

# Give the test frame the same header treatment as the training frame:
#   "timestamp_(min)"  -> "timestamp"
#   "feature_<X>"      -> "value_<X>"
#   anything else      -> kept as is
old_cols = test_df.columns
new_cols = [
    "timestamp" if name == "timestamp_(min)"
    else name.replace("feature_", "value_") if name.startswith("feature_")
    else name
    for name in old_cols
]

# rename() takes an {old name: new name} mapping
test_df = test_df.rename(dict(zip(old_cols, new_cols)))

# One print call, two lines of output: shape first, then the column list
print(
    f"Test data shape: {test_df.shape}",
    f"Columns: {test_df.columns}",
    sep="\n",
)
test_df.head()

Test data shape: (87841, 26)
Columns: ['timestamp', 'value_0', 'value_1', 'value_2', 'value_3', 'value_4', 'value_5', 'value_6', 'value_7', 'value_8', 'value_9', 'value_10', 'value_11', 'value_12', 'value_13', 'value_14', 'value_15', 'value_16', 'value_17', 'value_18', 'value_19', 'value_20', 'value_21', 'value_22', 'value_23', 'value_24']


timestamp,value_0,value_1,value_2,value_3,value_4,value_5,value_6,value_7,value_8,value_9,value_10,value_11,value_12,value_13,value_14,value_15,value_16,value_17,value_18,value_19,value_20,value_21,value_22,value_23,value_24
f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
132480.0,0.775374,0.909185,0.606704,0.660626,0.449968,0.426717,0.471591,0.434668,0.479511,0.478078,0.465794,0.516738,0.157978,0.148025,0.207122,0.400617,0.480444,0.58867,0.404036,0.638957,0.020236,0.042705,0.173375,0.008715,0.105991
132481.0,0.775423,0.909142,0.607796,0.660655,0.487813,0.442545,0.505682,0.44864,0.499451,0.494367,0.488511,0.55307,0.158889,0.146784,0.208011,0.396902,0.500213,0.587041,0.421605,0.637865,0.01855,0.042705,0.191826,0.010893,0.110599
132482.0,0.775458,0.909004,0.607988,0.660623,0.469532,0.433682,0.477273,0.450906,0.487223,0.484878,0.46819,0.540978,0.15847,0.145529,0.209907,0.38927,0.487982,0.582404,0.407393,0.636091,0.01855,0.039146,0.188361,0.010893,0.115207
132483.0,0.775513,0.9092,0.607218,0.66064,0.45991,0.435581,0.482955,0.454683,0.490431,0.488316,0.477134,0.545906,0.162539,0.151759,0.212695,0.40378,0.491253,0.58466,0.407141,0.638138,0.020236,0.042705,0.186975,0.013072,0.110599
132484.0,0.775587,0.909318,0.606654,0.660643,0.462155,0.436214,0.477273,0.449396,0.494671,0.487094,0.479202,0.528634,0.157573,0.147655,0.206567,0.403636,0.495947,0.581527,0.402761,0.632678,0.021922,0.042705,0.156325,0.008715,0.110599


In [3]:
# Load the labels that accompany the test split
# NOTE(review): no renaming is applied here, so this frame keeps the
# original "timestamp_(min)" header while train_df / test_df use
# "timestamp" -- confirm that later joins or comparisons expect this.
test_label_df = pl.read_csv(test_label_path)

# One print call, two lines of output: shape first, then the column list
print(
    f"Test label shape: {test_label_df.shape}",
    f"Columns: {test_label_df.columns}",
    sep="\n",
)
test_label_df.head()

Test label shape: (87841, 2)
Columns: ['timestamp_(min)', 'label']


timestamp_(min),label
f64,i64
132480.0,0
132481.0,0
132482.0,0
132483.0,0
132484.0,0
