In [1]:
import numpy as np
import torch
from torchvision import datasets, transforms
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
import os

# Load MNIST
transform = transforms.ToTensor()
mnist = datasets.MNIST(root="./data", train=True, download=True, transform=transform)

# Select 50 zeros, 50 non-zeros
zeros = [(img, -1) for img, label in zip(mnist.data, mnist.targets) if label == 0][:50]
nonzeros = [(img, 1) for img, label in zip(mnist.data, mnist.targets) if label != 0][:50]

# Combine and normalize
images = [img.numpy().flatten() for img, _ in (zeros + nonzeros)]
labels = [label for _, label in (zeros + nonzeros)]
images = np.array(images).astype(np.float32) / 255.0
labels = np.array(labels)

# Apply PCA to retain 95% variance
pca = PCA(n_components=0.70)
images_pca = pca.fit_transform(images)
print(f"PCA reduced to {pca.n_components_} dimensions.")

# Split dataset
x_train, x_test, y_train, y_test = train_test_split(
    images_pca, labels, test_size=0.5, stratify=labels, random_state=42
)

# Save all in a single .npy file
data_dict = {
    "x_train": x_train,
    "y_train": y_train,
    "x_test": x_test,
    "y_test": y_test
}

np.save("data/zero_vs_nonzero.npy", data_dict)

print("Saved single file at: data/zero_vs_nonzero.npy")


100.0%


KeyboardInterrupt: 

In [5]:
import numpy as np
import torch
from torchvision import datasets, transforms
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
import os

# Load MNIST
transform = transforms.ToTensor()
mnist = datasets.MNIST(root="./data", train=True, download=True, transform=transform)

# Select 50 ones, 50 non-ones
ones = [(img, +1) for img, label in zip(mnist.data, mnist.targets) if label == 1][:50]
nonones = [(img, -1) for img, label in zip(mnist.data, mnist.targets) if label != 1][:50]

# Combine and normalize
images = [img.numpy().flatten() for img, _ in (ones + nonones)]
labels = [label for _, label in (ones + nonones)]
images = np.array(images).astype(np.float32) / 255.0
labels = np.array(labels)

# Apply PCA to retain 70% variance
pca = PCA(n_components=0.70)
images_pca = pca.fit_transform(images)
print(f"PCA reduced to {pca.n_components_} dimensions.")

# Split dataset 50/50
x_train, x_test, y_train, y_test = train_test_split(
    images_pca, labels, test_size=0.5, stratify=labels, random_state=42
)

# Save in single .npy file
os.makedirs("data", exist_ok=True)
data_dict = {
    "x_train": x_train,
    "y_train": y_train,
    "x_test": x_test,
    "y_test": y_test
}
np.save("data/one_vs_nonone.npy", data_dict)

print("Saved single file at: data/one_vs_nonone.npy")


PCA reduced to 14 dimensions.
Saved single file at: data/one_vs_nonone.npy


In [2]:
import pandas as pd

In [5]:
path = 'data/power_fault_monthly'
dirs = os.listdir(path)
dirs

['Apr_2023.csv',
 'Aug_2022.csv',
 'Dec_2022.csv',
 'Feb_2023.csv',
 'Jan_2023.csv',
 'Jul_2022.csv',
 'Jul_2023.csv',
 'Jun_2022.csv',
 'Jun_2023.csv',
 'Mar_2023.csv',
 'May_2022.csv',
 'May_2023.csv',
 'Nov_2022.csv',
 'Oct_2022.csv',
 'Sep_2022.csv']

In [10]:
df = pd.concat([pd.read_csv(os.path.join(path + '/' + file)) for file in dirs])

In [11]:
df

Unnamed: 0,Timestamp,Battery_Active_Power,Battery_Active_Power_Set_Response,PVPCS_Active_Power,GE_Body_Active_Power,GE_Active_Power,GE_Body_Active_Power_Set_Response,FC_Active_Power_FC_END_Set,FC_Active_Power,FC_Active_Power_FC_end_Set_Response,Island_mode_MCCB_Active_Power,MG-LV-MSB_AC_Voltage,Receiving_Point_AC_Voltage,Island_mode_MCCB_AC_Voltage,Island_mode_MCCB_Frequency,MG-LV-MSB_Frequency,Inlet_Temperature_of_Chilled_Water,Outlet_Temperature
0,2023/04/01 00:00:01,-0.1,0.0,0.0,110.0,87.000000,122.0,40.0,38.0,40.0,-123.0,488.0,486.0,488.0,60.040001,60.040001,15.100000,15.500000
1,2023/04/01 00:00:11,-0.3,0.0,0.0,118.0,120.000000,122.0,40.0,38.0,40.0,-87.0,488.0,486.0,488.0,60.040001,60.040001,15.100000,15.500000
2,2023/04/01 00:00:21,0.0,0.0,0.0,116.0,124.000000,122.0,40.0,38.0,40.0,-116.0,488.0,486.0,488.0,60.040001,60.040001,15.100000,15.500000
3,2023/04/01 00:00:31,-0.1,0.0,0.0,110.0,94.300003,122.0,40.0,38.0,40.0,-115.0,488.0,486.0,488.0,60.049999,60.049999,15.100000,15.500000
4,2023/04/01 00:00:41,0.0,0.0,0.0,116.0,116.000000,122.0,40.0,38.0,40.0,-128.0,488.0,486.0,488.0,60.049999,60.049999,15.100000,15.500000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
258979,2022/09/30 23:23:11,-0.1,0.0,0.0,0.0,-0.700000,130.0,40.0,37.0,40.0,-23.0,483.0,483.0,483.0,60.049999,60.049999,20.299999,17.700001
258980,2022/09/30 23:23:21,-0.4,0.0,0.0,0.0,-1.700000,130.0,40.0,37.0,40.0,-23.0,483.0,483.0,483.0,60.040001,60.049999,20.299999,17.700001
258981,2022/09/30 23:23:31,-0.1,0.0,0.0,0.0,-0.700000,130.0,40.0,37.0,40.0,-23.0,483.0,483.0,483.0,60.049999,60.049999,20.299999,17.799999
258982,2022/09/30 23:23:41,-0.2,0.0,0.0,0.0,-1.000000,130.0,40.0,37.0,40.0,-23.0,483.0,483.0,483.0,60.049999,60.049999,20.299999,17.700001


In [12]:
df.columns

Index(['Timestamp', 'Battery_Active_Power',
       'Battery_Active_Power_Set_Response', 'PVPCS_Active_Power',
       'GE_Body_Active_Power', 'GE_Active_Power',
       'GE_Body_Active_Power_Set_Response', 'FC_Active_Power_FC_END_Set',
       'FC_Active_Power', 'FC_Active_Power_FC_end_Set_Response',
       'Island_mode_MCCB_Active_Power', 'MG-LV-MSB_AC_Voltage',
       'Receiving_Point_AC_Voltage', 'Island_mode_MCCB_AC_Voltage',
       'Island_mode_MCCB_Frequency', 'MG-LV-MSB_Frequency',
       'Inlet_Temperature_of_Chilled_Water', 'Outlet_Temperature'],
      dtype='object')