## Starter code for single-point prediction with CNNs

In [2]:
import os
import glob
import joblib

# common math imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# common torch imports
import torch
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

# common sklearn imports 
from sklearn.preprocessing import LabelEncoder, StandardScaler

## Load and process data

In [3]:
main_path = 'data'
train_folders = [f'{main_path}/train',
                 # f'{main_path}/rain-sounds', f'{main_path}/colored-noise'
]
test_folder = f'{main_path}/val'


def _load_xy(folders):
    """Load and stack every paired ``X_*.npy`` / ``Y_*.npy`` file in *folders*.

    Within each folder the X files and Y files are sorted by name and paired
    positionally, so the i-th X file is matched with the i-th Y file.

    Args:
        folders: Iterable of directory paths to scan.

    Returns:
        Tuple ``(X, Y)`` where X is ``np.vstack`` of all X arrays and Y is
        ``np.concatenate`` of all Y arrays, in folder order then file order.

    Raises:
        ValueError: If a folder holds a different number of X and Y files.
        FileNotFoundError: If no matching files are found in any folder.
    """
    X_parts = []
    Y_parts = []

    for folder in folders:
        # find files like X_1000.npy, X_2000.npy, etc.
        X_files = sorted(glob.glob(os.path.join(folder, "X_*.npy")))
        Y_files = sorted(glob.glob(os.path.join(folder, "Y_*.npy")))

        # zip() stops at the shorter list, which would silently drop or
        # mis-pair samples and labels; fail loudly instead.
        if len(X_files) != len(Y_files):
            raise ValueError(
                f"{folder}: found {len(X_files)} X files but {len(Y_files)} Y files"
            )

        for xf, yf in zip(X_files, Y_files):
            X_parts.append(np.load(xf))
            Y_parts.append(np.load(yf))

    if not X_parts:
        raise FileNotFoundError(f"No X_*.npy / Y_*.npy files found in {list(folders)}")

    # Stack into arrays
    return np.vstack(X_parts), np.concatenate(Y_parts)


# ---------- TRAINING DATA ----------
X_train, Y_train = _load_xy(train_folders)


# ---------- VALIDATION / TEST DATA ----------
X_test, Y_test = _load_xy([test_folder])


In [4]:
# --- SCALE THE DATA ---
# X_train must be 3-D for this unpacking to work. BA (axis 0) is used below as
# the number of samples; FR and TI are the two per-sample axes
# (presumably frequency bins and time steps -- TODO confirm).
BA, FR, TI = X_train.shape
# Single scaler instance: fitted incrementally on the training data and then
# reused to transform both the training and the test data.
scaler = StandardScaler()

# Wrap it in a Dataset
class NumpyDataset(Dataset):
    """Expose an in-memory, indexable array-like as a map-style ``Dataset``.

    This lets a ``DataLoader`` draw mini-batches from data that is already
    loaded, without any per-item processing.
    """

    def __init__(self, X):
        """Keep a reference to *X*; the data is not copied."""
        self.X = X

    def __getitem__(self, idx):
        """Return ``X[idx]`` exactly as stored (no conversion is done here)."""
        sample = self.X[idx]
        return sample

    def __len__(self):
        """Return the number of items, i.e. ``len(X)``."""
        n_items = len(self.X)
        return n_items

# training data

# StandardScaler works on 2-D (n_samples, n_features) input, so flatten each
# sample's (FR, TI) grid into a single row of FR * TI features.
X_train_2d = X_train.reshape(BA, FR * TI)
del X_train

dataset = NumpyDataset(X_train_2d)
# shuffle=False: the running mean/variance does not depend on row order, and a
# sequential loader yields rows in the same order as Y_train if it is reused.
loader = DataLoader(dataset, batch_size=1000, shuffle=False)
itr = 0
for itr, batch in enumerate(loader, start=1):
    print(itr)
    # The DataLoader collates samples into a torch tensor; hand StandardScaler
    # a plain NumPy array instead of relying on implicit conversion.
    scaler.partial_fit(np.asarray(batch))
print('Fit the standard scaler')

# --- SAVE SCALER ---
scaler_path = os.path.join(main_path, "standard_scaler.joblib")
joblib.dump(scaler, scaler_path)
print(f"Saved scaler to {scaler_path}")

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
Fit the standard scaler
Saved scaler to data/standard_scaler.joblib


In [None]:
# --- SCALING ---
def _transform_in_chunks(fitted_scaler, X_2d, chunk_size=1000, *, verbose=False):
    """Apply ``fitted_scaler.transform`` to *X_2d* chunk by chunk, in row order.

    Rows are visited sequentially, so row ``i`` of the result corresponds to
    row ``i`` of the input (and therefore to label ``i``).

    Args:
        fitted_scaler: Object exposing ``transform(array_2d)``; already fitted.
        X_2d: 2-D array of shape ``(n_samples, n_features)``.
        chunk_size: Number of rows transformed per call.
        verbose: If true, print the 1-based chunk number as progress output.

    Returns:
        2-D array with the same number of rows as *X_2d*.

    Raises:
        ValueError: If *X_2d* has no rows (nothing to concatenate).
    """
    scaled_chunks = []
    for chunk_no, start in enumerate(range(0, len(X_2d), chunk_size), start=1):
        if verbose:
            print(chunk_no)
        scaled_chunks.append(fitted_scaler.transform(X_2d[start:start + chunk_size]))
    # np.concatenate (rather than np.array) copes with a shorter final chunk
    # when the row count is not a multiple of chunk_size.
    return np.concatenate(scaled_chunks, axis=0)


# training data (X_train_2d is the (BA, FR * TI) array the scaler was fitted on)
X_train_scaled = _transform_in_chunks(scaler, X_train_2d, verbose=True).reshape(BA, FR, TI)
# Insert a singleton axis at position 1 -> (BA, 1, FR, TI); presumably the
# channel axis expected by the CNN.
X_train = X_train_scaled[:, np.newaxis, :, :]
del X_train_scaled
print('Transformed the training data')

# testing data

# Use the test set's own sample count: BA is the number of *training* samples
# and only matches by coincidence. FR * TI must equal the feature count the
# scaler was fitted with, so those are reused on purpose.
n_test = X_test.shape[0]
X_test_2d = X_test.reshape(n_test, FR * TI)
del X_test

X_test_scaled = _transform_in_chunks(scaler, X_test_2d).reshape(n_test, FR, TI)
X_test = X_test_scaled[:, np.newaxis, :, :]
del X_test_scaled
print('Transformed the testing data')

In [None]:
# --- TO NUMPY WITH 16-bit PRECISION ---
# Cast one array at a time so each full-precision copy can be released before
# the next conversion starts.
feature_dtype = np.float16
label_dtype = np.int16
X_train = X_train.astype(feature_dtype)
X_test = X_test.astype(feature_dtype)
Y_train = Y_train.astype(label_dtype)
Y_test = Y_test.astype(label_dtype)

# (name, array) pairs in the order they are written and reported
named_arrays = (
    ("X_train", X_train),
    ("Y_train", Y_train),
    ("X_test", X_test),
    ("Y_test", Y_test),
)

# --- SAVE NUMPY ARRAYS ---
for array_name, array in named_arrays:
    np.save(os.path.join(main_path, f"{array_name}.npy"), array)

# --- SAVE SCALER ---
scaler_path = os.path.join(main_path, "standard_scaler.joblib")
joblib.dump(scaler, scaler_path)
print(f"Saved scaler to {scaler_path}")


# --- PRINT SHAPES ---
for array_name, array in named_arrays:
    print(f"{array_name} shape: {array.shape}")

In [None]:
# Sanity check: display one scaled training sample as an image
idx = 0
# [idx, 0] selects sample `idx` at index 0 of axis 1, leaving the last two axes
plt.imshow(X_train[idx, 0])