# processing data

We are going to prepare and process the data for modeling:

- read tensors
- do normalization
- resampling
- split data


In [36]:
import os
import pandas as pd
import librosa
import numpy as np
import pickle

import torch
from torch.utils.data import DataLoader, TensorDataset

## read data 

In [7]:
# Directory that holds the saved NumPy arrays.
data_path = '../data/model_data'

# Load the feature array (X) and the label array (y) from their .npy files.
X, y = (
    np.load(os.path.join(data_path, file_name))
    for file_name in ('X_train.npy', 'y_train.npy')
)



## Imbalance treatment

### 1) Resampling the minority class

In [22]:

# Balance the dataset by oversampling: rows with label 1 (treated as the
# minority class) are duplicated until there are as many of them as rows with
# label 0 (treated as the majority class).

# Seeded generator so the oversampled dataset is identical on every run.
rng = np.random.default_rng(42)

# Separate sample indices by class
minority_class_indices = np.where(y == 1)[0]
majority_class_indices = np.where(y == 0)[0]

num_minority = len(minority_class_indices)
num_majority = len(majority_class_indices)

# Fail with a clear message instead of a ZeroDivisionError in divmod below.
if num_minority == 0:
    raise ValueError("Cannot oversample: y contains no samples with label 1")

# Number of full copies of the minority set, plus the number of extra samples
# needed to reach exactly num_majority minority rows.
duplication_factor, remaining_samples = divmod(num_majority, num_minority)

# Oversample the minority class by duplicating its indices
oversampled_minority_indices = np.hstack([
    np.tile(minority_class_indices, duplication_factor),  # Repeat full sets
    # Top up with distinct extras; remaining_samples < num_minority, so
    # sampling without replacement is always possible.
    rng.choice(minority_class_indices, remaining_samples, replace=False),
])

# Combine with the majority indices to create a balanced index set
balanced_indices = np.hstack([majority_class_indices, oversampled_minority_indices])

# Shuffle in place so the classes are mixed
rng.shuffle(balanced_indices)

# balanced_indices[i] is the row of X / y that row i of the balanced arrays
# was copied from.
X_balanced = X[balanced_indices]
y_balanced = y[balanced_indices]

In [29]:
# Sanity check: shape of the label array after balancing.
y_balanced.shape

(5322,)

## Split, normalize, and make tensors


In [30]:
# train_test_split stays imported in case other cells rely on it.
from sklearn.model_selection import GroupShuffleSplit, train_test_split

# X_balanced contains exact duplicates of the oversampled minority rows. A plain
# random split would put copies of the same original sample into both the
# training and the test set, which leaks test data into training and inflates
# the test metrics. Grouping rows by their original index (balanced_indices)
# keeps every copy of a sample on the same side of the split.
# NOTE: test_size is the fraction of original samples (groups) held out, so
# the row-level split is only approximately 80-20.
group_splitter = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
train_rows, test_rows = next(
    group_splitter.split(X_balanced, y_balanced, groups=balanced_indices)
)

# X_train, X_test, y_train, y_test are the split datasets used below.
X_train, X_test = X_balanced[train_rows], X_balanced[test_rows]
y_train, y_test = y_balanced[train_rows], y_balanced[test_rows]


In [31]:
# Normalize features (zero mean, unit variance). The statistics are computed
# on the training split only and then applied to both splits.
mean = np.mean(X_train, axis=0)
std = np.std(X_train, axis=0)

# A feature that is constant across the training split has std == 0, and
# dividing by it would produce NaN/inf values. Use 1 at those positions so
# such features are only centered, not scaled.
std = np.where(std == 0, 1.0, std)

# Normalize the training and test data
X_train_normalized = (X_train - mean) / std
X_test_normalized = (X_test - mean) / std


In [33]:
# Build float32 PyTorch tensors from the normalized feature arrays.
X_train_tensor, X_test_tensor = (
    torch.tensor(features, dtype=torch.float32)
    for features in (X_train_normalized, X_test_normalized)
)

# Labels become float32 tensors as well, with an extra axis inserted at dim 1.
y_train_tensor, y_test_tensor = (
    torch.unsqueeze(torch.tensor(labels, dtype=torch.float32), dim=1)
    for labels in (y_train, y_test)
)

## Dataset and data loader

In [37]:
# Pair each feature tensor with its label tensor.
train_dataset_balanced, test_dataset = (
    TensorDataset(features, labels)
    for features, labels in (
        (X_train_tensor, y_train_tensor),
        (X_test_tensor, y_test_tensor),
    )
)

# Batches of 32; only the training loader shuffles its samples.
train_loader = DataLoader(dataset=train_dataset_balanced, shuffle=True, batch_size=32)
test_loader = DataLoader(dataset=test_dataset, shuffle=False, batch_size=32)

### save data as tensors

In [39]:

output_path = '../data/tensors/'

# Create the output directory if it is missing so torch.save does not fail
# with FileNotFoundError.
os.makedirs(output_path, exist_ok=True)

# Save each split as a (features, labels) tuple of tensors.
torch.save((X_train_tensor, y_train_tensor), os.path.join(output_path, 'train_data.pt'))
torch.save((X_test_tensor, y_test_tensor), os.path.join(output_path, 'test_data.pt'))

# Save the normalization parameters (mean and std) so the same scaling can be
# applied to new data.
# NOTE(review): mean and std are NumPy arrays, so this file is a pickle of
# non-tensor objects; recent PyTorch versions may need
# torch.load(..., weights_only=False) to read it back. Confirm against the
# loading code.
torch.save((mean, std), os.path.join(output_path, 'normalization_params.pt'))