In [3]:
import torch
from torch.utils.data import Dataset, DataLoader
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Custom DataLoader for Numerical Data (from Pandas)

To load numerical data (e.g., from a CSV file or a Pandas DataFrame), we create a custom dataset class that extends torch.utils.data.Dataset. This class should implement three key methods:

* `__init__`: Initializes the dataset.
* `__len__`: Returns the size of the dataset.
* `__getitem__`: Retrieves a sample from the dataset.

In [8]:
class NumericalDataset(Dataset):
    """Map-style dataset pairing numerical feature rows with integer class labels.

    Features are stored as a ``float32`` tensor and labels as a ``long`` tensor
    (the dtype used for class indices in classification).
    """

    def __init__(self, data, labels):
        """
        Args:
            data (pd.DataFrame, pd.Series, np.array or sequence): Features
                (independent variables), one entry per sample.
            labels (pd.Series, pd.DataFrame, np.array or sequence): Labels
                (dependent variables), one entry per sample.

        Raises:
            ValueError: If ``data`` and ``labels`` hold a different number of samples.
        """
        # Unwrap any pandas container (DataFrame or Series, for either argument)
        # into a NumPy array first, so tensor construction works on plain values
        # and never depends on the pandas index, which is typically shuffled
        # after a train/test split.
        pandas_containers = (pd.DataFrame, pd.Series)
        if isinstance(data, pandas_containers):
            data = data.to_numpy()
        if isinstance(labels, pandas_containers):
            labels = labels.to_numpy()

        self.data = torch.tensor(data, dtype=torch.float32)  # Features as float32
        self.labels = torch.tensor(labels, dtype=torch.long)  # Labels as int64 (classification)

        # Fail fast: a mismatch would otherwise only surface later as an
        # IndexError in __getitem__, or go unnoticed if there are extra labels.
        if len(self.data) != len(self.labels):
            raise ValueError(
                f"data and labels must contain the same number of samples, "
                f"got {len(self.data)} and {len(self.labels)}"
            )

    def __len__(self):
        """Return the total number of samples."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the ``(features, label)`` pair stored at position ``idx``."""
        return self.data[idx], self.labels[idx]


In [1]:
# Create dummy data and save it to a CSV file so it can be loaded through the custom PyTorch Dataset/DataLoader

import csv
import os
import pandas as pd

# Ten synthetic samples: four float features per row, kept separate from the
# integer class label (0, 1 or 2) until they are stitched together below.
feature_rows = [
    (0.374540, 0.950714, 0.731994, 0.598658),
    (0.156019, 0.155995, 0.058084, 0.866176),
    (0.601115, 0.708073, 0.020584, 0.969910),
    (0.832443, 0.212339, 0.181825, 0.183405),
    (0.304242, 0.524756, 0.431945, 0.291229),
    (0.611853, 0.139494, 0.292145, 0.366362),
    (0.456070, 0.785176, 0.199674, 0.514234),
    (0.592415, 0.046450, 0.607545, 0.170524),
    (0.065052, 0.948886, 0.965632, 0.808397),
    (0.304614, 0.097672, 0.684233, 0.440152),
]
target_labels = [0, 1, 0, 2, 1, 2, 1, 0, 2, 1]

# One list per CSV row: the four features followed by the label.
data_list = [[*features, target] for features, target in zip(feature_rows, target_labels)]

# The CSV is written into the current working directory.
cwd = os.getcwd()
csv_file = f'{cwd}/numerical_data_from_list.csv'

header = ['feature1', 'feature2', 'feature3', 'feature4', 'target']

# newline='' leaves line-ending handling to the csv module.
with open(csv_file, mode='w', newline='') as file:
    writer = csv.writer(file)
    # Header first, then the data rows, in a single call.
    writer.writerows([header, *data_list])

In [17]:
# Example numerical dataset: read the CSV, split 60/20/20, scale, and wrap in DataLoaders.
SEED = 42        # Shared by both splits and the training shuffle for reproducibility
BATCH_SIZE = 32  # Samples per batch; the last batch is smaller if the data runs out

cwd = os.getcwd()
csv_file = f'{cwd}/numerical_data_from_list.csv'

df = pd.read_csv(csv_file)  # Replace with your CSV file
X = df.drop('target', axis=1)  # Features
y = df['target']  # Labels

# Split the data: 60% train, then the remaining 40% is halved into validation and test
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.4, random_state=SEED)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=SEED)

# Normalize features using StandardScaler (optional).
# The scaler is fitted on the training split only, then reused for validation and test,
# so no statistics from held-out data influence the transform.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)
X_test = scaler.transform(X_test)

# Sanity check: features are now a NumPy array, labels are still a pandas Series
print(X_train[0, :])
print(type(X_train))
print(y_train.values)
print(type(y_train))

# Create Dataset objects
train_dataset = NumericalDataset(X_train, y_train)
val_dataset = NumericalDataset(X_val, y_val)
test_dataset = NumericalDataset(X_test, y_test)

# Create DataLoader objects.
# A dedicated, seeded generator makes the training shuffle order reproducible
# without touching PyTorch's global RNG state.
shuffle_generator = torch.Generator().manual_seed(SEED)
train_loader = DataLoader(
    dataset=train_dataset,
    batch_size=BATCH_SIZE,
    shuffle=True,
    generator=shuffle_generator,
)
val_loader = DataLoader(dataset=val_dataset, batch_size=BATCH_SIZE, shuffle=False)
test_loader = DataLoader(dataset=test_dataset, batch_size=BATCH_SIZE, shuffle=False)

# Example: Iterating through batches of the train_loader
for data_batch, labels_batch in train_loader:
    print(data_batch.shape, labels_batch.shape)


[ 0.41665375 -1.19766889  1.05757176 -0.94518329]
<class 'numpy.ndarray'>
[0 0 1 1 2 1]
<class 'pandas.core.series.Series'>
torch.Size([6, 4]) torch.Size([6])


# NumericalDataset

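This custom `Dataset` subclass defines how samples are stored and retrieved: here it converts the features and labels to tensors and returns one `(features, label)` pair per index. In general, a dataset class is also where you can read, preprocess, and handle data (e.g., loading, applying transformations); in this notebook the CSV reading and scaling are done before the data is passed in.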

# DataLoader

Manages batching, shuffling, and parallel processing for efficient training.