<a href="https://colab.research.google.com/github/rslab-ntua/MSc_GBDA/blob/master/GBDA_LabML2_2024.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Download Indian Pines data, unzip
!gdown https://drive.google.com/uc?id=1XxBBah4J3wmSAMFq8lBFc06vGWFiy1TZ
!unzip GBDA2020_ML1.zip0.90

In [None]:
# Folder (relative to the working directory) that holds the dataset files
DATA_ROOT = "partB/"

# Human-readable class names; position i in the list is the name of class index i
CLASS_NAMES = [
    # 0 - 3
    "Alfalfa", "Corn-notill", "Corn-mintill", "Corn",
    # 4 - 7
    "Grass-pasture", "Grass-trees", "Grass-pasture-mown", "Hay-windrowed",
    # 8 - 11
    "Oats", "Soybean-notill", "Soybean-mintill", "Soybean-clean",
    # 12 - 15
    "Wheat", "Woods", "Buildings-Grass-Trees-Drives", "Stone-Steel-Towers",
]


In [None]:
import torch  # Importing PyTorch
from torch.utils.data import Dataset, DataLoader, random_split  # Importing necessary modules from PyTorch
from sklearn.preprocessing import StandardScaler  # Importing StandardScaler from scikit-learn for data preprocessing
import numpy as np  # Importing numpy for numerical operations
import os  # Importing os for operating system related operations
from copy import copy  # Importing copy() for creating shallow copies of objects

# Build a custom PyTorch Dataset-compatible class
class IndianPinesDataset(Dataset):
    '''
    PyTorch Dataset over the labelled pixels of the Indian Pines scene.

    Every sample is a single pixel: a float32 feature vector (one value per
    band) and a zero-based integer class label. Pixels whose ground-truth
    value is not greater than 0 are treated as invalid and dropped.
    '''

    def __init__(self, data_root, transforms=None):
        '''
        Constructor method for the IndianPinesDataset class.

        Parameters:
        - data_root (str): Root directory where the dataset is located.
        - transforms (list, optional): Callables of the form T(X, y) -> (X, y)
          that __getitem__ applies in order. Defaults to no transforms.
        '''
        super().__init__()
        # A None default avoids sharing one mutable list between instances; the
        # (shallow) copy keeps later edits of the caller's list from leaking in.
        self.transforms: list = [] if transforms is None else copy(transforms)
        self._build(data_root)

    def _build(self, data_root) -> None:
        '''
        Method to load and preprocess the dataset.

        Reads "indianpinearray.npy" (image) and "IPgt.npy" (ground truth) from
        data_root and stores the valid pixels in self.X / self.y.

        Parameters:
        - data_root (str): Root directory where the dataset is located.

        Returns:
        - None
        '''
        # Load the dataset
        # NOTE(review): assumes the image is (rows, cols, bands) and the ground
        # truth is (rows, cols) - confirm against the shipped .npy files.
        img = np.load(os.path.join(data_root, "indianpinearray.npy"))
        gt_img = np.load(os.path.join(data_root, "IPgt.npy"))

        # Create a mask to filter out invalid data points
        valid_mask = gt_img > 0

        # Take the band count from the data instead of hard-coding 200
        num_bands = img.shape[-1]

        # Preprocess the dataset
        self.X = img[valid_mask].reshape(-1, num_bands).astype(np.float32)
        # Subtract 1 so that labels start at 0; int64 is used explicitly because
        # plain `int` is platform dependent and the PyTorch losses need int64 targets
        self.y = gt_img[valid_mask].reshape(-1).astype(np.int64) - 1

    def apply_std_scaler(self, indices):
        '''
        Method to apply StandardScaler to the dataset.

        The scaler is fitted only on the rows selected by `indices` and then
        used to transform every row of self.X, which is replaced in place.
        Calling this method again re-scales the already scaled data.

        Parameters:
        - indices (list): List of indices to select samples for fitting the scaler.

        Returns:
        - None
        '''
        scaler = StandardScaler()
        scaler.fit(self.X[np.asarray(indices)])
        # Keep the features in float32 regardless of what the scaler returns
        self.X = scaler.transform(self.X).astype(np.float32, copy=False)

    def __getitem__(self, index):
        '''
        Method to retrieve dataset elements.

        Parameters:
        - index (int): Index of the dataset element to retrieve.

        Returns:
        - X (numpy.ndarray): Input data.
        - y (int): Target label.
        '''
        X, y = self.X[index], self.y[index]
        for T in self.transforms:
            X, y = T(X, y)
        return X, y

    def __len__(self) -> int:
        '''
        Method to retrieve the total number of samples in the dataset.

        Returns:
        - int: Total number of samples in the dataset.
        '''
        return len(self.X)


In [None]:
# Build the dataset from the files found under DATA_ROOT
dset = IndianPinesDataset(DATA_ROOT)
print("Samples in dataset: ", len(dset))

# 70 % of the samples go to training, the remainder to validation;
# the generator is seeded so that the split is reproducible
n_train = int(0.7*len(dset))
n_val = len(dset) - n_train
split_rng = torch.Generator().manual_seed(2022)
train_dset, val_dset = random_split(dset, [n_train, n_val], generator=split_rng)

# Fit the scaler on the training indices only, and show its effect on the
# first validation sample (the subsets read from `dset`, so they see the scaled data)
print("Max value for the first sample in 'val' (before scaling): ", val_dset[0][0].max())
dset.apply_std_scaler(train_dset.indices)
print("Max value for the first sample in 'val' (after scaling)\t: ", val_dset[0][0].max())

# Dataloaders take care of batching and tensor conversion;
# only the training data is shuffled
BATCH_SIZE = 64
train_dloader = DataLoader(train_dset, batch_size=BATCH_SIZE, shuffle=True)
val_dloader = DataLoader(val_dset, batch_size=BATCH_SIZE, shuffle=False)

# Look at the first training batch only, then stop iterating
for X, y in train_dloader:
    print(f"Sample's X type: {type(X)}, dtype: {X.dtype}, shape: {X.size()}")
    print(f"Sample's y type: {type(y)}, dtype: {y.dtype}, shape: {y.size()}")
    break


In [None]:
from torch import nn  # Importing necessary module from PyTorch

class MLP(nn.Module):
    '''
    Fully connected classifier: Linear + ReLU per hidden layer, followed by a
    final Linear layer that outputs one raw score (logit) per class.
    '''

    def __init__(self, in_features: int, num_classes: int, hidden_sizes=(512, 256, 256)):
        '''
        Constructor method for the MLP class.

        Parameters:
        - in_features (int): Number of input features.
        - num_classes (int): Number of classes for classification.
        - hidden_sizes (sequence of int): Width of each hidden layer, in order.
          The default (512, 256, 256) gives the same layers, in the same
          positions of `self.model`, as the previously hard-coded network.
          An empty sequence yields a single Linear layer.
        '''
        super().__init__()

        # Define the neural network architecture
        layers = []
        prev_width = in_features
        for width in hidden_sizes:
            layers.append(nn.Linear(prev_width, width))
            layers.append(nn.ReLU())
            prev_width = width
        # No activation after the last layer: the model returns logits
        layers.append(nn.Linear(prev_width, num_classes))
        self.model = nn.Sequential(*layers)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        '''
        Forward-pass method.

        Parameters:
        - x (torch.Tensor): Input tensor whose last dimension is in_features.

        Returns:
        - torch.Tensor: Output tensor whose last dimension is num_classes.
        '''
        return self.model(x)

# Build the network: 200 input features in, 16 class scores out
model = MLP(200, 16)

# Wrap the model with torch.compile (available from PyTorch 2) for faster training & inference
compiled_model = torch.compile(model)

# Sanity check: push the inputs of the first validation batch through the model
# and report the shape of what comes out
first_val_inputs = next(iter(val_dloader))[0]
print("MLP's output shape: ", compiled_model(first_val_inputs).size())


In [None]:
from torch.optim import Adam  # Importing Adam optimizer from PyTorch
from torch.nn import functional as F  # Importing functional module from PyTorch for loss functions

LEARNING_RATE = 1e-4  # Setting the learning rate for optimization
NUM_EPOCHS = 100  # Setting the number of epochs for training

# Use the GPU when one is available, otherwise fall back to the CPU so that
# the cell does not crash on CPU-only runtimes
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
compiled_model = compiled_model.to(device)

# Define an optimizer
optimizer = Adam(compiled_model.parameters(), lr=LEARNING_RATE)

# Per-epoch history: mean training loss, mean validation loss, validation overall accuracy
train_losses = []
val_losses = []
val_oa = []

# Training loop
for e in range(NUM_EPOCHS):

    # ---- Train for one epoch ----
    loss_sum = 0.0
    num_seen = 0
    compiled_model.train()
    for X, y in train_dloader:
        X = X.to(device)
        y = y.to(device)

        # Clear gradients
        optimizer.zero_grad()

        # Infer with the model (raw class scores)
        preds = compiled_model(X)

        # Cross-entropy on the logits; same quantity as nll_loss(log_softmax(...))
        # and the same loss function that the validation step uses
        loss = F.cross_entropy(preds, y)

        # Back-propagation
        loss.backward()

        # Step optimizer
        optimizer.step()

        # Weight the batch mean by the batch size so that a shorter last batch
        # does not count as much as a full one
        loss_sum += loss.item() * len(y)
        num_seen += len(y)
    train_losses.append(loss_sum / num_seen)

    # ---- Validation step ----
    correct = 0
    total = 0
    loss_sum = 0.0

    compiled_model.eval()
    # No gradients are needed for anything computed during validation
    with torch.no_grad():
        for X, y in val_dloader:
            X = X.to(device)
            y = y.to(device)

            # Infer with the model
            preds = compiled_model(X)

            # Sum (not mean) of the per-sample losses; divided by the sample count below
            loss_sum += F.cross_entropy(preds, y, reduction='sum').item()
            correct += (torch.argmax(preds, dim=-1) == y).sum().item()
            total += len(y)

    # Append per-sample mean validation loss and overall accuracy
    val_losses.append(loss_sum / total)
    val_oa.append(correct / total)

    # Print progress every 10 epochs
    if (e+1) % 10 == 0:
        print("Epoch ", e+1)
        print("Total training loss: ", train_losses[-1])
        print("Total validation loss: ", val_losses[-1])
        print("Overall accuracy: ", val_oa[-1])


In [None]:
from matplotlib import pyplot as plt  # Importing matplotlib for plotting

# One x-axis shared by every curve: epochs numbered from 1 to NUM_EPOCHS
epoch_axis = range(1, NUM_EPOCHS+1)

# Figure 1: training and validation loss per epoch
plt.figure()
loss_curves = (
    (train_losses, '-r', 'Training Loss'),
    (val_losses, '-g', 'Validation Loss'),
)
for values, line_fmt, curve_label in loss_curves:
    plt.plot(epoch_axis, values, line_fmt, label=curve_label)
plt.title("Loss v epochs")
plt.xlabel("Epochs")
plt.ylabel("Loss")
plt.legend()

# Figure 2: validation overall accuracy per epoch
plt.figure()
plt.plot(epoch_axis, val_oa, '-g')
plt.title("Validation Overall Accuracy v epochs")
plt.xlabel("Epochs")
plt.ylabel("Validation Overall Accuracy")


In [None]:
from sklearn.metrics import classification_report, ConfusionMatrixDisplay, confusion_matrix  # Importing necessary functions from scikit-learn
import numpy as np  # Importing numpy for numerical operations

# Lists to store per-batch predictions and targets
predictions_list = []
targets_list = []

# Evaluation mode for the model
compiled_model.eval()

# Run inference on whichever device the model's parameters live on,
# instead of assuming that a GPU is present
device = next(compiled_model.parameters()).device

# Iterate over validation dataloader; no gradients are needed for inference
with torch.no_grad():
    for X, y in val_dloader:
        targets_list.append(y.numpy())

        # Infer with the model
        preds = compiled_model(X.to(device))

        # Predicted class = index of the highest score; store as a numpy array
        predictions_list.append(torch.argmax(preds, dim=-1).cpu().numpy())

# Concatenate predictions and targets
predictions = np.concatenate(predictions_list, axis=0)
targets = np.concatenate(targets_list, axis=0)

# Fix the label set to all known classes so that the matrix and the report
# always line up with CLASS_NAMES, even when a class is missing from the
# validation split and is never predicted
labels = np.arange(len(CLASS_NAMES))

# Compute confusion matrix (rows: true class, columns: predicted class)
cM = confusion_matrix(targets, predictions, labels=labels)

# Normalize confusion matrix to precision metric (each column divided by its sum);
# columns of never-predicted classes stay at 0 instead of dividing by zero
col_sums = cM.sum(axis=0, keepdims=True)
cm_prec = np.divide(cM, col_sums, out=np.zeros(cM.shape, dtype=float), where=col_sums != 0)

# Normalize confusion matrix to recall metric (each row divided by its sum);
# rows of classes without validation samples stay at 0
row_sums = cM.sum(axis=1, keepdims=True)
cm_rec = np.divide(cM, row_sums, out=np.zeros(cM.shape, dtype=float), where=row_sums != 0)

# Display confusion matrix
disp = ConfusionMatrixDisplay(confusion_matrix=cM, display_labels=CLASS_NAMES)
plt.figure(figsize=(20,20), dpi=100)
ax = plt.axes()
disp.plot(ax=ax)
plt.xticks(rotation=90)
plt.show()

# Print classification report
print(classification_report(targets, predictions, labels=labels, target_names=CLASS_NAMES))
