# EEE4113F 2024 Machine Learning Project

## Necessary imports and setup

In [1]:
%matplotlib inline
import torch
import torch.nn as nn
from torchvision import transforms

import pandas as pd
from torchvision import io
from torch.utils.data import Dataset, DataLoader
import torch.optim as optim
from torch import flatten

import os

from PIL import Image

## Define classes

In [2]:
# Instrument labels. A label's position in this tuple is the integer class id
# produced by the dataset (via ``classes.index``).
classes = (
    "Snare",
    "Trumpet",
    "Violin",
)

## Create dataset class


In [3]:


class InstrumentsDataset(Dataset):
    """Instruments dataset.

    Samples are described by a CSV file that is read positionally: column 0
    holds the image path (joined onto ``root_dir``) and column 1 holds the
    instrument name, which must be an entry of the module-level ``classes``
    tuple.
    """

    def __init__(self, csv_file, root_dir, transform=None):
        """
        Arguments:
            csv_file (string): Path to the csv file with annotations.
            root_dir (string): Directory with all the images.
            transform (callable, optional): Optional transform to be applied
                on a sample.
        """
        self.instrument_frame = pd.read_csv(csv_file)
        self.root_dir = root_dir
        self.transform = transform

    def __len__(self):
        """Return the number of annotated samples (rows of the CSV file)."""
        return len(self.instrument_frame)

    def __getitem__(self, idx):
        """Load one sample.

        Arguments:
            idx (int or tensor): Row position of the sample in the CSV file.

        Returns:
            tuple: ``(image, instrument_id)`` where ``image`` is the RGB image
            (after ``transform``, if one was given) and ``instrument_id`` is
            the index of the sample's instrument name in ``classes``.

        Raises:
            ValueError: If the instrument name is not listed in ``classes``.
        """
        if torch.is_tensor(idx):
            idx = idx.tolist()

        img_name = os.path.join(self.root_dir,
                                self.instrument_frame.iloc[idx, 0])

        # Open inside a context manager so the underlying file handle is
        # released deterministically. convert() returns a new, fully loaded
        # image, so it stays usable after the file is closed.
        with Image.open(img_name) as raw_image:
            image = raw_image.convert('RGB')

        if self.transform:
            image = self.transform(image)

        instrument = self.instrument_frame.iloc[idx, 1]
        instrument_id = classes.index(instrument)

        return image, instrument_id


##  CNN models

In [4]:
class SimpleModel(nn.Module):
    """Small CNN classifier: one convolution followed by two linear layers.

    The forward pass returns log-probabilities (``LogSoftmax`` over dim 1).
    """

    def __init__(self, numChannels, numClasses):
        """
        Arguments:
            numChannels (int): Number of channels of the input images.
            numClasses (int): Number of output classes.
        """
        super().__init__()

        # Convolutional stage: 6 feature maps from a 1x4 kernel.
        self.conv1 = nn.Conv2d(numChannels, 6, kernel_size=(1, 4))
        self.relu1 = nn.ReLU()
        # Registered but currently NOT applied in forward().
        self.maxpool1 = nn.MaxPool2d(4, 4)

        # Fully connected stage; LazyLinear infers its input size on the
        # first forward call.
        self.fc1 = nn.LazyLinear(120)
        self.relu3 = nn.ReLU()
        self.fc2 = nn.LazyLinear(numClasses)
        self.logsoftmax = nn.LogSoftmax(dim=1)

    def forward(self, x):
        """Map a batch of images to per-class log-probabilities."""
        feature_maps = self.relu1(self.conv1(x))

        # Collapse everything except the batch dimension for the linear layers.
        flat = flatten(feature_maps, start_dim=1, end_dim=-1)

        hidden = self.relu3(self.fc1(flat))
        return self.logsoftmax(self.fc2(hidden))

class ComplexModel(nn.Module):
    """CNN classifier with three conv/ReLU/max-pool stages and three linear layers.

    The forward pass returns log-probabilities (``LogSoftmax`` over dim 1).
    """

    def __init__(self, numChannels, numClasses):
        """
        Arguments:
            numChannels (int): Number of channels of the input images.
            numClasses (int): Number of output classes.
        """
        super().__init__()

        # Convolutional stage 1: numChannels -> 6 feature maps, 3x3 kernel.
        self.conv1 = nn.Conv2d(numChannels, 6, 3)
        self.relu1 = nn.ReLU()
        self.maxpool1 = nn.MaxPool2d(2, 2)

        # Convolutional stage 2: 6 -> 16 feature maps.
        self.conv2 = nn.Conv2d(6, 16, 3)
        self.relu2 = nn.ReLU()
        self.maxpool2 = nn.MaxPool2d(2, 2)

        # Convolutional stage 3: 16 -> 32 feature maps.
        self.conv3 = nn.Conv2d(16, 32, 3)
        self.relu3 = nn.ReLU()
        self.maxpool3 = nn.MaxPool2d(2, 2)

        # Classifier head; each LazyLinear infers its input size on the
        # first forward call.
        self.fc1 = nn.LazyLinear(120)
        self.relu4 = nn.ReLU()

        self.fc2 = nn.LazyLinear(50)
        self.relu5 = nn.ReLU()

        self.fc3 = nn.LazyLinear(numClasses)
        self.logsoftmax = nn.LogSoftmax(dim=1)

    def forward(self, x):
        """Map a batch of images to per-class log-probabilities."""
        conv_stages = (
            (self.conv1, self.relu1, self.maxpool1),
            (self.conv2, self.relu2, self.maxpool2),
            (self.conv3, self.relu3, self.maxpool3),
        )
        for conv, activation, pool in conv_stages:
            x = pool(activation(conv(x)))

        # Collapse everything except the batch dimension for the linear layers.
        x = flatten(x, start_dim=1, end_dim=-1)

        x = self.relu4(self.fc1(x))
        x = self.relu5(self.fc2(x))

        return self.logsoftmax(self.fc3(x))

## Create datasets and dataloaders

In [5]:
# Only convert the PIL images to tensors. Earlier experiments (kept here for
# reference) additionally used transforms.Grayscale(),
# transforms.Normalize((0.5), (0.5)) and transforms.Resize((64, 64)).
trans = transforms.Compose([
    transforms.ToTensor()
    ])

# Training/validation data currently uses the MFCC images; the alternative
# training annotations file is "../Training-data/hann.csv".
# NOTE(review): the test set below is built from hann.csv while training uses
# mfcc.csv -- confirm this mismatch is intended before reporting test results.
train_val_dataset = InstrumentsDataset(csv_file="../Training-data/mfcc.csv", root_dir = ".", transform=trans)
test_dataset = InstrumentsDataset(csv_file = "../Test-data/hann.csv", root_dir = ".", transform = trans)

# 80/20 train/validation split.
train_size = int(0.8 * len(train_val_dataset))
val_size = len(train_val_dataset) - train_size
train_dataset, val_dataset = torch.utils.data.random_split(train_val_dataset, [train_size, val_size])

# Reshuffle the training samples every epoch so the mini-batches differ between
# epochs; evaluation loaders keep a fixed order.
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16)

test_loader = DataLoader(test_dataset, batch_size=16)



In [6]:
# Sanity check: fetch the first sample of the training split and inspect the
# transformed image (its values and its shape).
image, instrument_id = train_dataset[0]

print(image)
print(image.shape)

tensor([[[0.9569, 0.9569, 0.9569,  ..., 0.9569, 0.9569, 0.9569],
         [0.9569, 0.9569, 0.9569,  ..., 0.9569, 0.9569, 0.9569],
         [0.9569, 0.9569, 0.9569,  ..., 0.9569, 0.9569, 0.9569],
         ...,
         [0.9373, 0.9373, 0.9373,  ..., 0.8353, 0.8353, 0.8353],
         [0.9373, 0.9373, 0.9373,  ..., 0.8353, 0.8353, 0.8353],
         [0.9373, 0.9373, 0.9373,  ..., 0.8353, 0.8353, 0.8353]],

        [[0.6039, 0.6039, 0.6039,  ..., 0.6039, 0.6039, 0.6039],
         [0.6039, 0.6039, 0.6039,  ..., 0.6039, 0.6039, 0.6039],
         [0.6039, 0.6039, 0.6039,  ..., 0.6039, 0.6039, 0.6039],
         ...,
         [0.8078, 0.8078, 0.8078,  ..., 0.8588, 0.8588, 0.8588],
         [0.8078, 0.8078, 0.8078,  ..., 0.8588, 0.8588, 0.8588],
         [0.8078, 0.8078, 0.8078,  ..., 0.8588, 0.8588, 0.8588]],

        [[0.4824, 0.4824, 0.4824,  ..., 0.4824, 0.4824, 0.4824],
         [0.4824, 0.4824, 0.4824,  ..., 0.4824, 0.4824, 0.4824],
         [0.4824, 0.4824, 0.4824,  ..., 0.4824, 0.4824, 0.

## Go and run

In [7]:
# Input channels: 3 for the MFCC images (1 would be used for grayscale input).
model = SimpleModel(numChannels=3, numClasses=len(classes))

# NOTE: SimpleModel already ends in LogSoftmax. CrossEntropyLoss applies
# log-softmax again, which leaves log-probabilities unchanged, so the loss
# value is the same as NLLLoss on the model output.
criterion = nn.CrossEntropyLoss()

optimizer = optim.Adam(params=model.parameters(), lr=0.003)

def evaluate(model, loader, loss_fn=None):
    """Compute the mean loss and the accuracy of ``model`` over ``loader``.

    Arguments:
        model (nn.Module): Network to evaluate. It is switched to eval mode
            and left in that mode on return.
        loader (iterable): Yields ``(inputs, labels)`` batches.
        loss_fn (callable, optional): Loss applied as
            ``loss_fn(outputs, labels)``. Defaults to the module-level
            ``criterion``.

    Returns:
        tuple: ``(mean_loss, accuracy)`` where ``mean_loss`` is the average
        of the per-batch loss values and ``accuracy`` is the fraction of
        samples whose highest-scoring class equals the label.

    Raises:
        ValueError: If ``loader`` yields no samples.
    """
    if loss_fn is None:
        loss_fn = criterion

    model.eval()

    correct = 0
    total = 0
    num_batches = 0
    running_loss = 0.0
    with torch.no_grad():  # evaluating, so no gradients are needed
        for inputs, labels in loader:
            outputs = model(inputs)
            running_loss += loss_fn(outputs, labels).item()

            # Predicted class = index of the largest output per sample.
            predicted = outputs.argmax(dim=1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
            num_batches += 1

    if total == 0:
        # Fail with a clear message instead of a bare ZeroDivisionError.
        raise ValueError("evaluate() received a loader with no samples")

    # Mean of the per-batch losses, fraction of correct predictions.
    return running_loss / num_batches, correct / total

# Loss curves, one entry per reporting interval.
loss_history = {
        'train_loss': [],
        'val_loss': []
}

# Number of mini-batches between progress reports / validation passes.
log_interval = 5

for epoch in range(50):  # loop over the dataset multiple times
    print("Starting Epoch: {}".format(epoch+1))

    running_loss = 0.0

    for i, data in enumerate(train_loader, 0):
        # evaluate() below switches the model to eval mode, so training mode
        # has to be re-enabled before every optimisation step.
        model.train()

        # get the inputs; data is a list of [inputs, labels]
        inputs, labels = data

        # zero the parameter gradients
        optimizer.zero_grad()

        # forward + backward + optimize
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        # print statistics
        running_loss += loss.item()
        if i % log_interval == log_interval - 1:    # every log_interval mini-batches
            # Average over the batches actually accumulated since the last
            # report (the divisor used to be a hard-coded 100, which
            # under-reported the training loss by a factor of 20).
            mean_loss = running_loss / log_interval
            loss_history['train_loss'].append(mean_loss)
            print('# mini-batch {}\ntrain loss: {}'.format(
                  i + 1, mean_loss))
            running_loss = 0.0

            # evaluate on validation dataset
            mean_loss, val_acc = evaluate(model, val_loader)
            loss_history['val_loss'].append(mean_loss)

            print("validation loss: {} validation accuracy: {}\n".format(mean_loss, val_acc))

print('Finished Training')


# Notes for tomorrow:
# Check what type of data is being input / output
# Look at a tutorial on custom pytorch image recognition?
# https://medium.com/analytics-vidhya/implementing-cnn-in-pytorch-with-custom-dataset-and-transfer-learning-1864daac14cc
# https://glassboxmedicine.com/2021/02/06/designing-custom-2d-and-3d-cnns-in-pytorch-tutorial-with-code/

Starting Epoch: 1


