# Age Prediction from Spectrogram Input

Necessary Imports

In [None]:
import os
import pandas as pd
from PIL import Image
import torch
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
import zipfile
from torchvision import models
import torch.nn as nn
from tqdm.auto import tqdm
import numpy as np
import argparse

Google Drive Mount

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


Extracting Spectrogram Images

In [None]:
with zipfile.ZipFile("/content/drive/MyDrive/spectrograms.zip", 'r') as zip_ref:
    zip_ref.extractall("/content/drive/MyDrive/")

In [None]:
len(os.listdir("/content/drive/MyDrive/spectrograms"))

10839

Dataset

Make sure to change file paths to accurately match your file structure.

In [None]:
class VoiceDataset(Dataset):
    """Dataset pairing spectrogram images with the age stored in a metadata CSV.

    Each item is ``(image, age)``. The image file name is derived from CSV
    column 30 and the age is read from CSV column 31 (both positional).
    """

    def __init__(self,
                 csv_file="/content/drive/MyDrive/FINAL_AUDIO_FEATURES_10194.csv",
                 root_dir="/content/drive/MyDrive/spectrograms",
                 transform=None):
        """
        :param csv_file: path of the metadata CSV, loaded with pandas
        :param root_dir: directory that contains the spectrogram images
        :param transform: optional callable applied to every loaded image
        """
        self.transform = transform
        self.root_dir = root_dir
        self.meta = pd.read_csv(csv_file)

    def __len__(self):
        """Return the number of rows in the metadata table."""
        return self.meta.shape[0]

    def __getitem__(self, idx):
        """Load the image for row ``idx`` and return it together with its age."""
        source_name = self.meta.iloc[idx, 30]
        # Drop the last four characters and append "png".
        # NOTE(review): presumably the column holds names with a three-letter
        # extension preceded by one more character to discard - confirm
        # against the CSV.
        png_name = f"{source_name[:-4]}png"
        picture = Image.open(os.path.join(self.root_dir, png_name)).convert('RGB')
        age = self.meta.iloc[idx, 31]

        if self.transform:
            return self.transform(picture), age
        return picture, age

# Preprocessing pipeline applied to every spectrogram image.
# ColorJitter must run BEFORE ToTensor/Normalize: on float tensors torchvision
# clamps the jittered result to [0, 1], which would wipe out the negative and
# >1 values produced by Normalize if the jitter came last.
# NOTE(review): this pipeline is shared by the train/val/test splits below, so
# the random jitter is also applied at evaluation time - consider a separate
# jitter-free transform for validation and testing.
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ColorJitter(brightness=0.2, contrast=0.2),  # Random brightness and contrast adjustments
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

VGG-16 Model

In [None]:
class AgePredictionVGG16(nn.Module):
    """VGG-16 backbone with a small regression head that outputs one value.

    The convolutional features and average pooling come from torchvision's
    pretrained VGG-16; the original classifier is replaced by a
    Linear -> ReLU -> Dropout -> Linear stack ending in a single output.
    """

    def __init__(self):
        super().__init__()
        backbone = models.vgg16(pretrained=True)
        self.features = backbone.features
        self.avgpool = backbone.avgpool
        # 512 * 7 * 7 is the size of the flattened pooled feature map.
        head_layers = [
            nn.Linear(512 * 7 * 7, 256),
            nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear(256, 1),
        ]
        self.classifier = nn.Sequential(*head_layers)

    def forward(self, x):
        """Run the backbone, flatten per sample, and apply the regression head."""
        pooled = self.avgpool(self.features(x))
        flat = torch.flatten(pooled, 1)
        return self.classifier(flat)

Accuracy Function

In [None]:
def map_to_age_band(age):
    """Round ``age`` down to the lower edge of its 5-wide band (e.g. 23 -> 20)."""
    band_width = 5
    band_index = age // band_width
    return band_index * band_width

def calculate_accuracy(predictions, labels, tolerance):
    """
    Percentage of predictions that lie within ``tolerance`` of their label.

    :param predictions: sequence of predicted values (a bare number is
        treated as a one-element sequence)
    :param labels: sequence of target values (a bare number is treated as a
        one-element sequence)
    :param tolerance: maximum absolute error that still counts as correct
    :return: accuracy in percent, rounded to two decimals; 0.0 when there
        are no labels
    """
    # A one-sample batch that was squeezed and converted with .tolist()
    # arrives as a plain number rather than a list; wrap it so zip() works.
    if isinstance(predictions, (int, float)):
        predictions = [predictions]
    if isinstance(labels, (int, float)):
        labels = [labels]

    total = len(labels)
    if total == 0:
        # Nothing to score; avoid dividing by zero.
        return 0.0

    correct = sum(
        1 for pred, label in zip(predictions, labels)
        if abs(pred - label) <= tolerance
    )
    accuracy = (correct / total) * 100
    return round(accuracy, 2)

Training, Validation, and Testing Loop for VGG-16

In [None]:
def train_one_epoch(voice_model, train_data, optimizer, criterion) -> None:
    """
    Runs one optimisation pass of the VGG16 age model over the training data
    :param voice_model: VGG16 model
    :param train_data: iterable yielding (images, targets) batches
    :param optimizer: optimizer stepped once per batch
    :param criterion: loss criterion; its square root (RMSE) is what gets
        back-propagated and printed
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print("Using device:", device)

    voice_model.train()

    for batch_imgs, batch_ages in tqdm(train_data):
        batch_imgs = batch_imgs.to(device)
        batch_ages = batch_ages.to(torch.float32).to(device)

        optimizer.zero_grad()
        predicted = voice_model(batch_imgs).to(torch.float32)
        # The criterion result is square-rooted so the optimised quantity is RMSE.
        batch_rmse = torch.sqrt(criterion(predicted.squeeze(), batch_ages))
        batch_rmse.backward()
        optimizer.step()
        print(batch_rmse.item())



def evaluate_one_epoch(voice_model, val_data, epoch, criterion):
    """
    Prints the per-batch RMSE of the VGG16 age model on the validation data
    :param voice_model: VGG16 model
    :param val_data: iterable yielding (images, targets) batches
    :param epoch: index of the current epoch (not used inside this function)
    :param criterion: loss criterion whose square root is reported
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print("device: ", device)
    voice_model.eval()

    # Gradients are not needed for evaluation.
    with torch.no_grad():
        for batch_imgs, batch_ages in tqdm(val_data):
            batch_imgs = batch_imgs.to(device)
            batch_ages = batch_ages.to(torch.float32).to(device)

            predicted = voice_model(batch_imgs).to(torch.float32)
            batch_rmse = torch.sqrt(criterion(predicted.squeeze(), batch_ages))
            print(batch_rmse.item())




def test_voice_model(voice_model, test_data, criterion):
    """
    Tests VGG16 for age prediction and prints the overall accuracy

    Prints the RMSE of every batch, then the percentage of all test samples
    whose prediction is within 6 of the target value.

    :param voice_model: VGG16 model
    :param test_data: iterable yielding (images, targets) batches
    :param criterion: loss criterion whose square root is reported per batch
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    voice_model.eval()

    tolerance = 6
    correct = 0
    total = 0

    with torch.no_grad():
        for imgs, targets in tqdm(test_data):

            imgs = imgs.to(device)
            targets = targets.to(torch.float32).to(device)

            # reshape(-1) keeps a 1-D batch axis even for a one-sample batch,
            # where a bare squeeze() would collapse to a 0-d tensor.
            preds = voice_model(imgs).to(torch.float32).reshape(-1)
            mse_loss = criterion(preds, targets)
            rmse_loss = torch.sqrt(mse_loss)
            print(rmse_loss.item())

            # Count hits per sample so a short final batch is not over-weighted,
            # which happens when per-batch percentages are averaged.
            correct += int(((preds - targets).abs() <= tolerance).sum().item())
            total += targets.numel()

    # With no test samples there is no meaningful accuracy; report NaN.
    final_acc = 100.0 * correct / total if total else float("nan")
    print("FINAL ACCURACY:", final_acc)


# Select the GPU when available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# 80/10/10 train/validation/test split; the fixed seed keeps it reproducible.
voice_dataset = VoiceDataset(transform=transform)
train_data_tmp, val_data_tmp, test_data_tmp = torch.utils.data.random_split(voice_dataset, [0.8, 0.1, 0.1], generator=torch.Generator().manual_seed(42))

criterion = nn.MSELoss()

# Reshuffle the training samples every epoch so the model does not see the
# exact same batches in the exact same order each time.
train_data = DataLoader(
    train_data_tmp,
    batch_size=128,
    shuffle=True,
)
# Evaluation order does not matter, so validation and test stay unshuffled.
val_data = DataLoader(
    val_data_tmp,
    batch_size=128,
    shuffle=False,
)
test_data = DataLoader(
    test_data_tmp,
    batch_size=128,
    shuffle=False,
)
print("train_data:", train_data)
print("val_data:", val_data)
print("test_data:", test_data)

voice_model = AgePredictionVGG16()
voice_model = voice_model.to(device)

params = voice_model.parameters()
optimizer = torch.optim.Adam(
    params, lr=0.0005, weight_decay=0.0005
)

# Training Loop
for epoch in range(40):
    print("training epoch:", epoch)
    train_one_epoch(voice_model, train_data, optimizer, criterion)
    print("evaluating epoch:", epoch)
    evaluate_one_epoch(voice_model, val_data, epoch, criterion)

# The whole module object is pickled, not just its state_dict.
torch.save(voice_model, "/content/drive/MyDrive/vgg16_v6.pt")

# Testing Loop

# weights_only=False is required to unpickle a full nn.Module on PyTorch
# versions where torch.load defaults to weights_only=True. This executes
# pickled code, so only load checkpoints you created yourself.
voice_model_trained = torch.load(
    "/content/drive/MyDrive/vgg16_v6.pt",
    map_location=torch.device('cpu'),
    weights_only=False,
)
voice_model_trained.to(device)
print("testing model: ", "vgg16_v6.pt")
test_voice_model(voice_model_trained, test_data, criterion)

MobileNetV2 Model

In [None]:
class MobileNetV2(nn.Module):
    """Pretrained torchvision MobileNetV2 with its final layer replaced.

    :param num_classes: number of outputs of the replacement final linear
        layer (default 1, i.e. a single regression value)
    """

    def __init__(self, num_classes=1):
        super().__init__()
        self.model = models.mobilenet_v2(pretrained=True)
        # Swap the last classifier layer for one with the requested output
        # size; the input width is read from the layer being replaced.
        in_features = self.model.classifier[1].in_features
        self.model.classifier[1] = nn.Linear(in_features, num_classes)

    def forward(self, x):
        """Forward ``x`` through the wrapped MobileNetV2."""
        return self.model(x)

Training, Validation, and Testing Loop for MobileNetV2

In [None]:
import torch.nn as nn
from tqdm.auto import tqdm
import numpy as np
import argparse
import torch

def train_one_epoch(voice_model, train_data, optimizer, criterion) -> None:
    """
    Runs one optimisation pass of the MobileNetV2 age model over the training data
    :param voice_model: MobileNetV2 model
    :param train_data: iterable yielding (images, targets) batches
    :param optimizer: optimizer stepped once per batch
    :param criterion: loss criterion; its square root (RMSE) is what gets
        back-propagated and printed
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print("Using device:", device)

    voice_model.train()

    for batch_imgs, batch_ages in tqdm(train_data):
        batch_imgs = batch_imgs.to(device)
        batch_ages = batch_ages.to(torch.float32).to(device)

        optimizer.zero_grad()
        predicted = voice_model(batch_imgs).to(torch.float32)
        # The criterion result is square-rooted so the optimised quantity is RMSE.
        batch_rmse = torch.sqrt(criterion(predicted.squeeze(), batch_ages))
        batch_rmse.backward()
        optimizer.step()
        print(batch_rmse.item())



def evaluate_one_epoch(voice_model, val_data, epoch, criterion):
    """
    Prints the per-batch RMSE of the MobileNetV2 age model on the validation data
    :param voice_model: MobileNetV2 model
    :param val_data: iterable yielding (images, targets) batches
    :param epoch: index of the current epoch (not used inside this function)
    :param criterion: loss criterion whose square root is reported
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print("device: ", device)
    voice_model.eval()

    # Gradients are not needed for evaluation.
    with torch.no_grad():
        for batch_imgs, batch_ages in tqdm(val_data):
            batch_imgs = batch_imgs.to(device)
            batch_ages = batch_ages.to(torch.float32).to(device)

            predicted = voice_model(batch_imgs).to(torch.float32)
            batch_rmse = torch.sqrt(criterion(predicted.squeeze(), batch_ages))
            print(batch_rmse.item())




def test_voice_model(voice_model, test_data, criterion):
    """
    Tests MobileNetV2 for age prediction and prints the overall accuracy

    Prints the RMSE of every batch, then the percentage of all test samples
    whose prediction is within 6 of the target value.

    :param voice_model: MobileNetV2 model
    :param test_data: iterable yielding (images, targets) batches
    :param criterion: loss criterion whose square root is reported per batch
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    voice_model.eval()

    tolerance = 6
    correct = 0
    total = 0

    with torch.no_grad():
        for imgs, targets in tqdm(test_data):

            imgs = imgs.to(device)
            targets = targets.to(torch.float32).to(device)

            # reshape(-1) keeps a 1-D batch axis even for a one-sample batch,
            # where a bare squeeze() would collapse to a 0-d tensor.
            preds = voice_model(imgs).to(torch.float32).reshape(-1)
            mse_loss = criterion(preds, targets)
            rmse_loss = torch.sqrt(mse_loss)
            print(rmse_loss.item())

            # Count hits per sample so a short final batch is not over-weighted,
            # which happens when per-batch percentages are averaged.
            correct += int(((preds - targets).abs() <= tolerance).sum().item())
            total += targets.numel()

    # With no test samples there is no meaningful accuracy; report NaN.
    final_acc = 100.0 * correct / total if total else float("nan")
    print("FINAL ACCURACY:", final_acc)


# Select the GPU when available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# 80/10/10 train/validation/test split; the fixed seed keeps it reproducible.
voice_dataset = VoiceDataset(transform=transform)
train_data_tmp, val_data_tmp, test_data_tmp = torch.utils.data.random_split(voice_dataset, [0.8, 0.1, 0.1], generator=torch.Generator().manual_seed(42))
criterion = nn.MSELoss()

# Reshuffle the training samples every epoch so the model does not see the
# exact same batches in the exact same order each time.
train_data = DataLoader(
    train_data_tmp,
    batch_size=128,
    shuffle=True,
)
# Evaluation order does not matter, so validation and test stay unshuffled.
val_data = DataLoader(
    val_data_tmp,
    batch_size=128,
    shuffle=False,
)
test_data = DataLoader(
    test_data_tmp,
    batch_size=128,
    shuffle=False,
)
print("train_data:", train_data)
print("val_data:", val_data)
print("test_data:", test_data)

voice_model = MobileNetV2()
voice_model = voice_model.to(device)

params = voice_model.parameters()
optimizer = torch.optim.Adam(
    params, lr=0.00004, weight_decay=0.0005
)

# Training Loop
for epoch in range(40):
    print("training epoch:", epoch)
    train_one_epoch(voice_model, train_data, optimizer, criterion)
    print("evaluating epoch:", epoch)
    evaluate_one_epoch(voice_model, val_data, epoch, criterion)

# The whole module object is pickled, not just its state_dict.
torch.save(voice_model, "/content/drive/MyDrive/mbnetv4.pt")

# Testing Loop

# weights_only=False is required to unpickle a full nn.Module on PyTorch
# versions where torch.load defaults to weights_only=True. This executes
# pickled code, so only load checkpoints you created yourself.
voice_model_trained = torch.load(
    "/content/drive/MyDrive/mbnetv4.pt",
    map_location=torch.device('cpu'),
    weights_only=False,
)
voice_model_trained.to(device)
print("testing model: ", "mbnetv4.pt")
test_voice_model(voice_model_trained, test_data, criterion)