# Model #1 - The Classic Model

In [6]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
from torch.utils.data import DataLoader
import torch
import math
import torch.nn as nn
from torch.optim import Adam
from dataloader import CustomDataloader
from networks import LinearRegressionModel
import torch.nn.functional as F
from sklearn.metrics import r2_score

# Load the dataset
df = pd.read_csv('data/UTKFaceAugmented.csv')

# Encode categorical variables (gender, race, age_range)
label_encoder = LabelEncoder()
df['gender'] = label_encoder.fit_transform(df['gender'])
df['race'] = label_encoder.fit_transform(df['race'])
df['age_range'] = label_encoder.fit_transform(df['age_range'])

# Encode binary columns (has_tiktok, remembers_disco, uses_skincare)
binary_columns = ['has_tiktok', 'remembers_disco', 'uses_skincare']
df[binary_columns] = df[binary_columns].apply(lambda x: x.map({'yes': 1, 'no': 0}))

# Handle missing values
df = df.dropna()

# Select features and target variable
features = ['num_haircuts_life', 'has_tiktok', 'remembers_disco', 'uses_skincare', 'max_annual_earnings']
X = df[features]
y = df['age']

# Normalize the features
scaler = StandardScaler()
X_normalized = scaler.fit_transform(X)

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_normalized, y, test_size=0.2, random_state=42)

# Convert data to tensors
X_train_tensor, y_train_tensor = torch.Tensor(X_train), torch.Tensor(y_train.values)
X_test_tensor, y_test_tensor = torch.Tensor(X_test), torch.Tensor(y_test.values)


# Create CustomDataloader instances
training_dataloader = CustomDataloader(x=X_train_tensor, y=y_train_tensor, batch_size=64, randomize=True)
testing_dataloader = CustomDataloader(x=X_test_tensor, y=y_test_tensor, batch_size=64, randomize=False)

# Create a linear regression model
model = LinearRegressionModel(input_size=X_train.shape[1])
optimizer = Adam(model.parameters(), lr=0.01)

# Loss function
criterion = nn.MSELoss()

# Number of epochs
total_epochs = 100

# Lists to store training and validation losses
training_losses = []
validation_losses = []

for epoch in range(total_epochs):
    model.train()
    total_loss = 0

    for _ in range(len(training_dataloader)):
        optimizer.zero_grad()
        train_batch = training_dataloader.fetch_batch()
        X_batch, y_batch = train_batch['x_batch'], train_batch['y_batch']

        predictions = model(X_batch)
        training_loss = criterion(predictions.squeeze(), y_batch)

        # Add L2 regularization
        l2_regularization = 0.01
        l2_loss = 0
        for param in model.parameters():
            l2_loss += torch.norm(param, p=2)
        loss = training_loss + l2_regularization * l2_loss

        # Backward pass
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    training_losses.append(total_loss / len(training_dataloader))

    # Validation phase
    model.eval()
    total_validation_loss = 0

    with torch.no_grad():
        for _ in range(len(testing_dataloader)):
            validation_batch_set = testing_dataloader.fetch_batch()
            X_validation_batch_set, y_validation_batch_set = validation_batch_set['x_batch'], validation_batch_set['y_batch']
            predictions = model(X_validation_batch_set)
            validation_loss = criterion(predictions.squeeze(), y_validation_batch_set)
            total_validation_loss += validation_loss.item()

    validation_losses.append(total_validation_loss / len(testing_dataloader))

    print(f'Epoch Number [{epoch + 1}/{total_epochs}] - Training Loss: {training_losses[-1]:.6f}, Validation Loss: {validation_losses[-1]:.6f}')


model.eval()
torch.save(training_losses, 'training_model_one.pt')
torch.save(validation_losses, 'validation_model_one.pt')

Epoch Number [1/100] - Training Loss: 1325.538940, Validation Loss: 1030.392212
Epoch Number [2/100] - Training Loss: 1055.930800, Validation Loss: 1064.283691
Epoch Number [3/100] - Training Loss: 856.404335, Validation Loss: 769.828125
Epoch Number [4/100] - Training Loss: 697.816964, Validation Loss: 674.327148
Epoch Number [5/100] - Training Loss: 580.048274, Validation Loss: 379.456848
Epoch Number [6/100] - Training Loss: 474.170437, Validation Loss: 373.378754
Epoch Number [7/100] - Training Loss: 388.001859, Validation Loss: 324.836090
Epoch Number [8/100] - Training Loss: 325.413252, Validation Loss: 262.487305
Epoch Number [9/100] - Training Loss: 268.108595, Validation Loss: 172.812164
Epoch Number [10/100] - Training Loss: 229.032202, Validation Loss: 156.908463
Epoch Number [11/100] - Training Loss: 193.130687, Validation Loss: 114.643303
Epoch Number [12/100] - Training Loss: 171.381532, Validation Loss: 132.949066
Epoch Number [13/100] - Training Loss: 153.152680, Valida

In [None]:
# Execute this code to calculate statistics on the test set
"""
predicted_age = []
actual_age = []

with torch.no_grad():
    for batch in testing_dataloader:
        inputs, labels = batch['image'], batch['age']
        outputs = model(inputs)
        predicted_age.extend(outputs.cpu().numpy().flatten())
        actual_age.extend(labels.cpu().numpy())

predicted_age = np.array(predicted_age)
actual_age = np.array(actual_age)

torch.save(predicted_age, 'predicted_age_model_one.pt') 
torch.save(actual_age, 'actual_age_model_one.pt')
"""

# Model 2 - A Neural Network

In [5]:
import numpy as np
import pandas as pd
from pathlib import Path
import os.path
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from dataloader import NeuralNetworkData
from networks import SimpleCNN
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torchvision import transforms
import torch.nn.functional as F
from sklearn.metrics import r2_score

# Load the dataset
data_dir = 'data/images'
image_dir = Path(data_dir)

# Create a DataFrame with file paths and ages
filepaths = pd.Series(list(image_dir.glob(r'**/*.jpg')), name='Filepath').astype(str)
ages = pd.Series(filepaths.apply(lambda x: int(os.path.splitext(os.path.basename(x))[0].split('_')[0])), name='Age')
images = pd.concat([filepaths, ages], axis=1).sample(frac=1.0, random_state=1).reset_index(drop=True)

image_amount = 23700 # Use 5000 images to speed up training time
image_df = images.sample(image_amount, random_state=1).reset_index(drop=True)
train_df, val_df = train_test_split(image_df, test_size=0.2, shuffle=True, random_state=1)
train_df, test_df = train_test_split(train_df, test_size=0.25, shuffle=True, random_state=1)

# Image transformations
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

# Create datasets and dataloaders
train_dataset = NeuralNetworkData(train_df, transform=transform)
val_dataset = NeuralNetworkData(val_df, transform=transform)
test_dataset = NeuralNetworkData(test_df, transform=transform)

batch_size = 64
train_dataloader = DataLoader(train_dataset, batch_size, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size, shuffle=False)
test_dataloader = DataLoader(test_dataset, batch_size, shuffle=False)


# Instantiate the model, loss function, and optimizer
model = SimpleCNN()
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Lists to store training and validation losses
train_losses = []
val_losses = []
total_epochs = 15

# Training the model
for epoch in range(total_epochs):  # Change the number of epochs as needed
    model.train()
    total_train_loss = 0
    for batch in train_dataloader:
        inputs, labels = batch['image'], batch['age']
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels.view(-1, 1).float())
        loss.backward()
        optimizer.step()
        total_train_loss += loss.item()

    # Calculate average training loss for the epoch
    avg_train_loss = total_train_loss / len(train_dataloader)
    train_losses.append(avg_train_loss)

    # Validation phase
    model.eval()
    total_val_loss = 0
    with torch.no_grad():
        for batch in val_dataloader:
            inputs, labels = batch['image'], batch['age']
            outputs = model(inputs)
            val_loss = criterion(outputs, labels.view(-1, 1).float())
            total_val_loss += val_loss.item()

    # Calculate average validation loss for the epoch
    avg_val_loss = total_val_loss / len(val_dataloader)
    val_losses.append(avg_val_loss)

    print(f'Epoch {epoch + 1}/{total_epochs} - Training Loss: {train_losses[-1]:.6f}, Validation Loss: {val_losses[-1]:.6f}')

model.eval()
torch.save(train_losses, 'training_model_two.pt')
torch.save(val_losses, 'validation_model_two.pt')


# Test the model on the test set
model.eval()
predicted_ages = []
actual_ages = []

with torch.no_grad():
    for batch in test_dataloader:
        inputs, labels = batch['image'], batch['age']
        outputs = model(inputs)
        predicted_ages.extend(outputs.numpy().flatten())
        actual_ages.extend(labels.numpy())

predicted_ages = np.array(predicted_ages)
true_ages = np.array(actual_ages)
torch.save(predicted_ages, 'predicted_age_model_two.pt') 
torch.save(actual_ages, 'actual_age_model_two.pt')



Epoch 1/15 - Training Loss: 326.673378, Validation Loss: 181.896944
Epoch 2/15 - Training Loss: 228.971071, Validation Loss: 151.311580
Epoch 3/15 - Training Loss: 210.231485, Validation Loss: 206.273746
Epoch 4/15 - Training Loss: 186.824242, Validation Loss: 130.630297
Epoch 5/15 - Training Loss: 178.893163, Validation Loss: 132.763902
Epoch 6/15 - Training Loss: 168.986976, Validation Loss: 131.021248
Epoch 7/15 - Training Loss: 162.327910, Validation Loss: 118.788414
Epoch 8/15 - Training Loss: 151.167130, Validation Loss: 113.514016
Epoch 9/15 - Training Loss: 149.961242, Validation Loss: 137.317859
Epoch 10/15 - Training Loss: 145.951772, Validation Loss: 114.626364
Epoch 11/15 - Training Loss: 139.700369, Validation Loss: 125.629638
Epoch 12/15 - Training Loss: 129.121845, Validation Loss: 118.124087
Epoch 13/15 - Training Loss: 131.864146, Validation Loss: 107.297155
Epoch 14/15 - Training Loss: 123.556334, Validation Loss: 108.460568
Epoch 15/15 - Training Loss: 122.018830, Va

# Model 3 - A Multimodal Neural Network

In [7]:
import matplotlib.pyplot as plt
import torch
from torchvision import transforms
from torch.utils.data import DataLoader
from dataloader import CustomImageDataset
from networks import MultiModalCNN
import torch.optim as optim
import torch.nn as nn
import torch

# Define transformations
transform = transforms.Compose([
    transforms.Resize((256, 256)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

# Load dataset and create DataLoader
dataset = CustomImageDataset(csv_file='data/UTKFaceAugmented.csv', img_dir='data/images', transform=transform)
dataloader = DataLoader(dataset, batch_size=64, shuffle=True)


# Initialize the model
num_features = len(dataset[0][1])
model = MultiModalCNN(num_features, 1)  # 1 output since it's regression
model.to(torch.device('gpu' if torch.cuda.is_available() else 'cpu'))

# Define loss function and optimizer
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.0001, weight_decay=1e-5)

model.to(torch.device('gpu' if torch.cuda.is_available() else 'cpu'))
model.to(device)

# Training loop
num_epochs = 1  # Increase epochs
for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    for images, features, labels in dataloader:
        images, features, labels = images.to(device), features.to(device), labels.to(device)
        optimizer.zero_grad()
        outputs = model(images, features)
        loss = criterion(outputs.squeeze(), labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    
    avg_train_loss = total_loss / len(dataloader)
    train_losses.append(avg_train_loss)
 
# Validation phase
model.eval()
total_val_loss = 0
with torch.no_grad():
    for batch in val_dataloader:
        images, features, labels = images.to(device), features.to(device), labels.to(device)        
        outputs = model(images, features)
        loss = criterion(outputs.squeeze(), labels)
        total_loss += loss.item()
    
    avg_train_loss = total_loss / len(dataloader)
    train_losses.append(avg_train_loss)

    # Calculate average validation loss for the epoch
    avg_val_loss = total_val_loss / len(val_dataloader)
    val_losses.append(avg_val_loss)

    print(f'Epoch {epoch + 1}/{total_epochs} - Training Loss: {train_losses:.6f}, Validation Loss: {val_losses:.6f}')


# Save the model
torch.save(model.state_dict(), 'model_three.pt')

# Save the model
torch.save(avg_train_loss, 'training_model_three.pt')
torch.save(avg_val_loss, 'validation_model_three.pt')

# Model evaluation and plotting
model.eval()
actual_ages = []
predicted_ages = []

with torch.no_grad():
    for images, features, labels in dataloader:
        images, features, labels = images.to(device), features.to(device), labels.to(device)
        outputs = model(images, features)
        actual_ages.extend(labels.cpu().numpy())
        predicted_ages.extend(outputs.squeeze().cpu().numpy())

# Plotting actual and predicted ages
plt.figure(figsize=(10, 6))

# Move data to the specified device
images, features, labels = images.to(device), features.to(device), labels.to(device)

torch.save(predicted_ages, 'predicted_age_model_three.pt') 
torch.save(actual_ages, 'actual_age_model_three.pt')

NameError: name 'device' is not defined