In [None]:
# Part 1: Resnet Model Training / Fine tuning for better feature extraction

import torch
import torch.nn as nn
import torch.nn.functional as F
from torchvision import models, transforms
from torch.utils.data import DataLoader, Dataset
from torch.amp import autocast, GradScaler
from torch.optim.lr_scheduler import ReduceLROnPlateau
from PIL import Image, ImageFile
import os
import json
import matplotlib.pyplot as plt
from datetime import datetime
import numpy as np
from tqdm.auto import tqdm
import random

# Parameters
batch_size = 32
learning_rate = 1e-3
num_epochs = 50
checkpoint_interval = 25
max_images_per_class = 25000
resnet_model = 'ResNet50'

# Setup directories and paths
current_time = datetime.now().strftime("%Y-%m-%d_%H-%M")
identifier = f"softmax-{resnet_model}_{num_epochs}-ep_{batch_size}-bs_{max_images_per_class}-images_{current_time}"
class_names = ['Boston', 'Charlotte', 'Manhattan', 'Pittsburgh']
folders = {
    'Boston': '../data/ma-boston/buildings',
    'Charlotte': '../data/nc-charlotte/buildings',
    'Manhattan': '../data/ny-manhattan/buildings',
    'Pittsburgh': '../data/pa-pittsburgh/buildings'
}
output_folder = os.path.join('softmax-output', identifier)
checkpoint_dir = os.path.join(output_folder, 'checkpoints')
model_save_path = os.path.join(output_folder, f'trained-model_{identifier}.pth')

# Create output directories
os.makedirs(output_folder, exist_ok=True)
os.makedirs(checkpoint_dir, exist_ok=True)

# Dataset and model setup
normalize_mean = [0.485, 0.456, 0.406]
normalize_std = [0.229, 0.224, 0.225]
num_classes = len(class_names)
weight_decay = 1e-5

ImageFile.LOAD_TRUNCATED_IMAGES = True

class CityDataset(Dataset):
    def __init__(self, folders, transform=None, max_images_per_class=max_images_per_class):
        self.image_paths = []
        self.labels = []
        self.transform = transform
        self.class_to_idx = {class_name: idx for idx, class_name in enumerate(folders.keys())}

        print("Building dataset...")
        for class_name, folder in tqdm(folders.items(), desc="Loading classes"):
            # Filter out macOS system files and get only image files
            class_images = [
                os.path.join(folder, f) for f in os.listdir(folder) 
                if (f.lower().endswith(('.jpg', '.jpeg', '.png')) and 
                    not f.startswith('._') and 
                    not f.startswith('.DS_Store'))
            ]
            
            print(f"\nFound {len(class_images)} images for {class_name}")
            
            if len(class_images) > max_images_per_class:
                class_images = random.sample(class_images, max_images_per_class)
            
            self.image_paths.extend(class_images)
            self.labels.extend([self.class_to_idx[class_name]] * len(class_images))
        
        print("\nDataset statistics:")
        print(f"Total images: {len(self.image_paths)}")
        for class_name in folders.keys():
            class_count = self.labels.count(self.class_to_idx[class_name])
            print(f"{class_name}: {class_count} images")

    def __len__(self):
        return len(self.image_paths)

    def __getitem__(self, idx):
        image_path = self.image_paths[idx]
        label = self.labels[idx]
        
        try:
            with Image.open(image_path) as img:
                image = img.convert('RGB')
            
            if self.transform:
                image = self.transform(image)
            
            return image, label
        except Exception as e:
            print(f"Error loading image {image_path}: {str(e)}")
            raise e

# Enhanced transformations
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.RandomHorizontalFlip(),
    transforms.RandomRotation(15),
    transforms.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2, hue=0.1),
    transforms.RandomCrop(224, padding=4),
    transforms.RandomAffine(degrees=15, translate=(0.1, 0.1), scale=(0.9, 1.1)),
    transforms.RandomPerspective(distortion_scale=0.2, p=0.5),
    transforms.ToTensor(),
    transforms.Normalize(mean=normalize_mean, std=normalize_std),
])

# Create dataset
print("\nInitializing dataset...")
dataset = CityDataset(folders, transform=transform)

# Set device
device = torch.device("cuda" if torch.cuda.is_available() else 
                     "mps" if torch.backends.mps.is_available() else 
                     "cpu")
print(f"\nUsing device: {device}")

class FocalLoss(nn.Module):
    def __init__(self, gamma=2.0, reduction='mean'):
        super().__init__()
        self.gamma = gamma
        self.reduction = reduction

    def forward(self, input, target):
        ce_loss = F.cross_entropy(input, target, reduction='none')
        pt = torch.exp(-ce_loss)
        focal_loss = ((1 - pt) ** self.gamma * ce_loss)
        
        if self.reduction == 'mean':
            return focal_loss.mean()
        return focal_loss.sum()

def mixup_data(x, y, alpha=0.2):
    if alpha > 0:
        lam = np.random.beta(alpha, alpha)
    else:
        lam = 1

    batch_size = x.size()[0]
    index = torch.randperm(batch_size).to(x.device)

    mixed_x = lam * x + (1 - lam) * x[index]
    y_a, y_b = y, y[index]
    return mixed_x, y_a, y_b, lam

def mixup_criterion(criterion, pred, y_a, y_b, lam):
    return lam * criterion(pred, y_a) + (1 - lam) * criterion(pred, y_b)

def train_final_model(dataset):
    print("\nSplitting dataset into train/val sets...")
    train_size = int(0.8 * len(dataset))
    val_size = len(dataset) - train_size
    train_dataset, val_dataset = torch.utils.data.random_split(dataset, [train_size, val_size])
    
    print(f"Training set size: {len(train_dataset)}")
    print(f"Validation set size: {len(val_dataset)}")
    
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
    
    print(f"\nInitializing {resnet_model}...")
    if resnet_model == 'ResNet18':
        weights = models.ResNet18_Weights.DEFAULT
        model = models.resnet18(weights=weights)
    elif resnet_model == 'ResNet50':
        weights = models.ResNet50_Weights.DEFAULT
        model = models.resnet50(weights=weights)
    
    model.fc = nn.Sequential(
        nn.Linear(model.fc.in_features, 512),
        nn.BatchNorm1d(512),
        nn.ReLU(),
        nn.Dropout(0.3),
        nn.Linear(512, num_classes)
    )
    model.to(device)
    
    criterion = FocalLoss(gamma=2.0)
    optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate, weight_decay=weight_decay)
    scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=5)
    scaler = GradScaler('cuda' if torch.cuda.is_available() else 'cpu')
    
    best_val_loss = float('inf')
    patience = 10
    epochs_without_improvement = 0
    
    epoch_pbar = tqdm(range(num_epochs), desc="Training Progress", position=0)
    
    for epoch in epoch_pbar:
        model.train()
        running_loss = 0.0
        per_class_correct = torch.zeros(num_classes)
        per_class_total = torch.zeros(num_classes)
        
        batch_pbar = tqdm(train_loader, desc=f"Epoch {epoch + 1}", 
                         leave=False, position=1)
        
        for images, labels in batch_pbar:
            images, labels = images.to(device), labels.to(device)
            
            images, targets_a, targets_b, lam = mixup_data(images, labels)
            
            with autocast(device_type='cuda' if torch.cuda.is_available() else 'cpu'):
                outputs = model(images)
                loss = mixup_criterion(criterion, outputs, targets_a, targets_b, lam)
            
            optimizer.zero_grad()
            scaler.scale(loss).backward()
            
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
            
            scaler.step(optimizer)
            scaler.update()
            
            running_loss += loss.item()
            current_loss = loss.item()
            batch_pbar.set_postfix({'loss': f'{current_loss:.4f}'})
            
            with torch.no_grad():
                _, predicted = torch.max(model(images), 1)
                for label, pred in zip(labels, predicted):
                    per_class_correct[label] += (label == pred).item()
                    per_class_total[label] += 1
        
        model.eval()
        val_loss = 0.0
        correct = 0
        total = 0
        
        val_pbar = tqdm(val_loader, desc="Validation", 
                       leave=False, position=1)
        
        with torch.no_grad():
            for images, labels in val_pbar:
                images, labels = images.to(device), labels.to(device)
                outputs = model(images)
                loss = criterion(outputs, labels)
                val_loss += loss.item()
                _, predicted = torch.max(outputs.data, 1)
                total += labels.size(0)
                correct += (predicted == labels).sum().item()
                
                current_val_loss = loss.item()
                val_pbar.set_postfix({'val_loss': f'{current_val_loss:.4f}'})
        
        train_loss = running_loss / len(train_loader)
        val_loss = val_loss / len(val_loader)
        accuracy = correct / total
        
        scheduler.step(val_loss)
        
        epoch_pbar.set_postfix({
            'train_loss': f'{train_loss:.4f}',
            'val_loss': f'{val_loss:.4f}',
            'accuracy': f'{accuracy:.4f}'
        })
        
        print(f"\nEpoch {epoch + 1} Complete:")
        print(f"Train Loss: {train_loss:.4f}")
        print(f"Val Loss: {val_loss:.4f}")
        print(f"Val Accuracy: {accuracy:.4f}")
        
        print("\nPer-class accuracies:")
        for i in range(num_classes):
            if per_class_total[i] > 0:
                class_acc = per_class_correct[i] / per_class_total[i]
                print(f"{class_names[i]}: {class_acc:.4f}")
        
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            epochs_without_improvement = 0
            print(f"\nSaving best model with val_loss: {val_loss:.4f}")
            torch.save({
                'epoch': epoch,
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'val_loss': val_loss,
                'accuracy': accuracy
            }, model_save_path)
        else:
            epochs_without_improvement += 1
            
        if epochs_without_improvement >= patience:
            print("\nEarly stopping triggered!")
            break
        
        if (epoch + 1) % checkpoint_interval == 0:
            checkpoint_path = os.path.join(checkpoint_dir, f'checkpoint_epoch_{epoch + 1}.pth')
            torch.save(model.state_dict(), checkpoint_path)
            print(f"\nCheckpoint saved: {checkpoint_path}")
    
    return model

if __name__ == "__main__":
    print("\nStarting training...")
    final_model = train_final_model(dataset)
    print(f"\nTraining complete! Model saved to {model_save_path}")


Initializing dataset...
Building dataset...


Loading classes:   0%|          | 0/4 [00:00<?, ?it/s]


Found 24995 images for Boston

Found 24995 images for Charlotte

Found 25064 images for Manhattan

Found 24998 images for Pittsburgh

Dataset statistics:
Total images: 99988
Boston: 24995 images
Charlotte: 24995 images
Manhattan: 25000 images
Pittsburgh: 24998 images

Using device: cuda

Starting training...

Splitting dataset into train/val sets...
Training set size: 79990
Validation set size: 19998

Initializing ResNet50...


Training Progress:   0%|          | 0/10 [00:00<?, ?it/s]

Epoch 1:   0%|          | 0/2500 [00:00<?, ?it/s]

Validation:   0%|          | 0/625 [00:00<?, ?it/s]


Epoch 1 Complete:
Train Loss: 0.1708
Val Loss: 0.0210
Val Accuracy: 0.9939

Per-class accuracies:
Boston: 0.6105
Charlotte: 0.6184
Manhattan: 0.6192
Pittsburgh: 0.6202

Saving best model with val_loss: 0.0210


Epoch 2:   0%|          | 0/2500 [00:00<?, ?it/s]

Validation:   0%|          | 0/625 [00:00<?, ?it/s]


Epoch 2 Complete:
Train Loss: 0.1143
Val Loss: 0.0168
Val Accuracy: 0.9944

Per-class accuracies:
Boston: 0.6388
Charlotte: 0.6433
Manhattan: 0.6367
Pittsburgh: 0.6431

Saving best model with val_loss: 0.0168


Epoch 3:   0%|          | 0/2500 [00:00<?, ?it/s]

Validation:   0%|          | 0/625 [00:00<?, ?it/s]


Epoch 3 Complete:
Train Loss: 0.1228
Val Loss: 0.0074
Val Accuracy: 0.9963

Per-class accuracies:
Boston: 0.6381
Charlotte: 0.6386
Manhattan: 0.6378
Pittsburgh: 0.6393

Saving best model with val_loss: 0.0074


Epoch 4:   0%|          | 0/2500 [00:00<?, ?it/s]

Validation:   0%|          | 0/625 [00:00<?, ?it/s]


Epoch 4 Complete:
Train Loss: 0.1085
Val Loss: 0.0103
Val Accuracy: 0.9948

Per-class accuracies:
Boston: 0.6445
Charlotte: 0.6430
Manhattan: 0.6480
Pittsburgh: 0.6437


Epoch 5:   0%|          | 0/2500 [00:00<?, ?it/s]

Validation:   0%|          | 0/625 [00:00<?, ?it/s]


Epoch 5 Complete:
Train Loss: 0.1177
Val Loss: 0.0092
Val Accuracy: 0.9953

Per-class accuracies:
Boston: 0.6145
Charlotte: 0.6215
Manhattan: 0.6204
Pittsburgh: 0.6203


Epoch 6:   0%|          | 0/2500 [00:00<?, ?it/s]

Validation:   0%|          | 0/625 [00:00<?, ?it/s]


Epoch 6 Complete:
Train Loss: 0.1041
Val Loss: 0.0184
Val Accuracy: 0.9937

Per-class accuracies:
Boston: 0.6488
Charlotte: 0.6505
Manhattan: 0.6562
Pittsburgh: 0.6511


Epoch 7:   0%|          | 0/2500 [00:00<?, ?it/s]

Validation:   0%|          | 0/625 [00:00<?, ?it/s]


Epoch 7 Complete:
Train Loss: 0.0942
Val Loss: 0.0208
Val Accuracy: 0.9881

Per-class accuracies:
Boston: 0.6386
Charlotte: 0.6414
Manhattan: 0.6415
Pittsburgh: 0.6407


Epoch 8:   0%|          | 0/2500 [00:00<?, ?it/s]

Validation:   0%|          | 0/625 [00:00<?, ?it/s]


Epoch 8 Complete:
Train Loss: 0.0949
Val Loss: 0.0095
Val Accuracy: 0.9947

Per-class accuracies:
Boston: 0.6316
Charlotte: 0.6333
Manhattan: 0.6365
Pittsburgh: 0.6385


Epoch 9:   0%|          | 0/2500 [00:00<?, ?it/s]

Validation:   0%|          | 0/625 [00:00<?, ?it/s]


Epoch 9 Complete:
Train Loss: 0.0992
Val Loss: 0.0067
Val Accuracy: 0.9977

Per-class accuracies:
Boston: 0.6426
Charlotte: 0.6381
Manhattan: 0.6471
Pittsburgh: 0.6433

Saving best model with val_loss: 0.0067


Epoch 10:   0%|          | 0/2500 [00:00<?, ?it/s]

Validation:   0%|          | 0/625 [00:00<?, ?it/s]


Epoch 10 Complete:
Train Loss: 0.0863
Val Loss: 0.0337
Val Accuracy: 0.9912

Per-class accuracies:
Boston: 0.6316
Charlotte: 0.6292
Manhattan: 0.6316
Pittsburgh: 0.6360

Training complete! Model saved to softmax-output/softmax-ResNet50_10-ep_32-bs_25000-images_2024-12-05_16-10/trained-model_softmax-ResNet50_10-ep_32-bs_25000-images_2024-12-05_16-10.pth


In [9]:
# Part 2: Softmax classifier predictions with Test Time Augmentation using the fine tuned resnet model

import torch
import torch.nn as nn
import torch.nn.functional as F
from torchvision import models, transforms
from PIL import Image
import json
import os
import random
import numpy as np

# Parameters
resnet_model = 'ResNet50'
class_names = ['Boston', 'Charlotte', 'Manhattan', 'Pittsburgh']
num_classes = len(class_names)

# Transform for prediction (no augmentation)
normalize_mean = [0.485, 0.456, 0.406]
normalize_std = [0.229, 0.224, 0.225]

transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=normalize_mean, std=normalize_std),
])

# Set device
device = torch.device("cuda" if torch.cuda.is_available() else 
                     "mps" if torch.backends.mps.is_available() else 
                     "cpu")
print(f"Using device: {device}")

def load_model(model_path):
    """Load the trained model with the improved architecture"""
    if resnet_model == 'ResNet18':
        weights = models.ResNet18_Weights.DEFAULT
        model = models.resnet18(weights=weights)
    elif resnet_model == 'ResNet50':
        weights = models.ResNet50_Weights.DEFAULT
        model = models.resnet50(weights=weights)
    
    # Use the same improved classifier head as in training
    model.fc = nn.Sequential(
        nn.Linear(model.fc.in_features, 512),
        nn.BatchNorm1d(512),
        nn.ReLU(),
        nn.Dropout(0.3),
        nn.Linear(512, num_classes)
    )
    
    # Load trained weights
    checkpoint = torch.load(model_path, map_location=device)
    if 'model_state_dict' in checkpoint:
        model.load_state_dict(checkpoint['model_state_dict'])
    else:
        model.load_state_dict(checkpoint)
    
    model = model.to(device)
    model.eval()  # Set to evaluation mode
    return model

def predict_single_pass(model, image_tensor):
    """Make a single prediction pass"""
    outputs = model(image_tensor)
    # Temperature scaling for sharper predictions
    temperature = 1.5
    scaled_outputs = outputs / temperature
    probabilities = F.softmax(scaled_outputs, dim=1)[0]
    return probabilities

def predict_with_tta(model, image_path, num_augmentations=5):
    """Predict with Test Time Augmentation"""
    predictions = []
    
    try:
        # Load and prepare image
        image = Image.open(image_path).convert('RGB')
        
        with torch.no_grad():  # Disable gradient computation
            # Base prediction
            base_tensor = transform(image).unsqueeze(0).to(device)
            base_pred = predict_single_pass(model, base_tensor)
            predictions.append(base_pred)
            
            # TTA predictions
            tta_transforms = [
                transforms.RandomHorizontalFlip(p=1.0),
                transforms.RandomRotation(10),
                transforms.ColorJitter(brightness=0.1),
                transforms.RandomAffine(5, translate=(0.05, 0.05)),
            ]
            
            for _ in range(num_augmentations):
                aug_tensor = base_tensor.clone()
                for t in random.sample(tta_transforms, 2):  # Apply 2 random transforms
                    aug_tensor = t(aug_tensor)
                aug_pred = predict_single_pass(model, aug_tensor)
                predictions.append(aug_pred)
            
            # Average all predictions
            final_pred = torch.mean(torch.stack(predictions), dim=0)
            predicted_class = torch.argmax(final_pred).item()
            
            return final_pred.cpu().numpy(), predicted_class
    
    except Exception as e:
        print(f"Error predicting image {image_path}: {str(e)}")
        return None, None

def predict_batch(model_path, image_paths, output_file=None):
    """Predict cities for multiple images using TTA"""
    # Load model
    model = load_model(model_path)
    
    results = []
    for image_path in image_paths:
        probabilities, predicted_class = predict_with_tta(model, image_path)
        
        if probabilities is not None:
            result = {
                'image_path': image_path,
                'predicted_class': class_names[predicted_class],
                'probabilities': {
                    class_name: float(prob) 
                    for class_name, prob in zip(class_names, probabilities)
                }
            }
            results.append(result)
            
            # Print results
            print(f"\nPredictions for {image_path}:")
            print(f"Predicted class: {class_names[predicted_class]}")
            print("Class probabilities:")
            for class_name, prob in zip(class_names, probabilities):
                print(f"{class_name}: {prob:.4f}")
    
    # Save results if output file specified
    if output_file and results:
        with open(output_file, 'w') as f:
            json.dump(results, f, indent=4)
        print(f"\nPredictions saved to {output_file}")
    
    return results

# Example usage
if __name__ == "__main__":
    # Replace these with your actual paths
    model_path = "models/softmax-ResNet50_10-ep_32-bs_25000-images_2024-12-05_16-10.pth"
    image_paths = [
        "../data/ny-brooklyn/buildings/buildings_1370.jpg",
        "../data/ny-brooklyn/buildings/buildings_152277.jpg"
    ]
    output_file = "softmax-output/resnet-softmax-predictions-test.json"
    
    results = predict_batch(model_path, image_paths, output_file)

Using device: cuda


  checkpoint = torch.load(model_path, map_location=device)



Predictions for ../data/ny-brooklyn/buildings/buildings_1370.jpg:
Predicted class: Manhattan
Class probabilities:
Boston: 0.1877
Charlotte: 0.1211
Manhattan: 0.6134
Pittsburgh: 0.0778

Predictions for ../data/ny-brooklyn/buildings/buildings_152277.jpg:
Predicted class: Manhattan
Class probabilities:
Boston: 0.2418
Charlotte: 0.1805
Manhattan: 0.4724
Pittsburgh: 0.1053

Predictions saved to softmax-output/resnet-softmax-predictions-test.json
