In [None]:
"""model_7_s3_class_weighting.ipynb

Model 7 from scratch using S3 images, CNN, 256×256 input, and class weighting to address imbalance.
"""

'model_7_s3_class_weighting.ipynb\n\nModel 7 from scratch using S3 images, CNN, 256×256 input, and class weighting to address imbalance.\n'

In [None]:
!pip install boto3
!pip install mlflow
!pip install datetime

Collecting boto3
  Downloading boto3-1.37.24-py3-none-any.whl.metadata (6.7 kB)
Collecting botocore<1.38.0,>=1.37.24 (from boto3)
  Downloading botocore-1.37.24-py3-none-any.whl.metadata (5.7 kB)
Collecting jmespath<2.0.0,>=0.7.1 (from boto3)
  Downloading jmespath-1.0.1-py3-none-any.whl.metadata (7.6 kB)
Collecting s3transfer<0.12.0,>=0.11.0 (from boto3)
  Downloading s3transfer-0.11.4-py3-none-any.whl.metadata (1.7 kB)
Downloading boto3-1.37.24-py3-none-any.whl (139 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m139.6/139.6 kB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading botocore-1.37.24-py3-none-any.whl (13.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.5/13.5 MB[0m [31m116.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading jmespath-1.0.1-py3-none-any.whl (20 kB)
Downloading s3transfer-0.11.4-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.4/84.4 kB[0m [31m11.5 MB/s[0m eta [36m0

In [None]:
import os
import random
import mlflow
import mlflow.pytorch
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

In [None]:
from torch.utils.data import Dataset, DataLoader, Subset
from torchvision import transforms
import numpy as np
import boto3
import datetime
import copy
import seaborn as sns
import matplotlib.pyplot as plt
from PIL import Image
from io import BytesIO
from collections import defaultdict
from sklearn.metrics import classification_report, confusion_matrix

In [None]:
# -------------------------------------------------------------------
# MLflow configuration
# Endpoint MLflow uses when it talks to the S3 artifact store.
os.environ['MLFLOW_S3_ENDPOINT_URL'] = 'https://s3.us-west-2.amazonaws.com'

# AWS credentials are intentionally NOT assigned here: anything typed into a
# cell is saved with the notebook and leaks through version control and shared
# copies. Export AWS_ACCESS_KEY_ID / AWS_SECRET_ACCESS_KEY before starting the
# kernel, or let boto3 resolve credentials through its default lookup
# (shared credentials file, instance role, ...).
_missing_aws_vars = [
    name
    for name in ('AWS_ACCESS_KEY_ID', 'AWS_SECRET_ACCESS_KEY')
    if not os.environ.get(name)
]
if _missing_aws_vars:
    print(
        "Note: " + ", ".join(_missing_aws_vars) + " not set in the environment; "
        "boto3 will fall back to its default credential lookup."
    )

In [None]:
# Point the MLflow client at the tracking server. This is done first so that the
# experiment selected below is looked up on that server.
mlflow.set_tracking_uri("http://*******:5000")  # Replace with your MLflow server URI if different
# Experiment under which the runs started by this notebook are recorded.
mlflow.set_experiment("Pytorch_CNN_from_Scratch_Pavement_Surface_Classification")
# Enable MLflow's PyTorch autologging.
# NOTE(review): the training loop below is a hand-written loop that logs its own
# metrics; confirm autologging actually captures anything for it.
mlflow.pytorch.autolog()



In [None]:
# -------------------------------------------------------------------
# Description for MLflow
# Free-text summary attached to the run as a tag. The fragments carry their own
# trailing spaces and are concatenated without a separator.
description = "".join([
    "This version of model_7 loads images from an S3 bucket (instead of local disk) ",
    "and applies class weighting in the loss function to address class imbalance. ",
    "Otherwise, it retains the original CNN architecture (PavementNet) and transformations ",
    "of model_7. It processes grayscale images resized/cropped to 256×256 via data augmentation, ",
    "with a 70/15/15 split, and logs metrics/artifacts to MLflow.",
])

In [None]:
# -------------------------------------------------------------------
# For reproducibility
# Seed every random number generator available to the notebook: Python's
# `random` (used for the train/val/test split), PyTorch, and NumPy.
seed = 42
random.seed(seed)
torch.manual_seed(seed)
# Kept last on purpose: it returns None, so the cell does not echo a
# Generator repr as its output.
np.random.seed(seed)

<torch._C.Generator at 0x7c91b00a4f30>

In [None]:
# ---------------------------
# 1) Define S3 dataset
class S3ImageDataset(Dataset):
    """
    Image-classification dataset whose samples are objects in an S3 bucket.

    Expects a bucket structure of the form:
       s3://<bucket>/<prefix>/<class_name>/image.jpg

    The bucket is listed once at construction time; image bytes are downloaded
    one object at a time in ``__getitem__``.

    Args:
        bucket_name: Name of the S3 bucket.
        prefix: Key prefix ("folder") that contains the class folders. Trailing
            slashes are ignored; an empty prefix lists the whole bucket.
        transform: Optional callable applied to the opened PIL image.
        s3_client: Optional S3 client to use. When omitted, one is created with
            ``boto3.client("s3")``.

    Attributes:
        samples: List of ``(s3_key, class_index)`` pairs, in listing order.
        classes: Sorted list of class names found under the prefix.
        class_to_idx: Mapping from class name to its position in ``classes``.
    """

    # Lower-case file extensions that are treated as images.
    IMAGE_EXTENSIONS = (".jpg", ".jpeg", ".png", ".bmp", ".tif", ".tiff")

    def __init__(self, bucket_name, prefix, transform=None, s3_client=None):
        super().__init__()
        self.s3 = s3_client if s3_client is not None else boto3.client("s3")
        self.bucket_name = bucket_name
        # Ensure prefix has no trailing slash
        self.prefix = prefix.rstrip("/")
        self.transform = transform

        # S3 matches Prefix as a plain string, so "data" would also return keys
        # under "data_old/". Listing with an explicit trailing slash restricts
        # the result to the intended folder.
        list_prefix = f"{self.prefix}/" if self.prefix else ""

        # List all objects in S3 under the given prefix
        paginator = self.s3.get_paginator("list_objects_v2")
        pages = paginator.paginate(Bucket=self.bucket_name, Prefix=list_prefix)

        named_samples = []  # (key, class_name) pairs in listing order
        for page in pages:
            # Pages without matching objects have no "Contents" entry.
            for obj in page.get("Contents", []):
                key = obj["Key"]
                # Check if this key points to an image
                if not key.lower().endswith(self.IMAGE_EXTENSIONS):
                    continue
                # Typically the structure is prefix/class_name/image_file, so
                # the class is the path component right before the file name.
                parts = key.split("/")
                if len(parts) >= 2:
                    named_samples.append((key, parts[-2]))

        self.classes = sorted({class_name for _, class_name in named_samples})
        self.class_to_idx = {cls_name: idx for idx, cls_name in enumerate(self.classes)}

        # Convert (key, class_name) => (key, class_index)
        self.samples = [(k, self.class_to_idx[c]) for (k, c) in named_samples]

    def __len__(self):
        """Return the number of image objects found under the prefix."""
        return len(self.samples)

    def __getitem__(self, idx):
        """Download sample ``idx`` and return ``(image, class_index)``.

        The image is a PIL image, or whatever ``transform`` returns when a
        transform was given.
        """
        s3_key, label = self.samples[idx]
        s3_obj = self.s3.get_object(Bucket=self.bucket_name, Key=s3_key)
        image_bytes = s3_obj["Body"].read()
        image = Image.open(BytesIO(image_bytes))

        if self.transform:
            image = self.transform(image)
        return image, label

In [None]:
# ---------------------------
# 2) Define the original transforms from model_7
# Training pipeline: grayscale conversion, then random augmentation, then
# conversion to a normalized tensor.
# Grayscale(num_output_channels=1) replaces a Lambda wrapping
# img.convert("L"): a named transform can be pickled, a lambda cannot.
train_transform = transforms.Compose([
    transforms.Grayscale(num_output_channels=1),      # Force grayscale
    transforms.Resize((280, 280)),                    # Resize to 280×280
    transforms.RandomCrop(256),                       # Random crop to 256×256
    transforms.RandomHorizontalFlip(),                # Random horizontal flip
    transforms.RandomRotation(10),                    # Random rotation
    transforms.ColorJitter(brightness=0.1, contrast=0.1),
    transforms.ToTensor(),                            # (1, 256, 256)
    transforms.Normalize((0.5,), (0.5,))              # Normalize
])

In [None]:
# Evaluation pipeline (validation and test): no random augmentation, only
# grayscale conversion, a fixed resize, and conversion to a normalized tensor.
# Grayscale(num_output_channels=1) replaces a Lambda wrapping
# img.convert("L"): a named transform can be pickled, a lambda cannot.
test_transform = transforms.Compose([
    transforms.Grayscale(num_output_channels=1),      # Force grayscale
    transforms.Resize((256, 256)),                    # Resize to 256×256
    transforms.ToTensor(),
    transforms.Normalize((0.5,), (0.5,))
])

In [None]:
# --------------------------------
# 3) Load the full dataset WITHOUT transform first, so we can split
# Location of the image collection in S3, given as (bucket, key prefix).
# REPLACE both values with your own bucket and prefix.
S3_BUCKET_NAME, S3_PREFIX = "myfinaldata", "finaldata"

In [None]:
# Index every image under the prefix. No transform is attached: this instance
# is only used for its sample list and class names.
full_dataset_no_transform = S3ImageDataset(S3_BUCKET_NAME, S3_PREFIX, transform=None)

# Summarize what the listing found.
for caption, value in (
    ("Classes found in S3:", full_dataset_no_transform.classes),
    ("Total images found:", len(full_dataset_no_transform)),
):
    print(caption, value)

Classes found in S3: ['asphalt', 'chip-sealed', 'gravel']
Total images found: 6500


In [None]:
# --------------------------------
# 4) 70/15/15 split across classes (stratified)
# Group sample positions by class index so each class can be split on its own.
label_to_indices = defaultdict(list)
for position, sample in enumerate(full_dataset_no_transform.samples):
    # sample is a (key, class_index) pair
    label_to_indices[sample[1]].append(position)

In [None]:
# Positions (into full_dataset_no_transform.samples) assigned to each split.
train_indices, val_indices, test_indices = [], [], []

In [None]:
# Stratified split: every class contributes 70% of its samples to train, 15% to
# validation and whatever remains to test.
# The lists are (re)created here so that re-running this cell cannot append a
# second copy of every index.
train_indices, val_indices, test_indices = [], [], []

for lbl, indices in label_to_indices.items():
    # Shuffle a copy so the per-class lists keep their original order; the
    # split then depends only on the RNG state, not on earlier runs of the cell.
    shuffled = list(indices)
    random.shuffle(shuffled)

    n = len(shuffled)
    train_count = int(0.70 * n)
    val_count   = int(0.15 * n)
    val_end     = train_count + val_count

    train_indices.extend(shuffled[:train_count])
    val_indices.extend(shuffled[train_count:val_end])
    # remainder -> test
    test_indices.extend(shuffled[val_end:])

In [None]:
# The split appended class after class; shuffle each list in place (train,
# then validation, then test) so samples are no longer grouped by class.
for split_indices in (train_indices, val_indices, test_indices):
    random.shuffle(split_indices)

In [None]:
def print_class_distribution(indices, dataset, subset_name):
    """Print how many of the samples at ``indices`` belong to each class.

    Args:
        indices: Positions into ``dataset.samples``.
        dataset: Object with ``samples`` (``(key, class_index)`` pairs) and
            ``classes`` (class names indexed by class index).
        subset_name: Label used in the heading line.

    Classes are listed in the order in which they first occur in ``indices``;
    the listing is followed by one blank line.
    """
    from collections import Counter

    per_label = Counter(dataset.samples[i][1] for i in indices)
    print(f"{subset_name} distribution:")
    for label in per_label:
        print(f"  {dataset.classes[label]}: {per_label[label]}")
    print()

In [None]:
# Report the class balance of each split, in train / validation / test order.
for split_name, split_indices in (
    ("Train Set", train_indices),
    ("Validation Set", val_indices),
    ("Test Set", test_indices),
):
    print_class_distribution(split_indices, full_dataset_no_transform, split_name)

Train Set distribution:
  asphalt: 3500
  chip-sealed: 700
  gravel: 350

Validation Set distribution:
  asphalt: 750
  chip-sealed: 150
  gravel: 75

Test Set distribution:
  asphalt: 750
  chip-sealed: 150
  gravel: 75



In [None]:
# --------------------------------
# 5) Create subsets with appropriate transforms
def create_subset(dataset, indices, transform):
    """Return a copy of ``dataset`` limited to ``indices`` with its own transform.

    Args:
        dataset: Source dataset; it is not modified.
        indices: Positions into ``dataset.samples`` to keep, in the given order.
        transform: Transform the returned dataset applies to its images.

    Returns:
        A dataset of the same type as ``dataset`` that shares its S3 client,
        bucket/prefix and class mapping, but has its own ``samples`` list and
        ``transform``.
    """
    # A shallow copy reuses everything the source dataset already built.
    # Constructing a new S3ImageDataset here would list the whole prefix in S3
    # again, only for that listing to be discarded.
    subset_ds = copy.copy(dataset)
    subset_ds.transform = transform
    # Filter only these indices
    subset_ds.samples = [dataset.samples[i] for i in indices]
    return subset_ds

In [None]:
# Training gets the augmenting transform; validation and test share the
# transform without random operations.
train_dataset, val_dataset, test_dataset = (
    create_subset(full_dataset_no_transform, subset_indices, subset_transform)
    for subset_indices, subset_transform in (
        (train_indices, train_transform),
        (val_indices, test_transform),
        (test_indices, test_transform),
    )
)

In [None]:
# Show how many samples ended up in each split. The captions are padded to the
# same width so the numbers line up.
for caption, split_dataset in (
    ("Train dataset size:", train_dataset),
    ("Val dataset size:  ", val_dataset),
    ("Test dataset size: ", test_dataset),
):
    print(caption, len(split_dataset))

Train dataset size: 4550
Val dataset size:   975
Test dataset size:  975


In [None]:
# --------------------------------
# 6) Create DataLoaders
batch_size = 32
# Only the training loader reshuffles; validation and test are read in a fixed order.
train_loader = DataLoader(dataset=train_dataset, shuffle=True, batch_size=batch_size)
val_loader, test_loader = (
    DataLoader(dataset=eval_dataset, shuffle=False, batch_size=batch_size)
    for eval_dataset in (val_dataset, test_dataset)
)

In [None]:
# Quick shape check (comment out if not needed)
# Pull a single batch from the training loader and report its tensor shapes;
# nothing is printed if the loader yields no batches.
first_batch = next(iter(train_loader), None)
if first_batch is not None:
    images, labels = first_batch
    print("Sample batch - images.shape:", images.shape, "labels.shape:", labels.shape)

Sample batch - images.shape: torch.Size([32, 1, 256, 256]) labels.shape: torch.Size([32])


In [None]:
# --------------------------------
# 7) Compute class weights from training set to address imbalance
#    We'll do inverse-frequency weighting: weight ~ 1/freq.
train_labels = [full_dataset_no_transform.samples[i][1] for i in train_indices]
# minlength keeps one slot per known class even when the highest-numbered
# classes have no training samples; without it the count vector (and the weight
# vector derived from it) could be shorter than the list of classes.
class_counts = np.bincount(train_labels, minlength=len(full_dataset_no_transform.classes))
print("Class sample counts (train subset):", class_counts)

Class sample counts (train subset): [3500  700  350]


In [None]:
# Inverse frequency
# Each class is weighted by the reciprocal of its training-sample count.
class_frequencies = torch.tensor(class_counts, dtype=torch.float)
inverse_frequencies = 1.0 / class_frequencies
# optional normalization: rescale so the weights sum to 1
weights = inverse_frequencies / inverse_frequencies.sum()
print("Class weights (inverse-freq):", weights)

Class weights (inverse-freq): tensor([0.0625, 0.3125, 0.6250])


In [None]:
# --------------------------------
# 8) Define the original CNN architecture from model_7
class PavementNet(nn.Module):
    """Small CNN for single-channel (grayscale) pavement-surface images.

    Three 3x3 convolutions, each followed by ReLU and 2x2 max pooling, then an
    adaptive average pool to 8x8 and a three-layer fully connected head that
    returns raw class logits (no softmax).

    Args:
        num_classes: Number of output logits. Defaults to 3
            (asphalt, chip-sealed, gravel).
    """

    def __init__(self, num_classes=3):
        super().__init__()
        # Convolutional layers. padding=1 with a 3x3 kernel keeps H/W; the
        # spatial size is only halved by the pooling applied in forward().
        # Shapes below are for a 256x256 input.
        self.conv1 = nn.Conv2d(1, 32, kernel_size=3, padding=1)    # (1,256,256)  -> (32,256,256)
        self.conv2 = nn.Conv2d(32, 64, kernel_size=3, padding=1)   # (32,128,128) -> (64,128,128)
        self.conv3 = nn.Conv2d(64, 128, kernel_size=3, padding=1)  # (64,64,64)   -> (128,64,64)

        self.pool = nn.MaxPool2d(2, 2)
        # After 3 x (conv+pool), 256 -> 128 -> 64 -> 32 in H/W
        # Then adaptive average pool to (8,8), which fixes the flattened size
        # regardless of the input resolution.
        self.adapt_pool = nn.AdaptiveAvgPool2d((8, 8))

        # Fully connected layers
        # Flatten => 128 * 8 * 8 = 8192
        self.fc1 = nn.Linear(128 * 8 * 8, 512)
        self.fc2 = nn.Linear(512, 256)
        self.fc3 = nn.Linear(256, num_classes)

    def forward(self, x):
        """Map a batch of shape (N, 1, H, W) to logits of shape (N, num_classes)."""
        x = self.pool(F.relu(self.conv1(x)))
        x = self.pool(F.relu(self.conv2(x)))
        x = self.pool(F.relu(self.conv3(x)))
        x = self.adapt_pool(x)    # (128,8,8)
        x = x.view(x.size(0), -1) # Flatten
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)           # logits
        return x

In [None]:
# --------------------------------
# 9) Training setup
# Use the GPU when one is available, otherwise the CPU, and place the model there.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model = PavementNet()
model = model.to(device)

In [None]:
# Use weighted cross-entropy
# The class weights are moved to the same device the model was placed on.
device_weights = weights.to(device)
criterion = nn.CrossEntropyLoss(weight=device_weights)
optimizer = optim.Adam(params=model.parameters(), lr=1e-3)

In [None]:
# Number of passes over the training set.
num_epochs = 30
# Per-epoch history; the training loop appends one value to each list per epoch.
train_losses, val_losses = [], []
train_accuracies, val_accuracies = [], []

In [None]:
# A distinctive run name
# The local timestamp (down to the second) tells repeated runs apart.
run_timestamp = datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
run_name = f"model_7_s3_cnn_grayscale_classweight_{run_timestamp}"

In [None]:
# Train for num_epochs, then evaluate on the test set, all inside one MLflow run
# so that parameters, per-epoch metrics, plots, reports and the final model are
# recorded together.
with mlflow.start_run(run_name=run_name):
    # Log a description and hyperparams
    mlflow.set_tag("description", description)
    mlflow.log_param("num_epochs", num_epochs)
    mlflow.log_param("batch_size", batch_size)
    # Read the rate from the optimizer so the logged value cannot drift from the
    # one that is actually used.
    mlflow.log_param("learning_rate", optimizer.param_groups[0]["lr"])
    mlflow.log_param("input_size", (256, 256))
    mlflow.log_param("architecture", "PavementNet (3xConv->Pool + AdaptivePool->FC)")
    mlflow.log_param("s3_bucket", S3_BUCKET_NAME)
    mlflow.log_param("s3_prefix", S3_PREFIX)
    mlflow.log_param("class_weighting", "inverse_frequency")

    # Start the per-epoch history from scratch. If this cell is run again,
    # leftover entries would make the curves longer than num_epochs and the
    # plots below would fail on mismatched x/y lengths.
    train_losses, val_losses = [], []
    train_accuracies, val_accuracies = [], []

    for epoch in range(num_epochs):
        # ------------- TRAIN -------------
        model.train()
        running_loss = 0.0
        correct_train = 0

        for images, labels in train_loader:
            images, labels = images.to(device), labels.to(device)
            optimizer.zero_grad()
            outputs = model(images)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            # The criterion returns a per-batch average; scale by the batch size
            # so the epoch figure is not skewed by a smaller final batch.
            running_loss += loss.item() * images.size(0)

            _, preds = torch.max(outputs, 1)
            correct_train += (preds == labels).sum().item()

        epoch_train_loss = running_loss / len(train_dataset)
        epoch_train_acc  = correct_train / len(train_dataset)
        train_losses.append(epoch_train_loss)
        train_accuracies.append(epoch_train_acc)

        # ------------- VALIDATION -------------
        model.eval()
        running_val_loss = 0.0
        correct_val = 0

        with torch.no_grad():
            for images, labels in val_loader:
                images, labels = images.to(device), labels.to(device)
                outputs = model(images)
                loss = criterion(outputs, labels)
                running_val_loss += loss.item() * images.size(0)
                _, preds = torch.max(outputs, 1)
                correct_val += (preds == labels).sum().item()

        epoch_val_loss = running_val_loss / len(val_dataset)
        epoch_val_acc  = correct_val / len(val_dataset)
        val_losses.append(epoch_val_loss)
        val_accuracies.append(epoch_val_acc)

        # Logging
        print(f"Epoch {epoch+1}/{num_epochs}  "
              f"Train Loss: {epoch_train_loss:.4f}  Train Acc: {epoch_train_acc:.4f}  "
              f"Val Loss: {epoch_val_loss:.4f}  Val Acc: {epoch_val_acc:.4f}")

        mlflow.log_metric("train_loss", epoch_train_loss, step=epoch)
        mlflow.log_metric("train_accuracy", epoch_train_acc, step=epoch)
        mlflow.log_metric("val_loss", epoch_val_loss, step=epoch)
        mlflow.log_metric("val_accuracy", epoch_val_acc, step=epoch)

    # --------------------------------
    # Plot & log training curves
    plt.figure()
    plt.plot(range(1, num_epochs + 1), train_losses, label="Train Loss")
    plt.plot(range(1, num_epochs + 1), val_losses, label="Val Loss")
    plt.xlabel("Epoch")
    plt.ylabel("Loss")
    plt.title("Training and Validation Loss")
    plt.legend()
    loss_curve_path = "loss_curve.png"
    plt.savefig(loss_curve_path)
    mlflow.log_artifact(loss_curve_path)
    plt.close()

    plt.figure()
    plt.plot(range(1, num_epochs + 1), train_accuracies, label="Train Accuracy")
    plt.plot(range(1, num_epochs + 1), val_accuracies, label="Val Accuracy")
    plt.xlabel("Epoch")
    plt.ylabel("Accuracy")
    plt.title("Training and Validation Accuracy")
    plt.legend()
    acc_curve_path = "accuracy_curve.png"
    plt.savefig(acc_curve_path)
    mlflow.log_artifact(acc_curve_path)
    plt.close()

    # --------------------------------
    # Evaluate on test set
    model.eval()
    all_preds = []
    all_labels = []

    with torch.no_grad():
        for images, labels in test_loader:
            images = images.to(device)
            outputs = model(images)
            _, preds = torch.max(outputs, 1)
            all_preds.extend(preds.cpu().numpy())
            # labels were never moved to the device, so no .cpu() is needed here
            all_labels.extend(labels.numpy())

    # Classification report and confusion matrix
    class_names = full_dataset_no_transform.classes
    # Pass the label ids explicitly so report rows and matrix rows/columns
    # always line up with class_names, even when some class occurs neither in
    # the test labels nor in the predictions.
    label_ids = list(range(len(class_names)))
    class_report = classification_report(
        all_labels, all_preds, labels=label_ids, target_names=class_names
    )
    print("Classification Report:\n", class_report)

    report_dict = classification_report(
        all_labels, all_preds, labels=label_ids, target_names=class_names, output_dict=True
    )
    # Computed directly instead of read from the report, so it does not depend
    # on the report containing an "accuracy" entry.
    test_accuracy = float(np.mean(np.asarray(all_preds) == np.asarray(all_labels)))
    mlflow.log_metric("test_accuracy", test_accuracy)

    # Per-class metrics (the averaged rows of the report are dicts as well and
    # are logged the same way)
    for cls, metrics in report_dict.items():
        if isinstance(metrics, dict):
            mlflow.log_metric(f"{cls}_precision", metrics.get("precision", 0))
            mlflow.log_metric(f"{cls}_recall", metrics.get("recall", 0))
            mlflow.log_metric(f"{cls}_f1-score", metrics.get("f1-score", 0))

    # Save & log classification report
    report_path = "classification_report.txt"
    with open(report_path, "w") as f:
        f.write(class_report)
    mlflow.log_artifact(report_path)

    # Confusion Matrix
    cm = confusion_matrix(all_labels, all_preds, labels=label_ids)
    plt.figure(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt="d", cmap="Blues",
                xticklabels=class_names, yticklabels=class_names)
    plt.xlabel("Predicted")
    plt.ylabel("True")
    plt.title("Confusion Matrix")
    cm_path = "confusion_matrix.png"
    plt.savefig(cm_path)
    mlflow.log_artifact(cm_path)
    plt.close()

    # Log the final trained model to MLflow
    mlflow.pytorch.log_model(model, "model")

Epoch 1/30  Train Loss: 0.9940  Train Acc: 0.6051  Val Loss: 0.8714  Val Acc: 0.1815
Epoch 2/30  Train Loss: 0.8151  Train Acc: 0.6934  Val Loss: 0.5436  Val Acc: 0.8831
Epoch 3/30  Train Loss: 0.5453  Train Acc: 0.8224  Val Loss: 0.7703  Val Acc: 0.8769
Epoch 4/30  Train Loss: 0.4100  Train Acc: 0.8809  Val Loss: 0.3163  Val Acc: 0.9138
