In [2]:
# Cell 1: Setup and Install Dependencies
import subprocess
import sys

print("Installing boto3...")
subprocess.check_call([sys.executable, "-m", "pip", "install", "-q", "boto3", "tqdm"])

import boto3
from botocore import UNSIGNED
from botocore.client import Config
import os
import json
from tqdm.auto import tqdm
import random

print("✓ All dependencies installed")
print(f"✓ boto3 version: {boto3.__version__}")

# Set random seed for reproducibility
random.seed(42)


Installing boto3...
✓ All dependencies installed
✓ boto3 version: 1.40.61


In [3]:
# Cell 2: Setup S3 Client and Directories

# Initialize S3 client (no AWS credentials needed - public bucket)
s3 = boto3.client(
    "s3", 
    config=Config(signature_version=UNSIGNED), 
    region_name="us-east-1"
)

# S3 bucket configuration
BUCKET = "aft-vbi-pds"
IMG_PREFIX = "bin-images/"
META_PREFIX = "metadata/"

# Create local directories
os.makedirs("bin-images", exist_ok=True)
os.makedirs("metadata", exist_ok=True)

print("✓ S3 client initialized")
print(f"✓ Bucket: {BUCKET}")
print(f"✓ Directories created: bin-images/ and metadata/")

# Test S3 connection
print("\nTesting S3 connection...")
try:
    response = s3.list_objects_v2(Bucket=BUCKET, Prefix=IMG_PREFIX, MaxKeys=5)
    test_files = [obj['Key'] for obj in response.get('Contents', [])]
    print(f"✓ Connection successful! Found {len(test_files)} sample files")
    print(f"  Sample: {test_files[0] if test_files else 'None'}")
except Exception as e:
    print(f"❌ Connection failed: {e}")


✓ S3 client initialized
✓ Bucket: aft-vbi-pds
✓ Directories created: bin-images/ and metadata/

Testing S3 connection...
✓ Connection successful! Found 5 sample files
  Sample: bin-images/


In [4]:
# Cell 3: List all available images in S3 bucket

print("Fetching list of all available images from S3...")
print("This may take 2-3 minutes...\n")

all_image_keys = []
continuation_token = None

# Paginate through all objects (536K+ images)
while True:
    if continuation_token:
        response = s3.list_objects_v2(
            Bucket=BUCKET, 
            Prefix=IMG_PREFIX, 
            MaxKeys=1000,
            ContinuationToken=continuation_token
        )
    else:
        response = s3.list_objects_v2(
            Bucket=BUCKET, 
            Prefix=IMG_PREFIX, 
            MaxKeys=1000
        )
    
    # Extract image keys
    for obj in response.get('Contents', []):
        key = obj['Key']
        filename = key.split('/')[-1]
        
        # Only keep .jpg files
        if filename.endswith('.jpg'):
            all_image_keys.append(key)
    
    # Print progress
    print(f"  Found {len(all_image_keys):,} images so far...", end='\r')
    
    # Check if there are more results
    if response.get('IsTruncated'):
        continuation_token = response.get('NextContinuationToken')
    else:
        break
    
    # Safety limit: stop at 100K to save time (more than enough for 10K sample)
    if len(all_image_keys) >= 100000:
        print(f"\n  Stopping at 100K images (sufficient for sampling)")
        break

print(f"\n\n✓ Total images found: {len(all_image_keys):,}")
print(f"✓ Ready to sample 10K images")


Fetching list of all available images from S3...
This may take 2-3 minutes...

  Found 100,999 images so far...
  Stopping at 100K images (sufficient for sampling)


✓ Total images found: 100,999
✓ Ready to sample 10K images


In [5]:
# Cell 4: Randomly sample 10,000 images for download

print("Selecting 10,000 random images from the 100K available...\n")

# Random sampling (seed=42 for reproducibility)
random.seed(42)
selected_keys = random.sample(all_image_keys, k=10000)

print(f"✓ Selected {len(selected_keys):,} images for download")
print(f"\nSample of selected images:")
for i in range(5):
    print(f"  {i+1}. {selected_keys[i]}")

# Extract image IDs for later use
image_ids = [key.split('/')[-1].replace('.jpg', '') for key in selected_keys]

print(f"\n✓ Ready to download 10K images + metadata")
print(f"  Estimated size: ~6-8 GB")
print(f"  Estimated time: 25-40 minutes")


Selecting 10,000 random images from the 100K available...

✓ Selected 10,000 images for download

Sample of selected images:
  1. bin-images/16685.jpg
  2. bin-images/104130.jpg
  3. bin-images/03279.jpg
  4. bin-images/179006.jpg
  5. bin-images/123472.jpg

✓ Ready to download 10K images + metadata
  Estimated size: ~6-8 GB
  Estimated time: 25-40 minutes


In [6]:
# Cell 5: Download 10,000 images and their metadata

print("="*70)
print("DOWNLOADING 10,000 IMAGES + METADATA")
print("="*70)
print("\nThis will take approximately 25-40 minutes...")
print("Progress will be shown below.\n")

downloaded_images = 0
downloaded_metadata = 0
failed_images = []
failed_metadata = []

# Download with progress bar
for key in tqdm(selected_keys, desc="Downloading", unit="file"):
    filename = key.split('/')[-1]
    image_id = filename.replace('.jpg', '')
    
    # Download image
    try:
        s3.download_file(BUCKET, key, f"bin-images/{filename}")
        downloaded_images += 1
    except Exception as e:
        failed_images.append((filename, str(e)))
    
    # Download corresponding metadata JSON
    json_key = META_PREFIX + image_id + ".json"
    try:
        s3.download_file(BUCKET, json_key, f"metadata/{image_id}.json")
        downloaded_metadata += 1
    except Exception as e:
        failed_metadata.append((image_id, str(e)))

# Summary
print(f"\n{'='*70}")
print("DOWNLOAD COMPLETE!")
print("="*70)
print(f"✓ Images downloaded: {downloaded_images:,} / 10,000")
print(f"✓ Metadata downloaded: {downloaded_metadata:,} / 10,000")

if failed_images:
    print(f"\n⚠️ Failed image downloads: {len(failed_images)}")
    if len(failed_images) <= 10:
        for fname, err in failed_images[:5]:
            print(f"  - {fname}: {err}")

if failed_metadata:
    print(f"⚠️ Failed metadata downloads: {len(failed_metadata)}")
    if len(failed_metadata) <= 10:
        for img_id, err in failed_metadata[:5]:
            print(f"  - {img_id}.json: {err}")

print(f"\n✓ Dataset ready in: ./bin-images/ and ./metadata/")


DOWNLOADING 10,000 IMAGES + METADATA

This will take approximately 25-40 minutes...
Progress will be shown below.



Downloading:   0%|          | 0/10000 [00:00<?, ?file/s]


DOWNLOAD COMPLETE!
✓ Images downloaded: 10,000 / 10,000
✓ Metadata downloaded: 10,000 / 10,000

✓ Dataset ready in: ./bin-images/ and ./metadata/


In [9]:
import torch
import numpy as np
from PIL import Image
from pathlib import Path
import json


In [10]:
# Cell 4: Custom Dataset Class for Bin Images (CORRECTED)
import os
import json
from pathlib import Path
from PIL import Image
from torch.utils.data import Dataset

class BinDataset(Dataset):
    """Dataset class for bin images and metadata from separate directories"""
    def __init__(self, img_path='bin-images', meta_path='metadata', target_size=(416, 416)):
        self.img_path = Path(img_path)
        self.meta_path = Path(meta_path)
        self.target_size = target_size
        self.samples = []
        
        # Find all image files
        image_files = sorted(list(self.img_path.glob('*.jpg')))
        print(f"Found {len(image_files)} images in {self.img_path}")
        
        # Load metadata for each image
        for img_file in image_files:
            # Extract image ID from filename (e.g., "12345.jpg" -> "12345")
            image_id = img_file.stem
            json_path = self.meta_path / f"{image_id}.json"
            
            # Check if corresponding JSON metadata exists
            if json_path.exists():
                try:
                    with open(json_path, 'r') as f:
                        metadata = json.load(f)
                    
                    # Create sample entry
                    self.samples.append({
                        'image_path': str(img_file),
                        'metadata': metadata,
                        'bin_id': metadata.get('bin_FBAIID', 'unknown'),
                        'expected_qty': int(metadata.get('expected_quantity', 1)),
                        'image_id': image_id
                    })
                except Exception as e:
                    print(f"⚠ Error loading {json_path}: {e}")
                    continue
        
        print(f"✓ Loaded {len(self.samples)} samples with metadata")
    
    def __len__(self):
        return len(self.samples)
    
    def __getitem__(self, idx):
        sample_info = self.samples[idx]
        
        # Load image
        try:
            image = Image.open(sample_info['image_path']).convert('RGB')
            image = image.resize(self.target_size)
            image = np.array(image) / 255.0  # Normalize to [0, 1]
            image = torch.FloatTensor(image).permute(2, 0, 1)  # HWC to CHW
        except Exception as e:
            print(f"Error loading image {sample_info['image_path']}: {e}")
            # Return a blank image if loading fails
            image = torch.zeros(3, *self.target_size)
        
        # Get expected quantity
        expected_qty = float(sample_info['expected_qty'])
        
        return {
            'image': image,
            'expected_qty': torch.FloatTensor([expected_qty]),
            'bin_id': sample_info['bin_id'],
            'image_id': sample_info['image_id'],
            'metadata': sample_info['metadata']
        }

# Test dataset loading
print("\n" + "="*70)
print("TESTING DATASET LOADING")
print("="*70 + "\n")

# Verify directories exist
img_dir = Path('bin-images')
meta_dir = Path('metadata')

print(f"Image directory: {img_dir.absolute()}")
print(f"  Exists: {img_dir.exists()}")
if img_dir.exists():
    jpg_count = len(list(img_dir.glob('*.jpg')))
    print(f"  JPG files: {jpg_count}")

print(f"\nMetadata directory: {meta_dir.absolute()}")
print(f"  Exists: {meta_dir.exists()}")
if meta_dir.exists():
    json_count = len(list(meta_dir.glob('*.json')))
    print(f"  JSON files: {json_count}")

print()

# Create dataset
try:
    dataset = BinDataset(img_path='bin-images', meta_path='metadata', target_size=(416, 416))
    
    if len(dataset) > 0:
        print(f"\n✓ Dataset successfully created with {len(dataset)} images\n")
        
        # Show sample
        sample = dataset[0]
        print(f"✓ Sample shape: {sample['image'].shape}")
        print(f"✓ Expected quantity: {sample['expected_qty'].item():.1f}")
        print(f"✓ Bin ID: {sample['bin_id']}")
        print(f"✓ Image ID: {sample['image_id']}")
        
        # Display sample statistics
        print(f"\n✓ First 5 samples:")
        for i in range(min(5, len(dataset))):
            s = dataset[i]
            print(f"  {i+1}. ID: {s['image_id']}, Qty: {s['expected_qty'].item():.0f}, Bin: {s['bin_id']}")
    else:
        print("⚠ Dataset is empty! Check if both directories exist and have files")
        
except Exception as e:
    print(f"✗ Error creating dataset: {e}")
    import traceback
    traceback.print_exc()



TESTING DATASET LOADING

Image directory: /kaggle/working/bin-images
  Exists: True
  JPG files: 10000

Metadata directory: /kaggle/working/metadata
  Exists: True
  JSON files: 10000

Found 10000 images in bin-images
✓ Loaded 10000 samples with metadata

✓ Dataset successfully created with 10000 images

✓ Sample shape: torch.Size([3, 416, 416])
✓ Expected quantity: 1.0
✓ Bin ID: unknown
✓ Image ID: 00013

✓ First 5 samples:
  1. ID: 00013, Qty: 1, Bin: unknown
  2. ID: 00023, Qty: 1, Bin: unknown
  3. ID: 00025, Qty: 1, Bin: unknown
  4. ID: 00026, Qty: 1, Bin: unknown
  5. ID: 00038, Qty: 1, Bin: unknown


In [11]:
# Cell 1: Imports and device

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, random_split
from torch.cuda.amp import GradScaler
from torch.amp import autocast   # new API
from torchvision import transforms

import numpy as np
from tqdm.auto import tqdm
import json
import matplotlib.pyplot as plt

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print("Device:", device)
if torch.cuda.is_available():
    print("GPU:", torch.cuda.get_device_name(0))


Device: cuda
GPU: Tesla T4


In [12]:
# Cell 2: ResNet-style model from ABID (adapted)

class ResidualBlock(nn.Module):
    def __init__(self, in_channels, out_channels, stride=1):
        super().__init__()
        self.conv1 = nn.Conv2d(in_channels, out_channels, 3, stride=stride, padding=1, bias=False)
        self.bn1   = nn.BatchNorm2d(out_channels)
        self.relu  = nn.ReLU(inplace=True)
        self.conv2 = nn.Conv2d(out_channels, out_channels, 3, stride=1, padding=1, bias=False)
        self.bn2   = nn.BatchNorm2d(out_channels)

        self.downsample = None
        if stride != 1 or in_channels != out_channels:
            self.downsample = nn.Sequential(
                nn.Conv2d(in_channels, out_channels, 1, stride=stride, bias=False),
                nn.BatchNorm2d(out_channels)
            )

    def forward(self, x):
        identity = x

        out = self.conv1(x)
        out = self.bn1(out)
        out = self.relu(out)

        out = self.conv2(out)
        out = self.bn2(out)

        if self.downsample is not None:
            identity = self.downsample(x)

        out += identity
        out = self.relu(out)
        return out


class ImprovedQuantityPredictor(nn.Module):
    """
    Light ResNet-style regressor, based on the ABID repo but adapted to 416x416 and regression.
    """
    def __init__(self):
        super().__init__()

        # Stem
        self.conv1 = nn.Sequential(
            nn.Conv2d(3, 64, 7, stride=2, padding=3, bias=False),
            nn.BatchNorm2d(64),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(3, stride=2, padding=1)
        )

        # Residual layers
        self.layer1 = self._make_layer(64,  64,  blocks=2, stride=1)
        self.layer2 = self._make_layer(64,  128, blocks=2, stride=2)
        self.layer3 = self._make_layer(128, 256, blocks=2, stride=2)
        self.layer4 = self._make_layer(256, 512, blocks=2, stride=2)

        # Global pooling
        self.avgpool = nn.AdaptiveAvgPool2d((1, 1))

        # Regression head
        self.regressor = nn.Sequential(
            nn.Linear(512, 256),
            nn.ReLU(True),
            nn.Dropout(0.5),
            nn.Linear(256, 128),
            nn.ReLU(True),
            nn.Dropout(0.3),
            nn.Linear(128, 1)
        )

    def _make_layer(self, in_c, out_c, blocks, stride):
        layers = [ResidualBlock(in_c, out_c, stride)]
        for _ in range(1, blocks):
            layers.append(ResidualBlock(out_c, out_c, 1))
        return nn.Sequential(*layers)

    def forward(self, x):
        x = self.conv1(x)
        x = self.layer1(x)
        x = self.layer2(x)
        x = self.layer3(x)
        x = self.layer4(x)
        x = self.avgpool(x)
        x = torch.flatten(x, 1)
        x = self.regressor(x)
        return x


# Quick sanity check
print("Testing model creation...")
test_model = ImprovedQuantityPredictor()
print("Params:", sum(p.numel() for p in test_model.parameters()))
dummy = torch.randn(1, 3, 416, 416)
print("Output shape:", test_model(dummy).shape)


Testing model creation...
Params: 11340865
Output shape: torch.Size([1, 1])


In [13]:
# Cell 3: Ensure BinDataset __getitem__ returns clean dict (no 'metadata' field)

# This assumes you already built self.samples in __init__ as:
#   { 'image_path', 'expected_qty', 'bin_id', 'image_id', 'metadata' }

from PIL import Image

class BinDataset(torch.utils.data.Dataset):
    def __init__(self, img_path='bin-images', meta_path='metadata', target_size=(416, 416)):
        self.img_path = Path(img_path)
        self.meta_path = Path(meta_path)
        self.target_size = target_size
        self.samples = []

        image_files = sorted(self.img_path.glob('*.jpg'))
        print(f"Found {len(image_files)} images in {self.img_path}")

        for img_file in image_files:
            image_id = img_file.stem
            json_path = self.meta_path / f"{image_id}.json"
            if not json_path.exists():
                continue
            try:
                with open(json_path, 'r') as f:
                    meta = json.load(f)
                self.samples.append({
                    'image_path': str(img_file),
                    'expected_qty': int(meta.get('EXPECTED_QUANTITY', meta.get('expected_quantity', 1))),
                    'bin_id': meta.get('BIN_FBAIID', 'unknown'),
                    'image_id': image_id,
                })
            except Exception as e:
                print("Error loading", json_path, e)

        print(f"✓ Loaded {len(self.samples)} samples")

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        info = self.samples[idx]
        # image
        img = Image.open(info['image_path']).convert('RGB')
        img = img.resize(self.target_size)
        img = np.array(img) / 255.0
        img = torch.FloatTensor(img).permute(2, 0, 1)
        # label
        qty = float(info['expected_qty'])
        return {
            'image': img,
            'expected_qty': torch.tensor([qty], dtype=torch.float32),
            'bin_id': info['bin_id'],
            'image_id': info['image_id'],
        }

# Recreate dataset once with this class
dataset = BinDataset('bin-images', 'metadata', target_size=(416, 416))
print("Dataset size:", len(dataset))


Found 10000 images in bin-images
✓ Loaded 10000 samples
Dataset size: 10000


In [14]:
# Cell 4: Train/val/test split and DataLoaders

print(f"Using existing dataset with {len(dataset)} samples")

train_size = int(0.8 * len(dataset))
val_size   = int(0.1 * len(dataset))
test_size  = len(dataset) - train_size - val_size

train_ds, val_ds, test_ds = random_split(
    dataset,
    [train_size, val_size, test_size],
    generator=torch.Generator().manual_seed(42),
)

print(f"✓ Train: {len(train_ds)}, Val: {len(val_ds)}, Test: {len(test_ds)}")

def collate_dict(batch):
    images = torch.stack([b['image'] for b in batch])
    targets = torch.stack([b['expected_qty'] for b in batch])
    bin_ids = [b['bin_id'] for b in batch]
    image_ids = [b['image_id'] for b in batch]
    return {
        'image': images,
        'expected_qty': targets,
        'bin_id': bin_ids,
        'image_id': image_ids,
    }

train_loader = DataLoader(train_ds, batch_size=32, shuffle=True,
                          num_workers=2, pin_memory=True,
                          collate_fn=collate_dict)
val_loader   = DataLoader(val_ds, batch_size=32, shuffle=False,
                          num_workers=2, pin_memory=True,
                          collate_fn=collate_dict)
test_loader  = DataLoader(test_ds, batch_size=32, shuffle=False,
                          num_workers=2, pin_memory=True,
                          collate_fn=collate_dict)

print("✓ Data loaders created")


Using existing dataset with 10000 samples
✓ Train: 8000, Val: 1000, Test: 1000
✓ Data loaders created


In [17]:
def train_advanced_model(model, train_loader, val_loader, num_epochs):
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)

    criterion = nn.MSELoss()
    optimizer = optim.AdamW(model.parameters(), lr=1e-3, weight_decay=1e-4)
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=3)

    scaler = GradScaler() if torch.cuda.is_available() else None   


    print("\n" + "="*70)
    print("ADVANCED TRAINING WITH GPU")
    print("="*70)
    print("Device:", device)
    if torch.cuda.is_available():
        props = torch.cuda.get_device_properties(0)
        print("GPU:", torch.cuda.get_device_name(0))
        print(f"GPU Memory: {props.total_memory / 1e9:.2f} GB")
    print("Model params:", sum(p.numel() for p in model.parameters()))
    print("="*70 + "\n")

    best_val_mae = float('inf')
    history = {
        'train_loss': [], 'val_loss': [],
        'train_mae': [],  'val_mae': [],
        'lr': [],
    }

    for epoch in range(num_epochs):
        # ------------------ TRAIN ------------------
        model.train()
        train_loss = 0.0
        train_mae  = 0.0
        n_batches  = 0

        pbar = tqdm(train_loader, desc=f"Epoch {epoch+1}/{num_epochs} [Train]", leave=False)
        for batch in pbar:
            images = batch['image'].to(device)
            targets = batch['expected_qty'].to(device).squeeze()

            optimizer.zero_grad()

            if scaler:
                with autocast(device_type='cuda', dtype=torch.float16):
                    outputs = model(images).squeeze()
                    loss = criterion(outputs, targets)
                scaler.scale(loss).backward()
                scaler.unscale_(optimizer)
                torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
                scaler.step(optimizer)
                scaler.update()
            else:
                outputs = model(images).squeeze()
                loss = criterion(outputs, targets)
                loss.backward()
                torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
                optimizer.step()

            mae = torch.abs(outputs - targets).mean().item()
            train_loss += loss.item()
            train_mae  += mae
            n_batches  += 1

            pbar.set_postfix(loss=f"{loss.item():.4f}", mae=f"{mae:.2f}")

        train_loss /= n_batches
        train_mae  /= n_batches

        # ------------------ VALIDATION ------------------
        model.eval()
        val_loss = 0.0
        val_mae  = 0.0
        n_batches = 0

        with torch.no_grad():
            pbar = tqdm(val_loader, desc=f"Epoch {epoch+1}/{num_epochs} [Val]", leave=False)
            for batch in pbar:
                images = batch['image'].to(device)
                targets = batch['expected_qty'].to(device).squeeze()

                outputs = model(images).squeeze()
                loss = criterion(outputs, targets)
                mae  = torch.abs(outputs - targets).mean().item()

                val_loss += loss.item()
                val_mae  += mae
                n_batches += 1

                pbar.set_postfix(val_loss=f"{loss.item():.4f}", val_mae=f"{mae:.2f}")

        val_loss /= n_batches
        val_mae  /= n_batches

        scheduler.step(val_mae)
        current_lr = optimizer.param_groups[0]['lr']

        history['train_loss'].append(train_loss)
        history['val_loss'].append(val_loss)
        history['train_mae'].append(train_mae)
        history['val_mae'].append(val_mae)
        history['lr'].append(current_lr)

        print(f"Epoch {epoch+1}/{num_epochs} | "
              f"Train Loss: {train_loss:.4f}, MAE: {train_mae:.4f} | "
              f"Val Loss: {val_loss:.4f}, MAE: {val_mae:.4f} | "
              f"LR: {current_lr:.6f}")

        if val_mae < best_val_mae:
            best_val_mae = val_mae
            torch.save({
                'epoch': epoch,
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'val_mae': val_mae,
                'history': history,
            }, 'best_quantity_model_gpu.pth')
            print(f" ✓ Saved best model (MAE: {val_mae:.4f})")

    print("\n" + "="*70)
    print("TRAINING COMPLETE")
    print(f"Best Val MAE: {best_val_mae:.4f}")
    print("="*70)

    return history, best_val_mae


In [18]:
# Cell 6: Create model, train, and save history

model = ImprovedQuantityPredictor()

history, best_mae = train_advanced_model(
    model,
    train_loader,
    val_loader,
    num_epochs=25,   # adjust for assignment
)

import json
with open('training_history_gpu.json', 'w') as f:
    json.dump(history, f, indent=2)

print("Best MAE:", best_mae)
print("Saved best model to best_quantity_model_gpu.pth")
print("Saved history to training_history_gpu.json")



ADVANCED TRAINING WITH GPU
Device: cuda
GPU: Tesla T4
GPU Memory: 15.83 GB
Model params: 11340865



  scaler = GradScaler() if torch.cuda.is_available() else None


Epoch 1/25 [Train]:   0%|          | 0/250 [00:00<?, ?it/s]

Epoch 1/25 [Val]:   0%|          | 0/32 [00:00<?, ?it/s]

Epoch 1/25 | Train Loss: 23.3832, MAE: 2.8985 | Val Loss: 17.1247, MAE: 2.7324 | LR: 0.001000
 ✓ Saved best model (MAE: 2.7324)


Epoch 2/25 [Train]:   0%|          | 0/250 [00:00<?, ?it/s]

Epoch 2/25 [Val]:   0%|          | 0/32 [00:00<?, ?it/s]

Epoch 2/25 | Train Loss: 21.7745, MAE: 2.7379 | Val Loss: 17.5800, MAE: 2.9793 | LR: 0.001000


Epoch 3/25 [Train]:   0%|          | 0/250 [00:00<?, ?it/s]

Epoch 3/25 [Val]:   0%|          | 0/32 [00:00<?, ?it/s]

Epoch 3/25 | Train Loss: 21.4462, MAE: 2.6711 | Val Loss: 17.3396, MAE: 2.8664 | LR: 0.001000


Epoch 4/25 [Train]:   0%|          | 0/250 [00:00<?, ?it/s]

Epoch 4/25 [Val]:   0%|          | 0/32 [00:00<?, ?it/s]

Epoch 4/25 | Train Loss: 20.9025, MAE: 2.6245 | Val Loss: 15.7728, MAE: 2.4391 | LR: 0.001000
 ✓ Saved best model (MAE: 2.4391)


Epoch 5/25 [Train]:   0%|          | 0/250 [00:00<?, ?it/s]

Epoch 5/25 [Val]:   0%|          | 0/32 [00:00<?, ?it/s]

Epoch 5/25 | Train Loss: 20.6277, MAE: 2.6063 | Val Loss: 19.1615, MAE: 3.3241 | LR: 0.001000


Epoch 6/25 [Train]:   0%|          | 0/250 [00:00<?, ?it/s]

Epoch 6/25 [Val]:   0%|          | 0/32 [00:00<?, ?it/s]

Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x7e3f5cdfafc0>
Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/torch/utils/data/dataloader.py", line 1618, in __del__
Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x7e3f5cdfafc0>
Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/torch/utils/data/dataloader.py", line 1618, in __del__
    self._shutdown_workers()
  File "/usr/local/lib/python3.11/dist-packages/torch/utils/data/dataloader.py", line 1564, in _shutdown_workers
    self._pin_memory_thread.join()
  File "/usr/lib/python3.11/threading.py", line 1116, in join
    raise RuntimeError("cannot join current thread")
RuntimeError: cannot join current thread
    self._shutdown_workers()
  File "/usr/local/lib/python3.11/dist-packages/torch/utils/data/dataloader.py", line 1601, in _shutdown_workers
    if w.is_alive():
       ^^^^^^^^^^^^
  File "/usr/lib/python3.11/

Epoch 6/25 | Train Loss: 20.4321, MAE: 2.5835 | Val Loss: 15.2158, MAE: 2.1922 | LR: 0.001000
 ✓ Saved best model (MAE: 2.1922)


Epoch 7/25 [Train]:   0%|          | 0/250 [00:00<?, ?it/s]

Epoch 7/25 [Val]:   0%|          | 0/32 [00:00<?, ?it/s]

Epoch 7/25 | Train Loss: 20.1516, MAE: 2.5642 | Val Loss: 15.7566, MAE: 2.3781 | LR: 0.001000


Epoch 8/25 [Train]:   0%|          | 0/250 [00:00<?, ?it/s]

Epoch 8/25 [Val]:   0%|          | 0/32 [00:00<?, ?it/s]

Epoch 8/25 | Train Loss: 20.0648, MAE: 2.5122 | Val Loss: 17.6662, MAE: 2.4657 | LR: 0.001000


Epoch 9/25 [Train]:   0%|          | 0/250 [00:00<?, ?it/s]

Epoch 9/25 [Val]:   0%|          | 0/32 [00:00<?, ?it/s]

Epoch 9/25 | Train Loss: 20.3641, MAE: 2.5660 | Val Loss: 18.8270, MAE: 2.4505 | LR: 0.001000


Epoch 10/25 [Train]:   0%|          | 0/250 [00:00<?, ?it/s]

Epoch 10/25 [Val]:   0%|          | 0/32 [00:00<?, ?it/s]

Epoch 10/25 | Train Loss: 20.0009, MAE: 2.5088 | Val Loss: 15.8118, MAE: 2.2202 | LR: 0.000500


Epoch 11/25 [Train]:   0%|          | 0/250 [00:00<?, ?it/s]

Epoch 11/25 [Val]:   0%|          | 0/32 [00:00<?, ?it/s]

Epoch 11/25 | Train Loss: 19.1465, MAE: 2.4251 | Val Loss: 15.2480, MAE: 2.1433 | LR: 0.000500
 ✓ Saved best model (MAE: 2.1433)


Epoch 12/25 [Train]:   0%|          | 0/250 [00:00<?, ?it/s]

Epoch 12/25 [Val]:   0%|          | 0/32 [00:00<?, ?it/s]

Epoch 12/25 | Train Loss: 19.0021, MAE: 2.3969 | Val Loss: 15.9350, MAE: 2.1581 | LR: 0.000500


Epoch 13/25 [Train]:   0%|          | 0/250 [00:00<?, ?it/s]

Epoch 13/25 [Val]:   0%|          | 0/32 [00:00<?, ?it/s]

Epoch 13/25 | Train Loss: 18.9918, MAE: 2.4006 | Val Loss: 15.0468, MAE: 2.1115 | LR: 0.000500
 ✓ Saved best model (MAE: 2.1115)


Epoch 14/25 [Train]:   0%|          | 0/250 [00:00<?, ?it/s]

Epoch 14/25 [Val]:   0%|          | 0/32 [00:00<?, ?it/s]

Epoch 14/25 | Train Loss: 19.0126, MAE: 2.3917 | Val Loss: 14.1288, MAE: 2.1381 | LR: 0.000500


Epoch 15/25 [Train]:   0%|          | 0/250 [00:00<?, ?it/s]

Epoch 15/25 [Val]:   0%|          | 0/32 [00:00<?, ?it/s]

Epoch 15/25 | Train Loss: 18.5831, MAE: 2.3586 | Val Loss: 14.9543, MAE: 2.1021 | LR: 0.000500
 ✓ Saved best model (MAE: 2.1021)


Epoch 16/25 [Train]:   0%|          | 0/250 [00:00<?, ?it/s]

Epoch 16/25 [Val]:   0%|          | 0/32 [00:00<?, ?it/s]

Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x7e3f5cdfafc0>
Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/torch/utils/data/dataloader.py", line 1618, in __del__
    self._shutdown_workers()
  File "/usr/local/lib/python3.11/dist-packages/torch/utils/data/dataloader.py", line 1601, in _shutdown_workers
    if w.is_alive():
       ^^Exception ignored in: ^<function _MultiProcessingDataLoaderIter.__del__ at 0x7e3f5cdfafc0>^
^Traceback (most recent call last):
^  File "/usr/local/lib/python3.11/dist-packages/torch/utils/data/dataloader.py", line 1618, in __del__
^^    ^^^self._shutdown_workers()^
  File "/usr/local/lib/python3.11/dist-packages/torch/utils/data/dataloader.py", line 1601, in _shutdown_workers

      File "/usr/lib/python3.11/multiprocessing/process.py", line 160, in is_alive
if w.is_alive():    
 assert self._parent_pid == os.getpid(), 'can only test a child process'
           ^ ^^ ^ ^  ^ ^^^^^^^^^^^^^
  Fil

Epoch 16/25 | Train Loss: 18.0632, MAE: 2.3472 | Val Loss: 14.8779, MAE: 2.1960 | LR: 0.000500


Epoch 17/25 [Train]:   0%|          | 0/250 [00:00<?, ?it/s]

Epoch 17/25 [Val]:   0%|          | 0/32 [00:00<?, ?it/s]

Epoch 17/25 | Train Loss: 18.2458, MAE: 2.3421 | Val Loss: 49.5115, MAE: 5.8694 | LR: 0.000500


Epoch 18/25 [Train]:   0%|          | 0/250 [00:00<?, ?it/s]

Epoch 18/25 [Val]:   0%|          | 0/32 [00:00<?, ?it/s]

Epoch 18/25 | Train Loss: 18.1195, MAE: 2.3203 | Val Loss: 15.0237, MAE: 2.1364 | LR: 0.000500


Epoch 19/25 [Train]:   0%|          | 0/250 [00:00<?, ?it/s]

Epoch 19/25 [Val]:   0%|          | 0/32 [00:00<?, ?it/s]

Epoch 19/25 | Train Loss: 17.7750, MAE: 2.2966 | Val Loss: 14.0384, MAE: 2.1294 | LR: 0.000250


Epoch 20/25 [Train]:   0%|          | 0/250 [00:00<?, ?it/s]

Epoch 20/25 [Val]:   0%|          | 0/32 [00:00<?, ?it/s]

Epoch 20/25 | Train Loss: 17.0532, MAE: 2.2529 | Val Loss: 14.1386, MAE: 2.1016 | LR: 0.000250
 ✓ Saved best model (MAE: 2.1016)


Epoch 21/25 [Train]:   0%|          | 0/250 [00:00<?, ?it/s]

Epoch 21/25 [Val]:   0%|          | 0/32 [00:00<?, ?it/s]

Epoch 21/25 | Train Loss: 16.6344, MAE: 2.1933 | Val Loss: 14.8156, MAE: 2.1046 | LR: 0.000250


Epoch 22/25 [Train]:   0%|          | 0/250 [00:00<?, ?it/s]

Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x7e3f5cdfafc0>
Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/torch/utils/data/dataloader.py", line 1618, in __del__
    self._shutdown_workers()
  File "/usr/local/lib/python3.11/dist-packages/torch/utils/data/dataloader.py", line 1601, in _shutdown_workers
if w.is_alive():    
       ^^^^^^^^^^^^
  File "/usr/lib/python3.11/multiprocessing/process.py", line 160, in is_alive
    assert self._parent_pid == os.getpid(), 'can only test a child process'
           ^^^^^^^^Exception ignored in: ^<function _MultiProcessingDataLoaderIter.__del__ at 0x7e3f5cdfafc0>
^Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/torch/utils/data/dataloader.py", line 1618, in __del__
^    ^self._shutdown_workers()^
^  File "/usr/local/lib/python3.11/dist-packages/torch/utils/data/dataloader.py", line 1601, in _shutdown_workers
^    ^if w.is_alive():^^
 ^ ^ ^ ^^ ^ ^^

Epoch 22/25 [Val]:   0%|          | 0/32 [00:00<?, ?it/s]

Epoch 22/25 | Train Loss: 16.2787, MAE: 2.1748 | Val Loss: 16.5927, MAE: 2.2216 | LR: 0.000250


Epoch 23/25 [Train]:   0%|          | 0/250 [00:00<?, ?it/s]

Epoch 23/25 [Val]:   0%|          | 0/32 [00:00<?, ?it/s]

Epoch 23/25 | Train Loss: 15.8741, MAE: 2.1420 | Val Loss: 17.3253, MAE: 2.3057 | LR: 0.000250


Epoch 24/25 [Train]:   0%|          | 0/250 [00:00<?, ?it/s]

Epoch 24/25 [Val]:   0%|          | 0/32 [00:00<?, ?it/s]

Epoch 24/25 | Train Loss: 15.0419, MAE: 2.1210 | Val Loss: 16.2388, MAE: 2.2466 | LR: 0.000125


Epoch 25/25 [Train]:   0%|          | 0/250 [00:00<?, ?it/s]

Epoch 25/25 [Val]:   0%|          | 0/32 [00:00<?, ?it/s]

Epoch 25/25 | Train Loss: 14.2919, MAE: 2.0276 | Val Loss: 15.6870, MAE: 2.1949 | LR: 0.000125

TRAINING COMPLETE
Best Val MAE: 2.1016
Best MAE: 2.101631049066782
Saved best model to best_quantity_model_gpu.pth
Saved history to training_history_gpu.json


In [20]:
torch.save(model.state_dict(), 'quantity_model.pth')
