## Import libraries 

In [2]:
import os
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Sampler
from torch.utils.data import Dataset
from PIL import Image
from torchvision import transforms
from tqdm import tqdm  # Import tqdm for progress visualization
from models.cnn_models import SimpleCNN
from collections import Counter
import random

## Define filepaths as constant

In [3]:
# Define file paths as constants
CSV_FILE_PATH = r'C:\Users\Sandhra George\avalanche\data\dataset.csv'
ROOT_DIR_PATH = r'C:\Users\Sandhra George\avalanche'

csv_file = r'C:\Users\Sandhra George\avalanche\data\dataset.csv'  # Path to the CSV file
root_dir = r'C:\Users\Sandhra George\avalanche\caxton_dataset\print0'  # Path to the image directory

## Load data into DataFrame

In [4]:
# Load data into a DataFrame for easier processing
data = pd.read_csv(CSV_FILE_PATH)

# Limit dataset to the first 3084 images (excluding header)
data_limited = data.iloc[0:3085].reset_index(drop=True)

# Filter the dataset to only include images containing "print0"
data_filtered = data_limited[data_limited.iloc[:, 0].str.contains('print0', na=False)]

# Update the first column to contain only the image filenames
data_filtered.iloc[:, 0] = data_filtered.iloc[:, 0].str.replace(r'.*?/(image-\d+\.jpg)', r'\1', regex=True)

# Display the updated DataFrame
print("First rows of filtered DataFrame:")
print(data_filtered.head())

# Display the last few rows of the updated DataFrame
print("\nLast rows of filtered DataFrame:")
print(data_filtered.tail())

First rows of filtered DataFrame:
       img_path               timestamp  flow_rate  feed_rate  z_offset  \
0   image-6.jpg  2020-10-08T13:12:50-34        100        100       0.0   
1   image-7.jpg  2020-10-08T13:12:50-80        100        100       0.0   
2   image-8.jpg  2020-10-08T13:12:51-27        100        100       0.0   
3   image-9.jpg  2020-10-08T13:12:51-74        100        100       0.0   
4  image-10.jpg  2020-10-08T13:12:52-20        100        100       0.0   

   target_hotend  hotend    bed  nozzle_tip_x  nozzle_tip_y  img_num  \
0          205.0  204.13  65.74           531           554        5   
1          205.0  204.13  65.74           531           554        6   
2          205.0  204.24  65.84           531           554        7   
3          205.0  204.24  65.84           531           554        8   
4          205.0  204.24  65.84           531           554        9   

   print_id  flow_rate_class  feed_rate_class  z_offset_class  hotend_class  \
0  

### Analysing the hotend temperature column

In [5]:
# Extract unique temperatures in the 'target_hotend' column and sort them
unique_temperatures = sorted(data_filtered['target_hotend'].unique())  # Sort temperatures in ascending order

# Calculate the full range of temperatures (min and max)
temperature_min = data_filtered['target_hotend'].min()
temperature_max = data_filtered['target_hotend'].max()

# Print the unique temperatures (sorted), count, and full range
print("\nUnique target hotend temperatures in the dataset (sorted):")
print(unique_temperatures)
print(f"\nNumber of unique target hotend temperatures: {len(unique_temperatures)}")
print(f"Temperature range: {temperature_min}° to {temperature_max}°")


Unique target hotend temperatures in the dataset (sorted):
[181.0, 183.0, 187.0, 189.0, 194.0, 196.0, 205.0, 206.0, 209.0, 210.0, 214.0, 220.0, 226.0, 227.0, 228.0, 229.0, 230.0]

Number of unique target hotend temperatures: 17
Temperature range: 181.0° to 230.0°


## Creating a "random" temperature sub list

In [32]:
# Check if we have enough unique temperatures to select from
if len(unique_temperatures) >= 13:
    # Select the lowest and highest temperatures
    temperature_sublist = [temperature_min, temperature_max]

    # Remove the lowest and highest temperatures from the unique temperatures list
    remaining_temperatures = [temp for temp in unique_temperatures if temp != temperature_min and temp != temperature_max]

    # Randomly select 11 other temperatures from the remaining ones
    random_temperatures = random.sample(remaining_temperatures, 11)

    # Add the random temperatures to the temperature_sublist
    temperature_sublist.extend(random_temperatures)
    
    # Sort from lowest to highest hotend temperature
    temperature_sublist = sorted(temperature_sublist)

    # Print the temperature sublist
    print("\nTemperature sublist:")
    print(temperature_sublist)
else:
    print("Not enough unique temperatures to select from. At least 13 unique temperatures are required.")


Temperature sublist:
[181.0, 183.0, 187.0, 189.0, 194.0, 196.0, 206.0, 209.0, 210.0, 220.0, 226.0, 228.0, 230.0]


## Split the dataset into separate DataFrames for each class

In [33]:
# Initialise a dictionary to store DataFrames for each class and temperature combination
class_temperature_datasets = {}

# Iterate over all temperatures in the temperature_sublist
for temp in temperature_sublist:
    print(f"Processing temperature: {temp}°")
    
    # Filter the dataset for the current temperature
    temp_filtered = data_filtered[data_filtered['target_hotend'] == temp]
    
    # Now, iterate over all classes (0, 1, 2)
    for class_id in [0, 1, 2]:  # Ensure we process all classes: 0, 1, 2
        # Filter the data for the current class
        class_temp_data = temp_filtered[temp_filtered['hotend_class'] == class_id]
        
        if class_temp_data.empty:
            # If there are no images for this class at the current temperature, print a message
            print(f"Class {class_id} at {temp}° dataset size: 0")
        else:
            # Shuffle the data (if needed) and store it in the dictionary
            class_temperature_datasets[(class_id, temp)] = class_temp_data.sample(frac=1, random_state=42)
            print(f"Class {class_id} at {temp}° dataset size: {len(class_temp_data)}")
    
# Print the size of each class-temperature dataset (even if the size is 0)
for temp in temperature_sublist:
    print(f"\nSummary for Temperature: {temp}°")
    for class_id in [0, 1, 2]:
        # Retrieve the data for the current temperature and class from the dictionary
        if (class_id, temp) in class_temperature_datasets:
            print(f"Class {class_id} at {temp}° dataset size: {len(class_temperature_datasets[(class_id, temp)])}")
        else:
            # If no data for this class-temperature combination, print 0
            print(f"Class {class_id} at {temp}° dataset size: 0")

# OPTIONAL: Process the minimum class sizes and print the class with the minimum size
min_class_size = float('inf')  # Start with infinity as a comparison baseline
min_class_id = None  # Variable to hold the class ID with the minimum size
min_temp = None  # Variable to hold the temperature with the minimum size

# Iterate over class-temperature datasets to find the minimum class size and its corresponding class-temperature
for (class_id, temp), df in class_temperature_datasets.items():
    class_size = len(df)
    if class_size < min_class_size:
        min_class_size = class_size
        min_class_id = class_id
        min_temp = temp

# Print the minimum class size and the corresponding class ID and temperature
print(f"\nMinimum class size: {min_class_size} (Class: {min_class_id}, Temperature: {min_temp}°)")

Processing temperature: 181.0°
Class 0 at 181.0° dataset size: 116
Class 1 at 181.0° dataset size: 20
Class 2 at 181.0° dataset size: 0
Processing temperature: 183.0°
Class 0 at 183.0° dataset size: 119
Class 1 at 183.0° dataset size: 16
Class 2 at 183.0° dataset size: 0
Processing temperature: 187.0°
Class 0 at 187.0° dataset size: 135
Class 1 at 187.0° dataset size: 0
Class 2 at 187.0° dataset size: 0
Processing temperature: 189.0°
Class 0 at 189.0° dataset size: 64
Class 1 at 189.0° dataset size: 26
Class 2 at 189.0° dataset size: 8
Processing temperature: 194.0°
Class 0 at 194.0° dataset size: 109
Class 1 at 194.0° dataset size: 26
Class 2 at 194.0° dataset size: 0
Processing temperature: 196.0°
Class 0 at 196.0° dataset size: 138
Class 1 at 196.0° dataset size: 125
Class 2 at 196.0° dataset size: 7
Processing temperature: 206.0°
Class 0 at 206.0° dataset size: 26
Class 1 at 206.0° dataset size: 102
Class 2 at 206.0° dataset size: 8
Processing temperature: 209.0°
Class 0 at 209.0° 

## Create a balanced dataset

In [34]:
# Define the minimum number of images required in each class for a temperature to be included
min_class_size = 3  # Adjust this to your desired minimum class size

# Initialize a list to store valid datasets for each temperature
valid_class_temperature_datasets = []

# Process each temperature in the temperature sublist
for temp in temperature_sublist:
    print(f"\nProcessing temperature: {temp}°")
    
    # Filter the dataset for the current temperature
    temp_filtered = data_filtered[data_filtered['target_hotend'] == temp]
    
    # Dictionary to store class-specific data for the current temperature
    temp_class_data = {}
    meets_criteria = True  # Assume the temperature meets criteria until proven otherwise

    # Iterate through each class (0, 1, 2)
    for class_id in [0, 1, 2]:
        # Filter by both class and temperature
        class_temp_data = temp_filtered[temp_filtered['hotend_class'] == class_id]
        
        # Check and print actual dataset size for verification
        actual_class_size = len(class_temp_data)
        print(f"Class {class_id} at {temp}° actual dataset size: {actual_class_size}")

        # Only add if the dataset size for this class meets the minimum requirement
        if actual_class_size >= min_class_size:
            # Sample exactly min_class_size images
            temp_class_data[class_id] = class_temp_data.sample(n=min_class_size, random_state=42)
        else:
            print(f"Class {class_id} at {temp}° does not have enough images ({actual_class_size}). Skipping this temperature.")
            meets_criteria = False
            break  # Stop processing this temperature if any class fails to meet min_class_size

    # If all classes at this temperature meet the criteria, add to valid datasets
    if meets_criteria:
        combined_data_for_temp = pd.concat(temp_class_data.values(), ignore_index=True)
        valid_class_temperature_datasets.append(combined_data_for_temp)
        print(f"Temperature {temp}° included with {min_class_size} images per class.")

# Combine all valid datasets for all temperatures into one DataFrame
balanced_data = pd.concat(valid_class_temperature_datasets, ignore_index=True) if valid_class_temperature_datasets else pd.DataFrame()

# Shuffle the balanced dataset if it’s not empty
if not balanced_data.empty:
    balanced_data = balanced_data.sample(frac=1, random_state=42).reset_index(drop=True)
    print(f"\nTotal number of images in the balanced dataset: {len(balanced_data)}")
else:
    print("No valid data left after filtering temperatures with insufficient class sizes.")

# Print the final class and temperature counts in the balanced dataset
if not balanced_data.empty:
    print("\nClass and Temperature counts in the balanced dataset:")
    for temp in balanced_data['target_hotend'].unique():
        print(f"\nTemperature: {temp}°")
        for class_id in [0, 1, 2]:
            count = len(balanced_data[(balanced_data['hotend_class'] == class_id) & (balanced_data['target_hotend'] == temp)])
            print(f"Class {class_id}: {count} images")
else:
    print("Balanced dataset is empty.")


Processing temperature: 181.0°
Class 0 at 181.0° actual dataset size: 116
Class 1 at 181.0° actual dataset size: 20
Class 2 at 181.0° actual dataset size: 0
Class 2 at 181.0° does not have enough images (0). Skipping this temperature.

Processing temperature: 183.0°
Class 0 at 183.0° actual dataset size: 119
Class 1 at 183.0° actual dataset size: 16
Class 2 at 183.0° actual dataset size: 0
Class 2 at 183.0° does not have enough images (0). Skipping this temperature.

Processing temperature: 187.0°
Class 0 at 187.0° actual dataset size: 135
Class 1 at 187.0° actual dataset size: 0
Class 1 at 187.0° does not have enough images (0). Skipping this temperature.

Processing temperature: 189.0°
Class 0 at 189.0° actual dataset size: 64
Class 1 at 189.0° actual dataset size: 26
Class 2 at 189.0° actual dataset size: 8
Temperature 189.0° included with 3 images per class.

Processing temperature: 194.0°
Class 0 at 194.0° actual dataset size: 109
Class 1 at 194.0° actual dataset size: 26
Class 2

~

## Create training, validation, and testing datasets

In [35]:
# Print the total number of images
total_images = len(balanced_data)
print(f'Total images: {total_images}')

# Get the minimum number of samples available in any class for any temperature
min_class_counts_per_temp = (
    balanced_data.groupby(['hotend_class', 'target_hotend']).size().groupby(level=1).min()
)
min_class_counts = min_class_counts_per_temp.min()

# Calculate samples for train, validation, and test ensuring non-zero samples for each
train_samples_per_class = max(int(0.8 * min_class_counts), 1)
val_samples_per_class = max(int(0.1 * min_class_counts), 1)
test_samples_per_class = max(min_class_counts - train_samples_per_class - val_samples_per_class, 1)

# Ensure the sum does not exceed `min_class_counts`
if train_samples_per_class + val_samples_per_class + test_samples_per_class > min_class_counts:
    train_samples_per_class = min_class_counts - (val_samples_per_class + test_samples_per_class)

# Debug: Print the minimum samples per temperature and intended dataset sizes per class
print(f'Minimum samples per class per temperature: {min_class_counts}')
print(f'Samples per class - Train: {train_samples_per_class}, Validation: {val_samples_per_class}, Test: {test_samples_per_class}')

# Initialize empty DataFrames for each dataset
train_data = pd.DataFrame()
val_data = pd.DataFrame()
test_data = pd.DataFrame()

# Iterate through each temperature and sample equally from each class
for temp in balanced_data['target_hotend'].unique():
    for class_id in [0, 1, 2]:
        # Filter the data by temperature and class
        class_temp_data = balanced_data[(
            balanced_data['target_hotend'] == temp) & 
            (balanced_data['hotend_class'] == class_id)
        ]

        # Ensure the data is shuffled
        class_temp_data = class_temp_data.sample(frac=1, random_state=42).reset_index(drop=True)

        # Check if we have enough samples; take `min_class_counts` if available
        if len(class_temp_data) >= min_class_counts:
            # Split the class_temp_data into train, val, and test subsets
            train_subset = class_temp_data.iloc[:train_samples_per_class]
            val_subset = class_temp_data.iloc[train_samples_per_class:train_samples_per_class + val_samples_per_class]
            test_subset = class_temp_data.iloc[train_samples_per_class + val_samples_per_class:train_samples_per_class + val_samples_per_class + test_samples_per_class]

            # Append to respective datasets
            train_data = pd.concat([train_data, train_subset], ignore_index=True)
            val_data = pd.concat([val_data, val_subset], ignore_index=True)
            test_data = pd.concat([test_data, test_subset], ignore_index=True)
        else:
            print(f"Skipping temperature {temp} for class {class_id} due to insufficient data (only {len(class_temp_data)} images).")

# Debug: Print the sizes of the datasets after the split
print(f'Training set size: {len(train_data)}')
print(f'Validation set size: {len(val_data)}')
print(f'Testing set size: {len(test_data)}')

# Function to print class counts for a given dataset, grouped by temperature
def print_class_temperature_counts(dataset, dataset_name):
    print(f"\nClass and Temperature counts in {dataset_name}:")
    # Group by temperature and class
    grouped = dataset.groupby(['target_hotend', 'hotend_class']).size().unstack(fill_value=0)
    
    # Iterate through temperatures and print the counts for each class
    for temp in grouped.index:
        print(f"Temperature {temp}: ", end="")
        for class_id in grouped.columns:
            class_count = grouped.loc[temp, class_id]
            print(f"Class {class_id}: {class_count}", end=", ")
        print()  # New line for the next temperature

# Print class counts for each dataset, grouped by temperature
print_class_temperature_counts(train_data, "training data")
print_class_temperature_counts(val_data, "validation data")
print_class_temperature_counts(test_data, "testing data")

# Print the first five rows of each dataset
print("\nFirst five rows of training data:")
print(train_data.head())

print("\nFirst five rows of validation data:")
print(val_data.head())

print("\nFirst five rows of testing data:")
print(test_data.head())

Total images: 27
Minimum samples per class per temperature: 3
Samples per class - Train: 1, Validation: 1, Test: 1
Training set size: 9
Validation set size: 9
Testing set size: 9

Class and Temperature counts in training data:
Temperature 189.0: Class 0: 1, Class 1: 1, Class 2: 1, 
Temperature 196.0: Class 0: 1, Class 1: 1, Class 2: 1, 
Temperature 206.0: Class 0: 1, Class 1: 1, Class 2: 1, 

Class and Temperature counts in validation data:
Temperature 189.0: Class 0: 1, Class 1: 1, Class 2: 1, 
Temperature 196.0: Class 0: 1, Class 1: 1, Class 2: 1, 
Temperature 206.0: Class 0: 1, Class 1: 1, Class 2: 1, 

Class and Temperature counts in testing data:
Temperature 189.0: Class 0: 1, Class 1: 1, Class 2: 1, 
Temperature 196.0: Class 0: 1, Class 1: 1, Class 2: 1, 
Temperature 206.0: Class 0: 1, Class 1: 1, Class 2: 1, 

First five rows of training data:
         img_path               timestamp  flow_rate  feed_rate  z_offset  \
0  image-3074.jpg  2020-10-08T13:36:39-95         91        

## Initialise model, loss function, and optimiser

In [36]:
num_classes = 3  # Number of hot end rate classes
model = SimpleCNN(num_classes=num_classes)  # Assuming SimpleCNN is defined in cnn_models
criterion = torch.nn.CrossEntropyLoss()  # Cross Entropy Loss for classification
optimizer = optim.Adam(model.parameters(), lr=0.001)  # Adam optimizer

## Training, Validation and Testing batches

In [37]:
class CustomDataset(Dataset):
    def __init__(self, csv_file=None, root_dir=None, transform=None, data_frame=None):
        if data_frame is not None:
            self.data = data_frame
        elif csv_file is not None:
            self.data = pd.read_csv(csv_file, header=0, dtype=str)
        else:
            raise ValueError("Either csv_file or data_frame must be provided.")

        self.root_dir = root_dir
        self.transform = transform or self.default_transform()
        self.valid_indices = self.get_valid_indices()

    def default_transform(self):
        return transforms.Compose([
            transforms.Resize((224, 224)),
            transforms.ToTensor(),
        ])

    def get_valid_indices(self):
        valid_indices = []
        for idx in tqdm(range(len(self.data)), desc="Validating images"):
            img_name = self.data.iloc[idx, 0].strip()
            img_name = img_name.split('/')[-1]
    
            if img_name.startswith("image-"):
                try:
                    image_number = int(img_name.split('-')[1].split('.')[0])
                    if image_number <= 3084:
                        full_img_path = os.path.join(self.root_dir, img_name)
                        if os.path.exists(full_img_path):
                            valid_indices.append(idx)
                        else:
                            print(f"Image does not exist: {full_img_path}")
                except ValueError:
                    print(f"Invalid filename format for {img_name}. Skipping...")
        
        print(f"Total valid indices found: {len(valid_indices)}")  # Debugging output
        return valid_indices


    def __len__(self):
        return len(self.valid_indices)

    def __getitem__(self, idx):
        if isinstance(idx, list):
            # Debugging: Print length of valid indices
            # print(f"Valid indices count: {len(self.valid_indices)}")
    
            items = [self._load_sample(i) for i in idx]
            items = [item for item in items if item is not None]  # Filter out None entries
    
            if not items:
                raise RuntimeError("No valid items found in the batch.")
    
            # Unzip items and stack images
            images, labels = zip(*items)
            return torch.stack(images), torch.tensor(labels)
    
        else:
            return self._load_sample(idx)


    def _load_sample(self, idx):
        # Get the actual index from valid indices
        actual_idx = self.valid_indices[idx]
        img_name = self.data.iloc[actual_idx, 0].strip()
        full_img_path = os.path.join(self.root_dir, img_name)
    
        try:
            image = Image.open(full_img_path).convert('RGB')  # Ensure image is RGB
            label_str = self.data.iloc[actual_idx, 15]  # Assuming label is in the second column
            
            # Attempt to convert label to integer; handle exceptions
            try:
                label = int(label_str)  # Try converting to int
            except ValueError:
                print(f"Warning: Non-integer label found for image {img_name}: {label_str}")
                print()
                return None  # Skip this sample if label conversion fails
    
            image = self.transform(image)  # Apply transformation
    
            return image, label
        except Exception as e:
            print(f"Error loading image {full_img_path}: {e}")
            return None  # Handle error gracefully

In [38]:
# Assuming CustomDataset is already defined earlier (you should have this from the previous code)

class TemperatureBatchSampler(Sampler):
    def __init__(self, dataset, batch_size):
        self.dataset = dataset
        self.batch_size = batch_size
        self.data_by_temperature = self.group_data_by_temperature()

    def group_data_by_temperature(self):
        """Group indices by temperature."""
        data_by_temp = {}
        for idx in range(len(self.dataset)):
            temp = self.dataset.data.iloc[idx]['target_hotend']  # Get temperature for each image
            if temp not in data_by_temp:
                data_by_temp[temp] = []
            data_by_temp[temp].append(idx)
        return data_by_temp

    def __iter__(self):
        """Generate batches of indices, where each batch corresponds to one temperature."""
        for temp, indices in self.data_by_temperature.items():
            # Shuffle indices within each temperature group (optional)
            random.shuffle(indices)
            # Yield batches of the specified batch size for each temperature group
            for i in range(0, len(indices), self.batch_size):
                yield indices[i:i + self.batch_size]

    def __len__(self):
        """Return the number of batches."""
        return sum([len(indices) // self.batch_size for indices in self.data_by_temperature.values()])


# Initialize datasets for training, validation, and testing
train_dataset = CustomDataset(data_frame=train_data, root_dir=root_dir, transform=transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
]))
val_dataset = CustomDataset(data_frame=val_data, root_dir=root_dir, transform=transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
]))
test_dataset = CustomDataset(data_frame=test_data, root_dir=root_dir, transform=transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
]))

# Create custom batch samplers for training, validation, and testing data
train_sampler = TemperatureBatchSampler(train_dataset, batch_size=3)
val_sampler = TemperatureBatchSampler(val_dataset, batch_size=3)
test_sampler = TemperatureBatchSampler(test_dataset, batch_size=3)

# Create DataLoaders using the custom samplers (no batch_size here, as samplers handle it)
train_loader = DataLoader(train_dataset, sampler=train_sampler)
val_loader = DataLoader(val_dataset, sampler=val_sampler)
test_loader = DataLoader(test_dataset, sampler=test_sampler)

def print_class_distribution(data_loader, sampler, dataset):
    """Print the temperature and class distribution for all batches."""
    temperature_keys = list(sampler.data_by_temperature.keys())
    
    for batch_idx, (images, labels) in enumerate(data_loader):
        # Get the temperature for the current batch using the indices from the sampler
        batch_indices = sampler.data_by_temperature[temperature_keys[batch_idx % len(temperature_keys)]]
        
        # Ensure we only take the temperature from the first index in this batch (they all share the same temperature)
        temp = dataset.data.iloc[batch_indices[0]]['target_hotend']
        
        # Ensure labels are a 1D array and count occurrences of each class in the batch
        labels = labels.view(-1).cpu().numpy()  # Flatten the labels if needed
        
        # Count the number of occurrences of each class in the batch
        class_counts = Counter(labels)
        
        # Print the temperature and class distribution
        print(f"Batch {batch_idx + 1} contains images from temperature {temp}°")
        print(f"Class distribution in this batch: {dict(class_counts)}")

# Printing the class distribution for all batches in the train, validation, and test data
print("Training Data:")
print_class_distribution(train_loader, train_sampler, train_dataset)

print("\nValidation Data:")
print_class_distribution(val_loader, val_sampler, val_dataset)

print("\nTest Data:")
print_class_distribution(test_loader, test_sampler, test_dataset)


Validating images: 100%|██████████| 9/9 [00:00<00:00, 7694.40it/s]


Total valid indices found: 9


Validating images: 100%|██████████| 9/9 [00:00<00:00, 4513.24it/s]


Total valid indices found: 9


Validating images: 100%|██████████| 9/9 [00:00<00:00, 8463.84it/s]


Total valid indices found: 9
Training Data:
Batch 1 contains images from temperature 189.0°
Class distribution in this batch: {2: 1, 1: 1, 0: 1}
Batch 2 contains images from temperature 196.0°
Class distribution in this batch: {2: 1, 0: 1, 1: 1}
Batch 3 contains images from temperature 206.0°
Class distribution in this batch: {2: 1, 1: 1, 0: 1}

Validation Data:
Batch 1 contains images from temperature 189.0°
Class distribution in this batch: {2: 1, 0: 1, 1: 1}
Batch 2 contains images from temperature 196.0°
Class distribution in this batch: {2: 1, 0: 1, 1: 1}
Batch 3 contains images from temperature 206.0°
Class distribution in this batch: {0: 1, 1: 1, 2: 1}

Test Data:
Batch 1 contains images from temperature 189.0°
Class distribution in this batch: {1: 1, 2: 1, 0: 1}
Batch 2 contains images from temperature 196.0°
Class distribution in this batch: {0: 1, 1: 1, 2: 1}
Batch 3 contains images from temperature 206.0°
Class distribution in this batch: {2: 1, 1: 1, 0: 1}


In [40]:
# Set batch size for training, validation, and testing
batch_size = 3

# Define DataLoader for training, validation, and testing datasets
train_dataset = CustomDataset(data_frame=train_data, root_dir=root_dir, transform=None)
val_dataset = CustomDataset(data_frame=val_data, root_dir=root_dir, transform=None)
test_dataset = CustomDataset(data_frame=test_data, root_dir=root_dir, transform=None)

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

# Check if CUDA (GPU) is available, otherwise use CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Create and move the model to the device
model = SimpleCNN(num_classes=3).to(device)

# Initialize the criterion and optimizer
criterion = nn.CrossEntropyLoss()  # Cross-Entropy loss for classification
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Function to calculate accuracy
def calculate_accuracy(outputs, labels):
    _, predicted = torch.max(outputs, 1)  # Get the index of the max log-probability
    correct = (predicted == labels).sum().item()
    accuracy = 100 * correct / len(labels)
    return accuracy

# Debugging: Print the structure of TemperatureBatchSampler
print("Debug: TemperatureBatchSampler Data by Temperature:")
for temp_key, indices in train_sampler.data_by_temperature.items():
    print(f"Temperature {temp_key}° has {len(indices)} samples.")

# Training loop
def train_model(model, train_loader, val_loader, criterion, optimizer, num_epochs=1000):
    best_val_accuracy = 0  # Track the best validation accuracy

    for epoch in range(num_epochs):
        model.train()  # Set the model to training mode
        running_loss = 0.0
        running_accuracy = 0.0

        print(f"\nEpoch {epoch + 1}/{num_epochs}")

        # Get temperature keys in order to cycle through them
        temperature_keys = list(train_sampler.data_by_temperature.keys())

        # Iterate over training batches
        for batch_idx, (inputs, labels) in enumerate(tqdm(train_loader, desc="Training", leave=False)):
            inputs, labels = inputs.to(device), labels.to(device)

            # Retrieve the temperature key for the current batch
            temp_key_idx = batch_idx % len(temperature_keys)
            temp_key = temperature_keys[temp_key_idx]

            # Verify that the temperature key exists in the dictionary
            if temp_key in train_sampler.data_by_temperature:
                temp = train_dataset.data.iloc[train_sampler.data_by_temperature[temp_key][0]]['target_hotend']
            else:
                temp = "Unknown"

            # Forward pass
            optimizer.zero_grad()  # Zero the gradients
            outputs = model(inputs)  # Forward pass
            loss = criterion(outputs, labels)  # Calculate the loss

            # Backward pass
            loss.backward()  # Backward pass
            optimizer.step()  # Update the model weights

            # Update running loss and accuracy
            running_loss += loss.item()
            running_accuracy += calculate_accuracy(outputs, labels)

            # Extract predictions and true labels for printing
            _, predicted = torch.max(outputs, 1)  # Get predicted class indices

            # Print batch details
            print(f"\nBatch {batch_idx + 1} from Temperature: {temp}°")
            print(f"  Predictions: {predicted.cpu().numpy()}")
            print(f"  Labels:      {labels.cpu().numpy()}")
            # print(f"  Sample Outputs: \n{outputs[:3].cpu().detach().numpy()}")  # Show first 3 outputs for readability

        avg_train_loss = running_loss / len(train_loader)
        avg_train_accuracy = running_accuracy / len(train_loader)

        print(f"Epoch {epoch+1}/{num_epochs} - Train Loss: {avg_train_loss:.4f}, Train Accuracy: {avg_train_accuracy:.2f}%")

        # Evaluate on validation set after each epoch
        val_loss, val_accuracy = evaluate(model, val_loader, criterion)
        print(f"Epoch {epoch+1}/{num_epochs} - Validation Loss: {val_loss:.4f}, Validation Accuracy: {val_accuracy:.2f}%")

        # Save the best model based on validation accuracy
        if val_accuracy > best_val_accuracy:
            best_val_accuracy = val_accuracy
            torch.save(model.state_dict(), "best_model.pth")
            print("Best model saved!")

    return model

# Validation loop
def evaluate(model, dataloader, criterion):
    model.eval()  # Set the model to evaluation mode
    running_loss = 0.0
    running_accuracy = 0.0
    with torch.no_grad():  # No gradients needed for validation
        for inputs, labels in dataloader:
            inputs, labels = inputs.to(device), labels.to(device)
            outputs = model(inputs)  # Forward pass
            loss = criterion(outputs, labels)  # Calculate the loss
            running_loss += loss.item()
            running_accuracy += calculate_accuracy(outputs, labels)

    avg_loss = running_loss / len(dataloader)
    avg_accuracy = running_accuracy / len(dataloader)
    return avg_loss, avg_accuracy

# Test loop
def test_model(model, test_loader):
    model.eval()  # Set the model to evaluation mode
    running_accuracy = 0.0
    with torch.no_grad():  # No gradients needed for testing
        for inputs, labels in test_loader:
            inputs, labels = inputs.to(device), labels.to(device)
            outputs = model(inputs)  # Forward pass
            running_accuracy += calculate_accuracy(outputs, labels)

    avg_accuracy = running_accuracy / len(test_loader)
    print(f"Test Accuracy: {avg_accuracy:.2f}%")

# Run the training loop
trained_model = train_model(model, train_loader, val_loader, criterion, optimizer, num_epochs=1000)
test_model(trained_model, test_loader)

Validating images: 100%|██████████| 9/9 [00:00<00:00, 8861.21it/s]


Total valid indices found: 9


Validating images: 100%|██████████| 9/9 [00:00<00:00, 9032.96it/s]


Total valid indices found: 9


Validating images: 100%|██████████| 9/9 [00:00<00:00, 2972.11it/s]


Total valid indices found: 9
Debug: TemperatureBatchSampler Data by Temperature:
Temperature 189.0° has 3 samples.
Temperature 196.0° has 3 samples.
Temperature 206.0° has 3 samples.

Epoch 1/1000


Training:  33%|███▎      | 1/3 [00:00<00:00,  4.27it/s]


Batch 1 from Temperature: 189.0°
  Predictions: [0 0 0]
  Labels:      [2 1 1]


Training:  67%|██████▋   | 2/3 [00:00<00:00,  4.12it/s]


Batch 2 from Temperature: 196.0°
  Predictions: [1 0 1]
  Labels:      [1 0 0]


                                                       


Batch 3 from Temperature: 206.0°
  Predictions: [1 1 1]
  Labels:      [2 2 0]
Epoch 1/1000 - Train Loss: 1.1220, Train Accuracy: 22.22%




Epoch 1/1000 - Validation Loss: 1.1013, Validation Accuracy: 33.33%
Best model saved!

Epoch 2/1000


Training:  33%|███▎      | 1/3 [00:00<00:00,  4.64it/s]


Batch 1 from Temperature: 189.0°
  Predictions: [0 0 1]
  Labels:      [1 2 1]

Batch 2 from Temperature: 196.0°
  Predictions: [1 1 1]
  Labels:      [2 0 2]


                                                       


Batch 3 from Temperature: 206.0°
  Predictions: [1 1 1]
  Labels:      [0 1 0]
Epoch 2/1000 - Train Loss: 1.1092, Train Accuracy: 22.22%




Epoch 2/1000 - Validation Loss: 1.1008, Validation Accuracy: 33.33%

Epoch 3/1000


Training:  33%|███▎      | 1/3 [00:00<00:00,  4.67it/s]


Batch 1 from Temperature: 189.0°
  Predictions: [0 1 1]
  Labels:      [1 0 2]

Batch 2 from Temperature: 196.0°
  Predictions: [1 1 1]
  Labels:      [1 2 0]


                                                       


Batch 3 from Temperature: 206.0°
  Predictions: [1 1 1]
  Labels:      [0 1 2]
Epoch 3/1000 - Train Loss: 1.1070, Train Accuracy: 22.22%




Epoch 3/1000 - Validation Loss: 1.1006, Validation Accuracy: 33.33%

Epoch 4/1000


Training:  33%|███▎      | 1/3 [00:00<00:00,  4.96it/s]


Batch 1 from Temperature: 189.0°
  Predictions: [1 1 1]
  Labels:      [1 1 2]

Batch 2 from Temperature: 196.0°

Training:  67%|██████▋   | 2/3 [00:00<00:00,  5.18it/s]


  Predictions: [0 1 1]
  Labels:      [0 2 2]

Batch 3 from Temperature: 206.0°
  Predictions: [1 1 1]
  Labels:      [0 0 1]


                                                       

Epoch 4/1000 - Train Loss: 1.1001, Train Accuracy: 44.44%




Epoch 4/1000 - Validation Loss: 1.1005, Validation Accuracy: 33.33%

Epoch 5/1000


Training:  33%|███▎      | 1/3 [00:00<00:00,  3.25it/s]


Batch 1 from Temperature: 189.0°
  Predictions: [1 1 1]
  Labels:      [0 2 0]


                                                       


Batch 2 from Temperature: 196.0°
  Predictions: [1 1 1]
  Labels:      [1 1 0]




KeyboardInterrupt: 