## Import necessary libraries 

In [270]:
import os
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Sampler
from torchvision import transforms
from tqdm import tqdm  # Import tqdm for progress visualization
from models.cnn_models import SimpleCNN

## Define filepaths as constant

In [271]:
# Define file paths as constants
CSV_FILE_PATH = r'C:\Users\Sandhra George\avalanche\data\dataset.csv'
ROOT_DIR_PATH = r'C:\Users\Sandhra George\avalanche'

csv_file = r'C:\Users\Sandhra George\avalanche\data\dataset.csv'  # Path to the CSV file
root_dir = r'C:\Users\Sandhra George\avalanche\caxton_dataset\print0'  # Path to the image directory

## Load data into DataFrame

In [272]:
import pandas as pd

# Load data into a DataFrame for easier processing
data = pd.read_csv(CSV_FILE_PATH)

# Limit dataset to the first 3084 images (excluding header)
data_limited = data.iloc[0:3085].reset_index(drop=True)

# Filter the dataset to only include images containing "print0"
data_filtered = data_limited[data_limited.iloc[:, 0].str.contains('print0', na=False)]

# Update the first column to contain only the image filenames
data_filtered.iloc[:, 0] = data_filtered.iloc[:, 0].str.replace(r'.*?/(image-\d+\.jpg)', r'\1', regex=True)

# Display the updated DataFrame
print(data_filtered.head())

       img_path               timestamp  flow_rate  feed_rate  z_offset  \
0   image-6.jpg  2020-10-08T13:12:50-34        100        100       0.0   
1   image-7.jpg  2020-10-08T13:12:50-80        100        100       0.0   
2   image-8.jpg  2020-10-08T13:12:51-27        100        100       0.0   
3   image-9.jpg  2020-10-08T13:12:51-74        100        100       0.0   
4  image-10.jpg  2020-10-08T13:12:52-20        100        100       0.0   

   target_hotend  hotend    bed  nozzle_tip_x  nozzle_tip_y  img_num  \
0          205.0  204.13  65.74           531           554        5   
1          205.0  204.13  65.74           531           554        6   
2          205.0  204.24  65.84           531           554        7   
3          205.0  204.24  65.84           531           554        8   
4          205.0  204.24  65.84           531           554        9   

   print_id  flow_rate_class  feed_rate_class  z_offset_class  hotend_class  \
0         0                1         

## Split the dataset into separate DataFrames for each class

In [273]:
class_datasets = {}
for class_id in data_filtered['hotend_class'].unique():
    # Print the unique class identifier
    print(f"Processing class: {class_id}")
    
    # Create a shuffled subset for the current class
    class_datasets[class_id] = data_filtered[data_filtered['hotend_class'] == class_id].sample(frac=1, random_state=42)
    
# Print counts of each class dataset
for class_id, df in class_datasets.items():
    print(f'Class {class_id} dataset size: {len(df)}')

# Initialize variables to track the minimum class size and the corresponding class
min_class_size = float('inf')  # Start with infinity as a comparison baseline
min_class_id = None  # Variable to hold the class ID with the minimum size

# Iterate over class datasets to find the minimum class size and its corresponding class ID
for class_id, df in class_datasets.items():
    class_size = len(df)
    if class_size < min_class_size:
        min_class_size = class_size
        min_class_id = class_id

# Print the minimum class size and the corresponding class ID
print(f'Minimum class size: {min_class_size} (Class: {min_class_id})')

Processing class: 1
Processing class: 0
Processing class: 2
Class 1 dataset size: 1279
Class 0 dataset size: 710
Class 2 dataset size: 721
Minimum class size: 710 (Class: 0)


## Create a balanced dataset

In [274]:
# Create balanced datasets by taking the minimum number of images from each class
balanced_data = []
for class_id, class_data in class_datasets.items():
    # Sample from each class
    balanced_data.append(class_data.sample(n=min_class_size, random_state=42))

# Print the number of images from each class in the balanced dataset
for i, class_data in enumerate(balanced_data):
    print(f'Class {list(class_datasets.keys())[i]} dataset size: {len(class_data)}')

# Combine the balanced data into a single DataFrame
balanced_data = pd.concat(balanced_data).reset_index(drop=True)

# Shuffle the balanced dataset
balanced_data = balanced_data.sample(frac=1, random_state=42).reset_index(drop=True)

Class 1 dataset size: 710
Class 0 dataset size: 710
Class 2 dataset size: 710


## Create training, validation, and testing datasets

In [275]:
# Total number of images in the balanced dataset
total_images = len(balanced_data)

# Print the total number of images
print(f'Total images: {total_images}')

# Get the minimum number of samples available in any class
min_class_counts = balanced_data['hotend_class'].value_counts().min()

# Define how many samples you want from each class in each dataset
train_samples_per_class = int(0.8 * min_class_counts)
val_samples_per_class = int(0.1 * min_class_counts)
test_samples_per_class = min_class_counts - train_samples_per_class - val_samples_per_class

# Debug: Print the sizes for each class in each dataset
print(f'Minimum samples per class: {min_class_counts}')
print(f'Samples per class - Train: {train_samples_per_class}, Validation: {val_samples_per_class}, Test: {test_samples_per_class}')

# Initialize empty DataFrames for each dataset
train_data = pd.DataFrame()
val_data = pd.DataFrame()
test_data = pd.DataFrame()

# Sample data for each class
for class_id in balanced_data['hotend_class'].unique():
    class_data = balanced_data[balanced_data['hotend_class'] == class_id]
    
    # Shuffle the class data
    class_data = class_data.sample(frac=1).reset_index(drop=True)  # Shuffle the class data

    # Split the data into train, val, and test
    train_subset = class_data.iloc[:train_samples_per_class]
    val_subset = class_data.iloc[train_samples_per_class:train_samples_per_class + val_samples_per_class]
    test_subset = class_data.iloc[train_samples_per_class + val_samples_per_class:train_samples_per_class + val_samples_per_class + test_samples_per_class]

    # Append to respective datasets
    train_data = pd.concat([train_data, train_subset], ignore_index=True)
    val_data = pd.concat([val_data, val_subset], ignore_index=True)
    test_data = pd.concat([test_data, test_subset], ignore_index=True)

# Debug: Print the sizes of the datasets after the split
print(f'Training set size: {len(train_data)}')
print(f'Validation set size: {len(val_data)}')
print(f'Testing set size: {len(test_data)}')

# Function to print class counts for a given dataset
def print_class_counts(dataset, dataset_name):
    class_counts = dataset['hotend_class'].value_counts()
    print(f"\nClass indices count in {dataset_name}:")
    for class_id, count in class_counts.items():
        print(f'Class {class_id}: {count} images')

# Print class counts for each dataset
print_class_counts(train_data, "training data")
print_class_counts(val_data, "validation data")
print_class_counts(test_data, "testing data")

# Print the first five rows of each dataset
print("\nFirst five rows of training data:")
print(train_data.head())

print("\nFirst five rows of validation data:")
print(val_data.head())

print("\nFirst five rows of testing data:")
print(test_data.head())

Total images: 2130
Minimum samples per class: 710
Samples per class - Train: 568, Validation: 71, Test: 71
Training set size: 1704
Validation set size: 213
Testing set size: 213

Class indices count in training data:
Class 1: 568 images
Class 2: 568 images
Class 0: 568 images

Class indices count in validation data:
Class 1: 71 images
Class 2: 71 images
Class 0: 71 images

Class indices count in testing data:
Class 1: 71 images
Class 2: 71 images
Class 0: 71 images

First five rows of training data:
         img_path               timestamp  flow_rate  feed_rate  z_offset  \
0   image-685.jpg  2020-10-08T13:18:06-78        109        190     -0.02   
1  image-1701.jpg  2020-10-08T13:26:00-57        110        113      0.08   
2  image-2230.jpg  2020-10-08T13:30:06-73        148         61      0.00   
3   image-717.jpg  2020-10-08T13:18:21-84        109        190     -0.02   
4  image-1629.jpg  2020-10-08T13:25:27-12        110        113      0.08   

   target_hotend  hotend    bed 

## Initialise model, loss function, and optimiser

In [276]:
num_classes = 3  # Number of hot end rate classes
model = SimpleCNN(num_classes=num_classes)  # Assuming SimpleCNN is defined in cnn_models
criterion = nn.CrossEntropyLoss()  # Cross Entropy Loss for classification
optimizer = optim.Adam(model.parameters(), lr=0.001)  # Adam optimizer

## Creating a Balanced batch sampler class

In [386]:
import numpy as np
import pandas as pd
from torch.utils.data import Sampler

class BalancedBatchSampler(Sampler):
    def __init__(self, data_source, batch_size=15):
        self.data_source = data_source
        self.batch_size = batch_size

        # Ensure the batch size is evenly divisible by the number of classes
        self.num_classes = len(data_source['hotend_class'].unique())
        if self.batch_size % self.num_classes != 0:
            raise ValueError("Batch size must be divisible by the number of classes.")
        
        self.samples_per_class = self.batch_size // self.num_classes
        
        # Group indices by class
        self.class_indices = {
            class_id: np.array(data_source.index[data_source['hotend_class'] == class_id])
            for class_id in data_source['hotend_class'].unique()
        }
        
        # Shuffle class indices
        for class_id in self.class_indices:
            np.random.shuffle(self.class_indices[class_id])

        # Debug: Print class indices and their counts
        print(f'Class indices: {self.class_indices}')

        # Initialize counters for balanced and imbalanced batches
        self.balanced_batches_count = 0
        self.imbalanced_batches_count = 0

    def __len__(self):
        # Calculate the total number of batches
        min_class_samples = min(len(indices) for indices in self.class_indices.values())
        return min_class_samples // self.samples_per_class

    def __iter__(self):
        while True:
            batch = []
            class_count = {class_id: 0 for class_id in self.class_indices.keys()}  # Initialize class count
            
            all_classes_filled = True  # Flag to check if all classes have enough samples
            
            for class_id, indices in self.class_indices.items():
                if len(indices) < self.samples_per_class:
                    all_classes_filled = False  # Not enough samples for this class
                    break  # Exit if any class doesn't have enough samples
                
                # Take samples for the batch from each class
                batch.extend(indices[:self.samples_per_class])
                class_count[class_id] += self.samples_per_class  # Count the samples added to the batch
                # Remove these samples from the class indices
                self.class_indices[class_id] = indices[self.samples_per_class:]

            if not all_classes_filled:  # Exit if not all classes have enough samples
                break
            
            # Shuffle within the batch
            np.random.shuffle(batch)

            # Print the number of images from each class in the current batch
            print(f'Batch class counts: {class_count}')

            # Check if the current batch is balanced or imbalanced
            if all(count == self.samples_per_class for count in class_count.values()):
                self.balanced_batches_count += 1
            else:
                self.imbalanced_batches_count += 1

            yield batch  # Yield the current batch

    def print_batch_counts(self):
        # Print the counts of balanced and imbalanced batches
        print(f'Total balanced batches: {self.balanced_batches_count}')
        print(f'Total imbalanced batches: {self.imbalanced_batches_count}')
        
# Usage example
# Assuming balanced_data is already defined as per your previous code
sampler = BalancedBatchSampler(balanced_data, batch_size=15)

# Example of getting all batches
for batch_indices in sampler:
    print(f'Batch indices: {batch_indices}')  # This will show all indices in the batch
    # Here you can use these indices to get the corresponding images from balanced_data
    images = balanced_data.iloc[batch_indices]  # Access the images using the batch indices

# After iterating through all batches, print the totals
sampler.print_batch_counts()  # Call this method to print the totals

Class indices: {1: array([1714,  838, 1771,  261, 2007,  425, 1969, 1625, 1102,  339, 1707,
       2045,  776,  331, 1686, 1473, 1432,  129,  289, 1142, 1294, 2055,
       1670,  707, 1675,  928,  627, 1525, 1794, 1578,   38,  203,  105,
        639,  169, 1145,  171, 1096, 1047, 1236, 1902, 1839, 1508, 1172,
       1898, 1750, 1106, 1173, 1080, 1471, 1888,  448,  661, 1131,  510,
       1195,  534,  459,  423,  736, 1505, 1516, 1603,  373,  223, 1107,
        351,   88,  586, 2052,  756, 1228,  185,  638, 1928, 1061,  865,
       1452, 1844, 2083,  597, 1553,  889, 1208,  335, 2049, 1601,  618,
        198, 1942,  938, 1761,  183, 1485, 1166,  467, 2067, 1620,  454,
        723,  895,  936, 1157,  781,  718,  823,  683,   81,  948,  396,
       2029, 1118, 1879, 1549, 1199,  991,  598,  615, 1372, 1375,  408,
        132,  165,  235, 2112, 1504,   43,  783,  975, 1083, 1250, 2046,
        565,  506,  770,  230,  520, 1353,  857,  785,  456, 1143,  115,
        582,  778,  859, 1778,  

## Training, Validation and Testing batches

In [387]:
from torch.utils.data import Dataset
import pandas as pd
from PIL import Image
from tqdm import tqdm
import os
from torchvision import transforms
import torch

class CustomDataset(Dataset):
    def __init__(self, csv_file=None, root_dir=None, transform=None, data_frame=None):
        if data_frame is not None:
            self.data = data_frame
        elif csv_file is not None:
            self.data = pd.read_csv(csv_file, header=0, dtype=str)
        else:
            raise ValueError("Either csv_file or data_frame must be provided.")

        self.root_dir = root_dir
        self.transform = transform or self.default_transform()
        self.valid_indices = self.get_valid_indices()

    def default_transform(self):
        return transforms.Compose([
            transforms.Resize((224, 224)),
            transforms.ToTensor(),
        ])

    def get_valid_indices(self):
        valid_indices = []
        for idx in tqdm(range(len(self.data)), desc="Validating images"):
            img_name = self.data.iloc[idx, 0].strip()
            img_name = img_name.split('/')[-1]
    
            if img_name.startswith("image-"):
                try:
                    image_number = int(img_name.split('-')[1].split('.')[0])
                    if image_number <= 3085:
                        full_img_path = os.path.join(self.root_dir, img_name)
                        if os.path.exists(full_img_path):
                            valid_indices.append(idx)
                        else:
                            print(f"Image does not exist: {full_img_path}")
                except ValueError:
                    print(f"Invalid filename format for {img_name}. Skipping...")
        
        print(f"Total valid indices found: {len(valid_indices)}")  # Debugging output
        return valid_indices


    def __len__(self):
        return len(self.valid_indices)

    def __getitem__(self, idx):
        if isinstance(idx, list):
            # Debugging: Print length of valid indices
            print(f"Valid indices count: {len(self.valid_indices)}")
    
            items = [self._load_sample(i) for i in idx]
            items = [item for item in items if item is not None]  # Filter out None entries
    
            if not items:
                raise RuntimeError("No valid items found in the batch.")
    
            # Unzip items and stack images
            images, labels = zip(*items)
            return torch.stack(images), torch.tensor(labels)
    
        else:
            return self._load_sample(idx)


    def _load_sample(self, idx):
        # Get the actual index from valid indices
        actual_idx = self.valid_indices[idx]
        img_name = self.data.iloc[actual_idx, 0].strip()
        full_img_path = os.path.join(self.root_dir, img_name)
    
        try:
            image = Image.open(full_img_path).convert('RGB')  # Ensure image is RGB
            label_str = self.data.iloc[actual_idx, 15]  # Assuming label is in the second column
            
            # Attempt to convert label to integer; handle exceptions
            try:
                label = int(label_str)  # Try converting to int
            except ValueError:
                print(f"Warning: Non-integer label found for image {img_name}: {label_str}")
                print()
                return None  # Skip this sample if label conversion fails
    
            image = self.transform(image)  # Apply transformation
    
            return image, label
        except Exception as e:
            print(f"Error loading image {full_img_path}: {e}")
            return None  # Handle error gracefully

In [388]:
# Paths and configuration
root_dir = r'C:\Users\Sandhra George\avalanche\caxton_dataset\print0'  # Path to the image directory
batch_size = 15

# Wrap each dataset split in CustomDataset
train_dataset = CustomDataset(data_frame=train_data, root_dir=root_dir)

# Initialize the BalancedBatchSampler for the train_dataset
train_sampler = BalancedBatchSampler(train_data, batch_size=batch_size)

# Create DataLoaders with the sampler for balanced batches in training
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, sampler=train_sampler)

print_class_counts(train_data, "train_data")

Validating images: 100%|██████████| 1704/1704 [00:00<00:00, 1957.91it/s]


Total valid indices found: 1704
Class indices: {0: array([ 519, 1590,  190,  995, 1696, 1819, 1092,  216, 1074,  567, 1567,
       1024,   92,  180,  999, 1385,   49,  779, 1743,  575,  518, 1370,
       1218,  192, 1128,  302, 1043, 1940,  420, 1878, 1649,  249,   37,
        450, 1108,  587, 1897, 1380, 1444, 2126,  650, 2062, 1672,  822,
       1027,  858,  715,  449, 2027, 1057,  371, 2002, 1500,  782, 1698,
       2128, 1812,  388, 1989,  308,  516,  429,  182,  890,  767, 1681,
       1313, 1713, 1624,  557,  491,  870, 1987,  558, 1359, 1358, 1566,
        210,  167,  579,  503,  610,  291, 1973, 1151, 2116,   48, 1530,
        488, 1355,   78, 1621, 1205, 1659,  162, 1018,   31, 1447, 1792,
        549,   52, 1587, 1910, 1332, 2074, 1747,  843, 1463,  340, 1854,
       1412,  901,   45,  273, 1005, 1760, 1069,  237,  240,  561, 1825,
        243, 1275, 1132,  959, 1665, 1539, 1472, 1644, 1599,  666, 2072,
       1044,  372, 1260,  688, 1986,  482,  366, 1361,  686,  840, 1767,


In [389]:
def create_dataloader(csv_file, root_dir, batch_size, transform):
    # Create an instance of the CustomDataset
    dataset = CustomDataset(csv_file=csv_file, root_dir=root_dir, transform=transform)

    # Create the DataLoader
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

    # Print the total number of valid images and batches
    print(f'Total valid images: {len(dataset)}')
    print(f'Dataloader size (number of batches 1): {len(dataloader)}')

    return dataloader

In [390]:
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

# Iterate through one batch for demonstration
try:
    for images, labels in train_dataloader:
        print(f'Batch images shape: {images.shape}, Batch labels: {labels}')
        break  # Remove this break to iterate through the whole dataloader
except Exception as e:
    print(f"Error during DataLoader iteration: {e}")


Batch images shape: torch.Size([15, 3, 224, 224]), Batch labels: tensor([0, 2, 1, 0, 2, 1, 2, 0, 0, 1, 1, 2, 1, 1, 0])


In [None]:
# Now you can iterate through each DataLoader
print("Training batches:")
for images, labels in train_dataloader:
    print(f'Batch shapes - Images: {images.shape}, Labels: {labels}')

Training batches:
Batch shapes - Images: torch.Size([15, 3, 224, 224]), Labels: tensor([1, 0, 2, 1, 1, 0, 0, 1, 2, 0, 2, 0, 0, 0, 0])
Batch shapes - Images: torch.Size([15, 3, 224, 224]), Labels: tensor([0, 2, 1, 1, 2, 2, 0, 2, 2, 0, 2, 1, 0, 1, 1])
Batch shapes - Images: torch.Size([15, 3, 224, 224]), Labels: tensor([0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 2, 2, 2, 1])
Batch shapes - Images: torch.Size([15, 3, 224, 224]), Labels: tensor([1, 0, 2, 2, 1, 1, 1, 2, 1, 2, 1, 1, 0, 1, 1])
Batch shapes - Images: torch.Size([15, 3, 224, 224]), Labels: tensor([2, 2, 1, 0, 2, 2, 2, 1, 1, 0, 1, 0, 1, 0, 2])
Batch shapes - Images: torch.Size([15, 3, 224, 224]), Labels: tensor([1, 2, 0, 2, 0, 0, 2, 0, 0, 0, 0, 2, 1, 2, 2])
Batch shapes - Images: torch.Size([15, 3, 224, 224]), Labels: tensor([0, 0, 2, 2, 1, 2, 1, 2, 2, 1, 1, 1, 2, 0, 2])
Batch shapes - Images: torch.Size([15, 3, 224, 224]), Labels: tensor([0, 0, 0, 1, 2, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1])
Batch shapes - Images: torch.Size([15, 3, 224, 224]), 