In [1]:
#import libraries here
# Base
import numpy as np
import PIL
import os
from datetime import datetime
import pandas as pd


# Scikit-learn
from sklearn.model_selection import train_test_split

# Display
from IPython.display import Image

# Torch
import torch
from torchvision import transforms
from torchvision.io import read_image
from torch.utils.data import DataLoader, random_split, Dataset

# Other utility functions
from accelerate import Accelerator

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
#!unzip -qq data/archive.zip -d dataset/

In [None]:
# Preview one sample image. The path is assembled with os.path.join so that it
# contains no backslash escapes ('\d', '\i' are invalid escape sequences) and
# resolves on both Windows and POSIX systems.
Image(filename=os.path.join('dataset', 'data', 'data', 'input_1_1_12.jpg'))

In [None]:
# Set up hardware acceleration.
# fp16 mixed precision is requested only when a CUDA GPU is visible; otherwise
# fall back to full precision so the cell still runs on CPU-only machines.
accelerator = Accelerator(mixed_precision="fp16" if torch.cuda.is_available() else "no")
# Check the device
device = accelerator.device

# torch.cuda.get_device_name only accepts CUDA devices, so report other
# device types (e.g. cpu) by their plain device string instead.
if device.type == "cuda":
    print(f"The default device is set to {torch.cuda.get_device_name(device)}")
else:
    print(f"The default device is set to {device}")

In [None]:
# Data exploration: load the label table and look up a single record.
# os.path.join avoids the invalid '\c' escape in the original Windows-style
# literal and keeps the path portable across operating systems.
data = pd.read_csv(os.path.join('dataset', 'chinese_mnist.csv'), delimiter=',')
# Ad-hoc attribute used purely as a human-readable tag for this frame.
data.dataframeName = 'chinese_mnist_df.csv'

# Show the row(s) matching one specific (suite_id, sample_id, code) triple.
data.loc[(data['suite_id'] == 100) & (data['sample_id'] == 10) & (data['code'] == 10)]

In [2]:
# Custom image dataset that pairs every image file with its label from a CSV.
class HRDigitDataSet(Dataset):
    """Map-style dataset backed by a label CSV and a directory of JPEG images.

    The CSV is read once, its rows are shuffled deterministically with
    ``seed``, and the index is reset so positions run from 0 to len - 1.
    Columns are accessed by position: columns 0, 1 and 2 are combined into
    the image file name ``input_<c0>_<c1>_<c2>.jpg`` and column 3 is the label.

    Args:
        csv_file: Path of the CSV file describing the samples.
        img_dir: Directory that contains the image files.
        transform: Optional callable applied to the image tensor.
        target_transform: Optional callable applied to the label.
        seed: ``random_state`` used when shuffling the CSV rows.
    """

    def __init__(self, csv_file, img_dir, transform=None, target_transform=None, seed=69):
        frame = pd.read_csv(csv_file)
        # Sampling every row without replacement is a reproducible shuffle.
        shuffled = frame.sample(n=len(frame), random_state=seed)
        self.labels_df = shuffled.reset_index(drop=True)
        self.img_dir = img_dir
        self.transform = transform
        self.target_transform = target_transform

    def __len__(self):
        """Return the number of rows (samples) in the label table."""
        return len(self.labels_df)

    def __getitem__(self, index):
        """Return ``(image, label)`` for the row at position ``index``.

        The image is returned as a float32 tensor and the label as an int64
        tensor; the optional transforms run before those conversions.
        """
        table = self.labels_df
        # The first three columns identify the image file on disk.
        key_0, key_1, key_2 = (table.iloc[index, col] for col in range(3))
        image_path = os.path.join(self.img_dir, f"input_{key_0}_{key_1}_{key_2}.jpg")

        # Load the pixels, then look up the label stored in the fourth column.
        image = read_image(image_path)
        label = table.iloc[index, 3]

        # Optional user-supplied preprocessing for the image and the label.
        if self.transform:
            image = self.transform(image)
        if self.target_transform:
            label = self.target_transform(label)

        # Normalise the output dtypes expected downstream.
        image = torch.as_tensor(image, dtype=torch.float32)
        label = torch.as_tensor(label, dtype=torch.int64)
        return image, label

In [3]:
# Smoke test: build the dataset and fetch one (image, label) sample.
# Paths are built with os.path.join so they contain no invalid backslash
# escapes ('\c', '\d') and work on any operating system.
dataset = HRDigitDataSet(
    csv_file=os.path.join('dataset', 'chinese_mnist.csv'),
    img_dir=os.path.join('dataset', 'data', 'data'),
)
dataset[2]

(tensor([[[0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          ...,
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.]]]),
 tensor(0))

In [None]:
# Split dataset into training, validation and testing portions
# Default: 60% train / 20% val / 20% test
def loadDataset(dataset, batch_size=32, train_split=0.6, val_split=0.2, test_split=0.2, random_seed=69):
    """Split ``dataset`` into train/val/test subsets and wrap each in a DataLoader.

    Args:
        dataset: Any object accepted by ``torch.utils.data.random_split``
            (it must support ``len()`` and integer indexing).
        batch_size: Batch size used by all three loaders.
        train_split: Fraction of samples assigned to the training subset.
        val_split: Fraction of samples assigned to the validation subset.
        test_split: Fraction of samples assigned to the test subset.
        random_seed: Seed passed to ``torch.manual_seed`` before splitting.
            Note that this seeds torch's global RNG.

    Returns:
        Tuple ``(train_loader, val_loader, test_loader)``. Only the training
        loader shuffles its samples.

    Raises:
        AssertionError: If a fraction lies outside [0, 1] or the three
            fractions do not add up to 1.0 (within floating-point tolerance).
    """
    fractions = (train_split, val_split, test_split)
    assert all(0.0 <= frac <= 1.0 for frac in fractions), "Each split must be between 0.0 and 1.0"
    # Compare with a tolerance: e.g. 0.7 + 0.2 + 0.1 is 0.9999999999999999 in
    # floating point and would be rejected by an exact equality check.
    assert abs(sum(fractions) - 1.0) < 1e-8, "Splits must add up to 1.0"

    # Determine the split sizes
    total_size = len(dataset)
    train_size = int(total_size * train_split)
    val_size = int(total_size * val_split)
    # The test subset takes the remainder so the three sizes always sum to
    # len(dataset); truncating all three independently can drop samples and
    # make random_split raise.
    test_size = total_size - train_size - val_size

    # This ensures reproducibility
    torch.manual_seed(random_seed)

    # Split the dataset and create loaders
    train_dataset, val_dataset, test_dataset = random_split(dataset, [train_size, val_size, test_size])
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
    test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)
    return train_loader, val_loader, test_loader