In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import torch
import torchvision
import os
from torch.utils.data import Dataset, DataLoader
from PIL import Image
import math



class HotelDataset(Dataset):
    """Image dataset stored as one sub-folder per hotel.

    Expected layout: ``root_dir/data_path`` contains one folder per hotel
    (the folder name is used as the label) and every hotel folder contains
    only image files.

    Args:
        root_dir: Base directory of the data.
        data_path: Sub-path below ``root_dir`` that holds the hotel folders.
        min_images: Hotels with fewer images than this are left out.
        max_images: Hotels with more images than this are left out
            (``None`` disables the check).
        max_entries: Upper bound on the number of samples in the dataset
            (``None`` means no limit). Hotels are added in sorted order until
            the bound is reached; the last hotel is truncated if necessary.
        transform: Optional callable applied to every loaded PIL image.

    Attributes:
        hotel_data: Byte-string array of shape ``(len(self), 2)``; column 0
            holds the image file name, column 1 the hotel folder name.
        num_labels: Number of hotels that contribute at least one sample.
        total_files: Number of samples, identical to ``len(self)``.
    """

    def __init__(self, root_dir="../input/hotel-id-to-combat-human-trafficking-2022-fgvc9/", data_path="train_images/", min_images=1, max_images=None, max_entries=None, transform=None):
        self.root_dir = root_dir
        self.data_path = data_path
        self.transform = transform
        self.full_data_path = os.path.join(root_dir, data_path)

        entries = []
        kept_hotels = 0
        # Sorted traversal makes the sample order (and therefore the effect of
        # max_entries) reproducible; os.listdir order is arbitrary.
        for hotel in sorted(os.listdir(self.full_data_path)):
            if max_entries is not None and len(entries) >= max_entries:
                break

            hotel_dir = os.path.join(self.full_data_path, hotel)
            if not os.path.isdir(hotel_dir):
                # Stray files next to the hotel folders are not labels.
                continue

            files = sorted(os.listdir(hotel_dir))
            if len(files) < min_images:
                continue
            if max_images is not None and len(files) > max_images:
                continue

            if max_entries is not None:
                # Never exceed the requested number of samples.
                files = files[:max_entries - len(entries)]
            if not files:
                continue

            # Names are stored as bytes so the whole index lives in a single
            # numpy array instead of many Python objects.
            entries.extend([os.fsencode(f), os.fsencode(hotel)] for f in files)
            kept_hotels += 1

        if entries:
            # Item size is inferred from the longest name, so nothing is truncated.
            self.hotel_data = np.array(entries, dtype=np.bytes_)
        else:
            self.hotel_data = np.empty((0, 2), dtype="S1")

        self.num_labels = kept_hotels
        self.total_files = len(entries)

    def __getitem__(self, index):
        """Return ``(item, label)`` for the sample at ``index``.

        ``item`` is the PIL image, or the result of ``transform`` when one was
        given; ``label`` is the hotel folder name as ``str``.
        """
        hotel_image_id = os.fsdecode(self.hotel_data[index, 0])
        label = os.fsdecode(self.hotel_data[index, 1])
        image_path = os.path.join(self.full_data_path, label, hotel_image_id)

        # Force the pixel data to be read inside the context manager so the
        # file handle is released immediately instead of staying open until
        # the image happens to be decoded lazily.
        with Image.open(image_path) as image:
            image.load()

        item = image
        if self.transform is not None:
            item = self.transform(image)
        return item, label

    def __len__(self):
        """Return the number of samples in the dataset."""
        return self.total_files
#test = np.chararray([50,2], itemsize=MAX_WINDOWS_FILENAME_CHAR_LENGTH)
#test[:] = "testSomeStringLol"
#print(test)
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

#import os
#for dirname, _, filenames in os.walk('/kaggle/input'):
#    for filename in filenames:
#        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

Testing the dataset below

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import torch
import torchvision
from torch.utils.data import Dataset, DataLoader
from PIL import Image
import math

# Build the dataset from the default input path, passing max_entries=500.
dataset = HotelDataset(max_entries=500)

# __getitem__ returns an (item, label) pair; print the first one as a smoke test.
print(dataset[0])

Testing transforms. Transforms can be composed, and some of them apply random augmentations, such as random rotation.

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import torch
import torchvision
from torch.utils.data import Dataset, DataLoader
from PIL import Image
import math

# Target size of the resized images, in pixels.
IMAGE_SIZE_WIDTH = 250
IMAGE_SIZE_HEIGHT = 250

# torchvision's Resize takes its size as (height, width), so the height
# constant has to come first; passing (width, height) would silently swap the
# axes as soon as the two values differ.
train_transforms = torchvision.transforms.Compose([
    torchvision.transforms.Resize((IMAGE_SIZE_HEIGHT, IMAGE_SIZE_WIDTH)),
    torchvision.transforms.RandomHorizontalFlip()
])

# Same smoke test as for the plain dataset, now with the transform pipeline
# applied to every image that is fetched.
dataset = HotelDataset(max_entries=500, transform=train_transforms)

print(dataset[0])