In [3]:
from pathlib import Path

import pandas as pd
import pytorch_lightning as pl
import torch
from torch.utils.data import Dataset, DataLoader, random_split

In [5]:
# Root of the public dataset split: <parent of the working directory>/data/final/public.
data_dir = Path.cwd().parent.joinpath("data", "final", "public")

# Image archive that sits next to the annotation files.
img_tar_path = data_dir.joinpath("img.tar.gz")

# One JSON-lines annotation file per split.
train_path = data_dir.joinpath("train.jsonl")
dev_path = data_dir.joinpath("dev.jsonl")
test_path = data_dir.joinpath("test.jsonl")

PosixPath('/notebooks/data/final/public')

In [None]:
# Image preprocessing pipeline: resize every image to 224x224, then convert
# it with torchvision's ToTensor transform.
image_transform = torchvision.transforms.Compose([
    torchvision.transforms.Resize(size=(224, 224)),
    torchvision.transforms.ToTensor(),
])

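# TODO: text_transform is not defined yet. The commented-out text_clean()
# draft below is disabled and kept for reference only.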
# def text_clean():
#     '''text clean for product titles and descriptions
#     '''
#     def cleanhtml(raw_html):
#         cleanr = re.compile('<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});')
#         cleantext = re.sub(cleanr, ' ', raw_html)
#         cleansp = re.sub('\s+', ' ', cleantext)
#         return cleansp

#     # text clean for train and valid set
#     data_root = 'data/raw/'
#     train = pd.read_csv('data/preprocessed/train_0.tsv', sep = '\t')
#     valid = pd.read_csv('data/preprocessed/valid_0.tsv', sep = '\t')
#     train['Title'] = train['Title'].astype(str).apply(cleanhtml)
#     train['Description'] = train['Description'].astype(str).apply(cleanhtml)
#     valid['Title'] = valid['Title'].astype(str).apply(cleanhtml)
#     valid['Description'] = valid['Description'].astype(str).apply(cleanhtml)
#     train.to_csv('data/preprocessed/train_0_clean.tsv', index = False, sep = '\t')
#     valid.to_csv('data/preprocessed/valid_0_clean.tsv', index = False, sep = '\t')

#     # text clean for phase1 test set
#     X_test = pd.read_csv(data_root + 'phase1/data/x_test_task1_phase1.tsv', sep = '\t')
#     X_test['Title'] = X_test['Title'].astype(str).apply(cleanhtml)
#     X_test['Description'] = X_test['Description'].astype(str).apply(cleanhtml)
#     X_test.to_csv('data/preprocessed/x_test_task1_phase1_clean.tsv', index = False, sep = '\t')

#     # text clean for phase2 test set
#     X_test = pd.read_csv(data_root + 'phase2/x_test_task1_phase2.tsv', sep = '\t')
#     X_test['Title'] = X_test['Title'].astype(str).apply(cleanhtml)
#     X_test['Description'] = X_test['Description'].astype(str).apply(cleanhtml)
#     X_test.to_csv('data/preprocessed/x_test_task1_phase2_clean.tsv', index = False, sep = '\t')

In [None]:
class ProductDataset(Dataset):
    """Image + text dataset backed by a JSON-lines annotation file.

    Every line of the annotation file becomes one sample. The code reads the
    columns ``id``, ``img`` (image file name, relative to ``img_path``),
    ``text`` and, when present, ``label``.

    ``__getitem__`` returns samples as dictionaries with keys for

        "id", the value of the ``id`` column
        "image", the result of ``image_transform`` on the RGB image
        "text", a tensor built from ``text_transform.get_sentence_vector``
        "label", a scalar long tensor, only if a ``label`` column exists
    """

    def __init__(
        self,
        txt_path,
        img_path,
        image_transform,
        text_transform,
        random_state=0,
    ):
        """Load the annotation file and validate the referenced image files.

        Args:
            txt_path: Path of the JSON-lines annotation file.
            img_path: Directory that the ``img`` column is relative to
                (``str`` or ``Path``).
            image_transform: Callable applied to each PIL image in
                ``__getitem__``.
            text_transform: Object exposing ``get_sentence_vector(text)``
                (presumably a fastText model -- confirm with the caller).
            random_state: Accepted for interface compatibility; nothing in
                this class reads it.

        Raises:
            FileNotFoundError: If any referenced image path does not exist.
            TypeError: If any referenced image path is not a regular file.
        """
        # Use the constructor arguments themselves; accepting ``str`` as well
        # as ``Path`` for the image directory.
        img_dir = Path(img_path)

        samples = pd.read_json(txt_path, lines=True).reset_index(drop=True)
        # Turn the relative file names into full paths. Assigning by key (not
        # by attribute) always targets the column.
        samples["img"] = samples["img"].map(lambda name: img_dir / name)

        # Plain pathlib checks: no third-party ``.path`` accessor required.
        exists = samples["img"].map(lambda p: p.exists()).astype(bool)
        if not exists.all():
            missing = samples.loc[~exists, "img"]
            raise FileNotFoundError(
                f"{len(missing)} image path(s) do not exist, e.g. {missing.iloc[0]}"
            )
        is_file = samples["img"].map(lambda p: p.is_file()).astype(bool)
        if not is_file.all():
            not_files = samples.loc[~is_file, "img"]
            raise TypeError(
                f"{len(not_files)} image path(s) are not regular files, "
                f"e.g. {not_files.iloc[0]}"
            )

        self.samples_frame = samples
        self.image_transform = image_transform
        self.text_transform = text_transform

    def __len__(self):
        """Return the number of rows in the annotation frame."""
        return len(self.samples_frame)

    def __getitem__(self, idx):
        """Build the sample dictionary for row ``idx``.

        A tensor index is first converted with ``tolist()``. The returned
        dict always has "id", "image" and "text"; "label" is added only when
        the annotation frame has a ``label`` column.
        """
        if torch.is_tensor(idx):
            idx = idx.tolist()

        img_id = self.samples_frame.loc[idx, "id"]

        # ``convert`` returns an independent image, so the underlying file can
        # be closed right away instead of lingering until garbage collection.
        with Image.open(self.samples_frame.loc[idx, "img"]) as raw_image:
            image = raw_image.convert("RGB")
        image = self.image_transform(image)

        text = torch.Tensor(
            self.text_transform.get_sentence_vector(
                self.samples_frame.loc[idx, "text"]
            )
        ).squeeze()

        sample = {
            "id": img_id,
            "image": image,
            "text": text,
        }
        if "label" in self.samples_frame.columns:
            # Wrap in a list, cast to long, then squeeze to a 0-dim tensor.
            sample["label"] = torch.Tensor(
                [self.samples_frame.loc[idx, "label"]]
            ).long().squeeze()

        return sample

In [2]:

class ECTextDataset(Dataset):
    """Placeholder dataset class with no behavior of its own yet.

    The docstring doubles as the class body so that the cell parses.
    TODO: implement ``__len__`` and ``__getitem__``.
    """

class ProductDataModule(pl.LightningDataModule):
    """LightningDataModule that builds MNIST train/val/test dataloaders.

    NOTE(review): despite the name, everything in this class operates on the
    MNIST dataset; nothing here reads product data yet.
    """

    # Sizes passed to ``random_split`` for the train/validation subsets.
    # Assumes the MNIST training set has 55000 + 5000 samples -- TODO confirm.
    _TRAIN_VAL_SPLIT = (55000, 5000)

    def __init__(self, data_dir: str = './', batch_size: int = 32):
        """Store configuration and build the image transform.

        Args:
            data_dir: Directory handed to ``MNIST`` as its root.
            batch_size: Batch size used by all three dataloaders. Defaults to
                32, the value that used to be hard-coded in each loader.
        """
        super().__init__()
        self.data_dir = data_dir
        self.batch_size = batch_size
        # Constants are presumably the MNIST mean/std -- TODO confirm.
        self.transform = transforms.Compose([
            transforms.ToTensor(),
            transforms.Normalize((0.1307,), (0.3081,))
        ])

        # Default dims/classes are set statically here; they could instead be
        # assigned dynamically in setup().
        self.dims = (1, 28, 28)
        self.num_classes = 10

    def prepare_data(self):
        """Instantiate both MNIST splits with ``download=True``."""
        MNIST(self.data_dir, train=True, download=True)
        MNIST(self.data_dir, train=False, download=True)

    def setup(self, stage=None):
        """Create the datasets needed for ``stage``.

        ``'fit'`` builds the train/val subsets, ``'test'`` builds the test
        set, and ``None`` builds all of them.
        """
        if stage in (None, 'fit'):
            mnist_full = MNIST(self.data_dir, train=True, transform=self.transform)
            self.mnist_train, self.mnist_val = random_split(
                mnist_full, list(self._TRAIN_VAL_SPLIT)
            )

        if stage in (None, 'test'):
            self.mnist_test = MNIST(self.data_dir, train=False, transform=self.transform)

    def _make_loader(self, dataset):
        """Wrap ``dataset`` in a DataLoader using the configured batch size."""
        return DataLoader(dataset, batch_size=self.batch_size)

    def train_dataloader(self):
        """Return the dataloader over the training subset."""
        return self._make_loader(self.mnist_train)

    def val_dataloader(self):
        """Return the dataloader over the validation subset."""
        return self._make_loader(self.mnist_val)

    def test_dataloader(self):
        """Return the dataloader over the test set."""
        return self._make_loader(self.mnist_test)