From cad18a49660495bd7f21d05ed7d1d34ef0a9cb02 Mon Sep 17 00:00:00 2001 From: Ponku Date: Sun, 10 Jul 2022 17:05:50 +0100 Subject: [PATCH 01/35] Added Stereo Matching dataset interface and several classic datasets. --- torchvision/datasets/_stereo_matching.py | 479 +++++++++++++++++++++++ 1 file changed, 479 insertions(+) create mode 100644 torchvision/datasets/_stereo_matching.py diff --git a/torchvision/datasets/_stereo_matching.py b/torchvision/datasets/_stereo_matching.py new file mode 100644 index 00000000000..42535c1623b --- /dev/null +++ b/torchvision/datasets/_stereo_matching.py @@ -0,0 +1,479 @@ +from abc import ABC, abstractmethod +from functools import reduce +from glob import glob +from pathlib import Path +from random import random +import re +import shutil +from typing import Callable, List, Optional, Tuple, Any +import lzma +from torch import Tensor +from .vision import VisionDataset +from .utils import download_and_extract_archive, download_url, verify_str_arg +import os +from torch.utils.model_zoo import tqdm +import numpy as np +from PIL import Image + +__all__ = ( + "CSEStereo" + "Middlebury2014" + "ETH3D" + "Kitti2012" + "Kitti2015" +) + + +def read_pfm_file(file_path: str) -> np.array: + # adapted from https://github.com/ucbdrive/hd3/blob/master/utils/pfm.py + with open(file_path, "rb") as file: + header = file.readline().rstrip() + assert header in ["PF", "Pf"], f"{file_path} is not a valid .pfm file" + dim_match = re.match(r'^(\d+)\s(\d+)\s$', file.readline()) + assert dim_match, f"{file_path} has a Malformed PFM header" + + width, height = map(int, dim_match.groups()) + channels = 3 if header == "PF" else 1 + scale = float(file.readline().rstrip()) + # check for endian type + if scale < 0: + scale = -scale + endian = '<' + else: + endian = '>' + + data = np.fromfile(file, endian + 'f') + data = np.reshape(data, (height, width, channels)) + data = np.flipud(data) + + return data + + +class StereoMatchingDataset(ABC, VisionDataset): + """Base interface for Stereo matching datasets""" + + def __init__(self, root: str, transforms: Optional[Callable] = None): + super().__init__(root=root) + self.transforms = transforms + + self._images: List[Tuple] = [] + self._disparities: List[Tuple] = [] + + def _read_img(self, file_path: str) -> Image.Image: + img = Image.open(file_path) + if img.mode != "RGB": + img = img.convert("RGB") + return img + + @abstractmethod + def _read_disparity(self, file_path: str) -> Tuple: + # function that returns a disparity map and an occlusion map + pass + + def __getitem__(self, index: int) -> Tuple[Tuple, Tuple, Tuple]: + img_left = self._read_img(self._images[index][0]) + img_right = self._read_img(self._images[index][1]) + + dsp_map_left, occ_mask_left = self._read_disparity(self._disparities[index][0]) + dsp_map_right, occ_mask_right = self._read_disparity(self._disparities[index][1]) + + imgs = (img_left, img_right) + dsp_maps = (dsp_map_left, dsp_map_right) + occ_masks = (occ_mask_left, occ_mask_right) + + if self.transforms is not None: + imgs, dsp_maps, occ_masks, = self.transforms(imgs, dsp_maps, occ_masks) + + return imgs, dsp_maps, occ_masks + + def __len__(self) -> int: + return len(self._images) + + +class CRESSyntethicStereo(StereoMatchingDataset): + """Synthetic dataset used in training the `CREStereo `_ architecture. + + Ported from the download script in the paper github `repo `_. 
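+
+    Args:
+        root (string): Root directory of the dataset.
+        transforms (callable, optional): A function/transform that takes in
+            ``left_img, right_img, left_disparity, right_disparity`` and returns a transformed version.
+        download (boolean, optional): Whether or not to download the dataset in the ``root`` directory.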
+ """ + DOWNLOAD_SPACE = 4 * 1024 * 1024 * 1024 # dataset requires download requires about 400 GB of free space + + EXPERIMENTAL_RANGE = 1 # TODO: remove after validating dataset structure / flow + + def __init__(self, root: str, transforms: Optional[Callable] = None, download: bool = True): + super().__init__(root, transforms) + # if the API user requests a dataset download check that the user can download it + if download: + statvfs = os.statvfs(root) + # measured in bytes + available_space = statvfs.f_frsize * statvfs.f_bavail + if available_space - self.DOWNLOAD_SPACE < 0: + raise ValueError( + f"The storage device for {root} is too small to download the dataset), " + f"an additional {self.DOWNLOAD_SPACE - self.available_space:.2f} GB are required." + ) + self._download_dataset(root) + + def _download_dataset(self, root: str) -> None: + # TODO: remove before release, used only for testing purposes + dirs = ["tree", "shapenet", "reflective", "hole"] + # create directory subtree for the download + for d in dirs: + d_path = os.path.join(root, d) + if not os.path.exists(d_path): + os.makedirs(d_path) + + for i in range(self.EXPERIMENTAL_RANGE): + url = f"https://data.megengine.org.cn/research/crestereo/dataset/{d}/{i}.tar" + download_and_extract_archive(url=url, download_root=d_path, remove_finished=True) + + +class Middlebury2014(StereoMatchingDataset): + """Publicly available scenes from the Middlebury dataset `2014 version `. + + The dataset mostly follows the original format, without containing the ambient subdirectories. : :: + + root + Middlebury2014 + train + scene1-{ ,perfect,imperfect} + calib.txt + im{0,1}.png + im1E.png + im1L.png + disp{0,1}.pfm + disp{0,1}-n.png + disp{0,1}-sd.pfm + disp{0,1}y.pfm + scene2-{ ,perfect,imperfect} + calib.txt + im{0,1}.png + im1E.png + im1L.png + disp{0,1}.pfm + disp{0,1}-n.png + disp{0,1}-sd.pfm + disp{0,1}y.pfm + ... + additional + scene1-{ ,perfect,imperfect} + calib.txt + im{0,1}.png + im1E.png + im1L.png + disp{0,1}.pfm + disp{0,1}-n.png + disp{0,1}-sd.pfm + disp{0,1}y.pfm + ... + test + scene1 + calib.txt + im{0,1}.png + scene2 + calib.txt + im{0,1}.png + ... + + + Args: + root (string): Root directory of the Middleburry 2014 Dataset. + split (string, optional): The dataset split of scenes, either "train" (default), test, or "additional" + use_ambient_views (boolean, optional): Whether to use different expose or lightning views when possible. Sampled with equal probability. + calibration (string, optional): Wether or not to use the calibrated (default) or uncalibrated scenes. + transforms (callalbe, optional): A function/transform that takes in + ``left_img, right_img, left_disparity, right_disparity`` and returns a transformed version. + download (boolean, optional): Wether or not to download the dataset in the ``root`` directory. 
+ """ + + splits = { + "train": ["Adirondack", "Jadeplant", "Motorcycle", "Piano", "Pipes", "Playroom", "Playtable", "Recycle", "Shelves", "Vintage"], + "additional": ["Backpack", "Bicycle1", "Cable", "Classroom1", "Couch", "Flowers", "Mask", "Shopvac", "Sticks", "Storage", "Sword1", "Sword2", "Umbrella"], + "test": ['Plants', 'Classroom2E', 'Classroom2', 'Australia', 'DjembeL', 'CrusadeP', 'Crusade', 'Hoops', 'Bicycle2', 'Staircase', 'Newkuba', 'AustraliaP', 'Djembe', 'Livingroom', 'Computer'] + } + + def __init__( + self, + *, + root: str, + split: str = "train", + use_ambient_views: bool = False, + transforms: Optional[Callable] = None, + download: bool = False + ): + super().__init__(root, transforms) + verify_str_arg(split, "split", valid_values=("train", "test", "additional")) + + if download: + self._download_dataset(root) + + root = Path(root) / "FlyingChairs" + if not os.path.exists(root / split): + raise FileNotFoundError( + f"The {split} directory was not found in the provided root directory" + ) + + split_scenes = self.splits[split] + # check that the provided root folder contains the scene splits + if not all(s in os.listdir(root / split) for s in split_scenes): + raise FileNotFoundError(f"Provided root folder does not contain any scenes from the {split} split.") + + imgs_left = sorted(glob(str(root / split / "*" / "im0.png"))) + imgs_right = sorted(glob(str(root / split / "*" / "im1.png"))) + self._images = list((l, r) for l, r in zip(imgs_left, imgs_right)) + + if split == "test": + dsp_maps_left, dsp_maps_right = list("" for _ in imgs_left), list("" for _ in imgs_right) + else: + + dsp_maps_left = sorted(glob(str(root / split / "*" / "disp0.pfm"))) + dsp_maps_right = sorted(glob(str(root / split / "*" / "disp1.pfm"))) + self._disparities = list((l, r) for l, r in zip(dsp_maps_left, dsp_maps_right)) + + self.use_ambient_views = use_ambient_views + + def __getitem__(self, index: int) -> Tuple: + return super().__getitem__(index) + + def _read_img(self, file_path: str) -> Image.Image: + if os.path.basename(file_path) == "im1.png" and self.use_ambient_views: + # initialize sampleable container + ambient_file_paths = list(file_path.replace("im1.png", view_name) for view_name in ["im1E.png", "im1L.png"]) + # double check that we're not going to try to read from an invalid file path + ambient_file_paths = list(filter(lambda p: os.path.exists(p), ambient_file_paths)) + # keep the original image as an option as well for uniform sampling between base views + ambient_file_paths.append(file_path) + file_path = random.choice(ambient_file_paths) + return super()._read_img(file_path) + + def _read_disparity(self, file_path: str) -> Tuple: + if not os.path.exists(file_path): # case when dealing with the test split + return None, None + dsp_mask = read_pfm_file(file_path) + occ_mask = dsp_mask < 1e3 + return dsp_mask, occ_mask + + def _download_dataset(self, root: str): + base_url = "https://vision.middlebury.edu/stereo/data/scenes2014/zip" + # train and additional splits have 2 different calibration settings + root = Path(root) / "Middlebury2014" + for split_name, split_scenes in self.splits.values(): + if split_name == "test": + continue + split_root = root / split_name + for scene in split_scenes: + scene_name = f"{scene}-{calibration}" + for calibration in ["perfect", "imperfect"]: + scene_url = f"{base_url}/{scene_name}.zip" + download_and_extract_archive(url=scene_url, filename=scene_name, download_root=str(split_root), remove_finished=True) + + if any(s not in os.listdir(root) for s 
in self.splits["test"]): + # test split is downloaded from a different location + test_set_url = "https://vision.middlebury.edu/stereo/submit3/zip/MiddEval3-data-F.zip" + + # the unzip is going to produce a directory MiddEval3 with two subdirectories trainingF and testF + # we want to move the contents from testF into the directory + download_and_extract_archive(url=test_set_url, download_root=root, remove_finished=True) + for scene_dir, scene_names, _ in os.walk(str(root / "MiddEval3/testF")): + for scene in scene_names: + shutil.move(os.path.join(scene_dir, scene), os.path.join(root, scene)) + + # cleanup MiddEval3 directory + shutil.rmtree(os.path.join(root, "MiddEval3")) + + +class ETH3D(StereoMatchingDataset): + """"ETH3D `Low-Res Two-View `_ dataset. + + The dataset is expected to have the following structure: :: + + root + ETH3D + two_view_training + scene1 + im1.png + im0.png + images.txt + cameras.txt + calib.txt + scene2 + im1.png + im0.png + images.txt + cameras.txt + calib.txt + ... + two_view_training_gt + scene1 + disp0GT.pfm + mask0nocc.png + scene2 + disp0GT.pfm + mask0nocc.png + ... + two_view_testing + scene1 + im1.png + im0.png + images.txt + cameras.txt + calib.txt + scene2 + im1.png + im0.png + images.txt + cameras.txt + calib.txt + ... + + Args: + root (string): Root directory of the ETH3D Dataset. + split (string, optional): The dataset split of scenes, either "train" (default) or "test". + calibration (string, optional): Wether or not to use the calibrated (default) or uncalibrated scenes. + transforms (callalbe, optional): A function/transform that takes in + ``left_img, right_img, left_disparity, right_disparity`` and returns a transformed version. + """ + + def __init__(self, root: str, split: str = "train", transforms: Optional[Callable] = None): + super().__init__(root, transforms) + + verify_str_arg(split, "split", valid_values=("train", "test")) + + root = Path(root) / "ETH3D" + img_dir = "two_view_training" if split == "train" else "two_view_testing" + anot_dir = "two_view_training_gt" + + imgs_left = sorted(glob(str(root / img_dir / "*" / "*im0.png"))) + imgs_right = sorted(glob(str(root / img_dir / "*" / "*im1.png"))) + + if split == "test": + dsp_masks_left, dsp_masks_right = list("" for _ in imgs_left), list("" for _ in imgs_right) + else: + dsp_masks_left = sorted(glob(str(root / anot_dir / "*" / "*[0-1].pfm"))) + # no masks for the right view, always using left as reference + dsp_masks_right = list("" for _ in dsp_masks_left) + + self._images = list((l, r) for l, r in zip(imgs_left, imgs_right)) + self._dsp_masks = list((l, r) for l, r in zip(dsp_masks_left, dsp_masks_right)) + + def _read_disparity(self, file_path: str) -> Tuple: + if not os.path.exists(file_path): + return None, None + + dsp_mask = read_pfm_file(file_path) + occ_mask = Image.open(file_path.replace("disp0GT.pfm", "mask0nocc.png")) + occ_mask = np.array(occ_mask) + return dsp_mask, occ_mask + + def __getitem__(self, index: int) -> Tuple[Tuple, Tuple, Tuple]: + return super().__getitem__(index) + + +class Kitti2012(StereoMatchingDataset): + """"Kitti dataset from the `2012 `_ stereo evaluation benchmark. + Uses the RGB images for consistency with Kitti 2015. + + The dataset is expected to have the following structure: :: + + root + Kitti2012 + testing + colored_0 + colored_1 + training + colored_0 + colored_1 + disp_noc + calib + + Args: + root (string): Root directory where Kitti2012 is located. 
+        split (string, optional): The dataset split of scenes, either "train" (default) or "test".
+        transforms (callable, optional): A function/transform that takes in
+            ``left_img, right_img, left_disparity, right_disparity`` and returns a transformed version.
+        download (boolean, optional): Whether or not to download the dataset in the ``root`` directory.
+    """
+
+    def __init__(self, root: str, split: str = "train", transforms: Optional[Callable] = None):
+        super().__init__(root, transforms)
+
+        verify_str_arg(split, "split", valid_values=("train", "test"))
+
+        root = Path(root) / "Kitti2012" / (split + "ing")
+        imgs_left = sorted(glob(str(root / "colored_0" / "*_10.png")))
+        imgs_right = sorted(glob(str(root / "colored_1" / "*_10.png")))
+
+        if split == "train":
+            dsp_masks_left = sorted(glob(str(root / "disp_noc" / "*.png")))
+            dsp_masks_right = list("" for _ in dsp_masks_left)
+        else:
+            dsp_masks_left, dsp_masks_right = list("" for _ in dsp_masks_left), list("" for _ in dsp_masks_right)
+
+        self._images = list((l, r) for l, r in zip(imgs_left, imgs_right))
+        self._dsp_masks = list((l, r) for l, r in zip(dsp_masks_left, dsp_masks_right))
+
+    def _read_disparity(self, file_path: str) -> Tuple:
+        if not os.path.exists(file_path):
+            return None, None
+
+        dsp_mask = np.array(Image.open(file_path)) / 256.0
+        occ_mask = dsp_mask > 0.0
+
+        return dsp_mask, occ_mask
+
+    def __getitem__(self, index: int) -> Tuple[Tuple, Tuple, Tuple]:
+        return super().__getitem__(index)
+
+
+class Kitti2015(StereoMatchingDataset):
+    """Kitti dataset from the `2015 `_ stereo evaluation benchmark.
+
+    The dataset is expected to have the following structure: ::
+
+        root
+            Kitti2015
+                testing
+                    image_2
+                    image_3
+                training
+                    image_2
+                    image_3
+                    disp_noc_0
+                    disp_noc_1
+                    calib
+
+    Args:
+        root (string): Root directory where Kitti2015 is located.
+        split (string, optional): The dataset split of scenes, either "train" (default) or "test".
+        transforms (callable, optional): A function/transform that takes in
+            ``left_img, right_img, left_disparity, right_disparity`` and returns a transformed version.
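+
+    Example:
+        A minimal usage sketch, assuming the KITTI 2015 archives are already
+        extracted under ``root`` in the layout above::
+
+            dataset = Kitti2015(root="datasets", split="train")
+            imgs, dsp_maps, occ_masks = dataset[0]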
+ """ + + def __init__(self, root: str, split: str = "train", transforms: Optional[Callable] = None): + super().__init__(root, transforms) + + verify_str_arg(split, "split", valid_values=("train", "test")) + + root = Path(root) / "Kitti2015" / (split + "ing") + imgs_left = sorted(glob(str(root / "image_2" / "*_10.png"))) + imgs_right = sorted(glob(str(root / "image_3" / "*_10.png"))) + + if split == "train": + dsp_masks_left = sorted(glob(str(root / "disp_noc_0" / "*.png"))) + dsp_masks_right = sorted(glob(str(root / "disp_noc_1" / "*.png"))) + else: + dsp_masks_left, dsp_masks_right = list("" for _ in dsp_masks_left), list("" for _ in dsp_masks_right) + + self._images = list((l, r) for l, r in zip(imgs_left, imgs_right)) + self._dsp_masks = list((l, r) for l, r in zip(dsp_masks_left, dsp_masks_right)) + + def _read_disparity(self, file_path: str) -> Tuple: + if not os.path.exists(file_path): + return None, None + + dsp_mask = np.array(Image.open(file_path)) / 256.0 + occ_mask = dsp_mask > 0.0 + + return dsp_mask, occ_mask + + def __getitem__(self, index: int) -> Tuple[Tuple, Tuple, Tuple]: + return super().__getitem__(index) From df6ec4ba3f1ad48a01748637213122ccbd3b73c3 Mon Sep 17 00:00:00 2001 From: Ponku Date: Mon, 11 Jul 2022 20:19:23 +0100 Subject: [PATCH 02/35] added SceneFlow, FallingThings and CREStereo --- torchvision/datasets/_stereo_matching.py | 47 +++++++++++++++++++++++- vision | 1 + 2 files changed, 47 insertions(+), 1 deletion(-) create mode 160000 vision diff --git a/torchvision/datasets/_stereo_matching.py b/torchvision/datasets/_stereo_matching.py index 42535c1623b..960e443bd46 100644 --- a/torchvision/datasets/_stereo_matching.py +++ b/torchvision/datasets/_stereo_matching.py @@ -28,7 +28,8 @@ def read_pfm_file(file_path: str) -> np.array: # adapted from https://github.com/ucbdrive/hd3/blob/master/utils/pfm.py with open(file_path, "rb") as file: header = file.readline().rstrip() - assert header in ["PF", "Pf"], f"{file_path} is not a valid .pfm file" + assert header in [b"PF", b"Pf"], f"{file_path} is not a valid .pfm file" + dim_match = re.match(r'^(\d+)\s(\d+)\s$', file.readline()) assert dim_match, f"{file_path} has a Malformed PFM header" @@ -477,3 +478,47 @@ def _read_disparity(self, file_path: str) -> Tuple: def __getitem__(self, index: int) -> Tuple[Tuple, Tuple, Tuple]: return super().__getitem__(index) + + +class SintelDataset(StereoMatchingDataset): + """"Sintel `Stereo Dataset `_. + + Args: + root (string): Root directory where Sintel Stereo is located. + transforms (callalbe, optional): A function/transform that takes in + ``left_img, right_img, left_disparity, right_disparity`` and returns a transformed version. 
+ """ + + def __init__(self, root: str, transforms: Optional[Callable] = None): + super().__init__(root, transforms) + + root = Path(root) / "Sintel" + + imgs_left = sorted(glob(str(root / "training" / "final_left" / "*" / "*.png"))) + imgs_right = sorted(glob(str(root / "training" / "final_right" / "*" / "*.png"))) + + dps_masks_left = sorted(glob(str(root / "training" / "disparities" / "*" / "*.png"))) + dsp_masks_right = list("" for _ in dps_masks_left) + + self._images = list((l, r) for l, r in zip(imgs_left, imgs_right)) + self._dsp_masks = list((l, r) for l, r in zip(dps_masks_left, dsp_masks_right)) + + def _read_disparity(self, file_path: str) -> Tuple: + if not os.path.exists(file_path): + return None, None + + # disparity decoding as per Sintel instructions + dsp_mask = np.array(Image.open(file_path), dtype=np.float32) + r, g, b = np.split(dsp_mask, 3, axis=-1) + dsp_mask = r * 4 + g / (2**6) + b / (2**14) + + # occlusion mask + occ_mask = np.array(Image.open(file_path.replace("disparities", "occlusions"))) > 0 + # out of frame mask + off_mask = np.array(Image.open(file_path.replace("disparities", "outofframe"))) > 0 + # combine the masks together + occ_mask = np.logical_or(off_mask, occ_mask) + return dsp_mask, occ_mask + + def __getitem__(self, index: int) -> Tuple[Tuple, Tuple, Tuple]: + return super().__getitem__(index) diff --git a/vision b/vision new file mode 160000 index 00000000000..bd19fb8ea9b --- /dev/null +++ b/vision @@ -0,0 +1 @@ +Subproject commit bd19fb8ea9b1f67df2a2a1ee116874609ad3ee8c From d0c5afbcb37f430626f77d23bf153ec044160c31 Mon Sep 17 00:00:00 2001 From: Ponku Date: Mon, 11 Jul 2022 23:29:04 +0100 Subject: [PATCH 03/35] added SceneFlow, FallingThings and CREStereo --- torchvision/datasets/_stereo_matching.py | 228 ++++++++++++++++++----- 1 file changed, 183 insertions(+), 45 deletions(-) diff --git a/torchvision/datasets/_stereo_matching.py b/torchvision/datasets/_stereo_matching.py index 960e443bd46..65336503b87 100644 --- a/torchvision/datasets/_stereo_matching.py +++ b/torchvision/datasets/_stereo_matching.py @@ -1,26 +1,28 @@ from abc import ABC, abstractmethod -from functools import reduce from glob import glob from pathlib import Path from random import random import re import shutil from typing import Callable, List, Optional, Tuple, Any -import lzma from torch import Tensor from .vision import VisionDataset from .utils import download_and_extract_archive, download_url, verify_str_arg import os -from torch.utils.model_zoo import tqdm import numpy as np from PIL import Image +import json __all__ = ( - "CSEStereo" + "CREStereo" # waiting for download "Middlebury2014" "ETH3D" "Kitti2012" "Kitti2015" + "Sintel" + "SceneFlow" # need to find valid mask procedure + "FallingThings" + "InStereo2k" # waiting for download ) @@ -71,21 +73,21 @@ def _read_disparity(self, file_path: str) -> Tuple: # function that returns a disparity map and an occlusion map pass - def __getitem__(self, index: int) -> Tuple[Tuple, Tuple, Tuple]: + def __getitem__(self, index: int) -> Tuple[Tensor, Tensor, Tensor, Tensor]: img_left = self._read_img(self._images[index][0]) img_right = self._read_img(self._images[index][1]) - dsp_map_left, occ_mask_left = self._read_disparity(self._disparities[index][0]) - dsp_map_right, occ_mask_right = self._read_disparity(self._disparities[index][1]) + dsp_map_left, valid_mask_right = self._read_disparity(self._disparities[index][0]) + dsp_map_right, valid_mask_right = self._read_disparity(self._disparities[index][1]) imgs = (img_left, img_right) 
dsp_maps = (dsp_map_left, dsp_map_right) - occ_masks = (occ_mask_left, occ_mask_right) + valid_masks = (valid_mask_right, valid_mask_right) if self.transforms is not None: - imgs, dsp_maps, occ_masks, = self.transforms(imgs, dsp_maps, occ_masks) + imgs, dsp_maps, valid_masks, = self.transforms(imgs, dsp_maps, valid_masks) - return imgs, dsp_maps, occ_masks + return imgs[0], imgs[1], dsp_maps[0], valid_masks[0] def __len__(self) -> int: return len(self._images) @@ -100,7 +102,9 @@ class CRESSyntethicStereo(StereoMatchingDataset): EXPERIMENTAL_RANGE = 1 # TODO: remove after validating dataset structure / flow - def __init__(self, root: str, transforms: Optional[Callable] = None, download: bool = True): + MAX_DISP = 256. + + def __init__(self, root: str, split: str = "tree", transforms: Optional[Callable] = None, download: bool = True): super().__init__(root, transforms) # if the API user requests a dataset download check that the user can download it if download: @@ -114,6 +118,32 @@ def __init__(self, root: str, transforms: Optional[Callable] = None, download: b ) self._download_dataset(root) + verify_str_arg(split, "split", valid_values=("tree", "shapenet", "reflective", "hole", "all")) + + splits = { + "tree": ["tree"], + "shapenet": ["shapenet"], + "reflective": ["reflective"], + "hole": ["hole"], + "all": ["hole", "shapenet", "reflective", "hole"], + }[split] + + for s in splits: + imgs_left = sorted(glob(str(root / s / "*_left.jpg"))) + imgs_right = (p.replace("_left", "_right") for p in imgs_left) + imgs = list((l, r) for l, r in zip(imgs_left, imgs_right)) + self._images += imgs + + disparity_maps_left = (p.replace("_left", "_left.disp") for p in imgs_left) + disparity_maps_right = (p.replace("_right", "_right.disp") for p in imgs_right) + disparity_maps = list((l, r) for l, r in zip(disparity_maps_left, disparity_maps_right)) + self._disparities += disparity_maps + + def _read_disparity(self, file_path: str) -> Tuple: + disparity = np.array(Image.open(file_path), dtype=np.float32) + valid = (disparity < self.MAX_DISP) & (disparity > 0.) 
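+        # a pixel is valid when it carries a positive match; MAX_DISP is an
+        # assumed upper bound on what the synthetic renderer produces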
+        return disparity, valid
+
     def _download_dataset(self, root: str) -> None:
         # TODO: remove before release, used only for testing purposes
         dirs = ["tree", "shapenet", "reflective", "hole"]
@@ -249,9 +279,9 @@ def _read_img(self, file_path: str) -> Image.Image:
     def _read_disparity(self, file_path: str) -> Tuple:
         if not os.path.exists(file_path):  # case when dealing with the test split
             return None, None
-        dsp_mask = read_pfm_file(file_path)
-        occ_mask = dsp_mask < 1e3
-        return dsp_mask, occ_mask
+        disparity_map = read_pfm_file(file_path)
+        valid_mask = disparity_map < 1e3
+        return disparity_map, valid_mask

     def _download_dataset(self, root: str):
         base_url = "https://vision.middlebury.edu/stereo/data/scenes2014/zip"
@@ -347,23 +377,23 @@ def __init__(self, root: str, split: str = "train", transforms: Optional[Callabl
         imgs_right = sorted(glob(str(root / img_dir / "*" / "*im1.png")))

         if split == "test":
-            dsp_masks_left, dsp_masks_right = list("" for _ in imgs_left), list("" for _ in imgs_right)
+            disparity_maps_left, disparity_maps_right = list("" for _ in imgs_left), list("" for _ in imgs_right)
         else:
-            dsp_masks_left = sorted(glob(str(root / anot_dir / "*" / "*[0-1].pfm")))
+            disparity_maps_left = sorted(glob(str(root / anot_dir / "*" / "*[0-1].pfm")))
             # no masks for the right view, always using left as reference
-            dsp_masks_right = list("" for _ in dsp_masks_left)
+            disparity_maps_right = list("" for _ in disparity_maps_left)

         self._images = list((l, r) for l, r in zip(imgs_left, imgs_right))
-        self._dsp_masks = list((l, r) for l, r in zip(dsp_masks_left, dsp_masks_right))
+        self._disparities = list((l, r) for l, r in zip(disparity_maps_left, disparity_maps_right))

     def _read_disparity(self, file_path: str) -> Tuple:
         if not os.path.exists(file_path):
             return None, None

-        dsp_mask = read_pfm_file(file_path)
-        occ_mask = Image.open(file_path.replace("disp0GT.pfm", "mask0nocc.png"))
-        occ_mask = np.array(occ_mask)
-        return dsp_mask, occ_mask
+        disparity_map = read_pfm_file(file_path)
+        valid_mask = Image.open(file_path.replace("disp0GT.pfm", "mask0nocc.png"))
+        valid_mask = np.array(valid_mask)
+        return disparity_map, valid_mask

     def __getitem__(self, index: int) -> Tuple[Tuple, Tuple, Tuple]:
         return super().__getitem__(index)

@@ -404,22 +434,22 @@ def __init__(self, root: str, split: str = "train", transforms: Optional[Callabl
         imgs_right = sorted(glob(str(root / "colored_1" / "*_10.png")))

         if split == "train":
-            dsp_masks_left = sorted(glob(str(root / "disp_noc" / "*.png")))
-            dsp_masks_right = list("" for _ in dsp_masks_left)
+            disparity_maps_left = sorted(glob(str(root / "disp_noc" / "*.png")))
+            disparity_maps_right = list("" for _ in disparity_maps_left)
         else:
-            dsp_masks_left, dsp_masks_right = list("" for _ in dsp_masks_left), list("" for _ in dsp_masks_right)
+            disparity_maps_left, disparity_maps_right = list("" for _ in imgs_left), list("" for _ in imgs_right)

         self._images = list((l, r) for l, r in zip(imgs_left, imgs_right))
-        self._dsp_masks = list((l, r) for l, r in zip(dsp_masks_left, dsp_masks_right))
+        self._disparities = list((l, r) for l, r in zip(disparity_maps_left, disparity_maps_right))

     def _read_disparity(self, file_path: str) -> Tuple:
         if not os.path.exists(file_path):
             return None, None

-        dsp_mask = np.array(Image.open(file_path)) / 256.0
-        occ_mask = dsp_mask > 0.0
+        disparity_map = np.array(Image.open(file_path)) / 256.0
+        valid_mask = disparity_map > 0.0

-        return dsp_mask, occ_mask
+        return disparity_map, valid_mask

     def __getitem__(self, index: int) -> Tuple[Tuple, Tuple, Tuple]:
         return super().__getitem__(index)

@@ -459,22 +489,22 @@ def __init__(self, root: str, split: str = "train", transforms: Optional[Callabl
         imgs_right = sorted(glob(str(root / "image_3" / "*_10.png")))

         if split == "train":
-            dsp_masks_left = sorted(glob(str(root / "disp_noc_0" / "*.png")))
-            dsp_masks_right = sorted(glob(str(root / "disp_noc_1" / "*.png")))
+            disparity_maps_left = sorted(glob(str(root / "disp_occ_0" / "*.png")))
+            disparity_maps_right = sorted(glob(str(root / "disp_occ_1" / "*.png")))
         else:
-            dsp_masks_left, dsp_masks_right = list("" for _ in dsp_masks_left), list("" for _ in dsp_masks_right)
+            disparity_maps_left, disparity_maps_right = list("" for _ in imgs_left), list("" for _ in imgs_right)

         self._images = list((l, r) for l, r in zip(imgs_left, imgs_right))
-        self._dsp_masks = list((l, r) for l, r in zip(dsp_masks_left, dsp_masks_right))
+        self._disparities = list((l, r) for l, r in zip(disparity_maps_left, disparity_maps_right))

     def _read_disparity(self, file_path: str) -> Tuple:
         if not os.path.exists(file_path):
             return None, None

-        dsp_mask = np.array(Image.open(file_path)) / 256.0
-        occ_mask = dsp_mask > 0.0
+        disparity_map = np.array(Image.open(file_path)) / 256.0
+        valid_mask = disparity_map > 0.0

-        return dsp_mask, occ_mask
+        return disparity_map, valid_mask

     def __getitem__(self, index: int) -> Tuple[Tuple, Tuple, Tuple]:
         return super().__getitem__(index)
@@ -498,27 +528,135 @@ def __init__(self, root: str, transforms: Optional[Callable] = None):
         imgs_right = sorted(glob(str(root / "training" / "final_right" / "*" / "*.png")))

         dps_masks_left = sorted(glob(str(root / "training" / "disparities" / "*" / "*.png")))
-        dsp_masks_right = list("" for _ in dps_masks_left)
+        disparity_maps_right = list("" for _ in dps_masks_left)

         self._images = list((l, r) for l, r in zip(imgs_left, imgs_right))
-        self._dsp_masks = list((l, r) for l, r in zip(dps_masks_left, dsp_masks_right))
+        self._disparities = list((l, r) for l, r in zip(dps_masks_left, disparity_maps_right))

     def _read_disparity(self, file_path: str) -> Tuple:
         if not os.path.exists(file_path):
             return None, None

         # disparity decoding as per Sintel instructions
-        dsp_mask = np.array(Image.open(file_path), dtype=np.float32)
-        r, g, b = np.split(dsp_mask, 3, axis=-1)
-        dsp_mask = r * 4 + g / (2**6) + b / (2**14)
+        disparity_map = np.array(Image.open(file_path), dtype=np.float32)
+        r, g, b = np.split(disparity_map, 3, axis=-1)
+        disparity_map = r * 4 + g / (2**6) + b / (2**14)

         # occlusion mask
-        occ_mask = np.array(Image.open(file_path.replace("disparities", "occlusions"))) > 0
+        valid_mask = np.array(Image.open(file_path.replace("disparities", "occlusions"))) == 0
         # out of frame mask
-        off_mask = np.array(Image.open(file_path.replace("disparities", "outofframe"))) > 0
+        off_mask = np.array(Image.open(file_path.replace("disparities", "outofframe"))) == 0
         # combine the masks together
-        occ_mask = np.logical_or(off_mask, occ_mask)
-        return dsp_mask, occ_mask
+        valid_mask = np.logical_and(off_mask, valid_mask)
+        return disparity_map, valid_mask

     def __getitem__(self, index: int) -> Tuple[Tuple, Tuple, Tuple]:
         return super().__getitem__(index)
+
+
+class SceneFlowDataset(StereoMatchingDataset):
+    """Dataset interface for `Scene Flow `_ datasets."""
+
+    def __init__(self, root: str, split: str = "train", pass_name: str = "clean", transforms: Optional[Callable] = None):
+        super().__init__(root, transforms)
+
+        verify_str_arg(split, "split", valid_values=("FlyingThings3D", "Driving", "Monkaa"))
"Monkaa")) + split = split.upper() + + verify_str_arg(split, "pass_name", valid_values=("clean", "final", "both")) + + passes = { + "clean": ["frames_cleanpass"], + "final": ["frames_finalpass"], + "both": ["frames_cleanpass, frames_finalpass"], + }[pass_name] + + root = Path(root) / split + + for p in passes: + imgs_left = sorted(glob(str(root / p / "left" / "*" / "*.png"))) + imgs_right = sorted(glob(str(root / p / "right" / "*" / "*.png"))) + imgs = list((l, r) for l, r in zip(imgs_left, imgs_right)) + self._images += imgs + + disparity_maps_left = [file_path.replace(p, "disparity").replace(".png", ".pfm") for file_path in imgs_left] + disparity_maps_right = [file_path.replace(p, "disparity").replace(".png", ".pfm") for file_path in imgs_right] + disparity_maps = list((l, r) for l, r in zip(disparity_maps_left, disparity_maps_right)) + self._disparities += disparity_maps + + def _read_disparity(self, file_path: str) -> Tuple: + disparity = read_pfm_file(file_path) + valid = np.ones_like(disparity) + return disparity, valid + + +class FallingThingsDataset(StereoMatchingDataset): + """FallingThings ``_ dataset + + The dataset is expected to have the following structre: :: + + root + FallingThings + single + scene1 + _object_settings.json + _camera_settings.json + image1.left.depth.png + image1.right.depth.png + image1.left.jpg + image1.right.jpg + image2.left.depth.png + image2.right.depth.png + image2.left.jpg + image2.right + ... + scene2 + ... + mixed + scene1 + _object_settings.json + _camera_settings.json + image1.left.depth.png + image1.right.depth.png + image1.left.jpg + image1.right.jpg + image2.left.depth.png + image2.right.depth.png + image2.left.jpg + image2.right + ... + scene2 + ... + """ + + def __init__(self, root: str, split: str = "single", transforms: Optional[Callable] = None): + super().__init__(root, transforms) + + verify_str_arg(split, "split", valid_values=("single", "mixed", "both")) + split = split.upper() + + splits = { + "single": ["single"], + "mixed": ["mixed"], + "both": ["single", "mixed"], + }[split] + + for s in splits: + imgs_left = sorted(glob(str(root / s / "*.left.jpg"))) + imgs_right = sorted(glob(str(root / s / "*.right.jpg"))) + imgs = list((l, r) for l, r in zip(imgs_left, imgs_right)) + self._images += imgs + + disparity_maps_left = sorted(glob(str(root / s / "*.left.depth.png"))) + disparity_maps_right = sorted(glob(str(root / s / "*.right.depth.png"))) + disparity_maps = list((l, r) for l, r in zip(disparity_maps_left, disparity_maps_right)) + self._disparities += disparity_maps + + def _read_disparity(self, file_path: str) -> Tuple: + depth = Image.Open(file_path) + with open(os.path.split(file_path)[0] + '_camera_settings.json', 'r') as f: + intrinsics = json.load(f) + fx = intrinsics['camera_settings'][0]['intrinsic_settings']['fx'] + disparity = (fx * 6.0 * 100) / depth.astype(np.float32) + valid = disparity > 0 + return disparity, valid From a5664754ee313dcfc269a3be8645ae15a0db11ba Mon Sep 17 00:00:00 2001 From: Ponku Date: Mon, 11 Jul 2022 23:34:27 +0100 Subject: [PATCH 04/35] "removed duplicate folder" --- vision | 1 - 1 file changed, 1 deletion(-) delete mode 160000 vision diff --git a/vision b/vision deleted file mode 160000 index bd19fb8ea9b..00000000000 --- a/vision +++ /dev/null @@ -1 +0,0 @@ -Subproject commit bd19fb8ea9b1f67df2a2a1ee116874609ad3ee8c From 8ea74f202735832dd8fd2b3122da195ab5bf1f69 Mon Sep 17 00:00:00 2001 From: Ponku Date: Wed, 13 Jul 2022 11:22:29 +0100 Subject: [PATCH 05/35] Added InStereo2k. 
Started working on dataset tests --- test/datasets_utils.py | 14 +- test/test_datasets.py | 552 ++++++++++++++++++++++- torchvision/datasets/__init__.py | 1 + torchvision/datasets/_stereo_matching.py | 191 ++++++-- 4 files changed, 686 insertions(+), 72 deletions(-) diff --git a/test/datasets_utils.py b/test/datasets_utils.py index 88eb4e17823..f051e325968 100644 --- a/test/datasets_utils.py +++ b/test/datasets_utils.py @@ -561,9 +561,9 @@ def test_feature_types(self, config): @test_all_configs def test_num_examples(self, config): with self.create_dataset(config) as (dataset, info): - assert len(dataset) == info["num_examples"] + assert len(dataset) == info["num_examples"], f"The number of examples {len(dataset)} does not match the expected {info['num_examples']}" - @test_all_configs + @ test_all_configs def test_transforms(self, config): mock = unittest.mock.Mock(wraps=lambda *args: args[0] if len(args) == 1 else args) for kwarg in self._TRANSFORM_KWARGS: @@ -587,7 +587,7 @@ class ImageDatasetTestCase(DatasetTestCase): FEATURE_TYPES = (PIL.Image.Image, int) - @contextlib.contextmanager + @ contextlib.contextmanager def create_dataset( self, config: Optional[Dict[str, Any]] = None, @@ -610,7 +610,7 @@ def create_dataset( with self._force_load_images(): yield dataset, info - @contextlib.contextmanager + @ contextlib.contextmanager def _force_load_images(self): open = PIL.Image.open @@ -649,7 +649,7 @@ def _set_default_frames_per_clip(self, inject_fake_data): args_without_default = argspec.args[1 : (-len(argspec.defaults) if argspec.defaults else None)] frames_per_clip_last = args_without_default[-1] == "frames_per_clip" - @functools.wraps(inject_fake_data) + @ functools.wraps(inject_fake_data) def wrapper(tmpdir, config): args = inject_fake_data(tmpdir, config) if frames_per_clip_last and len(args) == len(args_without_default) - 1: @@ -748,7 +748,7 @@ def size(idx: int) -> Tuple[int, int, int]: ] -@requires_lazy_imports("av") +@ requires_lazy_imports("av") def create_video_file( root: Union[pathlib.Path, str], name: Union[pathlib.Path, str], @@ -790,7 +790,7 @@ def create_video_file( return file -@requires_lazy_imports("av") +@ requires_lazy_imports("av") def create_video_folder( root: Union[str, pathlib.Path], name: Union[str, pathlib.Path], diff --git a/test/test_datasets.py b/test/test_datasets.py index a108479aee3..d390c30cee9 100644 --- a/test/test_datasets.py +++ b/test/test_datasets.py @@ -1,3 +1,4 @@ +from abc import abstractmethod import bz2 import contextlib import csv @@ -10,6 +11,7 @@ import random import shutil import string +from typing import List, Callable, Tuple import unittest import xml.etree.ElementTree as ET import zipfile @@ -23,30 +25,540 @@ from torchvision import datasets +class StereoETH3DTestCase(datasets_utils.ImageDatasetTestCase): + DATASET_CLASS = datasets.StereoETH3D + ADDITIONAL_CONFIGS = datasets_utils.combinations_grid(split=("train", "test")) + FEATURE_TYPES = (PIL.Image.Image, PIL.Image.Image, (np.ndarray, type(None)), (np.ndarray, type(None))) + + @staticmethod + def _create_scene_folder(num_examples: int, root_dir: str) -> List[pathlib.Path]: + # create the scene folder + image_paths = [] + # make the root_dir if it does not exits + os.makedirs(root_dir, exist_ok=True) + + for i in range(num_examples): + scene_dir = os.path.join(root_dir, f"scene_{i}") + os.makedirs(scene_dir, exist_ok=True) + # populate with left right images + image_paths.append(datasets_utils.create_image_file(root=scene_dir, name="im0.png", size=(100, 100))) + 
image_paths.append(datasets_utils.create_image_file(root=scene_dir, name="im1.png", size=(100, 100))) + return image_paths + + @staticmethod + def _create_annotation_folder(num_examples: int, root_dir: str) -> List[pathlib.Path]: + paths = [] + # make the root_dir if it does not exits + os.makedirs(root_dir, exist_ok=True) + + # create scene directories + for i in range(num_examples): + scene_dir = os.path.join(root_dir, f"scene_{i}") + os.makedirs(scene_dir, exist_ok=True) + # populate with a random png file for occlusion mask, and a pfm file for disparity + paths.append(datasets_utils.create_image_file(root=scene_dir, name="mask0nocc.png", size=(1, 100, 100))) + pfm_path = os.path.join(scene_dir, "disp0GT.pfm") + datasets_utils.make_fake_pfm_file(h=100, w=100, file_name=pfm_path) + paths.append(pfm_path) + return paths + + def inject_fake_data(self, tmpdir, config): + eth3d_dir = os.path.join(tmpdir, "ETH3D") + + num_examples = 2 if config["split"] == "train" else 3 + + split_name = "two_view_training" if config["split"] == "train" else "two_view_test" + split_dir = os.path.join(eth3d_dir, split_name) + self._create_scene_folder(num_examples, split_dir) + + if config["split"] == "train": + annot_dir = os.path.join(eth3d_dir, "two_view_training_gt") + self._create_annotation_folder(num_examples, annot_dir) + + return num_examples + + def test_training_test_splits(self): + with self.create_dataset(split="train") as (dataset, _): + assert dataset._images and len(dataset._images) == len(dataset._disparities), "Training images do not match with training disparities" + for _, _, disparity, valid_mask in dataset: + assert len(disparity.shape) == 3 + assert len(valid_mask.shape) == 2 + dh, dw, _ = disparity.shape + mh, mw = valid_mask.shape + assert dh == mh + assert dw == mw + + with self.create_dataset(split="test") as (dataset, _): + assert all(d == ("", "") for d in dataset._disparities) + for _, _, disparity, valid_mask in dataset: + assert disparity is None + assert valid_mask is None + + def test_bad_input(self): + with pytest.raises(ValueError, match="Unknown value 'bad' for argument split"): + with self.create_dataset(split="bad"): + pass + + +class CREStereoSynthethicTestCase(datasets_utils.ImageDatasetTestCase): + DATASET_CLASS = datasets.CREStereoSynthetic + ADDITIONAL_CONFIGS = datasets_utils.combinations_grid(split=("tree", "shapenet", "reflective", "hole")) + FEATURE_TYPES = (PIL.Image.Image, PIL.Image.Image, (np.ndarray, type(None)), (np.ndarray, type(None))) + + def inject_fake_data(self, tmpdir, config): + crestereo_dir = pathlib.Path(tmpdir) / "CREStereo" + os.makedirs(crestereo_dir, exist_ok=True) + + split_dir = crestereo_dir / config["split"] + os.makedirs(split_dir, exist_ok=True) + num_examples = 4 + + for idx in range(num_examples): + datasets_utils.create_image_file(root=split_dir, name=f"{idx}_left.jpg", size=(100, 100)) + datasets_utils.create_image_file(root=split_dir, name=f"{idx}_right.jpg", size=(100, 100)) + # these are going to end up being gray scale images + datasets_utils.create_image_file(root=split_dir, name=f"{idx}_left.disp.jpg", size=(1, 100, 100)) + datasets_utils.create_image_file(root=split_dir, name=f"{idx}_right.disp.jpg", size=(1, 100, 100)) + + return num_examples + + def test_splits(self): + for split in ("tree", "shapenet", "reflective", "hole"): + with self.create_dataset(split=split) as (dataset, _): + for left, right, disparity, valid_mask in dataset: + left_array = np.array(left) + right_array = np.array(right) + h, w, c = left_array.shape + 
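+                    # np.array on a PIL RGB image yields an (H, W, C) uint8 array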
# check that left and right are the same size + assert left_array.shape == right_array.shape + # check general shapes + assert c == 3 + assert len(disparity.shape) == 2 + assert len(valid_mask.shape) == 2 + assert disparity.shape == (h, w) + # check that valid mask is the same size as the disparity + dh, dw = disparity.shape + mh, mw = valid_mask.shape + assert dh == mh + assert dw == mw + + def test_bad_input(self): + with pytest.raises(ValueError, match="Unknown value 'bad' for argument split"): + with self.create_dataset(split="bad"): + pass + + +class StereoMiddlebury2014TestCase(datasets_utils.ImageDatasetTestCase): + DATASET_CLASS = datasets.StereoMiddlebury2014 + ADDITIONAL_CONFIGS = datasets_utils.combinations_grid(split=("train", "test", "additional"), use_ambient_views=(True, False)) + FEATURE_TYPES = (PIL.Image.Image, PIL.Image.Image, (np.ndarray, type(None)), (np.ndarray, type(None))) + + @staticmethod + def _make_scene_folder(root_dir: str, scene_name: str, split: str) -> List[str]: + calibrations = [""] if split == "test" else ["-perfect", "-imperfect"] + scene_dirs = [] + for c in calibrations: + scene_dir = os.path.join(root_dir, f"{scene_name}{c}") + os.makedirs(scene_dir, exist_ok=True) + # make normal images first + datasets_utils.create_image_file(root=scene_dir, name=f"im0.png", size=(3, 100, 100)) + datasets_utils.create_image_file(root=scene_dir, name=f"im1.png", size=(3, 100, 100)) + datasets_utils.create_image_file(root=scene_dir, name=f"im1E.png", size=(3, 100, 100)) + datasets_utils.create_image_file(root=scene_dir, name=f"im1L.png", size=(3, 100, 100)) + # these are going to end up being gray scale images + datasets_utils.make_fake_pfm_file(h=100, w=100, file_name=os.path.join(scene_dir, "disp0.pfm")) + datasets_utils.make_fake_pfm_file(h=100, w=100, file_name=os.path.join(scene_dir, "disp1.pfm")) + scene_dirs.append(scene_dir) + return scene_dirs + + def inject_fake_data(self, tmpdir, config): + split_scene_map = { + "train": ["Adirondack", "Jadeplant", "Motorcycle", "Piano"], + "additional": ["Backpack", "Bicycle1", "Cable", "Classroom1"], + "test": ["Plants", "Classroom2E", "Classroom2", "Australia"] + } + + middlebury_dir = pathlib.Path(tmpdir, "Middlebury2014") + os.makedirs(middlebury_dir, exist_ok=True) + + split_dir = middlebury_dir / config["split"] + os.makedirs(split_dir, exist_ok=True) + + num_examples = 4 + for idx in range(num_examples): + # special case for test_bad_input + if config["split"] not in split_scene_map: + return 0 + + scene_name = split_scene_map[config["split"]][idx] + self._make_scene_folder(root_dir=split_dir, scene_name=scene_name, split=config["split"]) + + # account for perfect / imperfect calibrations + if config["split"] != "test": + num_examples *= 2 + + return num_examples + + def test_train_splits(self): + for split in ["train", "additional"]: + with self.create_dataset(split=split) as (dataset, _): + for left, right, disparity, valid_mask in dataset: + left_array = np.array(left) + right_array = np.array(right) + h, w, c = left_array.shape + # check that left and right are the same size + assert left_array.shape == right_array.shape + # check general shapes + assert c == 3 + assert len(disparity.shape) == 3 + assert len(valid_mask.shape) == 3 + assert disparity.shape == (h, w, 3) + # check that valid mask is the same size as the disparity + dh, dw, c = disparity.shape + print(valid_mask.shape) + mh, mw, _ = valid_mask.shape + assert dh == mh + assert dw == mw + + def test_test_split(self): + for split in ["test"]: + with 
self.create_dataset(split=split) as (dataset, _): + for left, right, disparity, valid_mask in dataset: + left_array = np.array(left) + right_array = np.array(right) + h, w, c = left_array.shape + # check that left and right are the same size + assert left_array.shape == right_array.shape + # check general shapes + assert c == 3 + assert disparity is None + assert valid_mask is None + + def test_augmented_view_usage(self): + with self.create_dataset(split="train", use_ambient_views=True) as (dataset, _): + for left, right, _, _ in dataset: + left_array = np.array(left) + right_array = np.array(right) + # check that left and right are the same size + assert left_array.shape == right_array.shape + + def test_bad_input(self): + with pytest.raises(ValueError, match="Unknown value 'bad' for argument split"): + with self.create_dataset(split="bad"): + pass + + +class StereoKitti2012TestCase(datasets_utils.ImageDatasetTestCase): + DATASET_CLASS = datasets.StereoKitti2012 + ADDITIONAL_CONFIGS = datasets_utils.combinations_grid(split=("train", "test")) + FEATURE_TYPES = (PIL.Image.Image, PIL.Image.Image, (np.ndarray, type(None)), (np.ndarray, type(None))) + + def inject_fake_data(self, tmpdir, config): + kitti_dir = pathlib.Path(tmpdir) / "Kitti2012" + os.makedirs(kitti_dir, exist_ok=True) + + split_dir = kitti_dir / (config["split"] + "ing") + os.makedirs(split_dir, exist_ok=True) + + num_examples = 4 + + datasets_utils.create_image_folder( + root=split_dir, + name="colored_0", + file_name_fn=lambda i: f"{i:06d}_10.png", + num_examples=num_examples, + size=(3, 100, 200), + ) + datasets_utils.create_image_folder( + root=split_dir, + name="colored_1", + file_name_fn=lambda i: f"{i:06d}_10.png", + num_examples=num_examples, + size=(3, 100, 200), + ) + + if config["split"] == "train": + datasets_utils.create_image_folder( + root=split_dir, + name="disp_noc", + file_name_fn=lambda i: f"{i:06d}.png", + num_examples=num_examples, + # Kitti2012 uses a single channel image for disparities + size=(1, 100, 200), + ) + + return num_examples + + def test_train_splits(self): + for split in ["train"]: + with self.create_dataset(split=split) as (dataset, _): + for left, right, disparity, valid_mask in dataset: + left_array = np.array(left) + right_array = np.array(right) + h, w, c = left_array.shape + # check that left and right are the same size + assert left_array.shape == right_array.shape + # check general shapes + assert c == 3 + assert len(disparity.shape) == 2 + assert len(valid_mask.shape) == 2 + assert disparity.shape == (h, w) + # check that valid mask is the same size as the disparity + dh, dw = disparity.shape + mh, mw = valid_mask.shape + assert dh == mh + assert dw == mw + + def test_test_split(self): + for split in ["test"]: + with self.create_dataset(split=split) as (dataset, _): + for left, right, disparity, valid_mask in dataset: + left_array = np.array(left) + right_array = np.array(right) + h, w, c = left_array.shape + # check that left and right are the same size + assert left_array.shape == right_array.shape + # check general shapes + assert c == 3 + assert disparity is None + assert valid_mask is None + + def test_bad_input(self): + with pytest.raises(ValueError, match="Unknown value 'bad' for argument split"): + with self.create_dataset(split="bad"): + pass + + +class StereoKitti2015TestCase(datasets_utils.ImageDatasetTestCase): + DATASET_CLASS = datasets.StereoKitti2015 + ADDITIONAL_CONFIGS = datasets_utils.combinations_grid(split=("train", "test")) + FEATURE_TYPES = (PIL.Image.Image, 
PIL.Image.Image, (np.ndarray, type(None)), (np.ndarray, type(None))) + + def inject_fake_data(self, tmpdir, config): + kitti_dir = pathlib.Path(tmpdir) / "Kitti2015" + os.makedirs(kitti_dir, exist_ok=True) + + split_dir = kitti_dir / (config["split"] + "ing") + os.makedirs(split_dir, exist_ok=True) + + num_examples = 4 + + datasets_utils.create_image_folder( + root=split_dir, + name="image_2", + file_name_fn=lambda i: f"{i:06d}_10.png", + num_examples=num_examples, + size=(3, 100, 200), + ) + datasets_utils.create_image_folder( + root=split_dir, + name="image_3", + file_name_fn=lambda i: f"{i:06d}_10.png", + num_examples=num_examples, + size=(3, 100, 200), + ) + + if config["split"] == "train": + datasets_utils.create_image_folder( + root=split_dir, + name="disp_occ_0", + file_name_fn=lambda i: f"{i:06d}.png", + num_examples=num_examples, + # Kitti2015 uses a single channel image for disparities + size=(1, 100, 200), + ) + + datasets_utils.create_image_folder( + root=split_dir, + name="disp_occ_1", + file_name_fn=lambda i: f"{i:06d}.png", + num_examples=num_examples, + # Kitti2015 uses a single channel image for disparities + size=(1, 100, 200), + ) + + return num_examples + + def test_train_splits(self): + for split in ["train"]: + with self.create_dataset(split=split) as (dataset, _): + for left, right, disparity, valid_mask in dataset: + left_array = np.array(left) + right_array = np.array(right) + h, w, c = left_array.shape + # check that left and right are the same size + assert left_array.shape == right_array.shape + # check general shapes + assert c == 3 + assert len(disparity.shape) == 2 + assert len(valid_mask.shape) == 2 + assert disparity.shape == (h, w) + # check that valid mask is the same size as the disparity + dh, dw = disparity.shape + mh, mw = valid_mask.shape + assert dh == mh + assert dw == mw + + def test_test_split(self): + for split in ["test"]: + with self.create_dataset(split=split) as (dataset, _): + for left, right, disparity, valid_mask in dataset: + left_array = np.array(left) + right_array = np.array(right) + h, w, c = left_array.shape + # check that left and right are the same size + assert left_array.shape == right_array.shape + # check general shapes + assert c == 3 + assert disparity is None + assert valid_mask is None + + def test_bad_input(self): + with pytest.raises(ValueError, match="Unknown value 'bad' for argument split"): + with self.create_dataset(split="bad"): + pass + + +class StereoSceneFlowTestCase(datasets_utils.ImageDatasetTestCase): + DATASET_CLASS = datasets.StereoSceneFlow + ADDITIONAL_CONFIGS = datasets_utils.combinations_grid( + split=("FlyingThings3D", "Driving", "Monkaa"), + pass_name=("clean", "final") + ) + FEATURE_TYPES = (PIL.Image.Image, PIL.Image.Image, (np.ndarray, type(None)), (np.ndarray, type(None))) + + @staticmethod + def _create_pfm_folder(root: str, name: str, file_name_fn: Callable[..., str], num_examples: int, size: Tuple[int, int]): + root = pathlib.Path(root) / name + os.makedirs(root, exist_ok=True) + + for i in range(num_examples): + datasets_utils.make_fake_pfm_file(size[0], size[1], root / file_name_fn(i)) + + def inject_fake_data(self, tmpdir, config): + scene_flow_dir = pathlib.Path(tmpdir) / "SceneFlow" + os.makedirs(scene_flow_dir, exist_ok=True) + + split_dir = scene_flow_dir / config["split"] + os.makedirs(split_dir, exist_ok=True) + + pass_dir_map = { + "clean": "frames_cleanpass", + "final": "frames_finalpass", + } + + num_examples = 4 + pass_dir_name = pass_dir_map[config["pass_name"]] + # create pass 
directories + pass_dir = split_dir / pass_dir_name + disp_dir = split_dir / "disp" + os.makedirs(pass_dir, exist_ok=True) + os.makedirs(disp_dir, exist_ok=True) + + # root / pass / direction / scene / .imgs + # root / disparity / direction / scene / .imgs + for direction in ["left", "right"]: + for scene_idx in range(num_examples): + # scene_dir = pass_dir / direction / f"scene_{scene_idx:06d}" + os.makedirs(pass_dir / f"scene_{scene_idx:06d}", exist_ok=True) + datasets_utils.create_image_folder( + root=pass_dir / f"scene_{scene_idx:06d}", + name=direction, + file_name_fn=lambda i: f"{i:06d}.png", + num_examples=3, + size=(3, 100, 200), + ) + os.makedirs(disp_dir / f"scene_{scene_idx:06d}", exist_ok=True) + self._create_pfm_folder( + root=disp_dir / f"scene_{scene_idx:06d}", + name=direction, + file_name_fn=lambda i: f"{i:06d}.pfm", + num_examples=3, + size=(100, 200), + ) + + return num_examples * 3 + + def test_train_splits(self): + for split, pass_name in itertools.product(["FlyingThings3D", "Driving", "Monkaa"], ["clean", "final"]): + with self.create_dataset(split=split, pass_name=pass_name) as (dataset, _): + for left, right, disparity, valid_mask in dataset: + left_array = np.array(left) + right_array = np.array(right) + h, w, c = left_array.shape + # check that left and right are the same size + assert left_array.shape == right_array.shape + # check general shapes + assert c == 3 + assert len(disparity.shape) == 3 + assert len(valid_mask.shape) == 2 + assert disparity.shape == (h, w, 3) + # check that valid mask is the same size as the disparity + dh, dw, _ = disparity.shape + mh, mw, _ = valid_mask.shape + assert dh == mh + assert dw == mw + + +class StereoFallingThingsTestCase(datasets_utils.ImageDatasetTestCase): + DATASET_CLASS = datasets.StereoFallingThings + ADDITIONAL_CONFIGS = datasets_utils.combinations_grid(split=("single", "mixed")) + FEATURE_TYPES = (PIL.Image.Image, PIL.Image.Image, (np.ndarray, type(None)), (np.ndarray, type(None))) + + @staticmethod + def _make_scene_folder(root: str, scene_name: str, num_examples: int, size: Tuple[int, int]): + root = pathlib.Path(root) / scene_name + os.makedirs(root, exist_ok=True) + + datasets_utils.create_image_file(root, "image1.left.jpg", size=(3, size[0], size[1])) + datasets_utils.create_image_file(root, "image1.right.jpg", size=(3, size[0], size[1])) + # single channel depth maps + datasets_utils.create_image_file(root, "image1.left.depth.jpg", size=(1, size[0], size[1])) + datasets_utils.create_image_file(root, "image1.right.depth.jpg", size=(1, size[0], size[1])) + + def inject_fake_data(self, tmpdir, config): + fallingthings_dir = pathlib.Path(tmpdir) / "FallingThings" + + split_dir = pathlib.Path(fallingthings_dir) / config["split"] + os.makedirs(split_dir, exist_ok=True) + + num_examples = 4 + + for i in range(num_examples): + self._make_scene_folder( + root=split_dir, + scene_name=f"scene_{i:06d}", + num_examples=num_examples, + size=(100, 200), + ) + + return num_examples + + class STL10TestCase(datasets_utils.ImageDatasetTestCase): DATASET_CLASS = datasets.STL10 ADDITIONAL_CONFIGS = datasets_utils.combinations_grid(split=("train", "test", "unlabeled", "train+unlabeled")) - @staticmethod + @ staticmethod def _make_binary_file(num_elements, root, name): file_name = os.path.join(root, name) np.zeros(num_elements, dtype=np.uint8).tofile(file_name) - @staticmethod + @ staticmethod def _make_image_file(num_images, root, name, num_channels=3, height=96, width=96): STL10TestCase._make_binary_file(num_images * num_channels * 
height * width, root, name) - @staticmethod + @ staticmethod def _make_label_file(num_images, root, name): STL10TestCase._make_binary_file(num_images, root, name) - @staticmethod + @ staticmethod def _make_class_names_file(root, name="class_names.txt"): with open(os.path.join(root, name), "w") as fh: for cname in ("airplane", "bird"): fh.write(f"{cname}\n") - @staticmethod + @ staticmethod def _make_fold_indices_file(root): num_folds = 10 offset = 0 @@ -58,7 +570,7 @@ def _make_fold_indices_file(root): return tuple(range(1, num_folds + 1)) - @staticmethod + @ staticmethod def _make_train_files(root, num_unlabeled_images=1): num_images_in_fold = STL10TestCase._make_fold_indices_file(root) num_train_images = sum(num_images_in_fold) @@ -69,7 +581,7 @@ def _make_train_files(root, num_unlabeled_images=1): return dict(train=num_train_images, unlabeled=num_unlabeled_images) - @staticmethod + @ staticmethod def _make_test_files(root, num_images=2): STL10TestCase._make_image_file(num_images, root, "test_X.bin") STL10TestCase._make_label_file(num_images, root, "test_y.bin") @@ -887,7 +1399,7 @@ def inject_fake_data(self, tmpdir, config): return num_images - @contextlib.contextmanager + @ contextlib.contextmanager def create_dataset(self, *args, **kwargs): with super().create_dataset(*args, **kwargs) as output: yield output @@ -1293,7 +1805,7 @@ def _create_archive(self, root, name, *files): return archive - @datasets_utils.test_all_configs + @ datasets_utils.test_all_configs def test_feature_types(self, config): feature_types = self.FEATURE_TYPES self.FEATURE_TYPES = self._TRAIN_FEATURE_TYPES if config["train"] else self._TEST_FEATURE_TYPES @@ -1571,7 +2083,7 @@ def _file_name_fn(self, cls, ext, idx): def _is_valid_file_to_extensions(self, is_valid_file): return {ext for ext in self._EXTENSIONS if is_valid_file(f"foo.{ext}")} - @datasets_utils.test_all_configs + @ datasets_utils.test_all_configs def test_is_valid_file(self, config): extensions = config.pop("extensions") # We need to explicitly pass extensions=None here or otherwise it would be filled by the value from the @@ -1581,7 +2093,7 @@ def test_is_valid_file(self, config): ) as (dataset, info): assert len(dataset) == info["num_examples"] - @datasets_utils.test_all_configs + @ datasets_utils.test_all_configs def test_classes(self, config): with self.create_dataset(config) as (dataset, info): assert len(dataset.classes) == len(info["classes"]) @@ -1602,7 +2114,7 @@ def inject_fake_data(self, tmpdir, config): return dict(num_examples=num_examples_total, classes=classes) - @datasets_utils.test_all_configs + @ datasets_utils.test_all_configs def test_classes(self, config): with self.create_dataset(config) as (dataset, info): assert len(dataset.classes) == len(info["classes"]) @@ -1701,32 +2213,32 @@ class Places365TestCase(datasets_utils.ImageDatasetTestCase): *((f"{category}/Places365_train_00000001.png", idx) for category, idx in _CATEGORIES_CONTENT), ) - @staticmethod + @ staticmethod def _make_txt(root, name, seq): file = os.path.join(root, name) with open(file, "w") as fh: for text, idx in seq: fh.write(f"{text} {idx}\n") - @staticmethod + @ staticmethod def _make_categories_txt(root, name): Places365TestCase._make_txt(root, name, Places365TestCase._CATEGORIES_CONTENT) - @staticmethod + @ staticmethod def _make_file_list_txt(root, name): Places365TestCase._make_txt(root, name, Places365TestCase._FILE_LIST_CONTENT) - @staticmethod + @ staticmethod def _make_image(file_name, size): os.makedirs(os.path.dirname(file_name), exist_ok=True) 
PIL.Image.fromarray(np.zeros((*size, 3), dtype=np.uint8)).save(file_name)

-    @staticmethod
+    @staticmethod
     def _make_devkit_archive(root, split):
         Places365TestCase._make_categories_txt(root, Places365TestCase._CATEGORIES)
         Places365TestCase._make_file_list_txt(root, Places365TestCase._FILE_LISTS[split])

-    @staticmethod
+    @staticmethod
     def _make_images_archive(root, split, small):
         folder_name = Places365TestCase._IMAGES[(split, small)]
         image_size = (256, 256) if small else (512, random.randint(512, 1024))
@@ -2041,7 +2553,7 @@ def inject_fake_data(self, tmpdir, config):

         return num_examples[config["split"]]

-    @datasets_utils.test_all_configs
+    @datasets_utils.test_all_configs
     def test_flow(self, config):
         # Make sure flow always exists, and make sure there are as many flow values as (pairs of) images
         # Also make sure the flow is properly decoded
@@ -2100,7 +2612,7 @@ def inject_fake_data(self, tmpdir, config):
         )
         return num_examples

-    @datasets_utils.test_all_configs
+    @datasets_utils.test_all_configs
     def test_flow(self, config):
         h, w = self.FLOW_H, self.FLOW_W
         expected_flow = np.arange(3 * h * w).reshape(h, w, 3).transpose(2, 0, 1)
diff --git a/torchvision/datasets/__init__.py b/torchvision/datasets/__init__.py
index 295fe922478..a7dd8397bab 100644
--- a/torchvision/datasets/__init__.py
+++ b/torchvision/datasets/__init__.py
@@ -1,4 +1,5 @@
 from ._optical_flow import KittiFlow, Sintel, FlyingChairs, FlyingThings3D, HD1K
+from ._stereo_matching import StereoETH3D, StereoFallingThings, StereoKitti2012, StereoKitti2015, StereoMiddlebury2014, StereoSceneFlow, StereoSintel, CREStereoSynthetic
 from .caltech import Caltech101, Caltech256
 from .celeba import CelebA
 from .cifar import CIFAR10, CIFAR100
diff --git a/torchvision/datasets/_stereo_matching.py b/torchvision/datasets/_stereo_matching.py
index 65336503b87..bcca2b12efb 100644
--- a/torchvision/datasets/_stereo_matching.py
+++ b/torchvision/datasets/_stereo_matching.py
@@ -1,10 +1,12 @@
 from abc import ABC, abstractmethod
 from glob import glob
 from pathlib import Path
-from random import random
+import pathlib
+import random
 import re
 import shutil
 from typing import Callable, List, Optional, Tuple, Any
+from jsonschema import ValidationError
 from torch import Tensor
 from .vision import VisionDataset
 from .utils import download_and_extract_archive, download_url, verify_str_arg
@@ -14,15 +16,15 @@
 import json

 __all__ = (
-    "CREStereo"  # waiting for download
-    "Middlebury2014"
-    "ETH3D"
-    "Kitti2012"
-    "Kitti2015"
-    "Sintel"
-    "SceneFlow"  # need to find valid mask procedure
-    "FallingThings"
-    "InStereo2k"  # waiting for download
+    "CREStereo",  # waiting for download / need to find valid mask procedure
+    "StereoMiddlebury2014",
+    "StereoETH3D",
+    "StereoKitti2012",
+    "StereoKitti2015",
+    "StereoSintel",
+    "StereoSceneFlow",  # need to find valid mask procedure
+    "StereoFallingThings",
+    "InStereo2k",  # need to find valid mask procedure
 )

@@ -30,13 +32,15 @@ def read_pfm_file(file_path: str) -> np.array:
     # adapted from https://github.com/ucbdrive/hd3/blob/master/utils/pfm.py
     with open(file_path, "rb") as file:
         header = file.readline().rstrip()
-        assert header in [b"PF", b"Pf"], f"{file_path} is not a valid .pfm file"
+        if header not in [b"PF", b"Pf"]:
+            raise ValidationError(f"Not a valid PFM file: {file_path}")

-        dim_match = re.match(r'^(\d+)\s(\d+)\s$', file.readline())
-        assert dim_match, f"{file_path} has a Malformed PFM header"
+        dim_match = re.match(rb'^(\d+)\s(\d+)\s$', file.readline())
+        if not dim_match:
+            raise ValidationError(f"Malformed PFM
header: {file_path}") width, height = map(int, dim_match.groups()) - channels = 3 if header == "PF" else 1 + channels = 3 if header == b"PF" else 1 scale = float(file.readline().rstrip()) # check for endian type if scale < 0: @@ -77,12 +81,12 @@ def __getitem__(self, index: int) -> Tuple[Tensor, Tensor, Tensor, Tensor]: img_left = self._read_img(self._images[index][0]) img_right = self._read_img(self._images[index][1]) - dsp_map_left, valid_mask_right = self._read_disparity(self._disparities[index][0]) + dsp_map_left, valid_mask_left = self._read_disparity(self._disparities[index][0]) dsp_map_right, valid_mask_right = self._read_disparity(self._disparities[index][1]) imgs = (img_left, img_right) dsp_maps = (dsp_map_left, dsp_map_right) - valid_masks = (valid_mask_right, valid_mask_right) + valid_masks = (valid_mask_left, valid_mask_right) if self.transforms is not None: imgs, dsp_maps, valid_masks, = self.transforms(imgs, dsp_maps, valid_masks) @@ -93,7 +97,7 @@ def __len__(self) -> int: return len(self._images) -class CRESSyntethicStereo(StereoMatchingDataset): +class CREStereoSynthetic(StereoMatchingDataset): """Synthetic dataset used in training the `CREStereo `_ architecture. Ported from the download script in the paper github `repo `_. @@ -104,8 +108,11 @@ class CRESSyntethicStereo(StereoMatchingDataset): MAX_DISP = 256. - def __init__(self, root: str, split: str = "tree", transforms: Optional[Callable] = None, download: bool = True): + def __init__(self, root: str, split: str = "tree", transforms: Optional[Callable] = None, download: bool = False): super().__init__(root, transforms) + + root = Path(root) / "CREStereo" + # if the API user requests a dataset download check that the user can download it if download: statvfs = os.statvfs(root) @@ -130,12 +137,17 @@ def __init__(self, root: str, split: str = "tree", transforms: Optional[Callable for s in splits: imgs_left = sorted(glob(str(root / s / "*_left.jpg"))) - imgs_right = (p.replace("_left", "_right") for p in imgs_left) + imgs_right = list(p.replace("_left", "_right") for p in imgs_left) + + if not len(imgs_left) or not len(imgs_right): + raise FileNotFoundError("No images found in {}".format(root)) + imgs = list((l, r) for l, r in zip(imgs_left, imgs_right)) self._images += imgs - disparity_maps_left = (p.replace("_left", "_left.disp") for p in imgs_left) - disparity_maps_right = (p.replace("_right", "_right.disp") for p in imgs_right) + disparity_maps_left = list(p.replace("_left", "_left.disp") for p in imgs_left) + disparity_maps_right = list(p.replace("_right", "_right.disp") for p in imgs_right) + disparity_maps = list((l, r) for l, r in zip(disparity_maps_left, disparity_maps_right)) self._disparities += disparity_maps @@ -158,7 +170,7 @@ def _download_dataset(self, root: str) -> None: download_and_extract_archive(url=url, download_root=d_path, remove_finished=True) -class Middlebury2014(StereoMatchingDataset): +class StereoMiddlebury2014(StereoMatchingDataset): """Publicly available scenes from the Middlebury dataset `2014 version `. The dataset mostly follows the original format, without containing the ambient subdirectories. 
: ::
@@ -219,12 +231,11 @@ class Middlebury2014(StereoMatchingDataset):
     splits = {
         "train": ["Adirondack", "Jadeplant", "Motorcycle", "Piano", "Pipes", "Playroom", "Playtable", "Recycle", "Shelves", "Vintage"],
         "additional": ["Backpack", "Bicycle1", "Cable", "Classroom1", "Couch", "Flowers", "Mask", "Shopvac", "Sticks", "Storage", "Sword1", "Sword2", "Umbrella"],
-        "test": ['Plants', 'Classroom2E', 'Classroom2', 'Australia', 'DjembeL', 'CrusadeP', 'Crusade', 'Hoops', 'Bicycle2', 'Staircase', 'Newkuba', 'AustraliaP', 'Djembe', 'Livingroom', 'Computer']
+        "test": ["Plants", "Classroom2E", "Classroom2", "Australia", "DjembeL", "CrusadeP", "Crusade", "Hoops", "Bicycle2", "Staircase", "Newkuba", "AustraliaP", "Djembe", "Livingroom", "Computer"]
     }

     def __init__(
         self,
-        *,
         root: str,
         split: str = "train",
         use_ambient_views: bool = False,
         transforms: Optional[Callable] = None,
         download: bool = False
@@ -237,7 +248,7 @@ def __init__(
         if download:
             self._download_dataset(root)

-        root = Path(root) / "FlyingChairs"
+        root = Path(root) / "Middlebury2014"
         if not os.path.exists(root / split):
             raise FileNotFoundError(
                 f"The {split} directory was not found in the provided root directory"
@@ -245,11 +256,19 @@ def __init__(
         split_scenes = self.splits[split]
         # check that the provided root folder contains the scene splits
-        if not all(s in os.listdir(root / split) for s in split_scenes):
+        if not any(
+            # using startswith to account for perfect / imperfect calibration
+            scene.startswith(s) for scene in os.listdir(root / split)
+            for s in split_scenes
+        ):
             raise FileNotFoundError(f"Provided root folder does not contain any scenes from the {split} split.")

         imgs_left = sorted(glob(str(root / split / "*" / "im0.png")))
         imgs_right = sorted(glob(str(root / split / "*" / "im1.png")))
+
+        if not len(imgs_left) or not len(imgs_right):
+            raise FileNotFoundError("No images found in {}".format(root))
+
         self._images = list((l, r) for l, r in zip(imgs_left, imgs_right))

         if split == "test":
             dsp_maps_left, dsp_maps_right = list("" for _ in imgs_left), list("" for _ in imgs_right)
         else:
             dsp_maps_left = sorted(glob(str(root / split / "*" / "disp0.pfm")))
             dsp_maps_right = sorted(glob(str(root / split / "*" / "disp1.pfm")))
         self._disparities = list((l, r) for l, r in zip(dsp_maps_left, dsp_maps_right))

         self.use_ambient_views = use_ambient_views

     def __getitem__(self, index: int) -> Tuple:
         return super().__getitem__(index)

     def _read_img(self, file_path: str) -> Image.Image:
@@ -312,7 +331,7 @@ def _download_dataset(self, root: str):
         shutil.rmtree(os.path.join(root, "MiddEval3"))
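The `use_ambient_views` flag threaded through the Middlebury2014 changes above is only stored on the instance at this point in the series. A minimal sketch of how the ambient-view swap could behave, assuming the `im1.png` / `im1E.png` / `im1L.png` file names used by the test fixtures later in this series; the helper itself is hypothetical, not part of the patch:

import random
from pathlib import Path
from PIL import Image

def read_img_with_ambient(file_path: str, use_ambient_views: bool) -> Image.Image:
    path = Path(file_path)
    if use_ambient_views and path.name == "im1.png":
        # im1E.png / im1L.png are the alternate exposure / lighting captures of the
        # right view; pick one of the three candidates with equal probability.
        path = path.parent / random.choice(["im1.png", "im1E.png", "im1L.png"])
    img = Image.open(path)
    return img.convert("RGB") if img.mode != "RGB" else img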
-class ETH3D(StereoMatchingDataset):
+class StereoETH3D(StereoMatchingDataset):
     """ETH3D `Low-Res Two-View `_ dataset.

     The dataset is expected to have the following structure: ::
@@ -370,16 +389,20 @@ def __init__(self, root: str, split: str = "train", transforms: Optional[Callabl
         verify_str_arg(split, "split", valid_values=("train", "test"))

         root = Path(root) / "ETH3D"
-        img_dir = "two_view_training" if split == "train" else "two_view_testing"
+
+        img_dir = "two_view_training" if split == "train" else "two_view_test"
         anot_dir = "two_view_training_gt"

         imgs_left = sorted(glob(str(root / img_dir / "*" / "*im0.png")))
         imgs_right = sorted(glob(str(root / img_dir / "*" / "*im1.png")))

+        if not len(imgs_left) or not len(imgs_right):
+            raise FileNotFoundError("No images found in {}".format(root))
+
         if split == "test":
             disparity_maps_left, disparity_maps_right = list("" for _ in imgs_left), list("" for _ in imgs_right)
         else:
-            disparity_maps_left = sorted(glob(str(root / anot_dir / "*" / "*[0-1].pfm")))
+            disparity_maps_left = sorted(glob(str(root / anot_dir / "*" / "*0GT.pfm")))
             # no masks for the right view, always using left as reference
             disparity_maps_right = list("" for _ in disparity_maps_left)
@@ -395,11 +418,11 @@ def _read_disparity(self, file_path: str) -> Tuple:
         valid_mask = np.array(valid_mask)
         return disparity_map, valid_mask

-    def __getitem__(self, index: int) -> Tuple[Tuple, Tuple, Tuple]:
+    def __getitem__(self, index: int) -> Tuple[Tuple, Tuple, Tuple, Tuple]:
         return super().__getitem__(index)
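For reference, the ETH3D annotation layout globbed above pairs a `disp0GT.pfm` disparity map with a `mask0nocc.png` occlusion mask per scene (the same file names the test fixtures in this series create). A hedged sketch of the read path, reusing `read_pfm_file` from this module; treating non-zero mask pixels as valid is an assumption, not something the patch states:

import numpy as np
from pathlib import Path
from PIL import Image
from torchvision.datasets._stereo_matching import read_pfm_file

def read_eth3d_disparity(pfm_path: str):
    # left-view disparity stored as a single-channel PFM
    disparity_map = read_pfm_file(pfm_path)
    # mask0nocc.png marks pixels with non-occluded ground truth (assumed semantics)
    mask_path = Path(pfm_path).parent / "mask0nocc.png"
    valid_mask = np.array(Image.open(mask_path)) > 0
    return disparity_map, valid_mask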
-class Kitti2012(StereoMatchingDataset):
+class StereoKitti2012(StereoMatchingDataset):
     """Kitti dataset from the `2012 `_ stereo evaluation benchmark.
     Uses the RGB images for consistency with Kitti 2015.

     The dataset is expected to have the following structure: ::
@@ -433,11 +456,14 @@ def __init__(self, root: str, split: str = "train", transforms: Optional[Callabl
         imgs_left = sorted(glob(str(root / "colored_0" / "*_10.png")))
         imgs_right = sorted(glob(str(root / "colored_1" / "*_10.png")))

+        if not len(imgs_left) or not len(imgs_right):
+            raise FileNotFoundError("No images found in {}".format(root))
+
         if split == "train":
             disparity_maps_left = sorted(glob(str(root / "disp_noc" / "*.png")))
             disparity_maps_right = list("" for _ in disparity_maps_left)
         else:
-            disparity_maps_left, disparity_maps_right = list("" for _ in disparity_maps_left), list("" for _ in disparity_maps_right)
+            disparity_maps_left, disparity_maps_right = list("" for _ in imgs_left), list("" for _ in imgs_right)

         self._images = list((l, r) for l, r in zip(imgs_left, imgs_right))
         self._disparities = list((l, r) for l, r in zip(disparity_maps_left, disparity_maps_right))
@@ -455,7 +481,7 @@ def __getitem__(self, index: int) -> Tuple[Tuple, Tuple, Tuple]:
         return super().__getitem__(index)

-class Kitti2015(StereoMatchingDataset):
+class StereoKitti2015(StereoMatchingDataset):
     """Kitti dataset from the `2015 `_ stereo evaluation benchmark.

     The dataset is expected to have the following structure: ::
@@ -468,8 +494,8 @@ class Kitti2015(StereoMatchingDataset):
         training
             image_2
             image_3
-            disp_noc_0
-            disp_noc_1
+            disp_occ_0
+            disp_occ_1
             calib

     Args:
@@ -488,11 +514,14 @@ def __init__(self, root: str, split: str = "train", transforms: Optional[Callabl
         imgs_left = sorted(glob(str(root / "image_2" / "*_10.png")))
         imgs_right = sorted(glob(str(root / "image_3" / "*_10.png")))

+        if not len(imgs_left) or not len(imgs_right):
+            raise FileNotFoundError("No images found in {}".format(root))
+
         if split == "train":
             disparity_maps_left = sorted(glob(str(root / "disp_occ_0" / "*.png")))
             disparity_maps_right = sorted(glob(str(root / "disp_occ_1" / "*.png")))
         else:
-            disparity_maps_left, disparity_maps_right = list("" for _ in disparity_maps_left), list("" for _ in disparity_maps_right)
+            disparity_maps_left, disparity_maps_right = list("" for _ in imgs_left), list("" for _ in imgs_right)

         self._images = list((l, r) for l, r in zip(imgs_left, imgs_right))
         self._disparities = list((l, r) for l, r in zip(disparity_maps_left, disparity_maps_right))
@@ -510,7 +539,7 @@ def __getitem__(self, index: int) -> Tuple[Tuple, Tuple, Tuple]:
         return super().__getitem__(index)

-class SintelDataset(StereoMatchingDataset):
+class StereoSintel(StereoMatchingDataset):
     """Sintel `Stereo Dataset `_.

     Args:
@@ -527,6 +556,9 @@ def __init__(self, root: str, transforms: Optional[Callable] = None):
         imgs_left = sorted(glob(str(root / "training" / "final_left" / "*" / "*.png")))
         imgs_right = sorted(glob(str(root / "training" / "final_right" / "*" / "*.png")))

+        if not len(imgs_left) or not len(imgs_right):
+            raise FileNotFoundError("No images found in {}".format(root))
+
         dps_masks_left = sorted(glob(str(root / "training" / "disparities" / "*" / "*.png")))
         disparity_maps_right = list("" for _ in dps_masks_left)
@@ -554,16 +586,16 @@ def __getitem__(self, index: int) -> Tuple[Tuple, Tuple, Tuple]:
         return super().__getitem__(index)

-class SceneFlowDataset(StereoMatchingDataset):
+class StereoSceneFlow(StereoMatchingDataset):
     """Dataset interface for `Scene Flow `_ datasets."""

     def __init__(self, root: str, split: str = "train", pass_name: str = "clean", transforms: Optional[Callable] = None):
         super().__init__(root, transforms)

-        verify_str_arg(split, "split", valid_values=("FlyingThings3D", "Driving", "Monkaa"))
-        split = split.upper()
+        root = Path(root) / "SceneFlow"

-        verify_str_arg(split, "pass_name", valid_values=("clean", "final", "both"))
+        verify_str_arg(split, "split", valid_values=("FlyingThings3D", "Driving", "Monkaa"))
+        verify_str_arg(pass_name, "pass_name", valid_values=("clean", "final", "both"))

         passes = {
             "clean": ["frames_cleanpass"],
             "final": ["frames_finalpass"],
             "both": ["frames_cleanpass", "frames_finalpass"],
         }[pass_name]

-        root = Path(root) / split
+        root = root / split

         for p in passes:
-            imgs_left = sorted(glob(str(root / p / "left" / "*" / "*.png")))
-            imgs_right = sorted(glob(str(root / p / "right" / "*" / "*.png")))
+            imgs_left = sorted(glob(str(root / p / "*" / "left" / "*.png")))
+            imgs_right = sorted(glob(str(root / p / "*" / "right" / "*.png")))
+
+            if not len(imgs_left) or not len(imgs_right):
+                raise FileNotFoundError("No images found in {}".format(root / p))
+
             imgs = list((l, r) for l, r in zip(imgs_left, imgs_right))
             self._images += imgs

             disparity_maps_left = [file_path.replace(p, "disparity").replace(".png", ".pfm") for file_path in imgs_left]
             disparity_maps_right =
[file_path.replace(p, "disparity").replace(".png", ".pfm") for file_path in imgs_right]
+
             disparity_maps = list((l, r) for l, r in zip(disparity_maps_left, disparity_maps_right))
             self._disparities += disparity_maps
@@ -589,8 +626,11 @@ def _read_disparity(self, file_path: str) -> Tuple:
         valid = np.ones_like(disparity)
         return disparity, valid

+    def __getitem__(self, index: int) -> Tuple[Tensor, Tensor, Tensor, Tensor]:
+        return super().__getitem__(index)

-class FallingThingsDataset(StereoMatchingDataset):
+
+class StereoFallingThings(StereoMatchingDataset):
     """FallingThings ``_ dataset

     The dataset is expected to have the following structure: ::
@@ -644,11 +684,16 @@ def __init__(self, root: str, split: str = "single", transforms: Optional[Callab
         for s in splits:
             imgs_left = sorted(glob(str(root / s / "*.left.jpg")))
             imgs_right = sorted(glob(str(root / s / "*.right.jpg")))
+
+            if not len(imgs_left) or not len(imgs_right):
+                raise FileNotFoundError("No images found in {}".format(root))
+
             imgs = list((l, r) for l, r in zip(imgs_left, imgs_right))
             self._images += imgs

             disparity_maps_left = sorted(glob(str(root / s / "*.left.depth.png")))
             disparity_maps_right = sorted(glob(str(root / s / "*.right.depth.png")))
+
             disparity_maps = list((l, r) for l, r in zip(disparity_maps_left, disparity_maps_right))
             self._disparities += disparity_maps
@@ -660,3 +705,59 @@ def _read_disparity(self, file_path: str) -> Tuple:
         disparity = (fx * 6.0 * 100) / depth.astype(np.float32)
         valid = disparity > 0
         return disparity, valid
+
+    def __getitem__(self, index: int) -> Tuple[Tensor, Tensor, Tensor, Tensor]:
+        return super().__getitem__(index)
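The `_read_disparity` conversion above is the standard stereo relation disparity = focal_length_px * baseline / depth. The constants suggest a 6.0 cm baseline with a factor of 100 for the depth unit, though the patch does not state this; a small self-contained version under that assumption:

import numpy as np

def depth_to_disparity(depth: np.ndarray, fx: float, baseline: float = 6.0, unit_scale: float = 100.0):
    # disparity = fx * B / Z, mirroring (fx * 6.0 * 100) / depth above
    with np.errstate(divide="ignore"):
        disparity = (fx * baseline * unit_scale) / depth.astype(np.float32)
    # slightly stricter than the patch's `disparity > 0`: also drops
    # divide-by-zero infinities from depth == 0 pixels
    valid = np.isfinite(disparity) & (disparity > 0)
    return disparity, valid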
+ """ + + def __init__(self, root: str, split: str = "train", transforms: Optional[Callable] = None): + super().__init__(root, transforms) + + root = Path(root) / "InStereo2k" / split + + imgs_left = sorted(glob(str(root / "*" / "left.png"))) + imgs_right = list(p.replace("left", "right") for p in imgs_left) + + if not len(imgs_left) or not len(imgs_right): + raise FileNotFoundError("No images found in {}".format(root)) + + imgs = list((l, r) for l, r in zip(imgs_left, imgs_right)) + self._images = imgs + + disparity_maps_left = list(p.replace("left", "left_disp") for p in imgs_left) + disparity_maps_right = list(p.replace("right", "right_disp") for p in imgs_left) + + disparity_maps = list((l, r) for l, r in zip(disparity_maps_left, disparity_maps_right)) + self._disparities = disparity_maps + + def _read_disparity(self, file_path: str) -> Tuple: + disparity = np.array(Image.open(file_path), dtype=np.float32) + valid = np.ones_like(disparity) + return disparity, valid From 0959499813c5213d9d035128088ea8ffeceb0444 Mon Sep 17 00:00:00 2001 From: Ponku Date: Wed, 13 Jul 2022 15:10:17 +0100 Subject: [PATCH 06/35] "Added calibrartion arg for Middlebury2014 (#6259)" --- test/test_datasets.py | 50 ++++++++--- torchvision/datasets/_stereo_matching.py | 107 +++++++++++++++++++---- 2 files changed, 127 insertions(+), 30 deletions(-) diff --git a/test/test_datasets.py b/test/test_datasets.py index d390c30cee9..5d557020ac8 100644 --- a/test/test_datasets.py +++ b/test/test_datasets.py @@ -193,10 +193,7 @@ def inject_fake_data(self, tmpdir, config): scene_name = split_scene_map[config["split"]][idx] self._make_scene_folder(root_dir=split_dir, scene_name=scene_name, split=config["split"]) - # account for perfect / imperfect calibrations - if config["split"] != "test": - num_examples *= 2 - + # TODO: add calibration argument test return num_examples def test_train_splits(self): @@ -428,12 +425,15 @@ class StereoSceneFlowTestCase(datasets_utils.ImageDatasetTestCase): FEATURE_TYPES = (PIL.Image.Image, PIL.Image.Image, (np.ndarray, type(None)), (np.ndarray, type(None))) @staticmethod - def _create_pfm_folder(root: str, name: str, file_name_fn: Callable[..., str], num_examples: int, size: Tuple[int, int]): + def _create_pfm_folder(root: str, name: str, file_name_fn: Callable[..., str], num_examples: int, size: Tuple[int, int]) -> List[str]: root = pathlib.Path(root) / name os.makedirs(root, exist_ok=True) + paths = [] for i in range(num_examples): datasets_utils.make_fake_pfm_file(size[0], size[1], root / file_name_fn(i)) + paths.append(str(root / file_name_fn(i))) + return paths def inject_fake_data(self, tmpdir, config): scene_flow_dir = pathlib.Path(tmpdir) / "SceneFlow" @@ -447,27 +447,25 @@ def inject_fake_data(self, tmpdir, config): "final": "frames_finalpass", } - num_examples = 4 + num_examples = 1 pass_dir_name = pass_dir_map[config["pass_name"]] # create pass directories pass_dir = split_dir / pass_dir_name - disp_dir = split_dir / "disp" + disp_dir = split_dir / "disparity" os.makedirs(pass_dir, exist_ok=True) os.makedirs(disp_dir, exist_ok=True) - # root / pass / direction / scene / .imgs - # root / disparity / direction / scene / .imgs for direction in ["left", "right"]: for scene_idx in range(num_examples): - # scene_dir = pass_dir / direction / f"scene_{scene_idx:06d}" os.makedirs(pass_dir / f"scene_{scene_idx:06d}", exist_ok=True) datasets_utils.create_image_folder( root=pass_dir / f"scene_{scene_idx:06d}", name=direction, file_name_fn=lambda i: f"{i:06d}.png", num_examples=3, - size=(3, 
100, 200), + size=(3, 200, 100), ) + os.makedirs(disp_dir / f"scene_{scene_idx:06d}", exist_ok=True) self._create_pfm_folder( root=disp_dir / f"scene_{scene_idx:06d}", @@ -480,18 +478,20 @@ def inject_fake_data(self, tmpdir, config): return num_examples * 3 def test_train_splits(self): - for split, pass_name in itertools.product(["FlyingThings3D", "Driving", "Monkaa"], ["clean", "final"]): - with self.create_dataset(split=split, pass_name=pass_name) as (dataset, _): + for split_name, pass_name in itertools.product(["FlyingThings3D", "Driving", "Monkaa"], ["clean", "final"]): + with self.create_dataset(split=split_name, pass_name=pass_name) as (dataset, _): for left, right, disparity, valid_mask in dataset: + print(f"Split {split_name} pass {pass_name}") left_array = np.array(left) right_array = np.array(right) h, w, c = left_array.shape # check that left and right are the same size assert left_array.shape == right_array.shape + print(left_array.shape) # check general shapes assert c == 3 assert len(disparity.shape) == 3 - assert len(valid_mask.shape) == 2 + assert len(valid_mask.shape) == 3 assert disparity.shape == (h, w, 3) # check that valid mask is the same size as the disparity dh, dw, _ = disparity.shape @@ -534,6 +534,28 @@ def inject_fake_data(self, tmpdir, config): return num_examples + def test_splits(self): + for split_name in ["single", "mixed"]: + with self.create_dataset(split=split_name) as (dataset, _): + for left, right, disparity, valid_mask in dataset: + print(f"Split {split_name}") + left_array = np.array(left) + right_array = np.array(right) + h, w, c = left_array.shape + # check that left and right are the same size + assert left_array.shape == right_array.shape + print(left_array.shape) + # check general shapes + assert c == 3 + assert len(disparity.shape) == 3 + assert len(valid_mask.shape) == 2 + assert disparity.shape == (h, w) + # check that valid mask is the same size as the disparity + dh, dw = disparity.shape + mh, mw = valid_mask.shape + assert dh == mh + assert dw == mw + class STL10TestCase(datasets_utils.ImageDatasetTestCase): DATASET_CLASS = datasets.STL10 diff --git a/torchvision/datasets/_stereo_matching.py b/torchvision/datasets/_stereo_matching.py index bcca2b12efb..0bd75fe82a4 100644 --- a/torchvision/datasets/_stereo_matching.py +++ b/torchvision/datasets/_stereo_matching.py @@ -6,6 +6,7 @@ import re import shutil from typing import Callable, List, Optional, Tuple, Any +import warnings from jsonschema import ValidationError from torch import Tensor from .vision import VisionDataset @@ -238,6 +239,7 @@ def __init__( self, root: str, split: str = "train", + calibration: Optional[str] = None, use_ambient_views: bool = False, transforms: Optional[Callable] = None, download: bool = False @@ -245,6 +247,22 @@ def __init__( super().__init__(root, transforms) verify_str_arg(split, "split", valid_values=("train", "test", "additional")) + if calibration: + verify_str_arg(calibration, "calibration", valid_values=("perfect", "imperfect", None)) + if split == "test": + warnings.warn( + "\nSplit 'test' has only no calibration settings, ignoring calibration argument.", + RuntimeWarning + ) + else: + if split != "test": + calibration = "perfect" + warnings.warn( + f"\nSplit '{split}' has calibration settings, however None was provided as an argument." + f"\nSetting calibration to 'perfect' for split '{split}'. 
Available calibration settings are: 'perfect', 'imperfect', 'both'.",
+                    RuntimeWarning
+                )
+
         if download:
             self._download_dataset(root)
@@ -263,25 +281,36 @@ def __init__(
         ):
             raise FileNotFoundError(f"Provided root folder does not contain any scenes from the {split} split.")

-        imgs_left = sorted(glob(str(root / split / "*" / "im0.png")))
-        imgs_right = sorted(glob(str(root / split / "*" / "im1.png")))
+        calibration_suffixes = {
+            None: [""],
+            "perfect": ["-perfect"],
+            "imperfect": ["-imperfect"],
+            "both": ["-perfect", "-imperfect"],
+        }[calibration]

-        if not len(imgs_left) or not len(imgs_right):
-            raise FileNotFoundError("No images found in {}".format(root))
+        for calibration_suffix in calibration_suffixes:
+            scene_pattern = "*" + calibration_suffix

-        self._images = list((l, r) for l, r in zip(imgs_left, imgs_right))
+            imgs_left = sorted(glob(str(root / split / scene_pattern / "im0.png")))
+            imgs_right = sorted(glob(str(root / split / scene_pattern / "im1.png")))

-        if split == "test":
-            dsp_maps_left, dsp_maps_right = list("" for _ in imgs_left), list("" for _ in imgs_right)
-        else:
+            if not len(imgs_left) or not len(imgs_right):
+                raise FileNotFoundError("No images found in {}".format(root))
+
+            self._images += list((l, r) for l, r in zip(imgs_left, imgs_right))
+
+            if split == "test":
+                dsp_maps_left, dsp_maps_right = list("" for _ in imgs_left), list("" for _ in imgs_right)
+            else:
+
+                dsp_maps_left = sorted(glob(str(root / split / "*" / "disp0.pfm")))
+                dsp_maps_right = sorted(glob(str(root / split / "*" / "disp1.pfm")))

-            dsp_maps_left = sorted(glob(str(root / split / "*" / "disp0.pfm")))
-            dsp_maps_right = sorted(glob(str(root / split / "*" / "disp1.pfm")))
-        self._disparities = list((l, r) for l, r in zip(dsp_maps_left, dsp_maps_right))
+            self._disparities += list((l, r) for l, r in zip(dsp_maps_left, dsp_maps_right))

         self.use_ambient_views = use_ambient_views

-    def __getitem__(self, index: int) -> Tuple:
+    def __getitem__(self, index: int) -> Tuple[Tensor, Tensor, Tensor, Tensor]:
         return super().__getitem__(index)

     def _read_img(self, file_path: str) -> Image.Image:
@@ -579,17 +608,60 @@ def _read_disparity(self, file_path: str) -> Tuple:
         # out of frame mask
         off_mask = np.array(Image.open(file_path.replace("disparities", "outofframe"))) == 0
         # combine the masks together
-        valid_mask = np.logical_or(off_mask, valid_mask)
+        valid_mask = np.logical_and(off_mask, valid_mask)
         return disparity_map, valid_mask

-    def __getitem__(self, index: int) -> Tuple[Tuple, Tuple, Tuple]:
+    def __getitem__(self, index: int) -> Tuple[Tensor, Tensor, Tensor, Tensor]:
         return super().__getitem__(index)
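The `np.logical_or` to `np.logical_and` change above is easy to gloss over: both Sintel masks are True where a pixel is usable (inside the frame, respectively not occluded), so only their conjunction marks pixels with trustworthy ground truth. A tiny illustration with toy arrays, not dataset values:

import numpy as np

in_frame = np.array([[True, True], [True, False]])      # False -> out of frame
not_occluded = np.array([[True, False], [True, True]])  # False -> occluded

valid_or = np.logical_or(in_frame, not_occluded)    # all True here: wrongly keeps bad pixels
valid_and = np.logical_and(in_frame, not_occluded)  # True only where both conditions hold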
+ """ - def __init__(self, root: str, split: str = "train", pass_name: str = "clean", transforms: Optional[Callable] = None): + def __init__(self, root: str, split: str = "FlyingThings3D", pass_name: str = "clean", transforms: Optional[Callable] = None): super().__init__(root, transforms) root = Path(root) / "SceneFlow" @@ -622,6 +694,9 @@ def __init__(self, root: str, split: str = "train", pass_name: str = "clean", tr self._disparities += disparity_maps def _read_disparity(self, file_path: str) -> Tuple: + if not os.path.exists(file_path): + raise FileNotFoundError("Disparity map {} not found".format(file_path)) + disparity = read_pfm_file(file_path) valid = np.ones_like(disparity) return disparity, valid From a9365fe3d095d6c7e695fae483595359f159612a Mon Sep 17 00:00:00 2001 From: Ponku Date: Wed, 13 Jul 2022 15:58:46 +0100 Subject: [PATCH 07/35] "Fixed test calibration test Middlebury2014 (#6259)" --- test/test_datasets.py | 40 +++++++++++++++++++++--- torchvision/datasets/_stereo_matching.py | 7 +++-- 2 files changed, 40 insertions(+), 7 deletions(-) diff --git a/test/test_datasets.py b/test/test_datasets.py index 5d557020ac8..518a95362b9 100644 --- a/test/test_datasets.py +++ b/test/test_datasets.py @@ -150,7 +150,11 @@ def test_bad_input(self): class StereoMiddlebury2014TestCase(datasets_utils.ImageDatasetTestCase): DATASET_CLASS = datasets.StereoMiddlebury2014 - ADDITIONAL_CONFIGS = datasets_utils.combinations_grid(split=("train", "test", "additional"), use_ambient_views=(True, False)) + ADDITIONAL_CONFIGS = datasets_utils.combinations_grid( + split=("train", "additional"), + calibration=("perfect", "imperfect", "both"), + use_ambient_views=(True, False), + ) FEATURE_TYPES = (PIL.Image.Image, PIL.Image.Image, (np.ndarray, type(None)), (np.ndarray, type(None))) @staticmethod @@ -193,12 +197,15 @@ def inject_fake_data(self, tmpdir, config): scene_name = split_scene_map[config["split"]][idx] self._make_scene_folder(root_dir=split_dir, scene_name=scene_name, split=config["split"]) - # TODO: add calibration argument test + print(f"Created {scene_name} for split {config['split']}") + + if config["calibration"] == "both": + num_examples *= 2 return num_examples def test_train_splits(self): - for split in ["train", "additional"]: - with self.create_dataset(split=split) as (dataset, _): + for split, calibration in itertools.product(["train", "additional"], ["perfect", "imperfect", "both"]): + with self.create_dataset(split=split, calibration=calibration) as (dataset, _): for left, right, disparity, valid_mask in dataset: left_array = np.array(left) right_array = np.array(right) @@ -219,7 +226,7 @@ def test_train_splits(self): def test_test_split(self): for split in ["test"]: - with self.create_dataset(split=split) as (dataset, _): + with self.create_dataset(split=split, calibration=None) as (dataset, _): for left, right, disparity, valid_mask in dataset: left_array = np.array(left) right_array = np.array(right) @@ -239,6 +246,29 @@ def test_augmented_view_usage(self): # check that left and right are the same size assert left_array.shape == right_array.shape + def test_warnings_train(self): + # train set invalid + split = "train" + calibration = None + with pytest.warns( + RuntimeWarning, + match=f"\nSplit '{split}' has calibration settings, however None was provided as an argument." + f"\nSetting calibration to 'perfect' for split '{split}'. 
+    def test_warnings_train(self):
+        # train set invalid
+        split = "train"
+        calibration = None
+        with pytest.warns(
+            RuntimeWarning,
+            match=f"\nSplit '{split}' has calibration settings, however None was provided as an argument."
+            f"\nSetting calibration to 'perfect' for split '{split}'. Available calibration settings are: 'perfect', 'imperfect', 'both'.",
+        ):
+            with self.create_dataset(split=split, calibration=calibration):
+                pass
+
+    def test_warnings_test(self):
+        # test set invalid
+        split = "test"
+        calibration = "perfect"
+        with pytest.warns(
+            RuntimeWarning,
+            match="\nSplit 'test' has only no calibration settings, ignoring calibration argument."
+        ):
+            with self.create_dataset(split=split, calibration=calibration):
+                pass
+
     def test_bad_input(self):
         with pytest.raises(ValueError, match="Unknown value 'bad' for argument split"):
             with self.create_dataset(split="bad"):
                 pass
diff --git a/torchvision/datasets/_stereo_matching.py b/torchvision/datasets/_stereo_matching.py
index 0bd75fe82a4..702386b05bd 100644
--- a/torchvision/datasets/_stereo_matching.py
+++ b/torchvision/datasets/_stereo_matching.py
@@ -239,7 +239,7 @@ def __init__(
         self,
         root: str,
         split: str = "train",
-        calibration: Optional[str] = None,
+        calibration: Optional[str] = "perfect",
         use_ambient_views: bool = False,
         transforms: Optional[Callable] = None,
         download: bool = False
@@ -248,8 +248,9 @@ def __init__(
         verify_str_arg(split, "split", valid_values=("train", "test", "additional"))

         if calibration:
-            verify_str_arg(calibration, "calibration", valid_values=("perfect", "imperfect", None))
+            verify_str_arg(calibration, "calibration", valid_values=("perfect", "imperfect", "both", None))
             if split == "test":
+                calibration = None
                 warnings.warn(
                     "\nSplit 'test' has only no calibration settings, ignoring calibration argument.",
                     RuntimeWarning
@@ -267,6 +268,7 @@ def __init__(
             self._download_dataset(root)

         root = Path(root) / "Middlebury2014"
+        print(split)
         if not os.path.exists(root / split):
             raise FileNotFoundError(
                 f"The {split} directory was not found in the provided root directory"
@@ -290,6 +292,7 @@ def __init__(
         for calibration_suffix in calibration_suffixes:
             scene_pattern = "*" + calibration_suffix
+            print(scene_pattern)

             imgs_left = sorted(glob(str(root / split / scene_pattern / "im0.png")))
From 96c7bf4aa5be5b01c98016207deda8846d55212c Mon Sep 17 00:00:00 2001
From: Ponku
Date: Thu, 14 Jul 2022 10:03:27 +0100
Subject: [PATCH 08/35] Clean-up. Disp map format to (C, H, W) & valid mask to (H, W).
(#6259) --- test/test_datasets.py | 3552 +++++++++++----------- torchvision/datasets/__init__.py | 11 +- torchvision/datasets/_stereo_matching.py | 288 +- 3 files changed, 2081 insertions(+), 1770 deletions(-) diff --git a/test/test_datasets.py b/test/test_datasets.py index 518a95362b9..dd3c89b9bdc 100644 --- a/test/test_datasets.py +++ b/test/test_datasets.py @@ -1,4 +1,3 @@ -from abc import abstractmethod import bz2 import contextlib import csv @@ -25,701 +24,542 @@ from torchvision import datasets -class StereoETH3DTestCase(datasets_utils.ImageDatasetTestCase): - DATASET_CLASS = datasets.StereoETH3D - ADDITIONAL_CONFIGS = datasets_utils.combinations_grid(split=("train", "test")) - FEATURE_TYPES = (PIL.Image.Image, PIL.Image.Image, (np.ndarray, type(None)), (np.ndarray, type(None))) - - @staticmethod - def _create_scene_folder(num_examples: int, root_dir: str) -> List[pathlib.Path]: - # create the scene folder - image_paths = [] - # make the root_dir if it does not exits - os.makedirs(root_dir, exist_ok=True) - - for i in range(num_examples): - scene_dir = os.path.join(root_dir, f"scene_{i}") - os.makedirs(scene_dir, exist_ok=True) - # populate with left right images - image_paths.append(datasets_utils.create_image_file(root=scene_dir, name="im0.png", size=(100, 100))) - image_paths.append(datasets_utils.create_image_file(root=scene_dir, name="im1.png", size=(100, 100))) - return image_paths - - @staticmethod - def _create_annotation_folder(num_examples: int, root_dir: str) -> List[pathlib.Path]: - paths = [] - # make the root_dir if it does not exits - os.makedirs(root_dir, exist_ok=True) +class STL10TestCase(datasets_utils.ImageDatasetTestCase): + DATASET_CLASS = datasets.STL10 + ADDITIONAL_CONFIGS = datasets_utils.combinations_grid(split=("train", "test", "unlabeled", "train+unlabeled")) - # create scene directories - for i in range(num_examples): - scene_dir = os.path.join(root_dir, f"scene_{i}") - os.makedirs(scene_dir, exist_ok=True) - # populate with a random png file for occlusion mask, and a pfm file for disparity - paths.append(datasets_utils.create_image_file(root=scene_dir, name="mask0nocc.png", size=(1, 100, 100))) - pfm_path = os.path.join(scene_dir, "disp0GT.pfm") - datasets_utils.make_fake_pfm_file(h=100, w=100, file_name=pfm_path) - paths.append(pfm_path) - return paths + @ staticmethod + def _make_binary_file(num_elements, root, name): + file_name = os.path.join(root, name) + np.zeros(num_elements, dtype=np.uint8).tofile(file_name) - def inject_fake_data(self, tmpdir, config): - eth3d_dir = os.path.join(tmpdir, "ETH3D") + @ staticmethod + def _make_image_file(num_images, root, name, num_channels=3, height=96, width=96): + STL10TestCase._make_binary_file(num_images * num_channels * height * width, root, name) - num_examples = 2 if config["split"] == "train" else 3 + @ staticmethod + def _make_label_file(num_images, root, name): + STL10TestCase._make_binary_file(num_images, root, name) - split_name = "two_view_training" if config["split"] == "train" else "two_view_test" - split_dir = os.path.join(eth3d_dir, split_name) - self._create_scene_folder(num_examples, split_dir) + @ staticmethod + def _make_class_names_file(root, name="class_names.txt"): + with open(os.path.join(root, name), "w") as fh: + for cname in ("airplane", "bird"): + fh.write(f"{cname}\n") - if config["split"] == "train": - annot_dir = os.path.join(eth3d_dir, "two_view_training_gt") - self._create_annotation_folder(num_examples, annot_dir) + @ staticmethod + def _make_fold_indices_file(root): + num_folds 
= 10 + offset = 0 + with open(os.path.join(root, "fold_indices.txt"), "w") as fh: + for fold in range(num_folds): + line = " ".join([str(idx) for idx in range(offset, offset + fold + 1)]) + fh.write(f"{line}\n") + offset += fold + 1 - return num_examples + return tuple(range(1, num_folds + 1)) - def test_training_test_splits(self): - with self.create_dataset(split="train") as (dataset, _): - assert dataset._images and len(dataset._images) == len(dataset._disparities), "Training images do not match with training disparities" - for _, _, disparity, valid_mask in dataset: - assert len(disparity.shape) == 3 - assert len(valid_mask.shape) == 2 - dh, dw, _ = disparity.shape - mh, mw = valid_mask.shape - assert dh == mh - assert dw == mw + @ staticmethod + def _make_train_files(root, num_unlabeled_images=1): + num_images_in_fold = STL10TestCase._make_fold_indices_file(root) + num_train_images = sum(num_images_in_fold) - with self.create_dataset(split="test") as (dataset, _): - assert all(d == ("", "") for d in dataset._disparities) - for _, _, disparity, valid_mask in dataset: - assert disparity is None - assert valid_mask is None + STL10TestCase._make_image_file(num_train_images, root, "train_X.bin") + STL10TestCase._make_label_file(num_train_images, root, "train_y.bin") + STL10TestCase._make_image_file(1, root, "unlabeled_X.bin") - def test_bad_input(self): - with pytest.raises(ValueError, match="Unknown value 'bad' for argument split"): - with self.create_dataset(split="bad"): - pass + return dict(train=num_train_images, unlabeled=num_unlabeled_images) + @ staticmethod + def _make_test_files(root, num_images=2): + STL10TestCase._make_image_file(num_images, root, "test_X.bin") + STL10TestCase._make_label_file(num_images, root, "test_y.bin") -class CREStereoSynthethicTestCase(datasets_utils.ImageDatasetTestCase): - DATASET_CLASS = datasets.CREStereoSynthetic - ADDITIONAL_CONFIGS = datasets_utils.combinations_grid(split=("tree", "shapenet", "reflective", "hole")) - FEATURE_TYPES = (PIL.Image.Image, PIL.Image.Image, (np.ndarray, type(None)), (np.ndarray, type(None))) + return dict(test=num_images) def inject_fake_data(self, tmpdir, config): - crestereo_dir = pathlib.Path(tmpdir) / "CREStereo" - os.makedirs(crestereo_dir, exist_ok=True) + root_folder = os.path.join(tmpdir, "stl10_binary") + os.mkdir(root_folder) - split_dir = crestereo_dir / config["split"] - os.makedirs(split_dir, exist_ok=True) - num_examples = 4 + num_images_in_split = self._make_train_files(root_folder) + num_images_in_split.update(self._make_test_files(root_folder)) + self._make_class_names_file(root_folder) - for idx in range(num_examples): - datasets_utils.create_image_file(root=split_dir, name=f"{idx}_left.jpg", size=(100, 100)) - datasets_utils.create_image_file(root=split_dir, name=f"{idx}_right.jpg", size=(100, 100)) - # these are going to end up being gray scale images - datasets_utils.create_image_file(root=split_dir, name=f"{idx}_left.disp.jpg", size=(1, 100, 100)) - datasets_utils.create_image_file(root=split_dir, name=f"{idx}_right.disp.jpg", size=(1, 100, 100)) + return sum(num_images_in_split[part] for part in config["split"].split("+")) - return num_examples + def test_folds(self): + for fold in range(10): + with self.create_dataset(split="train", folds=fold) as (dataset, _): + assert len(dataset) == fold + 1 - def test_splits(self): - for split in ("tree", "shapenet", "reflective", "hole"): - with self.create_dataset(split=split) as (dataset, _): - for left, right, disparity, valid_mask in dataset: - left_array 
= np.array(left) - right_array = np.array(right) - h, w, c = left_array.shape - # check that left and right are the same size - assert left_array.shape == right_array.shape - # check general shapes - assert c == 3 - assert len(disparity.shape) == 2 - assert len(valid_mask.shape) == 2 - assert disparity.shape == (h, w) - # check that valid mask is the same size as the disparity - dh, dw = disparity.shape - mh, mw = valid_mask.shape - assert dh == mh - assert dw == mw + def test_unlabeled(self): + with self.create_dataset(split="unlabeled") as (dataset, _): + labels = [dataset[idx][1] for idx in range(len(dataset))] + assert all(label == -1 for label in labels) - def test_bad_input(self): - with pytest.raises(ValueError, match="Unknown value 'bad' for argument split"): - with self.create_dataset(split="bad"): + def test_invalid_folds1(self): + with pytest.raises(ValueError): + with self.create_dataset(folds=10): pass + def test_invalid_folds2(self): + with pytest.raises(ValueError): + with self.create_dataset(folds="0"): + pass + + +class Caltech101TestCase(datasets_utils.ImageDatasetTestCase): + DATASET_CLASS = datasets.Caltech101 + FEATURE_TYPES = (PIL.Image.Image, (int, np.ndarray, tuple)) -class StereoMiddlebury2014TestCase(datasets_utils.ImageDatasetTestCase): - DATASET_CLASS = datasets.StereoMiddlebury2014 ADDITIONAL_CONFIGS = datasets_utils.combinations_grid( - split=("train", "additional"), - calibration=("perfect", "imperfect", "both"), - use_ambient_views=(True, False), + target_type=("category", "annotation", ["category", "annotation"]) ) - FEATURE_TYPES = (PIL.Image.Image, PIL.Image.Image, (np.ndarray, type(None)), (np.ndarray, type(None))) - - @staticmethod - def _make_scene_folder(root_dir: str, scene_name: str, split: str) -> List[str]: - calibrations = [""] if split == "test" else ["-perfect", "-imperfect"] - scene_dirs = [] - for c in calibrations: - scene_dir = os.path.join(root_dir, f"{scene_name}{c}") - os.makedirs(scene_dir, exist_ok=True) - # make normal images first - datasets_utils.create_image_file(root=scene_dir, name=f"im0.png", size=(3, 100, 100)) - datasets_utils.create_image_file(root=scene_dir, name=f"im1.png", size=(3, 100, 100)) - datasets_utils.create_image_file(root=scene_dir, name=f"im1E.png", size=(3, 100, 100)) - datasets_utils.create_image_file(root=scene_dir, name=f"im1L.png", size=(3, 100, 100)) - # these are going to end up being gray scale images - datasets_utils.make_fake_pfm_file(h=100, w=100, file_name=os.path.join(scene_dir, "disp0.pfm")) - datasets_utils.make_fake_pfm_file(h=100, w=100, file_name=os.path.join(scene_dir, "disp1.pfm")) - scene_dirs.append(scene_dir) - return scene_dirs + REQUIRED_PACKAGES = ("scipy",) def inject_fake_data(self, tmpdir, config): - split_scene_map = { - "train": ["Adirondack", "Jadeplant", "Motorcycle", "Piano"], - "additional": ["Backpack", "Bicycle1", "Cable", "Classroom1"], - "test": ["Plants", "Classroom2E", "Classroom2", "Australia"] - } + root = pathlib.Path(tmpdir) / "caltech101" + images = root / "101_ObjectCategories" + annotations = root / "Annotations" - middlebury_dir = pathlib.Path(tmpdir, "Middlebury2014") - os.makedirs(middlebury_dir, exist_ok=True) + categories = (("Faces", "Faces_2"), ("helicopter", "helicopter"), ("ying_yang", "ying_yang")) + num_images_per_category = 2 - split_dir = middlebury_dir / config["split"] - os.makedirs(split_dir, exist_ok=True) + for image_category, annotation_category in categories: + datasets_utils.create_image_folder( + root=images, + name=image_category, + 
file_name_fn=lambda idx: f"image_{idx + 1:04d}.jpg", + num_examples=num_images_per_category, + ) + self._create_annotation_folder( + root=annotations, + name=annotation_category, + file_name_fn=lambda idx: f"annotation_{idx + 1:04d}.mat", + num_examples=num_images_per_category, + ) - num_examples = 4 - for idx in range(num_examples): - # special case for test_bad_input - if config["split"] not in split_scene_map: - return 0 + # This is included in the original archive, but is removed by the dataset. Thus, an empty directory suffices. + os.makedirs(images / "BACKGROUND_Google") - scene_name = split_scene_map[config["split"]][idx] - self._make_scene_folder(root_dir=split_dir, scene_name=scene_name, split=config["split"]) + return num_images_per_category * len(categories) - print(f"Created {scene_name} for split {config['split']}") + def _create_annotation_folder(self, root, name, file_name_fn, num_examples): + root = pathlib.Path(root) / name + os.makedirs(root) - if config["calibration"] == "both": - num_examples *= 2 - return num_examples + for idx in range(num_examples): + self._create_annotation_file(root, file_name_fn(idx)) - def test_train_splits(self): - for split, calibration in itertools.product(["train", "additional"], ["perfect", "imperfect", "both"]): - with self.create_dataset(split=split, calibration=calibration) as (dataset, _): - for left, right, disparity, valid_mask in dataset: - left_array = np.array(left) - right_array = np.array(right) - h, w, c = left_array.shape - # check that left and right are the same size - assert left_array.shape == right_array.shape - # check general shapes - assert c == 3 - assert len(disparity.shape) == 3 - assert len(valid_mask.shape) == 3 - assert disparity.shape == (h, w, 3) - # check that valid mask is the same size as the disparity - dh, dw, c = disparity.shape - print(valid_mask.shape) - mh, mw, _ = valid_mask.shape - assert dh == mh - assert dw == mw + def _create_annotation_file(self, root, name): + mdict = dict(obj_contour=torch.rand((2, torch.randint(3, 6, size=())), dtype=torch.float64).numpy()) + datasets_utils.lazy_importer.scipy.io.savemat(str(pathlib.Path(root) / name), mdict) - def test_test_split(self): - for split in ["test"]: - with self.create_dataset(split=split, calibration=None) as (dataset, _): - for left, right, disparity, valid_mask in dataset: - left_array = np.array(left) - right_array = np.array(right) - h, w, c = left_array.shape - # check that left and right are the same size - assert left_array.shape == right_array.shape - # check general shapes - assert c == 3 - assert disparity is None - assert valid_mask is None + def test_combined_targets(self): + target_types = ["category", "annotation"] - def test_augmented_view_usage(self): - with self.create_dataset(split="train", use_ambient_views=True) as (dataset, _): - for left, right, _, _ in dataset: - left_array = np.array(left) - right_array = np.array(right) - # check that left and right are the same size - assert left_array.shape == right_array.shape + individual_targets = [] + for target_type in target_types: + with self.create_dataset(target_type=target_type) as (dataset, _): + _, target = dataset[0] + individual_targets.append(target) - def test_warnings_train(self): - # train set invalid - split = "train" - calibration = None - with pytest.warns( - RuntimeWarning, - match=f"\nSplit '{split}' has calibration settings, however None was provided as an argument." - f"\nSetting calibration to 'perfect' for split '{split}'. 
Available calibration settings are: 'perfect', 'imperfect', 'both'.", - ): - with self.create_dataset(split=split, calibration=calibration): - pass + with self.create_dataset(target_type=target_types) as (dataset, _): + _, combined_targets = dataset[0] - def test_warnings_test(self): - # test set invalid - split = "test" - calibration = "perfect" - with pytest.warns( - RuntimeWarning, - match="\nSplit 'test' has only no calibration settings, ignoring calibration argument." - ): - with self.create_dataset(split=split, calibration=calibration): - pass + actual = len(individual_targets) + expected = len(combined_targets) + assert ( + actual == expected + ), "The number of the returned combined targets does not match the the number targets if requested " + f"individually: {actual} != {expected}", - def test_bad_input(self): - with pytest.raises(ValueError, match="Unknown value 'bad' for argument split"): - with self.create_dataset(split="bad"): - pass + for target_type, combined_target, individual_target in zip(target_types, combined_targets, individual_targets): + with self.subTest(target_type=target_type): + actual = type(combined_target) + expected = type(individual_target) + assert ( + actual is expected + ), "Type of the combined target does not match the type of the corresponding individual target: " + f"{actual} is not {expected}", -class StereoKitti2012TestCase(datasets_utils.ImageDatasetTestCase): - DATASET_CLASS = datasets.StereoKitti2012 - ADDITIONAL_CONFIGS = datasets_utils.combinations_grid(split=("train", "test")) - FEATURE_TYPES = (PIL.Image.Image, PIL.Image.Image, (np.ndarray, type(None)), (np.ndarray, type(None))) +class Caltech256TestCase(datasets_utils.ImageDatasetTestCase): + DATASET_CLASS = datasets.Caltech256 def inject_fake_data(self, tmpdir, config): - kitti_dir = pathlib.Path(tmpdir) / "Kitti2012" - os.makedirs(kitti_dir, exist_ok=True) - - split_dir = kitti_dir / (config["split"] + "ing") - os.makedirs(split_dir, exist_ok=True) - - num_examples = 4 + tmpdir = pathlib.Path(tmpdir) / "caltech256" / "256_ObjectCategories" - datasets_utils.create_image_folder( - root=split_dir, - name="colored_0", - file_name_fn=lambda i: f"{i:06d}_10.png", - num_examples=num_examples, - size=(3, 100, 200), - ) - datasets_utils.create_image_folder( - root=split_dir, - name="colored_1", - file_name_fn=lambda i: f"{i:06d}_10.png", - num_examples=num_examples, - size=(3, 100, 200), - ) + categories = ((1, "ak47"), (127, "laptop-101"), (257, "clutter")) + num_images_per_category = 2 - if config["split"] == "train": + for idx, category in categories: datasets_utils.create_image_folder( - root=split_dir, - name="disp_noc", - file_name_fn=lambda i: f"{i:06d}.png", - num_examples=num_examples, - # Kitti2012 uses a single channel image for disparities - size=(1, 100, 200), + tmpdir, + name=f"{idx:03d}.{category}", + file_name_fn=lambda image_idx: f"{idx:03d}_{image_idx + 1:04d}.jpg", + num_examples=num_images_per_category, ) - return num_examples - - def test_train_splits(self): - for split in ["train"]: - with self.create_dataset(split=split) as (dataset, _): - for left, right, disparity, valid_mask in dataset: - left_array = np.array(left) - right_array = np.array(right) - h, w, c = left_array.shape - # check that left and right are the same size - assert left_array.shape == right_array.shape - # check general shapes - assert c == 3 - assert len(disparity.shape) == 2 - assert len(valid_mask.shape) == 2 - assert disparity.shape == (h, w) - # check that valid mask is the same size as the disparity 
- dh, dw = disparity.shape - mh, mw = valid_mask.shape - assert dh == mh - assert dw == mw - - def test_test_split(self): - for split in ["test"]: - with self.create_dataset(split=split) as (dataset, _): - for left, right, disparity, valid_mask in dataset: - left_array = np.array(left) - right_array = np.array(right) - h, w, c = left_array.shape - # check that left and right are the same size - assert left_array.shape == right_array.shape - # check general shapes - assert c == 3 - assert disparity is None - assert valid_mask is None - - def test_bad_input(self): - with pytest.raises(ValueError, match="Unknown value 'bad' for argument split"): - with self.create_dataset(split="bad"): - pass + return num_images_per_category * len(categories) -class StereoKitti2015TestCase(datasets_utils.ImageDatasetTestCase): - DATASET_CLASS = datasets.StereoKitti2015 - ADDITIONAL_CONFIGS = datasets_utils.combinations_grid(split=("train", "test")) - FEATURE_TYPES = (PIL.Image.Image, PIL.Image.Image, (np.ndarray, type(None)), (np.ndarray, type(None))) +class WIDERFaceTestCase(datasets_utils.ImageDatasetTestCase): + DATASET_CLASS = datasets.WIDERFace + FEATURE_TYPES = (PIL.Image.Image, (dict, type(None))) # test split returns None as target + ADDITIONAL_CONFIGS = datasets_utils.combinations_grid(split=("train", "val", "test")) def inject_fake_data(self, tmpdir, config): - kitti_dir = pathlib.Path(tmpdir) / "Kitti2015" - os.makedirs(kitti_dir, exist_ok=True) - - split_dir = kitti_dir / (config["split"] + "ing") - os.makedirs(split_dir, exist_ok=True) - - num_examples = 4 + widerface_dir = pathlib.Path(tmpdir) / "widerface" + annotations_dir = widerface_dir / "wider_face_split" + os.makedirs(annotations_dir) - datasets_utils.create_image_folder( - root=split_dir, - name="image_2", - file_name_fn=lambda i: f"{i:06d}_10.png", - num_examples=num_examples, - size=(3, 100, 200), - ) - datasets_utils.create_image_folder( - root=split_dir, - name="image_3", - file_name_fn=lambda i: f"{i:06d}_10.png", - num_examples=num_examples, - size=(3, 100, 200), - ) + split_to_idx = split_to_num_examples = { + "train": 1, + "val": 2, + "test": 3, + } - if config["split"] == "train": - datasets_utils.create_image_folder( - root=split_dir, - name="disp_occ_0", - file_name_fn=lambda i: f"{i:06d}.png", - num_examples=num_examples, - # Kitti2015 uses a single channel image for disparities - size=(1, 100, 200), - ) + # We need to create all folders regardless of the split in config + for split in ("train", "val", "test"): + split_idx = split_to_idx[split] + num_examples = split_to_num_examples[split] datasets_utils.create_image_folder( - root=split_dir, - name="disp_occ_1", - file_name_fn=lambda i: f"{i:06d}.png", + root=tmpdir, + name=widerface_dir / f"WIDER_{split}" / "images" / "0--Parade", + file_name_fn=lambda image_idx: f"0_Parade_marchingband_1_{split_idx + image_idx}.jpg", num_examples=num_examples, - # Kitti2015 uses a single channel image for disparities - size=(1, 100, 200), ) - return num_examples + annotation_file_name = { + "train": annotations_dir / "wider_face_train_bbx_gt.txt", + "val": annotations_dir / "wider_face_val_bbx_gt.txt", + "test": annotations_dir / "wider_face_test_filelist.txt", + }[split] - def test_train_splits(self): - for split in ["train"]: - with self.create_dataset(split=split) as (dataset, _): - for left, right, disparity, valid_mask in dataset: - left_array = np.array(left) - right_array = np.array(right) - h, w, c = left_array.shape - # check that left and right are the same size - assert 
left_array.shape == right_array.shape - # check general shapes - assert c == 3 - assert len(disparity.shape) == 2 - assert len(valid_mask.shape) == 2 - assert disparity.shape == (h, w) - # check that valid mask is the same size as the disparity - dh, dw = disparity.shape - mh, mw = valid_mask.shape - assert dh == mh - assert dw == mw + annotation_content = { + "train": "".join( + f"0--Parade/0_Parade_marchingband_1_{split_idx + image_idx}.jpg\n1\n449 330 122 149 0 0 0 0 0 0\n" + for image_idx in range(num_examples) + ), + "val": "".join( + f"0--Parade/0_Parade_marchingband_1_{split_idx + image_idx}.jpg\n1\n501 160 285 443 0 0 0 0 0 0\n" + for image_idx in range(num_examples) + ), + "test": "".join( + f"0--Parade/0_Parade_marchingband_1_{split_idx + image_idx}.jpg\n" + for image_idx in range(num_examples) + ), + }[split] - def test_test_split(self): - for split in ["test"]: - with self.create_dataset(split=split) as (dataset, _): - for left, right, disparity, valid_mask in dataset: - left_array = np.array(left) - right_array = np.array(right) - h, w, c = left_array.shape - # check that left and right are the same size - assert left_array.shape == right_array.shape - # check general shapes - assert c == 3 - assert disparity is None - assert valid_mask is None + with open(annotation_file_name, "w") as annotation_file: + annotation_file.write(annotation_content) - def test_bad_input(self): - with pytest.raises(ValueError, match="Unknown value 'bad' for argument split"): - with self.create_dataset(split="bad"): - pass + return split_to_num_examples[config["split"]] -class StereoSceneFlowTestCase(datasets_utils.ImageDatasetTestCase): - DATASET_CLASS = datasets.StereoSceneFlow - ADDITIONAL_CONFIGS = datasets_utils.combinations_grid( - split=("FlyingThings3D", "Driving", "Monkaa"), - pass_name=("clean", "final") +class CityScapesTestCase(datasets_utils.ImageDatasetTestCase): + DATASET_CLASS = datasets.Cityscapes + TARGET_TYPES = ( + "instance", + "semantic", + "polygon", + "color", ) - FEATURE_TYPES = (PIL.Image.Image, PIL.Image.Image, (np.ndarray, type(None)), (np.ndarray, type(None))) + ADDITIONAL_CONFIGS = ( + *datasets_utils.combinations_grid(mode=("fine",), split=("train", "test", "val"), target_type=TARGET_TYPES), + *datasets_utils.combinations_grid( + mode=("coarse",), + split=("train", "train_extra", "val"), + target_type=TARGET_TYPES, + ), + ) + FEATURE_TYPES = (PIL.Image.Image, (dict, PIL.Image.Image)) - @staticmethod - def _create_pfm_folder(root: str, name: str, file_name_fn: Callable[..., str], num_examples: int, size: Tuple[int, int]) -> List[str]: - root = pathlib.Path(root) / name - os.makedirs(root, exist_ok=True) + def inject_fake_data(self, tmpdir, config): - paths = [] - for i in range(num_examples): - datasets_utils.make_fake_pfm_file(size[0], size[1], root / file_name_fn(i)) - paths.append(str(root / file_name_fn(i))) - return paths + tmpdir = pathlib.Path(tmpdir) - def inject_fake_data(self, tmpdir, config): - scene_flow_dir = pathlib.Path(tmpdir) / "SceneFlow" - os.makedirs(scene_flow_dir, exist_ok=True) + mode_to_splits = { + "Coarse": ["train", "train_extra", "val"], + "Fine": ["train", "test", "val"], + } - split_dir = scene_flow_dir / config["split"] - os.makedirs(split_dir, exist_ok=True) + if config["split"] == "train": # just for coverage of the number of samples + cities = ["bochum", "bremen"] + else: + cities = ["bochum"] - pass_dir_map = { - "clean": "frames_cleanpass", - "final": "frames_finalpass", + polygon_target = { + "imgHeight": 1024, + "imgWidth": 2048, + 
"objects": [ + { + "label": "sky", + "polygon": [ + [1241, 0], + [1234, 156], + [1478, 197], + [1611, 172], + [1606, 0], + ], + }, + { + "label": "road", + "polygon": [ + [0, 448], + [1331, 274], + [1473, 265], + [2047, 605], + [2047, 1023], + [0, 1023], + ], + }, + ], } - num_examples = 1 - pass_dir_name = pass_dir_map[config["pass_name"]] - # create pass directories - pass_dir = split_dir / pass_dir_name - disp_dir = split_dir / "disparity" - os.makedirs(pass_dir, exist_ok=True) - os.makedirs(disp_dir, exist_ok=True) + for mode in ["Coarse", "Fine"]: + gt_dir = tmpdir / f"gt{mode}" + for split in mode_to_splits[mode]: + for city in cities: - for direction in ["left", "right"]: - for scene_idx in range(num_examples): - os.makedirs(pass_dir / f"scene_{scene_idx:06d}", exist_ok=True) + def make_image(name, size=10): + datasets_utils.create_image_folder( + root=gt_dir / split, + name=city, + file_name_fn=lambda _: name, + size=size, + num_examples=1, + ) + + make_image(f"{city}_000000_000000_gt{mode}_instanceIds.png") + make_image(f"{city}_000000_000000_gt{mode}_labelIds.png") + make_image(f"{city}_000000_000000_gt{mode}_color.png", size=(4, 10, 10)) + + polygon_target_name = gt_dir / split / city / f"{city}_000000_000000_gt{mode}_polygons.json" + with open(polygon_target_name, "w") as outfile: + json.dump(polygon_target, outfile) + + # Create leftImg8bit folder + for split in ["test", "train_extra", "train", "val"]: + for city in cities: datasets_utils.create_image_folder( - root=pass_dir / f"scene_{scene_idx:06d}", - name=direction, - file_name_fn=lambda i: f"{i:06d}.png", - num_examples=3, - size=(3, 200, 100), + root=tmpdir / "leftImg8bit" / split, + name=city, + file_name_fn=lambda _: f"{city}_000000_000000_leftImg8bit.png", + num_examples=1, ) - os.makedirs(disp_dir / f"scene_{scene_idx:06d}", exist_ok=True) - self._create_pfm_folder( - root=disp_dir / f"scene_{scene_idx:06d}", - name=direction, - file_name_fn=lambda i: f"{i:06d}.pfm", - num_examples=3, - size=(100, 200), - ) + info = {"num_examples": len(cities)} + if config["target_type"] == "polygon": + info["expected_polygon_target"] = polygon_target + return info - return num_examples * 3 + def test_combined_targets(self): + target_types = ["semantic", "polygon", "color"] - def test_train_splits(self): - for split_name, pass_name in itertools.product(["FlyingThings3D", "Driving", "Monkaa"], ["clean", "final"]): - with self.create_dataset(split=split_name, pass_name=pass_name) as (dataset, _): - for left, right, disparity, valid_mask in dataset: - print(f"Split {split_name} pass {pass_name}") - left_array = np.array(left) - right_array = np.array(right) - h, w, c = left_array.shape - # check that left and right are the same size - assert left_array.shape == right_array.shape - print(left_array.shape) - # check general shapes - assert c == 3 - assert len(disparity.shape) == 3 - assert len(valid_mask.shape) == 3 - assert disparity.shape == (h, w, 3) - # check that valid mask is the same size as the disparity - dh, dw, _ = disparity.shape - mh, mw, _ = valid_mask.shape - assert dh == mh - assert dw == mw + with self.create_dataset(target_type=target_types) as (dataset, _): + output = dataset[0] + assert isinstance(output, tuple) + assert len(output) == 2 + assert isinstance(output[0], PIL.Image.Image) + assert isinstance(output[1], tuple) + assert len(output[1]) == 3 + assert isinstance(output[1][0], PIL.Image.Image) # semantic + assert isinstance(output[1][1], dict) # polygon + assert isinstance(output[1][2], PIL.Image.Image) # color 
+    def test_feature_types_target_color(self):
+        with self.create_dataset(target_type="color") as (dataset, _):
+            color_img, color_target = dataset[0]
+            assert isinstance(color_img, PIL.Image.Image)
+            assert np.array(color_target).shape[2] == 4

-class StereoFallingThingsTestCase(datasets_utils.ImageDatasetTestCase):
-    DATASET_CLASS = datasets.StereoFallingThings
-    ADDITIONAL_CONFIGS = datasets_utils.combinations_grid(split=("single", "mixed"))
-    FEATURE_TYPES = (PIL.Image.Image, PIL.Image.Image, (np.ndarray, type(None)), (np.ndarray, type(None)))
+    def test_feature_types_target_polygon(self):
+        with self.create_dataset(target_type="polygon") as (dataset, info):
+            polygon_img, polygon_target = dataset[0]
+            assert isinstance(polygon_img, PIL.Image.Image)
+            assert polygon_target == info["expected_polygon_target"]

-    @staticmethod
-    def _make_scene_folder(root: str, scene_name: str, num_examples: int, size: Tuple[int, int]):
-        root = pathlib.Path(root) / scene_name
-        os.makedirs(root, exist_ok=True)
-        datasets_utils.create_image_file(root, "image1.left.jpg", size=(3, size[0], size[1]))
-        datasets_utils.create_image_file(root, "image1.right.jpg", size=(3, size[0], size[1]))
-        # single channel depth maps
-        datasets_utils.create_image_file(root, "image1.left.depth.jpg", size=(1, size[0], size[1]))
-        datasets_utils.create_image_file(root, "image1.right.depth.jpg", size=(1, size[0], size[1]))

+class ImageNetTestCase(datasets_utils.ImageDatasetTestCase):
+    DATASET_CLASS = datasets.ImageNet
+    REQUIRED_PACKAGES = ("scipy",)
+    ADDITIONAL_CONFIGS = datasets_utils.combinations_grid(split=("train", "val"))

     def inject_fake_data(self, tmpdir, config):
-        fallingthings_dir = pathlib.Path(tmpdir) / "FallingThings"
-
-        split_dir = pathlib.Path(fallingthings_dir) / config["split"]
-        os.makedirs(split_dir, exist_ok=True)
-
-        num_examples = 4
+        tmpdir = pathlib.Path(tmpdir)

-        for i in range(num_examples):
-            self._make_scene_folder(
-                root=split_dir,
-                scene_name=f"scene_{i:06d}",
+        wnid = "n01234567"
+        if config["split"] == "train":
+            num_examples = 3
+            datasets_utils.create_image_folder(
+                root=tmpdir,
+                name=tmpdir / "train" / wnid / wnid,
+                file_name_fn=lambda image_idx: f"{wnid}_{image_idx}.JPEG",
+                num_examples=num_examples,
+            )
+        else:
+            num_examples = 1
+            datasets_utils.create_image_folder(
+                root=tmpdir,
+                name=tmpdir / "val" / wnid,
+                file_name_fn=lambda image_idx: f"ILSVRC2012_val_0000000{image_idx}.JPEG",
                 num_examples=num_examples,
-                size=(100, 200),
             )
+        wnid_to_classes = {wnid: [1]}
+        torch.save((wnid_to_classes, None), tmpdir / "meta.bin")
         return num_examples

-    def test_splits(self):
-        for split_name in ["single", "mixed"]:
-            with self.create_dataset(split=split_name) as (dataset, _):
-                for left, right, disparity, valid_mask in dataset:
-                    print(f"Split {split_name}")
-                    left_array = np.array(left)
-                    right_array = np.array(right)
-                    h, w, c = left_array.shape
-                    # check that left and right are the same size
-                    assert left_array.shape == right_array.shape
-                    print(left_array.shape)
-                    # check general shapes
-                    assert c == 3
-                    assert len(disparity.shape) == 3
-                    assert len(valid_mask.shape) == 2
-                    assert disparity.shape == (h, w)
-                    # check that valid mask is the same size as the disparity
-                    dh, dw = disparity.shape
-                    mh, mw = valid_mask.shape
-                    assert dh == mh
-                    assert dw == mw

+class CIFAR10TestCase(datasets_utils.ImageDatasetTestCase):
+    DATASET_CLASS = datasets.CIFAR10
+    ADDITIONAL_CONFIGS = datasets_utils.combinations_grid(train=(True, False))

-class STL10TestCase(datasets_utils.ImageDatasetTestCase):
-    DATASET_CLASS = 
datasets.STL10 - ADDITIONAL_CONFIGS = datasets_utils.combinations_grid(split=("train", "test", "unlabeled", "train+unlabeled")) + _VERSION_CONFIG = dict( + base_folder="cifar-10-batches-py", + train_files=tuple(f"data_batch_{idx}" for idx in range(1, 6)), + test_files=("test_batch",), + labels_key="labels", + meta_file="batches.meta", + num_categories=10, + categories_key="label_names", + ) - @ staticmethod - def _make_binary_file(num_elements, root, name): - file_name = os.path.join(root, name) - np.zeros(num_elements, dtype=np.uint8).tofile(file_name) + def inject_fake_data(self, tmpdir, config): + tmpdir = pathlib.Path(tmpdir) / self._VERSION_CONFIG["base_folder"] + os.makedirs(tmpdir) - @ staticmethod - def _make_image_file(num_images, root, name, num_channels=3, height=96, width=96): - STL10TestCase._make_binary_file(num_images * num_channels * height * width, root, name) + num_images_per_file = 1 + for name in itertools.chain(self._VERSION_CONFIG["train_files"], self._VERSION_CONFIG["test_files"]): + self._create_batch_file(tmpdir, name, num_images_per_file) - @ staticmethod - def _make_label_file(num_images, root, name): - STL10TestCase._make_binary_file(num_images, root, name) + categories = self._create_meta_file(tmpdir) - @ staticmethod - def _make_class_names_file(root, name="class_names.txt"): - with open(os.path.join(root, name), "w") as fh: - for cname in ("airplane", "bird"): - fh.write(f"{cname}\n") + return dict( + num_examples=num_images_per_file + * len(self._VERSION_CONFIG["train_files"] if config["train"] else self._VERSION_CONFIG["test_files"]), + categories=categories, + ) - @ staticmethod - def _make_fold_indices_file(root): - num_folds = 10 - offset = 0 - with open(os.path.join(root, "fold_indices.txt"), "w") as fh: - for fold in range(num_folds): - line = " ".join([str(idx) for idx in range(offset, offset + fold + 1)]) - fh.write(f"{line}\n") - offset += fold + 1 + def _create_batch_file(self, root, name, num_images): + np_rng = np.random.RandomState(0) + data = datasets_utils.create_image_or_video_tensor((num_images, 32 * 32 * 3)) + labels = np_rng.randint(0, self._VERSION_CONFIG["num_categories"], size=num_images).tolist() + self._create_binary_file(root, name, {"data": data, self._VERSION_CONFIG["labels_key"]: labels}) - return tuple(range(1, num_folds + 1)) + def _create_meta_file(self, root): + categories = [ + f"{idx:0{len(str(self._VERSION_CONFIG['num_categories'] - 1))}d}" + for idx in range(self._VERSION_CONFIG["num_categories"]) + ] + self._create_binary_file( + root, self._VERSION_CONFIG["meta_file"], {self._VERSION_CONFIG["categories_key"]: categories} + ) + return categories - @ staticmethod - def _make_train_files(root, num_unlabeled_images=1): - num_images_in_fold = STL10TestCase._make_fold_indices_file(root) - num_train_images = sum(num_images_in_fold) + def _create_binary_file(self, root, name, content): + with open(pathlib.Path(root) / name, "wb") as fh: + pickle.dump(content, fh) - STL10TestCase._make_image_file(num_train_images, root, "train_X.bin") - STL10TestCase._make_label_file(num_train_images, root, "train_y.bin") - STL10TestCase._make_image_file(1, root, "unlabeled_X.bin") + def test_class_to_idx(self): + with self.create_dataset() as (dataset, info): + expected = {category: label for label, category in enumerate(info["categories"])} + actual = dataset.class_to_idx + assert actual == expected - return dict(train=num_train_images, unlabeled=num_unlabeled_images) - @ staticmethod - def _make_test_files(root, num_images=2): - 
STL10TestCase._make_image_file(num_images, root, "test_X.bin") - STL10TestCase._make_label_file(num_images, root, "test_y.bin") +class CIFAR100(CIFAR10TestCase): + DATASET_CLASS = datasets.CIFAR100 - return dict(test=num_images) + _VERSION_CONFIG = dict( + base_folder="cifar-100-python", + train_files=("train",), + test_files=("test",), + labels_key="fine_labels", + meta_file="meta", + num_categories=100, + categories_key="fine_label_names", + ) - def inject_fake_data(self, tmpdir, config): - root_folder = os.path.join(tmpdir, "stl10_binary") - os.mkdir(root_folder) - num_images_in_split = self._make_train_files(root_folder) - num_images_in_split.update(self._make_test_files(root_folder)) - self._make_class_names_file(root_folder) +class CelebATestCase(datasets_utils.ImageDatasetTestCase): + DATASET_CLASS = datasets.CelebA + FEATURE_TYPES = (PIL.Image.Image, (torch.Tensor, int, tuple, type(None))) - return sum(num_images_in_split[part] for part in config["split"].split("+")) + ADDITIONAL_CONFIGS = datasets_utils.combinations_grid( + split=("train", "valid", "test", "all"), + target_type=("attr", "identity", "bbox", "landmarks", ["attr", "identity"]), + ) - def test_folds(self): - for fold in range(10): - with self.create_dataset(split="train", folds=fold) as (dataset, _): - assert len(dataset) == fold + 1 + _SPLIT_TO_IDX = dict(train=0, valid=1, test=2) - def test_unlabeled(self): - with self.create_dataset(split="unlabeled") as (dataset, _): - labels = [dataset[idx][1] for idx in range(len(dataset))] - assert all(label == -1 for label in labels) + def inject_fake_data(self, tmpdir, config): + base_folder = pathlib.Path(tmpdir) / "celeba" + os.makedirs(base_folder) - def test_invalid_folds1(self): - with pytest.raises(ValueError): - with self.create_dataset(folds=10): - pass + num_images, num_images_per_split = self._create_split_txt(base_folder) - def test_invalid_folds2(self): - with pytest.raises(ValueError): - with self.create_dataset(folds="0"): - pass + datasets_utils.create_image_folder( + base_folder, "img_align_celeba", lambda idx: f"{idx + 1:06d}.jpg", num_images + ) + attr_names = self._create_attr_txt(base_folder, num_images) + self._create_identity_txt(base_folder, num_images) + self._create_bbox_txt(base_folder, num_images) + self._create_landmarks_txt(base_folder, num_images) + return dict(num_examples=num_images_per_split[config["split"]], attr_names=attr_names) -class Caltech101TestCase(datasets_utils.ImageDatasetTestCase): - DATASET_CLASS = datasets.Caltech101 - FEATURE_TYPES = (PIL.Image.Image, (int, np.ndarray, tuple)) + def _create_split_txt(self, root): + num_images_per_split = dict(train=4, valid=3, test=2) - ADDITIONAL_CONFIGS = datasets_utils.combinations_grid( - target_type=("category", "annotation", ["category", "annotation"]) - ) - REQUIRED_PACKAGES = ("scipy",) + data = [ + [self._SPLIT_TO_IDX[split]] for split, num_images in num_images_per_split.items() for _ in range(num_images) + ] + self._create_txt(root, "list_eval_partition.txt", data) - def inject_fake_data(self, tmpdir, config): - root = pathlib.Path(tmpdir) / "caltech101" - images = root / "101_ObjectCategories" - annotations = root / "Annotations" + num_images_per_split["all"] = num_images = sum(num_images_per_split.values()) + return num_images, num_images_per_split - categories = (("Faces", "Faces_2"), ("helicopter", "helicopter"), ("ying_yang", "ying_yang")) - num_images_per_category = 2 + def _create_attr_txt(self, root, num_images): + header = ("5_o_Clock_Shadow", "Young") + data = 
torch.rand((num_images, len(header))).ge(0.5).int().mul(2).sub(1).tolist() + self._create_txt(root, "list_attr_celeba.txt", data, header=header, add_num_examples=True) + return header - for image_category, annotation_category in categories: - datasets_utils.create_image_folder( - root=images, - name=image_category, - file_name_fn=lambda idx: f"image_{idx + 1:04d}.jpg", - num_examples=num_images_per_category, - ) - self._create_annotation_folder( - root=annotations, - name=annotation_category, - file_name_fn=lambda idx: f"annotation_{idx + 1:04d}.mat", - num_examples=num_images_per_category, - ) + def _create_identity_txt(self, root, num_images): + data = torch.randint(1, 4, size=(num_images, 1)).tolist() + self._create_txt(root, "identity_CelebA.txt", data) - # This is included in the original archive, but is removed by the dataset. Thus, an empty directory suffices. - os.makedirs(images / "BACKGROUND_Google") + def _create_bbox_txt(self, root, num_images): + header = ("x_1", "y_1", "width", "height") + data = torch.randint(10, size=(num_images, len(header))).tolist() + self._create_txt( + root, "list_bbox_celeba.txt", data, header=header, add_num_examples=True, add_image_id_to_header=True + ) - return num_images_per_category * len(categories) + def _create_landmarks_txt(self, root, num_images): + header = ("lefteye_x", "rightmouth_y") + data = torch.randint(10, size=(num_images, len(header))).tolist() + self._create_txt(root, "list_landmarks_align_celeba.txt", data, header=header, add_num_examples=True) - def _create_annotation_folder(self, root, name, file_name_fn, num_examples): - root = pathlib.Path(root) / name - os.makedirs(root) + def _create_txt(self, root, name, data, header=None, add_num_examples=False, add_image_id_to_header=False): + with open(pathlib.Path(root) / name, "w") as fh: + if add_num_examples: + fh.write(f"{len(data)}\n") - for idx in range(num_examples): - self._create_annotation_file(root, file_name_fn(idx)) + if header: + if add_image_id_to_header: + header = ("image_id", *header) + fh.write(f"{' '.join(header)}\n") - def _create_annotation_file(self, root, name): - mdict = dict(obj_contour=torch.rand((2, torch.randint(3, 6, size=())), dtype=torch.float64).numpy()) - datasets_utils.lazy_importer.scipy.io.savemat(str(pathlib.Path(root) / name), mdict) + for idx, line in enumerate(data, 1): + fh.write(f"{' '.join((f'{idx:06d}.jpg', *[str(value) for value in line]))}\n") def test_combined_targets(self): - target_types = ["category", "annotation"] + target_types = ["attr", "identity", "bbox", "landmarks"] individual_targets = [] for target_type in target_types: @@ -746,1062 +586,659 @@ def test_combined_targets(self): ), "Type of the combined target does not match the type of the corresponding individual target: " f"{actual} is not {expected}", + def test_no_target(self): + with self.create_dataset(target_type=[]) as (dataset, _): + _, target = dataset[0] -class Caltech256TestCase(datasets_utils.ImageDatasetTestCase): - DATASET_CLASS = datasets.Caltech256 + assert target is None - def inject_fake_data(self, tmpdir, config): - tmpdir = pathlib.Path(tmpdir) / "caltech256" / "256_ObjectCategories" + def test_attr_names(self): + with self.create_dataset() as (dataset, info): + assert tuple(dataset.attr_names) == info["attr_names"] - categories = ((1, "ak47"), (127, "laptop-101"), (257, "clutter")) - num_images_per_category = 2 + def test_images_names_split(self): + with self.create_dataset(split="all") as (dataset, _): + all_imgs_names = set(dataset.filename) - for idx, 
category in categories: - datasets_utils.create_image_folder( - tmpdir, - name=f"{idx:03d}.{category}", - file_name_fn=lambda image_idx: f"{idx:03d}_{image_idx + 1:04d}.jpg", - num_examples=num_images_per_category, - ) + merged_imgs_names = set() + for split in ["train", "valid", "test"]: + with self.create_dataset(split=split) as (dataset, _): + merged_imgs_names.update(dataset.filename) - return num_images_per_category * len(categories) + assert merged_imgs_names == all_imgs_names -class WIDERFaceTestCase(datasets_utils.ImageDatasetTestCase): - DATASET_CLASS = datasets.WIDERFace - FEATURE_TYPES = (PIL.Image.Image, (dict, type(None))) # test split returns None as target - ADDITIONAL_CONFIGS = datasets_utils.combinations_grid(split=("train", "val", "test")) +class VOCSegmentationTestCase(datasets_utils.ImageDatasetTestCase): + DATASET_CLASS = datasets.VOCSegmentation + FEATURE_TYPES = (PIL.Image.Image, PIL.Image.Image) + + ADDITIONAL_CONFIGS = ( + *datasets_utils.combinations_grid( + year=[f"20{year:02d}" for year in range(7, 13)], image_set=("train", "val", "trainval") + ), + dict(year="2007", image_set="test"), + dict(year="2007-test", image_set="test"), + ) def inject_fake_data(self, tmpdir, config): - widerface_dir = pathlib.Path(tmpdir) / "widerface" - annotations_dir = widerface_dir / "wider_face_split" - os.makedirs(annotations_dir) + year, is_test_set = ( + ("2007", True) + if config["year"] == "2007-test" or config["image_set"] == "test" + else (config["year"], False) + ) + image_set = config["image_set"] - split_to_idx = split_to_num_examples = { - "train": 1, - "val": 2, - "test": 3, - } + base_dir = pathlib.Path(tmpdir) + if year == "2011": + base_dir /= "TrainVal" + base_dir = base_dir / "VOCdevkit" / f"VOC{year}" + os.makedirs(base_dir) - # We need to create all folders regardless of the split in config - for split in ("train", "val", "test"): - split_idx = split_to_idx[split] - num_examples = split_to_num_examples[split] + num_images, num_images_per_image_set = self._create_image_set_files(base_dir, "ImageSets", is_test_set) + datasets_utils.create_image_folder(base_dir, "JPEGImages", lambda idx: f"{idx:06d}.jpg", num_images) - datasets_utils.create_image_folder( - root=tmpdir, - name=widerface_dir / f"WIDER_{split}" / "images" / "0--Parade", - file_name_fn=lambda image_idx: f"0_Parade_marchingband_1_{split_idx + image_idx}.jpg", - num_examples=num_examples, - ) + datasets_utils.create_image_folder(base_dir, "SegmentationClass", lambda idx: f"{idx:06d}.png", num_images) + annotation = self._create_annotation_files(base_dir, "Annotations", num_images) - annotation_file_name = { - "train": annotations_dir / "wider_face_train_bbx_gt.txt", - "val": annotations_dir / "wider_face_val_bbx_gt.txt", - "test": annotations_dir / "wider_face_test_filelist.txt", - }[split] + return dict(num_examples=num_images_per_image_set[image_set], annotation=annotation) - annotation_content = { - "train": "".join( - f"0--Parade/0_Parade_marchingband_1_{split_idx + image_idx}.jpg\n1\n449 330 122 149 0 0 0 0 0 0\n" - for image_idx in range(num_examples) - ), - "val": "".join( - f"0--Parade/0_Parade_marchingband_1_{split_idx + image_idx}.jpg\n1\n501 160 285 443 0 0 0 0 0 0\n" - for image_idx in range(num_examples) - ), - "test": "".join( - f"0--Parade/0_Parade_marchingband_1_{split_idx + image_idx}.jpg\n" - for image_idx in range(num_examples) - ), - }[split] + def _create_image_set_files(self, root, name, is_test_set): + root = pathlib.Path(root) / name + src = pathlib.Path(root) / "Main" + 
os.makedirs(src, exist_ok=True) - with open(annotation_file_name, "w") as annotation_file: - annotation_file.write(annotation_content) + idcs = dict(train=(0, 1, 2), val=(3, 4), test=(5,)) + idcs["trainval"] = (*idcs["train"], *idcs["val"]) - return split_to_num_examples[config["split"]] + for image_set in ("test",) if is_test_set else ("train", "val", "trainval"): + self._create_image_set_file(src, image_set, idcs[image_set]) + shutil.copytree(src, root / "Segmentation") -class CityScapesTestCase(datasets_utils.ImageDatasetTestCase): - DATASET_CLASS = datasets.Cityscapes - TARGET_TYPES = ( - "instance", - "semantic", - "polygon", - "color", - ) - ADDITIONAL_CONFIGS = ( - *datasets_utils.combinations_grid(mode=("fine",), split=("train", "test", "val"), target_type=TARGET_TYPES), - *datasets_utils.combinations_grid( - mode=("coarse",), - split=("train", "train_extra", "val"), - target_type=TARGET_TYPES, - ), - ) - FEATURE_TYPES = (PIL.Image.Image, (dict, PIL.Image.Image)) + num_images = max(itertools.chain(*idcs.values())) + 1 + num_images_per_image_set = {image_set: len(idcs_) for image_set, idcs_ in idcs.items()} + return num_images, num_images_per_image_set - def inject_fake_data(self, tmpdir, config): + def _create_image_set_file(self, root, image_set, idcs): + with open(pathlib.Path(root) / f"{image_set}.txt", "w") as fh: + fh.writelines([f"{idx:06d}\n" for idx in idcs]) - tmpdir = pathlib.Path(tmpdir) + def _create_annotation_files(self, root, name, num_images): + root = pathlib.Path(root) / name + os.makedirs(root) - mode_to_splits = { - "Coarse": ["train", "train_extra", "val"], - "Fine": ["train", "test", "val"], - } + for idx in range(num_images): + annotation = self._create_annotation_file(root, f"{idx:06d}.xml") - if config["split"] == "train": # just for coverage of the number of samples - cities = ["bochum", "bremen"] - else: - cities = ["bochum"] + return annotation - polygon_target = { - "imgHeight": 1024, - "imgWidth": 2048, - "objects": [ - { - "label": "sky", - "polygon": [ - [1241, 0], - [1234, 156], - [1478, 197], - [1611, 172], - [1606, 0], - ], - }, - { - "label": "road", - "polygon": [ - [0, 448], - [1331, 274], - [1473, 265], - [2047, 605], - [2047, 1023], - [0, 1023], - ], - }, - ], - } + def _create_annotation_file(self, root, name): + def add_child(parent, name, text=None): + child = ET.SubElement(parent, name) + child.text = text + return child - for mode in ["Coarse", "Fine"]: - gt_dir = tmpdir / f"gt{mode}" - for split in mode_to_splits[mode]: - for city in cities: + def add_name(obj, name="dog"): + add_child(obj, "name", name) + return name - def make_image(name, size=10): - datasets_utils.create_image_folder( - root=gt_dir / split, - name=city, - file_name_fn=lambda _: name, - size=size, - num_examples=1, - ) + def add_bndbox(obj, bndbox=None): + if bndbox is None: + bndbox = {"xmin": "1", "xmax": "2", "ymin": "3", "ymax": "4"} - make_image(f"{city}_000000_000000_gt{mode}_instanceIds.png") - make_image(f"{city}_000000_000000_gt{mode}_labelIds.png") - make_image(f"{city}_000000_000000_gt{mode}_color.png", size=(4, 10, 10)) + obj = add_child(obj, "bndbox") + for name, text in bndbox.items(): + add_child(obj, name, text) - polygon_target_name = gt_dir / split / city / f"{city}_000000_000000_gt{mode}_polygons.json" - with open(polygon_target_name, "w") as outfile: - json.dump(polygon_target, outfile) + return bndbox - # Create leftImg8bit folder - for split in ["test", "train_extra", "train", "val"]: - for city in cities: - datasets_utils.create_image_folder( - 
root=tmpdir / "leftImg8bit" / split, - name=city, - file_name_fn=lambda _: f"{city}_000000_000000_leftImg8bit.png", - num_examples=1, - ) + annotation = ET.Element("annotation") + obj = add_child(annotation, "object") + data = dict(name=add_name(obj), bndbox=add_bndbox(obj)) - info = {"num_examples": len(cities)} - if config["target_type"] == "polygon": - info["expected_polygon_target"] = polygon_target - return info + with open(pathlib.Path(root) / name, "wb") as fh: + fh.write(ET.tostring(annotation)) - def test_combined_targets(self): - target_types = ["semantic", "polygon", "color"] + return data - with self.create_dataset(target_type=target_types) as (dataset, _): - output = dataset[0] - assert isinstance(output, tuple) - assert len(output) == 2 - assert isinstance(output[0], PIL.Image.Image) - assert isinstance(output[1], tuple) - assert len(output[1]) == 3 - assert isinstance(output[1][0], PIL.Image.Image) # semantic - assert isinstance(output[1][1], dict) # polygon - assert isinstance(output[1][2], PIL.Image.Image) # color - def test_feature_types_target_color(self): - with self.create_dataset(target_type="color") as (dataset, _): - color_img, color_target = dataset[0] - assert isinstance(color_img, PIL.Image.Image) - assert np.array(color_target).shape[2] == 4 +class VOCDetectionTestCase(VOCSegmentationTestCase): + DATASET_CLASS = datasets.VOCDetection + FEATURE_TYPES = (PIL.Image.Image, dict) - def test_feature_types_target_polygon(self): - with self.create_dataset(target_type="polygon") as (dataset, info): - polygon_img, polygon_target = dataset[0] - assert isinstance(polygon_img, PIL.Image.Image) - (polygon_target, info["expected_polygon_target"]) + def test_annotations(self): + with self.create_dataset() as (dataset, info): + _, target = dataset[0] + assert "annotation" in target + annotation = target["annotation"] -class ImageNetTestCase(datasets_utils.ImageDatasetTestCase): - DATASET_CLASS = datasets.ImageNet - REQUIRED_PACKAGES = ("scipy",) - ADDITIONAL_CONFIGS = datasets_utils.combinations_grid(split=("train", "val")) + assert "object" in annotation + objects = annotation["object"] - def inject_fake_data(self, tmpdir, config): - tmpdir = pathlib.Path(tmpdir) + assert len(objects) == 1 + object = objects[0] - wnid = "n01234567" - if config["split"] == "train": - num_examples = 3 - datasets_utils.create_image_folder( - root=tmpdir, - name=tmpdir / "train" / wnid / wnid, - file_name_fn=lambda image_idx: f"{wnid}_{image_idx}.JPEG", - num_examples=num_examples, - ) - else: - num_examples = 1 - datasets_utils.create_image_folder( - root=tmpdir, - name=tmpdir / "val" / wnid, - file_name_fn=lambda image_ifx: "ILSVRC2012_val_0000000{image_idx}.JPEG", - num_examples=num_examples, - ) + assert object == info["annotation"] - wnid_to_classes = {wnid: [1]} - torch.save((wnid_to_classes, None), tmpdir / "meta.bin") - return num_examples +class CocoDetectionTestCase(datasets_utils.ImageDatasetTestCase): + DATASET_CLASS = datasets.CocoDetection + FEATURE_TYPES = (PIL.Image.Image, list) -class CIFAR10TestCase(datasets_utils.ImageDatasetTestCase): - DATASET_CLASS = datasets.CIFAR10 - ADDITIONAL_CONFIGS = datasets_utils.combinations_grid(train=(True, False)) + REQUIRED_PACKAGES = ("pycocotools",) - _VERSION_CONFIG = dict( - base_folder="cifar-10-batches-py", - train_files=tuple(f"data_batch_{idx}" for idx in range(1, 6)), - test_files=("test_batch",), - labels_key="labels", - meta_file="batches.meta", - num_categories=10, - categories_key="label_names", - ) + _IMAGE_FOLDER = "images" + 
_ANNOTATIONS_FOLDER = "annotations" + _ANNOTATIONS_FILE = "annotations.json" - def inject_fake_data(self, tmpdir, config): - tmpdir = pathlib.Path(tmpdir) / self._VERSION_CONFIG["base_folder"] - os.makedirs(tmpdir) + def dataset_args(self, tmpdir, config): + tmpdir = pathlib.Path(tmpdir) + root = tmpdir / self._IMAGE_FOLDER + annotation_file = tmpdir / self._ANNOTATIONS_FOLDER / self._ANNOTATIONS_FILE + return root, annotation_file - num_images_per_file = 1 - for name in itertools.chain(self._VERSION_CONFIG["train_files"], self._VERSION_CONFIG["test_files"]): - self._create_batch_file(tmpdir, name, num_images_per_file) + def inject_fake_data(self, tmpdir, config): + tmpdir = pathlib.Path(tmpdir) - categories = self._create_meta_file(tmpdir) + num_images = 3 + num_annotations_per_image = 2 - return dict( - num_examples=num_images_per_file - * len(self._VERSION_CONFIG["train_files"] if config["train"] else self._VERSION_CONFIG["test_files"]), - categories=categories, + files = datasets_utils.create_image_folder( + tmpdir, name=self._IMAGE_FOLDER, file_name_fn=lambda idx: f"{idx:012d}.jpg", num_examples=num_images ) + file_names = [file.relative_to(tmpdir / self._IMAGE_FOLDER) for file in files] - def _create_batch_file(self, root, name, num_images): - np_rng = np.random.RandomState(0) - data = datasets_utils.create_image_or_video_tensor((num_images, 32 * 32 * 3)) - labels = np_rng.randint(0, self._VERSION_CONFIG["num_categories"], size=num_images).tolist() - self._create_binary_file(root, name, {"data": data, self._VERSION_CONFIG["labels_key"]: labels}) + annotation_folder = tmpdir / self._ANNOTATIONS_FOLDER + os.makedirs(annotation_folder) + info = self._create_annotation_file( + annotation_folder, self._ANNOTATIONS_FILE, file_names, num_annotations_per_image + ) - def _create_meta_file(self, root): - categories = [ - f"{idx:0{len(str(self._VERSION_CONFIG['num_categories'] - 1))}d}" - for idx in range(self._VERSION_CONFIG["num_categories"]) - ] - self._create_binary_file( - root, self._VERSION_CONFIG["meta_file"], {self._VERSION_CONFIG["categories_key"]: categories} + info["num_examples"] = num_images + return info + + def _create_annotation_file(self, root, name, file_names, num_annotations_per_image): + image_ids = [int(file_name.stem) for file_name in file_names] + images = [dict(file_name=str(file_name), id=id) for file_name, id in zip(file_names, image_ids)] + + annotations, info = self._create_annotations(image_ids, num_annotations_per_image) + self._create_json(root, name, dict(images=images, annotations=annotations)) + + return info + + def _create_annotations(self, image_ids, num_annotations_per_image): + annotations = datasets_utils.combinations_grid( + image_id=image_ids, bbox=([1.0, 2.0, 3.0, 4.0],) * num_annotations_per_image ) - return categories + for id, annotation in enumerate(annotations): + annotation["id"] = id + return annotations, dict() - def _create_binary_file(self, root, name, content): - with open(pathlib.Path(root) / name, "wb") as fh: - pickle.dump(content, fh) + def _create_json(self, root, name, content): + file = pathlib.Path(root) / name + with open(file, "w") as fh: + json.dump(content, fh) + return file - def test_class_to_idx(self): - with self.create_dataset() as (dataset, info): - expected = {category: label for label, category in enumerate(info["categories"])} - actual = dataset.class_to_idx - assert actual == expected +class CocoCaptionsTestCase(CocoDetectionTestCase): + DATASET_CLASS = datasets.CocoCaptions -class CIFAR100(CIFAR10TestCase): - 
DATASET_CLASS = datasets.CIFAR100 + def _create_annotations(self, image_ids, num_annotations_per_image): + captions = [str(idx) for idx in range(num_annotations_per_image)] + annotations = datasets_utils.combinations_grid(image_id=image_ids, caption=captions) + for id, annotation in enumerate(annotations): + annotation["id"] = id + return annotations, dict(captions=captions) - _VERSION_CONFIG = dict( - base_folder="cifar-100-python", - train_files=("train",), - test_files=("test",), - labels_key="fine_labels", - meta_file="meta", - num_categories=100, - categories_key="fine_label_names", - ) + def test_captions(self): + with self.create_dataset() as (dataset, info): + _, captions = dataset[0] + assert tuple(captions) == tuple(info["captions"]) -class CelebATestCase(datasets_utils.ImageDatasetTestCase): - DATASET_CLASS = datasets.CelebA - FEATURE_TYPES = (PIL.Image.Image, (torch.Tensor, int, tuple, type(None))) +class UCF101TestCase(datasets_utils.VideoDatasetTestCase): + DATASET_CLASS = datasets.UCF101 - ADDITIONAL_CONFIGS = datasets_utils.combinations_grid( - split=("train", "valid", "test", "all"), - target_type=("attr", "identity", "bbox", "landmarks", ["attr", "identity"]), - ) + ADDITIONAL_CONFIGS = datasets_utils.combinations_grid(fold=(1, 2, 3), train=(True, False)) - _SPLIT_TO_IDX = dict(train=0, valid=1, test=2) + _VIDEO_FOLDER = "videos" + _ANNOTATIONS_FOLDER = "annotations" + + def dataset_args(self, tmpdir, config): + tmpdir = pathlib.Path(tmpdir) + root = tmpdir / self._VIDEO_FOLDER + annotation_path = tmpdir / self._ANNOTATIONS_FOLDER + return root, annotation_path def inject_fake_data(self, tmpdir, config): - base_folder = pathlib.Path(tmpdir) / "celeba" - os.makedirs(base_folder) + tmpdir = pathlib.Path(tmpdir) - num_images, num_images_per_split = self._create_split_txt(base_folder) + video_folder = tmpdir / self._VIDEO_FOLDER + os.makedirs(video_folder) + video_files = self._create_videos(video_folder) - datasets_utils.create_image_folder( - base_folder, "img_align_celeba", lambda idx: f"{idx + 1:06d}.jpg", num_images - ) - attr_names = self._create_attr_txt(base_folder, num_images) - self._create_identity_txt(base_folder, num_images) - self._create_bbox_txt(base_folder, num_images) - self._create_landmarks_txt(base_folder, num_images) + annotations_folder = tmpdir / self._ANNOTATIONS_FOLDER + os.makedirs(annotations_folder) + num_examples = self._create_annotation_files(annotations_folder, video_files, config["fold"], config["train"]) - return dict(num_examples=num_images_per_split[config["split"]], attr_names=attr_names) + return num_examples - def _create_split_txt(self, root): - num_images_per_split = dict(train=4, valid=3, test=2) + def _create_videos(self, root, num_examples_per_class=3): + def file_name_fn(cls, idx, clips_per_group=2): + return f"v_{cls}_g{(idx // clips_per_group) + 1:02d}_c{(idx % clips_per_group) + 1:02d}.avi" - data = [ - [self._SPLIT_TO_IDX[split]] for split, num_images in num_images_per_split.items() for _ in range(num_images) + video_files = [ + datasets_utils.create_video_folder(root, cls, lambda idx: file_name_fn(cls, idx), num_examples_per_class) + for cls in ("ApplyEyeMakeup", "YoYo") ] - self._create_txt(root, "list_eval_partition.txt", data) - - num_images_per_split["all"] = num_images = sum(num_images_per_split.values()) - return num_images, num_images_per_split + return [path.relative_to(root) for path in itertools.chain(*video_files)] - def _create_attr_txt(self, root, num_images): - header = ("5_o_Clock_Shadow", "Young") - data = 
torch.rand((num_images, len(header))).ge(0.5).int().mul(2).sub(1).tolist() - self._create_txt(root, "list_attr_celeba.txt", data, header=header, add_num_examples=True) - return header + def _create_annotation_files(self, root, video_files, fold, train): + current_videos = random.sample(video_files, random.randrange(1, len(video_files) - 1)) + current_annotation = self._annotation_file_name(fold, train) + self._create_annotation_file(root, current_annotation, current_videos) - def _create_identity_txt(self, root, num_images): - data = torch.randint(1, 4, size=(num_images, 1)).tolist() - self._create_txt(root, "identity_CelebA.txt", data) + other_videos = set(video_files) - set(current_videos) + other_annotations = [ + self._annotation_file_name(fold, train) for fold, train in itertools.product((1, 2, 3), (True, False)) + ] + other_annotations.remove(current_annotation) + for name in other_annotations: + self._create_annotation_file(root, name, other_videos) - def _create_bbox_txt(self, root, num_images): - header = ("x_1", "y_1", "width", "height") - data = torch.randint(10, size=(num_images, len(header))).tolist() - self._create_txt( - root, "list_bbox_celeba.txt", data, header=header, add_num_examples=True, add_image_id_to_header=True - ) + return len(current_videos) - def _create_landmarks_txt(self, root, num_images): - header = ("lefteye_x", "rightmouth_y") - data = torch.randint(10, size=(num_images, len(header))).tolist() - self._create_txt(root, "list_landmarks_align_celeba.txt", data, header=header, add_num_examples=True) + def _annotation_file_name(self, fold, train): + return f"{'train' if train else 'test'}list{fold:02d}.txt" - def _create_txt(self, root, name, data, header=None, add_num_examples=False, add_image_id_to_header=False): + def _create_annotation_file(self, root, name, video_files): with open(pathlib.Path(root) / name, "w") as fh: - if add_num_examples: - fh.write(f"{len(data)}\n") + fh.writelines(f"{str(file).replace(os.sep, '/')}\n" for file in sorted(video_files)) - if header: - if add_image_id_to_header: - header = ("image_id", *header) - fh.write(f"{' '.join(header)}\n") - for idx, line in enumerate(data, 1): - fh.write(f"{' '.join((f'{idx:06d}.jpg', *[str(value) for value in line]))}\n") +class LSUNTestCase(datasets_utils.ImageDatasetTestCase): + DATASET_CLASS = datasets.LSUN - def test_combined_targets(self): - target_types = ["attr", "identity", "bbox", "landmarks"] + REQUIRED_PACKAGES = ("lmdb",) + ADDITIONAL_CONFIGS = datasets_utils.combinations_grid( + classes=("train", "test", "val", ["bedroom_train", "church_outdoor_train"]) + ) - individual_targets = [] - for target_type in target_types: - with self.create_dataset(target_type=target_type) as (dataset, _): - _, target = dataset[0] - individual_targets.append(target) + _CATEGORIES = ( + "bedroom", + "bridge", + "church_outdoor", + "classroom", + "conference_room", + "dining_room", + "kitchen", + "living_room", + "restaurant", + "tower", + ) - with self.create_dataset(target_type=target_types) as (dataset, _): - _, combined_targets = dataset[0] + def inject_fake_data(self, tmpdir, config): + root = pathlib.Path(tmpdir) - actual = len(individual_targets) - expected = len(combined_targets) - assert ( - actual == expected - ), "The number of the returned combined targets does not match the the number targets if requested " - f"individually: {actual} != {expected}", + num_images = 0 + for cls in self._parse_classes(config["classes"]): + num_images += self._create_lmdb(root, cls) - for target_type, 
combined_target, individual_target in zip(target_types, combined_targets, individual_targets): - with self.subTest(target_type=target_type): - actual = type(combined_target) - expected = type(individual_target) - assert ( - actual is expected - ), "Type of the combined target does not match the type of the corresponding individual target: " - f"{actual} is not {expected}", + return num_images - def test_no_target(self): - with self.create_dataset(target_type=[]) as (dataset, _): - _, target = dataset[0] + @ contextlib.contextmanager + def create_dataset(self, *args, **kwargs): + with super().create_dataset(*args, **kwargs) as output: + yield output + # Currently datasets.LSUN caches the keys in the current directory rather than in the root directory. Thus, + # this creates a number of _cache_* files in the current directory that will not be removed together + # with the temporary directory + for file in os.listdir(os.getcwd()): + if file.startswith("_cache_"): + try: + os.remove(file) + except FileNotFoundError: + # When the same test is run in parallel (in fb internal tests), a thread may remove another + # thread's file. We should be able to remove the try/except when + # https://github.com/pytorch/vision/issues/825 is fixed. + pass - assert target is None + def _parse_classes(self, classes): + if not isinstance(classes, str): + return classes - def test_attr_names(self): - with self.create_dataset() as (dataset, info): - assert tuple(dataset.attr_names) == info["attr_names"] + split = classes + if split == "test": + return [split] - def test_images_names_split(self): - with self.create_dataset(split="all") as (dataset, _): - all_imgs_names = set(dataset.filename) + return [f"{category}_{split}" for category in self._CATEGORIES] - merged_imgs_names = set() - for split in ["train", "valid", "test"]: - with self.create_dataset(split=split) as (dataset, _): - merged_imgs_names.update(dataset.filename) + def _create_lmdb(self, root, cls): + lmdb = datasets_utils.lazy_importer.lmdb + hexdigits_lowercase = string.digits + string.ascii_lowercase[:6] - assert merged_imgs_names == all_imgs_names + folder = f"{cls}_lmdb" + num_images = torch.randint(1, 4, size=()).item() + format = "png" + files = datasets_utils.create_image_folder(root, folder, lambda idx: f"{idx}.{format}", num_images) -class VOCSegmentationTestCase(datasets_utils.ImageDatasetTestCase): - DATASET_CLASS = datasets.VOCSegmentation - FEATURE_TYPES = (PIL.Image.Image, PIL.Image.Image) - - ADDITIONAL_CONFIGS = ( - *datasets_utils.combinations_grid( - year=[f"20{year:02d}" for year in range(7, 13)], image_set=("train", "val", "trainval") - ), - dict(year="2007", image_set="test"), - dict(year="2007-test", image_set="test"), - ) - - def inject_fake_data(self, tmpdir, config): - year, is_test_set = ( - ("2007", True) - if config["year"] == "2007-test" or config["image_set"] == "test" - else (config["year"], False) - ) - image_set = config["image_set"] - - base_dir = pathlib.Path(tmpdir) - if year == "2011": - base_dir /= "TrainVal" - base_dir = base_dir / "VOCdevkit" / f"VOC{year}" - os.makedirs(base_dir) + with lmdb.open(str(root / folder)) as env, env.begin(write=True) as txn: + for file in files: + key = "".join(random.choice(hexdigits_lowercase) for _ in range(40)).encode() - num_images, num_images_per_image_set = self._create_image_set_files(base_dir, "ImageSets", is_test_set) - datasets_utils.create_image_folder(base_dir, "JPEGImages", lambda idx: f"{idx:06d}.jpg", num_images) + buffer = io.BytesIO() + 
PIL.Image.open(file).save(buffer, format) + buffer.seek(0) + value = buffer.read() - datasets_utils.create_image_folder(base_dir, "SegmentationClass", lambda idx: f"{idx:06d}.png", num_images) - annotation = self._create_annotation_files(base_dir, "Annotations", num_images) + txn.put(key, value) - return dict(num_examples=num_images_per_image_set[image_set], annotation=annotation) + os.remove(file) - def _create_image_set_files(self, root, name, is_test_set): - root = pathlib.Path(root) / name - src = pathlib.Path(root) / "Main" - os.makedirs(src, exist_ok=True) + return num_images - idcs = dict(train=(0, 1, 2), val=(3, 4), test=(5,)) - idcs["trainval"] = (*idcs["train"], *idcs["val"]) + def test_not_found_or_corrupted(self): + # LSUN does not raise built-in exception, but a custom one. It is expressive enough to not 'cast' it to + # RuntimeError or FileNotFoundError that are normally checked by this test. + with pytest.raises(datasets_utils.lazy_importer.lmdb.Error): + super().test_not_found_or_corrupted() - for image_set in ("test",) if is_test_set else ("train", "val", "trainval"): - self._create_image_set_file(src, image_set, idcs[image_set]) - shutil.copytree(src, root / "Segmentation") +class KineticsTestCase(datasets_utils.VideoDatasetTestCase): + DATASET_CLASS = datasets.Kinetics + ADDITIONAL_CONFIGS = datasets_utils.combinations_grid(split=("train", "val"), num_classes=("400", "600", "700")) - num_images = max(itertools.chain(*idcs.values())) + 1 - num_images_per_image_set = {image_set: len(idcs_) for image_set, idcs_ in idcs.items()} - return num_images, num_images_per_image_set + def inject_fake_data(self, tmpdir, config): + classes = ("Abseiling", "Zumba") + num_videos_per_class = 2 + tmpdir = pathlib.Path(tmpdir) / config["split"] + digits = string.ascii_letters + string.digits + "-_" + for cls in classes: + datasets_utils.create_video_folder( + tmpdir, + cls, + lambda _: f"{datasets_utils.create_random_string(11, digits)}.mp4", + num_videos_per_class, + ) + return num_videos_per_class * len(classes) - def _create_image_set_file(self, root, image_set, idcs): - with open(pathlib.Path(root) / f"{image_set}.txt", "w") as fh: - fh.writelines([f"{idx:06d}\n" for idx in idcs]) - def _create_annotation_files(self, root, name, num_images): - root = pathlib.Path(root) / name - os.makedirs(root) +class Kinetics400TestCase(datasets_utils.VideoDatasetTestCase): + DATASET_CLASS = datasets.Kinetics400 - for idx in range(num_images): - annotation = self._create_annotation_file(root, f"{idx:06d}.xml") + def inject_fake_data(self, tmpdir, config): + classes = ("Abseiling", "Zumba") + num_videos_per_class = 2 - return annotation + digits = string.ascii_letters + string.digits + "-_" + for cls in classes: + datasets_utils.create_video_folder( + tmpdir, + cls, + lambda _: f"{datasets_utils.create_random_string(11, digits)}.avi", + num_videos_per_class, + ) - def _create_annotation_file(self, root, name): - def add_child(parent, name, text=None): - child = ET.SubElement(parent, name) - child.text = text - return child + return num_videos_per_class * len(classes) - def add_name(obj, name="dog"): - add_child(obj, "name", name) - return name - def add_bndbox(obj, bndbox=None): - if bndbox is None: - bndbox = {"xmin": "1", "xmax": "2", "ymin": "3", "ymax": "4"} +class HMDB51TestCase(datasets_utils.VideoDatasetTestCase): + DATASET_CLASS = datasets.HMDB51 - obj = add_child(obj, "bndbox") - for name, text in bndbox.items(): - add_child(obj, name, text) + ADDITIONAL_CONFIGS = 
datasets_utils.combinations_grid(fold=(1, 2, 3), train=(True, False)) - return bndbox + _VIDEO_FOLDER = "videos" + _SPLITS_FOLDER = "splits" + _CLASSES = ("brush_hair", "wave") - annotation = ET.Element("annotation") - obj = add_child(annotation, "object") - data = dict(name=add_name(obj), bndbox=add_bndbox(obj)) + def dataset_args(self, tmpdir, config): + tmpdir = pathlib.Path(tmpdir) + root = tmpdir / self._VIDEO_FOLDER + annotation_path = tmpdir / self._SPLITS_FOLDER + return root, annotation_path - with open(pathlib.Path(root) / name, "wb") as fh: - fh.write(ET.tostring(annotation)) + def inject_fake_data(self, tmpdir, config): + tmpdir = pathlib.Path(tmpdir) - return data + video_folder = tmpdir / self._VIDEO_FOLDER + os.makedirs(video_folder) + video_files = self._create_videos(video_folder) + splits_folder = tmpdir / self._SPLITS_FOLDER + os.makedirs(splits_folder) + num_examples = self._create_split_files(splits_folder, video_files, config["fold"], config["train"]) -class VOCDetectionTestCase(VOCSegmentationTestCase): - DATASET_CLASS = datasets.VOCDetection - FEATURE_TYPES = (PIL.Image.Image, dict) + return num_examples - def test_annotations(self): - with self.create_dataset() as (dataset, info): - _, target = dataset[0] + def _create_videos(self, root, num_examples_per_class=3): + def file_name_fn(cls, idx, clips_per_group=2): + return f"{cls}_{(idx // clips_per_group) + 1:d}_{(idx % clips_per_group) + 1:d}.avi" - assert "annotation" in target - annotation = target["annotation"] + return [ + ( + cls, + datasets_utils.create_video_folder( + root, + cls, + lambda idx: file_name_fn(cls, idx), + num_examples_per_class, + ), + ) + for cls in self._CLASSES + ] - assert "object" in annotation - objects = annotation["object"] + def _create_split_files(self, root, video_files, fold, train): + num_videos = num_train_videos = 0 - assert len(objects) == 1 - object = objects[0] + for cls, videos in video_files: + num_videos += len(videos) - assert object == info["annotation"] + train_videos = set(random.sample(videos, random.randrange(1, len(videos) - 1))) + num_train_videos += len(train_videos) + with open(pathlib.Path(root) / f"{cls}_test_split{fold}.txt", "w") as fh: + fh.writelines(f"{file.name} {1 if file in train_videos else 2}\n" for file in videos) -class CocoDetectionTestCase(datasets_utils.ImageDatasetTestCase): - DATASET_CLASS = datasets.CocoDetection - FEATURE_TYPES = (PIL.Image.Image, list) + return num_train_videos if train else (num_videos - num_train_videos) - REQUIRED_PACKAGES = ("pycocotools",) - _IMAGE_FOLDER = "images" - _ANNOTATIONS_FOLDER = "annotations" - _ANNOTATIONS_FILE = "annotations.json" +class OmniglotTestCase(datasets_utils.ImageDatasetTestCase): + DATASET_CLASS = datasets.Omniglot - def dataset_args(self, tmpdir, config): - tmpdir = pathlib.Path(tmpdir) - root = tmpdir / self._IMAGE_FOLDER - annotation_file = tmpdir / self._ANNOTATIONS_FOLDER / self._ANNOTATIONS_FILE - return root, annotation_file + ADDITIONAL_CONFIGS = datasets_utils.combinations_grid(background=(True, False)) def inject_fake_data(self, tmpdir, config): - tmpdir = pathlib.Path(tmpdir) + target_folder = ( + pathlib.Path(tmpdir) / "omniglot-py" / f"images_{'background' if config['background'] else 'evaluation'}" + ) + os.makedirs(target_folder) - num_images = 3 - num_annotations_per_image = 2 + num_images = 0 + for name in ("Alphabet_of_the_Magi", "Tifinagh"): + num_images += self._create_alphabet_folder(target_folder, name) - files = datasets_utils.create_image_folder( - tmpdir, 
name=self._IMAGE_FOLDER, file_name_fn=lambda idx: f"{idx:012d}.jpg", num_examples=num_images - ) - file_names = [file.relative_to(tmpdir / self._IMAGE_FOLDER) for file in files] + return num_images - annotation_folder = tmpdir / self._ANNOTATIONS_FOLDER - os.makedirs(annotation_folder) - info = self._create_annotation_file( - annotation_folder, self._ANNOTATIONS_FILE, file_names, num_annotations_per_image - ) + def _create_alphabet_folder(self, root, name): + num_images_total = 0 + for idx in range(torch.randint(1, 4, size=()).item()): + num_images = torch.randint(1, 4, size=()).item() + num_images_total += num_images - info["num_examples"] = num_images - return info + datasets_utils.create_image_folder( + root / name, f"character{idx:02d}", lambda image_idx: f"{image_idx:02d}.png", num_images + ) - def _create_annotation_file(self, root, name, file_names, num_annotations_per_image): - image_ids = [int(file_name.stem) for file_name in file_names] - images = [dict(file_name=str(file_name), id=id) for file_name, id in zip(file_names, image_ids)] + return num_images_total - annotations, info = self._create_annotations(image_ids, num_annotations_per_image) - self._create_json(root, name, dict(images=images, annotations=annotations)) - return info +class SBUTestCase(datasets_utils.ImageDatasetTestCase): + DATASET_CLASS = datasets.SBU + FEATURE_TYPES = (PIL.Image.Image, str) - def _create_annotations(self, image_ids, num_annotations_per_image): - annotations = datasets_utils.combinations_grid( - image_id=image_ids, bbox=([1.0, 2.0, 3.0, 4.0],) * num_annotations_per_image - ) - for id, annotation in enumerate(annotations): - annotation["id"] = id - return annotations, dict() + def inject_fake_data(self, tmpdir, config): + num_images = 3 - def _create_json(self, root, name, content): - file = pathlib.Path(root) / name - with open(file, "w") as fh: - json.dump(content, fh) - return file + dataset_folder = pathlib.Path(tmpdir) / "dataset" + images = datasets_utils.create_image_folder(tmpdir, "dataset", self._create_file_name, num_images) + self._create_urls_txt(dataset_folder, images) + self._create_captions_txt(dataset_folder, num_images) -class CocoCaptionsTestCase(CocoDetectionTestCase): - DATASET_CLASS = datasets.CocoCaptions - - def _create_annotations(self, image_ids, num_annotations_per_image): - captions = [str(idx) for idx in range(num_annotations_per_image)] - annotations = datasets_utils.combinations_grid(image_id=image_ids, caption=captions) - for id, annotation in enumerate(annotations): - annotation["id"] = id - return annotations, dict(captions=captions) - - def test_captions(self): - with self.create_dataset() as (dataset, info): - _, captions = dataset[0] - assert tuple(captions) == tuple(info["captions"]) + return num_images + def _create_file_name(self, idx): + part1 = datasets_utils.create_random_string(10, string.digits) + part2 = datasets_utils.create_random_string(10, string.ascii_lowercase, string.digits[:6]) + return f"{part1}_{part2}.jpg" -class UCF101TestCase(datasets_utils.VideoDatasetTestCase): - DATASET_CLASS = datasets.UCF101 + def _create_urls_txt(self, root, images): + with open(root / "SBU_captioned_photo_dataset_urls.txt", "w") as fh: + for image in images: + fh.write( + f"http://static.flickr.com/{datasets_utils.create_random_string(4, string.digits)}/{image.name}\n" + ) - ADDITIONAL_CONFIGS = datasets_utils.combinations_grid(fold=(1, 2, 3), train=(True, False)) + def _create_captions_txt(self, root, num_images): + with open(root / 
"SBU_captioned_photo_dataset_captions.txt", "w") as fh: + for _ in range(num_images): + fh.write(f"{datasets_utils.create_random_string(10)}\n") - _VIDEO_FOLDER = "videos" - _ANNOTATIONS_FOLDER = "annotations" - def dataset_args(self, tmpdir, config): - tmpdir = pathlib.Path(tmpdir) - root = tmpdir / self._VIDEO_FOLDER - annotation_path = tmpdir / self._ANNOTATIONS_FOLDER - return root, annotation_path +class SEMEIONTestCase(datasets_utils.ImageDatasetTestCase): + DATASET_CLASS = datasets.SEMEION def inject_fake_data(self, tmpdir, config): - tmpdir = pathlib.Path(tmpdir) + num_images = 3 - video_folder = tmpdir / self._VIDEO_FOLDER - os.makedirs(video_folder) - video_files = self._create_videos(video_folder) + images = torch.rand(num_images, 256) + labels = F.one_hot(torch.randint(10, size=(num_images,))) + with open(pathlib.Path(tmpdir) / "semeion.data", "w") as fh: + for image, one_hot_labels in zip(images, labels): + image_columns = " ".join([f"{pixel.item():.4f}" for pixel in image]) + labels_columns = " ".join([str(label.item()) for label in one_hot_labels]) + fh.write(f"{image_columns} {labels_columns}\n") - annotations_folder = tmpdir / self._ANNOTATIONS_FOLDER - os.makedirs(annotations_folder) - num_examples = self._create_annotation_files(annotations_folder, video_files, config["fold"], config["train"]) + return num_images - return num_examples - def _create_videos(self, root, num_examples_per_class=3): - def file_name_fn(cls, idx, clips_per_group=2): - return f"v_{cls}_g{(idx // clips_per_group) + 1:02d}_c{(idx % clips_per_group) + 1:02d}.avi" +class USPSTestCase(datasets_utils.ImageDatasetTestCase): + DATASET_CLASS = datasets.USPS - video_files = [ - datasets_utils.create_video_folder(root, cls, lambda idx: file_name_fn(cls, idx), num_examples_per_class) - for cls in ("ApplyEyeMakeup", "YoYo") - ] - return [path.relative_to(root) for path in itertools.chain(*video_files)] + ADDITIONAL_CONFIGS = datasets_utils.combinations_grid(train=(True, False)) - def _create_annotation_files(self, root, video_files, fold, train): - current_videos = random.sample(video_files, random.randrange(1, len(video_files) - 1)) - current_annotation = self._annotation_file_name(fold, train) - self._create_annotation_file(root, current_annotation, current_videos) + def inject_fake_data(self, tmpdir, config): + num_images = 2 if config["train"] else 1 - other_videos = set(video_files) - set(current_videos) - other_annotations = [ - self._annotation_file_name(fold, train) for fold, train in itertools.product((1, 2, 3), (True, False)) - ] - other_annotations.remove(current_annotation) - for name in other_annotations: - self._create_annotation_file(root, name, other_videos) + images = torch.rand(num_images, 256) * 2 - 1 + labels = torch.randint(1, 11, size=(num_images,)) - return len(current_videos) + with bz2.open(pathlib.Path(tmpdir) / f"usps{'.t' if not config['train'] else ''}.bz2", "w") as fh: + for image, label in zip(images, labels): + line = " ".join((str(label.item()), *[f"{idx}:{pixel:.6f}" for idx, pixel in enumerate(image, 1)])) + fh.write(f"{line}\n".encode()) - def _annotation_file_name(self, fold, train): - return f"{'train' if train else 'test'}list{fold:02d}.txt" + return num_images - def _create_annotation_file(self, root, name, video_files): - with open(pathlib.Path(root) / name, "w") as fh: - fh.writelines(f"{str(file).replace(os.sep, '/')}\n" for file in sorted(video_files)) +class SBDatasetTestCase(datasets_utils.ImageDatasetTestCase): + DATASET_CLASS = datasets.SBDataset + FEATURE_TYPES 
= (PIL.Image.Image, (np.ndarray, PIL.Image.Image)) -class LSUNTestCase(datasets_utils.ImageDatasetTestCase): - DATASET_CLASS = datasets.LSUN + REQUIRED_PACKAGES = ("scipy.io", "scipy.sparse") - REQUIRED_PACKAGES = ("lmdb",) ADDITIONAL_CONFIGS = datasets_utils.combinations_grid( - classes=("train", "test", "val", ["bedroom_train", "church_outdoor_train"]) + image_set=("train", "val", "train_noval"), mode=("boundaries", "segmentation") ) - _CATEGORIES = ( - "bedroom", - "bridge", - "church_outdoor", - "classroom", - "conference_room", - "dining_room", - "kitchen", - "living_room", - "restaurant", - "tower", - ) + _NUM_CLASSES = 20 def inject_fake_data(self, tmpdir, config): - root = pathlib.Path(tmpdir) - - num_images = 0 - for cls in self._parse_classes(config["classes"]): - num_images += self._create_lmdb(root, cls) - - return num_images + num_images, num_images_per_image_set = self._create_split_files(tmpdir) - @ contextlib.contextmanager - def create_dataset(self, *args, **kwargs): - with super().create_dataset(*args, **kwargs) as output: - yield output - # Currently datasets.LSUN caches the keys in the current directory rather than in the root directory. Thus, - # this creates a number of _cache_* files in the current directory that will not be removed together - # with the temporary directory - for file in os.listdir(os.getcwd()): - if file.startswith("_cache_"): - try: - os.remove(file) - except FileNotFoundError: - # When the same test is run in parallel (in fb internal tests), a thread may remove another - # thread's file. We should be able to remove the try/except when - # https://github.com/pytorch/vision/issues/825 is fixed. - pass + sizes = self._create_target_folder(tmpdir, "cls", num_images) - def _parse_classes(self, classes): - if not isinstance(classes, str): - return classes + datasets_utils.create_image_folder( + tmpdir, "img", lambda idx: f"{self._file_stem(idx)}.jpg", num_images, size=lambda idx: sizes[idx] + ) - split = classes - if split == "test": - return [split] + return num_images_per_image_set[config["image_set"]] - return [f"{category}_{split}" for category in self._CATEGORIES] + def _create_split_files(self, root): + root = pathlib.Path(root) - def _create_lmdb(self, root, cls): - lmdb = datasets_utils.lazy_importer.lmdb - hexdigits_lowercase = string.digits + string.ascii_lowercase[:6] + splits = dict(train=(0, 1, 2), train_noval=(0, 2), val=(3,)) - folder = f"{cls}_lmdb" + for split, idcs in splits.items(): + self._create_split_file(root, split, idcs) - num_images = torch.randint(1, 4, size=()).item() - format = "png" - files = datasets_utils.create_image_folder(root, folder, lambda idx: f"{idx}.{format}", num_images) + num_images = max(itertools.chain(*splits.values())) + 1 + num_images_per_split = {split: len(idcs) for split, idcs in splits.items()} + return num_images, num_images_per_split - with lmdb.open(str(root / folder)) as env, env.begin(write=True) as txn: - for file in files: - key = "".join(random.choice(hexdigits_lowercase) for _ in range(40)).encode() + def _create_split_file(self, root, name, idcs): + with open(root / f"{name}.txt", "w") as fh: + fh.writelines(f"{self._file_stem(idx)}\n" for idx in idcs) - buffer = io.BytesIO() - PIL.Image.open(file).save(buffer, format) - buffer.seek(0) - value = buffer.read() + def _create_target_folder(self, root, name, num_images): + io = datasets_utils.lazy_importer.scipy.io - txn.put(key, value) + target_folder = pathlib.Path(root) / name + os.makedirs(target_folder) - os.remove(file) + sizes = 
[torch.randint(1, 4, size=(2,)).tolist() for _ in range(num_images)] + for idx, size in enumerate(sizes): + content = dict( + GTcls=dict(Boundaries=self._create_boundaries(size), Segmentation=self._create_segmentation(size)) + ) + io.savemat(target_folder / f"{self._file_stem(idx)}.mat", content) - return num_images + return sizes - def test_not_found_or_corrupted(self): - # LSUN does not raise built-in exception, but a custom one. It is expressive enough to not 'cast' it to - # RuntimeError or FileNotFoundError that are normally checked by this test. - with pytest.raises(datasets_utils.lazy_importer.lmdb.Error): - super().test_not_found_or_corrupted() + def _create_boundaries(self, size): + sparse = datasets_utils.lazy_importer.scipy.sparse + return [ + [sparse.csc_matrix(torch.randint(0, 2, size=size, dtype=torch.uint8).numpy())] + for _ in range(self._NUM_CLASSES) + ] + def _create_segmentation(self, size): + return torch.randint(0, self._NUM_CLASSES + 1, size=size, dtype=torch.uint8).numpy() -class KineticsTestCase(datasets_utils.VideoDatasetTestCase): - DATASET_CLASS = datasets.Kinetics - ADDITIONAL_CONFIGS = datasets_utils.combinations_grid(split=("train", "val"), num_classes=("400", "600", "700")) + def _file_stem(self, idx): + return f"2008_{idx:06d}" - def inject_fake_data(self, tmpdir, config): - classes = ("Abseiling", "Zumba") - num_videos_per_class = 2 - tmpdir = pathlib.Path(tmpdir) / config["split"] - digits = string.ascii_letters + string.digits + "-_" - for cls in classes: - datasets_utils.create_video_folder( - tmpdir, - cls, - lambda _: f"{datasets_utils.create_random_string(11, digits)}.mp4", - num_videos_per_class, - ) - return num_videos_per_class * len(classes) +class FakeDataTestCase(datasets_utils.ImageDatasetTestCase): + DATASET_CLASS = datasets.FakeData + FEATURE_TYPES = (PIL.Image.Image, int) -class Kinetics400TestCase(datasets_utils.VideoDatasetTestCase): - DATASET_CLASS = datasets.Kinetics400 + def dataset_args(self, tmpdir, config): + return () def inject_fake_data(self, tmpdir, config): - classes = ("Abseiling", "Zumba") - num_videos_per_class = 2 + return config["size"] - digits = string.ascii_letters + string.digits + "-_" - for cls in classes: - datasets_utils.create_video_folder( - tmpdir, - cls, - lambda _: f"{datasets_utils.create_random_string(11, digits)}.avi", - num_videos_per_class, - ) + def test_not_found_or_corrupted(self): + self.skipTest("The data is generated at creation and thus cannot be non-existent or corrupted.") - return num_videos_per_class * len(classes) +class PhotoTourTestCase(datasets_utils.ImageDatasetTestCase): + DATASET_CLASS = datasets.PhotoTour -class HMDB51TestCase(datasets_utils.VideoDatasetTestCase): - DATASET_CLASS = datasets.HMDB51 + # The PhotoTour dataset returns examples with different features with respect to the 'train' parameter. Thus, + # we overwrite 'FEATURE_TYPES' with a dummy value to satisfy the initial checks of the base class. Furthermore, we + # overwrite the 'test_feature_types()' method to select the correct feature types before the test is run. 
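The comment block added above ends by saying the test overrides `test_feature_types()` to pick the correct tuple at run time. As a hedged sketch of what such an override could look like (the method body below is an assumption for illustration, not the patch's verbatim code; only the attribute names come from the hunk):

```python
def test_feature_types(self, config):
    # Swap in the feature types matching the requested split, run the
    # base-class check, then restore the dummy placeholder value.
    saved = self.FEATURE_TYPES
    self.FEATURE_TYPES = self._TRAIN_FEATURE_TYPES if config["train"] else self._TEST_FEATURE_TYPES
    try:
        super().test_feature_types(config)
    finally:
        self.FEATURE_TYPES = saved
```

The `FEATURE_TYPES` placeholder defined on the next lines only has to satisfy the base class's initial checks.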
+ FEATURE_TYPES = () + _TRAIN_FEATURE_TYPES = (torch.Tensor,) + _TEST_FEATURE_TYPES = (torch.Tensor, torch.Tensor, torch.Tensor) - ADDITIONAL_CONFIGS = datasets_utils.combinations_grid(fold=(1, 2, 3), train=(True, False)) + datasets_utils.combinations_grid(train=(True, False)) - _VIDEO_FOLDER = "videos" - _SPLITS_FOLDER = "splits" - _CLASSES = ("brush_hair", "wave") - - def dataset_args(self, tmpdir, config): - tmpdir = pathlib.Path(tmpdir) - root = tmpdir / self._VIDEO_FOLDER - annotation_path = tmpdir / self._SPLITS_FOLDER - return root, annotation_path - - def inject_fake_data(self, tmpdir, config): - tmpdir = pathlib.Path(tmpdir) - - video_folder = tmpdir / self._VIDEO_FOLDER - os.makedirs(video_folder) - video_files = self._create_videos(video_folder) - - splits_folder = tmpdir / self._SPLITS_FOLDER - os.makedirs(splits_folder) - num_examples = self._create_split_files(splits_folder, video_files, config["fold"], config["train"]) - - return num_examples - - def _create_videos(self, root, num_examples_per_class=3): - def file_name_fn(cls, idx, clips_per_group=2): - return f"{cls}_{(idx // clips_per_group) + 1:d}_{(idx % clips_per_group) + 1:d}.avi" - - return [ - ( - cls, - datasets_utils.create_video_folder( - root, - cls, - lambda idx: file_name_fn(cls, idx), - num_examples_per_class, - ), - ) - for cls in self._CLASSES - ] - - def _create_split_files(self, root, video_files, fold, train): - num_videos = num_train_videos = 0 - - for cls, videos in video_files: - num_videos += len(videos) - - train_videos = set(random.sample(videos, random.randrange(1, len(videos) - 1))) - num_train_videos += len(train_videos) - - with open(pathlib.Path(root) / f"{cls}_test_split{fold}.txt", "w") as fh: - fh.writelines(f"{file.name} {1 if file in train_videos else 2}\n" for file in videos) - - return num_train_videos if train else (num_videos - num_train_videos) - - -class OmniglotTestCase(datasets_utils.ImageDatasetTestCase): - DATASET_CLASS = datasets.Omniglot - - ADDITIONAL_CONFIGS = datasets_utils.combinations_grid(background=(True, False)) - - def inject_fake_data(self, tmpdir, config): - target_folder = ( - pathlib.Path(tmpdir) / "omniglot-py" / f"images_{'background' if config['background'] else 'evaluation'}" - ) - os.makedirs(target_folder) - - num_images = 0 - for name in ("Alphabet_of_the_Magi", "Tifinagh"): - num_images += self._create_alphabet_folder(target_folder, name) - - return num_images - - def _create_alphabet_folder(self, root, name): - num_images_total = 0 - for idx in range(torch.randint(1, 4, size=()).item()): - num_images = torch.randint(1, 4, size=()).item() - num_images_total += num_images - - datasets_utils.create_image_folder( - root / name, f"character{idx:02d}", lambda image_idx: f"{image_idx:02d}.png", num_images - ) - - return num_images_total - - -class SBUTestCase(datasets_utils.ImageDatasetTestCase): - DATASET_CLASS = datasets.SBU - FEATURE_TYPES = (PIL.Image.Image, str) - - def inject_fake_data(self, tmpdir, config): - num_images = 3 - - dataset_folder = pathlib.Path(tmpdir) / "dataset" - images = datasets_utils.create_image_folder(tmpdir, "dataset", self._create_file_name, num_images) - - self._create_urls_txt(dataset_folder, images) - self._create_captions_txt(dataset_folder, num_images) - - return num_images - - def _create_file_name(self, idx): - part1 = datasets_utils.create_random_string(10, string.digits) - part2 = datasets_utils.create_random_string(10, string.ascii_lowercase, string.digits[:6]) - return f"{part1}_{part2}.jpg" - - def _create_urls_txt(self, 
root, images): - with open(root / "SBU_captioned_photo_dataset_urls.txt", "w") as fh: - for image in images: - fh.write( - f"http://static.flickr.com/{datasets_utils.create_random_string(4, string.digits)}/{image.name}\n" - ) - - def _create_captions_txt(self, root, num_images): - with open(root / "SBU_captioned_photo_dataset_captions.txt", "w") as fh: - for _ in range(num_images): - fh.write(f"{datasets_utils.create_random_string(10)}\n") - - -class SEMEIONTestCase(datasets_utils.ImageDatasetTestCase): - DATASET_CLASS = datasets.SEMEION - - def inject_fake_data(self, tmpdir, config): - num_images = 3 - - images = torch.rand(num_images, 256) - labels = F.one_hot(torch.randint(10, size=(num_images,))) - with open(pathlib.Path(tmpdir) / "semeion.data", "w") as fh: - for image, one_hot_labels in zip(images, labels): - image_columns = " ".join([f"{pixel.item():.4f}" for pixel in image]) - labels_columns = " ".join([str(label.item()) for label in one_hot_labels]) - fh.write(f"{image_columns} {labels_columns}\n") - - return num_images - - -class USPSTestCase(datasets_utils.ImageDatasetTestCase): - DATASET_CLASS = datasets.USPS - - ADDITIONAL_CONFIGS = datasets_utils.combinations_grid(train=(True, False)) - - def inject_fake_data(self, tmpdir, config): - num_images = 2 if config["train"] else 1 - - images = torch.rand(num_images, 256) * 2 - 1 - labels = torch.randint(1, 11, size=(num_images,)) - - with bz2.open(pathlib.Path(tmpdir) / f"usps{'.t' if not config['train'] else ''}.bz2", "w") as fh: - for image, label in zip(images, labels): - line = " ".join((str(label.item()), *[f"{idx}:{pixel:.6f}" for idx, pixel in enumerate(image, 1)])) - fh.write(f"{line}\n".encode()) - - return num_images - - -class SBDatasetTestCase(datasets_utils.ImageDatasetTestCase): - DATASET_CLASS = datasets.SBDataset - FEATURE_TYPES = (PIL.Image.Image, (np.ndarray, PIL.Image.Image)) - - REQUIRED_PACKAGES = ("scipy.io", "scipy.sparse") - - ADDITIONAL_CONFIGS = datasets_utils.combinations_grid( - image_set=("train", "val", "train_noval"), mode=("boundaries", "segmentation") - ) - - _NUM_CLASSES = 20 - - def inject_fake_data(self, tmpdir, config): - num_images, num_images_per_image_set = self._create_split_files(tmpdir) - - sizes = self._create_target_folder(tmpdir, "cls", num_images) - - datasets_utils.create_image_folder( - tmpdir, "img", lambda idx: f"{self._file_stem(idx)}.jpg", num_images, size=lambda idx: sizes[idx] - ) - - return num_images_per_image_set[config["image_set"]] - - def _create_split_files(self, root): - root = pathlib.Path(root) - - splits = dict(train=(0, 1, 2), train_noval=(0, 2), val=(3,)) - - for split, idcs in splits.items(): - self._create_split_file(root, split, idcs) - - num_images = max(itertools.chain(*splits.values())) + 1 - num_images_per_split = {split: len(idcs) for split, idcs in splits.items()} - return num_images, num_images_per_split - - def _create_split_file(self, root, name, idcs): - with open(root / f"{name}.txt", "w") as fh: - fh.writelines(f"{self._file_stem(idx)}\n" for idx in idcs) - - def _create_target_folder(self, root, name, num_images): - io = datasets_utils.lazy_importer.scipy.io - - target_folder = pathlib.Path(root) / name - os.makedirs(target_folder) - - sizes = [torch.randint(1, 4, size=(2,)).tolist() for _ in range(num_images)] - for idx, size in enumerate(sizes): - content = dict( - GTcls=dict(Boundaries=self._create_boundaries(size), Segmentation=self._create_segmentation(size)) - ) - io.savemat(target_folder / f"{self._file_stem(idx)}.mat", content) - - return 
sizes - - def _create_boundaries(self, size): - sparse = datasets_utils.lazy_importer.scipy.sparse - return [ - [sparse.csc_matrix(torch.randint(0, 2, size=size, dtype=torch.uint8).numpy())] - for _ in range(self._NUM_CLASSES) - ] - - def _create_segmentation(self, size): - return torch.randint(0, self._NUM_CLASSES + 1, size=size, dtype=torch.uint8).numpy() - - def _file_stem(self, idx): - return f"2008_{idx:06d}" - - -class FakeDataTestCase(datasets_utils.ImageDatasetTestCase): - DATASET_CLASS = datasets.FakeData - FEATURE_TYPES = (PIL.Image.Image, int) - - def dataset_args(self, tmpdir, config): - return () - - def inject_fake_data(self, tmpdir, config): - return config["size"] - - def test_not_found_or_corrupted(self): - self.skipTest("The data is generated at creation and thus cannot be non-existent or corrupted.") - - -class PhotoTourTestCase(datasets_utils.ImageDatasetTestCase): - DATASET_CLASS = datasets.PhotoTour - - # The PhotoTour dataset returns examples with different features with respect to the 'train' parameter. Thus, - # we overwrite 'FEATURE_TYPES' with a dummy value to satisfy the initial checks of the base class. Furthermore, we - # overwrite the 'test_feature_types()' method to select the correct feature types before the test is run. - FEATURE_TYPES = () - _TRAIN_FEATURE_TYPES = (torch.Tensor,) - _TEST_FEATURE_TYPES = (torch.Tensor, torch.Tensor, torch.Tensor) - - datasets_utils.combinations_grid(train=(True, False)) - - _NAME = "liberty" + _NAME = "liberty" def dataset_args(self, tmpdir, config): return tmpdir, self._NAME @@ -2898,341 +2335,1042 @@ def inject_fake_data(self, tmpdir: str, config): ) ) - meta_folder = data_folder / "labels" - meta_folder.mkdir() - image_ids = [str(path.relative_to(path.parents[1])).replace(os.sep, "/") for path in image_files] - image_ids_in_config = random.choices(image_ids, k=len(image_files) // 2) - with open(meta_folder / f"{config['split']}{config['partition']}.txt", "w") as file: - file.write("\n".join(image_ids_in_config) + "\n") + meta_folder = data_folder / "labels" + meta_folder.mkdir() + image_ids = [str(path.relative_to(path.parents[1])).replace(os.sep, "/") for path in image_files] + image_ids_in_config = random.choices(image_ids, k=len(image_files) // 2) + with open(meta_folder / f"{config['split']}{config['partition']}.txt", "w") as file: + file.write("\n".join(image_ids_in_config) + "\n") + + return len(image_ids_in_config) + + +class FER2013TestCase(datasets_utils.ImageDatasetTestCase): + DATASET_CLASS = datasets.FER2013 + ADDITIONAL_CONFIGS = datasets_utils.combinations_grid(split=("train", "test")) + + FEATURE_TYPES = (PIL.Image.Image, (int, type(None))) + + def inject_fake_data(self, tmpdir, config): + base_folder = os.path.join(tmpdir, "fer2013") + os.makedirs(base_folder) + + num_samples = 5 + with open(os.path.join(base_folder, f"{config['split']}.csv"), "w", newline="") as file: + writer = csv.DictWriter( + file, + fieldnames=("emotion", "pixels") if config["split"] == "train" else ("pixels",), + quoting=csv.QUOTE_NONNUMERIC, + quotechar='"', + ) + writer.writeheader() + for _ in range(num_samples): + row = dict( + pixels=" ".join( + str(pixel) for pixel in datasets_utils.create_image_or_video_tensor((48, 48)).view(-1).tolist() + ) + ) + if config["split"] == "train": + row["emotion"] = str(int(torch.randint(0, 7, ()))) + + writer.writerow(row) + + return num_samples + + +class GTSRBTestCase(datasets_utils.ImageDatasetTestCase): + DATASET_CLASS = datasets.GTSRB + FEATURE_TYPES = (PIL.Image.Image, int) + + 
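Every test case in this patch drives its configurations through `datasets_utils.combinations_grid`, including the `ADDITIONAL_CONFIGS` assignment that follows. Assuming it is a plain cartesian-product helper, its behaviour can be sketched as:

```python
import itertools

def combinations_grid(**kwargs):
    # One config dict per element of the cartesian product of the given
    # iterables (assumed behaviour of the datasets_utils test helper).
    return [dict(zip(kwargs, values)) for values in itertools.product(*kwargs.values())]

# combinations_grid(split=("train", "test")) == [{"split": "train"}, {"split": "test"}]
```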
ADDITIONAL_CONFIGS = datasets_utils.combinations_grid(split=("train", "test")) + + def inject_fake_data(self, tmpdir: str, config): + root_folder = os.path.join(tmpdir, "gtsrb") + os.makedirs(root_folder, exist_ok=True) + + # Train data + train_folder = os.path.join(root_folder, "GTSRB", "Training") + os.makedirs(train_folder, exist_ok=True) + + num_examples = 3 if config["split"] == "train" else 4 + classes = ("00000", "00042", "00012") + for class_idx in classes: + datasets_utils.create_image_folder( + train_folder, + name=class_idx, + file_name_fn=lambda image_idx: f"{class_idx}_{image_idx:05d}.ppm", + num_examples=num_examples, + ) + + total_number_of_examples = num_examples * len(classes) + # Test data + test_folder = os.path.join(root_folder, "GTSRB", "Final_Test", "Images") + os.makedirs(test_folder, exist_ok=True) + + with open(os.path.join(root_folder, "GT-final_test.csv"), "w") as csv_file: + csv_file.write("Filename;Width;Height;Roi.X1;Roi.Y1;Roi.X2;Roi.Y2;ClassId\n") + + for _ in range(total_number_of_examples): + image_file = datasets_utils.create_random_string(5, string.digits) + ".ppm" + datasets_utils.create_image_file(test_folder, image_file) + row = [ + image_file, + torch.randint(1, 100, size=()).item(), + torch.randint(1, 100, size=()).item(), + torch.randint(1, 100, size=()).item(), + torch.randint(1, 100, size=()).item(), + torch.randint(1, 100, size=()).item(), + torch.randint(1, 100, size=()).item(), + torch.randint(0, 43, size=()).item(), + ] + csv_file.write(";".join(map(str, row)) + "\n") + + return total_number_of_examples + + +class CLEVRClassificationTestCase(datasets_utils.ImageDatasetTestCase): + DATASET_CLASS = datasets.CLEVRClassification + FEATURE_TYPES = (PIL.Image.Image, (int, type(None))) + + ADDITIONAL_CONFIGS = datasets_utils.combinations_grid(split=("train", "val", "test")) + + def inject_fake_data(self, tmpdir, config): + data_folder = pathlib.Path(tmpdir) / "clevr" / "CLEVR_v1.0" + + images_folder = data_folder / "images" + image_files = datasets_utils.create_image_folder( + images_folder, config["split"], lambda idx: f"CLEVR_{config['split']}_{idx:06d}.png", num_examples=5 + ) + + scenes_folder = data_folder / "scenes" + scenes_folder.mkdir() + if config["split"] != "test": + with open(scenes_folder / f"CLEVR_{config['split']}_scenes.json", "w") as file: + json.dump( + dict( + info=dict(), + scenes=[ + dict(image_filename=image_file.name, objects=[dict()] * int(torch.randint(10, ()))) + for image_file in image_files + ], + ), + file, + ) + + return len(image_files) + + +class OxfordIIITPetTestCase(datasets_utils.ImageDatasetTestCase): + DATASET_CLASS = datasets.OxfordIIITPet + FEATURE_TYPES = (PIL.Image.Image, (int, PIL.Image.Image, tuple, type(None))) + + ADDITIONAL_CONFIGS = datasets_utils.combinations_grid( + split=("trainval", "test"), + target_types=("category", "segmentation", ["category", "segmentation"], []), + ) + + def inject_fake_data(self, tmpdir, config): + base_folder = os.path.join(tmpdir, "oxford-iiit-pet") + + classification_anns_meta = ( + dict(cls="Abyssinian", label=0, species="cat"), + dict(cls="Keeshond", label=18, species="dog"), + dict(cls="Yorkshire Terrier", label=37, species="dog"), + ) + split_and_classification_anns = [ + self._meta_to_split_and_classification_ann(meta, idx) + for meta, idx in itertools.product(classification_anns_meta, (1, 2, 10)) + ] + image_ids, *_ = zip(*split_and_classification_anns) + + image_files = datasets_utils.create_image_folder( + base_folder, "images", file_name_fn=lambda idx: 
f"{image_ids[idx]}.jpg", num_examples=len(image_ids) + ) + + anns_folder = os.path.join(base_folder, "annotations") + os.makedirs(anns_folder) + split_and_classification_anns_in_split = random.choices(split_and_classification_anns, k=len(image_ids) // 2) + with open(os.path.join(anns_folder, f"{config['split']}.txt"), "w", newline="") as file: + writer = csv.writer(file, delimiter=" ") + for split_and_classification_ann in split_and_classification_anns_in_split: + writer.writerow(split_and_classification_ann) + + segmentation_files = datasets_utils.create_image_folder( + anns_folder, "trimaps", file_name_fn=lambda idx: f"{image_ids[idx]}.png", num_examples=len(image_ids) + ) + + # The dataset has some rogue files + for path in image_files[:2]: + path.with_suffix(".mat").touch() + for path in segmentation_files: + path.with_name(f".{path.name}").touch() + + return len(split_and_classification_anns_in_split) + + def _meta_to_split_and_classification_ann(self, meta, idx): + image_id = "_".join( + [ + *[(str.title if meta["species"] == "cat" else str.lower)(part) for part in meta["cls"].split()], + str(idx), + ] + ) + class_id = str(meta["label"] + 1) + species = "1" if meta["species"] == "cat" else "2" + breed_id = "-1" + return (image_id, class_id, species, breed_id) + + +class StanfordCarsTestCase(datasets_utils.ImageDatasetTestCase): + DATASET_CLASS = datasets.StanfordCars + REQUIRED_PACKAGES = ("scipy",) + ADDITIONAL_CONFIGS = datasets_utils.combinations_grid(split=("train", "test")) + + def inject_fake_data(self, tmpdir, config): + import scipy.io as io + from numpy.core.records import fromarrays + + num_examples = {"train": 5, "test": 7}[config["split"]] + num_classes = 3 + base_folder = pathlib.Path(tmpdir) / "stanford_cars" + + devkit = base_folder / "devkit" + devkit.mkdir(parents=True) + + if config["split"] == "train": + images_folder_name = "cars_train" + annotations_mat_path = devkit / "cars_train_annos.mat" + else: + images_folder_name = "cars_test" + annotations_mat_path = base_folder / "cars_test_annos_withlabels.mat" + + datasets_utils.create_image_folder( + root=base_folder, + name=images_folder_name, + file_name_fn=lambda image_index: f"{image_index:5d}.jpg", + num_examples=num_examples, + ) + + classes = np.random.randint(1, num_classes + 1, num_examples, dtype=np.uint8) + fnames = [f"{i:5d}.jpg" for i in range(num_examples)] + rec_array = fromarrays( + [classes, fnames], + names=["class", "fname"], + ) + io.savemat(annotations_mat_path, {"annotations": rec_array}) + + random_class_names = ["random_name"] * num_classes + io.savemat(devkit / "cars_meta.mat", {"class_names": random_class_names}) + + return num_examples + + +class Country211TestCase(datasets_utils.ImageDatasetTestCase): + DATASET_CLASS = datasets.Country211 + + ADDITIONAL_CONFIGS = datasets_utils.combinations_grid(split=("train", "valid", "test")) + + def inject_fake_data(self, tmpdir: str, config): + split_folder = pathlib.Path(tmpdir) / "country211" / config["split"] + split_folder.mkdir(parents=True, exist_ok=True) + + num_examples = { + "train": 3, + "valid": 4, + "test": 5, + }[config["split"]] + + classes = ("AD", "BS", "GR") + for cls in classes: + datasets_utils.create_image_folder( + split_folder, + name=cls, + file_name_fn=lambda idx: f"{idx}.jpg", + num_examples=num_examples, + ) + + return num_examples * len(classes) + + +class Flowers102TestCase(datasets_utils.ImageDatasetTestCase): + DATASET_CLASS = datasets.Flowers102 + + ADDITIONAL_CONFIGS = datasets_utils.combinations_grid(split=("train", 
"val", "test")) + REQUIRED_PACKAGES = ("scipy",) + + def inject_fake_data(self, tmpdir: str, config): + base_folder = pathlib.Path(tmpdir) / "flowers-102" + + num_classes = 3 + num_images_per_split = dict(train=5, val=4, test=3) + num_images_total = sum(num_images_per_split.values()) + datasets_utils.create_image_folder( + base_folder, + "jpg", + file_name_fn=lambda idx: f"image_{idx + 1:05d}.jpg", + num_examples=num_images_total, + ) + + label_dict = dict( + labels=np.random.randint(1, num_classes + 1, size=(1, num_images_total), dtype=np.uint8), + ) + datasets_utils.lazy_importer.scipy.io.savemat(str(base_folder / "imagelabels.mat"), label_dict) + + setid_mat = np.arange(1, num_images_total + 1, dtype=np.uint16) + np.random.shuffle(setid_mat) + setid_dict = dict( + trnid=setid_mat[: num_images_per_split["train"]].reshape(1, -1), + valid=setid_mat[num_images_per_split["train"] : -num_images_per_split["test"]].reshape(1, -1), + tstid=setid_mat[-num_images_per_split["test"] :].reshape(1, -1), + ) + datasets_utils.lazy_importer.scipy.io.savemat(str(base_folder / "setid.mat"), setid_dict) + + return num_images_per_split[config["split"]] + + +class PCAMTestCase(datasets_utils.ImageDatasetTestCase): + DATASET_CLASS = datasets.PCAM + + ADDITIONAL_CONFIGS = datasets_utils.combinations_grid(split=("train", "val", "test")) + REQUIRED_PACKAGES = ("h5py",) + + def inject_fake_data(self, tmpdir: str, config): + base_folder = pathlib.Path(tmpdir) / "pcam" + base_folder.mkdir() + + num_images = {"train": 2, "test": 3, "val": 4}[config["split"]] + + images_file = datasets.PCAM._FILES[config["split"]]["images"][0] + with datasets_utils.lazy_importer.h5py.File(str(base_folder / images_file), "w") as f: + f["x"] = np.random.randint(0, 256, size=(num_images, 10, 10, 3), dtype=np.uint8) + + targets_file = datasets.PCAM._FILES[config["split"]]["targets"][0] + with datasets_utils.lazy_importer.h5py.File(str(base_folder / targets_file), "w") as f: + f["y"] = np.random.randint(0, 2, size=(num_images, 1, 1, 1), dtype=np.uint8) + + return num_images + + +class RenderedSST2TestCase(datasets_utils.ImageDatasetTestCase): + DATASET_CLASS = datasets.RenderedSST2 + ADDITIONAL_CONFIGS = datasets_utils.combinations_grid(split=("train", "val", "test")) + SPLIT_TO_FOLDER = {"train": "train", "val": "valid", "test": "test"} + + def inject_fake_data(self, tmpdir: str, config): + root_folder = pathlib.Path(tmpdir) / "rendered-sst2" + image_folder = root_folder / self.SPLIT_TO_FOLDER[config["split"]] + + num_images_per_class = {"train": 5, "test": 6, "val": 7} + sampled_classes = ["positive", "negative"] + for cls in sampled_classes: + datasets_utils.create_image_folder( + image_folder, + cls, + file_name_fn=lambda idx: f"{idx}.png", + num_examples=num_images_per_class[config["split"]], + ) + + return len(sampled_classes) * num_images_per_class[config["split"]] + + +class StereoETH3DTestCase(datasets_utils.ImageDatasetTestCase): + DATASET_CLASS = datasets.StereoETH3D + ADDITIONAL_CONFIGS = datasets_utils.combinations_grid(split=("train", "test")) + FEATURE_TYPES = (PIL.Image.Image, PIL.Image.Image, (np.ndarray, type(None)), (np.ndarray, type(None))) + + @staticmethod + def _create_scene_folder(num_examples: int, root_dir: str) -> List[pathlib.Path]: + # create the scene folder + image_paths = [] + # make the root_dir if it does not exits + os.makedirs(root_dir, exist_ok=True) + + for i in range(num_examples): + scene_dir = os.path.join(root_dir, f"scene_{i}") + os.makedirs(scene_dir, exist_ok=True) + # populate with left right 
images + image_paths.append(datasets_utils.create_image_file(root=scene_dir, name="im0.png", size=(100, 100))) + image_paths.append(datasets_utils.create_image_file(root=scene_dir, name="im1.png", size=(100, 100))) + return image_paths + + @staticmethod + def _create_annotation_folder(num_examples: int, root_dir: str) -> List[pathlib.Path]: + paths = [] + # make the root_dir if it does not exits + os.makedirs(root_dir, exist_ok=True) + + # create scene directories + for i in range(num_examples): + scene_dir = os.path.join(root_dir, f"scene_{i}") + os.makedirs(scene_dir, exist_ok=True) + # populate with a random png file for occlusion mask, and a pfm file for disparity + paths.append(datasets_utils.create_image_file(root=scene_dir, name="mask0nocc.png", size=(1, 100, 100))) + pfm_path = os.path.join(scene_dir, "disp0GT.pfm") + datasets_utils.make_fake_pfm_file(h=100, w=100, file_name=pfm_path) + paths.append(pfm_path) + return paths + + def inject_fake_data(self, tmpdir, config): + eth3d_dir = os.path.join(tmpdir, "ETH3D") + + num_examples = 2 if config["split"] == "train" else 3 + + split_name = "two_view_training" if config["split"] == "train" else "two_view_test" + split_dir = os.path.join(eth3d_dir, split_name) + self._create_scene_folder(num_examples, split_dir) + + if config["split"] == "train": + annot_dir = os.path.join(eth3d_dir, "two_view_training_gt") + self._create_annotation_folder(num_examples, annot_dir) + + return num_examples + + def test_training_test_splits(self): + with self.create_dataset(split="train") as (dataset, _): + assert dataset._images and len(dataset._images) == len(dataset._disparities), "Training images do not match with training disparities" + for _, _, disparity, valid_mask in dataset: + assert len(disparity.shape) == 3 + assert len(valid_mask.shape) == 2 + _, dh, dw = disparity.shape + mh, mw = valid_mask.shape + assert dh == mh + assert dw == mw + + with self.create_dataset(split="test") as (dataset, _): + assert all(d == ("", "") for d in dataset._disparities) + for _, _, disparity, valid_mask in dataset: + assert disparity is None + assert valid_mask is None + + def test_bad_input(self): + with pytest.raises(ValueError, match="Unknown value 'bad' for argument split"): + with self.create_dataset(split="bad"): + pass + + +class CREStereoTestCase(datasets_utils.ImageDatasetTestCase): + DATASET_CLASS = datasets.CREStereo + ADDITIONAL_CONFIGS = datasets_utils.combinations_grid(split=("tree", "shapenet", "reflective", "hole")) + FEATURE_TYPES = (PIL.Image.Image, PIL.Image.Image, (np.ndarray, type(None)), (np.ndarray, type(None))) + + def inject_fake_data(self, tmpdir, config): + crestereo_dir = pathlib.Path(tmpdir) / "CREStereo" + os.makedirs(crestereo_dir, exist_ok=True) + + split_dir = crestereo_dir / config["split"] + os.makedirs(split_dir, exist_ok=True) + + num_examples = {"tree": 2, "shapenet": 3, "reflective": 6, "hole": 5}.get(config["split"], 0) + + for idx in range(num_examples): + datasets_utils.create_image_file(root=split_dir, name=f"{idx}_left.jpg", size=(100, 100)) + datasets_utils.create_image_file(root=split_dir, name=f"{idx}_right.jpg", size=(100, 100)) + # these are going to end up being gray scale images + datasets_utils.create_image_file(root=split_dir, name=f"{idx}_left.disp.jpg", size=(1, 100, 100)) + datasets_utils.create_image_file(root=split_dir, name=f"{idx}_right.disp.jpg", size=(1, 100, 100)) + + return num_examples + + def test_splits(self): + for split in ("tree", "shapenet", "reflective", "hole"): + with 
self.create_dataset(split=split) as (dataset, _): + for left, right, disparity, valid_mask in dataset: + left_array = np.array(left) + right_array = np.array(right) + h, w, c = left_array.shape + # check that left and right are the same size + assert left_array.shape == right_array.shape + # check general shapes + assert c == 3 + assert len(disparity.shape) == 3 + assert len(valid_mask.shape) == 2 + assert disparity.shape == (1, h, w) + # check that valid mask is the same size as the disparity + _, dh, dw = disparity.shape + mh, mw = valid_mask.shape + assert dh == mh + assert dw == mw + + def test_bad_input(self): + with pytest.raises(ValueError, match="Unknown value 'bad' for argument split"): + with self.create_dataset(split="bad"): + pass + + +class StereoMiddlebury2014TestCase(datasets_utils.ImageDatasetTestCase): + DATASET_CLASS = datasets.StereoMiddlebury2014 + ADDITIONAL_CONFIGS = datasets_utils.combinations_grid( + split=("train", "additional"), + calibration=("perfect", "imperfect", "both"), + use_ambient_views=(True, False), + ) + FEATURE_TYPES = (PIL.Image.Image, PIL.Image.Image, (np.ndarray, type(None)), (np.ndarray, type(None))) + + @staticmethod + def _make_scene_folder(root_dir: str, scene_name: str, split: str) -> List[str]: + calibrations = [""] if split == "test" else ["-perfect", "-imperfect"] + scene_dirs = [] + for c in calibrations: + scene_dir = os.path.join(root_dir, f"{scene_name}{c}") + os.makedirs(scene_dir, exist_ok=True) + # make normal images first + datasets_utils.create_image_file(root=scene_dir, name=f"im0.png", size=(3, 100, 100)) + datasets_utils.create_image_file(root=scene_dir, name=f"im1.png", size=(3, 100, 100)) + datasets_utils.create_image_file(root=scene_dir, name=f"im1E.png", size=(3, 100, 100)) + datasets_utils.create_image_file(root=scene_dir, name=f"im1L.png", size=(3, 100, 100)) + # these are going to end up being gray scale images + datasets_utils.make_fake_pfm_file(h=100, w=100, file_name=os.path.join(scene_dir, "disp0.pfm")) + datasets_utils.make_fake_pfm_file(h=100, w=100, file_name=os.path.join(scene_dir, "disp1.pfm")) + scene_dirs.append(scene_dir) + return scene_dirs + + def inject_fake_data(self, tmpdir, config): + split_scene_map = { + "train": ["Adirondack", "Jadeplant", "Motorcycle", "Piano"], + "additional": ["Backpack", "Bicycle1", "Cable", "Classroom1"], + "test": ["Plants", "Classroom2E", "Classroom2", "Australia"] + } + + middlebury_dir = pathlib.Path(tmpdir, "Middlebury2014") + os.makedirs(middlebury_dir, exist_ok=True) + + split_dir = middlebury_dir / config["split"] + os.makedirs(split_dir, exist_ok=True) + + num_examples = {"train": 2, "additional": 3, "test": 4}.get(config["split"], 0) + for idx in range(num_examples): + scene_name = split_scene_map[config["split"]][idx] + self._make_scene_folder(root_dir=split_dir, scene_name=scene_name, split=config["split"]) + + if config["calibration"] == "both": + num_examples *= 2 + return num_examples + + def test_train_splits(self): + for split, calibration in itertools.product(["train", "additional"], ["perfect", "imperfect", "both"]): + with self.create_dataset(split=split, calibration=calibration) as (dataset, _): + for left, right, disparity, valid_mask in dataset: + left_array = np.array(left) + right_array = np.array(right) + h, w, c = left_array.shape + # check that left and right are the same size + assert left_array.shape == right_array.shape + # check general shapes + assert c == 3 + assert len(disparity.shape) == 3 + assert len(valid_mask.shape) == 2 + assert 
disparity.shape == (1, h, w)
+                    # check that the valid mask is the same size as the disparity
+                    _, dh, dw = disparity.shape
+                    mh, mw = valid_mask.shape
+                    assert dh == mh
+                    assert dw == mw
+
+    def test_test_split(self):
+        for split in ["test"]:
+            with self.create_dataset(split=split, calibration=None) as (dataset, _):
+                for left, right, disparity, valid_mask in dataset:
+                    left_array = np.array(left)
+                    right_array = np.array(right)
+                    h, w, c = left_array.shape
+                    # check that left and right are the same size
+                    assert left_array.shape == right_array.shape
+                    # check general shapes
+                    assert c == 3
+                    assert disparity is None
+                    assert valid_mask is None
+
+    def test_augmented_view_usage(self):
+        with self.create_dataset(split="train", use_ambient_views=True) as (dataset, _):
+            for left, right, _, _ in dataset:
+                left_array = np.array(left)
+                right_array = np.array(right)
+                # check that left and right are the same size
+                assert left_array.shape == right_array.shape
+
+    def test_warnings_train(self):
+        # requesting a train split without a calibration setting should warn
+        split = "train"
+        calibration = None
+        with pytest.warns(
+            RuntimeWarning,
+            match=f"\nSplit '{split}' has calibration settings, however None was provided as an argument."
+            f"\nSetting calibration to 'perfect' for split '{split}'. Available calibration settings are: 'perfect', 'imperfect', 'both'.",
+        ):
+            with self.create_dataset(split=split, calibration=calibration):
+                pass
+
+    def test_warnings_test(self):
+        # requesting the test split with a calibration setting should warn
+        split = "test"
+        calibration = "perfect"
+        with pytest.warns(
+            RuntimeWarning, match="\nSplit 'test' has no calibration settings, ignoring calibration argument."
+        ):
+            with self.create_dataset(split=split, calibration=calibration):
+                pass
+
+    def test_bad_input(self):
+        with pytest.raises(ValueError, match="Unknown value 'bad' for argument split"):
+            with self.create_dataset(split="bad"):
+                pass
+
+
+class StereoKitti2012TestCase(datasets_utils.ImageDatasetTestCase):
+    DATASET_CLASS = datasets.StereoKitti2012
+    ADDITIONAL_CONFIGS = datasets_utils.combinations_grid(split=("train", "test"))
+    FEATURE_TYPES = (PIL.Image.Image, PIL.Image.Image, (np.ndarray, type(None)), (np.ndarray, type(None)))
+
+    def inject_fake_data(self, tmpdir, config):
+        kitti_dir = pathlib.Path(tmpdir) / "Kitti2012"
+        os.makedirs(kitti_dir, exist_ok=True)
+
+        split_dir = kitti_dir / (config["split"] + "ing")
+        os.makedirs(split_dir, exist_ok=True)
+
+        num_examples = {"train": 4, "test": 3}.get(config["split"], 0)
+
+        datasets_utils.create_image_folder(
+            root=split_dir,
+            name="colored_0",
+            file_name_fn=lambda i: f"{i:06d}_10.png",
+            num_examples=num_examples,
+            size=(3, 100, 200),
+        )
+        datasets_utils.create_image_folder(
+            root=split_dir,
+            name="colored_1",
+            file_name_fn=lambda i: f"{i:06d}_10.png",
+            num_examples=num_examples,
+            size=(3, 100, 200),
+        )
+
+        if config["split"] == "train":
+            datasets_utils.create_image_folder(
+                root=split_dir,
+                name="disp_noc",
+                file_name_fn=lambda i: f"{i:06d}.png",
+                num_examples=num_examples,
+                # Kitti2012 uses a single channel image for disparities
+                size=(1, 100, 200),
+            )
+
+        return num_examples
+
+    def test_train_splits(self):
+        for split in ["train"]:
+            with self.create_dataset(split=split) as (dataset, _):
+                for left, right, disparity, valid_mask in dataset:
+                    left_array = np.array(left)
+                    right_array = np.array(right)
+                    h, w, c = left_array.shape
+                    # check that left and right are the same size
+                    assert left_array.shape == right_array.shape
+                    # check general shapes
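The split tests in this patch, including the assertions that resume directly below this aside, all restate one contract: equally sized RGB inputs, a (1, H, W) disparity and an (H, W) valid mask. A consolidated checker (hypothetical, not part of the patch) could read:

```python
import numpy as np

def assert_valid_stereo_sample(left, right, disparity, valid_mask):
    # The shared shape contract of the training splits: equally sized RGB
    # images, disparity of shape (1, H, W), boolean mask of shape (H, W).
    left_array, right_array = np.array(left), np.array(right)
    assert left_array.shape == right_array.shape
    h, w, c = left_array.shape
    assert c == 3
    assert disparity.shape == (1, h, w)
    assert valid_mask.shape == (h, w)
```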
assert c == 3 + assert len(disparity.shape) == 3 + assert len(valid_mask.shape) == 2 + assert disparity.shape == (1, h, w) + # check that valid mask is the same size as the disparity + _, dh, dw = disparity.shape + mh, mw = valid_mask.shape + assert dh == mh + assert dw == mw + + def test_test_split(self): + for split in ["test"]: + with self.create_dataset(split=split) as (dataset, _): + for left, right, disparity, valid_mask in dataset: + left_array = np.array(left) + right_array = np.array(right) + # check that left and right are the same size + assert left_array.shape == right_array.shape + assert disparity is None + assert valid_mask is None - return len(image_ids_in_config) + def test_bad_input(self): + with pytest.raises(ValueError, match="Unknown value 'bad' for argument split"): + with self.create_dataset(split="bad"): + pass -class FER2013TestCase(datasets_utils.ImageDatasetTestCase): - DATASET_CLASS = datasets.FER2013 +class StereoKitti2015TestCase(datasets_utils.ImageDatasetTestCase): + DATASET_CLASS = datasets.StereoKitti2015 ADDITIONAL_CONFIGS = datasets_utils.combinations_grid(split=("train", "test")) - - FEATURE_TYPES = (PIL.Image.Image, (int, type(None))) + FEATURE_TYPES = (PIL.Image.Image, PIL.Image.Image, (np.ndarray, type(None)), (np.ndarray, type(None))) def inject_fake_data(self, tmpdir, config): - base_folder = os.path.join(tmpdir, "fer2013") - os.makedirs(base_folder) - - num_samples = 5 - with open(os.path.join(base_folder, f"{config['split']}.csv"), "w", newline="") as file: - writer = csv.DictWriter( - file, - fieldnames=("emotion", "pixels") if config["split"] == "train" else ("pixels",), - quoting=csv.QUOTE_NONNUMERIC, - quotechar='"', - ) - writer.writeheader() - for _ in range(num_samples): - row = dict( - pixels=" ".join( - str(pixel) for pixel in datasets_utils.create_image_or_video_tensor((48, 48)).view(-1).tolist() - ) - ) - if config["split"] == "train": - row["emotion"] = str(int(torch.randint(0, 7, ()))) - - writer.writerow(row) - - return num_samples - + kitti_dir = pathlib.Path(tmpdir) / "Kitti2015" + os.makedirs(kitti_dir, exist_ok=True) -class GTSRBTestCase(datasets_utils.ImageDatasetTestCase): - DATASET_CLASS = datasets.GTSRB - FEATURE_TYPES = (PIL.Image.Image, int) + split_dir = kitti_dir / (config["split"] + "ing") + os.makedirs(split_dir, exist_ok=True) - ADDITIONAL_CONFIGS = datasets_utils.combinations_grid(split=("train", "test")) + num_examples = {"train": 4, "test": 6}.get(config["split"], 0) - def inject_fake_data(self, tmpdir: str, config): - root_folder = os.path.join(tmpdir, "gtsrb") - os.makedirs(root_folder, exist_ok=True) + datasets_utils.create_image_folder( + root=split_dir, + name="image_2", + file_name_fn=lambda i: f"{i:06d}_10.png", + num_examples=num_examples, + size=(3, 100, 200), + ) + datasets_utils.create_image_folder( + root=split_dir, + name="image_3", + file_name_fn=lambda i: f"{i:06d}_10.png", + num_examples=num_examples, + size=(3, 100, 200), + ) - # Train data - train_folder = os.path.join(root_folder, "GTSRB", "Training") - os.makedirs(train_folder, exist_ok=True) + if config["split"] == "train": + datasets_utils.create_image_folder( + root=split_dir, + name="disp_occ_0", + file_name_fn=lambda i: f"{i:06d}.png", + num_examples=num_examples, + # Kitti2015 uses a single channel image for disparities + size=(1, 100, 200), + ) - num_examples = 3 if config["split"] == "train" else 4 - classes = ("00000", "00042", "00012") - for class_idx in classes: datasets_utils.create_image_folder( - train_folder, - name=class_idx, - 
file_name_fn=lambda image_idx: f"{class_idx}_{image_idx:05d}.ppm", + root=split_dir, + name="disp_occ_1", + file_name_fn=lambda i: f"{i:06d}.png", num_examples=num_examples, + # Kitti2015 uses a single channel image for disparities + size=(1, 100, 200), ) - total_number_of_examples = num_examples * len(classes) - # Test data - test_folder = os.path.join(root_folder, "GTSRB", "Final_Test", "Images") - os.makedirs(test_folder, exist_ok=True) + return num_examples - with open(os.path.join(root_folder, "GT-final_test.csv"), "w") as csv_file: - csv_file.write("Filename;Width;Height;Roi.X1;Roi.Y1;Roi.X2;Roi.Y2;ClassId\n") + def test_train_splits(self): + for split in ["train"]: + with self.create_dataset(split=split) as (dataset, _): + for left, right, disparity, valid_mask in dataset: + left_array = np.array(left) + right_array = np.array(right) + h, w, c = left_array.shape + # check that left and right are the same size + assert left_array.shape == right_array.shape + # check general shapes + assert c == 3 + assert len(disparity.shape) == 3 + assert len(valid_mask.shape) == 2 + assert disparity.shape == (1, h, w) + # check that valid mask is the same size as the disparity + _, dh, dw = disparity.shape + mh, mw = valid_mask.shape + assert dh == mh + assert dw == mw - for _ in range(total_number_of_examples): - image_file = datasets_utils.create_random_string(5, string.digits) + ".ppm" - datasets_utils.create_image_file(test_folder, image_file) - row = [ - image_file, - torch.randint(1, 100, size=()).item(), - torch.randint(1, 100, size=()).item(), - torch.randint(1, 100, size=()).item(), - torch.randint(1, 100, size=()).item(), - torch.randint(1, 100, size=()).item(), - torch.randint(1, 100, size=()).item(), - torch.randint(0, 43, size=()).item(), - ] - csv_file.write(";".join(map(str, row)) + "\n") + def test_test_split(self): + for split in ["test"]: + with self.create_dataset(split=split) as (dataset, _): + for left, right, disparity, valid_mask in dataset: + left_array = np.array(left) + right_array = np.array(right) + # check that left and right are the same size + assert left_array.shape == right_array.shape + # check general shapes + assert disparity is None + assert valid_mask is None - return total_number_of_examples + def test_bad_input(self): + with pytest.raises(ValueError, match="Unknown value 'bad' for argument split"): + with self.create_dataset(split="bad"): + pass -class CLEVRClassificationTestCase(datasets_utils.ImageDatasetTestCase): - DATASET_CLASS = datasets.CLEVRClassification - FEATURE_TYPES = (PIL.Image.Image, (int, type(None))) +class StereoSceneFlowTestCase(datasets_utils.ImageDatasetTestCase): + DATASET_CLASS = datasets.StereoSceneFlow + ADDITIONAL_CONFIGS = datasets_utils.combinations_grid( + split=("FlyingThings3D", "Driving", "Monkaa"), + pass_name=("clean", "final") + ) + FEATURE_TYPES = (PIL.Image.Image, PIL.Image.Image, (np.ndarray, type(None)), (np.ndarray, type(None))) - ADDITIONAL_CONFIGS = datasets_utils.combinations_grid(split=("train", "val", "test")) + @staticmethod + def _create_pfm_folder(root: str, name: str, file_name_fn: Callable[..., str], num_examples: int, size: Tuple[int, int]) -> List[str]: + root = pathlib.Path(root) / name + os.makedirs(root, exist_ok=True) - def inject_fake_data(self, tmpdir, config): - data_folder = pathlib.Path(tmpdir) / "clevr" / "CLEVR_v1.0" + paths = [] + for i in range(num_examples): + datasets_utils.make_fake_pfm_file(size[0], size[1], root / file_name_fn(i)) + paths.append(str(root / file_name_fn(i))) + return paths - 
images_folder = data_folder / "images" - image_files = datasets_utils.create_image_folder( - images_folder, config["split"], lambda idx: f"CLEVR_{config['split']}_{idx:06d}.png", num_examples=5 - ) + def inject_fake_data(self, tmpdir, config): + scene_flow_dir = pathlib.Path(tmpdir) / "SceneFlow" + os.makedirs(scene_flow_dir, exist_ok=True) - scenes_folder = data_folder / "scenes" - scenes_folder.mkdir() - if config["split"] != "test": - with open(scenes_folder / f"CLEVR_{config['split']}_scenes.json", "w") as file: - json.dump( - dict( - info=dict(), - scenes=[ - dict(image_filename=image_file.name, objects=[dict()] * int(torch.randint(10, ()))) - for image_file in image_files - ], - ), - file, - ) + split_dir = scene_flow_dir / config["split"] + os.makedirs(split_dir, exist_ok=True) - return len(image_files) + pass_dir_map = { + "clean": "frames_cleanpass", + "final": "frames_finalpass", + } + num_examples = 1 + pass_dir_name = pass_dir_map.get(config["pass_name"], None) -class OxfordIIITPetTestCase(datasets_utils.ImageDatasetTestCase): - DATASET_CLASS = datasets.OxfordIIITPet - FEATURE_TYPES = (PIL.Image.Image, (int, PIL.Image.Image, tuple, type(None))) + # create pass directories + pass_dir = split_dir / pass_dir_name + disp_dir = split_dir / "disparity" + os.makedirs(pass_dir, exist_ok=True) + os.makedirs(disp_dir, exist_ok=True) - ADDITIONAL_CONFIGS = datasets_utils.combinations_grid( - split=("trainval", "test"), - target_types=("category", "segmentation", ["category", "segmentation"], []), - ) + num_examples = {"FlyingThings3D": 4, "Driving": 6, "Monkaa": 5}.get(config["split"], 0) - def inject_fake_data(self, tmpdir, config): - base_folder = os.path.join(tmpdir, "oxford-iiit-pet") + for direction in ["left", "right"]: + for scene_idx in range(num_examples): + os.makedirs(pass_dir / f"scene_{scene_idx:06d}", exist_ok=True) + datasets_utils.create_image_folder( + root=pass_dir / f"scene_{scene_idx:06d}", + name=direction, + file_name_fn=lambda i: f"{i:06d}.png", + num_examples=1, + size=(3, 200, 100), + ) - classification_anns_meta = ( - dict(cls="Abyssinian", label=0, species="cat"), - dict(cls="Keeshond", label=18, species="dog"), - dict(cls="Yorkshire Terrier", label=37, species="dog"), - ) - split_and_classification_anns = [ - self._meta_to_split_and_classification_ann(meta, idx) - for meta, idx in itertools.product(classification_anns_meta, (1, 2, 10)) - ] - image_ids, *_ = zip(*split_and_classification_anns) + os.makedirs(disp_dir / f"scene_{scene_idx:06d}", exist_ok=True) + self._create_pfm_folder( + root=disp_dir / f"scene_{scene_idx:06d}", + name=direction, + file_name_fn=lambda i: f"{i:06d}.pfm", + num_examples=1, + size=(100, 200), + ) - image_files = datasets_utils.create_image_folder( - base_folder, "images", file_name_fn=lambda idx: f"{image_ids[idx]}.jpg", num_examples=len(image_ids) - ) + return num_examples - anns_folder = os.path.join(base_folder, "annotations") - os.makedirs(anns_folder) - split_and_classification_anns_in_split = random.choices(split_and_classification_anns, k=len(image_ids) // 2) - with open(os.path.join(anns_folder, f"{config['split']}.txt"), "w", newline="") as file: - writer = csv.writer(file, delimiter=" ") - for split_and_classification_ann in split_and_classification_anns_in_split: - writer.writerow(split_and_classification_ann) + def test_splits(self): + for split_name, pass_name in itertools.product(["FlyingThings3D", "Driving", "Monkaa"], ["clean", "final"]): + with self.create_dataset(split=split_name, pass_name=pass_name) as (dataset, 
_): + for left, right, disparity, valid_mask in dataset: + left_array = np.array(left) + right_array = np.array(right) + h, w, c = left_array.shape + # check that left and right are the same size + assert left_array.shape == right_array.shape + # check general shapes + assert c == 3 + assert len(disparity.shape) == 3 + assert len(valid_mask.shape) == 2 + assert disparity.shape == (1, h, w) + # check that valid mask is the same size as the disparity + _, dh, dw = disparity.shape + mh, mw = valid_mask.shape + assert dh == mh + assert dw == mw - segmentation_files = datasets_utils.create_image_folder( - anns_folder, "trimaps", file_name_fn=lambda idx: f"{image_ids[idx]}.png", num_examples=len(image_ids) - ) + def test_bad_input(self): + with pytest.raises(ValueError, match="Unknown value 'bad' for argument split"): + with self.create_dataset(split="bad"): + pass - # The dataset has some rogue files - for path in image_files[:2]: - path.with_suffix(".mat").touch() - for path in segmentation_files: - path.with_name(f".{path.name}").touch() - return len(split_and_classification_anns_in_split) +class StereoFallingThingsTestCase(datasets_utils.ImageDatasetTestCase): + DATASET_CLASS = datasets.StereoFallingThings + ADDITIONAL_CONFIGS = datasets_utils.combinations_grid(split=("single", "mixed")) + FEATURE_TYPES = (PIL.Image.Image, PIL.Image.Image, (np.ndarray, type(None)), (np.ndarray, type(None))) - def _meta_to_split_and_classification_ann(self, meta, idx): - image_id = "_".join( - [ - *[(str.title if meta["species"] == "cat" else str.lower)(part) for part in meta["cls"].split()], - str(idx), - ] - ) - class_id = str(meta["label"] + 1) - species = "1" if meta["species"] == "cat" else "2" - breed_id = "-1" - return (image_id, class_id, species, breed_id) + @staticmethod + def _make_dummy_depth_map(root: str, name: str, size: Tuple[int, int]): + file = pathlib.Path(root) / name + image = np.ones((size[0], size[1]), dtype=np.uint8) + PIL.Image.fromarray(image).save(file) + @staticmethod + def _make_scene_folder(root: str, scene_name: str, size: Tuple[int, int]) -> List[str]: + paths = [] + root = pathlib.Path(root) / scene_name + os.makedirs(root, exist_ok=True) + # jpg images + paths.append(datasets_utils.create_image_file(root, "image1.left.jpg", size=(3, size[1], size[0]))) + paths.append(datasets_utils.create_image_file(root, "image1.right.jpg", size=(3, size[1], size[0]))) + # single channel depth maps + paths.append(StereoFallingThingsTestCase._make_dummy_depth_map(root, "image1.left.depth.png", size=(size[0], size[1]))) + paths.append(StereoFallingThingsTestCase._make_dummy_depth_map(root, "image1.right.depth.png", size=(size[0], size[1]))) + # camera settings json. 
Minimal example for _read_disparity function testing + settings_json = {"camera_settings": [{"intrinsic_settings": {"fx": 1}}]} + with open(root / "_camera_settings.json", "w") as f: + json.dump(settings_json, f) -class StanfordCarsTestCase(datasets_utils.ImageDatasetTestCase): - DATASET_CLASS = datasets.StanfordCars - REQUIRED_PACKAGES = ("scipy",) - ADDITIONAL_CONFIGS = datasets_utils.combinations_grid(split=("train", "test")) + return paths def inject_fake_data(self, tmpdir, config): - import scipy.io as io - from numpy.core.records import fromarrays + fallingthings_dir = pathlib.Path(tmpdir) / "FallingThings" + os.makedirs(fallingthings_dir, exist_ok=True) - num_examples = {"train": 5, "test": 7}[config["split"]] - num_classes = 3 - base_folder = pathlib.Path(tmpdir) / "stanford_cars" + split_dir = pathlib.Path(fallingthings_dir) / config["split"] + os.makedirs(split_dir, exist_ok=True) - devkit = base_folder / "devkit" - devkit.mkdir(parents=True) + num_examples = {"single": 2, "mixed": 3}.get(config["split"], 0) - if config["split"] == "train": - images_folder_name = "cars_train" - annotations_mat_path = devkit / "cars_train_annos.mat" - else: - images_folder_name = "cars_test" - annotations_mat_path = base_folder / "cars_test_annos_withlabels.mat" + for i in range(num_examples): + self._make_scene_folder( + root=split_dir, + scene_name=f"scene_{i:06d}", + size=(100, 200), + ) - datasets_utils.create_image_folder( - root=base_folder, - name=images_folder_name, - file_name_fn=lambda image_index: f"{image_index:5d}.jpg", - num_examples=num_examples, - ) + return num_examples - classes = np.random.randint(1, num_classes + 1, num_examples, dtype=np.uint8) - fnames = [f"{i:5d}.jpg" for i in range(num_examples)] - rec_array = fromarrays( - [classes, fnames], - names=["class", "fname"], - ) - io.savemat(annotations_mat_path, {"annotations": rec_array}) + def test_splits(self): + for split_name in ["single", "mixed"]: + with self.create_dataset(split=split_name) as (dataset, _): + for left, right, disparity, valid_mask in dataset: + left_array = np.array(left) + right_array = np.array(right) + h, w, c = left_array.shape + # check that left and right are the same size + assert left_array.shape == right_array.shape + # check general shapes + assert c == 3 + assert len(disparity.shape) == 3 + assert len(valid_mask.shape) == 2 + assert disparity.shape == (1, h, w) + # check that valid mask is the same size as the disparity + _, dh, dw = disparity.shape + mh, mw = valid_mask.shape + assert dh == mh + assert dw == mw - random_class_names = ["random_name"] * num_classes - io.savemat(devkit / "cars_meta.mat", {"class_names": random_class_names}) + def test_bad_input(self): + with pytest.raises(ValueError, match="Unknown value 'bad' for argument split"): + with self.create_dataset(split="bad"): + pass - return num_examples +class StereoSintelTestCase(datasets_utils.ImageDatasetTestCase): + DATASET_CLASS = datasets.StereoSintel + FEATURE_TYPES = (PIL.Image.Image, PIL.Image.Image, (np.ndarray, type(None)), (np.ndarray, type(None))) -class Country211TestCase(datasets_utils.ImageDatasetTestCase): - DATASET_CLASS = datasets.Country211 + def inject_fake_data(self, tmpdir, config): + sintel_dir = pathlib.Path(tmpdir) / "Sintel" + os.makedirs(sintel_dir, exist_ok=True) - ADDITIONAL_CONFIGS = datasets_utils.combinations_grid(split=("train", "valid", "test")) + split_dir = pathlib.Path(sintel_dir) / "training" + os.makedirs(split_dir, exist_ok=True) - def inject_fake_data(self, tmpdir: str, config): - 
split_folder = pathlib.Path(tmpdir) / "country211" / config["split"] - split_folder.mkdir(parents=True, exist_ok=True) + # a single setting, since there are no splits + num_examples = 4 - num_examples = { - "train": 3, - "valid": 4, - "test": 5, - }[config["split"]] + for view in ["final_left", "final_right"]: + root = split_dir / view + os.makedirs(root, exist_ok=True) - classes = ("AD", "BS", "GR") - for cls in classes: datasets_utils.create_image_folder( - split_folder, - name=cls, - file_name_fn=lambda idx: f"{idx}.jpg", + root=root, + name="scene1", + file_name_fn=lambda i: f"{i:06d}.png", num_examples=num_examples, + size=(3, 100, 200), ) - return num_examples * len(classes) - - -class Flowers102TestCase(datasets_utils.ImageDatasetTestCase): - DATASET_CLASS = datasets.Flowers102 - - ADDITIONAL_CONFIGS = datasets_utils.combinations_grid(split=("train", "val", "test")) - REQUIRED_PACKAGES = ("scipy",) - - def inject_fake_data(self, tmpdir: str, config): - base_folder = pathlib.Path(tmpdir) / "flowers-102" - - num_classes = 3 - num_images_per_split = dict(train=5, val=4, test=3) - num_images_total = sum(num_images_per_split.values()) datasets_utils.create_image_folder( - base_folder, - "jpg", - file_name_fn=lambda idx: f"image_{idx + 1:05d}.jpg", - num_examples=num_images_total, + root=split_dir / "occlusions", + name="scene1", + file_name_fn=lambda i: f"{i:06d}.png", + num_examples=num_examples, + size=(1, 100, 200), ) - label_dict = dict( - labels=np.random.randint(1, num_classes + 1, size=(1, num_images_total), dtype=np.uint8), + datasets_utils.create_image_folder( + root=split_dir / "outofframe", + name="scene1", + file_name_fn=lambda i: f"{i:06d}.png", + num_examples=num_examples, + size=(1, 100, 200), ) - datasets_utils.lazy_importer.scipy.io.savemat(str(base_folder / "imagelabels.mat"), label_dict) - setid_mat = np.arange(1, num_images_total + 1, dtype=np.uint16) - np.random.shuffle(setid_mat) - setid_dict = dict( - trnid=setid_mat[: num_images_per_split["train"]].reshape(1, -1), - valid=setid_mat[num_images_per_split["train"] : -num_images_per_split["test"]].reshape(1, -1), - tstid=setid_mat[-num_images_per_split["test"] :].reshape(1, -1), + datasets_utils.create_image_folder( + root=split_dir / "disparities", + name="scene1", + file_name_fn=lambda i: f"{i:06d}.png", + num_examples=num_examples, + size=(3, 100, 200), ) - datasets_utils.lazy_importer.scipy.io.savemat(str(base_folder / "setid.mat"), setid_dict) - - return num_images_per_split[config["split"]] + return num_examples -class PCAMTestCase(datasets_utils.ImageDatasetTestCase): - DATASET_CLASS = datasets.PCAM + def test_splits(self): + with self.create_dataset() as (dataset, _): + for left, right, disparity, valid_mask in dataset: + left_array = np.array(left) + right_array = np.array(right) + h, w, c = left_array.shape + # check that left and right are the same size + assert left_array.shape == right_array.shape + # check general shapes + assert c == 3 + assert len(disparity.shape) == 3 + assert len(valid_mask.shape) == 2 + assert disparity.shape == (1, h, w) + # check that valid mask is the same size as the disparity + _, dh, dw = disparity.shape + mh, mw = valid_mask.shape + assert dh == mh + assert dw == mw - ADDITIONAL_CONFIGS = datasets_utils.combinations_grid(split=("train", "val", "test")) - REQUIRED_PACKAGES = ("h5py",) - def inject_fake_data(self, tmpdir: str, config): - base_folder = pathlib.Path(tmpdir) / "pcam" - base_folder.mkdir() +class InStereo2k(datasets_utils.ImageDatasetTestCase): + DATASET_CLASS = 
datasets.InStereo2k + FEATURE_TYPES = (PIL.Image.Image, PIL.Image.Image, (np.ndarray, type(None)), (np.ndarray, type(None))) + ADDITIONAL_CONFIGS = datasets_utils.combinations_grid(split=("train", "test")) - num_images = {"train": 2, "test": 3, "val": 4}[config["split"]] + @staticmethod + def _make_scene_folder(root: str, name: str, size: Tuple[int, int]): + root = pathlib.Path(root) / name + os.makedirs(root, exist_ok=True) - images_file = datasets.PCAM._FILES[config["split"]]["images"][0] - with datasets_utils.lazy_importer.h5py.File(str(base_folder / images_file), "w") as f: - f["x"] = np.random.randint(0, 256, size=(num_images, 10, 10, 3), dtype=np.uint8) + datasets_utils.create_image_file(root=root, name="left.png", size=(3, size[0], size[1])) + datasets_utils.create_image_file(root=root, name="right.png", size=(3, size[0], size[1])) + datasets_utils.create_image_file(root=root, name="left_disp.png", size=(1, size[0], size[1])) + datasets_utils.create_image_file(root=root, name="right_disp.png", size=(1, size[0], size[1])) - targets_file = datasets.PCAM._FILES[config["split"]]["targets"][0] - with datasets_utils.lazy_importer.h5py.File(str(base_folder / targets_file), "w") as f: - f["y"] = np.random.randint(0, 2, size=(num_images, 1, 1, 1), dtype=np.uint8) + def inject_fake_data(self, tmpdir, config): + in_stereo_dir = pathlib.Path(tmpdir) / "InStereo2k" + os.makedirs(in_stereo_dir, exist_ok=True) - return num_images + split_dir = pathlib.Path(in_stereo_dir) / config["split"] + os.makedirs(split_dir, exist_ok=True) + num_examples = {"train": 4, "test": 5}.get(config["split"], 0) -class RenderedSST2TestCase(datasets_utils.ImageDatasetTestCase): - DATASET_CLASS = datasets.RenderedSST2 - ADDITIONAL_CONFIGS = datasets_utils.combinations_grid(split=("train", "val", "test")) - SPLIT_TO_FOLDER = {"train": "train", "val": "valid", "test": "test"} + for i in range(num_examples): + self._make_scene_folder(split_dir, f"scene_{i:06d}", (100, 200)) - def inject_fake_data(self, tmpdir: str, config): - root_folder = pathlib.Path(tmpdir) / "rendered-sst2" - image_folder = root_folder / self.SPLIT_TO_FOLDER[config["split"]] + return num_examples - num_images_per_class = {"train": 5, "test": 6, "val": 7} - sampled_classes = ["positive", "negative"] - for cls in sampled_classes: - datasets_utils.create_image_folder( - image_folder, - cls, - file_name_fn=lambda idx: f"{idx}.png", - num_examples=num_images_per_class[config["split"]], - ) + def test_splits(self): + for split_name in ["train", "test"]: + with self.create_dataset(split=split_name) as (dataset, _): + for left, right, disparity, valid_mask in dataset: + left_array = np.array(left) + right_array = np.array(right) + h, w, c = left_array.shape + # check that left and right are the same size + assert left_array.shape == right_array.shape + # check general shapes + assert c == 3 + assert len(disparity.shape) == 3 + assert len(valid_mask.shape) == 2 + assert disparity.shape == (1, h, w) + # check that valid mask is the same size as the disparity + _, dh, dw = disparity.shape + mh, mw = valid_mask.shape + assert dh == mh + assert dw == mw - return len(sampled_classes) * num_images_per_class[config["split"]] + def test_bad_input(self): + with pytest.raises(ValueError, match="Unknown value 'bad' for argument split"): + with self.create_dataset(split="bad"): + pass if __name__ == "__main__": diff --git a/torchvision/datasets/__init__.py b/torchvision/datasets/__init__.py index a7dd8397bab..8b38ba73a85 100644 --- a/torchvision/datasets/__init__.py +++ 
b/torchvision/datasets/__init__.py @@ -1,5 +1,5 @@ from ._optical_flow import KittiFlow, Sintel, FlyingChairs, FlyingThings3D, HD1K -from ._stereo_matching import StereoETH3D, StereoFallingThings, StereoKitti2012, StereoKitti2015, StereoMiddlebury2014, StereoSceneFlow, StereoSintel, CREStereoSynthetic +from ._stereo_matching import StereoETH3D, StereoFallingThings, StereoKitti2012, StereoKitti2015, StereoMiddlebury2014, StereoSceneFlow, StereoSintel, CREStereo, InStereo2k from .caltech import Caltech101, Caltech256 from .celeba import CelebA from .cifar import CIFAR10, CIFAR100 @@ -106,4 +106,13 @@ "FGVCAircraft", "EuroSAT", "RenderedSST2", + "StereoETH3D", + "StereoFallingThings", + "StereoKitti2012", + "StereoKitti2015", + "StereoMiddlebury2014", + "StereoSceneFlow", + "StereoSintel", + "CREStereo", + "InStereo2k", ) diff --git a/torchvision/datasets/_stereo_matching.py b/torchvision/datasets/_stereo_matching.py index 702386b05bd..4de0b5b0532 100644 --- a/torchvision/datasets/_stereo_matching.py +++ b/torchvision/datasets/_stereo_matching.py @@ -1,31 +1,30 @@ from abc import ABC, abstractmethod from glob import glob from pathlib import Path -import pathlib import random import re import shutil -from typing import Callable, List, Optional, Tuple, Any +from typing import Callable, List, Optional, Tuple import warnings from jsonschema import ValidationError from torch import Tensor from .vision import VisionDataset -from .utils import download_and_extract_archive, download_url, verify_str_arg +from .utils import download_and_extract_archive, verify_str_arg import os import numpy as np from PIL import Image import json __all__ = ( - "CREStereo" # waiting for download / need to find valid mask procedure + "CREStereo" "StereoMiddlebury2014" "StereoETH3D" "StereoKitti2012" "StereoKitti2015" "StereoSintel" - "StereoSceneFlow" # need to find valid mask procedure + "StereoSceneFlow" "StereoFallingThings" - "InStereo2k" # need to find valid mask procedure + "InStereo2k" ) @@ -54,13 +53,38 @@ def read_pfm_file(file_path: str) -> np.array: data = np.reshape(data, (height, width, channels)) data = np.flipud(data) - return data + # PFM files for disparity maps should contain only a single channel + # they should also be returned in (C, H, W) format + return np.transpose(data[:, :, :1], (2, 0, 1)) class StereoMatchingDataset(ABC, VisionDataset): """Base interface for Stereo matching datasets""" def __init__(self, root: str, transforms: Optional[Callable] = None): + """ + + Args: + root (str): Root directory of the dataset. + transforms (callable, optional): A function/transform that takes in Tuples of + (images, disparities, valid_masks) and returns a transformed version of each of them. + images is a Tuple of (``PIL.Image``, ``PIL.Image``) + disparities is a Tuple of (``np.ndarray``, ``np.ndarray``) with shape (1, H, W) + valid_masks is a Tuple of (``np.ndarray``, ``np.ndarray``) with shape (H, W) + + In some cases, when a dataset does not provide disparities, the ``disparities`` and + ``valid_masks`` can be Tuples containing None values.
+ + For training splits generally the datasets provide a minimal guarantee of + images: (``PIL.Image``, ``PIL.Image``) + disparities: (``np.ndarray``, ``None``) with shape (1, H, W) + valid_masks: (``np.ndarray``, ``None``) with shape (H, W) + + For some test splits, the datasets provide outputs that look like: + images: (``PIL.Image``, ``PIL.Image``) + disparities: (``None``, ``None``) + valid_masks: (``None``, ``None``) + """ super().__init__(root=root) self.transforms = transforms @@ -79,6 +103,18 @@ def _read_disparity(self, file_path: str) -> Tuple: pass def __getitem__(self, index: int) -> Tuple[Tensor, Tensor, Tensor, Tensor]: + """Return the example at the given index. + + Args: + index (int): The index of the example to retrieve + + Returns: + tuple: A 4-tuple with ``(img_left, img_right, disparity, valid_mask)`` where ``valid_mask`` + is a numpy boolean mask of shape (H, W) + indicating which disparity values are valid. The disparity is a numpy array of + shape (1, H, W) and the images are PIL images. ``disparity`` and ``valid_mask`` are None for + datasets for which the authors did not provide ``split="test"`` annotations. + """ img_left = self._read_img(self._images[index][0]) img_right = self._read_img(self._images[index][1]) @@ -98,21 +134,59 @@ def __len__(self) -> int: return len(self._images) -class CREStereoSynthetic(StereoMatchingDataset): +class CREStereo(StereoMatchingDataset): """Synthetic dataset used in training the `CREStereo `_ architecture. - Ported from the download script in the paper github `repo `_. - """ - DOWNLOAD_SPACE = 4 * 1024 * 1024 * 1024 # dataset requires download requires about 400 GB of free space + Dataset details on the official paper `repo `_. - EXPERIMENTAL_RANGE = 1 # TODO: remove after validating dataset structure / flow + The dataset is expected to have the following structure: :: - MAX_DISP = 256. + root + CREStereo + tree + img1_left.jpg + img1_right.jpg + img1_left.disp.jpg + img1_right.disp.jpg + img2_left.jpg + img2_right.jpg + img2_left.disp.jpg + img2_right.disp.jpg + ... + shapenet + img1_left.jpg + img1_right.jpg + img1_left.disp.jpg + img1_right.disp.jpg + ... + reflective + img1_left.jpg + img1_right.jpg + img1_left.disp.jpg + img1_right.disp.jpg + ... + hole + img1_left.jpg + img1_right.jpg + img1_left.disp.jpg + img1_right.disp.jpg + ... - def __init__(self, root: str, split: str = "tree", transforms: Optional[Callable] = None, download: bool = False): + Args: + root (str): Root directory of the dataset. + split (str): The split of the dataset to use. One of ``"tree"``, ``"shapenet"``, ``"reflective"``, ``"hole"`` + or ``"all"``. The ``"all"`` split contains all of the above splits. + transforms (callable, optional): A function/transform that takes in a sample and returns a transformed version. + download (bool, optional): If true, downloads the dataset from the internet and puts it in the root directory. + max_disparity (float, optional): Maximum disparity value. Used to compute the valid mask.
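To make the contract above concrete, here is a minimal sketch of a ``transforms`` callable that satisfies it. The callable name is illustrative and the fallback mask derivation (positive disparities below an assumed maximum) simply mirrors what the CREStereo loader does; it is not part of the patch.

    import numpy as np

    def passthrough_transform(images, disparities, valid_masks):
        # images: (PIL.Image, PIL.Image), returned unchanged in this sketch
        # disparities: tuple of (1, H, W) float arrays, or Nones on test splits
        # valid_masks: tuple of (H, W) bool arrays, or Nones
        new_masks = []
        for disparity, mask in zip(disparities, valid_masks):
            if disparity is None:
                # unannotated test splits: forward the None untouched
                new_masks.append(None)
            elif mask is None:
                # assumed fallback: positive disparities below 256 px are valid
                new_masks.append((disparity[0] > 0.0) & (disparity[0] < 256.0))
            else:
                new_masks.append(mask.astype(bool))
        return images, disparities, tuple(new_masks)

Any callable with this three-tuple signature can be passed as ``transforms=`` to the datasets in this file.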
+ """ + DOWNLOAD_SPACE = 400 * 1024 * 1024 * 1024 + + def __init__(self, root: str, split: str = "tree", transforms: Optional[Callable] = None, download: bool = False, max_disparity: float = 256.): super().__init__(root, transforms) root = Path(root) / "CREStereo" + self.max_disparity = max_disparity # if the API user requests a dataset download check that the user can download it if download: @@ -149,16 +223,23 @@ def __init__(self, root: str, split: str = "tree", transforms: Optional[Callable disparity_maps_left = list(p.replace("_left", "_left.disp") for p in imgs_left) disparity_maps_right = list(p.replace("_right", "_right.disp") for p in imgs_right) + if not any(os.path.exists(file_path) for file_path in disparity_maps_left): + raise FileNotFoundError("No disparity valid maps found in {}".format(root / s)) + + if not any(os.path.exists(file_path) for file_path in disparity_maps_right): + raise FileNotFoundError("No disparity valid maps found in {}".format(root / s)) + disparity_maps = list((l, r) for l, r in zip(disparity_maps_left, disparity_maps_right)) self._disparities += disparity_maps def _read_disparity(self, file_path: str) -> Tuple: disparity = np.array(Image.open(file_path), dtype=np.float32) - valid = (disparity < self.MAX_DISP) & (disparity > 0.) + valid = (disparity < self.max_disparity) & (disparity > 0.) + # unsqueeze the disparity map into (C, H, W) format + disparity = disparity[None, :, :] return disparity, valid def _download_dataset(self, root: str) -> None: - # TODO: remove before release, used only for testing purposes dirs = ["tree", "shapenet", "reflective", "hole"] # create directory subtree for the download for d in dirs: @@ -221,11 +302,11 @@ class StereoMiddlebury2014(StereoMatchingDataset): Args: root (string): Root directory of the Middleburry 2014 Dataset. - split (string, optional): The dataset split of scenes, either "train" (default), test, or "additional" - use_ambient_views (boolean, optional): Whether to use different expose or lightning views when possible. Sampled with equal probability. + split (string, optional): The dataset split of scenes, either "train" (default), "test", or "additional" + use_ambient_views (boolean, optional): Whether to use different expose or lightning views when possible. + The dataset samples with equal probability between ``[im1.png, im1E.png, im1L.png]``. calibration (string, optional): Wether or not to use the calibrated (default) or uncalibrated scenes. - transforms (callalbe, optional): A function/transform that takes in - ``left_img, right_img, left_disparity, right_disparity`` and returns a transformed version. + transforms (callable, optional): A function/transform that takes in a sample and returns a transformed version. download (boolean, optional): Wether or not to download the dataset in the ``root`` directory. 
""" @@ -268,7 +349,7 @@ def __init__( self._download_dataset(root) root = Path(root) / "Middlebury2014" - print(split) + if not os.path.exists(root / split): raise FileNotFoundError( f"The {split} directory was not found in the provided root directory" @@ -292,24 +373,23 @@ def __init__( for calibration_suffix in calibrartion_suffixes: scene_pattern = "*" + calibration_suffix - print(scene_pattern) imgs_left = sorted(glob(str(root / split / scene_pattern / "im0.png"))) imgs_right = sorted(glob(str(root / split / scene_pattern / "im1.png"))) - if not len(imgs_left) or not len(imgs_right): raise FileNotFoundError("No images found in {}".format(root)) self._images += list((l, r) for l, r in zip(imgs_left, imgs_right)) if split == "test": - dsp_maps_left, dsp_maps_right = list("" for _ in imgs_left), list("" for _ in imgs_right) + disparity_maps_left, disparity_maps_right = list("" for _ in imgs_left), list("" for _ in imgs_right) else: + disparity_maps_left = sorted(glob(str(root / split / "*" / "disp0.pfm"))) + disparity_maps_right = sorted(glob(str(root / split / "*" / "disp1.pfm"))) + if not len(disparity_maps_left) or not len(disparity_maps_right): + raise FileNotFoundError("No disparity maps found in {}".format(root / split)) - dsp_maps_left = sorted(glob(str(root / split / "*" / "disp0.pfm"))) - dsp_maps_right = sorted(glob(str(root / split / "*" / "disp1.pfm"))) - - self._disparities += list((l, r) for l, r in zip(dsp_maps_left, dsp_maps_right)) + self._disparities += list((l, r) for l, r in zip(disparity_maps_left, disparity_maps_right)) self.use_ambient_views = use_ambient_views @@ -317,6 +397,7 @@ def __getitem__(self, index: int) -> Tuple[Tensor, Tensor, Tensor, Tensor]: return super().__getitem__(index) def _read_img(self, file_path: str) -> Image.Image: + """Function that reads either the original right image or an augmented view when ``use_ambient_views`` is True.""" if os.path.basename(file_path) == "im1.png" and self.use_ambient_views: # initialize sampleable container ambient_file_paths = list(file_path.replace("im1.png", view_name) for view_name in ["im1E.png", "im1L.png"]) @@ -332,6 +413,8 @@ def _read_disparity(self, file_path: str) -> Tuple: return None, None disparity_map = read_pfm_file(file_path) valid_mask = disparity_map < 1e3 + # remove the channel dimension from the valid mask + valid_mask = valid_mask[0, :, :] return disparity_map, valid_mask def _download_dataset(self, root: str): @@ -357,10 +440,13 @@ def _download_dataset(self, root: str): download_and_extract_archive(url=test_set_url, download_root=root, remove_finished=True) for scene_dir, scene_names, _ in os.walk(str(root / "MiddEval3/testF")): for scene in scene_names: - shutil.move(os.path.join(scene_dir, scene), os.path.join(root, scene)) + scene_dst_dir = root / "test" / scene + scene_src_dir = scene_dir / scene + os.makedirs(scene_dst_dir, exist_ok=True) + shutil.move(str(scene_src_dir), str(scene_dst_dir)) # cleanup MiddEval3 directory - shutil.rmtree(os.path.join(root, "MiddEval3")) + shutil.rmtree(str(root / "MiddEval3")) class StereoETH3D(StereoMatchingDataset): @@ -411,8 +497,7 @@ class StereoETH3D(StereoMatchingDataset): root (string): Root directory of the ETH3D Dataset. split (string, optional): The dataset split of scenes, either "train" (default) or "test". calibration (string, optional): Wether or not to use the calibrated (default) or uncalibrated scenes. 
- transforms (callalbe, optional): A function/transform that takes in - ``left_img, right_img, left_disparity, right_disparity`` and returns a transformed version. + transforms (callable, optional): A function/transform that takes in a sample and returns a transformed version. """ def __init__(self, root: str, split: str = "train", transforms: Optional[Callable] = None): @@ -427,7 +512,6 @@ def __init__(self, root: str, split: str = "train", transforms: Optional[Callabl imgs_left = sorted(glob(str(root / img_dir / "*" / "*im0.png"))) imgs_right = sorted(glob(str(root / img_dir / "*" / "*im1.png"))) - if not len(imgs_left) or not len(imgs_right): raise FileNotFoundError("No images found in {}".format(root)) @@ -435,8 +519,9 @@ def __init__(self, root: str, split: str = "train", transforms: Optional[Callabl disparity_maps_left, disparity_maps_right = list("" for _ in imgs_left), list("" for _ in imgs_right) else: disparity_maps_left = sorted(glob(str(root / anot_dir / "*" / "*0GT.pfm"))) - # no masks for the right view, always using left as reference disparity_maps_right = list("" for _ in disparity_maps_left) + if not len(disparity_maps_left): + raise FileNotFoundError("No disparity maps found in {}".format(root / anot_dir)) self._images = list((l, r) for l, r in zip(imgs_left, imgs_right)) self._disparities = list((l, r) for l, r in zip(disparity_maps_left, disparity_maps_right)) @@ -447,10 +532,10 @@ def _read_disparity(self, file_path: str) -> Tuple: disparity_map = read_pfm_file(file_path) valid_mask = Image.open(file_path.replace("disp0GT.pfm", "mask0nocc.png")) - valid_mask = np.array(valid_mask) + valid_mask = np.array(valid_mask).astype(np.bool) return disparity_map, valid_mask - def __getitem__(self, index: int) -> Tuple[Tuple, Tuple, Tuple, Tuple]: + def __getitem__(self, index: int) -> Tuple: return super().__getitem__(index) @@ -474,8 +559,7 @@ class StereoKitti2012(StereoMatchingDataset): Args: root (string): Root directory where Kitti2012 is located. split (string, optional): The dataset split of scenes, either "train" (default), test, or "additional" - transforms (callalbe, optional): A function/transform that takes in - ``left_img, right_img, left_disparity, right_disparity`` and returns a transformed version. + transforms (callable, optional): A function/transform that takes in a sample and returns a transformed version. download (boolean, optional): Wether or not to download the dataset in the ``root`` directory. """ @@ -494,6 +578,9 @@ def __init__(self, root: str, split: str = "train", transforms: Optional[Callabl if split == "train": disparity_maps_left = sorted(glob(str(root / "disp_noc" / "*.png"))) disparity_maps_right = list("" for _ in disparity_maps_left) + if not len(disparity_maps_left): + raise FileNotFoundError("No disparity maps found in {}".format(root)) + else: disparity_maps_left, disparity_maps_right = list("" for _ in imgs_left), list("" for _ in imgs_right) @@ -506,7 +593,8 @@ def _read_disparity(self, file_path: str) -> Tuple: disparity_map = np.array(Image.open(file_path)) / 256.0 valid_mask = disparity_map > 0.0 - + # unsqueeze the disparity map into (C, H, W) format + disparity_map = disparity_map[None, :, :] return disparity_map, valid_mask def __getitem__(self, index: int) -> Tuple[Tuple, Tuple, Tuple]: @@ -533,8 +621,7 @@ class StereoKitti2015(StereoMatchingDataset): Args: root (string): Root directory where Kitti2015 is located. split (string, optional): The dataset split of scenes, either "train" (default) or test. 
- transforms (callalbe, optional): A function/transform that takes in - ``left_img, right_img, left_disparity, right_disparity`` and returns a transformed version. + transforms (callable, optional): A function/transform that takes in a sample and returns a transformed version. """ def __init__(self, root: str, split: str = "train", transforms: Optional[Callable] = None): @@ -552,6 +639,9 @@ def __init__(self, root: str, split: str = "train", transforms: Optional[Callabl if split == "train": disparity_maps_left = sorted(glob(str(root / "disp_occ_0" / "*.png"))) disparity_maps_right = sorted(glob(str(root / "disp_occ_1" / "*.png"))) + if not len(disparity_maps_left) or not len(disparity_maps_right): + raise FileNotFoundError("No disparity maps found in {}".format(root)) + else: disparity_maps_left, disparity_maps_right = list("" for _ in imgs_left), list("" for _ in imgs_right) @@ -564,7 +654,8 @@ def _read_disparity(self, file_path: str) -> Tuple: disparity_map = np.array(Image.open(file_path)) / 256.0 valid_mask = disparity_map < 0.0 - + # unsqueeze the disparity map into (C, H, W) format + disparity_map = disparity_map[None, :, :] return disparity_map, valid_mask def __getitem__(self, index: int) -> Tuple[Tuple, Tuple, Tuple]: @@ -574,10 +665,45 @@ def __getitem__(self, index: int) -> Tuple[Tuple, Tuple, Tuple]: class StereoSintel(StereoMatchingDataset): """"Sintel `Stereo Dataset `_. + The dataset is expected to have the following structure: :: + + root + Sintel + training + final_left + scene1 + img1.png + img2.png + ... + ... + final_right + scene2 + img1.png + img2.png + ... + ... + disparities + scene1 + img1.png + img2.png + ... + ... + occlusions + scene1 + img1.png + img2.png + ... + ... + outofframe + scene1 + img1.png + img2.png + ... + ... + Args: root (string): Root directory where Sintel Stereo is located. - transforms (callalbe, optional): A function/transform that takes in - ``left_img, right_img, left_disparity, right_disparity`` and returns a transformed version. + transforms (callable, optional): A function/transform that takes in a sample and returns a transformed version. """ def __init__(self, root: str, transforms: Optional[Callable] = None): @@ -587,11 +713,13 @@ def __init__(self, root: str, transforms: Optional[Callable] = None): imgs_left = sorted(glob(str(root / "training" / "final_left" / "*" / "*.png"))) imgs_right = sorted(glob(str(root / "training" / "final_right" / "*" / "*.png"))) - if not len(imgs_left) or not len(imgs_right): raise FileNotFoundError("No images found in {}".format(root)) dps_masks_left = sorted(glob(str(root / "training" / "disparities" / "*" / "*.png"))) + if not len(dps_masks_left): + raise FileNotFoundError("No disparity maps found in {}".format(root)) + disparity_maps_right = list("" for _ in dps_masks_left) self._images = list((l, r) for l, r in zip(imgs_left, imgs_right)) @@ -605,7 +733,8 @@ def _read_disparity(self, file_path: str) -> Tuple: disparity_map = np.array(Image.open(file_path), dtype=np.float32) r, g, b = np.split(disparity_map, 3, axis=-1) disparity_map = r * 4 + g / (2**6) + b / (2**14) - + # reshape into (C, H, W) format + disparity_map = np.transpose(disparity_map, (2, 0, 1)) # occlusion mask valid_mask = np.array(Image.open(file_path.replace("disparities", "occlusions"))) == 0 # out of frame mask @@ -662,6 +791,10 @@ class StereoSceneFlow(StereoMatchingDataset): FlyingThings3D ... ... + + Args: + root (string): Root directory where SceneFlow is located. 
+ transforms (callable, optional): A function/transform that takes in a sample and returns a transformed version. """ def __init__(self, root: str, split: str = "FlyingThings3D", pass_name: str = "clean", transforms: Optional[Callable] = None): @@ -683,7 +816,6 @@ def __init__(self, root: str, split: str = "FlyingThings3D", pass_name: str = "c for p in passes: imgs_left = sorted(glob(str(root / p / "*" / "left" / "*.png"))) imgs_right = sorted(glob(str(root / p / "*" / "right" / "*.png"))) - if not len(imgs_left) or not len(imgs_right): raise FileNotFoundError("No images found in {}".format(root / p)) @@ -693,15 +825,19 @@ def __init__(self, root: str, split: str = "FlyingThings3D", pass_name: str = "c disparity_maps_left = [file_path.replace(p, "disparity").replace(".png", ".pfm") for file_path in imgs_left] disparity_maps_right = [file_path.replace(p, "disparity").replace(".png", ".pfm") for file_path in imgs_right] + if not any(os.path.exists(file_path) for file_path in disparity_maps_left): + raise FileNotFoundError("No disparity valid maps found in {}".format(root / "disparity")) + + if not any(os.path.exists(file_path) for file_path in disparity_maps_right): + raise FileNotFoundError("No disparity valid maps found in {}".format(root / "disparity")) + disparity_maps = list((l, r) for l, r in zip(disparity_maps_left, disparity_maps_right)) self._disparities += disparity_maps def _read_disparity(self, file_path: str) -> Tuple: - if not os.path.exists(file_path): - raise FileNotFoundError("Disparity map {} not found".format(file_path)) - disparity = read_pfm_file(file_path) - valid = np.ones_like(disparity) + # keep valid mask with shape (H, W) + valid = np.ones(disparity.shape[1:]).astype(np.bool) return disparity, valid def __getitem__(self, index: int) -> Tuple[Tensor, Tensor, Tensor, Tensor]: @@ -745,13 +881,20 @@ class StereoFallingThings(StereoMatchingDataset): ... scene2 ... + + Args: + root (string): Root directory where FallingThings is located. + split (string): Either "single", "mixed", or "both". + transforms (callable, optional): A function/transform that takes in a sample and returns a transformed version. 
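FallingThings ships depth maps rather than disparities, so the loader has to invert the usual depth-from-disparity relation using the focal length stored in each scene's ``_camera_settings.json``. A standalone sketch of that conversion, reusing the fixed ``fx * 6.0 * 100`` scaling that the loader in this patch adopts from the dataset readme:

    import json

    import numpy as np
    from PIL import Image

    def read_fallingthings_disparity(depth_path, settings_path):
        # single-channel uint depth PNG, promoted to float for the division
        depth = np.array(Image.open(depth_path)).astype(np.float32)
        with open(settings_path, "r") as f:
            fx = json.load(f)["camera_settings"][0]["intrinsic_settings"]["fx"]
        # invert depth = fx * baseline / disparity; 6.0 * 100 is the baseline
        # scaling used by the loader in this patch
        disparity = (fx * 6.0 * 100.0) / depth
        valid = disparity > 0
        return disparity[None, :, :], valid  # (1, H, W) map, (H, W) mask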
+ """ def __init__(self, root: str, split: str = "single", transforms: Optional[Callable] = None): super().__init__(root, transforms) + root = Path(root) / "FallingThings" + verify_str_arg(split, "split", valid_values=("single", "mixed", "both")) - split = split.upper() splits = { "single": ["single"], @@ -760,28 +903,35 @@ def __init__(self, root: str, split: str = "single", transforms: Optional[Callab }[split] for s in splits: - imgs_left = sorted(glob(str(root / s / "*.left.jpg"))) - imgs_right = sorted(glob(str(root / s / "*.right.jpg"))) - + imgs_left = sorted(glob(str(root / s / "*" / "*.left.jpg"))) + imgs_right = sorted(glob(str(root / s / "*" / "*.right.jpg"))) if not len(imgs_left) or not len(imgs_right): raise FileNotFoundError("No images found in {}".format(root)) imgs = list((l, r) for l, r in zip(imgs_left, imgs_right)) self._images += imgs - disparity_maps_left = sorted(glob(str(root / s / "*.left.depth.png"))) - disparity_maps_right = sorted(glob(str(root / s / "*.right.depth.png"))) + disparity_maps_left = sorted(glob(str(root / s / "*" / "*.left.depth.png"))) + disparity_maps_right = sorted(glob(str(root / s / "*" / "*.right.depth.png"))) + if not len(disparity_maps_left) or not len(disparity_maps_right): + raise FileNotFoundError("No disparity maps found in {}".format(root)) disparity_maps = list((l, r) for l, r in zip(disparity_maps_left, disparity_maps_right)) self._disparities += disparity_maps def _read_disparity(self, file_path: str) -> Tuple: - depth = Image.Open(file_path) - with open(os.path.split(file_path)[0] + '_camera_settings.json', 'r') as f: + # (H, W) image + depth = np.array(Image.open(file_path)) + # as per https://research.nvidia.com/sites/default/files/pubs/2018-06_Falling-Things/readme_0.txt + # in order to extract disparity from depth maps + with open(os.path.split(file_path)[0] + '/_camera_settings.json', 'r') as f: intrinsics = json.load(f) fx = intrinsics['camera_settings'][0]['intrinsic_settings']['fx'] + # inverse of depth-from-disparity equation disparity = (fx * 6.0 * 100) / depth.astype(np.float32) valid = disparity > 0 + # unsqueeze disparity to (C, H, W) + disparity = disparity[None, :, :] return disparity, valid def __getitem__(self, index: int) -> Tuple[Tensor, Tensor, Tensor, Tensor]: @@ -789,7 +939,7 @@ def __getitem__(self, index: int) -> Tuple[Tensor, Tensor, Tensor, Tensor]: class InStereo2k(StereoMatchingDataset): - """InStereo2k ``_ dataset + """InStereo2k ``_ dataset The dataset is expected to have the following structre: :: @@ -813,6 +963,11 @@ class InStereo2k(StereoMatchingDataset): ... scene2 ... + + Args: + root (string): Root directory where InStereo2k is located. + split (string): Either "train" or "test". + transforms (callable, optional): A function/transform that takes in a sample and returns a transformed version. 
""" def __init__(self, root: str, split: str = "train", transforms: Optional[Callable] = None): @@ -820,9 +975,10 @@ def __init__(self, root: str, split: str = "train", transforms: Optional[Callabl root = Path(root) / "InStereo2k" / split + verify_str_arg(split, "split", valid_values=("train", "test")) + imgs_left = sorted(glob(str(root / "*" / "left.png"))) imgs_right = list(p.replace("left", "right") for p in imgs_left) - if not len(imgs_left) or not len(imgs_right): raise FileNotFoundError("No images found in {}".format(root)) @@ -832,10 +988,18 @@ def __init__(self, root: str, split: str = "train", transforms: Optional[Callabl disparity_maps_left = list(p.replace("left", "left_disp") for p in imgs_left) disparity_maps_right = list(p.replace("right", "right_disp") for p in imgs_left) + if not any(os.path.exists(file_path) for file_path in disparity_maps_left): + raise FileNotFoundError("No disparity valid maps found in {}".format(root)) + + if not any(os.path.exists(file_path) for file_path in disparity_maps_right): + raise FileNotFoundError("No disparity valid maps found in {}".format(root)) + disparity_maps = list((l, r) for l, r in zip(disparity_maps_left, disparity_maps_right)) self._disparities = disparity_maps def _read_disparity(self, file_path: str) -> Tuple: disparity = np.array(Image.open(file_path), dtype=np.float32) - valid = np.ones_like(disparity) + valid = np.ones_like(disparity).astype(np.bool) + # unsqueeze disparity to (C, H, W) + disparity = disparity[None, :, :] return disparity, valid From bbb1c562c4435b4324e03f73bad5ed985b149e2a Mon Sep 17 00:00:00 2001 From: Ponku Date: Thu, 14 Jul 2022 10:12:06 +0100 Subject: [PATCH 09/35] Ran ufmt. (#6259) --- torchvision/datasets/__init__.py | 12 +- torchvision/datasets/_stereo_matching.py | 195 +++++++++++++++-------- 2 files changed, 138 insertions(+), 69 deletions(-) diff --git a/torchvision/datasets/__init__.py b/torchvision/datasets/__init__.py index 8b38ba73a85..973d5ca9f7e 100644 --- a/torchvision/datasets/__init__.py +++ b/torchvision/datasets/__init__.py @@ -1,5 +1,15 @@ from ._optical_flow import KittiFlow, Sintel, FlyingChairs, FlyingThings3D, HD1K -from ._stereo_matching import StereoETH3D, StereoFallingThings, StereoKitti2012, StereoKitti2015, StereoMiddlebury2014, StereoSceneFlow, StereoSintel, CREStereo, InStereo2k +from ._stereo_matching import ( + StereoETH3D, + StereoFallingThings, + StereoKitti2012, + StereoKitti2015, + StereoMiddlebury2014, + StereoSceneFlow, + StereoSintel, + CREStereo, + InStereo2k, +) from .caltech import Caltech101, Caltech256 from .celeba import CelebA from .cifar import CIFAR10, CIFAR100 diff --git a/torchvision/datasets/_stereo_matching.py b/torchvision/datasets/_stereo_matching.py index 4de0b5b0532..3edb0f639a5 100644 --- a/torchvision/datasets/_stereo_matching.py +++ b/torchvision/datasets/_stereo_matching.py @@ -1,19 +1,21 @@ -from abc import ABC, abstractmethod -from glob import glob -from pathlib import Path +import json +import os import random import re import shutil -from typing import Callable, List, Optional, Tuple import warnings +from abc import ABC, abstractmethod +from glob import glob +from pathlib import Path +from typing import Callable, List, Optional, Tuple + +import numpy as np from jsonschema import ValidationError +from PIL import Image from torch import Tensor -from .vision import VisionDataset + from .utils import download_and_extract_archive, verify_str_arg -import os -import numpy as np -from PIL import Image -import json +from .vision import VisionDataset 
__all__ = ( "CREStereo" @@ -35,7 +37,7 @@ def read_pfm_file(file_path: str) -> np.array: if not header in [b"PF", b"Pf"]: raise ValidationError(f"Not a valid PFM file: {file_path}") - dim_match = re.match(rb'^(\d+)\s(\d+)\s$', file.readline()) + dim_match = re.match(rb"^(\d+)\s(\d+)\s$", file.readline()) if not dim_match: raise ValidationError(f"Malformed PFM header: {file_path}") @@ -45,11 +47,11 @@ def read_pfm_file(file_path: str) -> np.array: # check for endian type if scale < 0: scale = -scale - endian = '<' + endian = "<" else: - endian = '>' + endian = ">" - data = np.fromfile(file, endian + 'f') + data = np.fromfile(file, endian + "f") data = np.reshape(data, (height, width, channels)) data = np.flipud(data) @@ -126,7 +128,11 @@ def __getitem__(self, index: int) -> Tuple[Tensor, Tensor, Tensor, Tensor]: valid_masks = (valid_mask_left, valid_mask_right) if self.transforms is not None: - imgs, dsp_maps, valid_masks, = self.transforms(imgs, dsp_maps, valid_masks) + ( + imgs, + dsp_maps, + valid_masks, + ) = self.transforms(imgs, dsp_maps, valid_masks) return imgs[0], imgs[1], dsp_maps[0], valid_masks[0] @@ -135,7 +141,7 @@ def __len__(self) -> int: class CREStereo(StereoMatchingDataset): - """Synthetic dataset used in training the `CREStereo `_ architecture. + """Synthetic dataset used in training the `CREStereo `_ architecture. Dataset details on the official paper `repo `_. @@ -179,10 +185,18 @@ class CREStereo(StereoMatchingDataset): transforms (callable, optional): A function/transform that takes in a sample and returns a transformed version. download (bool, optional): If true, downloads the dataset from the internet and puts it in the root directory. max_disparity (int, optional): Maximum disparity value. Used to compute the valid mask. - """ + """ + DOWNLOAD_SPACE = 400 * 1024 * 1024 * 1024 - def __init__(self, root: str, split: str = "tree", transforms: Optional[Callable] = None, download: bool = False, max_disparity: float = 256.): + def __init__( + self, + root: str, + split: str = "tree", + transforms: Optional[Callable] = None, + download: bool = False, + max_disparity: float = 256.0, + ): super().__init__(root, transforms) root = Path(root) / "CREStereo" @@ -234,7 +248,7 @@ def __init__(self, root: str, split: str = "tree", transforms: Optional[Callable def _read_disparity(self, file_path: str) -> Tuple: disparity = np.array(Image.open(file_path), dtype=np.float32) - valid = (disparity < self.max_disparity) & (disparity > 0.) + valid = (disparity < self.max_disparity) & (disparity > 0.0) # unsqueeze the disparity map into (C, H, W) format disparity = disparity[None, :, :] return disparity, valid @@ -261,33 +275,33 @@ class StereoMiddlebury2014(StereoMatchingDataset): Middlebury2014 train scene1-{ ,perfect,imperfect} - calib.txt - im{0,1}.png - im1E.png - im1L.png - disp{0,1}.pfm - disp{0,1}-n.png - disp{0,1}-sd.pfm + calib.txt + im{0,1}.png + im1E.png + im1L.png + disp{0,1}.pfm + disp{0,1}-n.png + disp{0,1}-sd.pfm disp{0,1}y.pfm scene2-{ ,perfect,imperfect} - calib.txt - im{0,1}.png - im1E.png - im1L.png - disp{0,1}.pfm - disp{0,1}-n.png - disp{0,1}-sd.pfm + calib.txt + im{0,1}.png + im1E.png + im1L.png + disp{0,1}.pfm + disp{0,1}-n.png + disp{0,1}-sd.pfm disp{0,1}y.pfm ... additional scene1-{ ,perfect,imperfect} - calib.txt - im{0,1}.png - im1E.png - im1L.png - disp{0,1}.pfm - disp{0,1}-n.png - disp{0,1}-sd.pfm + calib.txt + im{0,1}.png + im1E.png + im1L.png + disp{0,1}.pfm + disp{0,1}-n.png + disp{0,1}-sd.pfm disp{0,1}y.pfm ... 
test @@ -305,15 +319,56 @@ class StereoMiddlebury2014(StereoMatchingDataset): split (string, optional): The dataset split of scenes, either "train" (default), "test", or "additional" use_ambient_views (boolean, optional): Whether to use different expose or lightning views when possible. The dataset samples with equal probability between ``[im1.png, im1E.png, im1L.png]``. - calibration (string, optional): Wether or not to use the calibrated (default) or uncalibrated scenes. + calibration (string, optional): Wether or not to use the calibrated (default) or uncalibrated scenes. transforms (callable, optional): A function/transform that takes in a sample and returns a transformed version. - download (boolean, optional): Wether or not to download the dataset in the ``root`` directory. + download (boolean, optional): Wether or not to download the dataset in the ``root`` directory. """ splits = { - "train": ["Adirondack", "Jadeplant", "Motorcycle", "Piano", "Pipes", "Playroom", "Playtable", "Recycle", "Shelves", "Vintage"], - "additional": ["Backpack", "Bicycle1", "Cable", "Classroom1", "Couch", "Flowers", "Mask", "Shopvac", "Sticks", "Storage", "Sword1", "Sword2", "Umbrella"], - "test": ["Plants", "Classroom2E", "Classroom2", "Australia", "DjembeL", "CrusadeP", "Crusade", "Hoops", "Bicycle2", "Staircase", "Newkuba", "AustraliaP", "Djembe", "Livingroom", "Computer"] + "train": [ + "Adirondack", + "Jadeplant", + "Motorcycle", + "Piano", + "Pipes", + "Playroom", + "Playtable", + "Recycle", + "Shelves", + "Vintage", + ], + "additional": [ + "Backpack", + "Bicycle1", + "Cable", + "Classroom1", + "Couch", + "Flowers", + "Mask", + "Shopvac", + "Sticks", + "Storage", + "Sword1", + "Sword2", + "Umbrella", + ], + "test": [ + "Plants", + "Classroom2E", + "Classroom2", + "Australia", + "DjembeL", + "CrusadeP", + "Crusade", + "Hoops", + "Bicycle2", + "Staircase", + "Newkuba", + "AustraliaP", + "Djembe", + "Livingroom", + "Computer", + ], } def __init__( @@ -323,7 +378,7 @@ def __init__( calibration: Optional[str] = "perfect", use_ambient_views: bool = False, transforms: Optional[Callable] = None, - download: bool = False + download: bool = False, ): super().__init__(root, transforms) verify_str_arg(split, "split", valid_values=("train", "test", "additional")) @@ -333,8 +388,7 @@ def __init__( if split == "test": calibration = None warnings.warn( - "\nSplit 'test' has only no calibration settings, ignoring calibration argument.", - RuntimeWarning + "\nSplit 'test' has only no calibration settings, ignoring calibration argument.", RuntimeWarning ) else: if split != "test": @@ -342,7 +396,7 @@ def __init__( warnings.warn( f"\nSplit '{split}' has calibration settings, however None was provided as an argument." f"\nSetting calibration to 'perfect' for split '{split}'. 
Available calibration settings are: 'perfect', 'imperfect', 'both'.", - RuntimeWarning + RuntimeWarning, ) if download: @@ -351,15 +405,14 @@ def __init__( root = Path(root) / "Middlebury2014" if not os.path.exists(root / split): - raise FileNotFoundError( - f"The {split} directory was not found in the provided root directory" - ) + raise FileNotFoundError(f"The {split} directory was not found in the provided root directory") split_scenes = self.splits[split] # check that the provided root folder contains the scene splits if not any( # using startswith to account for perfect / imperfect calibrartion - scene.startswith(s) for scene in os.listdir(root / split) + scene.startswith(s) + for scene in os.listdir(root / split) for s in split_scenes ): raise FileNotFoundError(f"Provided root folder does not contain any scenes from the {split} split.") @@ -429,7 +482,9 @@ def _download_dataset(self, root: str): scene_name = f"{scene}-{calibration}" for calibration in ["perfect", "imperfect"]: scene_url = f"{base_url}/{scene_name}.zip" - download_and_extract_archive(url=scene_url, filename=scene_name, download_root=str(split_root), remove_finished=True) + download_and_extract_archive( + url=scene_url, filename=scene_name, download_root=str(split_root), remove_finished=True + ) if any(s not in os.listdir(root) for s in self.splits["test"]): # test split is downloaded from a different location @@ -450,7 +505,7 @@ def _download_dataset(self, root: str): class StereoETH3D(StereoMatchingDataset): - """"ETH3D `Low-Res Two-View `_ dataset. + """ "ETH3D `Low-Res Two-View `_ dataset. The dataset is expected to have the following structure: :: @@ -458,13 +513,13 @@ class StereoETH3D(StereoMatchingDataset): ETH3D two_view_training scene1 - im1.png + im1.png im0.png images.txt cameras.txt calib.txt scene2 - im1.png + im1.png im0.png images.txt cameras.txt @@ -480,13 +535,13 @@ class StereoETH3D(StereoMatchingDataset): ... two_view_testing scene1 - im1.png + im1.png im0.png images.txt cameras.txt calib.txt scene2 - im1.png + im1.png im0.png images.txt cameras.txt @@ -496,7 +551,7 @@ class StereoETH3D(StereoMatchingDataset): Args: root (string): Root directory of the ETH3D Dataset. split (string, optional): The dataset split of scenes, either "train" (default) or "test". - calibration (string, optional): Wether or not to use the calibrated (default) or uncalibrated scenes. + calibration (string, optional): Wether or not to use the calibrated (default) or uncalibrated scenes. transforms (callable, optional): A function/transform that takes in a sample and returns a transformed version. """ @@ -540,7 +595,7 @@ def __getitem__(self, index: int) -> Tuple: class StereoKitti2012(StereoMatchingDataset): - """"Kitti dataset from the `2012 `_ stereo evaluation benchmark. + """ "Kitti dataset from the `2012 `_ stereo evaluation benchmark. Uses the RGB images for consistency with Kitti 2015. The dataset is expected to have the following structure: :: @@ -560,7 +615,7 @@ class StereoKitti2012(StereoMatchingDataset): root (string): Root directory where Kitti2012 is located. split (string, optional): The dataset split of scenes, either "train" (default), test, or "additional" transforms (callable, optional): A function/transform that takes in a sample and returns a transformed version. - download (boolean, optional): Wether or not to download the dataset in the ``root`` directory. + download (boolean, optional): Wether or not to download the dataset in the ``root`` directory. 
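Both KITTI loaders decode disparity from uint16 PNGs in which the stored value is the true disparity multiplied by 256 and zero marks pixels without ground truth. Under those assumptions, the decoding reduces to the following standalone sketch:

    import numpy as np
    from PIL import Image

    def read_kitti_disparity(file_path):
        # uint16 PNG, stored value = disparity * 256; zero means no ground truth
        disparity = np.array(Image.open(file_path), dtype=np.float32) / 256.0
        valid = disparity > 0.0
        return disparity[None, :, :], valid  # (1, H, W) map, (H, W) mask

Note that the Kitti2015 hunk earlier computes ``valid_mask = disparity_map < 0.0``; under this encoding the comparison presumably needs to be ``> 0.0``, as in the 2012 loader.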
""" def __init__(self, root: str, split: str = "train", transforms: Optional[Callable] = None): @@ -602,7 +657,7 @@ def __getitem__(self, index: int) -> Tuple[Tuple, Tuple, Tuple]: class StereoKitti2015(StereoMatchingDataset): - """"Kitti dataset from the `2015 `_ stereo evaluation benchmark. + """ "Kitti dataset from the `2015 `_ stereo evaluation benchmark. The dataset is expected to have the following structure: :: @@ -663,7 +718,7 @@ def __getitem__(self, index: int) -> Tuple[Tuple, Tuple, Tuple]: class StereoSintel(StereoMatchingDataset): - """"Sintel `Stereo Dataset `_. + """ "Sintel `Stereo Dataset `_. The dataset is expected to have the following structure: :: @@ -732,7 +787,7 @@ def _read_disparity(self, file_path: str) -> Tuple: # disparity decoding as per Sintel instructions disparity_map = np.array(Image.open(file_path), dtype=np.float32) r, g, b = np.split(disparity_map, 3, axis=-1) - disparity_map = r * 4 + g / (2**6) + b / (2**14) + disparity_map = r * 4 + g / (2 ** 6) + b / (2 ** 14) # reshape into (C, H, W) format disparity_map = np.transpose(disparity_map, (2, 0, 1)) # occlusion mask @@ -797,7 +852,9 @@ class StereoSceneFlow(StereoMatchingDataset): transforms (callable, optional): A function/transform that takes in a sample and returns a transformed version. """ - def __init__(self, root: str, split: str = "FlyingThings3D", pass_name: str = "clean", transforms: Optional[Callable] = None): + def __init__( + self, root: str, split: str = "FlyingThings3D", pass_name: str = "clean", transforms: Optional[Callable] = None + ): super().__init__(root, transforms) root = Path(root) / "SceneFlow" @@ -823,7 +880,9 @@ def __init__(self, root: str, split: str = "FlyingThings3D", pass_name: str = "c self._images += imgs disparity_maps_left = [file_path.replace(p, "disparity").replace(".png", ".pfm") for file_path in imgs_left] - disparity_maps_right = [file_path.replace(p, "disparity").replace(".png", ".pfm") for file_path in imgs_right] + disparity_maps_right = [ + file_path.replace(p, "disparity").replace(".png", ".pfm") for file_path in imgs_right + ] if not any(os.path.exists(file_path) for file_path in disparity_maps_left): raise FileNotFoundError("No disparity valid maps found in {}".format(root / "disparity")) @@ -924,9 +983,9 @@ def _read_disparity(self, file_path: str) -> Tuple: depth = np.array(Image.open(file_path)) # as per https://research.nvidia.com/sites/default/files/pubs/2018-06_Falling-Things/readme_0.txt # in order to extract disparity from depth maps - with open(os.path.split(file_path)[0] + '/_camera_settings.json', 'r') as f: + with open(os.path.split(file_path)[0] + "/_camera_settings.json", "r") as f: intrinsics = json.load(f) - fx = intrinsics['camera_settings'][0]['intrinsic_settings']['fx'] + fx = intrinsics["camera_settings"][0]["intrinsic_settings"]["fx"] # inverse of depth-from-disparity equation disparity = (fx * 6.0 * 100) / depth.astype(np.float32) valid = disparity > 0 From 669611eab0681edf1ffef5796f7755150575b4a3 Mon Sep 17 00:00:00 2001 From: Ponku Date: Thu, 14 Jul 2022 10:48:42 +0100 Subject: [PATCH 10/35] Adressed CI/CD errors --- torchvision/datasets/_stereo_matching.py | 41 ++++++++++++------------ 1 file changed, 20 insertions(+), 21 deletions(-) diff --git a/torchvision/datasets/_stereo_matching.py b/torchvision/datasets/_stereo_matching.py index 3edb0f639a5..254d9d2624a 100644 --- a/torchvision/datasets/_stereo_matching.py +++ b/torchvision/datasets/_stereo_matching.py @@ -10,7 +10,6 @@ from typing import Callable, List, Optional, Tuple 
import numpy as np -from jsonschema import ValidationError from PIL import Image from torch import Tensor @@ -35,11 +34,11 @@ def read_pfm_file(file_path: str) -> np.array: with open(file_path, "rb") as file: header = file.readline().rstrip() if not header in [b"PF", b"Pf"]: - raise ValidationError(f"Not a valid PFM file: {file_path}") + raise ValueError(f"Not a valid PFM file: {file_path}") dim_match = re.match(rb"^(\d+)\s(\d+)\s$", file.readline()) if not dim_match: - raise ValidationError(f"Malformed PFM header: {file_path}") + raise ValueError(f"Malformed PFM header: {file_path}") width, height = map(int, dim_match.groups()) channels = 3 if header == b"PF" else 1 @@ -231,7 +230,7 @@ def __init__( if not len(imgs_left) or not len(imgs_right): raise FileNotFoundError("No images found in {}".format(root)) - imgs = list((l, r) for l, r in zip(imgs_left, imgs_right)) + imgs = list((left, right) for left, right in zip(imgs_left, imgs_right)) self._images += imgs disparity_maps_left = list(p.replace("_left", "_left.disp") for p in imgs_left) @@ -243,7 +242,7 @@ def __init__( if not any(os.path.exists(file_path) for file_path in disparity_maps_right): raise FileNotFoundError("No disparity valid maps found in {}".format(root / s)) - disparity_maps = list((l, r) for l, r in zip(disparity_maps_left, disparity_maps_right)) + disparity_maps = list((left, right) for left, right in zip(disparity_maps_left, disparity_maps_right)) self._disparities += disparity_maps def _read_disparity(self, file_path: str) -> Tuple: @@ -432,7 +431,7 @@ def __init__( if not len(imgs_left) or not len(imgs_right): raise FileNotFoundError("No images found in {}".format(root)) - self._images += list((l, r) for l, r in zip(imgs_left, imgs_right)) + self._images += list((left, right) for left, right in zip(imgs_left, imgs_right)) if split == "test": disparity_maps_left, disparity_maps_right = list("" for _ in imgs_left), list("" for _ in imgs_right) @@ -442,7 +441,7 @@ def __init__( if not len(disparity_maps_left) or not len(disparity_maps_right): raise FileNotFoundError("No disparity maps found in {}".format(root / split)) - self._disparities += list((l, r) for l, r in zip(disparity_maps_left, disparity_maps_right)) + self._disparities += list((left, right) for left, right in zip(disparity_maps_left, disparity_maps_right)) self.use_ambient_views = use_ambient_views @@ -578,8 +577,8 @@ def __init__(self, root: str, split: str = "train", transforms: Optional[Callabl if not len(disparity_maps_left): raise FileNotFoundError("No disparity maps found in {}".format(root / anot_dir)) - self._images = list((l, r) for l, r in zip(imgs_left, imgs_right)) - self._disparities = list((l, r) for l, r in zip(disparity_maps_left, disparity_maps_right)) + self._images = list((left, right) for left, right in zip(imgs_left, imgs_right)) + self._disparities = list((left, right) for left, right in zip(disparity_maps_left, disparity_maps_right)) def _read_disparity(self, file_path: str) -> Tuple: if not os.path.exists(file_path): @@ -639,8 +638,8 @@ def __init__(self, root: str, split: str = "train", transforms: Optional[Callabl else: disparity_maps_left, disparity_maps_right = list("" for _ in imgs_left), list("" for _ in imgs_right) - self._images = list((l, r) for l, r in zip(imgs_left, imgs_right)) - self._disparities = list((l, r) for l, r in zip(disparity_maps_left, disparity_maps_right)) + self._images = list((left, right) for left, right in zip(imgs_left, imgs_right)) + self._disparities = list((left, right) for left, right in 
zip(disparity_maps_left, disparity_maps_right)) def _read_disparity(self, file_path: str) -> Tuple: if not os.path.exists(file_path): @@ -700,8 +699,8 @@ def __init__(self, root: str, split: str = "train", transforms: Optional[Callabl else: disparity_maps_left, disparity_maps_right = list("" for _ in imgs_left), list("" for _ in imgs_right) - self._images = list((l, r) for l, r in zip(imgs_left, imgs_right)) - self._disparities = list((l, r) for l, r in zip(disparity_maps_left, disparity_maps_right)) + self._images = list((left, right) for left, right in zip(imgs_left, imgs_right)) + self._disparities = list((left, right) for left, right in zip(disparity_maps_left, disparity_maps_right)) def _read_disparity(self, file_path: str) -> Tuple: if not os.path.exists(file_path): @@ -777,8 +776,8 @@ def __init__(self, root: str, transforms: Optional[Callable] = None): disparity_maps_right = list("" for _ in dps_masks_left) - self._images = list((l, r) for l, r in zip(imgs_left, imgs_right)) - self._disparities = list((l, r) for l, r in zip(dps_masks_left, disparity_maps_right)) + self._images = list((left, right) for left, right in zip(imgs_left, imgs_right)) + self._disparities = list((left, right) for left, right in zip(dps_masks_left, disparity_maps_right)) def _read_disparity(self, file_path: str) -> Tuple: if not os.path.exists(file_path): @@ -876,7 +875,7 @@ def __init__( if not len(imgs_left) or not len(imgs_right): raise FileNotFoundError("No images found in {}".format(root / p)) - imgs = list((l, r) for l, r in zip(imgs_left, imgs_right)) + imgs = list((left, right) for left, right in zip(imgs_left, imgs_right)) self._images += imgs disparity_maps_left = [file_path.replace(p, "disparity").replace(".png", ".pfm") for file_path in imgs_left] @@ -890,7 +889,7 @@ def __init__( if not any(os.path.exists(file_path) for file_path in disparity_maps_right): raise FileNotFoundError("No disparity valid maps found in {}".format(root / "disparity")) - disparity_maps = list((l, r) for l, r in zip(disparity_maps_left, disparity_maps_right)) + disparity_maps = list((left, right) for left, right in zip(disparity_maps_left, disparity_maps_right)) self._disparities += disparity_maps def _read_disparity(self, file_path: str) -> Tuple: @@ -967,7 +966,7 @@ def __init__(self, root: str, split: str = "single", transforms: Optional[Callab if not len(imgs_left) or not len(imgs_right): raise FileNotFoundError("No images found in {}".format(root)) - imgs = list((l, r) for l, r in zip(imgs_left, imgs_right)) + imgs = list((left, right) for left, right in zip(imgs_left, imgs_right)) self._images += imgs disparity_maps_left = sorted(glob(str(root / s / "*" / "*.left.depth.png"))) @@ -975,7 +974,7 @@ def __init__(self, root: str, split: str = "single", transforms: Optional[Callab if not len(disparity_maps_left) or not len(disparity_maps_right): raise FileNotFoundError("No disparity maps found in {}".format(root)) - disparity_maps = list((l, r) for l, r in zip(disparity_maps_left, disparity_maps_right)) + disparity_maps = list((left, right) for left, right in zip(disparity_maps_left, disparity_maps_right)) self._disparities += disparity_maps def _read_disparity(self, file_path: str) -> Tuple: @@ -1041,7 +1040,7 @@ def __init__(self, root: str, split: str = "train", transforms: Optional[Callabl if not len(imgs_left) or not len(imgs_right): raise FileNotFoundError("No images found in {}".format(root)) - imgs = list((l, r) for l, r in zip(imgs_left, imgs_right)) + imgs = list((left, right) for left, right in zip(imgs_left, 
imgs_right)) self._images = imgs disparity_maps_left = list(p.replace("left", "left_disp") for p in imgs_left) @@ -1053,7 +1052,7 @@ def __init__(self, root: str, split: str = "train", transforms: Optional[Callabl if not any(os.path.exists(file_path) for file_path in disparity_maps_right): raise FileNotFoundError("No disparity valid maps found in {}".format(root)) - disparity_maps = list((l, r) for l, r in zip(disparity_maps_left, disparity_maps_right)) + disparity_maps = list((left, right) for left, right in zip(disparity_maps_left, disparity_maps_right)) self._disparities = disparity_maps def _read_disparity(self, file_path: str) -> Tuple: From d9d17a8ff5796ab2c79ce035525533d8d54dc7ed Mon Sep 17 00:00:00 2001 From: Ponku Date: Thu, 14 Jul 2022 11:21:36 +0100 Subject: [PATCH 11/35] Ran formatting pre-commit hook --- test/datasets_utils.py | 16 ++--- test/test_datasets.py | 76 +++++++++++++----------- torchvision/datasets/_stereo_matching.py | 2 +- 3 files changed, 51 insertions(+), 43 deletions(-) diff --git a/test/datasets_utils.py b/test/datasets_utils.py index f051e325968..9afd8f741fd 100644 --- a/test/datasets_utils.py +++ b/test/datasets_utils.py @@ -561,9 +561,11 @@ def test_feature_types(self, config): @test_all_configs def test_num_examples(self, config): with self.create_dataset(config) as (dataset, info): - assert len(dataset) == info["num_examples"], f"The number of examples {len(dataset)} does not match the expected {info['num_examples']}" + assert ( + len(dataset) == info["num_examples"] + ), f"The number of examples {len(dataset)} does not match the expected {info['num_examples']}" - @ test_all_configs + @test_all_configs def test_transforms(self, config): mock = unittest.mock.Mock(wraps=lambda *args: args[0] if len(args) == 1 else args) for kwarg in self._TRANSFORM_KWARGS: @@ -587,7 +589,7 @@ class ImageDatasetTestCase(DatasetTestCase): FEATURE_TYPES = (PIL.Image.Image, int) - @ contextlib.contextmanager + @contextlib.contextmanager def create_dataset( self, config: Optional[Dict[str, Any]] = None, @@ -610,7 +612,7 @@ def create_dataset( with self._force_load_images(): yield dataset, info - @ contextlib.contextmanager + @contextlib.contextmanager def _force_load_images(self): open = PIL.Image.open @@ -649,7 +651,7 @@ def _set_default_frames_per_clip(self, inject_fake_data): args_without_default = argspec.args[1 : (-len(argspec.defaults) if argspec.defaults else None)] frames_per_clip_last = args_without_default[-1] == "frames_per_clip" - @ functools.wraps(inject_fake_data) + @functools.wraps(inject_fake_data) def wrapper(tmpdir, config): args = inject_fake_data(tmpdir, config) if frames_per_clip_last and len(args) == len(args_without_default) - 1: @@ -748,7 +750,7 @@ def size(idx: int) -> Tuple[int, int, int]: ] -@ requires_lazy_imports("av") +@requires_lazy_imports("av") def create_video_file( root: Union[pathlib.Path, str], name: Union[pathlib.Path, str], @@ -790,7 +792,7 @@ def create_video_file( return file -@ requires_lazy_imports("av") +@requires_lazy_imports("av") def create_video_folder( root: Union[str, pathlib.Path], name: Union[str, pathlib.Path], diff --git a/test/test_datasets.py b/test/test_datasets.py index dd3c89b9bdc..5db3be40b4f 100644 --- a/test/test_datasets.py +++ b/test/test_datasets.py @@ -10,10 +10,10 @@ import random import shutil import string -from typing import List, Callable, Tuple import unittest import xml.etree.ElementTree as ET import zipfile +from typing import List, Callable, Tuple import datasets_utils import numpy as np @@ -28,26 +28,26 
@@ class STL10TestCase(datasets_utils.ImageDatasetTestCase): DATASET_CLASS = datasets.STL10 ADDITIONAL_CONFIGS = datasets_utils.combinations_grid(split=("train", "test", "unlabeled", "train+unlabeled")) - @ staticmethod + @staticmethod def _make_binary_file(num_elements, root, name): file_name = os.path.join(root, name) np.zeros(num_elements, dtype=np.uint8).tofile(file_name) - @ staticmethod + @staticmethod def _make_image_file(num_images, root, name, num_channels=3, height=96, width=96): STL10TestCase._make_binary_file(num_images * num_channels * height * width, root, name) - @ staticmethod + @staticmethod def _make_label_file(num_images, root, name): STL10TestCase._make_binary_file(num_images, root, name) - @ staticmethod + @staticmethod def _make_class_names_file(root, name="class_names.txt"): with open(os.path.join(root, name), "w") as fh: for cname in ("airplane", "bird"): fh.write(f"{cname}\n") - @ staticmethod + @staticmethod def _make_fold_indices_file(root): num_folds = 10 offset = 0 @@ -59,7 +59,7 @@ def _make_fold_indices_file(root): return tuple(range(1, num_folds + 1)) - @ staticmethod + @staticmethod def _make_train_files(root, num_unlabeled_images=1): num_images_in_fold = STL10TestCase._make_fold_indices_file(root) num_train_images = sum(num_images_in_fold) @@ -70,7 +70,7 @@ def _make_train_files(root, num_unlabeled_images=1): return dict(train=num_train_images, unlabeled=num_unlabeled_images) - @ staticmethod + @staticmethod def _make_test_files(root, num_images=2): STL10TestCase._make_image_file(num_images, root, "test_X.bin") STL10TestCase._make_label_file(num_images, root, "test_y.bin") @@ -888,7 +888,7 @@ def inject_fake_data(self, tmpdir, config): return num_images - @ contextlib.contextmanager + @contextlib.contextmanager def create_dataset(self, *args, **kwargs): with super().create_dataset(*args, **kwargs) as output: yield output @@ -1294,7 +1294,7 @@ def _create_archive(self, root, name, *files): return archive - @ datasets_utils.test_all_configs + @datasets_utils.test_all_configs def test_feature_types(self, config): feature_types = self.FEATURE_TYPES self.FEATURE_TYPES = self._TRAIN_FEATURE_TYPES if config["train"] else self._TEST_FEATURE_TYPES @@ -1572,7 +1572,7 @@ def _file_name_fn(self, cls, ext, idx): def _is_valid_file_to_extensions(self, is_valid_file): return {ext for ext in self._EXTENSIONS if is_valid_file(f"foo.{ext}")} - @ datasets_utils.test_all_configs + @datasets_utils.test_all_configs def test_is_valid_file(self, config): extensions = config.pop("extensions") # We need to explicitly pass extensions=None here or otherwise it would be filled by the value from the @@ -1582,7 +1582,7 @@ def test_is_valid_file(self, config): ) as (dataset, info): assert len(dataset) == info["num_examples"] - @ datasets_utils.test_all_configs + @datasets_utils.test_all_configs def test_classes(self, config): with self.create_dataset(config) as (dataset, info): assert len(dataset.classes) == len(info["classes"]) @@ -1603,7 +1603,7 @@ def inject_fake_data(self, tmpdir, config): return dict(num_examples=num_examples_total, classes=classes) - @ datasets_utils.test_all_configs + @datasets_utils.test_all_configs def test_classes(self, config): with self.create_dataset(config) as (dataset, info): assert len(dataset.classes) == len(info["classes"]) @@ -1702,32 +1702,32 @@ class Places365TestCase(datasets_utils.ImageDatasetTestCase): *((f"{category}/Places365_train_00000001.png", idx) for category, idx in _CATEGORIES_CONTENT), ) - @ staticmethod + @staticmethod def 
_make_txt(root, name, seq): file = os.path.join(root, name) with open(file, "w") as fh: for text, idx in seq: fh.write(f"{text} {idx}\n") - @ staticmethod + @staticmethod def _make_categories_txt(root, name): Places365TestCase._make_txt(root, name, Places365TestCase._CATEGORIES_CONTENT) - @ staticmethod + @staticmethod def _make_file_list_txt(root, name): Places365TestCase._make_txt(root, name, Places365TestCase._FILE_LIST_CONTENT) - @ staticmethod + @staticmethod def _make_image(file_name, size): os.makedirs(os.path.dirname(file_name), exist_ok=True) PIL.Image.fromarray(np.zeros((*size, 3), dtype=np.uint8)).save(file_name) - @ staticmethod + @staticmethod def _make_devkit_archive(root, split): Places365TestCase._make_categories_txt(root, Places365TestCase._CATEGORIES) Places365TestCase._make_file_list_txt(root, Places365TestCase._FILE_LISTS[split]) - @ staticmethod + @staticmethod def _make_images_archive(root, split, small): folder_name = Places365TestCase._IMAGES[(split, small)] image_size = (256, 256) if small else (512, random.randint(512, 1024)) @@ -2042,7 +2042,7 @@ def inject_fake_data(self, tmpdir, config): return num_examples[config["split"]] - @ datasets_utils.test_all_configs + @datasets_utils.test_all_configs def test_flow(self, config): # Make sure flow always exists, and make sure there are as many flow values as (pairs of) images # Also make sure the flow is properly decoded @@ -2101,7 +2101,7 @@ def inject_fake_data(self, tmpdir, config): ) return num_examples - @ datasets_utils.test_all_configs + @datasets_utils.test_all_configs def test_flow(self, config): h, w = self.FLOW_H, self.FLOW_W expected_flow = np.arange(3 * h * w).reshape(h, w, 3).transpose(2, 0, 1) @@ -2726,7 +2726,9 @@ def inject_fake_data(self, tmpdir, config): def test_training_test_splits(self): with self.create_dataset(split="train") as (dataset, _): - assert dataset._images and len(dataset._images) == len(dataset._disparities), "Training images do not match with training disparities" + assert dataset._images and len(dataset._images) == len( + dataset._disparities + ), "Training images do not match with training disparities" for _, _, disparity, valid_mask in dataset: assert len(disparity.shape) == 3 assert len(valid_mask.shape) == 2 @@ -2813,10 +2815,10 @@ def _make_scene_folder(root_dir: str, scene_name: str, split: str) -> List[str]: scene_dir = os.path.join(root_dir, f"{scene_name}{c}") os.makedirs(scene_dir, exist_ok=True) # make normal images first - datasets_utils.create_image_file(root=scene_dir, name=f"im0.png", size=(3, 100, 100)) - datasets_utils.create_image_file(root=scene_dir, name=f"im1.png", size=(3, 100, 100)) - datasets_utils.create_image_file(root=scene_dir, name=f"im1E.png", size=(3, 100, 100)) - datasets_utils.create_image_file(root=scene_dir, name=f"im1L.png", size=(3, 100, 100)) + datasets_utils.create_image_file(root=scene_dir, name="im0.png", size=(3, 100, 100)) + datasets_utils.create_image_file(root=scene_dir, name="im1.png", size=(3, 100, 100)) + datasets_utils.create_image_file(root=scene_dir, name="im1E.png", size=(3, 100, 100)) + datasets_utils.create_image_file(root=scene_dir, name="im1L.png", size=(3, 100, 100)) # these are going to end up being gray scale images datasets_utils.make_fake_pfm_file(h=100, w=100, file_name=os.path.join(scene_dir, "disp0.pfm")) datasets_utils.make_fake_pfm_file(h=100, w=100, file_name=os.path.join(scene_dir, "disp1.pfm")) @@ -2827,7 +2829,7 @@ def inject_fake_data(self, tmpdir, config): split_scene_map = { "train": ["Adirondack", "Jadeplant", 
"Motorcycle", "Piano"], "additional": ["Backpack", "Bicycle1", "Cable", "Classroom1"], - "test": ["Plants", "Classroom2E", "Classroom2", "Australia"] + "test": ["Plants", "Classroom2E", "Classroom2", "Australia"], } middlebury_dir = pathlib.Path(tmpdir, "Middlebury2014") @@ -2895,7 +2897,7 @@ def test_warnings_train(self): with pytest.warns( RuntimeWarning, match=f"\nSplit '{split}' has calibration settings, however None was provided as an argument." - f"\nSetting calibration to 'perfect' for split '{split}'. Available calibration settings are: 'perfect', 'imperfect', 'both'.", + f"\nSetting calibration to 'perfect' for split '{split}'. Available calibration settings are: 'perfect', 'imperfect', 'both'.", ): with self.create_dataset(split=split, calibration=calibration): pass @@ -2905,8 +2907,7 @@ def test_warnings_test(self): split = "test" calibration = "perfect" with pytest.warns( - RuntimeWarning, - match="\nSplit 'test' has only no calibration settings, ignoring calibration argument." + RuntimeWarning, match="\nSplit 'test' has only no calibration settings, ignoring calibration argument." ): with self.create_dataset(split=split, calibration=calibration): pass @@ -3086,13 +3087,14 @@ def test_bad_input(self): class StereoSceneFlowTestCase(datasets_utils.ImageDatasetTestCase): DATASET_CLASS = datasets.StereoSceneFlow ADDITIONAL_CONFIGS = datasets_utils.combinations_grid( - split=("FlyingThings3D", "Driving", "Monkaa"), - pass_name=("clean", "final") + split=("FlyingThings3D", "Driving", "Monkaa"), pass_name=("clean", "final") ) FEATURE_TYPES = (PIL.Image.Image, PIL.Image.Image, (np.ndarray, type(None)), (np.ndarray, type(None))) @staticmethod - def _create_pfm_folder(root: str, name: str, file_name_fn: Callable[..., str], num_examples: int, size: Tuple[int, int]) -> List[str]: + def _create_pfm_folder( + root: str, name: str, file_name_fn: Callable[..., str], num_examples: int, size: Tuple[int, int] + ) -> List[str]: root = pathlib.Path(root) / name os.makedirs(root, exist_ok=True) @@ -3193,8 +3195,12 @@ def _make_scene_folder(root: str, scene_name: str, size: Tuple[int, int]) -> Lis paths.append(datasets_utils.create_image_file(root, "image1.left.jpg", size=(3, size[1], size[0]))) paths.append(datasets_utils.create_image_file(root, "image1.right.jpg", size=(3, size[1], size[0]))) # single channel depth maps - paths.append(StereoFallingThingsTestCase._make_dummy_depth_map(root, "image1.left.depth.png", size=(size[0], size[1]))) - paths.append(StereoFallingThingsTestCase._make_dummy_depth_map(root, "image1.right.depth.png", size=(size[0], size[1]))) + paths.append( + StereoFallingThingsTestCase._make_dummy_depth_map(root, "image1.left.depth.png", size=(size[0], size[1])) + ) + paths.append( + StereoFallingThingsTestCase._make_dummy_depth_map(root, "image1.right.depth.png", size=(size[0], size[1])) + ) # camera settings json. 
Minimal example for _read_disparity function testing settings_json = {"camera_settings": [{"intrinsic_settings": {"fx": 1}}]} with open(root / "_camera_settings.json", "w") as f: diff --git a/torchvision/datasets/_stereo_matching.py b/torchvision/datasets/_stereo_matching.py index 254d9d2624a..8ef5f3e6e1a 100644 --- a/torchvision/datasets/_stereo_matching.py +++ b/torchvision/datasets/_stereo_matching.py @@ -33,7 +33,7 @@ def read_pfm_file(file_path: str) -> np.array: # adapted from https://github.com/ucbdrive/hd3/blob/master/utils/pfm.py with open(file_path, "rb") as file: header = file.readline().rstrip() - if not header in [b"PF", b"Pf"]: + if header not in [b"PF", b"Pf"]: raise ValueError(f"Not a valid PFM file: {file_path}") dim_match = re.match(rb"^(\d+)\s(\d+)\s$", file.readline()) From a31ee83e49802063dc1941f41dab49b511efc515 Mon Sep 17 00:00:00 2001 From: Ponku Date: Sun, 10 Jul 2022 17:05:50 +0100 Subject: [PATCH 12/35] Added Stereo Matching dataset interface and several classic datasets. --- torchvision/datasets/_stereo_matching.py | 479 +++++++++++++++++++++++ 1 file changed, 479 insertions(+) create mode 100644 torchvision/datasets/_stereo_matching.py diff --git a/torchvision/datasets/_stereo_matching.py b/torchvision/datasets/_stereo_matching.py new file mode 100644 index 00000000000..42535c1623b --- /dev/null +++ b/torchvision/datasets/_stereo_matching.py @@ -0,0 +1,479 @@ +from abc import ABC, abstractmethod +from functools import reduce +from glob import glob +from pathlib import Path +from random import random +import re +import shutil +from typing import Callable, List, Optional, Tuple, Any +import lzma +from torch import Tensor +from .vision import VisionDataset +from .utils import download_and_extract_archive, download_url, verify_str_arg +import os +from torch.utils.model_zoo import tqdm +import numpy as np +from PIL import Image + +__all__ = ( + "CSEStereo" + "Middlebury2014" + "ETH3D" + "Kitti2012" + "Kitti2015" +) + + +def read_pfm_file(file_path: str) -> np.array: + # adapted from https://github.com/ucbdrive/hd3/blob/master/utils/pfm.py + with open(file_path, "rb") as file: + header = file.readline().rstrip() + assert header in ["PF", "Pf"], f"{file_path} is not a valid .pfm file" + dim_match = re.match(r'^(\d+)\s(\d+)\s$', file.readline()) + assert dim_match, f"{file_path} has a Malformed PFM header" + + width, height = map(int, dim_match.groups()) + channels = 3 if header == "PF" else 1 + scale = float(file.readline().rstrip()) + # check for endian type + if scale < 0: + scale = -scale + endian = '<' + else: + endian = '>' + + data = np.fromfile(file, endian + 'f') + data = np.reshape(data, (height, width, channels)) + data = np.flipud(data) + + return data + + +class StereoMatchingDataset(ABC, VisionDataset): + """Base interface for Stereo matching datasets""" + + def __init__(self, root: str, transforms: Optional[Callable] = None): + super().__init__(root=root) + self.transforms = transforms + + self._images: List[Tuple] = [] + self._disparities: List[Tuple] = [] + + def _read_img(self, file_path: str) -> Image.Image: + img = Image.open(file_path) + if img.mode != "RGB": + img = img.convert("RGB") + return img + + @abstractmethod + def _read_disparity(self, file_path: str) -> Tuple: + # function that returns a disparity map and an occlusion map + pass + + def __getitem__(self, index: int) -> Tuple[Tuple, Tuple, Tuple]: + img_left = self._read_img(self._images[index][0]) + img_right = self._read_img(self._images[index][1]) + + dsp_map_left, occ_mask_left = 
self._read_disparity(self._disparities[index][0])
+        dsp_map_right, occ_mask_right = self._read_disparity(self._disparities[index][1])
+
+        imgs = (img_left, img_right)
+        dsp_maps = (dsp_map_left, dsp_map_right)
+        occ_masks = (occ_mask_left, occ_mask_right)
+
+        if self.transforms is not None:
+            imgs, dsp_maps, occ_masks, = self.transforms(imgs, dsp_maps, occ_masks)
+
+        return imgs, dsp_maps, occ_masks
+
+    def __len__(self) -> int:
+        return len(self._images)
+
+
+class CRESSyntethicStereo(StereoMatchingDataset):
+    """Synthetic dataset used in training the `CREStereo `_ architecture.
+
+    Ported from the download script in the paper github `repo `_.
+    """
+    DOWNLOAD_SPACE = 4 * 1024 * 1024 * 1024  # downloading the dataset requires about 400 GB of free space
+
+    EXPERIMENTAL_RANGE = 1  # TODO: remove after validating dataset structure / flow
+
+    def __init__(self, root: str, transforms: Optional[Callable] = None, download: bool = True):
+        super().__init__(root, transforms)
+        # if the API user requests a dataset download check that the user can download it
+        if download:
+            statvfs = os.statvfs(root)
+            # measured in bytes
+            available_space = statvfs.f_frsize * statvfs.f_bavail
+            if available_space - self.DOWNLOAD_SPACE < 0:
+                raise ValueError(
+                    f"The storage device for {root} is too small to download the dataset, "
+                    f"an additional {(self.DOWNLOAD_SPACE - available_space) / (1024 ** 3):.2f} GB are required."
+                )
+            self._download_dataset(root)
+
+    def _download_dataset(self, root: str) -> None:
+        # TODO: remove before release, used only for testing purposes
+        dirs = ["tree", "shapenet", "reflective", "hole"]
+        # create directory subtree for the download
+        for d in dirs:
+            d_path = os.path.join(root, d)
+            if not os.path.exists(d_path):
+                os.makedirs(d_path)
+
+            for i in range(self.EXPERIMENTAL_RANGE):
+                url = f"https://data.megengine.org.cn/research/crestereo/dataset/{d}/{i}.tar"
+                download_and_extract_archive(url=url, download_root=d_path, remove_finished=True)
+
+
+class Middlebury2014(StereoMatchingDataset):
+    """Publicly available scenes from the Middlebury dataset `2014 version `.
+
+    The dataset mostly follows the original format, without containing the ambient subdirectories: ::
+
+        root
+            Middlebury2014
+                train
+                    scene1-{ ,perfect,imperfect}
+                        calib.txt
+                        im{0,1}.png
+                        im1E.png
+                        im1L.png
+                        disp{0,1}.pfm
+                        disp{0,1}-n.png
+                        disp{0,1}-sd.pfm
+                        disp{0,1}y.pfm
+                    scene2-{ ,perfect,imperfect}
+                        calib.txt
+                        im{0,1}.png
+                        im1E.png
+                        im1L.png
+                        disp{0,1}.pfm
+                        disp{0,1}-n.png
+                        disp{0,1}-sd.pfm
+                        disp{0,1}y.pfm
+                    ...
+                additional
+                    scene1-{ ,perfect,imperfect}
+                        calib.txt
+                        im{0,1}.png
+                        im1E.png
+                        im1L.png
+                        disp{0,1}.pfm
+                        disp{0,1}-n.png
+                        disp{0,1}-sd.pfm
+                        disp{0,1}y.pfm
+                    ...
+                test
+                    scene1
+                        calib.txt
+                        im{0,1}.png
+                    scene2
+                        calib.txt
+                        im{0,1}.png
+                    ...
+
+
+    Args:
+        root (string): Root directory of the Middlebury 2014 Dataset.
+        split (string, optional): The dataset split of scenes, either "train" (default), "test", or "additional".
+        use_ambient_views (boolean, optional): Whether to use different exposure or lighting views when possible.
+            Views are sampled with equal probability.
+        calibration (string, optional): Whether to use the calibrated (default) or uncalibrated scenes.
+        transforms (callable, optional): A function/transform that takes in
+            ``left_img, right_img, left_disparity, right_disparity`` and returns a transformed version.
+        download (boolean, optional): Whether to download the dataset in the ``root`` directory.
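+
+    Example:
+        A minimal usage sketch, assuming the scenes are already downloaded or extracted
+        under ``root`` (``"datasets"`` below is a placeholder path)::
+
+            dataset = Middlebury2014(root="datasets", split="train")
+            imgs, dsp_maps, occ_masks = dataset[0]
+            img_left, img_right = imgs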
+ """ + + splits = { + "train": ["Adirondack", "Jadeplant", "Motorcycle", "Piano", "Pipes", "Playroom", "Playtable", "Recycle", "Shelves", "Vintage"], + "additional": ["Backpack", "Bicycle1", "Cable", "Classroom1", "Couch", "Flowers", "Mask", "Shopvac", "Sticks", "Storage", "Sword1", "Sword2", "Umbrella"], + "test": ['Plants', 'Classroom2E', 'Classroom2', 'Australia', 'DjembeL', 'CrusadeP', 'Crusade', 'Hoops', 'Bicycle2', 'Staircase', 'Newkuba', 'AustraliaP', 'Djembe', 'Livingroom', 'Computer'] + } + + def __init__( + self, + *, + root: str, + split: str = "train", + use_ambient_views: bool = False, + transforms: Optional[Callable] = None, + download: bool = False + ): + super().__init__(root, transforms) + verify_str_arg(split, "split", valid_values=("train", "test", "additional")) + + if download: + self._download_dataset(root) + + root = Path(root) / "FlyingChairs" + if not os.path.exists(root / split): + raise FileNotFoundError( + f"The {split} directory was not found in the provided root directory" + ) + + split_scenes = self.splits[split] + # check that the provided root folder contains the scene splits + if not all(s in os.listdir(root / split) for s in split_scenes): + raise FileNotFoundError(f"Provided root folder does not contain any scenes from the {split} split.") + + imgs_left = sorted(glob(str(root / split / "*" / "im0.png"))) + imgs_right = sorted(glob(str(root / split / "*" / "im1.png"))) + self._images = list((l, r) for l, r in zip(imgs_left, imgs_right)) + + if split == "test": + dsp_maps_left, dsp_maps_right = list("" for _ in imgs_left), list("" for _ in imgs_right) + else: + + dsp_maps_left = sorted(glob(str(root / split / "*" / "disp0.pfm"))) + dsp_maps_right = sorted(glob(str(root / split / "*" / "disp1.pfm"))) + self._disparities = list((l, r) for l, r in zip(dsp_maps_left, dsp_maps_right)) + + self.use_ambient_views = use_ambient_views + + def __getitem__(self, index: int) -> Tuple: + return super().__getitem__(index) + + def _read_img(self, file_path: str) -> Image.Image: + if os.path.basename(file_path) == "im1.png" and self.use_ambient_views: + # initialize sampleable container + ambient_file_paths = list(file_path.replace("im1.png", view_name) for view_name in ["im1E.png", "im1L.png"]) + # double check that we're not going to try to read from an invalid file path + ambient_file_paths = list(filter(lambda p: os.path.exists(p), ambient_file_paths)) + # keep the original image as an option as well for uniform sampling between base views + ambient_file_paths.append(file_path) + file_path = random.choice(ambient_file_paths) + return super()._read_img(file_path) + + def _read_disparity(self, file_path: str) -> Tuple: + if not os.path.exists(file_path): # case when dealing with the test split + return None, None + dsp_mask = read_pfm_file(file_path) + occ_mask = dsp_mask < 1e3 + return dsp_mask, occ_mask + + def _download_dataset(self, root: str): + base_url = "https://vision.middlebury.edu/stereo/data/scenes2014/zip" + # train and additional splits have 2 different calibration settings + root = Path(root) / "Middlebury2014" + for split_name, split_scenes in self.splits.values(): + if split_name == "test": + continue + split_root = root / split_name + for scene in split_scenes: + scene_name = f"{scene}-{calibration}" + for calibration in ["perfect", "imperfect"]: + scene_url = f"{base_url}/{scene_name}.zip" + download_and_extract_archive(url=scene_url, filename=scene_name, download_root=str(split_root), remove_finished=True) + + if any(s not in os.listdir(root) for s 
in self.splits["test"]): + # test split is downloaded from a different location + test_set_url = "https://vision.middlebury.edu/stereo/submit3/zip/MiddEval3-data-F.zip" + + # the unzip is going to produce a directory MiddEval3 with two subdirectories trainingF and testF + # we want to move the contents from testF into the directory + download_and_extract_archive(url=test_set_url, download_root=root, remove_finished=True) + for scene_dir, scene_names, _ in os.walk(str(root / "MiddEval3/testF")): + for scene in scene_names: + shutil.move(os.path.join(scene_dir, scene), os.path.join(root, scene)) + + # cleanup MiddEval3 directory + shutil.rmtree(os.path.join(root, "MiddEval3")) + + +class ETH3D(StereoMatchingDataset): + """"ETH3D `Low-Res Two-View `_ dataset. + + The dataset is expected to have the following structure: :: + + root + ETH3D + two_view_training + scene1 + im1.png + im0.png + images.txt + cameras.txt + calib.txt + scene2 + im1.png + im0.png + images.txt + cameras.txt + calib.txt + ... + two_view_training_gt + scene1 + disp0GT.pfm + mask0nocc.png + scene2 + disp0GT.pfm + mask0nocc.png + ... + two_view_testing + scene1 + im1.png + im0.png + images.txt + cameras.txt + calib.txt + scene2 + im1.png + im0.png + images.txt + cameras.txt + calib.txt + ... + + Args: + root (string): Root directory of the ETH3D Dataset. + split (string, optional): The dataset split of scenes, either "train" (default) or "test". + calibration (string, optional): Wether or not to use the calibrated (default) or uncalibrated scenes. + transforms (callalbe, optional): A function/transform that takes in + ``left_img, right_img, left_disparity, right_disparity`` and returns a transformed version. + """ + + def __init__(self, root: str, split: str = "train", transforms: Optional[Callable] = None): + super().__init__(root, transforms) + + verify_str_arg(split, "split", valid_values=("train", "test")) + + root = Path(root) / "ETH3D" + img_dir = "two_view_training" if split == "train" else "two_view_testing" + anot_dir = "two_view_training_gt" + + imgs_left = sorted(glob(str(root / img_dir / "*" / "*im0.png"))) + imgs_right = sorted(glob(str(root / img_dir / "*" / "*im1.png"))) + + if split == "test": + dsp_masks_left, dsp_masks_right = list("" for _ in imgs_left), list("" for _ in imgs_right) + else: + dsp_masks_left = sorted(glob(str(root / anot_dir / "*" / "*[0-1].pfm"))) + # no masks for the right view, always using left as reference + dsp_masks_right = list("" for _ in dsp_masks_left) + + self._images = list((l, r) for l, r in zip(imgs_left, imgs_right)) + self._dsp_masks = list((l, r) for l, r in zip(dsp_masks_left, dsp_masks_right)) + + def _read_disparity(self, file_path: str) -> Tuple: + if not os.path.exists(file_path): + return None, None + + dsp_mask = read_pfm_file(file_path) + occ_mask = Image.open(file_path.replace("disp0GT.pfm", "mask0nocc.png")) + occ_mask = np.array(occ_mask) + return dsp_mask, occ_mask + + def __getitem__(self, index: int) -> Tuple[Tuple, Tuple, Tuple]: + return super().__getitem__(index) + + +class Kitti2012(StereoMatchingDataset): + """"Kitti dataset from the `2012 `_ stereo evaluation benchmark. + Uses the RGB images for consistency with Kitti 2015. + + The dataset is expected to have the following structure: :: + + root + Kitti2012 + testing + colored_0 + colored_1 + training + colored_0 + colored_1 + disp_noc + calib + + Args: + root (string): Root directory where Kitti2012 is located. 
+        split (string, optional): The dataset split of scenes, either "train" (default) or "test".
+        transforms (callable, optional): A function/transform that takes in
+            ``left_img, right_img, left_disparity, right_disparity`` and returns a transformed version.
+    """
+
+    def __init__(self, root: str, split: str = "train", transforms: Optional[Callable] = None):
+        super().__init__(root, transforms)
+
+        verify_str_arg(split, "split", valid_values=("train", "test"))
+
+        root = Path(root) / "Kitti2012" / (split + "ing")
+        imgs_left = sorted(glob(str(root / "colored_0" / "*_10.png")))
+        imgs_right = sorted(glob(str(root / "colored_1" / "*_10.png")))
+
+        if split == "train":
+            dsp_masks_left = sorted(glob(str(root / "disp_noc" / "*.png")))
+            dsp_masks_right = list("" for _ in dsp_masks_left)
+        else:
+            dsp_masks_left, dsp_masks_right = list("" for _ in dsp_masks_left), list("" for _ in dsp_masks_right)
+
+        self._images = list((l, r) for l, r in zip(imgs_left, imgs_right))
+        self._dsp_masks = list((l, r) for l, r in zip(dsp_masks_left, dsp_masks_right))
+
+    def _read_disparity(self, file_path: str) -> Tuple:
+        if not os.path.exists(file_path):
+            return None, None
+
+        dsp_mask = np.array(Image.open(file_path)) / 256.0
+        occ_mask = dsp_mask > 0.0
+
+        return dsp_mask, occ_mask
+
+    def __getitem__(self, index: int) -> Tuple[Tuple, Tuple, Tuple]:
+        return super().__getitem__(index)
+
+
+class Kitti2015(StereoMatchingDataset):
+    """Kitti dataset from the `2015 `_ stereo evaluation benchmark.
+
+    The dataset is expected to have the following structure: ::
+
+        root
+            Kitti2015
+                testing
+                    image_2
+                    image_3
+                training
+                    image_2
+                    image_3
+                    disp_noc_0
+                    disp_noc_1
+                    calib
+
+    Args:
+        root (string): Root directory where Kitti2015 is located.
+        split (string, optional): The dataset split of scenes, either "train" (default) or "test".
+        transforms (callable, optional): A function/transform that takes in
+            ``left_img, right_img, left_disparity, right_disparity`` and returns a transformed version.
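+
+    Example:
+        A minimal usage sketch, assuming ``root`` already contains a ``Kitti2015`` folder
+        laid out as above (``"datasets"`` below is a placeholder path)::
+
+            dataset = Kitti2015(root="datasets", split="train")
+            imgs, dsp_maps, occ_masks = dataset[0]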
+ """ + + def __init__(self, root: str, split: str = "train", transforms: Optional[Callable] = None): + super().__init__(root, transforms) + + verify_str_arg(split, "split", valid_values=("train", "test")) + + root = Path(root) / "Kitti2015" / (split + "ing") + imgs_left = sorted(glob(str(root / "image_2" / "*_10.png"))) + imgs_right = sorted(glob(str(root / "image_3" / "*_10.png"))) + + if split == "train": + dsp_masks_left = sorted(glob(str(root / "disp_noc_0" / "*.png"))) + dsp_masks_right = sorted(glob(str(root / "disp_noc_1" / "*.png"))) + else: + dsp_masks_left, dsp_masks_right = list("" for _ in dsp_masks_left), list("" for _ in dsp_masks_right) + + self._images = list((l, r) for l, r in zip(imgs_left, imgs_right)) + self._dsp_masks = list((l, r) for l, r in zip(dsp_masks_left, dsp_masks_right)) + + def _read_disparity(self, file_path: str) -> Tuple: + if not os.path.exists(file_path): + return None, None + + dsp_mask = np.array(Image.open(file_path)) / 256.0 + occ_mask = dsp_mask > 0.0 + + return dsp_mask, occ_mask + + def __getitem__(self, index: int) -> Tuple[Tuple, Tuple, Tuple]: + return super().__getitem__(index) From 4a5ac8931cb04d85d6bd833af2a0b0c8ebffcdd9 Mon Sep 17 00:00:00 2001 From: Ponku Date: Mon, 11 Jul 2022 20:19:23 +0100 Subject: [PATCH 13/35] added SceneFlow, FallingThings and CREStereo --- torchvision/datasets/_stereo_matching.py | 47 +++++++++++++++++++++++- vision | 1 + 2 files changed, 47 insertions(+), 1 deletion(-) create mode 160000 vision diff --git a/torchvision/datasets/_stereo_matching.py b/torchvision/datasets/_stereo_matching.py index 42535c1623b..960e443bd46 100644 --- a/torchvision/datasets/_stereo_matching.py +++ b/torchvision/datasets/_stereo_matching.py @@ -28,7 +28,8 @@ def read_pfm_file(file_path: str) -> np.array: # adapted from https://github.com/ucbdrive/hd3/blob/master/utils/pfm.py with open(file_path, "rb") as file: header = file.readline().rstrip() - assert header in ["PF", "Pf"], f"{file_path} is not a valid .pfm file" + assert header in [b"PF", b"Pf"], f"{file_path} is not a valid .pfm file" + dim_match = re.match(r'^(\d+)\s(\d+)\s$', file.readline()) assert dim_match, f"{file_path} has a Malformed PFM header" @@ -477,3 +478,47 @@ def _read_disparity(self, file_path: str) -> Tuple: def __getitem__(self, index: int) -> Tuple[Tuple, Tuple, Tuple]: return super().__getitem__(index) + + +class SintelDataset(StereoMatchingDataset): + """"Sintel `Stereo Dataset `_. + + Args: + root (string): Root directory where Sintel Stereo is located. + transforms (callalbe, optional): A function/transform that takes in + ``left_img, right_img, left_disparity, right_disparity`` and returns a transformed version. 
+ """ + + def __init__(self, root: str, transforms: Optional[Callable] = None): + super().__init__(root, transforms) + + root = Path(root) / "Sintel" + + imgs_left = sorted(glob(str(root / "training" / "final_left" / "*" / "*.png"))) + imgs_right = sorted(glob(str(root / "training" / "final_right" / "*" / "*.png"))) + + dps_masks_left = sorted(glob(str(root / "training" / "disparities" / "*" / "*.png"))) + dsp_masks_right = list("" for _ in dps_masks_left) + + self._images = list((l, r) for l, r in zip(imgs_left, imgs_right)) + self._dsp_masks = list((l, r) for l, r in zip(dps_masks_left, dsp_masks_right)) + + def _read_disparity(self, file_path: str) -> Tuple: + if not os.path.exists(file_path): + return None, None + + # disparity decoding as per Sintel instructions + dsp_mask = np.array(Image.open(file_path), dtype=np.float32) + r, g, b = np.split(dsp_mask, 3, axis=-1) + dsp_mask = r * 4 + g / (2**6) + b / (2**14) + + # occlusion mask + occ_mask = np.array(Image.open(file_path.replace("disparities", "occlusions"))) > 0 + # out of frame mask + off_mask = np.array(Image.open(file_path.replace("disparities", "outofframe"))) > 0 + # combine the masks together + occ_mask = np.logical_or(off_mask, occ_mask) + return dsp_mask, occ_mask + + def __getitem__(self, index: int) -> Tuple[Tuple, Tuple, Tuple]: + return super().__getitem__(index) diff --git a/vision b/vision new file mode 160000 index 00000000000..bd19fb8ea9b --- /dev/null +++ b/vision @@ -0,0 +1 @@ +Subproject commit bd19fb8ea9b1f67df2a2a1ee116874609ad3ee8c From a1fc699e18c0d1e6e541b7d1e49fc3397c6572c8 Mon Sep 17 00:00:00 2001 From: Ponku Date: Mon, 11 Jul 2022 23:29:04 +0100 Subject: [PATCH 14/35] added SceneFlow, FallingThings and CREStereo --- torchvision/datasets/_stereo_matching.py | 228 ++++++++++++++++++----- 1 file changed, 183 insertions(+), 45 deletions(-) diff --git a/torchvision/datasets/_stereo_matching.py b/torchvision/datasets/_stereo_matching.py index 960e443bd46..65336503b87 100644 --- a/torchvision/datasets/_stereo_matching.py +++ b/torchvision/datasets/_stereo_matching.py @@ -1,26 +1,28 @@ from abc import ABC, abstractmethod -from functools import reduce from glob import glob from pathlib import Path from random import random import re import shutil from typing import Callable, List, Optional, Tuple, Any -import lzma from torch import Tensor from .vision import VisionDataset from .utils import download_and_extract_archive, download_url, verify_str_arg import os -from torch.utils.model_zoo import tqdm import numpy as np from PIL import Image +import json __all__ = ( - "CSEStereo" + "CREStereo" # waiting for download "Middlebury2014" "ETH3D" "Kitti2012" "Kitti2015" + "Sintel" + "SceneFlow" # need to find valid mask procedure + "FallingThings" + "InStereo2k" # waiting for download ) @@ -71,21 +73,21 @@ def _read_disparity(self, file_path: str) -> Tuple: # function that returns a disparity map and an occlusion map pass - def __getitem__(self, index: int) -> Tuple[Tuple, Tuple, Tuple]: + def __getitem__(self, index: int) -> Tuple[Tensor, Tensor, Tensor, Tensor]: img_left = self._read_img(self._images[index][0]) img_right = self._read_img(self._images[index][1]) - dsp_map_left, occ_mask_left = self._read_disparity(self._disparities[index][0]) - dsp_map_right, occ_mask_right = self._read_disparity(self._disparities[index][1]) + dsp_map_left, valid_mask_right = self._read_disparity(self._disparities[index][0]) + dsp_map_right, valid_mask_right = self._read_disparity(self._disparities[index][1]) imgs = (img_left, img_right) 
         dsp_maps = (dsp_map_left, dsp_map_right)
-        occ_masks = (occ_mask_left, occ_mask_right)
+        valid_masks = (valid_mask_left, valid_mask_right)
 
         if self.transforms is not None:
-            imgs, dsp_maps, occ_masks, = self.transforms(imgs, dsp_maps, occ_masks)
+            imgs, dsp_maps, valid_masks, = self.transforms(imgs, dsp_maps, valid_masks)
 
-        return imgs, dsp_maps, occ_masks
+        return imgs[0], imgs[1], dsp_maps[0], valid_masks[0]
 
     def __len__(self) -> int:
         return len(self._images)
@@ -100,7 +102,9 @@ class CRESSyntethicStereo(StereoMatchingDataset):
 
     EXPERIMENTAL_RANGE = 1 # TODO: remove after validating dataset structure / flow
 
-    def __init__(self, root: str, transforms: Optional[Callable] = None, download: bool = True):
+    MAX_DISP = 256.
+
+    def __init__(self, root: str, split: str = "tree", transforms: Optional[Callable] = None, download: bool = True):
         super().__init__(root, transforms)
         # if the API user requests a dataset download check that the user can download it
         if download:
@@ -114,6 +118,32 @@ def __init__(self, root: str, transforms: Optional[Callable] = None, download: b
             )
             self._download_dataset(root)
 
+        verify_str_arg(split, "split", valid_values=("tree", "shapenet", "reflective", "hole", "all"))
+        root = Path(root)
+
+        splits = {
+            "tree": ["tree"],
+            "shapenet": ["shapenet"],
+            "reflective": ["reflective"],
+            "hole": ["hole"],
+            "all": ["tree", "shapenet", "reflective", "hole"],
+        }[split]
+
+        for s in splits:
+            imgs_left = sorted(glob(str(root / s / "*_left.jpg")))
+            imgs_right = [p.replace("_left", "_right") for p in imgs_left]
+            imgs = list((l, r) for l, r in zip(imgs_left, imgs_right))
+            self._images += imgs
+
+            disparity_maps_left = (p.replace("_left", "_left.disp") for p in imgs_left)
+            disparity_maps_right = (p.replace("_right", "_right.disp") for p in imgs_right)
+            disparity_maps = list((l, r) for l, r in zip(disparity_maps_left, disparity_maps_right))
+            self._disparities += disparity_maps
+
+    def _read_disparity(self, file_path: str) -> Tuple:
+        disparity = np.array(Image.open(file_path), dtype=np.float32)
+        valid = (disparity < self.MAX_DISP) & (disparity > 0.)
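+        # ground-truth disparities are considered valid only inside the (0, MAX_DISP) range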
+        return disparity, valid
+
     def _download_dataset(self, root: str) -> None:
         # TODO: remove before release, used only for testing purposes
         dirs = ["tree", "shapenet", "reflective", "hole"]
@@ -249,9 +279,9 @@ def _read_img(self, file_path: str) -> Image.Image:
 
     def _read_disparity(self, file_path: str) -> Tuple:
         if not os.path.exists(file_path): # case when dealing with the test split
             return None, None
-        dsp_mask = read_pfm_file(file_path)
-        occ_mask = dsp_mask < 1e3
-        return dsp_mask, occ_mask
+        disparity_map = read_pfm_file(file_path)
+        valid_mask = disparity_map < 1e3
+        return disparity_map, valid_mask
 
     def _download_dataset(self, root: str):
         base_url = "https://vision.middlebury.edu/stereo/data/scenes2014/zip"
@@ -347,23 +377,23 @@ def __init__(self, root: str, split: str = "train", transforms: Optional[Callabl
         imgs_right = sorted(glob(str(root / img_dir / "*" / "*im1.png")))
 
         if split == "test":
-            dsp_masks_left, dsp_masks_right = list("" for _ in imgs_left), list("" for _ in imgs_right)
+            disparity_maps_left, disparity_maps_right = list("" for _ in imgs_left), list("" for _ in imgs_right)
         else:
-            dsp_masks_left = sorted(glob(str(root / anot_dir / "*" / "*[0-1].pfm")))
+            disparity_maps_left = sorted(glob(str(root / anot_dir / "*" / "*[0-1].pfm")))
             # no masks for the right view, always using left as reference
-            dsp_masks_right = list("" for _ in dsp_masks_left)
+            disparity_maps_right = list("" for _ in disparity_maps_left)
 
         self._images = list((l, r) for l, r in zip(imgs_left, imgs_right))
-        self._dsp_masks = list((l, r) for l, r in zip(dsp_masks_left, dsp_masks_right))
+        self._disparities = list((l, r) for l, r in zip(disparity_maps_left, disparity_maps_right))
 
     def _read_disparity(self, file_path: str) -> Tuple:
         if not os.path.exists(file_path):
             return None, None
 
-        dsp_mask = read_pfm_file(file_path)
-        occ_mask = Image.open(file_path.replace("disp0GT.pfm", "mask0nocc.png"))
-        occ_mask = np.array(occ_mask)
-        return dsp_mask, occ_mask
+        disparity_map = read_pfm_file(file_path)
+        valid_mask = Image.open(file_path.replace("disp0GT.pfm", "mask0nocc.png"))
+        valid_mask = np.array(valid_mask)
+        return disparity_map, valid_mask
 
     def __getitem__(self, index: int) -> Tuple[Tuple, Tuple, Tuple]:
         return super().__getitem__(index)
@@ -404,22 +434,22 @@ def __init__(self, root: str, split: str = "train", transforms: Optional[Callabl
         imgs_right = sorted(glob(str(root / "colored_1" / "*_10.png")))
 
         if split == "train":
-            dsp_masks_left = sorted(glob(str(root / "disp_noc" / "*.png")))
-            dsp_masks_right = list("" for _ in dsp_masks_left)
+            disparity_maps_left = sorted(glob(str(root / "disp_noc" / "*.png")))
+            disparity_maps_right = list("" for _ in disparity_maps_left)
         else:
-            dsp_masks_left, dsp_masks_right = list("" for _ in dsp_masks_left), list("" for _ in dsp_masks_right)
+            disparity_maps_left, disparity_maps_right = list("" for _ in imgs_left), list("" for _ in imgs_right)
 
         self._images = list((l, r) for l, r in zip(imgs_left, imgs_right))
-        self._dsp_masks = list((l, r) for l, r in zip(dsp_masks_left, dsp_masks_right))
+        self._disparities = list((l, r) for l, r in zip(disparity_maps_left, disparity_maps_right))
 
     def _read_disparity(self, file_path: str) -> Tuple:
         if not os.path.exists(file_path):
             return None, None
 
-        dsp_mask = np.array(Image.open(file_path)) / 256.0
-        occ_mask = dsp_mask > 0.0
+        disparity_map = np.array(Image.open(file_path)) / 256.0
+        valid_mask = disparity_map > 0.0
 
-        return dsp_mask, occ_mask
+        return disparity_map, valid_mask
 
     def __getitem__(self, index: int) -> Tuple[Tuple, Tuple, Tuple]:
         return super().__getitem__(index)
@@ -459,22 +489,22 @@ def __init__(self, root: str, split: str = "train", transforms: Optional[Callabl
         imgs_right = sorted(glob(str(root / "image_3" / "*_10.png")))
 
         if split == "train":
-            dsp_masks_left = sorted(glob(str(root / "disp_noc_0" / "*.png")))
-            dsp_masks_right = sorted(glob(str(root / "disp_noc_1" / "*.png")))
+            disparity_maps_left = sorted(glob(str(root / "disp_occ_0" / "*.png")))
+            disparity_maps_right = sorted(glob(str(root / "disp_occ_1" / "*.png")))
         else:
-            dsp_masks_left, dsp_masks_right = list("" for _ in dsp_masks_left), list("" for _ in dsp_masks_right)
+            disparity_maps_left, disparity_maps_right = list("" for _ in imgs_left), list("" for _ in imgs_right)
 
         self._images = list((l, r) for l, r in zip(imgs_left, imgs_right))
-        self._dsp_masks = list((l, r) for l, r in zip(dsp_masks_left, dsp_masks_right))
+        self._disparities = list((l, r) for l, r in zip(disparity_maps_left, disparity_maps_right))
 
     def _read_disparity(self, file_path: str) -> Tuple:
         if not os.path.exists(file_path):
             return None, None
 
-        dsp_mask = np.array(Image.open(file_path)) / 256.0
-        occ_mask = dsp_mask > 0.0
+        disparity_map = np.array(Image.open(file_path)) / 256.0
+        valid_mask = disparity_map > 0.0
 
-        return dsp_mask, occ_mask
+        return disparity_map, valid_mask
 
     def __getitem__(self, index: int) -> Tuple[Tuple, Tuple, Tuple]:
         return super().__getitem__(index)
@@ -498,27 +528,135 @@ def __init__(self, root: str, transforms: Optional[Callable] = None):
         imgs_right = sorted(glob(str(root / "training" / "final_right" / "*" / "*.png")))
 
         dps_masks_left = sorted(glob(str(root / "training" / "disparities" / "*" / "*.png")))
-        dsp_masks_right = list("" for _ in dps_masks_left)
+        disparity_maps_right = list("" for _ in dps_masks_left)
 
         self._images = list((l, r) for l, r in zip(imgs_left, imgs_right))
-        self._dsp_masks = list((l, r) for l, r in zip(dps_masks_left, dsp_masks_right))
+        self._disparities = list((l, r) for l, r in zip(dps_masks_left, disparity_maps_right))
 
     def _read_disparity(self, file_path: str) -> Tuple:
         if not os.path.exists(file_path):
             return None, None
 
         # disparity decoding as per Sintel instructions
-        dsp_mask = np.array(Image.open(file_path), dtype=np.float32)
-        r, g, b = np.split(dsp_mask, 3, axis=-1)
-        dsp_mask = r * 4 + g / (2**6) + b / (2**14)
+        disparity_map = np.array(Image.open(file_path), dtype=np.float32)
+        r, g, b = np.split(disparity_map, 3, axis=-1)
+        disparity_map = r * 4 + g / (2**6) + b / (2**14)
 
         # occlusion mask
-        occ_mask = np.array(Image.open(file_path.replace("disparities", "occlusions"))) > 0
+        valid_mask = np.array(Image.open(file_path.replace("disparities", "occlusions"))) == 0
         # out of frame mask
-        off_mask = np.array(Image.open(file_path.replace("disparities", "outofframe"))) > 0
+        off_mask = np.array(Image.open(file_path.replace("disparities", "outofframe"))) == 0
         # combine the masks together
-        occ_mask = np.logical_or(off_mask, occ_mask)
-        return dsp_mask, occ_mask
+        valid_mask = np.logical_and(off_mask, valid_mask)
+        return disparity_map, valid_mask
 
     def __getitem__(self, index: int) -> Tuple[Tuple, Tuple, Tuple]:
         return super().__getitem__(index)
+
+
+class SceneFlowDataset(StereoMatchingDataset):
+    """Dataset interface for `Scene Flow `_ datasets."""
+
+    def __init__(self, root: str, split: str = "train", pass_name: str = "clean", transforms: Optional[Callable] = None):
+        super().__init__(root, transforms)
+
+        verify_str_arg(split, "split", valid_values=("FlyingThings3D", "Driving", "Monkaa"))
"Monkaa")) + split = split.upper() + + verify_str_arg(split, "pass_name", valid_values=("clean", "final", "both")) + + passes = { + "clean": ["frames_cleanpass"], + "final": ["frames_finalpass"], + "both": ["frames_cleanpass, frames_finalpass"], + }[pass_name] + + root = Path(root) / split + + for p in passes: + imgs_left = sorted(glob(str(root / p / "left" / "*" / "*.png"))) + imgs_right = sorted(glob(str(root / p / "right" / "*" / "*.png"))) + imgs = list((l, r) for l, r in zip(imgs_left, imgs_right)) + self._images += imgs + + disparity_maps_left = [file_path.replace(p, "disparity").replace(".png", ".pfm") for file_path in imgs_left] + disparity_maps_right = [file_path.replace(p, "disparity").replace(".png", ".pfm") for file_path in imgs_right] + disparity_maps = list((l, r) for l, r in zip(disparity_maps_left, disparity_maps_right)) + self._disparities += disparity_maps + + def _read_disparity(self, file_path: str) -> Tuple: + disparity = read_pfm_file(file_path) + valid = np.ones_like(disparity) + return disparity, valid + + +class FallingThingsDataset(StereoMatchingDataset): + """FallingThings ``_ dataset + + The dataset is expected to have the following structre: :: + + root + FallingThings + single + scene1 + _object_settings.json + _camera_settings.json + image1.left.depth.png + image1.right.depth.png + image1.left.jpg + image1.right.jpg + image2.left.depth.png + image2.right.depth.png + image2.left.jpg + image2.right + ... + scene2 + ... + mixed + scene1 + _object_settings.json + _camera_settings.json + image1.left.depth.png + image1.right.depth.png + image1.left.jpg + image1.right.jpg + image2.left.depth.png + image2.right.depth.png + image2.left.jpg + image2.right + ... + scene2 + ... + """ + + def __init__(self, root: str, split: str = "single", transforms: Optional[Callable] = None): + super().__init__(root, transforms) + + verify_str_arg(split, "split", valid_values=("single", "mixed", "both")) + split = split.upper() + + splits = { + "single": ["single"], + "mixed": ["mixed"], + "both": ["single", "mixed"], + }[split] + + for s in splits: + imgs_left = sorted(glob(str(root / s / "*.left.jpg"))) + imgs_right = sorted(glob(str(root / s / "*.right.jpg"))) + imgs = list((l, r) for l, r in zip(imgs_left, imgs_right)) + self._images += imgs + + disparity_maps_left = sorted(glob(str(root / s / "*.left.depth.png"))) + disparity_maps_right = sorted(glob(str(root / s / "*.right.depth.png"))) + disparity_maps = list((l, r) for l, r in zip(disparity_maps_left, disparity_maps_right)) + self._disparities += disparity_maps + + def _read_disparity(self, file_path: str) -> Tuple: + depth = Image.Open(file_path) + with open(os.path.split(file_path)[0] + '_camera_settings.json', 'r') as f: + intrinsics = json.load(f) + fx = intrinsics['camera_settings'][0]['intrinsic_settings']['fx'] + disparity = (fx * 6.0 * 100) / depth.astype(np.float32) + valid = disparity > 0 + return disparity, valid From 62368b1d1eb8e260d3ea89acfc245a940c44a700 Mon Sep 17 00:00:00 2001 From: Ponku Date: Mon, 11 Jul 2022 23:34:27 +0100 Subject: [PATCH 15/35] "removed duplicate folder" --- vision | 1 - 1 file changed, 1 deletion(-) delete mode 160000 vision diff --git a/vision b/vision deleted file mode 160000 index bd19fb8ea9b..00000000000 --- a/vision +++ /dev/null @@ -1 +0,0 @@ -Subproject commit bd19fb8ea9b1f67df2a2a1ee116874609ad3ee8c From 33c52a5705a414ce2f47fe193e7cb9c0f48432d1 Mon Sep 17 00:00:00 2001 From: Ponku Date: Wed, 13 Jul 2022 11:22:29 +0100 Subject: [PATCH 16/35] Added InStereo2k. 
Started working on dataset tests --- test/datasets_utils.py | 14 +- test/test_datasets.py | 552 ++++++++++++++++++++++- torchvision/datasets/__init__.py | 1 + torchvision/datasets/_stereo_matching.py | 191 ++++++-- 4 files changed, 686 insertions(+), 72 deletions(-) diff --git a/test/datasets_utils.py b/test/datasets_utils.py index 88eb4e17823..f051e325968 100644 --- a/test/datasets_utils.py +++ b/test/datasets_utils.py @@ -561,9 +561,9 @@ def test_feature_types(self, config): @test_all_configs def test_num_examples(self, config): with self.create_dataset(config) as (dataset, info): - assert len(dataset) == info["num_examples"] + assert len(dataset) == info["num_examples"], f"The number of examples {len(dataset)} does not match the expected {info['num_examples']}" - @test_all_configs + @ test_all_configs def test_transforms(self, config): mock = unittest.mock.Mock(wraps=lambda *args: args[0] if len(args) == 1 else args) for kwarg in self._TRANSFORM_KWARGS: @@ -587,7 +587,7 @@ class ImageDatasetTestCase(DatasetTestCase): FEATURE_TYPES = (PIL.Image.Image, int) - @contextlib.contextmanager + @ contextlib.contextmanager def create_dataset( self, config: Optional[Dict[str, Any]] = None, @@ -610,7 +610,7 @@ def create_dataset( with self._force_load_images(): yield dataset, info - @contextlib.contextmanager + @ contextlib.contextmanager def _force_load_images(self): open = PIL.Image.open @@ -649,7 +649,7 @@ def _set_default_frames_per_clip(self, inject_fake_data): args_without_default = argspec.args[1 : (-len(argspec.defaults) if argspec.defaults else None)] frames_per_clip_last = args_without_default[-1] == "frames_per_clip" - @functools.wraps(inject_fake_data) + @ functools.wraps(inject_fake_data) def wrapper(tmpdir, config): args = inject_fake_data(tmpdir, config) if frames_per_clip_last and len(args) == len(args_without_default) - 1: @@ -748,7 +748,7 @@ def size(idx: int) -> Tuple[int, int, int]: ] -@requires_lazy_imports("av") +@ requires_lazy_imports("av") def create_video_file( root: Union[pathlib.Path, str], name: Union[pathlib.Path, str], @@ -790,7 +790,7 @@ def create_video_file( return file -@requires_lazy_imports("av") +@ requires_lazy_imports("av") def create_video_folder( root: Union[str, pathlib.Path], name: Union[str, pathlib.Path], diff --git a/test/test_datasets.py b/test/test_datasets.py index a108479aee3..d390c30cee9 100644 --- a/test/test_datasets.py +++ b/test/test_datasets.py @@ -1,3 +1,4 @@ +from abc import abstractmethod import bz2 import contextlib import csv @@ -10,6 +11,7 @@ import random import shutil import string +from typing import List, Callable, Tuple import unittest import xml.etree.ElementTree as ET import zipfile @@ -23,30 +25,540 @@ from torchvision import datasets +class StereoETH3DTestCase(datasets_utils.ImageDatasetTestCase): + DATASET_CLASS = datasets.StereoETH3D + ADDITIONAL_CONFIGS = datasets_utils.combinations_grid(split=("train", "test")) + FEATURE_TYPES = (PIL.Image.Image, PIL.Image.Image, (np.ndarray, type(None)), (np.ndarray, type(None))) + + @staticmethod + def _create_scene_folder(num_examples: int, root_dir: str) -> List[pathlib.Path]: + # create the scene folder + image_paths = [] + # make the root_dir if it does not exits + os.makedirs(root_dir, exist_ok=True) + + for i in range(num_examples): + scene_dir = os.path.join(root_dir, f"scene_{i}") + os.makedirs(scene_dir, exist_ok=True) + # populate with left right images + image_paths.append(datasets_utils.create_image_file(root=scene_dir, name="im0.png", size=(100, 100))) + 
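+            # im1.png is the right view of the rectified stereo pair (im0.png is the left view)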
+            image_paths.append(datasets_utils.create_image_file(root=scene_dir, name="im1.png", size=(100, 100)))
+        return image_paths
+
+    @staticmethod
+    def _create_annotation_folder(num_examples: int, root_dir: str) -> List[pathlib.Path]:
+        paths = []
+        # make the root_dir if it does not exist
+        os.makedirs(root_dir, exist_ok=True)
+
+        # create scene directories
+        for i in range(num_examples):
+            scene_dir = os.path.join(root_dir, f"scene_{i}")
+            os.makedirs(scene_dir, exist_ok=True)
+            # populate with a random png file for occlusion mask, and a pfm file for disparity
+            paths.append(datasets_utils.create_image_file(root=scene_dir, name="mask0nocc.png", size=(1, 100, 100)))
+            pfm_path = os.path.join(scene_dir, "disp0GT.pfm")
+            datasets_utils.make_fake_pfm_file(h=100, w=100, file_name=pfm_path)
+            paths.append(pfm_path)
+        return paths
+
+    def inject_fake_data(self, tmpdir, config):
+        eth3d_dir = os.path.join(tmpdir, "ETH3D")
+
+        num_examples = 2 if config["split"] == "train" else 3
+
+        split_name = "two_view_training" if config["split"] == "train" else "two_view_testing"
+        split_dir = os.path.join(eth3d_dir, split_name)
+        self._create_scene_folder(num_examples, split_dir)
+
+        if config["split"] == "train":
+            annot_dir = os.path.join(eth3d_dir, "two_view_training_gt")
+            self._create_annotation_folder(num_examples, annot_dir)
+
+        return num_examples
+
+    def test_training_test_splits(self):
+        with self.create_dataset(split="train") as (dataset, _):
+            assert dataset._images and len(dataset._images) == len(dataset._disparities), "Training images do not match with training disparities"
+            for _, _, disparity, valid_mask in dataset:
+                assert len(disparity.shape) == 3
+                assert len(valid_mask.shape) == 2
+                dh, dw, _ = disparity.shape
+                mh, mw = valid_mask.shape
+                assert dh == mh
+                assert dw == mw
+
+        with self.create_dataset(split="test") as (dataset, _):
+            assert all(d == ("", "") for d in dataset._disparities)
+            for _, _, disparity, valid_mask in dataset:
+                assert disparity is None
+                assert valid_mask is None
+
+    def test_bad_input(self):
+        with pytest.raises(ValueError, match="Unknown value 'bad' for argument split"):
+            with self.create_dataset(split="bad"):
+                pass
+
+
+class CREStereoSyntheticTestCase(datasets_utils.ImageDatasetTestCase):
+    DATASET_CLASS = datasets.CREStereoSynthetic
+    ADDITIONAL_CONFIGS = datasets_utils.combinations_grid(split=("tree", "shapenet", "reflective", "hole"))
+    FEATURE_TYPES = (PIL.Image.Image, PIL.Image.Image, (np.ndarray, type(None)), (np.ndarray, type(None)))
+
+    def inject_fake_data(self, tmpdir, config):
+        crestereo_dir = pathlib.Path(tmpdir) / "CREStereo"
+        os.makedirs(crestereo_dir, exist_ok=True)
+
+        split_dir = crestereo_dir / config["split"]
+        os.makedirs(split_dir, exist_ok=True)
+        num_examples = 4
+
+        for idx in range(num_examples):
+            datasets_utils.create_image_file(root=split_dir, name=f"{idx}_left.jpg", size=(100, 100))
+            datasets_utils.create_image_file(root=split_dir, name=f"{idx}_right.jpg", size=(100, 100))
+            # these are going to end up being gray scale images
+            datasets_utils.create_image_file(root=split_dir, name=f"{idx}_left.disp.jpg", size=(1, 100, 100))
+            datasets_utils.create_image_file(root=split_dir, name=f"{idx}_right.disp.jpg", size=(1, 100, 100))
+
+        return num_examples
+
+    def test_splits(self):
+        for split in ("tree", "shapenet", "reflective", "hole"):
+            with self.create_dataset(split=split) as (dataset, _):
+                for left, right, disparity, valid_mask in dataset:
+                    left_array = np.array(left)
+                    right_array = np.array(right)
+                    h, w, c = left_array.shape
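+                    # PIL images decode to (H, W, C) uint8 arrays, so left/right shapes can be compared directly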
# check that left and right are the same size + assert left_array.shape == right_array.shape + # check general shapes + assert c == 3 + assert len(disparity.shape) == 2 + assert len(valid_mask.shape) == 2 + assert disparity.shape == (h, w) + # check that valid mask is the same size as the disparity + dh, dw = disparity.shape + mh, mw = valid_mask.shape + assert dh == mh + assert dw == mw + + def test_bad_input(self): + with pytest.raises(ValueError, match="Unknown value 'bad' for argument split"): + with self.create_dataset(split="bad"): + pass + + +class StereoMiddlebury2014TestCase(datasets_utils.ImageDatasetTestCase): + DATASET_CLASS = datasets.StereoMiddlebury2014 + ADDITIONAL_CONFIGS = datasets_utils.combinations_grid(split=("train", "test", "additional"), use_ambient_views=(True, False)) + FEATURE_TYPES = (PIL.Image.Image, PIL.Image.Image, (np.ndarray, type(None)), (np.ndarray, type(None))) + + @staticmethod + def _make_scene_folder(root_dir: str, scene_name: str, split: str) -> List[str]: + calibrations = [""] if split == "test" else ["-perfect", "-imperfect"] + scene_dirs = [] + for c in calibrations: + scene_dir = os.path.join(root_dir, f"{scene_name}{c}") + os.makedirs(scene_dir, exist_ok=True) + # make normal images first + datasets_utils.create_image_file(root=scene_dir, name=f"im0.png", size=(3, 100, 100)) + datasets_utils.create_image_file(root=scene_dir, name=f"im1.png", size=(3, 100, 100)) + datasets_utils.create_image_file(root=scene_dir, name=f"im1E.png", size=(3, 100, 100)) + datasets_utils.create_image_file(root=scene_dir, name=f"im1L.png", size=(3, 100, 100)) + # these are going to end up being gray scale images + datasets_utils.make_fake_pfm_file(h=100, w=100, file_name=os.path.join(scene_dir, "disp0.pfm")) + datasets_utils.make_fake_pfm_file(h=100, w=100, file_name=os.path.join(scene_dir, "disp1.pfm")) + scene_dirs.append(scene_dir) + return scene_dirs + + def inject_fake_data(self, tmpdir, config): + split_scene_map = { + "train": ["Adirondack", "Jadeplant", "Motorcycle", "Piano"], + "additional": ["Backpack", "Bicycle1", "Cable", "Classroom1"], + "test": ["Plants", "Classroom2E", "Classroom2", "Australia"] + } + + middlebury_dir = pathlib.Path(tmpdir, "Middlebury2014") + os.makedirs(middlebury_dir, exist_ok=True) + + split_dir = middlebury_dir / config["split"] + os.makedirs(split_dir, exist_ok=True) + + num_examples = 4 + for idx in range(num_examples): + # special case for test_bad_input + if config["split"] not in split_scene_map: + return 0 + + scene_name = split_scene_map[config["split"]][idx] + self._make_scene_folder(root_dir=split_dir, scene_name=scene_name, split=config["split"]) + + # account for perfect / imperfect calibrations + if config["split"] != "test": + num_examples *= 2 + + return num_examples + + def test_train_splits(self): + for split in ["train", "additional"]: + with self.create_dataset(split=split) as (dataset, _): + for left, right, disparity, valid_mask in dataset: + left_array = np.array(left) + right_array = np.array(right) + h, w, c = left_array.shape + # check that left and right are the same size + assert left_array.shape == right_array.shape + # check general shapes + assert c == 3 + assert len(disparity.shape) == 3 + assert len(valid_mask.shape) == 3 + assert disparity.shape == (h, w, 3) + # check that valid mask is the same size as the disparity + dh, dw, c = disparity.shape + print(valid_mask.shape) + mh, mw, _ = valid_mask.shape + assert dh == mh + assert dw == mw + + def test_test_split(self): + for split in ["test"]: + with 
self.create_dataset(split=split) as (dataset, _): + for left, right, disparity, valid_mask in dataset: + left_array = np.array(left) + right_array = np.array(right) + h, w, c = left_array.shape + # check that left and right are the same size + assert left_array.shape == right_array.shape + # check general shapes + assert c == 3 + assert disparity is None + assert valid_mask is None + + def test_augmented_view_usage(self): + with self.create_dataset(split="train", use_ambient_views=True) as (dataset, _): + for left, right, _, _ in dataset: + left_array = np.array(left) + right_array = np.array(right) + # check that left and right are the same size + assert left_array.shape == right_array.shape + + def test_bad_input(self): + with pytest.raises(ValueError, match="Unknown value 'bad' for argument split"): + with self.create_dataset(split="bad"): + pass + + +class StereoKitti2012TestCase(datasets_utils.ImageDatasetTestCase): + DATASET_CLASS = datasets.StereoKitti2012 + ADDITIONAL_CONFIGS = datasets_utils.combinations_grid(split=("train", "test")) + FEATURE_TYPES = (PIL.Image.Image, PIL.Image.Image, (np.ndarray, type(None)), (np.ndarray, type(None))) + + def inject_fake_data(self, tmpdir, config): + kitti_dir = pathlib.Path(tmpdir) / "Kitti2012" + os.makedirs(kitti_dir, exist_ok=True) + + split_dir = kitti_dir / (config["split"] + "ing") + os.makedirs(split_dir, exist_ok=True) + + num_examples = 4 + + datasets_utils.create_image_folder( + root=split_dir, + name="colored_0", + file_name_fn=lambda i: f"{i:06d}_10.png", + num_examples=num_examples, + size=(3, 100, 200), + ) + datasets_utils.create_image_folder( + root=split_dir, + name="colored_1", + file_name_fn=lambda i: f"{i:06d}_10.png", + num_examples=num_examples, + size=(3, 100, 200), + ) + + if config["split"] == "train": + datasets_utils.create_image_folder( + root=split_dir, + name="disp_noc", + file_name_fn=lambda i: f"{i:06d}.png", + num_examples=num_examples, + # Kitti2012 uses a single channel image for disparities + size=(1, 100, 200), + ) + + return num_examples + + def test_train_splits(self): + for split in ["train"]: + with self.create_dataset(split=split) as (dataset, _): + for left, right, disparity, valid_mask in dataset: + left_array = np.array(left) + right_array = np.array(right) + h, w, c = left_array.shape + # check that left and right are the same size + assert left_array.shape == right_array.shape + # check general shapes + assert c == 3 + assert len(disparity.shape) == 2 + assert len(valid_mask.shape) == 2 + assert disparity.shape == (h, w) + # check that valid mask is the same size as the disparity + dh, dw = disparity.shape + mh, mw = valid_mask.shape + assert dh == mh + assert dw == mw + + def test_test_split(self): + for split in ["test"]: + with self.create_dataset(split=split) as (dataset, _): + for left, right, disparity, valid_mask in dataset: + left_array = np.array(left) + right_array = np.array(right) + h, w, c = left_array.shape + # check that left and right are the same size + assert left_array.shape == right_array.shape + # check general shapes + assert c == 3 + assert disparity is None + assert valid_mask is None + + def test_bad_input(self): + with pytest.raises(ValueError, match="Unknown value 'bad' for argument split"): + with self.create_dataset(split="bad"): + pass + + +class StereoKitti2015TestCase(datasets_utils.ImageDatasetTestCase): + DATASET_CLASS = datasets.StereoKitti2015 + ADDITIONAL_CONFIGS = datasets_utils.combinations_grid(split=("train", "test")) + FEATURE_TYPES = (PIL.Image.Image, 
PIL.Image.Image, (np.ndarray, type(None)), (np.ndarray, type(None))) + + def inject_fake_data(self, tmpdir, config): + kitti_dir = pathlib.Path(tmpdir) / "Kitti2015" + os.makedirs(kitti_dir, exist_ok=True) + + split_dir = kitti_dir / (config["split"] + "ing") + os.makedirs(split_dir, exist_ok=True) + + num_examples = 4 + + datasets_utils.create_image_folder( + root=split_dir, + name="image_2", + file_name_fn=lambda i: f"{i:06d}_10.png", + num_examples=num_examples, + size=(3, 100, 200), + ) + datasets_utils.create_image_folder( + root=split_dir, + name="image_3", + file_name_fn=lambda i: f"{i:06d}_10.png", + num_examples=num_examples, + size=(3, 100, 200), + ) + + if config["split"] == "train": + datasets_utils.create_image_folder( + root=split_dir, + name="disp_occ_0", + file_name_fn=lambda i: f"{i:06d}.png", + num_examples=num_examples, + # Kitti2015 uses a single channel image for disparities + size=(1, 100, 200), + ) + + datasets_utils.create_image_folder( + root=split_dir, + name="disp_occ_1", + file_name_fn=lambda i: f"{i:06d}.png", + num_examples=num_examples, + # Kitti2015 uses a single channel image for disparities + size=(1, 100, 200), + ) + + return num_examples + + def test_train_splits(self): + for split in ["train"]: + with self.create_dataset(split=split) as (dataset, _): + for left, right, disparity, valid_mask in dataset: + left_array = np.array(left) + right_array = np.array(right) + h, w, c = left_array.shape + # check that left and right are the same size + assert left_array.shape == right_array.shape + # check general shapes + assert c == 3 + assert len(disparity.shape) == 2 + assert len(valid_mask.shape) == 2 + assert disparity.shape == (h, w) + # check that valid mask is the same size as the disparity + dh, dw = disparity.shape + mh, mw = valid_mask.shape + assert dh == mh + assert dw == mw + + def test_test_split(self): + for split in ["test"]: + with self.create_dataset(split=split) as (dataset, _): + for left, right, disparity, valid_mask in dataset: + left_array = np.array(left) + right_array = np.array(right) + h, w, c = left_array.shape + # check that left and right are the same size + assert left_array.shape == right_array.shape + # check general shapes + assert c == 3 + assert disparity is None + assert valid_mask is None + + def test_bad_input(self): + with pytest.raises(ValueError, match="Unknown value 'bad' for argument split"): + with self.create_dataset(split="bad"): + pass + + +class StereoSceneFlowTestCase(datasets_utils.ImageDatasetTestCase): + DATASET_CLASS = datasets.StereoSceneFlow + ADDITIONAL_CONFIGS = datasets_utils.combinations_grid( + split=("FlyingThings3D", "Driving", "Monkaa"), + pass_name=("clean", "final") + ) + FEATURE_TYPES = (PIL.Image.Image, PIL.Image.Image, (np.ndarray, type(None)), (np.ndarray, type(None))) + + @staticmethod + def _create_pfm_folder(root: str, name: str, file_name_fn: Callable[..., str], num_examples: int, size: Tuple[int, int]): + root = pathlib.Path(root) / name + os.makedirs(root, exist_ok=True) + + for i in range(num_examples): + datasets_utils.make_fake_pfm_file(size[0], size[1], root / file_name_fn(i)) + + def inject_fake_data(self, tmpdir, config): + scene_flow_dir = pathlib.Path(tmpdir) / "SceneFlow" + os.makedirs(scene_flow_dir, exist_ok=True) + + split_dir = scene_flow_dir / config["split"] + os.makedirs(split_dir, exist_ok=True) + + pass_dir_map = { + "clean": "frames_cleanpass", + "final": "frames_finalpass", + } + + num_examples = 4 + pass_dir_name = pass_dir_map[config["pass_name"]] + # create pass 
directories + pass_dir = split_dir / pass_dir_name + disp_dir = split_dir / "disp" + os.makedirs(pass_dir, exist_ok=True) + os.makedirs(disp_dir, exist_ok=True) + + # root / pass / direction / scene / .imgs + # root / disparity / direction / scene / .imgs + for direction in ["left", "right"]: + for scene_idx in range(num_examples): + # scene_dir = pass_dir / direction / f"scene_{scene_idx:06d}" + os.makedirs(pass_dir / f"scene_{scene_idx:06d}", exist_ok=True) + datasets_utils.create_image_folder( + root=pass_dir / f"scene_{scene_idx:06d}", + name=direction, + file_name_fn=lambda i: f"{i:06d}.png", + num_examples=3, + size=(3, 100, 200), + ) + os.makedirs(disp_dir / f"scene_{scene_idx:06d}", exist_ok=True) + self._create_pfm_folder( + root=disp_dir / f"scene_{scene_idx:06d}", + name=direction, + file_name_fn=lambda i: f"{i:06d}.pfm", + num_examples=3, + size=(100, 200), + ) + + return num_examples * 3 + + def test_train_splits(self): + for split, pass_name in itertools.product(["FlyingThings3D", "Driving", "Monkaa"], ["clean", "final"]): + with self.create_dataset(split=split, pass_name=pass_name) as (dataset, _): + for left, right, disparity, valid_mask in dataset: + left_array = np.array(left) + right_array = np.array(right) + h, w, c = left_array.shape + # check that left and right are the same size + assert left_array.shape == right_array.shape + # check general shapes + assert c == 3 + assert len(disparity.shape) == 3 + assert len(valid_mask.shape) == 2 + assert disparity.shape == (h, w, 3) + # check that valid mask is the same size as the disparity + dh, dw, _ = disparity.shape + mh, mw, _ = valid_mask.shape + assert dh == mh + assert dw == mw + + +class StereoFallingThingsTestCase(datasets_utils.ImageDatasetTestCase): + DATASET_CLASS = datasets.StereoFallingThings + ADDITIONAL_CONFIGS = datasets_utils.combinations_grid(split=("single", "mixed")) + FEATURE_TYPES = (PIL.Image.Image, PIL.Image.Image, (np.ndarray, type(None)), (np.ndarray, type(None))) + + @staticmethod + def _make_scene_folder(root: str, scene_name: str, num_examples: int, size: Tuple[int, int]): + root = pathlib.Path(root) / scene_name + os.makedirs(root, exist_ok=True) + + datasets_utils.create_image_file(root, "image1.left.jpg", size=(3, size[0], size[1])) + datasets_utils.create_image_file(root, "image1.right.jpg", size=(3, size[0], size[1])) + # single channel depth maps + datasets_utils.create_image_file(root, "image1.left.depth.jpg", size=(1, size[0], size[1])) + datasets_utils.create_image_file(root, "image1.right.depth.jpg", size=(1, size[0], size[1])) + + def inject_fake_data(self, tmpdir, config): + fallingthings_dir = pathlib.Path(tmpdir) / "FallingThings" + + split_dir = pathlib.Path(fallingthings_dir) / config["split"] + os.makedirs(split_dir, exist_ok=True) + + num_examples = 4 + + for i in range(num_examples): + self._make_scene_folder( + root=split_dir, + scene_name=f"scene_{i:06d}", + num_examples=num_examples, + size=(100, 200), + ) + + return num_examples + + class STL10TestCase(datasets_utils.ImageDatasetTestCase): DATASET_CLASS = datasets.STL10 ADDITIONAL_CONFIGS = datasets_utils.combinations_grid(split=("train", "test", "unlabeled", "train+unlabeled")) - @staticmethod + @ staticmethod def _make_binary_file(num_elements, root, name): file_name = os.path.join(root, name) np.zeros(num_elements, dtype=np.uint8).tofile(file_name) - @staticmethod + @ staticmethod def _make_image_file(num_images, root, name, num_channels=3, height=96, width=96): STL10TestCase._make_binary_file(num_images * num_channels * 
@staticmethod def _make_label_file(num_images, root, name): STL10TestCase._make_binary_file(num_images, root, name) @staticmethod def _make_class_names_file(root, name="class_names.txt"): with open(os.path.join(root, name), "w") as fh: for cname in ("airplane", "bird"): fh.write(f"{cname}\n") @staticmethod def _make_fold_indices_file(root): num_folds = 10 offset = 0 @@ -58,7 +570,7 @@ def _make_fold_indices_file(root): return tuple(range(1, num_folds + 1)) @staticmethod def _make_train_files(root, num_unlabeled_images=1): num_images_in_fold = STL10TestCase._make_fold_indices_file(root) num_train_images = sum(num_images_in_fold) @@ -69,7 +581,7 @@ def _make_train_files(root, num_unlabeled_images=1): return dict(train=num_train_images, unlabeled=num_unlabeled_images) @staticmethod def _make_test_files(root, num_images=2): STL10TestCase._make_image_file(num_images, root, "test_X.bin") STL10TestCase._make_label_file(num_images, root, "test_y.bin") @@ -887,7 +1399,7 @@ def inject_fake_data(self, tmpdir, config): return num_images @contextlib.contextmanager def create_dataset(self, *args, **kwargs): with super().create_dataset(*args, **kwargs) as output: yield output @@ -1293,7 +1805,7 @@ def _create_archive(self, root, name, *files): return archive @datasets_utils.test_all_configs def test_feature_types(self, config): feature_types = self.FEATURE_TYPES self.FEATURE_TYPES = self._TRAIN_FEATURE_TYPES if config["train"] else self._TEST_FEATURE_TYPES @@ -1571,7 +2083,7 @@ def _file_name_fn(self, cls, ext, idx): def _is_valid_file_to_extensions(self, is_valid_file): return {ext for ext in self._EXTENSIONS if is_valid_file(f"foo.{ext}")} @datasets_utils.test_all_configs def test_is_valid_file(self, config): extensions = config.pop("extensions") # We need to explicitly pass extensions=None here or otherwise it would be filled by the value from the @@ -1581,7 +2093,7 @@ def test_is_valid_file(self, config): ) as (dataset, info): assert len(dataset) == info["num_examples"] @datasets_utils.test_all_configs def test_classes(self, config): with self.create_dataset(config) as (dataset, info): assert len(dataset.classes) == len(info["classes"]) @@ -1602,7 +2114,7 @@ def inject_fake_data(self, tmpdir, config): return dict(num_examples=num_examples_total, classes=classes) @datasets_utils.test_all_configs def test_classes(self, config): with self.create_dataset(config) as (dataset, info): assert len(dataset.classes) == len(info["classes"]) @@ -1701,32 +2213,32 @@ class Places365TestCase(datasets_utils.ImageDatasetTestCase): *((f"{category}/Places365_train_00000001.png", idx) for category, idx in _CATEGORIES_CONTENT), ) @staticmethod def _make_txt(root, name, seq): file = os.path.join(root, name) with open(file, "w") as fh: for text, idx in seq: fh.write(f"{text} {idx}\n") @staticmethod def _make_categories_txt(root, name): Places365TestCase._make_txt(root, name, Places365TestCase._CATEGORIES_CONTENT) @staticmethod def _make_file_list_txt(root, name): Places365TestCase._make_txt(root, name, Places365TestCase._FILE_LIST_CONTENT) @staticmethod def _make_image(file_name, size): os.makedirs(os.path.dirname(file_name), exist_ok=True) PIL.Image.fromarray(np.zeros((*size, 3), dtype=np.uint8)).save(file_name)
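# editorial note (illustrative): the helper above saves an all-zero (black) RGB array;
# PIL infers the image encoder from the file extension, so the same helper can emit
# .png or .jpg fixtures as needed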
@staticmethod def _make_devkit_archive(root, split): Places365TestCase._make_categories_txt(root, Places365TestCase._CATEGORIES) Places365TestCase._make_file_list_txt(root, Places365TestCase._FILE_LISTS[split]) @staticmethod def _make_images_archive(root, split, small): folder_name = Places365TestCase._IMAGES[(split, small)] image_size = (256, 256) if small else (512, random.randint(512, 1024)) @@ -2041,7 +2553,7 @@ def inject_fake_data(self, tmpdir, config): return num_examples[config["split"]] @datasets_utils.test_all_configs def test_flow(self, config): # Make sure flow always exists, and make sure there are as many flow values as (pairs of) images # Also make sure the flow is properly decoded @@ -2100,7 +2612,7 @@ def inject_fake_data(self, tmpdir, config): ) return num_examples @datasets_utils.test_all_configs def test_flow(self, config): h, w = self.FLOW_H, self.FLOW_W expected_flow = np.arange(3 * h * w).reshape(h, w, 3).transpose(2, 0, 1) diff --git a/torchvision/datasets/__init__.py b/torchvision/datasets/__init__.py index 295fe922478..a7dd8397bab 100644 --- a/torchvision/datasets/__init__.py +++ b/torchvision/datasets/__init__.py @@ -1,4 +1,5 @@ from ._optical_flow import KittiFlow, Sintel, FlyingChairs, FlyingThings3D, HD1K +from ._stereo_matching import StereoETH3D, StereoFallingThings, StereoKitti2012, StereoKitti2015, StereoMiddlebury2014, StereoSceneFlow, StereoSintel, CREStereoSynthetic from .caltech import Caltech101, Caltech256 from .celeba import CelebA from .cifar import CIFAR10, CIFAR100 diff --git a/torchvision/datasets/_stereo_matching.py b/torchvision/datasets/_stereo_matching.py index 65336503b87..bcca2b12efb 100644 --- a/torchvision/datasets/_stereo_matching.py +++ b/torchvision/datasets/_stereo_matching.py @@ -1,10 +1,12 @@ from abc import ABC, abstractmethod from glob import glob from pathlib import Path -from random import random +import pathlib +import random import re import shutil from typing import Callable, List, Optional, Tuple, Any from torch import Tensor from .vision import VisionDataset from .utils import download_and_extract_archive, download_url, verify_str_arg @@ -14,15 +16,15 @@ import json __all__ = ( - "CREStereo" # waiting for download - "Middlebury2014" - "ETH3D" - "Kitti2012" - "Kitti2015" - "Sintel" - "SceneFlow" # need to find valid mask procedure - "FallingThings" - "InStereo2k" # waiting for download + "CREStereo",  # waiting for download / need to find valid mask procedure + "StereoMiddlebury2014", + "StereoETH3D", + "StereoKitti2012", + "StereoKitti2015", + "StereoSintel", + "StereoSceneFlow",  # need to find valid mask procedure + "StereoFallingThings", + "InStereo2k",  # need to find valid mask procedure ) @@ -30,13 +32,15 @@ def read_pfm_file(file_path: str) -> np.array: # adapted from https://github.com/ucbdrive/hd3/blob/master/utils/pfm.py with open(file_path, "rb") as file: header = file.readline().rstrip() - assert header in [b"PF", b"Pf"], f"{file_path} is not a valid .pfm file" + if header not in [b"PF", b"Pf"]: + raise ValueError(f"Not a valid PFM file: {file_path}") - dim_match = re.match(r'^(\d+)\s(\d+)\s$', file.readline()) - assert dim_match, f"{file_path} has a Malformed PFM header" + dim_match = re.match(rb'^(\d+)\s(\d+)\s$', file.readline()) + if not dim_match: + raise ValueError(f"Malformed PFM
header: {file_path}") width, height = map(int, dim_match.groups()) - channels = 3 if header == "PF" else 1 + channels = 3 if header == b"PF" else 1 scale = float(file.readline().rstrip()) # check for endian type if scale < 0: @@ -77,12 +81,12 @@ def __getitem__(self, index: int) -> Tuple[Tensor, Tensor, Tensor, Tensor]: img_left = self._read_img(self._images[index][0]) img_right = self._read_img(self._images[index][1]) - dsp_map_left, valid_mask_right = self._read_disparity(self._disparities[index][0]) + dsp_map_left, valid_mask_left = self._read_disparity(self._disparities[index][0]) dsp_map_right, valid_mask_right = self._read_disparity(self._disparities[index][1]) imgs = (img_left, img_right) dsp_maps = (dsp_map_left, dsp_map_right) - valid_masks = (valid_mask_right, valid_mask_right) + valid_masks = (valid_mask_left, valid_mask_right) if self.transforms is not None: imgs, dsp_maps, valid_masks, = self.transforms(imgs, dsp_maps, valid_masks) @@ -93,7 +97,7 @@ def __len__(self) -> int: return len(self._images) -class CRESSyntethicStereo(StereoMatchingDataset): +class CREStereoSynthetic(StereoMatchingDataset): """Synthetic dataset used in training the `CREStereo `_ architecture. Ported from the download script in the paper github `repo `_. @@ -104,8 +108,11 @@ class CRESSyntethicStereo(StereoMatchingDataset): MAX_DISP = 256. - def __init__(self, root: str, split: str = "tree", transforms: Optional[Callable] = None, download: bool = True): + def __init__(self, root: str, split: str = "tree", transforms: Optional[Callable] = None, download: bool = False): super().__init__(root, transforms) + + root = Path(root) / "CREStereo" + # if the API user requests a dataset download check that the user can download it if download: statvfs = os.statvfs(root) @@ -130,12 +137,17 @@ def __init__(self, root: str, split: str = "tree", transforms: Optional[Callable for s in splits: imgs_left = sorted(glob(str(root / s / "*_left.jpg"))) - imgs_right = (p.replace("_left", "_right") for p in imgs_left) + imgs_right = list(p.replace("_left", "_right") for p in imgs_left) + + if not len(imgs_left) or not len(imgs_right): + raise FileNotFoundError("No images found in {}".format(root)) + imgs = list((l, r) for l, r in zip(imgs_left, imgs_right)) self._images += imgs - disparity_maps_left = (p.replace("_left", "_left.disp") for p in imgs_left) - disparity_maps_right = (p.replace("_right", "_right.disp") for p in imgs_right) + disparity_maps_left = list(p.replace("_left", "_left.disp") for p in imgs_left) + disparity_maps_right = list(p.replace("_right", "_right.disp") for p in imgs_right) + disparity_maps = list((l, r) for l, r in zip(disparity_maps_left, disparity_maps_right)) self._disparities += disparity_maps @@ -158,7 +170,7 @@ def _download_dataset(self, root: str) -> None: download_and_extract_archive(url=url, download_root=d_path, remove_finished=True) -class Middlebury2014(StereoMatchingDataset): +class StereoMiddlebury2014(StereoMatchingDataset): """Publicly available scenes from the Middlebury dataset `2014 version `. The dataset mostly follows the original format, without containing the ambient subdirectories. 
: :: @@ -219,12 +231,11 @@ class Middlebury2014(StereoMatchingDataset): splits = { "train": ["Adirondack", "Jadeplant", "Motorcycle", "Piano", "Pipes", "Playroom", "Playtable", "Recycle", "Shelves", "Vintage"], "additional": ["Backpack", "Bicycle1", "Cable", "Classroom1", "Couch", "Flowers", "Mask", "Shopvac", "Sticks", "Storage", "Sword1", "Sword2", "Umbrella"], - "test": ['Plants', 'Classroom2E', 'Classroom2', 'Australia', 'DjembeL', 'CrusadeP', 'Crusade', 'Hoops', 'Bicycle2', 'Staircase', 'Newkuba', 'AustraliaP', 'Djembe', 'Livingroom', 'Computer'] + "test": ["Plants", "Classroom2E", "Classroom2", "Australia", "DjembeL", "CrusadeP", "Crusade", "Hoops", "Bicycle2", "Staircase", "Newkuba", "AustraliaP", "Djembe", "Livingroom", "Computer"] } def __init__( self, - *, root: str, split: str = "train", use_ambient_views: bool = False, @@ -237,7 +248,7 @@ def __init__( if download: self._download_dataset(root) - root = Path(root) / "FlyingChairs" + root = Path(root) / "Middlebury2014" if not os.path.exists(root / split): raise FileNotFoundError( f"The {split} directory was not found in the provided root directory" ) split_scenes = self.splits[split] # check that the provided root folder contains the scene splits - if not all(s in os.listdir(root / split) for s in split_scenes): + if not any( + # using startswith to account for perfect / imperfect calibration + scene.startswith(s) for scene in os.listdir(root / split) + for s in split_scenes + ): raise FileNotFoundError(f"Provided root folder does not contain any scenes from the {split} split.") imgs_left = sorted(glob(str(root / split / "*" / "im0.png"))) imgs_right = sorted(glob(str(root / split / "*" / "im1.png"))) + + if not len(imgs_left) or not len(imgs_right): + raise FileNotFoundError("No images found in {}".format(root)) + self._images = list((l, r) for l, r in zip(imgs_left, imgs_right)) if split == "test": @@ -312,7 +331,7 @@ def _download_dataset(self, root: str): shutil.rmtree(os.path.join(root, "MiddEval3"))
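# Editorial sketch (not part of the patch, assumes the scenes are already on disk): the
# common read path at this point in the series is
#     dataset = StereoMiddlebury2014(root="datasets", split="train")
#     (img_left, img_right), (disp_left, disp_right), (mask_left, mask_right) = dataset[0]
# i.e. __getitem__ still returns the three pair-tuples assembled in the base class.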
-class ETH3D(StereoMatchingDataset): +class StereoETH3D(StereoMatchingDataset): """ETH3D `Low-Res Two-View `_ dataset. The dataset is expected to have the following structure: :: @@ -370,16 +389,20 @@ def __init__(self, root: str, split: str = "train", transforms: Optional[Callabl verify_str_arg(split, "split", valid_values=("train", "test")) root = Path(root) / "ETH3D" - img_dir = "two_view_training" if split == "train" else "two_view_testing" + + img_dir = "two_view_training" if split == "train" else "two_view_test" anot_dir = "two_view_training_gt" imgs_left = sorted(glob(str(root / img_dir / "*" / "*im0.png"))) imgs_right = sorted(glob(str(root / img_dir / "*" / "*im1.png"))) + if not len(imgs_left) or not len(imgs_right): + raise FileNotFoundError("No images found in {}".format(root)) + if split == "test": disparity_maps_left, disparity_maps_right = list("" for _ in imgs_left), list("" for _ in imgs_right) else: - disparity_maps_left = sorted(glob(str(root / anot_dir / "*" / "*[0-1].pfm"))) + disparity_maps_left = sorted(glob(str(root / anot_dir / "*" / "*0GT.pfm"))) # no masks for the right view, always using left as reference disparity_maps_right = list("" for _ in disparity_maps_left) @@ -395,11 +418,11 @@ def _read_disparity(self, file_path: str) -> Tuple: valid_mask = np.array(valid_mask) return disparity_map, valid_mask - def __getitem__(self, index: int) -> Tuple[Tuple, Tuple, Tuple]: + def __getitem__(self, index: int) -> Tuple[Tuple, Tuple, Tuple, Tuple]: return super().__getitem__(index) -class Kitti2012(StereoMatchingDataset): +class StereoKitti2012(StereoMatchingDataset): """Kitti dataset from the `2012 `_ stereo evaluation benchmark. Uses the RGB images for consistency with Kitti 2015. @@ -433,11 +456,14 @@ def __init__(self, root: str, split: str = "train", transforms: Optional[Callabl imgs_left = sorted(glob(str(root / "colored_0" / "*_10.png"))) imgs_right = sorted(glob(str(root / "colored_1" / "*_10.png"))) + if not len(imgs_left) or not len(imgs_right): + raise FileNotFoundError("No images found in {}".format(root)) + if split == "train": disparity_maps_left = sorted(glob(str(root / "disp_noc" / "*.png"))) disparity_maps_right = list("" for _ in disparity_maps_left) else: - disparity_maps_left, disparity_maps_right = list("" for _ in disparity_maps_left), list("" for _ in disparity_maps_right) + disparity_maps_left, disparity_maps_right = list("" for _ in imgs_left), list("" for _ in imgs_right) self._images = list((l, r) for l, r in zip(imgs_left, imgs_right)) self._disparities = list((l, r) for l, r in zip(disparity_maps_left, disparity_maps_right)) @@ -455,7 +481,7 @@ def __getitem__(self, index: int) -> Tuple[Tuple, Tuple, Tuple]: return super().__getitem__(index)
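# editorial note (inferred from the tests): for split="test" the disparity paths are empty
# strings, so _read_disparity is expected to yield (None, None) and the last two fields of
# a sample are None rather than arrays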
-class Kitti2015(StereoMatchingDataset): +class StereoKitti2015(StereoMatchingDataset): """Kitti dataset from the `2015 `_ stereo evaluation benchmark. The dataset is expected to have the following structure: :: @@ -468,8 +494,8 @@ class Kitti2015(StereoMatchingDataset): testing image_2 image_3 training image_2 image_3 - disp_noc_0 - disp_noc_1 + disp_occ_0 + disp_occ_1 calib Args: @@ -488,11 +514,14 @@ def __init__(self, root: str, split: str = "train", transforms: Optional[Callabl imgs_left = sorted(glob(str(root / "image_2" / "*_10.png"))) imgs_right = sorted(glob(str(root / "image_3" / "*_10.png"))) + if not len(imgs_left) or not len(imgs_right): + raise FileNotFoundError("No images found in {}".format(root)) + if split == "train": disparity_maps_left = sorted(glob(str(root / "disp_occ_0" / "*.png"))) disparity_maps_right = sorted(glob(str(root / "disp_occ_1" / "*.png"))) else: - disparity_maps_left, disparity_maps_right = list("" for _ in disparity_maps_left), list("" for _ in disparity_maps_right) + disparity_maps_left, disparity_maps_right = list("" for _ in imgs_left), list("" for _ in imgs_right) self._images = list((l, r) for l, r in zip(imgs_left, imgs_right)) self._disparities = list((l, r) for l, r in zip(disparity_maps_left, disparity_maps_right)) @@ -510,7 +539,7 @@ def __getitem__(self, index: int) -> Tuple[Tuple, Tuple, Tuple]: return super().__getitem__(index) -class SintelDataset(StereoMatchingDataset): +class StereoSintel(StereoMatchingDataset): """Sintel `Stereo Dataset `_. Args: @@ -527,6 +556,9 @@ def __init__(self, root: str, transforms: Optional[Callable] = None): imgs_left = sorted(glob(str(root / "training" / "final_left" / "*" / "*.png"))) imgs_right = sorted(glob(str(root / "training" / "final_right" / "*" / "*.png"))) + if not len(imgs_left) or not len(imgs_right): + raise FileNotFoundError("No images found in {}".format(root)) + dps_masks_left = sorted(glob(str(root / "training" / "disparities" / "*" / "*.png"))) disparity_maps_right = list("" for _ in dps_masks_left) @@ -554,16 +586,16 @@ def __getitem__(self, index: int) -> Tuple[Tuple, Tuple, Tuple]: return super().__getitem__(index) -class SceneFlowDataset(StereoMatchingDataset): +class StereoSceneFlow(StereoMatchingDataset): """Dataset interface for `Scene Flow `_ datasets.""" def __init__(self, root: str, split: str = "train", pass_name: str = "clean", transforms: Optional[Callable] = None): super().__init__(root, transforms) - verify_str_arg(split, "split", valid_values=("FlyingThings3D", "Driving", "Monkaa")) - split = split.upper() + root = Path(root) / "SceneFlow" - verify_str_arg(split, "pass_name", valid_values=("clean", "final", "both")) + verify_str_arg(split, "split", valid_values=("FlyingThings3D", "Driving", "Monkaa")) + verify_str_arg(pass_name, "pass_name", valid_values=("clean", "final", "both")) passes = { "clean": ["frames_cleanpass"], "final": ["frames_finalpass"], "both": ["frames_cleanpass", "frames_finalpass"], }[pass_name] - root = Path(root) / split + root = root / split for p in passes: - imgs_left = sorted(glob(str(root / p / "left" / "*" / "*.png"))) - imgs_right = sorted(glob(str(root / p / "right" / "*" / "*.png"))) + imgs_left = sorted(glob(str(root / p / "*" / "left" / "*.png"))) + imgs_right = sorted(glob(str(root / p / "*" / "right" / "*.png"))) + + if not len(imgs_left) or not len(imgs_right): + raise FileNotFoundError("No images found in {}".format(root / p)) + imgs = list((l, r) for l, r in zip(imgs_left, imgs_right)) self._images += imgs disparity_maps_left = [file_path.replace(p, "disparity").replace(".png", ".pfm") for file_path in imgs_left]
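# editorial note: the disparity tree mirrors the image tree, so the ground-truth path is
# recovered purely by string substitution — swap the pass directory name for "disparity"
# and the .png extension for .pfm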
disparity_maps_right = [file_path.replace(p, "disparity").replace(".png", ".pfm") for file_path in imgs_right] + disparity_maps = list((l, r) for l, r in zip(disparity_maps_left, disparity_maps_right)) self._disparities += disparity_maps @@ -589,8 +626,11 @@ def _read_disparity(self, file_path: str) -> Tuple: valid = np.ones_like(disparity) return disparity, valid + def __getitem__(self, index: int) -> Tuple[Tensor, Tensor, Tensor, Tensor]: + return super().__getitem__(index) -class FallingThingsDataset(StereoMatchingDataset): + +class StereoFallingThings(StereoMatchingDataset): """FallingThings ``_ dataset The dataset is expected to have the following structure: :: @@ -644,11 +684,16 @@ def __init__(self, root: str, split: str = "single", transforms: Optional[Callab for s in splits: imgs_left = sorted(glob(str(root / s / "*.left.jpg"))) imgs_right = sorted(glob(str(root / s / "*.right.jpg"))) + + if not len(imgs_left) or not len(imgs_right): + raise FileNotFoundError("No images found in {}".format(root)) + imgs = list((l, r) for l, r in zip(imgs_left, imgs_right)) self._images += imgs disparity_maps_left = sorted(glob(str(root / s / "*.left.depth.png"))) disparity_maps_right = sorted(glob(str(root / s / "*.right.depth.png"))) + disparity_maps = list((l, r) for l, r in zip(disparity_maps_left, disparity_maps_right)) self._disparities += disparity_maps @@ -660,3 +705,59 @@ def _read_disparity(self, file_path: str) -> Tuple: disparity = (fx * 6.0 * 100) / depth.astype(np.float32) valid = disparity > 0 return disparity, valid + + def __getitem__(self, index: int) -> Tuple[Tensor, Tensor, Tensor, Tensor]: + return super().__getitem__(index) + + +class InStereo2k(StereoMatchingDataset): + """InStereo2k ``_ dataset + + The dataset is expected to have the following structure: :: + + root + InStereo2k + train + scene1 + left.png + right.png + left_disp.png + right_disp.png + ... + scene2 + ... + test + scene1 + left.png + right.png + left_disp.png + right_disp.png + ... + scene2 + ... + """
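+    # editorial sketch (illustrative, assumes the scenes are on disk): every scene folder
+    # contributes exactly one sample —
+    #     ds = InStereo2k(root="datasets", split="train")
+    #     (left, right), (disp_left, disp_right), (mask_left, mask_right) = ds[0]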
+ """ + + def __init__(self, root: str, split: str = "train", transforms: Optional[Callable] = None): + super().__init__(root, transforms) + + root = Path(root) / "InStereo2k" / split + + imgs_left = sorted(glob(str(root / "*" / "left.png"))) + imgs_right = list(p.replace("left", "right") for p in imgs_left) + + if not len(imgs_left) or not len(imgs_right): + raise FileNotFoundError("No images found in {}".format(root)) + + imgs = list((l, r) for l, r in zip(imgs_left, imgs_right)) + self._images = imgs + + disparity_maps_left = list(p.replace("left", "left_disp") for p in imgs_left) + disparity_maps_right = list(p.replace("right", "right_disp") for p in imgs_left) + + disparity_maps = list((l, r) for l, r in zip(disparity_maps_left, disparity_maps_right)) + self._disparities = disparity_maps + + def _read_disparity(self, file_path: str) -> Tuple: + disparity = np.array(Image.open(file_path), dtype=np.float32) + valid = np.ones_like(disparity) + return disparity, valid From 2deab62984a831a80ca9dc15bf81ae96ac21f434 Mon Sep 17 00:00:00 2001 From: Ponku Date: Wed, 13 Jul 2022 15:10:17 +0100 Subject: [PATCH 17/35] "Added calibrartion arg for Middlebury2014 (#6259)" --- test/test_datasets.py | 50 ++++++++--- torchvision/datasets/_stereo_matching.py | 107 +++++++++++++++++++---- 2 files changed, 127 insertions(+), 30 deletions(-) diff --git a/test/test_datasets.py b/test/test_datasets.py index d390c30cee9..5d557020ac8 100644 --- a/test/test_datasets.py +++ b/test/test_datasets.py @@ -193,10 +193,7 @@ def inject_fake_data(self, tmpdir, config): scene_name = split_scene_map[config["split"]][idx] self._make_scene_folder(root_dir=split_dir, scene_name=scene_name, split=config["split"]) - # account for perfect / imperfect calibrations - if config["split"] != "test": - num_examples *= 2 - + # TODO: add calibration argument test return num_examples def test_train_splits(self): @@ -428,12 +425,15 @@ class StereoSceneFlowTestCase(datasets_utils.ImageDatasetTestCase): FEATURE_TYPES = (PIL.Image.Image, PIL.Image.Image, (np.ndarray, type(None)), (np.ndarray, type(None))) @staticmethod - def _create_pfm_folder(root: str, name: str, file_name_fn: Callable[..., str], num_examples: int, size: Tuple[int, int]): + def _create_pfm_folder(root: str, name: str, file_name_fn: Callable[..., str], num_examples: int, size: Tuple[int, int]) -> List[str]: root = pathlib.Path(root) / name os.makedirs(root, exist_ok=True) + paths = [] for i in range(num_examples): datasets_utils.make_fake_pfm_file(size[0], size[1], root / file_name_fn(i)) + paths.append(str(root / file_name_fn(i))) + return paths def inject_fake_data(self, tmpdir, config): scene_flow_dir = pathlib.Path(tmpdir) / "SceneFlow" @@ -447,27 +447,25 @@ def inject_fake_data(self, tmpdir, config): "final": "frames_finalpass", } - num_examples = 4 + num_examples = 1 pass_dir_name = pass_dir_map[config["pass_name"]] # create pass directories pass_dir = split_dir / pass_dir_name - disp_dir = split_dir / "disp" + disp_dir = split_dir / "disparity" os.makedirs(pass_dir, exist_ok=True) os.makedirs(disp_dir, exist_ok=True) - # root / pass / direction / scene / .imgs - # root / disparity / direction / scene / .imgs for direction in ["left", "right"]: for scene_idx in range(num_examples): - # scene_dir = pass_dir / direction / f"scene_{scene_idx:06d}" os.makedirs(pass_dir / f"scene_{scene_idx:06d}", exist_ok=True) datasets_utils.create_image_folder( root=pass_dir / f"scene_{scene_idx:06d}", name=direction, file_name_fn=lambda i: f"{i:06d}.png", num_examples=3, - size=(3, 
100, 200), + size=(3, 200, 100), ) + os.makedirs(disp_dir / f"scene_{scene_idx:06d}", exist_ok=True) self._create_pfm_folder( root=disp_dir / f"scene_{scene_idx:06d}", name=direction, file_name_fn=lambda i: f"{i:06d}.pfm", num_examples=3, size=(100, 200), ) @@ -480,18 +478,20 @@ def inject_fake_data(self, tmpdir, config): return num_examples * 3 def test_train_splits(self): - for split, pass_name in itertools.product(["FlyingThings3D", "Driving", "Monkaa"], ["clean", "final"]): - with self.create_dataset(split=split, pass_name=pass_name) as (dataset, _): + for split_name, pass_name in itertools.product(["FlyingThings3D", "Driving", "Monkaa"], ["clean", "final"]): + with self.create_dataset(split=split_name, pass_name=pass_name) as (dataset, _): for left, right, disparity, valid_mask in dataset: left_array = np.array(left) right_array = np.array(right) h, w, c = left_array.shape # check that left and right are the same size assert left_array.shape == right_array.shape # check general shapes assert c == 3 assert len(disparity.shape) == 3 - assert len(valid_mask.shape) == 2 + assert len(valid_mask.shape) == 3 assert disparity.shape == (h, w, 3) # check that valid mask is the same size as the disparity dh, dw, _ = disparity.shape @@ -534,6 +534,28 @@ def inject_fake_data(self, tmpdir, config): return num_examples + def test_splits(self): + for split_name in ["single", "mixed"]: + with self.create_dataset(split=split_name) as (dataset, _): + for left, right, disparity, valid_mask in dataset: + left_array = np.array(left) + right_array = np.array(right) + h, w, c = left_array.shape + # check that left and right are the same size + assert left_array.shape == right_array.shape + # check general shapes + assert c == 3 + assert len(disparity.shape) == 2 + assert len(valid_mask.shape) == 2 + assert disparity.shape == (h, w) + # check that valid mask is the same size as the disparity + dh, dw = disparity.shape + mh, mw = valid_mask.shape + assert dh == mh + assert dw == mw + class STL10TestCase(datasets_utils.ImageDatasetTestCase): DATASET_CLASS = datasets.STL10 diff --git a/torchvision/datasets/_stereo_matching.py b/torchvision/datasets/_stereo_matching.py index bcca2b12efb..0bd75fe82a4 100644 --- a/torchvision/datasets/_stereo_matching.py +++ b/torchvision/datasets/_stereo_matching.py @@ -6,6 +6,7 @@ import re import shutil from typing import Callable, List, Optional, Tuple, Any +import warnings from torch import Tensor from .vision import VisionDataset from .utils import download_and_extract_archive, download_url, verify_str_arg @@ -238,6 +239,7 @@ def __init__( self, root: str, split: str = "train", + calibration: Optional[str] = None, use_ambient_views: bool = False, transforms: Optional[Callable] = None, download: bool = False @@ super().__init__(root, transforms) verify_str_arg(split, "split", valid_values=("train", "test", "additional")) + if calibration: + verify_str_arg(calibration, "calibration", valid_values=("perfect", "imperfect", None)) + if split == "test": + warnings.warn( + "\nSplit 'test' has no calibration settings, ignoring calibration argument.", + RuntimeWarning + ) + else: + if split != "test": + calibration = "perfect" + warnings.warn( + f"\nSplit '{split}' has calibration settings, however None was provided as an argument." + f"\nSetting calibration to 'perfect' for split '{split}'. Available calibration settings are: 'perfect', 'imperfect', 'both'.", + RuntimeWarning + )
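+        # editorial summary (illustrative) of the argument handling above:
+        #   split="train", calibration=None     -> warns, then falls back to calibration="perfect"
+        #   split="test", calibration="perfect" -> warns that test scenes carry no calibration variants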
+ if download: self._download_dataset(root) @@ -263,25 +281,36 @@ def __init__( - imgs_left = sorted(glob(str(root / split / "*" / "im0.png"))) - imgs_right = sorted(glob(str(root / split / "*" / "im1.png"))) + calibration_suffixes = { + None: [""], + "perfect": ["-perfect"], + "imperfect": ["-imperfect"], + "both": ["-perfect", "-imperfect"], + }[calibration] - if not len(imgs_left) or not len(imgs_right): - raise FileNotFoundError("No images found in {}".format(root)) + for calibration_suffix in calibration_suffixes: + scene_pattern = "*" + calibration_suffix - self._images = list((l, r) for l, r in zip(imgs_left, imgs_right)) + imgs_left = sorted(glob(str(root / split / scene_pattern / "im0.png"))) + imgs_right = sorted(glob(str(root / split / scene_pattern / "im1.png"))) - if split == "test": - dsp_maps_left, dsp_maps_right = list("" for _ in imgs_left), list("" for _ in imgs_right) - else: + if not len(imgs_left) or not len(imgs_right): + raise FileNotFoundError("No images found in {}".format(root)) + + self._images += list((l, r) for l, r in zip(imgs_left, imgs_right)) + + if split == "test": + dsp_maps_left, dsp_maps_right = list("" for _ in imgs_left), list("" for _ in imgs_right) + else: + + dsp_maps_left = sorted(glob(str(root / split / "*" / "disp0.pfm"))) + dsp_maps_right = sorted(glob(str(root / split / "*" / "disp1.pfm"))) - dsp_maps_left = sorted(glob(str(root / split / "*" / "disp0.pfm"))) - dsp_maps_right = sorted(glob(str(root / split / "*" / "disp1.pfm"))) - self._disparities = list((l, r) for l, r in zip(dsp_maps_left, dsp_maps_right)) + self._disparities += list((l, r) for l, r in zip(dsp_maps_left, dsp_maps_right)) self.use_ambient_views = use_ambient_views - def __getitem__(self, index: int) -> Tuple: + def __getitem__(self, index: int) -> Tuple[Tensor, Tensor, Tensor, Tensor]: return super().__getitem__(index) def _read_img(self, file_path: str) -> Image.Image: @@ -579,17 +608,60 @@ def _read_disparity(self, file_path: str) -> Tuple: # out of frame mask off_mask = np.array(Image.open(file_path.replace("disparities", "outofframe"))) == 0 # combine the masks together - valid_mask = np.logical_or(off_mask, valid_mask) + valid_mask = np.logical_and(off_mask, valid_mask) return disparity_map, valid_mask - def __getitem__(self, index: int) -> Tuple[Tuple, Tuple, Tuple]: + def __getitem__(self, index: int) -> Tuple[Tensor, Tensor, Tensor, Tensor]: return super().__getitem__(index) class StereoSceneFlow(StereoMatchingDataset): - """Dataset interface for `Scene Flow `_ datasets.""" + """Dataset interface for `Scene Flow `_ datasets. + + The dataset is expected to have the following structure: :: + + root + SceneFlow + Monkaa + frames_cleanpass + scene1 + left + img1.png + img2.png + right + img1.png + img2.png + scene2 + left + img1.png + img2.png + right + img1.png + img2.png + frames_finalpass + scene1 + left + img1.png + img2.png + right + img1.png + img2.png + ... + ... + disparity + scene1 + left + img1.pfm + img2.pfm + right + img1.pfm + img2.pfm + FlyingThings3D + ... + ... + """
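+    # editorial sketch (illustrative): pass_name selects which rendering-pass directories are
+    # walked, e.g. StereoSceneFlow(root, split="Monkaa", pass_name="both") collects image pairs
+    # from frames_cleanpass and frames_finalpass and pairs each with its mirrored disparity .pfm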
+ """ - def __init__(self, root: str, split: str = "train", pass_name: str = "clean", transforms: Optional[Callable] = None): + def __init__(self, root: str, split: str = "FlyingThings3D", pass_name: str = "clean", transforms: Optional[Callable] = None): super().__init__(root, transforms) root = Path(root) / "SceneFlow" @@ -622,6 +694,9 @@ def __init__(self, root: str, split: str = "train", pass_name: str = "clean", tr self._disparities += disparity_maps def _read_disparity(self, file_path: str) -> Tuple: + if not os.path.exists(file_path): + raise FileNotFoundError("Disparity map {} not found".format(file_path)) + disparity = read_pfm_file(file_path) valid = np.ones_like(disparity) return disparity, valid From cbc55f30e8adaaa20513c9076f52d317442f6c2b Mon Sep 17 00:00:00 2001 From: Ponku Date: Wed, 13 Jul 2022 15:58:46 +0100 Subject: [PATCH 18/35] "Fixed test calibration test Middlebury2014 (#6259)" --- test/test_datasets.py | 40 +++++++++++++++++++++--- torchvision/datasets/_stereo_matching.py | 7 +++-- 2 files changed, 40 insertions(+), 7 deletions(-) diff --git a/test/test_datasets.py b/test/test_datasets.py index 5d557020ac8..518a95362b9 100644 --- a/test/test_datasets.py +++ b/test/test_datasets.py @@ -150,7 +150,11 @@ def test_bad_input(self): class StereoMiddlebury2014TestCase(datasets_utils.ImageDatasetTestCase): DATASET_CLASS = datasets.StereoMiddlebury2014 - ADDITIONAL_CONFIGS = datasets_utils.combinations_grid(split=("train", "test", "additional"), use_ambient_views=(True, False)) + ADDITIONAL_CONFIGS = datasets_utils.combinations_grid( + split=("train", "additional"), + calibration=("perfect", "imperfect", "both"), + use_ambient_views=(True, False), + ) FEATURE_TYPES = (PIL.Image.Image, PIL.Image.Image, (np.ndarray, type(None)), (np.ndarray, type(None))) @staticmethod @@ -193,12 +197,15 @@ def inject_fake_data(self, tmpdir, config): scene_name = split_scene_map[config["split"]][idx] self._make_scene_folder(root_dir=split_dir, scene_name=scene_name, split=config["split"]) - # TODO: add calibration argument test + print(f"Created {scene_name} for split {config['split']}") + + if config["calibration"] == "both": + num_examples *= 2 return num_examples def test_train_splits(self): - for split in ["train", "additional"]: - with self.create_dataset(split=split) as (dataset, _): + for split, calibration in itertools.product(["train", "additional"], ["perfect", "imperfect", "both"]): + with self.create_dataset(split=split, calibration=calibration) as (dataset, _): for left, right, disparity, valid_mask in dataset: left_array = np.array(left) right_array = np.array(right) @@ -219,7 +226,7 @@ def test_train_splits(self): def test_test_split(self): for split in ["test"]: - with self.create_dataset(split=split) as (dataset, _): + with self.create_dataset(split=split, calibration=None) as (dataset, _): for left, right, disparity, valid_mask in dataset: left_array = np.array(left) right_array = np.array(right) @@ -239,6 +246,29 @@ def test_augmented_view_usage(self): # check that left and right are the same size assert left_array.shape == right_array.shape + def test_warnings_train(self): + # train set invalid + split = "train" + calibration = None + with pytest.warns( + RuntimeWarning, + match=f"\nSplit '{split}' has calibration settings, however None was provided as an argument." + f"\nSetting calibration to 'perfect' for split '{split}'. 
+ + def test_warnings_test(self): + # test set invalid + split = "test" + calibration = "perfect" + with pytest.warns( + RuntimeWarning, + match="\nSplit 'test' has no calibration settings, ignoring calibration argument." + ): + with self.create_dataset(split=split, calibration=calibration): + pass + def test_bad_input(self): with pytest.raises(ValueError, match="Unknown value 'bad' for argument split"): with self.create_dataset(split="bad"): pass diff --git a/torchvision/datasets/_stereo_matching.py b/torchvision/datasets/_stereo_matching.py index 0bd75fe82a4..702386b05bd 100644 --- a/torchvision/datasets/_stereo_matching.py +++ b/torchvision/datasets/_stereo_matching.py @@ -239,7 +239,7 @@ def __init__( self, root: str, split: str = "train", - calibration: Optional[str] = None, + calibration: Optional[str] = "perfect", use_ambient_views: bool = False, transforms: Optional[Callable] = None, download: bool = False @@ super().__init__(root, transforms) verify_str_arg(split, "split", valid_values=("train", "test", "additional")) if calibration: - verify_str_arg(calibration, "calibration", valid_values=("perfect", "imperfect", None)) + verify_str_arg(calibration, "calibration", valid_values=("perfect", "imperfect", "both", None)) if split == "test": + calibration = None warnings.warn( "\nSplit 'test' has no calibration settings, ignoring calibration argument.", RuntimeWarning ) From 0759706aacda6e9aa93ff5140bbc5e906fd257f9 Mon Sep 17 00:00:00 2001 From: Ponku Date: Thu, 14 Jul 2022 10:03:27 +0100 Subject: [PATCH 19/35] Clean-up. Disp map format to (C, H, W) & valid mask to (H, W).
(#6259) --- test/test_datasets.py | 3552 +++++++++++----------- torchvision/datasets/__init__.py | 11 +- torchvision/datasets/_stereo_matching.py | 288 +- 3 files changed, 2081 insertions(+), 1770 deletions(-) diff --git a/test/test_datasets.py b/test/test_datasets.py index 518a95362b9..dd3c89b9bdc 100644 --- a/test/test_datasets.py +++ b/test/test_datasets.py @@ -1,4 +1,3 @@ -from abc import abstractmethod import bz2 import contextlib import csv @@ -25,701 +24,542 @@ from torchvision import datasets -class StereoETH3DTestCase(datasets_utils.ImageDatasetTestCase): - DATASET_CLASS = datasets.StereoETH3D - ADDITIONAL_CONFIGS = datasets_utils.combinations_grid(split=("train", "test")) - FEATURE_TYPES = (PIL.Image.Image, PIL.Image.Image, (np.ndarray, type(None)), (np.ndarray, type(None))) - - @staticmethod - def _create_scene_folder(num_examples: int, root_dir: str) -> List[pathlib.Path]: - # create the scene folder - image_paths = [] - # make the root_dir if it does not exits - os.makedirs(root_dir, exist_ok=True) - - for i in range(num_examples): - scene_dir = os.path.join(root_dir, f"scene_{i}") - os.makedirs(scene_dir, exist_ok=True) - # populate with left right images - image_paths.append(datasets_utils.create_image_file(root=scene_dir, name="im0.png", size=(100, 100))) - image_paths.append(datasets_utils.create_image_file(root=scene_dir, name="im1.png", size=(100, 100))) - return image_paths - - @staticmethod - def _create_annotation_folder(num_examples: int, root_dir: str) -> List[pathlib.Path]: - paths = [] - # make the root_dir if it does not exits - os.makedirs(root_dir, exist_ok=True) +class STL10TestCase(datasets_utils.ImageDatasetTestCase): + DATASET_CLASS = datasets.STL10 + ADDITIONAL_CONFIGS = datasets_utils.combinations_grid(split=("train", "test", "unlabeled", "train+unlabeled")) - # create scene directories - for i in range(num_examples): - scene_dir = os.path.join(root_dir, f"scene_{i}") - os.makedirs(scene_dir, exist_ok=True) - # populate with a random png file for occlusion mask, and a pfm file for disparity - paths.append(datasets_utils.create_image_file(root=scene_dir, name="mask0nocc.png", size=(1, 100, 100))) - pfm_path = os.path.join(scene_dir, "disp0GT.pfm") - datasets_utils.make_fake_pfm_file(h=100, w=100, file_name=pfm_path) - paths.append(pfm_path) - return paths + @ staticmethod + def _make_binary_file(num_elements, root, name): + file_name = os.path.join(root, name) + np.zeros(num_elements, dtype=np.uint8).tofile(file_name) - def inject_fake_data(self, tmpdir, config): - eth3d_dir = os.path.join(tmpdir, "ETH3D") + @ staticmethod + def _make_image_file(num_images, root, name, num_channels=3, height=96, width=96): + STL10TestCase._make_binary_file(num_images * num_channels * height * width, root, name) - num_examples = 2 if config["split"] == "train" else 3 + @ staticmethod + def _make_label_file(num_images, root, name): + STL10TestCase._make_binary_file(num_images, root, name) - split_name = "two_view_training" if config["split"] == "train" else "two_view_test" - split_dir = os.path.join(eth3d_dir, split_name) - self._create_scene_folder(num_examples, split_dir) + @ staticmethod + def _make_class_names_file(root, name="class_names.txt"): + with open(os.path.join(root, name), "w") as fh: + for cname in ("airplane", "bird"): + fh.write(f"{cname}\n") - if config["split"] == "train": - annot_dir = os.path.join(eth3d_dir, "two_view_training_gt") - self._create_annotation_folder(num_examples, annot_dir) + @ staticmethod + def _make_fold_indices_file(root): + num_folds 
= 10 + offset = 0 + with open(os.path.join(root, "fold_indices.txt"), "w") as fh: + for fold in range(num_folds): + line = " ".join([str(idx) for idx in range(offset, offset + fold + 1)]) + fh.write(f"{line}\n") + offset += fold + 1 - return num_examples + return tuple(range(1, num_folds + 1)) - def test_training_test_splits(self): - with self.create_dataset(split="train") as (dataset, _): - assert dataset._images and len(dataset._images) == len(dataset._disparities), "Training images do not match with training disparities" - for _, _, disparity, valid_mask in dataset: - assert len(disparity.shape) == 3 - assert len(valid_mask.shape) == 2 - dh, dw, _ = disparity.shape - mh, mw = valid_mask.shape - assert dh == mh - assert dw == mw + @ staticmethod + def _make_train_files(root, num_unlabeled_images=1): + num_images_in_fold = STL10TestCase._make_fold_indices_file(root) + num_train_images = sum(num_images_in_fold) - with self.create_dataset(split="test") as (dataset, _): - assert all(d == ("", "") for d in dataset._disparities) - for _, _, disparity, valid_mask in dataset: - assert disparity is None - assert valid_mask is None + STL10TestCase._make_image_file(num_train_images, root, "train_X.bin") + STL10TestCase._make_label_file(num_train_images, root, "train_y.bin") + STL10TestCase._make_image_file(1, root, "unlabeled_X.bin") - def test_bad_input(self): - with pytest.raises(ValueError, match="Unknown value 'bad' for argument split"): - with self.create_dataset(split="bad"): - pass + return dict(train=num_train_images, unlabeled=num_unlabeled_images) + @ staticmethod + def _make_test_files(root, num_images=2): + STL10TestCase._make_image_file(num_images, root, "test_X.bin") + STL10TestCase._make_label_file(num_images, root, "test_y.bin") -class CREStereoSynthethicTestCase(datasets_utils.ImageDatasetTestCase): - DATASET_CLASS = datasets.CREStereoSynthetic - ADDITIONAL_CONFIGS = datasets_utils.combinations_grid(split=("tree", "shapenet", "reflective", "hole")) - FEATURE_TYPES = (PIL.Image.Image, PIL.Image.Image, (np.ndarray, type(None)), (np.ndarray, type(None))) + return dict(test=num_images) def inject_fake_data(self, tmpdir, config): - crestereo_dir = pathlib.Path(tmpdir) / "CREStereo" - os.makedirs(crestereo_dir, exist_ok=True) + root_folder = os.path.join(tmpdir, "stl10_binary") + os.mkdir(root_folder) - split_dir = crestereo_dir / config["split"] - os.makedirs(split_dir, exist_ok=True) - num_examples = 4 + num_images_in_split = self._make_train_files(root_folder) + num_images_in_split.update(self._make_test_files(root_folder)) + self._make_class_names_file(root_folder) - for idx in range(num_examples): - datasets_utils.create_image_file(root=split_dir, name=f"{idx}_left.jpg", size=(100, 100)) - datasets_utils.create_image_file(root=split_dir, name=f"{idx}_right.jpg", size=(100, 100)) - # these are going to end up being gray scale images - datasets_utils.create_image_file(root=split_dir, name=f"{idx}_left.disp.jpg", size=(1, 100, 100)) - datasets_utils.create_image_file(root=split_dir, name=f"{idx}_right.disp.jpg", size=(1, 100, 100)) + return sum(num_images_in_split[part] for part in config["split"].split("+")) - return num_examples + def test_folds(self): + for fold in range(10): + with self.create_dataset(split="train", folds=fold) as (dataset, _): + assert len(dataset) == fold + 1 - def test_splits(self): - for split in ("tree", "shapenet", "reflective", "hole"): - with self.create_dataset(split=split) as (dataset, _): - for left, right, disparity, valid_mask in dataset: - left_array 
= np.array(left) - right_array = np.array(right) - h, w, c = left_array.shape - # check that left and right are the same size - assert left_array.shape == right_array.shape - # check general shapes - assert c == 3 - assert len(disparity.shape) == 2 - assert len(valid_mask.shape) == 2 - assert disparity.shape == (h, w) - # check that valid mask is the same size as the disparity - dh, dw = disparity.shape - mh, mw = valid_mask.shape - assert dh == mh - assert dw == mw + def test_unlabeled(self): + with self.create_dataset(split="unlabeled") as (dataset, _): + labels = [dataset[idx][1] for idx in range(len(dataset))] + assert all(label == -1 for label in labels) - def test_bad_input(self): - with pytest.raises(ValueError, match="Unknown value 'bad' for argument split"): - with self.create_dataset(split="bad"): + def test_invalid_folds1(self): + with pytest.raises(ValueError): + with self.create_dataset(folds=10): pass + def test_invalid_folds2(self): + with pytest.raises(ValueError): + with self.create_dataset(folds="0"): + pass + + +class Caltech101TestCase(datasets_utils.ImageDatasetTestCase): + DATASET_CLASS = datasets.Caltech101 + FEATURE_TYPES = (PIL.Image.Image, (int, np.ndarray, tuple)) -class StereoMiddlebury2014TestCase(datasets_utils.ImageDatasetTestCase): - DATASET_CLASS = datasets.StereoMiddlebury2014 ADDITIONAL_CONFIGS = datasets_utils.combinations_grid( - split=("train", "additional"), - calibration=("perfect", "imperfect", "both"), - use_ambient_views=(True, False), + target_type=("category", "annotation", ["category", "annotation"]) ) - FEATURE_TYPES = (PIL.Image.Image, PIL.Image.Image, (np.ndarray, type(None)), (np.ndarray, type(None))) - - @staticmethod - def _make_scene_folder(root_dir: str, scene_name: str, split: str) -> List[str]: - calibrations = [""] if split == "test" else ["-perfect", "-imperfect"] - scene_dirs = [] - for c in calibrations: - scene_dir = os.path.join(root_dir, f"{scene_name}{c}") - os.makedirs(scene_dir, exist_ok=True) - # make normal images first - datasets_utils.create_image_file(root=scene_dir, name=f"im0.png", size=(3, 100, 100)) - datasets_utils.create_image_file(root=scene_dir, name=f"im1.png", size=(3, 100, 100)) - datasets_utils.create_image_file(root=scene_dir, name=f"im1E.png", size=(3, 100, 100)) - datasets_utils.create_image_file(root=scene_dir, name=f"im1L.png", size=(3, 100, 100)) - # these are going to end up being gray scale images - datasets_utils.make_fake_pfm_file(h=100, w=100, file_name=os.path.join(scene_dir, "disp0.pfm")) - datasets_utils.make_fake_pfm_file(h=100, w=100, file_name=os.path.join(scene_dir, "disp1.pfm")) - scene_dirs.append(scene_dir) - return scene_dirs + REQUIRED_PACKAGES = ("scipy",) def inject_fake_data(self, tmpdir, config): - split_scene_map = { - "train": ["Adirondack", "Jadeplant", "Motorcycle", "Piano"], - "additional": ["Backpack", "Bicycle1", "Cable", "Classroom1"], - "test": ["Plants", "Classroom2E", "Classroom2", "Australia"] - } + root = pathlib.Path(tmpdir) / "caltech101" + images = root / "101_ObjectCategories" + annotations = root / "Annotations" - middlebury_dir = pathlib.Path(tmpdir, "Middlebury2014") - os.makedirs(middlebury_dir, exist_ok=True) + categories = (("Faces", "Faces_2"), ("helicopter", "helicopter"), ("ying_yang", "ying_yang")) + num_images_per_category = 2 - split_dir = middlebury_dir / config["split"] - os.makedirs(split_dir, exist_ok=True) + for image_category, annotation_category in categories: + datasets_utils.create_image_folder( + root=images, + name=image_category, + 
file_name_fn=lambda idx: f"image_{idx + 1:04d}.jpg", + num_examples=num_images_per_category, + ) + self._create_annotation_folder( + root=annotations, + name=annotation_category, + file_name_fn=lambda idx: f"annotation_{idx + 1:04d}.mat", + num_examples=num_images_per_category, + ) - num_examples = 4 - for idx in range(num_examples): - # special case for test_bad_input - if config["split"] not in split_scene_map: - return 0 + # This is included in the original archive, but is removed by the dataset. Thus, an empty directory suffices. + os.makedirs(images / "BACKGROUND_Google") - scene_name = split_scene_map[config["split"]][idx] - self._make_scene_folder(root_dir=split_dir, scene_name=scene_name, split=config["split"]) + return num_images_per_category * len(categories) - print(f"Created {scene_name} for split {config['split']}") + def _create_annotation_folder(self, root, name, file_name_fn, num_examples): + root = pathlib.Path(root) / name + os.makedirs(root) - if config["calibration"] == "both": - num_examples *= 2 - return num_examples + for idx in range(num_examples): + self._create_annotation_file(root, file_name_fn(idx)) - def test_train_splits(self): - for split, calibration in itertools.product(["train", "additional"], ["perfect", "imperfect", "both"]): - with self.create_dataset(split=split, calibration=calibration) as (dataset, _): - for left, right, disparity, valid_mask in dataset: - left_array = np.array(left) - right_array = np.array(right) - h, w, c = left_array.shape - # check that left and right are the same size - assert left_array.shape == right_array.shape - # check general shapes - assert c == 3 - assert len(disparity.shape) == 3 - assert len(valid_mask.shape) == 3 - assert disparity.shape == (h, w, 3) - # check that valid mask is the same size as the disparity - dh, dw, c = disparity.shape - print(valid_mask.shape) - mh, mw, _ = valid_mask.shape - assert dh == mh - assert dw == mw + def _create_annotation_file(self, root, name): + mdict = dict(obj_contour=torch.rand((2, torch.randint(3, 6, size=())), dtype=torch.float64).numpy()) + datasets_utils.lazy_importer.scipy.io.savemat(str(pathlib.Path(root) / name), mdict) - def test_test_split(self): - for split in ["test"]: - with self.create_dataset(split=split, calibration=None) as (dataset, _): - for left, right, disparity, valid_mask in dataset: - left_array = np.array(left) - right_array = np.array(right) - h, w, c = left_array.shape - # check that left and right are the same size - assert left_array.shape == right_array.shape - # check general shapes - assert c == 3 - assert disparity is None - assert valid_mask is None + def test_combined_targets(self): + target_types = ["category", "annotation"] - def test_augmented_view_usage(self): - with self.create_dataset(split="train", use_ambient_views=True) as (dataset, _): - for left, right, _, _ in dataset: - left_array = np.array(left) - right_array = np.array(right) - # check that left and right are the same size - assert left_array.shape == right_array.shape + individual_targets = [] + for target_type in target_types: + with self.create_dataset(target_type=target_type) as (dataset, _): + _, target = dataset[0] + individual_targets.append(target) - def test_warnings_train(self): - # train set invalid - split = "train" - calibration = None - with pytest.warns( - RuntimeWarning, - match=f"\nSplit '{split}' has calibration settings, however None was provided as an argument." - f"\nSetting calibration to 'perfect' for split '{split}'. 
Available calibration settings are: 'perfect', 'imperfect', 'both'.", - ): - with self.create_dataset(split=split, calibration=calibration): - pass + with self.create_dataset(target_type=target_types) as (dataset, _): + _, combined_targets = dataset[0] - def test_warnings_test(self): - # test set invalid - split = "test" - calibration = "perfect" - with pytest.warns( - RuntimeWarning, - match="\nSplit 'test' has only no calibration settings, ignoring calibration argument." - ): - with self.create_dataset(split=split, calibration=calibration): - pass + actual = len(individual_targets) + expected = len(combined_targets) + assert ( + actual == expected + ), "The number of the returned combined targets does not match the the number targets if requested " + f"individually: {actual} != {expected}", - def test_bad_input(self): - with pytest.raises(ValueError, match="Unknown value 'bad' for argument split"): - with self.create_dataset(split="bad"): - pass + for target_type, combined_target, individual_target in zip(target_types, combined_targets, individual_targets): + with self.subTest(target_type=target_type): + actual = type(combined_target) + expected = type(individual_target) + assert ( + actual is expected + ), "Type of the combined target does not match the type of the corresponding individual target: " + f"{actual} is not {expected}", -class StereoKitti2012TestCase(datasets_utils.ImageDatasetTestCase): - DATASET_CLASS = datasets.StereoKitti2012 - ADDITIONAL_CONFIGS = datasets_utils.combinations_grid(split=("train", "test")) - FEATURE_TYPES = (PIL.Image.Image, PIL.Image.Image, (np.ndarray, type(None)), (np.ndarray, type(None))) +class Caltech256TestCase(datasets_utils.ImageDatasetTestCase): + DATASET_CLASS = datasets.Caltech256 def inject_fake_data(self, tmpdir, config): - kitti_dir = pathlib.Path(tmpdir) / "Kitti2012" - os.makedirs(kitti_dir, exist_ok=True) - - split_dir = kitti_dir / (config["split"] + "ing") - os.makedirs(split_dir, exist_ok=True) - - num_examples = 4 + tmpdir = pathlib.Path(tmpdir) / "caltech256" / "256_ObjectCategories" - datasets_utils.create_image_folder( - root=split_dir, - name="colored_0", - file_name_fn=lambda i: f"{i:06d}_10.png", - num_examples=num_examples, - size=(3, 100, 200), - ) - datasets_utils.create_image_folder( - root=split_dir, - name="colored_1", - file_name_fn=lambda i: f"{i:06d}_10.png", - num_examples=num_examples, - size=(3, 100, 200), - ) + categories = ((1, "ak47"), (127, "laptop-101"), (257, "clutter")) + num_images_per_category = 2 - if config["split"] == "train": + for idx, category in categories: datasets_utils.create_image_folder( - root=split_dir, - name="disp_noc", - file_name_fn=lambda i: f"{i:06d}.png", - num_examples=num_examples, - # Kitti2012 uses a single channel image for disparities - size=(1, 100, 200), + tmpdir, + name=f"{idx:03d}.{category}", + file_name_fn=lambda image_idx: f"{idx:03d}_{image_idx + 1:04d}.jpg", + num_examples=num_images_per_category, ) - return num_examples - - def test_train_splits(self): - for split in ["train"]: - with self.create_dataset(split=split) as (dataset, _): - for left, right, disparity, valid_mask in dataset: - left_array = np.array(left) - right_array = np.array(right) - h, w, c = left_array.shape - # check that left and right are the same size - assert left_array.shape == right_array.shape - # check general shapes - assert c == 3 - assert len(disparity.shape) == 2 - assert len(valid_mask.shape) == 2 - assert disparity.shape == (h, w) - # check that valid mask is the same size as the disparity 
- dh, dw = disparity.shape - mh, mw = valid_mask.shape - assert dh == mh - assert dw == mw - - def test_test_split(self): - for split in ["test"]: - with self.create_dataset(split=split) as (dataset, _): - for left, right, disparity, valid_mask in dataset: - left_array = np.array(left) - right_array = np.array(right) - h, w, c = left_array.shape - # check that left and right are the same size - assert left_array.shape == right_array.shape - # check general shapes - assert c == 3 - assert disparity is None - assert valid_mask is None - - def test_bad_input(self): - with pytest.raises(ValueError, match="Unknown value 'bad' for argument split"): - with self.create_dataset(split="bad"): - pass + return num_images_per_category * len(categories) -class StereoKitti2015TestCase(datasets_utils.ImageDatasetTestCase): - DATASET_CLASS = datasets.StereoKitti2015 - ADDITIONAL_CONFIGS = datasets_utils.combinations_grid(split=("train", "test")) - FEATURE_TYPES = (PIL.Image.Image, PIL.Image.Image, (np.ndarray, type(None)), (np.ndarray, type(None))) +class WIDERFaceTestCase(datasets_utils.ImageDatasetTestCase): + DATASET_CLASS = datasets.WIDERFace + FEATURE_TYPES = (PIL.Image.Image, (dict, type(None))) # test split returns None as target + ADDITIONAL_CONFIGS = datasets_utils.combinations_grid(split=("train", "val", "test")) def inject_fake_data(self, tmpdir, config): - kitti_dir = pathlib.Path(tmpdir) / "Kitti2015" - os.makedirs(kitti_dir, exist_ok=True) - - split_dir = kitti_dir / (config["split"] + "ing") - os.makedirs(split_dir, exist_ok=True) - - num_examples = 4 + widerface_dir = pathlib.Path(tmpdir) / "widerface" + annotations_dir = widerface_dir / "wider_face_split" + os.makedirs(annotations_dir) - datasets_utils.create_image_folder( - root=split_dir, - name="image_2", - file_name_fn=lambda i: f"{i:06d}_10.png", - num_examples=num_examples, - size=(3, 100, 200), - ) - datasets_utils.create_image_folder( - root=split_dir, - name="image_3", - file_name_fn=lambda i: f"{i:06d}_10.png", - num_examples=num_examples, - size=(3, 100, 200), - ) + split_to_idx = split_to_num_examples = { + "train": 1, + "val": 2, + "test": 3, + } - if config["split"] == "train": - datasets_utils.create_image_folder( - root=split_dir, - name="disp_occ_0", - file_name_fn=lambda i: f"{i:06d}.png", - num_examples=num_examples, - # Kitti2015 uses a single channel image for disparities - size=(1, 100, 200), - ) + # We need to create all folders regardless of the split in config + for split in ("train", "val", "test"): + split_idx = split_to_idx[split] + num_examples = split_to_num_examples[split] datasets_utils.create_image_folder( - root=split_dir, - name="disp_occ_1", - file_name_fn=lambda i: f"{i:06d}.png", + root=tmpdir, + name=widerface_dir / f"WIDER_{split}" / "images" / "0--Parade", + file_name_fn=lambda image_idx: f"0_Parade_marchingband_1_{split_idx + image_idx}.jpg", num_examples=num_examples, - # Kitti2015 uses a single channel image for disparities - size=(1, 100, 200), ) - return num_examples + annotation_file_name = { + "train": annotations_dir / "wider_face_train_bbx_gt.txt", + "val": annotations_dir / "wider_face_val_bbx_gt.txt", + "test": annotations_dir / "wider_face_test_filelist.txt", + }[split] - def test_train_splits(self): - for split in ["train"]: - with self.create_dataset(split=split) as (dataset, _): - for left, right, disparity, valid_mask in dataset: - left_array = np.array(left) - right_array = np.array(right) - h, w, c = left_array.shape - # check that left and right are the same size - assert 
left_array.shape == right_array.shape - # check general shapes - assert c == 3 - assert len(disparity.shape) == 2 - assert len(valid_mask.shape) == 2 - assert disparity.shape == (h, w) - # check that valid mask is the same size as the disparity - dh, dw = disparity.shape - mh, mw = valid_mask.shape - assert dh == mh - assert dw == mw + annotation_content = { + "train": "".join( + f"0--Parade/0_Parade_marchingband_1_{split_idx + image_idx}.jpg\n1\n449 330 122 149 0 0 0 0 0 0\n" + for image_idx in range(num_examples) + ), + "val": "".join( + f"0--Parade/0_Parade_marchingband_1_{split_idx + image_idx}.jpg\n1\n501 160 285 443 0 0 0 0 0 0\n" + for image_idx in range(num_examples) + ), + "test": "".join( + f"0--Parade/0_Parade_marchingband_1_{split_idx + image_idx}.jpg\n" + for image_idx in range(num_examples) + ), + }[split] - def test_test_split(self): - for split in ["test"]: - with self.create_dataset(split=split) as (dataset, _): - for left, right, disparity, valid_mask in dataset: - left_array = np.array(left) - right_array = np.array(right) - h, w, c = left_array.shape - # check that left and right are the same size - assert left_array.shape == right_array.shape - # check general shapes - assert c == 3 - assert disparity is None - assert valid_mask is None + with open(annotation_file_name, "w") as annotation_file: + annotation_file.write(annotation_content) - def test_bad_input(self): - with pytest.raises(ValueError, match="Unknown value 'bad' for argument split"): - with self.create_dataset(split="bad"): - pass + return split_to_num_examples[config["split"]] -class StereoSceneFlowTestCase(datasets_utils.ImageDatasetTestCase): - DATASET_CLASS = datasets.StereoSceneFlow - ADDITIONAL_CONFIGS = datasets_utils.combinations_grid( - split=("FlyingThings3D", "Driving", "Monkaa"), - pass_name=("clean", "final") +class CityScapesTestCase(datasets_utils.ImageDatasetTestCase): + DATASET_CLASS = datasets.Cityscapes + TARGET_TYPES = ( + "instance", + "semantic", + "polygon", + "color", ) - FEATURE_TYPES = (PIL.Image.Image, PIL.Image.Image, (np.ndarray, type(None)), (np.ndarray, type(None))) + ADDITIONAL_CONFIGS = ( + *datasets_utils.combinations_grid(mode=("fine",), split=("train", "test", "val"), target_type=TARGET_TYPES), + *datasets_utils.combinations_grid( + mode=("coarse",), + split=("train", "train_extra", "val"), + target_type=TARGET_TYPES, + ), + ) + FEATURE_TYPES = (PIL.Image.Image, (dict, PIL.Image.Image)) - @staticmethod - def _create_pfm_folder(root: str, name: str, file_name_fn: Callable[..., str], num_examples: int, size: Tuple[int, int]) -> List[str]: - root = pathlib.Path(root) / name - os.makedirs(root, exist_ok=True) + def inject_fake_data(self, tmpdir, config): - paths = [] - for i in range(num_examples): - datasets_utils.make_fake_pfm_file(size[0], size[1], root / file_name_fn(i)) - paths.append(str(root / file_name_fn(i))) - return paths + tmpdir = pathlib.Path(tmpdir) - def inject_fake_data(self, tmpdir, config): - scene_flow_dir = pathlib.Path(tmpdir) / "SceneFlow" - os.makedirs(scene_flow_dir, exist_ok=True) + mode_to_splits = { + "Coarse": ["train", "train_extra", "val"], + "Fine": ["train", "test", "val"], + } - split_dir = scene_flow_dir / config["split"] - os.makedirs(split_dir, exist_ok=True) + if config["split"] == "train": # just for coverage of the number of samples + cities = ["bochum", "bremen"] + else: + cities = ["bochum"] - pass_dir_map = { - "clean": "frames_cleanpass", - "final": "frames_finalpass", + polygon_target = { + "imgHeight": 1024, + "imgWidth": 2048, + 
"objects": [ + { + "label": "sky", + "polygon": [ + [1241, 0], + [1234, 156], + [1478, 197], + [1611, 172], + [1606, 0], + ], + }, + { + "label": "road", + "polygon": [ + [0, 448], + [1331, 274], + [1473, 265], + [2047, 605], + [2047, 1023], + [0, 1023], + ], + }, + ], } - num_examples = 1 - pass_dir_name = pass_dir_map[config["pass_name"]] - # create pass directories - pass_dir = split_dir / pass_dir_name - disp_dir = split_dir / "disparity" - os.makedirs(pass_dir, exist_ok=True) - os.makedirs(disp_dir, exist_ok=True) + for mode in ["Coarse", "Fine"]: + gt_dir = tmpdir / f"gt{mode}" + for split in mode_to_splits[mode]: + for city in cities: - for direction in ["left", "right"]: - for scene_idx in range(num_examples): - os.makedirs(pass_dir / f"scene_{scene_idx:06d}", exist_ok=True) + def make_image(name, size=10): + datasets_utils.create_image_folder( + root=gt_dir / split, + name=city, + file_name_fn=lambda _: name, + size=size, + num_examples=1, + ) + + make_image(f"{city}_000000_000000_gt{mode}_instanceIds.png") + make_image(f"{city}_000000_000000_gt{mode}_labelIds.png") + make_image(f"{city}_000000_000000_gt{mode}_color.png", size=(4, 10, 10)) + + polygon_target_name = gt_dir / split / city / f"{city}_000000_000000_gt{mode}_polygons.json" + with open(polygon_target_name, "w") as outfile: + json.dump(polygon_target, outfile) + + # Create leftImg8bit folder + for split in ["test", "train_extra", "train", "val"]: + for city in cities: datasets_utils.create_image_folder( - root=pass_dir / f"scene_{scene_idx:06d}", - name=direction, - file_name_fn=lambda i: f"{i:06d}.png", - num_examples=3, - size=(3, 200, 100), + root=tmpdir / "leftImg8bit" / split, + name=city, + file_name_fn=lambda _: f"{city}_000000_000000_leftImg8bit.png", + num_examples=1, ) - os.makedirs(disp_dir / f"scene_{scene_idx:06d}", exist_ok=True) - self._create_pfm_folder( - root=disp_dir / f"scene_{scene_idx:06d}", - name=direction, - file_name_fn=lambda i: f"{i:06d}.pfm", - num_examples=3, - size=(100, 200), - ) + info = {"num_examples": len(cities)} + if config["target_type"] == "polygon": + info["expected_polygon_target"] = polygon_target + return info - return num_examples * 3 + def test_combined_targets(self): + target_types = ["semantic", "polygon", "color"] - def test_train_splits(self): - for split_name, pass_name in itertools.product(["FlyingThings3D", "Driving", "Monkaa"], ["clean", "final"]): - with self.create_dataset(split=split_name, pass_name=pass_name) as (dataset, _): - for left, right, disparity, valid_mask in dataset: - print(f"Split {split_name} pass {pass_name}") - left_array = np.array(left) - right_array = np.array(right) - h, w, c = left_array.shape - # check that left and right are the same size - assert left_array.shape == right_array.shape - print(left_array.shape) - # check general shapes - assert c == 3 - assert len(disparity.shape) == 3 - assert len(valid_mask.shape) == 3 - assert disparity.shape == (h, w, 3) - # check that valid mask is the same size as the disparity - dh, dw, _ = disparity.shape - mh, mw, _ = valid_mask.shape - assert dh == mh - assert dw == mw + with self.create_dataset(target_type=target_types) as (dataset, _): + output = dataset[0] + assert isinstance(output, tuple) + assert len(output) == 2 + assert isinstance(output[0], PIL.Image.Image) + assert isinstance(output[1], tuple) + assert len(output[1]) == 3 + assert isinstance(output[1][0], PIL.Image.Image) # semantic + assert isinstance(output[1][1], dict) # polygon + assert isinstance(output[1][2], PIL.Image.Image) # color 
+    def test_feature_types_target_color(self):
+        with self.create_dataset(target_type="color") as (dataset, _):
+            color_img, color_target = dataset[0]
+            assert isinstance(color_img, PIL.Image.Image)
+            assert np.array(color_target).shape[2] == 4

-class StereoFallingThingsTestCase(datasets_utils.ImageDatasetTestCase):
-    DATASET_CLASS = datasets.StereoFallingThings
-    ADDITIONAL_CONFIGS = datasets_utils.combinations_grid(split=("single", "mixed"))
-    FEATURE_TYPES = (PIL.Image.Image, PIL.Image.Image, (np.ndarray, type(None)), (np.ndarray, type(None)))
+    def test_feature_types_target_polygon(self):
+        with self.create_dataset(target_type="polygon") as (dataset, info):
+            polygon_img, polygon_target = dataset[0]
+            assert isinstance(polygon_img, PIL.Image.Image)
+            assert polygon_target == info["expected_polygon_target"]

-    @staticmethod
-    def _make_scene_folder(root: str, scene_name: str, num_examples: int, size: Tuple[int, int]):
-        root = pathlib.Path(root) / scene_name
-        os.makedirs(root, exist_ok=True)
-        datasets_utils.create_image_file(root, "image1.left.jpg", size=(3, size[0], size[1]))
-        datasets_utils.create_image_file(root, "image1.right.jpg", size=(3, size[0], size[1]))
-        # single channel depth maps
-        datasets_utils.create_image_file(root, "image1.left.depth.jpg", size=(1, size[0], size[1]))
-        datasets_utils.create_image_file(root, "image1.right.depth.jpg", size=(1, size[0], size[1]))
+class ImageNetTestCase(datasets_utils.ImageDatasetTestCase):
+    DATASET_CLASS = datasets.ImageNet
+    REQUIRED_PACKAGES = ("scipy",)
+    ADDITIONAL_CONFIGS = datasets_utils.combinations_grid(split=("train", "val"))

     def inject_fake_data(self, tmpdir, config):
-        fallingthings_dir = pathlib.Path(tmpdir) / "FallingThings"
-
-        split_dir = pathlib.Path(fallingthings_dir) / config["split"]
-        os.makedirs(split_dir, exist_ok=True)
-
-        num_examples = 4
+        tmpdir = pathlib.Path(tmpdir)

-        for i in range(num_examples):
-            self._make_scene_folder(
-                root=split_dir,
-                scene_name=f"scene_{i:06d}",
+        wnid = "n01234567"
+        if config["split"] == "train":
+            num_examples = 3
+            datasets_utils.create_image_folder(
+                root=tmpdir,
+                name=tmpdir / "train" / wnid / wnid,
+                file_name_fn=lambda image_idx: f"{wnid}_{image_idx}.JPEG",
+                num_examples=num_examples,
+            )
+        else:
+            num_examples = 1
+            datasets_utils.create_image_folder(
+                root=tmpdir,
+                name=tmpdir / "val" / wnid,
+                file_name_fn=lambda image_idx: f"ILSVRC2012_val_0000000{image_idx}.JPEG",
                 num_examples=num_examples,
-                size=(100, 200),
             )

+        wnid_to_classes = {wnid: [1]}
+        torch.save((wnid_to_classes, None), tmpdir / "meta.bin")
         return num_examples

-    def test_splits(self):
-        for split_name in ["single", "mixed"]:
-            with self.create_dataset(split=split_name) as (dataset, _):
-                for left, right, disparity, valid_mask in dataset:
-                    print(f"Split {split_name}")
-                    left_array = np.array(left)
-                    right_array = np.array(right)
-                    h, w, c = left_array.shape
-                    # check that left and right are the same size
-                    assert left_array.shape == right_array.shape
-                    print(left_array.shape)
-                    # check general shapes
-                    assert c == 3
-                    assert len(disparity.shape) == 3
-                    assert len(valid_mask.shape) == 2
-                    assert disparity.shape == (h, w)
-                    # check that valid mask is the same size as the disparity
-                    dh, dw = disparity.shape
-                    mh, mw = valid_mask.shape
-                    assert dh == mh
-                    assert dw == mw
+class CIFAR10TestCase(datasets_utils.ImageDatasetTestCase):
+    DATASET_CLASS = datasets.CIFAR10
+    ADDITIONAL_CONFIGS = datasets_utils.combinations_grid(train=(True, False))

-class STL10TestCase(datasets_utils.ImageDatasetTestCase):
-    DATASET_CLASS = 
datasets.STL10 - ADDITIONAL_CONFIGS = datasets_utils.combinations_grid(split=("train", "test", "unlabeled", "train+unlabeled")) + _VERSION_CONFIG = dict( + base_folder="cifar-10-batches-py", + train_files=tuple(f"data_batch_{idx}" for idx in range(1, 6)), + test_files=("test_batch",), + labels_key="labels", + meta_file="batches.meta", + num_categories=10, + categories_key="label_names", + ) - @ staticmethod - def _make_binary_file(num_elements, root, name): - file_name = os.path.join(root, name) - np.zeros(num_elements, dtype=np.uint8).tofile(file_name) + def inject_fake_data(self, tmpdir, config): + tmpdir = pathlib.Path(tmpdir) / self._VERSION_CONFIG["base_folder"] + os.makedirs(tmpdir) - @ staticmethod - def _make_image_file(num_images, root, name, num_channels=3, height=96, width=96): - STL10TestCase._make_binary_file(num_images * num_channels * height * width, root, name) + num_images_per_file = 1 + for name in itertools.chain(self._VERSION_CONFIG["train_files"], self._VERSION_CONFIG["test_files"]): + self._create_batch_file(tmpdir, name, num_images_per_file) - @ staticmethod - def _make_label_file(num_images, root, name): - STL10TestCase._make_binary_file(num_images, root, name) + categories = self._create_meta_file(tmpdir) - @ staticmethod - def _make_class_names_file(root, name="class_names.txt"): - with open(os.path.join(root, name), "w") as fh: - for cname in ("airplane", "bird"): - fh.write(f"{cname}\n") + return dict( + num_examples=num_images_per_file + * len(self._VERSION_CONFIG["train_files"] if config["train"] else self._VERSION_CONFIG["test_files"]), + categories=categories, + ) - @ staticmethod - def _make_fold_indices_file(root): - num_folds = 10 - offset = 0 - with open(os.path.join(root, "fold_indices.txt"), "w") as fh: - for fold in range(num_folds): - line = " ".join([str(idx) for idx in range(offset, offset + fold + 1)]) - fh.write(f"{line}\n") - offset += fold + 1 + def _create_batch_file(self, root, name, num_images): + np_rng = np.random.RandomState(0) + data = datasets_utils.create_image_or_video_tensor((num_images, 32 * 32 * 3)) + labels = np_rng.randint(0, self._VERSION_CONFIG["num_categories"], size=num_images).tolist() + self._create_binary_file(root, name, {"data": data, self._VERSION_CONFIG["labels_key"]: labels}) - return tuple(range(1, num_folds + 1)) + def _create_meta_file(self, root): + categories = [ + f"{idx:0{len(str(self._VERSION_CONFIG['num_categories'] - 1))}d}" + for idx in range(self._VERSION_CONFIG["num_categories"]) + ] + self._create_binary_file( + root, self._VERSION_CONFIG["meta_file"], {self._VERSION_CONFIG["categories_key"]: categories} + ) + return categories - @ staticmethod - def _make_train_files(root, num_unlabeled_images=1): - num_images_in_fold = STL10TestCase._make_fold_indices_file(root) - num_train_images = sum(num_images_in_fold) + def _create_binary_file(self, root, name, content): + with open(pathlib.Path(root) / name, "wb") as fh: + pickle.dump(content, fh) - STL10TestCase._make_image_file(num_train_images, root, "train_X.bin") - STL10TestCase._make_label_file(num_train_images, root, "train_y.bin") - STL10TestCase._make_image_file(1, root, "unlabeled_X.bin") + def test_class_to_idx(self): + with self.create_dataset() as (dataset, info): + expected = {category: label for label, category in enumerate(info["categories"])} + actual = dataset.class_to_idx + assert actual == expected - return dict(train=num_train_images, unlabeled=num_unlabeled_images) - @ staticmethod - def _make_test_files(root, num_images=2): - 
STL10TestCase._make_image_file(num_images, root, "test_X.bin") - STL10TestCase._make_label_file(num_images, root, "test_y.bin") +class CIFAR100(CIFAR10TestCase): + DATASET_CLASS = datasets.CIFAR100 - return dict(test=num_images) + _VERSION_CONFIG = dict( + base_folder="cifar-100-python", + train_files=("train",), + test_files=("test",), + labels_key="fine_labels", + meta_file="meta", + num_categories=100, + categories_key="fine_label_names", + ) - def inject_fake_data(self, tmpdir, config): - root_folder = os.path.join(tmpdir, "stl10_binary") - os.mkdir(root_folder) - num_images_in_split = self._make_train_files(root_folder) - num_images_in_split.update(self._make_test_files(root_folder)) - self._make_class_names_file(root_folder) +class CelebATestCase(datasets_utils.ImageDatasetTestCase): + DATASET_CLASS = datasets.CelebA + FEATURE_TYPES = (PIL.Image.Image, (torch.Tensor, int, tuple, type(None))) - return sum(num_images_in_split[part] for part in config["split"].split("+")) + ADDITIONAL_CONFIGS = datasets_utils.combinations_grid( + split=("train", "valid", "test", "all"), + target_type=("attr", "identity", "bbox", "landmarks", ["attr", "identity"]), + ) - def test_folds(self): - for fold in range(10): - with self.create_dataset(split="train", folds=fold) as (dataset, _): - assert len(dataset) == fold + 1 + _SPLIT_TO_IDX = dict(train=0, valid=1, test=2) - def test_unlabeled(self): - with self.create_dataset(split="unlabeled") as (dataset, _): - labels = [dataset[idx][1] for idx in range(len(dataset))] - assert all(label == -1 for label in labels) + def inject_fake_data(self, tmpdir, config): + base_folder = pathlib.Path(tmpdir) / "celeba" + os.makedirs(base_folder) - def test_invalid_folds1(self): - with pytest.raises(ValueError): - with self.create_dataset(folds=10): - pass + num_images, num_images_per_split = self._create_split_txt(base_folder) - def test_invalid_folds2(self): - with pytest.raises(ValueError): - with self.create_dataset(folds="0"): - pass + datasets_utils.create_image_folder( + base_folder, "img_align_celeba", lambda idx: f"{idx + 1:06d}.jpg", num_images + ) + attr_names = self._create_attr_txt(base_folder, num_images) + self._create_identity_txt(base_folder, num_images) + self._create_bbox_txt(base_folder, num_images) + self._create_landmarks_txt(base_folder, num_images) + return dict(num_examples=num_images_per_split[config["split"]], attr_names=attr_names) -class Caltech101TestCase(datasets_utils.ImageDatasetTestCase): - DATASET_CLASS = datasets.Caltech101 - FEATURE_TYPES = (PIL.Image.Image, (int, np.ndarray, tuple)) + def _create_split_txt(self, root): + num_images_per_split = dict(train=4, valid=3, test=2) - ADDITIONAL_CONFIGS = datasets_utils.combinations_grid( - target_type=("category", "annotation", ["category", "annotation"]) - ) - REQUIRED_PACKAGES = ("scipy",) + data = [ + [self._SPLIT_TO_IDX[split]] for split, num_images in num_images_per_split.items() for _ in range(num_images) + ] + self._create_txt(root, "list_eval_partition.txt", data) - def inject_fake_data(self, tmpdir, config): - root = pathlib.Path(tmpdir) / "caltech101" - images = root / "101_ObjectCategories" - annotations = root / "Annotations" + num_images_per_split["all"] = num_images = sum(num_images_per_split.values()) + return num_images, num_images_per_split - categories = (("Faces", "Faces_2"), ("helicopter", "helicopter"), ("ying_yang", "ying_yang")) - num_images_per_category = 2 + def _create_attr_txt(self, root, num_images): + header = ("5_o_Clock_Shadow", "Young") + data = 
torch.rand((num_images, len(header))).ge(0.5).int().mul(2).sub(1).tolist() + self._create_txt(root, "list_attr_celeba.txt", data, header=header, add_num_examples=True) + return header - for image_category, annotation_category in categories: - datasets_utils.create_image_folder( - root=images, - name=image_category, - file_name_fn=lambda idx: f"image_{idx + 1:04d}.jpg", - num_examples=num_images_per_category, - ) - self._create_annotation_folder( - root=annotations, - name=annotation_category, - file_name_fn=lambda idx: f"annotation_{idx + 1:04d}.mat", - num_examples=num_images_per_category, - ) + def _create_identity_txt(self, root, num_images): + data = torch.randint(1, 4, size=(num_images, 1)).tolist() + self._create_txt(root, "identity_CelebA.txt", data) - # This is included in the original archive, but is removed by the dataset. Thus, an empty directory suffices. - os.makedirs(images / "BACKGROUND_Google") + def _create_bbox_txt(self, root, num_images): + header = ("x_1", "y_1", "width", "height") + data = torch.randint(10, size=(num_images, len(header))).tolist() + self._create_txt( + root, "list_bbox_celeba.txt", data, header=header, add_num_examples=True, add_image_id_to_header=True + ) - return num_images_per_category * len(categories) + def _create_landmarks_txt(self, root, num_images): + header = ("lefteye_x", "rightmouth_y") + data = torch.randint(10, size=(num_images, len(header))).tolist() + self._create_txt(root, "list_landmarks_align_celeba.txt", data, header=header, add_num_examples=True) - def _create_annotation_folder(self, root, name, file_name_fn, num_examples): - root = pathlib.Path(root) / name - os.makedirs(root) + def _create_txt(self, root, name, data, header=None, add_num_examples=False, add_image_id_to_header=False): + with open(pathlib.Path(root) / name, "w") as fh: + if add_num_examples: + fh.write(f"{len(data)}\n") - for idx in range(num_examples): - self._create_annotation_file(root, file_name_fn(idx)) + if header: + if add_image_id_to_header: + header = ("image_id", *header) + fh.write(f"{' '.join(header)}\n") - def _create_annotation_file(self, root, name): - mdict = dict(obj_contour=torch.rand((2, torch.randint(3, 6, size=())), dtype=torch.float64).numpy()) - datasets_utils.lazy_importer.scipy.io.savemat(str(pathlib.Path(root) / name), mdict) + for idx, line in enumerate(data, 1): + fh.write(f"{' '.join((f'{idx:06d}.jpg', *[str(value) for value in line]))}\n") def test_combined_targets(self): - target_types = ["category", "annotation"] + target_types = ["attr", "identity", "bbox", "landmarks"] individual_targets = [] for target_type in target_types: @@ -746,1062 +586,659 @@ def test_combined_targets(self): ), "Type of the combined target does not match the type of the corresponding individual target: " f"{actual} is not {expected}", + def test_no_target(self): + with self.create_dataset(target_type=[]) as (dataset, _): + _, target = dataset[0] -class Caltech256TestCase(datasets_utils.ImageDatasetTestCase): - DATASET_CLASS = datasets.Caltech256 + assert target is None - def inject_fake_data(self, tmpdir, config): - tmpdir = pathlib.Path(tmpdir) / "caltech256" / "256_ObjectCategories" + def test_attr_names(self): + with self.create_dataset() as (dataset, info): + assert tuple(dataset.attr_names) == info["attr_names"] - categories = ((1, "ak47"), (127, "laptop-101"), (257, "clutter")) - num_images_per_category = 2 + def test_images_names_split(self): + with self.create_dataset(split="all") as (dataset, _): + all_imgs_names = set(dataset.filename) - for idx, 
category in categories: - datasets_utils.create_image_folder( - tmpdir, - name=f"{idx:03d}.{category}", - file_name_fn=lambda image_idx: f"{idx:03d}_{image_idx + 1:04d}.jpg", - num_examples=num_images_per_category, - ) + merged_imgs_names = set() + for split in ["train", "valid", "test"]: + with self.create_dataset(split=split) as (dataset, _): + merged_imgs_names.update(dataset.filename) - return num_images_per_category * len(categories) + assert merged_imgs_names == all_imgs_names -class WIDERFaceTestCase(datasets_utils.ImageDatasetTestCase): - DATASET_CLASS = datasets.WIDERFace - FEATURE_TYPES = (PIL.Image.Image, (dict, type(None))) # test split returns None as target - ADDITIONAL_CONFIGS = datasets_utils.combinations_grid(split=("train", "val", "test")) +class VOCSegmentationTestCase(datasets_utils.ImageDatasetTestCase): + DATASET_CLASS = datasets.VOCSegmentation + FEATURE_TYPES = (PIL.Image.Image, PIL.Image.Image) + + ADDITIONAL_CONFIGS = ( + *datasets_utils.combinations_grid( + year=[f"20{year:02d}" for year in range(7, 13)], image_set=("train", "val", "trainval") + ), + dict(year="2007", image_set="test"), + dict(year="2007-test", image_set="test"), + ) def inject_fake_data(self, tmpdir, config): - widerface_dir = pathlib.Path(tmpdir) / "widerface" - annotations_dir = widerface_dir / "wider_face_split" - os.makedirs(annotations_dir) + year, is_test_set = ( + ("2007", True) + if config["year"] == "2007-test" or config["image_set"] == "test" + else (config["year"], False) + ) + image_set = config["image_set"] - split_to_idx = split_to_num_examples = { - "train": 1, - "val": 2, - "test": 3, - } + base_dir = pathlib.Path(tmpdir) + if year == "2011": + base_dir /= "TrainVal" + base_dir = base_dir / "VOCdevkit" / f"VOC{year}" + os.makedirs(base_dir) - # We need to create all folders regardless of the split in config - for split in ("train", "val", "test"): - split_idx = split_to_idx[split] - num_examples = split_to_num_examples[split] + num_images, num_images_per_image_set = self._create_image_set_files(base_dir, "ImageSets", is_test_set) + datasets_utils.create_image_folder(base_dir, "JPEGImages", lambda idx: f"{idx:06d}.jpg", num_images) - datasets_utils.create_image_folder( - root=tmpdir, - name=widerface_dir / f"WIDER_{split}" / "images" / "0--Parade", - file_name_fn=lambda image_idx: f"0_Parade_marchingband_1_{split_idx + image_idx}.jpg", - num_examples=num_examples, - ) + datasets_utils.create_image_folder(base_dir, "SegmentationClass", lambda idx: f"{idx:06d}.png", num_images) + annotation = self._create_annotation_files(base_dir, "Annotations", num_images) - annotation_file_name = { - "train": annotations_dir / "wider_face_train_bbx_gt.txt", - "val": annotations_dir / "wider_face_val_bbx_gt.txt", - "test": annotations_dir / "wider_face_test_filelist.txt", - }[split] + return dict(num_examples=num_images_per_image_set[image_set], annotation=annotation) - annotation_content = { - "train": "".join( - f"0--Parade/0_Parade_marchingband_1_{split_idx + image_idx}.jpg\n1\n449 330 122 149 0 0 0 0 0 0\n" - for image_idx in range(num_examples) - ), - "val": "".join( - f"0--Parade/0_Parade_marchingband_1_{split_idx + image_idx}.jpg\n1\n501 160 285 443 0 0 0 0 0 0\n" - for image_idx in range(num_examples) - ), - "test": "".join( - f"0--Parade/0_Parade_marchingband_1_{split_idx + image_idx}.jpg\n" - for image_idx in range(num_examples) - ), - }[split] + def _create_image_set_files(self, root, name, is_test_set): + root = pathlib.Path(root) / name + src = pathlib.Path(root) / "Main" + 
os.makedirs(src, exist_ok=True) - with open(annotation_file_name, "w") as annotation_file: - annotation_file.write(annotation_content) + idcs = dict(train=(0, 1, 2), val=(3, 4), test=(5,)) + idcs["trainval"] = (*idcs["train"], *idcs["val"]) - return split_to_num_examples[config["split"]] + for image_set in ("test",) if is_test_set else ("train", "val", "trainval"): + self._create_image_set_file(src, image_set, idcs[image_set]) + shutil.copytree(src, root / "Segmentation") -class CityScapesTestCase(datasets_utils.ImageDatasetTestCase): - DATASET_CLASS = datasets.Cityscapes - TARGET_TYPES = ( - "instance", - "semantic", - "polygon", - "color", - ) - ADDITIONAL_CONFIGS = ( - *datasets_utils.combinations_grid(mode=("fine",), split=("train", "test", "val"), target_type=TARGET_TYPES), - *datasets_utils.combinations_grid( - mode=("coarse",), - split=("train", "train_extra", "val"), - target_type=TARGET_TYPES, - ), - ) - FEATURE_TYPES = (PIL.Image.Image, (dict, PIL.Image.Image)) + num_images = max(itertools.chain(*idcs.values())) + 1 + num_images_per_image_set = {image_set: len(idcs_) for image_set, idcs_ in idcs.items()} + return num_images, num_images_per_image_set - def inject_fake_data(self, tmpdir, config): + def _create_image_set_file(self, root, image_set, idcs): + with open(pathlib.Path(root) / f"{image_set}.txt", "w") as fh: + fh.writelines([f"{idx:06d}\n" for idx in idcs]) - tmpdir = pathlib.Path(tmpdir) + def _create_annotation_files(self, root, name, num_images): + root = pathlib.Path(root) / name + os.makedirs(root) - mode_to_splits = { - "Coarse": ["train", "train_extra", "val"], - "Fine": ["train", "test", "val"], - } + for idx in range(num_images): + annotation = self._create_annotation_file(root, f"{idx:06d}.xml") - if config["split"] == "train": # just for coverage of the number of samples - cities = ["bochum", "bremen"] - else: - cities = ["bochum"] + return annotation - polygon_target = { - "imgHeight": 1024, - "imgWidth": 2048, - "objects": [ - { - "label": "sky", - "polygon": [ - [1241, 0], - [1234, 156], - [1478, 197], - [1611, 172], - [1606, 0], - ], - }, - { - "label": "road", - "polygon": [ - [0, 448], - [1331, 274], - [1473, 265], - [2047, 605], - [2047, 1023], - [0, 1023], - ], - }, - ], - } + def _create_annotation_file(self, root, name): + def add_child(parent, name, text=None): + child = ET.SubElement(parent, name) + child.text = text + return child - for mode in ["Coarse", "Fine"]: - gt_dir = tmpdir / f"gt{mode}" - for split in mode_to_splits[mode]: - for city in cities: + def add_name(obj, name="dog"): + add_child(obj, "name", name) + return name - def make_image(name, size=10): - datasets_utils.create_image_folder( - root=gt_dir / split, - name=city, - file_name_fn=lambda _: name, - size=size, - num_examples=1, - ) + def add_bndbox(obj, bndbox=None): + if bndbox is None: + bndbox = {"xmin": "1", "xmax": "2", "ymin": "3", "ymax": "4"} - make_image(f"{city}_000000_000000_gt{mode}_instanceIds.png") - make_image(f"{city}_000000_000000_gt{mode}_labelIds.png") - make_image(f"{city}_000000_000000_gt{mode}_color.png", size=(4, 10, 10)) + obj = add_child(obj, "bndbox") + for name, text in bndbox.items(): + add_child(obj, name, text) - polygon_target_name = gt_dir / split / city / f"{city}_000000_000000_gt{mode}_polygons.json" - with open(polygon_target_name, "w") as outfile: - json.dump(polygon_target, outfile) + return bndbox - # Create leftImg8bit folder - for split in ["test", "train_extra", "train", "val"]: - for city in cities: - datasets_utils.create_image_folder( - 
root=tmpdir / "leftImg8bit" / split, - name=city, - file_name_fn=lambda _: f"{city}_000000_000000_leftImg8bit.png", - num_examples=1, - ) + annotation = ET.Element("annotation") + obj = add_child(annotation, "object") + data = dict(name=add_name(obj), bndbox=add_bndbox(obj)) - info = {"num_examples": len(cities)} - if config["target_type"] == "polygon": - info["expected_polygon_target"] = polygon_target - return info + with open(pathlib.Path(root) / name, "wb") as fh: + fh.write(ET.tostring(annotation)) - def test_combined_targets(self): - target_types = ["semantic", "polygon", "color"] + return data - with self.create_dataset(target_type=target_types) as (dataset, _): - output = dataset[0] - assert isinstance(output, tuple) - assert len(output) == 2 - assert isinstance(output[0], PIL.Image.Image) - assert isinstance(output[1], tuple) - assert len(output[1]) == 3 - assert isinstance(output[1][0], PIL.Image.Image) # semantic - assert isinstance(output[1][1], dict) # polygon - assert isinstance(output[1][2], PIL.Image.Image) # color - def test_feature_types_target_color(self): - with self.create_dataset(target_type="color") as (dataset, _): - color_img, color_target = dataset[0] - assert isinstance(color_img, PIL.Image.Image) - assert np.array(color_target).shape[2] == 4 +class VOCDetectionTestCase(VOCSegmentationTestCase): + DATASET_CLASS = datasets.VOCDetection + FEATURE_TYPES = (PIL.Image.Image, dict) - def test_feature_types_target_polygon(self): - with self.create_dataset(target_type="polygon") as (dataset, info): - polygon_img, polygon_target = dataset[0] - assert isinstance(polygon_img, PIL.Image.Image) - (polygon_target, info["expected_polygon_target"]) + def test_annotations(self): + with self.create_dataset() as (dataset, info): + _, target = dataset[0] + assert "annotation" in target + annotation = target["annotation"] -class ImageNetTestCase(datasets_utils.ImageDatasetTestCase): - DATASET_CLASS = datasets.ImageNet - REQUIRED_PACKAGES = ("scipy",) - ADDITIONAL_CONFIGS = datasets_utils.combinations_grid(split=("train", "val")) + assert "object" in annotation + objects = annotation["object"] - def inject_fake_data(self, tmpdir, config): - tmpdir = pathlib.Path(tmpdir) + assert len(objects) == 1 + object = objects[0] - wnid = "n01234567" - if config["split"] == "train": - num_examples = 3 - datasets_utils.create_image_folder( - root=tmpdir, - name=tmpdir / "train" / wnid / wnid, - file_name_fn=lambda image_idx: f"{wnid}_{image_idx}.JPEG", - num_examples=num_examples, - ) - else: - num_examples = 1 - datasets_utils.create_image_folder( - root=tmpdir, - name=tmpdir / "val" / wnid, - file_name_fn=lambda image_ifx: "ILSVRC2012_val_0000000{image_idx}.JPEG", - num_examples=num_examples, - ) + assert object == info["annotation"] - wnid_to_classes = {wnid: [1]} - torch.save((wnid_to_classes, None), tmpdir / "meta.bin") - return num_examples +class CocoDetectionTestCase(datasets_utils.ImageDatasetTestCase): + DATASET_CLASS = datasets.CocoDetection + FEATURE_TYPES = (PIL.Image.Image, list) -class CIFAR10TestCase(datasets_utils.ImageDatasetTestCase): - DATASET_CLASS = datasets.CIFAR10 - ADDITIONAL_CONFIGS = datasets_utils.combinations_grid(train=(True, False)) + REQUIRED_PACKAGES = ("pycocotools",) - _VERSION_CONFIG = dict( - base_folder="cifar-10-batches-py", - train_files=tuple(f"data_batch_{idx}" for idx in range(1, 6)), - test_files=("test_batch",), - labels_key="labels", - meta_file="batches.meta", - num_categories=10, - categories_key="label_names", - ) + _IMAGE_FOLDER = "images" + 
_ANNOTATIONS_FOLDER = "annotations" + _ANNOTATIONS_FILE = "annotations.json" - def inject_fake_data(self, tmpdir, config): - tmpdir = pathlib.Path(tmpdir) / self._VERSION_CONFIG["base_folder"] - os.makedirs(tmpdir) + def dataset_args(self, tmpdir, config): + tmpdir = pathlib.Path(tmpdir) + root = tmpdir / self._IMAGE_FOLDER + annotation_file = tmpdir / self._ANNOTATIONS_FOLDER / self._ANNOTATIONS_FILE + return root, annotation_file - num_images_per_file = 1 - for name in itertools.chain(self._VERSION_CONFIG["train_files"], self._VERSION_CONFIG["test_files"]): - self._create_batch_file(tmpdir, name, num_images_per_file) + def inject_fake_data(self, tmpdir, config): + tmpdir = pathlib.Path(tmpdir) - categories = self._create_meta_file(tmpdir) + num_images = 3 + num_annotations_per_image = 2 - return dict( - num_examples=num_images_per_file - * len(self._VERSION_CONFIG["train_files"] if config["train"] else self._VERSION_CONFIG["test_files"]), - categories=categories, + files = datasets_utils.create_image_folder( + tmpdir, name=self._IMAGE_FOLDER, file_name_fn=lambda idx: f"{idx:012d}.jpg", num_examples=num_images ) + file_names = [file.relative_to(tmpdir / self._IMAGE_FOLDER) for file in files] - def _create_batch_file(self, root, name, num_images): - np_rng = np.random.RandomState(0) - data = datasets_utils.create_image_or_video_tensor((num_images, 32 * 32 * 3)) - labels = np_rng.randint(0, self._VERSION_CONFIG["num_categories"], size=num_images).tolist() - self._create_binary_file(root, name, {"data": data, self._VERSION_CONFIG["labels_key"]: labels}) + annotation_folder = tmpdir / self._ANNOTATIONS_FOLDER + os.makedirs(annotation_folder) + info = self._create_annotation_file( + annotation_folder, self._ANNOTATIONS_FILE, file_names, num_annotations_per_image + ) - def _create_meta_file(self, root): - categories = [ - f"{idx:0{len(str(self._VERSION_CONFIG['num_categories'] - 1))}d}" - for idx in range(self._VERSION_CONFIG["num_categories"]) - ] - self._create_binary_file( - root, self._VERSION_CONFIG["meta_file"], {self._VERSION_CONFIG["categories_key"]: categories} + info["num_examples"] = num_images + return info + + def _create_annotation_file(self, root, name, file_names, num_annotations_per_image): + image_ids = [int(file_name.stem) for file_name in file_names] + images = [dict(file_name=str(file_name), id=id) for file_name, id in zip(file_names, image_ids)] + + annotations, info = self._create_annotations(image_ids, num_annotations_per_image) + self._create_json(root, name, dict(images=images, annotations=annotations)) + + return info + + def _create_annotations(self, image_ids, num_annotations_per_image): + annotations = datasets_utils.combinations_grid( + image_id=image_ids, bbox=([1.0, 2.0, 3.0, 4.0],) * num_annotations_per_image ) - return categories + for id, annotation in enumerate(annotations): + annotation["id"] = id + return annotations, dict() - def _create_binary_file(self, root, name, content): - with open(pathlib.Path(root) / name, "wb") as fh: - pickle.dump(content, fh) + def _create_json(self, root, name, content): + file = pathlib.Path(root) / name + with open(file, "w") as fh: + json.dump(content, fh) + return file - def test_class_to_idx(self): - with self.create_dataset() as (dataset, info): - expected = {category: label for label, category in enumerate(info["categories"])} - actual = dataset.class_to_idx - assert actual == expected +class CocoCaptionsTestCase(CocoDetectionTestCase): + DATASET_CLASS = datasets.CocoCaptions -class CIFAR100(CIFAR10TestCase): - 
DATASET_CLASS = datasets.CIFAR100 + def _create_annotations(self, image_ids, num_annotations_per_image): + captions = [str(idx) for idx in range(num_annotations_per_image)] + annotations = datasets_utils.combinations_grid(image_id=image_ids, caption=captions) + for id, annotation in enumerate(annotations): + annotation["id"] = id + return annotations, dict(captions=captions) - _VERSION_CONFIG = dict( - base_folder="cifar-100-python", - train_files=("train",), - test_files=("test",), - labels_key="fine_labels", - meta_file="meta", - num_categories=100, - categories_key="fine_label_names", - ) + def test_captions(self): + with self.create_dataset() as (dataset, info): + _, captions = dataset[0] + assert tuple(captions) == tuple(info["captions"]) -class CelebATestCase(datasets_utils.ImageDatasetTestCase): - DATASET_CLASS = datasets.CelebA - FEATURE_TYPES = (PIL.Image.Image, (torch.Tensor, int, tuple, type(None))) +class UCF101TestCase(datasets_utils.VideoDatasetTestCase): + DATASET_CLASS = datasets.UCF101 - ADDITIONAL_CONFIGS = datasets_utils.combinations_grid( - split=("train", "valid", "test", "all"), - target_type=("attr", "identity", "bbox", "landmarks", ["attr", "identity"]), - ) + ADDITIONAL_CONFIGS = datasets_utils.combinations_grid(fold=(1, 2, 3), train=(True, False)) - _SPLIT_TO_IDX = dict(train=0, valid=1, test=2) + _VIDEO_FOLDER = "videos" + _ANNOTATIONS_FOLDER = "annotations" + + def dataset_args(self, tmpdir, config): + tmpdir = pathlib.Path(tmpdir) + root = tmpdir / self._VIDEO_FOLDER + annotation_path = tmpdir / self._ANNOTATIONS_FOLDER + return root, annotation_path def inject_fake_data(self, tmpdir, config): - base_folder = pathlib.Path(tmpdir) / "celeba" - os.makedirs(base_folder) + tmpdir = pathlib.Path(tmpdir) - num_images, num_images_per_split = self._create_split_txt(base_folder) + video_folder = tmpdir / self._VIDEO_FOLDER + os.makedirs(video_folder) + video_files = self._create_videos(video_folder) - datasets_utils.create_image_folder( - base_folder, "img_align_celeba", lambda idx: f"{idx + 1:06d}.jpg", num_images - ) - attr_names = self._create_attr_txt(base_folder, num_images) - self._create_identity_txt(base_folder, num_images) - self._create_bbox_txt(base_folder, num_images) - self._create_landmarks_txt(base_folder, num_images) + annotations_folder = tmpdir / self._ANNOTATIONS_FOLDER + os.makedirs(annotations_folder) + num_examples = self._create_annotation_files(annotations_folder, video_files, config["fold"], config["train"]) - return dict(num_examples=num_images_per_split[config["split"]], attr_names=attr_names) + return num_examples - def _create_split_txt(self, root): - num_images_per_split = dict(train=4, valid=3, test=2) + def _create_videos(self, root, num_examples_per_class=3): + def file_name_fn(cls, idx, clips_per_group=2): + return f"v_{cls}_g{(idx // clips_per_group) + 1:02d}_c{(idx % clips_per_group) + 1:02d}.avi" - data = [ - [self._SPLIT_TO_IDX[split]] for split, num_images in num_images_per_split.items() for _ in range(num_images) + video_files = [ + datasets_utils.create_video_folder(root, cls, lambda idx: file_name_fn(cls, idx), num_examples_per_class) + for cls in ("ApplyEyeMakeup", "YoYo") ] - self._create_txt(root, "list_eval_partition.txt", data) - - num_images_per_split["all"] = num_images = sum(num_images_per_split.values()) - return num_images, num_images_per_split + return [path.relative_to(root) for path in itertools.chain(*video_files)] - def _create_attr_txt(self, root, num_images): - header = ("5_o_Clock_Shadow", "Young") - data = 
torch.rand((num_images, len(header))).ge(0.5).int().mul(2).sub(1).tolist() - self._create_txt(root, "list_attr_celeba.txt", data, header=header, add_num_examples=True) - return header + def _create_annotation_files(self, root, video_files, fold, train): + current_videos = random.sample(video_files, random.randrange(1, len(video_files) - 1)) + current_annotation = self._annotation_file_name(fold, train) + self._create_annotation_file(root, current_annotation, current_videos) - def _create_identity_txt(self, root, num_images): - data = torch.randint(1, 4, size=(num_images, 1)).tolist() - self._create_txt(root, "identity_CelebA.txt", data) + other_videos = set(video_files) - set(current_videos) + other_annotations = [ + self._annotation_file_name(fold, train) for fold, train in itertools.product((1, 2, 3), (True, False)) + ] + other_annotations.remove(current_annotation) + for name in other_annotations: + self._create_annotation_file(root, name, other_videos) - def _create_bbox_txt(self, root, num_images): - header = ("x_1", "y_1", "width", "height") - data = torch.randint(10, size=(num_images, len(header))).tolist() - self._create_txt( - root, "list_bbox_celeba.txt", data, header=header, add_num_examples=True, add_image_id_to_header=True - ) + return len(current_videos) - def _create_landmarks_txt(self, root, num_images): - header = ("lefteye_x", "rightmouth_y") - data = torch.randint(10, size=(num_images, len(header))).tolist() - self._create_txt(root, "list_landmarks_align_celeba.txt", data, header=header, add_num_examples=True) + def _annotation_file_name(self, fold, train): + return f"{'train' if train else 'test'}list{fold:02d}.txt" - def _create_txt(self, root, name, data, header=None, add_num_examples=False, add_image_id_to_header=False): + def _create_annotation_file(self, root, name, video_files): with open(pathlib.Path(root) / name, "w") as fh: - if add_num_examples: - fh.write(f"{len(data)}\n") + fh.writelines(f"{str(file).replace(os.sep, '/')}\n" for file in sorted(video_files)) - if header: - if add_image_id_to_header: - header = ("image_id", *header) - fh.write(f"{' '.join(header)}\n") - for idx, line in enumerate(data, 1): - fh.write(f"{' '.join((f'{idx:06d}.jpg', *[str(value) for value in line]))}\n") +class LSUNTestCase(datasets_utils.ImageDatasetTestCase): + DATASET_CLASS = datasets.LSUN - def test_combined_targets(self): - target_types = ["attr", "identity", "bbox", "landmarks"] + REQUIRED_PACKAGES = ("lmdb",) + ADDITIONAL_CONFIGS = datasets_utils.combinations_grid( + classes=("train", "test", "val", ["bedroom_train", "church_outdoor_train"]) + ) - individual_targets = [] - for target_type in target_types: - with self.create_dataset(target_type=target_type) as (dataset, _): - _, target = dataset[0] - individual_targets.append(target) + _CATEGORIES = ( + "bedroom", + "bridge", + "church_outdoor", + "classroom", + "conference_room", + "dining_room", + "kitchen", + "living_room", + "restaurant", + "tower", + ) - with self.create_dataset(target_type=target_types) as (dataset, _): - _, combined_targets = dataset[0] + def inject_fake_data(self, tmpdir, config): + root = pathlib.Path(tmpdir) - actual = len(individual_targets) - expected = len(combined_targets) - assert ( - actual == expected - ), "The number of the returned combined targets does not match the the number targets if requested " - f"individually: {actual} != {expected}", + num_images = 0 + for cls in self._parse_classes(config["classes"]): + num_images += self._create_lmdb(root, cls) - for target_type, 
combined_target, individual_target in zip(target_types, combined_targets, individual_targets): - with self.subTest(target_type=target_type): - actual = type(combined_target) - expected = type(individual_target) - assert ( - actual is expected - ), "Type of the combined target does not match the type of the corresponding individual target: " - f"{actual} is not {expected}", + return num_images - def test_no_target(self): - with self.create_dataset(target_type=[]) as (dataset, _): - _, target = dataset[0] + @ contextlib.contextmanager + def create_dataset(self, *args, **kwargs): + with super().create_dataset(*args, **kwargs) as output: + yield output + # Currently datasets.LSUN caches the keys in the current directory rather than in the root directory. Thus, + # this creates a number of _cache_* files in the current directory that will not be removed together + # with the temporary directory + for file in os.listdir(os.getcwd()): + if file.startswith("_cache_"): + try: + os.remove(file) + except FileNotFoundError: + # When the same test is run in parallel (in fb internal tests), a thread may remove another + # thread's file. We should be able to remove the try/except when + # https://github.com/pytorch/vision/issues/825 is fixed. + pass - assert target is None + def _parse_classes(self, classes): + if not isinstance(classes, str): + return classes - def test_attr_names(self): - with self.create_dataset() as (dataset, info): - assert tuple(dataset.attr_names) == info["attr_names"] + split = classes + if split == "test": + return [split] - def test_images_names_split(self): - with self.create_dataset(split="all") as (dataset, _): - all_imgs_names = set(dataset.filename) + return [f"{category}_{split}" for category in self._CATEGORIES] - merged_imgs_names = set() - for split in ["train", "valid", "test"]: - with self.create_dataset(split=split) as (dataset, _): - merged_imgs_names.update(dataset.filename) + def _create_lmdb(self, root, cls): + lmdb = datasets_utils.lazy_importer.lmdb + hexdigits_lowercase = string.digits + string.ascii_lowercase[:6] - assert merged_imgs_names == all_imgs_names + folder = f"{cls}_lmdb" + num_images = torch.randint(1, 4, size=()).item() + format = "png" + files = datasets_utils.create_image_folder(root, folder, lambda idx: f"{idx}.{format}", num_images) -class VOCSegmentationTestCase(datasets_utils.ImageDatasetTestCase): - DATASET_CLASS = datasets.VOCSegmentation - FEATURE_TYPES = (PIL.Image.Image, PIL.Image.Image) - - ADDITIONAL_CONFIGS = ( - *datasets_utils.combinations_grid( - year=[f"20{year:02d}" for year in range(7, 13)], image_set=("train", "val", "trainval") - ), - dict(year="2007", image_set="test"), - dict(year="2007-test", image_set="test"), - ) - - def inject_fake_data(self, tmpdir, config): - year, is_test_set = ( - ("2007", True) - if config["year"] == "2007-test" or config["image_set"] == "test" - else (config["year"], False) - ) - image_set = config["image_set"] - - base_dir = pathlib.Path(tmpdir) - if year == "2011": - base_dir /= "TrainVal" - base_dir = base_dir / "VOCdevkit" / f"VOC{year}" - os.makedirs(base_dir) + with lmdb.open(str(root / folder)) as env, env.begin(write=True) as txn: + for file in files: + key = "".join(random.choice(hexdigits_lowercase) for _ in range(40)).encode() - num_images, num_images_per_image_set = self._create_image_set_files(base_dir, "ImageSets", is_test_set) - datasets_utils.create_image_folder(base_dir, "JPEGImages", lambda idx: f"{idx:06d}.jpg", num_images) + buffer = io.BytesIO() + 
PIL.Image.open(file).save(buffer, format) + buffer.seek(0) + value = buffer.read() - datasets_utils.create_image_folder(base_dir, "SegmentationClass", lambda idx: f"{idx:06d}.png", num_images) - annotation = self._create_annotation_files(base_dir, "Annotations", num_images) + txn.put(key, value) - return dict(num_examples=num_images_per_image_set[image_set], annotation=annotation) + os.remove(file) - def _create_image_set_files(self, root, name, is_test_set): - root = pathlib.Path(root) / name - src = pathlib.Path(root) / "Main" - os.makedirs(src, exist_ok=True) + return num_images - idcs = dict(train=(0, 1, 2), val=(3, 4), test=(5,)) - idcs["trainval"] = (*idcs["train"], *idcs["val"]) + def test_not_found_or_corrupted(self): + # LSUN does not raise built-in exception, but a custom one. It is expressive enough to not 'cast' it to + # RuntimeError or FileNotFoundError that are normally checked by this test. + with pytest.raises(datasets_utils.lazy_importer.lmdb.Error): + super().test_not_found_or_corrupted() - for image_set in ("test",) if is_test_set else ("train", "val", "trainval"): - self._create_image_set_file(src, image_set, idcs[image_set]) - shutil.copytree(src, root / "Segmentation") +class KineticsTestCase(datasets_utils.VideoDatasetTestCase): + DATASET_CLASS = datasets.Kinetics + ADDITIONAL_CONFIGS = datasets_utils.combinations_grid(split=("train", "val"), num_classes=("400", "600", "700")) - num_images = max(itertools.chain(*idcs.values())) + 1 - num_images_per_image_set = {image_set: len(idcs_) for image_set, idcs_ in idcs.items()} - return num_images, num_images_per_image_set + def inject_fake_data(self, tmpdir, config): + classes = ("Abseiling", "Zumba") + num_videos_per_class = 2 + tmpdir = pathlib.Path(tmpdir) / config["split"] + digits = string.ascii_letters + string.digits + "-_" + for cls in classes: + datasets_utils.create_video_folder( + tmpdir, + cls, + lambda _: f"{datasets_utils.create_random_string(11, digits)}.mp4", + num_videos_per_class, + ) + return num_videos_per_class * len(classes) - def _create_image_set_file(self, root, image_set, idcs): - with open(pathlib.Path(root) / f"{image_set}.txt", "w") as fh: - fh.writelines([f"{idx:06d}\n" for idx in idcs]) - def _create_annotation_files(self, root, name, num_images): - root = pathlib.Path(root) / name - os.makedirs(root) +class Kinetics400TestCase(datasets_utils.VideoDatasetTestCase): + DATASET_CLASS = datasets.Kinetics400 - for idx in range(num_images): - annotation = self._create_annotation_file(root, f"{idx:06d}.xml") + def inject_fake_data(self, tmpdir, config): + classes = ("Abseiling", "Zumba") + num_videos_per_class = 2 - return annotation + digits = string.ascii_letters + string.digits + "-_" + for cls in classes: + datasets_utils.create_video_folder( + tmpdir, + cls, + lambda _: f"{datasets_utils.create_random_string(11, digits)}.avi", + num_videos_per_class, + ) - def _create_annotation_file(self, root, name): - def add_child(parent, name, text=None): - child = ET.SubElement(parent, name) - child.text = text - return child + return num_videos_per_class * len(classes) - def add_name(obj, name="dog"): - add_child(obj, "name", name) - return name - def add_bndbox(obj, bndbox=None): - if bndbox is None: - bndbox = {"xmin": "1", "xmax": "2", "ymin": "3", "ymax": "4"} +class HMDB51TestCase(datasets_utils.VideoDatasetTestCase): + DATASET_CLASS = datasets.HMDB51 - obj = add_child(obj, "bndbox") - for name, text in bndbox.items(): - add_child(obj, name, text) + ADDITIONAL_CONFIGS = 
datasets_utils.combinations_grid(fold=(1, 2, 3), train=(True, False)) - return bndbox + _VIDEO_FOLDER = "videos" + _SPLITS_FOLDER = "splits" + _CLASSES = ("brush_hair", "wave") - annotation = ET.Element("annotation") - obj = add_child(annotation, "object") - data = dict(name=add_name(obj), bndbox=add_bndbox(obj)) + def dataset_args(self, tmpdir, config): + tmpdir = pathlib.Path(tmpdir) + root = tmpdir / self._VIDEO_FOLDER + annotation_path = tmpdir / self._SPLITS_FOLDER + return root, annotation_path - with open(pathlib.Path(root) / name, "wb") as fh: - fh.write(ET.tostring(annotation)) + def inject_fake_data(self, tmpdir, config): + tmpdir = pathlib.Path(tmpdir) - return data + video_folder = tmpdir / self._VIDEO_FOLDER + os.makedirs(video_folder) + video_files = self._create_videos(video_folder) + splits_folder = tmpdir / self._SPLITS_FOLDER + os.makedirs(splits_folder) + num_examples = self._create_split_files(splits_folder, video_files, config["fold"], config["train"]) -class VOCDetectionTestCase(VOCSegmentationTestCase): - DATASET_CLASS = datasets.VOCDetection - FEATURE_TYPES = (PIL.Image.Image, dict) + return num_examples - def test_annotations(self): - with self.create_dataset() as (dataset, info): - _, target = dataset[0] + def _create_videos(self, root, num_examples_per_class=3): + def file_name_fn(cls, idx, clips_per_group=2): + return f"{cls}_{(idx // clips_per_group) + 1:d}_{(idx % clips_per_group) + 1:d}.avi" - assert "annotation" in target - annotation = target["annotation"] + return [ + ( + cls, + datasets_utils.create_video_folder( + root, + cls, + lambda idx: file_name_fn(cls, idx), + num_examples_per_class, + ), + ) + for cls in self._CLASSES + ] - assert "object" in annotation - objects = annotation["object"] + def _create_split_files(self, root, video_files, fold, train): + num_videos = num_train_videos = 0 - assert len(objects) == 1 - object = objects[0] + for cls, videos in video_files: + num_videos += len(videos) - assert object == info["annotation"] + train_videos = set(random.sample(videos, random.randrange(1, len(videos) - 1))) + num_train_videos += len(train_videos) + with open(pathlib.Path(root) / f"{cls}_test_split{fold}.txt", "w") as fh: + fh.writelines(f"{file.name} {1 if file in train_videos else 2}\n" for file in videos) -class CocoDetectionTestCase(datasets_utils.ImageDatasetTestCase): - DATASET_CLASS = datasets.CocoDetection - FEATURE_TYPES = (PIL.Image.Image, list) + return num_train_videos if train else (num_videos - num_train_videos) - REQUIRED_PACKAGES = ("pycocotools",) - _IMAGE_FOLDER = "images" - _ANNOTATIONS_FOLDER = "annotations" - _ANNOTATIONS_FILE = "annotations.json" +class OmniglotTestCase(datasets_utils.ImageDatasetTestCase): + DATASET_CLASS = datasets.Omniglot - def dataset_args(self, tmpdir, config): - tmpdir = pathlib.Path(tmpdir) - root = tmpdir / self._IMAGE_FOLDER - annotation_file = tmpdir / self._ANNOTATIONS_FOLDER / self._ANNOTATIONS_FILE - return root, annotation_file + ADDITIONAL_CONFIGS = datasets_utils.combinations_grid(background=(True, False)) def inject_fake_data(self, tmpdir, config): - tmpdir = pathlib.Path(tmpdir) + target_folder = ( + pathlib.Path(tmpdir) / "omniglot-py" / f"images_{'background' if config['background'] else 'evaluation'}" + ) + os.makedirs(target_folder) - num_images = 3 - num_annotations_per_image = 2 + num_images = 0 + for name in ("Alphabet_of_the_Magi", "Tifinagh"): + num_images += self._create_alphabet_folder(target_folder, name) - files = datasets_utils.create_image_folder( - tmpdir, 
name=self._IMAGE_FOLDER, file_name_fn=lambda idx: f"{idx:012d}.jpg", num_examples=num_images - ) - file_names = [file.relative_to(tmpdir / self._IMAGE_FOLDER) for file in files] + return num_images - annotation_folder = tmpdir / self._ANNOTATIONS_FOLDER - os.makedirs(annotation_folder) - info = self._create_annotation_file( - annotation_folder, self._ANNOTATIONS_FILE, file_names, num_annotations_per_image - ) + def _create_alphabet_folder(self, root, name): + num_images_total = 0 + for idx in range(torch.randint(1, 4, size=()).item()): + num_images = torch.randint(1, 4, size=()).item() + num_images_total += num_images - info["num_examples"] = num_images - return info + datasets_utils.create_image_folder( + root / name, f"character{idx:02d}", lambda image_idx: f"{image_idx:02d}.png", num_images + ) - def _create_annotation_file(self, root, name, file_names, num_annotations_per_image): - image_ids = [int(file_name.stem) for file_name in file_names] - images = [dict(file_name=str(file_name), id=id) for file_name, id in zip(file_names, image_ids)] + return num_images_total - annotations, info = self._create_annotations(image_ids, num_annotations_per_image) - self._create_json(root, name, dict(images=images, annotations=annotations)) - return info +class SBUTestCase(datasets_utils.ImageDatasetTestCase): + DATASET_CLASS = datasets.SBU + FEATURE_TYPES = (PIL.Image.Image, str) - def _create_annotations(self, image_ids, num_annotations_per_image): - annotations = datasets_utils.combinations_grid( - image_id=image_ids, bbox=([1.0, 2.0, 3.0, 4.0],) * num_annotations_per_image - ) - for id, annotation in enumerate(annotations): - annotation["id"] = id - return annotations, dict() + def inject_fake_data(self, tmpdir, config): + num_images = 3 - def _create_json(self, root, name, content): - file = pathlib.Path(root) / name - with open(file, "w") as fh: - json.dump(content, fh) - return file + dataset_folder = pathlib.Path(tmpdir) / "dataset" + images = datasets_utils.create_image_folder(tmpdir, "dataset", self._create_file_name, num_images) + self._create_urls_txt(dataset_folder, images) + self._create_captions_txt(dataset_folder, num_images) -class CocoCaptionsTestCase(CocoDetectionTestCase): - DATASET_CLASS = datasets.CocoCaptions - - def _create_annotations(self, image_ids, num_annotations_per_image): - captions = [str(idx) for idx in range(num_annotations_per_image)] - annotations = datasets_utils.combinations_grid(image_id=image_ids, caption=captions) - for id, annotation in enumerate(annotations): - annotation["id"] = id - return annotations, dict(captions=captions) - - def test_captions(self): - with self.create_dataset() as (dataset, info): - _, captions = dataset[0] - assert tuple(captions) == tuple(info["captions"]) + return num_images + def _create_file_name(self, idx): + part1 = datasets_utils.create_random_string(10, string.digits) + part2 = datasets_utils.create_random_string(10, string.ascii_lowercase, string.digits[:6]) + return f"{part1}_{part2}.jpg" -class UCF101TestCase(datasets_utils.VideoDatasetTestCase): - DATASET_CLASS = datasets.UCF101 + def _create_urls_txt(self, root, images): + with open(root / "SBU_captioned_photo_dataset_urls.txt", "w") as fh: + for image in images: + fh.write( + f"http://static.flickr.com/{datasets_utils.create_random_string(4, string.digits)}/{image.name}\n" + ) - ADDITIONAL_CONFIGS = datasets_utils.combinations_grid(fold=(1, 2, 3), train=(True, False)) + def _create_captions_txt(self, root, num_images): + with open(root / 
"SBU_captioned_photo_dataset_captions.txt", "w") as fh: + for _ in range(num_images): + fh.write(f"{datasets_utils.create_random_string(10)}\n") - _VIDEO_FOLDER = "videos" - _ANNOTATIONS_FOLDER = "annotations" - def dataset_args(self, tmpdir, config): - tmpdir = pathlib.Path(tmpdir) - root = tmpdir / self._VIDEO_FOLDER - annotation_path = tmpdir / self._ANNOTATIONS_FOLDER - return root, annotation_path +class SEMEIONTestCase(datasets_utils.ImageDatasetTestCase): + DATASET_CLASS = datasets.SEMEION def inject_fake_data(self, tmpdir, config): - tmpdir = pathlib.Path(tmpdir) + num_images = 3 - video_folder = tmpdir / self._VIDEO_FOLDER - os.makedirs(video_folder) - video_files = self._create_videos(video_folder) + images = torch.rand(num_images, 256) + labels = F.one_hot(torch.randint(10, size=(num_images,))) + with open(pathlib.Path(tmpdir) / "semeion.data", "w") as fh: + for image, one_hot_labels in zip(images, labels): + image_columns = " ".join([f"{pixel.item():.4f}" for pixel in image]) + labels_columns = " ".join([str(label.item()) for label in one_hot_labels]) + fh.write(f"{image_columns} {labels_columns}\n") - annotations_folder = tmpdir / self._ANNOTATIONS_FOLDER - os.makedirs(annotations_folder) - num_examples = self._create_annotation_files(annotations_folder, video_files, config["fold"], config["train"]) + return num_images - return num_examples - def _create_videos(self, root, num_examples_per_class=3): - def file_name_fn(cls, idx, clips_per_group=2): - return f"v_{cls}_g{(idx // clips_per_group) + 1:02d}_c{(idx % clips_per_group) + 1:02d}.avi" +class USPSTestCase(datasets_utils.ImageDatasetTestCase): + DATASET_CLASS = datasets.USPS - video_files = [ - datasets_utils.create_video_folder(root, cls, lambda idx: file_name_fn(cls, idx), num_examples_per_class) - for cls in ("ApplyEyeMakeup", "YoYo") - ] - return [path.relative_to(root) for path in itertools.chain(*video_files)] + ADDITIONAL_CONFIGS = datasets_utils.combinations_grid(train=(True, False)) - def _create_annotation_files(self, root, video_files, fold, train): - current_videos = random.sample(video_files, random.randrange(1, len(video_files) - 1)) - current_annotation = self._annotation_file_name(fold, train) - self._create_annotation_file(root, current_annotation, current_videos) + def inject_fake_data(self, tmpdir, config): + num_images = 2 if config["train"] else 1 - other_videos = set(video_files) - set(current_videos) - other_annotations = [ - self._annotation_file_name(fold, train) for fold, train in itertools.product((1, 2, 3), (True, False)) - ] - other_annotations.remove(current_annotation) - for name in other_annotations: - self._create_annotation_file(root, name, other_videos) + images = torch.rand(num_images, 256) * 2 - 1 + labels = torch.randint(1, 11, size=(num_images,)) - return len(current_videos) + with bz2.open(pathlib.Path(tmpdir) / f"usps{'.t' if not config['train'] else ''}.bz2", "w") as fh: + for image, label in zip(images, labels): + line = " ".join((str(label.item()), *[f"{idx}:{pixel:.6f}" for idx, pixel in enumerate(image, 1)])) + fh.write(f"{line}\n".encode()) - def _annotation_file_name(self, fold, train): - return f"{'train' if train else 'test'}list{fold:02d}.txt" + return num_images - def _create_annotation_file(self, root, name, video_files): - with open(pathlib.Path(root) / name, "w") as fh: - fh.writelines(f"{str(file).replace(os.sep, '/')}\n" for file in sorted(video_files)) +class SBDatasetTestCase(datasets_utils.ImageDatasetTestCase): + DATASET_CLASS = datasets.SBDataset + FEATURE_TYPES 
= (PIL.Image.Image, (np.ndarray, PIL.Image.Image)) -class LSUNTestCase(datasets_utils.ImageDatasetTestCase): - DATASET_CLASS = datasets.LSUN + REQUIRED_PACKAGES = ("scipy.io", "scipy.sparse") - REQUIRED_PACKAGES = ("lmdb",) ADDITIONAL_CONFIGS = datasets_utils.combinations_grid( - classes=("train", "test", "val", ["bedroom_train", "church_outdoor_train"]) + image_set=("train", "val", "train_noval"), mode=("boundaries", "segmentation") ) - _CATEGORIES = ( - "bedroom", - "bridge", - "church_outdoor", - "classroom", - "conference_room", - "dining_room", - "kitchen", - "living_room", - "restaurant", - "tower", - ) + _NUM_CLASSES = 20 def inject_fake_data(self, tmpdir, config): - root = pathlib.Path(tmpdir) - - num_images = 0 - for cls in self._parse_classes(config["classes"]): - num_images += self._create_lmdb(root, cls) - - return num_images + num_images, num_images_per_image_set = self._create_split_files(tmpdir) - @ contextlib.contextmanager - def create_dataset(self, *args, **kwargs): - with super().create_dataset(*args, **kwargs) as output: - yield output - # Currently datasets.LSUN caches the keys in the current directory rather than in the root directory. Thus, - # this creates a number of _cache_* files in the current directory that will not be removed together - # with the temporary directory - for file in os.listdir(os.getcwd()): - if file.startswith("_cache_"): - try: - os.remove(file) - except FileNotFoundError: - # When the same test is run in parallel (in fb internal tests), a thread may remove another - # thread's file. We should be able to remove the try/except when - # https://github.com/pytorch/vision/issues/825 is fixed. - pass + sizes = self._create_target_folder(tmpdir, "cls", num_images) - def _parse_classes(self, classes): - if not isinstance(classes, str): - return classes + datasets_utils.create_image_folder( + tmpdir, "img", lambda idx: f"{self._file_stem(idx)}.jpg", num_images, size=lambda idx: sizes[idx] + ) - split = classes - if split == "test": - return [split] + return num_images_per_image_set[config["image_set"]] - return [f"{category}_{split}" for category in self._CATEGORIES] + def _create_split_files(self, root): + root = pathlib.Path(root) - def _create_lmdb(self, root, cls): - lmdb = datasets_utils.lazy_importer.lmdb - hexdigits_lowercase = string.digits + string.ascii_lowercase[:6] + splits = dict(train=(0, 1, 2), train_noval=(0, 2), val=(3,)) - folder = f"{cls}_lmdb" + for split, idcs in splits.items(): + self._create_split_file(root, split, idcs) - num_images = torch.randint(1, 4, size=()).item() - format = "png" - files = datasets_utils.create_image_folder(root, folder, lambda idx: f"{idx}.{format}", num_images) + num_images = max(itertools.chain(*splits.values())) + 1 + num_images_per_split = {split: len(idcs) for split, idcs in splits.items()} + return num_images, num_images_per_split - with lmdb.open(str(root / folder)) as env, env.begin(write=True) as txn: - for file in files: - key = "".join(random.choice(hexdigits_lowercase) for _ in range(40)).encode() + def _create_split_file(self, root, name, idcs): + with open(root / f"{name}.txt", "w") as fh: + fh.writelines(f"{self._file_stem(idx)}\n" for idx in idcs) - buffer = io.BytesIO() - PIL.Image.open(file).save(buffer, format) - buffer.seek(0) - value = buffer.read() + def _create_target_folder(self, root, name, num_images): + io = datasets_utils.lazy_importer.scipy.io - txn.put(key, value) + target_folder = pathlib.Path(root) / name + os.makedirs(target_folder) - os.remove(file) + sizes = 
[torch.randint(1, 4, size=(2,)).tolist() for _ in range(num_images)] + for idx, size in enumerate(sizes): + content = dict( + GTcls=dict(Boundaries=self._create_boundaries(size), Segmentation=self._create_segmentation(size)) + ) + io.savemat(target_folder / f"{self._file_stem(idx)}.mat", content) - return num_images + return sizes - def test_not_found_or_corrupted(self): - # LSUN does not raise built-in exception, but a custom one. It is expressive enough to not 'cast' it to - # RuntimeError or FileNotFoundError that are normally checked by this test. - with pytest.raises(datasets_utils.lazy_importer.lmdb.Error): - super().test_not_found_or_corrupted() + def _create_boundaries(self, size): + sparse = datasets_utils.lazy_importer.scipy.sparse + return [ + [sparse.csc_matrix(torch.randint(0, 2, size=size, dtype=torch.uint8).numpy())] + for _ in range(self._NUM_CLASSES) + ] + def _create_segmentation(self, size): + return torch.randint(0, self._NUM_CLASSES + 1, size=size, dtype=torch.uint8).numpy() -class KineticsTestCase(datasets_utils.VideoDatasetTestCase): - DATASET_CLASS = datasets.Kinetics - ADDITIONAL_CONFIGS = datasets_utils.combinations_grid(split=("train", "val"), num_classes=("400", "600", "700")) + def _file_stem(self, idx): + return f"2008_{idx:06d}" - def inject_fake_data(self, tmpdir, config): - classes = ("Abseiling", "Zumba") - num_videos_per_class = 2 - tmpdir = pathlib.Path(tmpdir) / config["split"] - digits = string.ascii_letters + string.digits + "-_" - for cls in classes: - datasets_utils.create_video_folder( - tmpdir, - cls, - lambda _: f"{datasets_utils.create_random_string(11, digits)}.mp4", - num_videos_per_class, - ) - return num_videos_per_class * len(classes) +class FakeDataTestCase(datasets_utils.ImageDatasetTestCase): + DATASET_CLASS = datasets.FakeData + FEATURE_TYPES = (PIL.Image.Image, int) -class Kinetics400TestCase(datasets_utils.VideoDatasetTestCase): - DATASET_CLASS = datasets.Kinetics400 + def dataset_args(self, tmpdir, config): + return () def inject_fake_data(self, tmpdir, config): - classes = ("Abseiling", "Zumba") - num_videos_per_class = 2 + return config["size"] - digits = string.ascii_letters + string.digits + "-_" - for cls in classes: - datasets_utils.create_video_folder( - tmpdir, - cls, - lambda _: f"{datasets_utils.create_random_string(11, digits)}.avi", - num_videos_per_class, - ) + def test_not_found_or_corrupted(self): + self.skipTest("The data is generated at creation and thus cannot be non-existent or corrupted.") - return num_videos_per_class * len(classes) +class PhotoTourTestCase(datasets_utils.ImageDatasetTestCase): + DATASET_CLASS = datasets.PhotoTour -class HMDB51TestCase(datasets_utils.VideoDatasetTestCase): - DATASET_CLASS = datasets.HMDB51 + # The PhotoTour dataset returns examples with different features with respect to the 'train' parameter. Thus, + # we overwrite 'FEATURE_TYPES' with a dummy value to satisfy the initial checks of the base class. Furthermore, we + # overwrite the 'test_feature_types()' method to select the correct feature types before the test is run. 
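+    # A rough sketch of the override described above, assuming the
+    # `test_all_configs` decorator from the shared test utilities (the actual
+    # method lies outside this hunk's context; the sketch is shown only to make
+    # the note above concrete):
+    #
+    #     @datasets_utils.test_all_configs
+    #     def test_feature_types(self, config):
+    #         feature_types = self.FEATURE_TYPES
+    #         self.FEATURE_TYPES = self._TRAIN_FEATURE_TYPES if config["train"] else self._TEST_FEATURE_TYPES
+    #         try:
+    #             super().test_feature_types.__wrapped__(self, config)
+    #         finally:
+    #             self.FEATURE_TYPES = feature_types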
+ FEATURE_TYPES = () + _TRAIN_FEATURE_TYPES = (torch.Tensor,) + _TEST_FEATURE_TYPES = (torch.Tensor, torch.Tensor, torch.Tensor) - ADDITIONAL_CONFIGS = datasets_utils.combinations_grid(fold=(1, 2, 3), train=(True, False)) + ADDITIONAL_CONFIGS = datasets_utils.combinations_grid(train=(True, False)) - _VIDEO_FOLDER = "videos" - _SPLITS_FOLDER = "splits" - _CLASSES = ("brush_hair", "wave") - - def dataset_args(self, tmpdir, config): - tmpdir = pathlib.Path(tmpdir) - root = tmpdir / self._VIDEO_FOLDER - annotation_path = tmpdir / self._SPLITS_FOLDER - return root, annotation_path - - def inject_fake_data(self, tmpdir, config): - tmpdir = pathlib.Path(tmpdir) - - video_folder = tmpdir / self._VIDEO_FOLDER - os.makedirs(video_folder) - video_files = self._create_videos(video_folder) - - splits_folder = tmpdir / self._SPLITS_FOLDER - os.makedirs(splits_folder) - num_examples = self._create_split_files(splits_folder, video_files, config["fold"], config["train"]) - - return num_examples - - def _create_videos(self, root, num_examples_per_class=3): - def file_name_fn(cls, idx, clips_per_group=2): - return f"{cls}_{(idx // clips_per_group) + 1:d}_{(idx % clips_per_group) + 1:d}.avi" - - return [ - ( - cls, - datasets_utils.create_video_folder( - root, - cls, - lambda idx: file_name_fn(cls, idx), - num_examples_per_class, - ), - ) - for cls in self._CLASSES - ] - - def _create_split_files(self, root, video_files, fold, train): - num_videos = num_train_videos = 0 - - for cls, videos in video_files: - num_videos += len(videos) - - train_videos = set(random.sample(videos, random.randrange(1, len(videos) - 1))) - num_train_videos += len(train_videos) - - with open(pathlib.Path(root) / f"{cls}_test_split{fold}.txt", "w") as fh: - fh.writelines(f"{file.name} {1 if file in train_videos else 2}\n" for file in videos) - - return num_train_videos if train else (num_videos - num_train_videos) - - -class OmniglotTestCase(datasets_utils.ImageDatasetTestCase): - DATASET_CLASS = datasets.Omniglot - - ADDITIONAL_CONFIGS = datasets_utils.combinations_grid(background=(True, False)) - - def inject_fake_data(self, tmpdir, config): - target_folder = ( - pathlib.Path(tmpdir) / "omniglot-py" / f"images_{'background' if config['background'] else 'evaluation'}" - ) - os.makedirs(target_folder) - - num_images = 0 - for name in ("Alphabet_of_the_Magi", "Tifinagh"): - num_images += self._create_alphabet_folder(target_folder, name) - - return num_images - - def _create_alphabet_folder(self, root, name): - num_images_total = 0 - for idx in range(torch.randint(1, 4, size=()).item()): - num_images = torch.randint(1, 4, size=()).item() - num_images_total += num_images - - datasets_utils.create_image_folder( - root / name, f"character{idx:02d}", lambda image_idx: f"{image_idx:02d}.png", num_images - ) - - return num_images_total - - -class SBUTestCase(datasets_utils.ImageDatasetTestCase): - DATASET_CLASS = datasets.SBU - FEATURE_TYPES = (PIL.Image.Image, str) - - def inject_fake_data(self, tmpdir, config): - num_images = 3 - - dataset_folder = pathlib.Path(tmpdir) / "dataset" - images = datasets_utils.create_image_folder(tmpdir, "dataset", self._create_file_name, num_images) - - self._create_urls_txt(dataset_folder, images) - self._create_captions_txt(dataset_folder, num_images) - - return num_images - - def _create_file_name(self, idx): - part1 = datasets_utils.create_random_string(10, string.digits) - part2 = datasets_utils.create_random_string(10, string.ascii_lowercase, string.digits[:6]) - return f"{part1}_{part2}.jpg" - - def _create_urls_txt(self, 
root, images): - with open(root / "SBU_captioned_photo_dataset_urls.txt", "w") as fh: - for image in images: - fh.write( - f"http://static.flickr.com/{datasets_utils.create_random_string(4, string.digits)}/{image.name}\n" - ) - - def _create_captions_txt(self, root, num_images): - with open(root / "SBU_captioned_photo_dataset_captions.txt", "w") as fh: - for _ in range(num_images): - fh.write(f"{datasets_utils.create_random_string(10)}\n") - - -class SEMEIONTestCase(datasets_utils.ImageDatasetTestCase): - DATASET_CLASS = datasets.SEMEION - - def inject_fake_data(self, tmpdir, config): - num_images = 3 - - images = torch.rand(num_images, 256) - labels = F.one_hot(torch.randint(10, size=(num_images,))) - with open(pathlib.Path(tmpdir) / "semeion.data", "w") as fh: - for image, one_hot_labels in zip(images, labels): - image_columns = " ".join([f"{pixel.item():.4f}" for pixel in image]) - labels_columns = " ".join([str(label.item()) for label in one_hot_labels]) - fh.write(f"{image_columns} {labels_columns}\n") - - return num_images - - -class USPSTestCase(datasets_utils.ImageDatasetTestCase): - DATASET_CLASS = datasets.USPS - - ADDITIONAL_CONFIGS = datasets_utils.combinations_grid(train=(True, False)) - - def inject_fake_data(self, tmpdir, config): - num_images = 2 if config["train"] else 1 - - images = torch.rand(num_images, 256) * 2 - 1 - labels = torch.randint(1, 11, size=(num_images,)) - - with bz2.open(pathlib.Path(tmpdir) / f"usps{'.t' if not config['train'] else ''}.bz2", "w") as fh: - for image, label in zip(images, labels): - line = " ".join((str(label.item()), *[f"{idx}:{pixel:.6f}" for idx, pixel in enumerate(image, 1)])) - fh.write(f"{line}\n".encode()) - - return num_images - - -class SBDatasetTestCase(datasets_utils.ImageDatasetTestCase): - DATASET_CLASS = datasets.SBDataset - FEATURE_TYPES = (PIL.Image.Image, (np.ndarray, PIL.Image.Image)) - - REQUIRED_PACKAGES = ("scipy.io", "scipy.sparse") - - ADDITIONAL_CONFIGS = datasets_utils.combinations_grid( - image_set=("train", "val", "train_noval"), mode=("boundaries", "segmentation") - ) - - _NUM_CLASSES = 20 - - def inject_fake_data(self, tmpdir, config): - num_images, num_images_per_image_set = self._create_split_files(tmpdir) - - sizes = self._create_target_folder(tmpdir, "cls", num_images) - - datasets_utils.create_image_folder( - tmpdir, "img", lambda idx: f"{self._file_stem(idx)}.jpg", num_images, size=lambda idx: sizes[idx] - ) - - return num_images_per_image_set[config["image_set"]] - - def _create_split_files(self, root): - root = pathlib.Path(root) - - splits = dict(train=(0, 1, 2), train_noval=(0, 2), val=(3,)) - - for split, idcs in splits.items(): - self._create_split_file(root, split, idcs) - - num_images = max(itertools.chain(*splits.values())) + 1 - num_images_per_split = {split: len(idcs) for split, idcs in splits.items()} - return num_images, num_images_per_split - - def _create_split_file(self, root, name, idcs): - with open(root / f"{name}.txt", "w") as fh: - fh.writelines(f"{self._file_stem(idx)}\n" for idx in idcs) - - def _create_target_folder(self, root, name, num_images): - io = datasets_utils.lazy_importer.scipy.io - - target_folder = pathlib.Path(root) / name - os.makedirs(target_folder) - - sizes = [torch.randint(1, 4, size=(2,)).tolist() for _ in range(num_images)] - for idx, size in enumerate(sizes): - content = dict( - GTcls=dict(Boundaries=self._create_boundaries(size), Segmentation=self._create_segmentation(size)) - ) - io.savemat(target_folder / f"{self._file_stem(idx)}.mat", content) - - return 
sizes - - def _create_boundaries(self, size): - sparse = datasets_utils.lazy_importer.scipy.sparse - return [ - [sparse.csc_matrix(torch.randint(0, 2, size=size, dtype=torch.uint8).numpy())] - for _ in range(self._NUM_CLASSES) - ] - - def _create_segmentation(self, size): - return torch.randint(0, self._NUM_CLASSES + 1, size=size, dtype=torch.uint8).numpy() - - def _file_stem(self, idx): - return f"2008_{idx:06d}" - - -class FakeDataTestCase(datasets_utils.ImageDatasetTestCase): - DATASET_CLASS = datasets.FakeData - FEATURE_TYPES = (PIL.Image.Image, int) - - def dataset_args(self, tmpdir, config): - return () - - def inject_fake_data(self, tmpdir, config): - return config["size"] - - def test_not_found_or_corrupted(self): - self.skipTest("The data is generated at creation and thus cannot be non-existent or corrupted.") - - -class PhotoTourTestCase(datasets_utils.ImageDatasetTestCase): - DATASET_CLASS = datasets.PhotoTour - - # The PhotoTour dataset returns examples with different features with respect to the 'train' parameter. Thus, - # we overwrite 'FEATURE_TYPES' with a dummy value to satisfy the initial checks of the base class. Furthermore, we - # overwrite the 'test_feature_types()' method to select the correct feature types before the test is run. - FEATURE_TYPES = () - _TRAIN_FEATURE_TYPES = (torch.Tensor,) - _TEST_FEATURE_TYPES = (torch.Tensor, torch.Tensor, torch.Tensor) - - datasets_utils.combinations_grid(train=(True, False)) - - _NAME = "liberty" + _NAME = "liberty" def dataset_args(self, tmpdir, config): return tmpdir, self._NAME @@ -2898,341 +2335,1042 @@ def inject_fake_data(self, tmpdir: str, config): ) ) - meta_folder = data_folder / "labels" - meta_folder.mkdir() - image_ids = [str(path.relative_to(path.parents[1])).replace(os.sep, "/") for path in image_files] - image_ids_in_config = random.choices(image_ids, k=len(image_files) // 2) - with open(meta_folder / f"{config['split']}{config['partition']}.txt", "w") as file: - file.write("\n".join(image_ids_in_config) + "\n") + meta_folder = data_folder / "labels" + meta_folder.mkdir() + image_ids = [str(path.relative_to(path.parents[1])).replace(os.sep, "/") for path in image_files] + image_ids_in_config = random.choices(image_ids, k=len(image_files) // 2) + with open(meta_folder / f"{config['split']}{config['partition']}.txt", "w") as file: + file.write("\n".join(image_ids_in_config) + "\n") + + return len(image_ids_in_config) + + +class FER2013TestCase(datasets_utils.ImageDatasetTestCase): + DATASET_CLASS = datasets.FER2013 + ADDITIONAL_CONFIGS = datasets_utils.combinations_grid(split=("train", "test")) + + FEATURE_TYPES = (PIL.Image.Image, (int, type(None))) + + def inject_fake_data(self, tmpdir, config): + base_folder = os.path.join(tmpdir, "fer2013") + os.makedirs(base_folder) + + num_samples = 5 + with open(os.path.join(base_folder, f"{config['split']}.csv"), "w", newline="") as file: + writer = csv.DictWriter( + file, + fieldnames=("emotion", "pixels") if config["split"] == "train" else ("pixels",), + quoting=csv.QUOTE_NONNUMERIC, + quotechar='"', + ) + writer.writeheader() + for _ in range(num_samples): + row = dict( + pixels=" ".join( + str(pixel) for pixel in datasets_utils.create_image_or_video_tensor((48, 48)).view(-1).tolist() + ) + ) + if config["split"] == "train": + row["emotion"] = str(int(torch.randint(0, 7, ()))) + + writer.writerow(row) + + return num_samples + + +class GTSRBTestCase(datasets_utils.ImageDatasetTestCase): + DATASET_CLASS = datasets.GTSRB + FEATURE_TYPES = (PIL.Image.Image, int) + + 
ADDITIONAL_CONFIGS = datasets_utils.combinations_grid(split=("train", "test")) + + def inject_fake_data(self, tmpdir: str, config): + root_folder = os.path.join(tmpdir, "gtsrb") + os.makedirs(root_folder, exist_ok=True) + + # Train data + train_folder = os.path.join(root_folder, "GTSRB", "Training") + os.makedirs(train_folder, exist_ok=True) + + num_examples = 3 if config["split"] == "train" else 4 + classes = ("00000", "00042", "00012") + for class_idx in classes: + datasets_utils.create_image_folder( + train_folder, + name=class_idx, + file_name_fn=lambda image_idx: f"{class_idx}_{image_idx:05d}.ppm", + num_examples=num_examples, + ) + + total_number_of_examples = num_examples * len(classes) + # Test data + test_folder = os.path.join(root_folder, "GTSRB", "Final_Test", "Images") + os.makedirs(test_folder, exist_ok=True) + + with open(os.path.join(root_folder, "GT-final_test.csv"), "w") as csv_file: + csv_file.write("Filename;Width;Height;Roi.X1;Roi.Y1;Roi.X2;Roi.Y2;ClassId\n") + + for _ in range(total_number_of_examples): + image_file = datasets_utils.create_random_string(5, string.digits) + ".ppm" + datasets_utils.create_image_file(test_folder, image_file) + row = [ + image_file, + torch.randint(1, 100, size=()).item(), + torch.randint(1, 100, size=()).item(), + torch.randint(1, 100, size=()).item(), + torch.randint(1, 100, size=()).item(), + torch.randint(1, 100, size=()).item(), + torch.randint(1, 100, size=()).item(), + torch.randint(0, 43, size=()).item(), + ] + csv_file.write(";".join(map(str, row)) + "\n") + + return total_number_of_examples + + +class CLEVRClassificationTestCase(datasets_utils.ImageDatasetTestCase): + DATASET_CLASS = datasets.CLEVRClassification + FEATURE_TYPES = (PIL.Image.Image, (int, type(None))) + + ADDITIONAL_CONFIGS = datasets_utils.combinations_grid(split=("train", "val", "test")) + + def inject_fake_data(self, tmpdir, config): + data_folder = pathlib.Path(tmpdir) / "clevr" / "CLEVR_v1.0" + + images_folder = data_folder / "images" + image_files = datasets_utils.create_image_folder( + images_folder, config["split"], lambda idx: f"CLEVR_{config['split']}_{idx:06d}.png", num_examples=5 + ) + + scenes_folder = data_folder / "scenes" + scenes_folder.mkdir() + if config["split"] != "test": + with open(scenes_folder / f"CLEVR_{config['split']}_scenes.json", "w") as file: + json.dump( + dict( + info=dict(), + scenes=[ + dict(image_filename=image_file.name, objects=[dict()] * int(torch.randint(10, ()))) + for image_file in image_files + ], + ), + file, + ) + + return len(image_files) + + +class OxfordIIITPetTestCase(datasets_utils.ImageDatasetTestCase): + DATASET_CLASS = datasets.OxfordIIITPet + FEATURE_TYPES = (PIL.Image.Image, (int, PIL.Image.Image, tuple, type(None))) + + ADDITIONAL_CONFIGS = datasets_utils.combinations_grid( + split=("trainval", "test"), + target_types=("category", "segmentation", ["category", "segmentation"], []), + ) + + def inject_fake_data(self, tmpdir, config): + base_folder = os.path.join(tmpdir, "oxford-iiit-pet") + + classification_anns_meta = ( + dict(cls="Abyssinian", label=0, species="cat"), + dict(cls="Keeshond", label=18, species="dog"), + dict(cls="Yorkshire Terrier", label=37, species="dog"), + ) + split_and_classification_anns = [ + self._meta_to_split_and_classification_ann(meta, idx) + for meta, idx in itertools.product(classification_anns_meta, (1, 2, 10)) + ] + image_ids, *_ = zip(*split_and_classification_anns) + + image_files = datasets_utils.create_image_folder( + base_folder, "images", file_name_fn=lambda idx: 
f"{image_ids[idx]}.jpg", num_examples=len(image_ids) + ) + + anns_folder = os.path.join(base_folder, "annotations") + os.makedirs(anns_folder) + split_and_classification_anns_in_split = random.choices(split_and_classification_anns, k=len(image_ids) // 2) + with open(os.path.join(anns_folder, f"{config['split']}.txt"), "w", newline="") as file: + writer = csv.writer(file, delimiter=" ") + for split_and_classification_ann in split_and_classification_anns_in_split: + writer.writerow(split_and_classification_ann) + + segmentation_files = datasets_utils.create_image_folder( + anns_folder, "trimaps", file_name_fn=lambda idx: f"{image_ids[idx]}.png", num_examples=len(image_ids) + ) + + # The dataset has some rogue files + for path in image_files[:2]: + path.with_suffix(".mat").touch() + for path in segmentation_files: + path.with_name(f".{path.name}").touch() + + return len(split_and_classification_anns_in_split) + + def _meta_to_split_and_classification_ann(self, meta, idx): + image_id = "_".join( + [ + *[(str.title if meta["species"] == "cat" else str.lower)(part) for part in meta["cls"].split()], + str(idx), + ] + ) + class_id = str(meta["label"] + 1) + species = "1" if meta["species"] == "cat" else "2" + breed_id = "-1" + return (image_id, class_id, species, breed_id) + + +class StanfordCarsTestCase(datasets_utils.ImageDatasetTestCase): + DATASET_CLASS = datasets.StanfordCars + REQUIRED_PACKAGES = ("scipy",) + ADDITIONAL_CONFIGS = datasets_utils.combinations_grid(split=("train", "test")) + + def inject_fake_data(self, tmpdir, config): + import scipy.io as io + from numpy.core.records import fromarrays + + num_examples = {"train": 5, "test": 7}[config["split"]] + num_classes = 3 + base_folder = pathlib.Path(tmpdir) / "stanford_cars" + + devkit = base_folder / "devkit" + devkit.mkdir(parents=True) + + if config["split"] == "train": + images_folder_name = "cars_train" + annotations_mat_path = devkit / "cars_train_annos.mat" + else: + images_folder_name = "cars_test" + annotations_mat_path = base_folder / "cars_test_annos_withlabels.mat" + + datasets_utils.create_image_folder( + root=base_folder, + name=images_folder_name, + file_name_fn=lambda image_index: f"{image_index:5d}.jpg", + num_examples=num_examples, + ) + + classes = np.random.randint(1, num_classes + 1, num_examples, dtype=np.uint8) + fnames = [f"{i:5d}.jpg" for i in range(num_examples)] + rec_array = fromarrays( + [classes, fnames], + names=["class", "fname"], + ) + io.savemat(annotations_mat_path, {"annotations": rec_array}) + + random_class_names = ["random_name"] * num_classes + io.savemat(devkit / "cars_meta.mat", {"class_names": random_class_names}) + + return num_examples + + +class Country211TestCase(datasets_utils.ImageDatasetTestCase): + DATASET_CLASS = datasets.Country211 + + ADDITIONAL_CONFIGS = datasets_utils.combinations_grid(split=("train", "valid", "test")) + + def inject_fake_data(self, tmpdir: str, config): + split_folder = pathlib.Path(tmpdir) / "country211" / config["split"] + split_folder.mkdir(parents=True, exist_ok=True) + + num_examples = { + "train": 3, + "valid": 4, + "test": 5, + }[config["split"]] + + classes = ("AD", "BS", "GR") + for cls in classes: + datasets_utils.create_image_folder( + split_folder, + name=cls, + file_name_fn=lambda idx: f"{idx}.jpg", + num_examples=num_examples, + ) + + return num_examples * len(classes) + + +class Flowers102TestCase(datasets_utils.ImageDatasetTestCase): + DATASET_CLASS = datasets.Flowers102 + + ADDITIONAL_CONFIGS = datasets_utils.combinations_grid(split=("train", 
"val", "test")) + REQUIRED_PACKAGES = ("scipy",) + + def inject_fake_data(self, tmpdir: str, config): + base_folder = pathlib.Path(tmpdir) / "flowers-102" + + num_classes = 3 + num_images_per_split = dict(train=5, val=4, test=3) + num_images_total = sum(num_images_per_split.values()) + datasets_utils.create_image_folder( + base_folder, + "jpg", + file_name_fn=lambda idx: f"image_{idx + 1:05d}.jpg", + num_examples=num_images_total, + ) + + label_dict = dict( + labels=np.random.randint(1, num_classes + 1, size=(1, num_images_total), dtype=np.uint8), + ) + datasets_utils.lazy_importer.scipy.io.savemat(str(base_folder / "imagelabels.mat"), label_dict) + + setid_mat = np.arange(1, num_images_total + 1, dtype=np.uint16) + np.random.shuffle(setid_mat) + setid_dict = dict( + trnid=setid_mat[: num_images_per_split["train"]].reshape(1, -1), + valid=setid_mat[num_images_per_split["train"] : -num_images_per_split["test"]].reshape(1, -1), + tstid=setid_mat[-num_images_per_split["test"] :].reshape(1, -1), + ) + datasets_utils.lazy_importer.scipy.io.savemat(str(base_folder / "setid.mat"), setid_dict) + + return num_images_per_split[config["split"]] + + +class PCAMTestCase(datasets_utils.ImageDatasetTestCase): + DATASET_CLASS = datasets.PCAM + + ADDITIONAL_CONFIGS = datasets_utils.combinations_grid(split=("train", "val", "test")) + REQUIRED_PACKAGES = ("h5py",) + + def inject_fake_data(self, tmpdir: str, config): + base_folder = pathlib.Path(tmpdir) / "pcam" + base_folder.mkdir() + + num_images = {"train": 2, "test": 3, "val": 4}[config["split"]] + + images_file = datasets.PCAM._FILES[config["split"]]["images"][0] + with datasets_utils.lazy_importer.h5py.File(str(base_folder / images_file), "w") as f: + f["x"] = np.random.randint(0, 256, size=(num_images, 10, 10, 3), dtype=np.uint8) + + targets_file = datasets.PCAM._FILES[config["split"]]["targets"][0] + with datasets_utils.lazy_importer.h5py.File(str(base_folder / targets_file), "w") as f: + f["y"] = np.random.randint(0, 2, size=(num_images, 1, 1, 1), dtype=np.uint8) + + return num_images + + +class RenderedSST2TestCase(datasets_utils.ImageDatasetTestCase): + DATASET_CLASS = datasets.RenderedSST2 + ADDITIONAL_CONFIGS = datasets_utils.combinations_grid(split=("train", "val", "test")) + SPLIT_TO_FOLDER = {"train": "train", "val": "valid", "test": "test"} + + def inject_fake_data(self, tmpdir: str, config): + root_folder = pathlib.Path(tmpdir) / "rendered-sst2" + image_folder = root_folder / self.SPLIT_TO_FOLDER[config["split"]] + + num_images_per_class = {"train": 5, "test": 6, "val": 7} + sampled_classes = ["positive", "negative"] + for cls in sampled_classes: + datasets_utils.create_image_folder( + image_folder, + cls, + file_name_fn=lambda idx: f"{idx}.png", + num_examples=num_images_per_class[config["split"]], + ) + + return len(sampled_classes) * num_images_per_class[config["split"]] + + +class StereoETH3DTestCase(datasets_utils.ImageDatasetTestCase): + DATASET_CLASS = datasets.StereoETH3D + ADDITIONAL_CONFIGS = datasets_utils.combinations_grid(split=("train", "test")) + FEATURE_TYPES = (PIL.Image.Image, PIL.Image.Image, (np.ndarray, type(None)), (np.ndarray, type(None))) + + @staticmethod + def _create_scene_folder(num_examples: int, root_dir: str) -> List[pathlib.Path]: + # create the scene folder + image_paths = [] + # make the root_dir if it does not exits + os.makedirs(root_dir, exist_ok=True) + + for i in range(num_examples): + scene_dir = os.path.join(root_dir, f"scene_{i}") + os.makedirs(scene_dir, exist_ok=True) + # populate with left right 
images + image_paths.append(datasets_utils.create_image_file(root=scene_dir, name="im0.png", size=(100, 100))) + image_paths.append(datasets_utils.create_image_file(root=scene_dir, name="im1.png", size=(100, 100))) + return image_paths + + @staticmethod + def _create_annotation_folder(num_examples: int, root_dir: str) -> List[pathlib.Path]: + paths = [] + # make the root_dir if it does not exist + os.makedirs(root_dir, exist_ok=True) + + # create scene directories + for i in range(num_examples): + scene_dir = os.path.join(root_dir, f"scene_{i}") + os.makedirs(scene_dir, exist_ok=True) + # populate with a random png file for occlusion mask, and a pfm file for disparity + paths.append(datasets_utils.create_image_file(root=scene_dir, name="mask0nocc.png", size=(1, 100, 100))) + pfm_path = os.path.join(scene_dir, "disp0GT.pfm") + datasets_utils.make_fake_pfm_file(h=100, w=100, file_name=pfm_path) + paths.append(pfm_path) + return paths + + def inject_fake_data(self, tmpdir, config): + eth3d_dir = os.path.join(tmpdir, "ETH3D") + + num_examples = 2 if config["split"] == "train" else 3 + + split_name = "two_view_training" if config["split"] == "train" else "two_view_test" + split_dir = os.path.join(eth3d_dir, split_name) + self._create_scene_folder(num_examples, split_dir) + + if config["split"] == "train": + annot_dir = os.path.join(eth3d_dir, "two_view_training_gt") + self._create_annotation_folder(num_examples, annot_dir) + + return num_examples + + def test_training_test_splits(self): + with self.create_dataset(split="train") as (dataset, _): + assert dataset._images and len(dataset._images) == len(dataset._disparities), "Training images do not match training disparities" + for _, _, disparity, valid_mask in dataset: + assert len(disparity.shape) == 3 + assert len(valid_mask.shape) == 2 + _, dh, dw = disparity.shape + mh, mw = valid_mask.shape + assert dh == mh + assert dw == mw + + with self.create_dataset(split="test") as (dataset, _): + assert all(d == ("", "") for d in dataset._disparities) + for _, _, disparity, valid_mask in dataset: + assert disparity is None + assert valid_mask is None + + def test_bad_input(self): + with pytest.raises(ValueError, match="Unknown value 'bad' for argument split"): + with self.create_dataset(split="bad"): + pass + + +class CREStereoTestCase(datasets_utils.ImageDatasetTestCase): + DATASET_CLASS = datasets.CREStereo + ADDITIONAL_CONFIGS = datasets_utils.combinations_grid(split=("tree", "shapenet", "reflective", "hole")) + FEATURE_TYPES = (PIL.Image.Image, PIL.Image.Image, (np.ndarray, type(None)), (np.ndarray, type(None))) + + def inject_fake_data(self, tmpdir, config): + crestereo_dir = pathlib.Path(tmpdir) / "CREStereo" + os.makedirs(crestereo_dir, exist_ok=True) + + split_dir = crestereo_dir / config["split"] + os.makedirs(split_dir, exist_ok=True) + + num_examples = {"tree": 2, "shapenet": 3, "reflective": 6, "hole": 5}.get(config["split"], 0) + + for idx in range(num_examples): + datasets_utils.create_image_file(root=split_dir, name=f"{idx}_left.jpg", size=(100, 100)) + datasets_utils.create_image_file(root=split_dir, name=f"{idx}_right.jpg", size=(100, 100)) + # these are going to end up being gray scale images + datasets_utils.create_image_file(root=split_dir, name=f"{idx}_left.disp.jpg", size=(1, 100, 100)) + datasets_utils.create_image_file(root=split_dir, name=f"{idx}_right.disp.jpg", size=(1, 100, 100)) + + return num_examples + + def test_splits(self): + for split in ("tree", "shapenet", "reflective", "hole"): + with 
self.create_dataset(split=split) as (dataset, _): + for left, right, disparity, valid_mask in dataset: + left_array = np.array(left) + right_array = np.array(right) + h, w, c = left_array.shape + # check that left and right are the same size + assert left_array.shape == right_array.shape + # check general shapes + assert c == 3 + assert len(disparity.shape) == 3 + assert len(valid_mask.shape) == 2 + assert disparity.shape == (1, h, w) + # check that valid mask is the same size as the disparity + _, dh, dw = disparity.shape + mh, mw = valid_mask.shape + assert dh == mh + assert dw == mw + + def test_bad_input(self): + with pytest.raises(ValueError, match="Unknown value 'bad' for argument split"): + with self.create_dataset(split="bad"): + pass + + +class StereoMiddlebury2014TestCase(datasets_utils.ImageDatasetTestCase): + DATASET_CLASS = datasets.StereoMiddlebury2014 + ADDITIONAL_CONFIGS = datasets_utils.combinations_grid( + split=("train", "additional"), + calibration=("perfect", "imperfect", "both"), + use_ambient_views=(True, False), + ) + FEATURE_TYPES = (PIL.Image.Image, PIL.Image.Image, (np.ndarray, type(None)), (np.ndarray, type(None))) + + @staticmethod + def _make_scene_folder(root_dir: str, scene_name: str, split: str) -> List[str]: + calibrations = [""] if split == "test" else ["-perfect", "-imperfect"] + scene_dirs = [] + for c in calibrations: + scene_dir = os.path.join(root_dir, f"{scene_name}{c}") + os.makedirs(scene_dir, exist_ok=True) + # make normal images first + datasets_utils.create_image_file(root=scene_dir, name=f"im0.png", size=(3, 100, 100)) + datasets_utils.create_image_file(root=scene_dir, name=f"im1.png", size=(3, 100, 100)) + datasets_utils.create_image_file(root=scene_dir, name=f"im1E.png", size=(3, 100, 100)) + datasets_utils.create_image_file(root=scene_dir, name=f"im1L.png", size=(3, 100, 100)) + # these are going to end up being gray scale images + datasets_utils.make_fake_pfm_file(h=100, w=100, file_name=os.path.join(scene_dir, "disp0.pfm")) + datasets_utils.make_fake_pfm_file(h=100, w=100, file_name=os.path.join(scene_dir, "disp1.pfm")) + scene_dirs.append(scene_dir) + return scene_dirs + + def inject_fake_data(self, tmpdir, config): + split_scene_map = { + "train": ["Adirondack", "Jadeplant", "Motorcycle", "Piano"], + "additional": ["Backpack", "Bicycle1", "Cable", "Classroom1"], + "test": ["Plants", "Classroom2E", "Classroom2", "Australia"] + } + + middlebury_dir = pathlib.Path(tmpdir, "Middlebury2014") + os.makedirs(middlebury_dir, exist_ok=True) + + split_dir = middlebury_dir / config["split"] + os.makedirs(split_dir, exist_ok=True) + + num_examples = {"train": 2, "additional": 3, "test": 4}.get(config["split"], 0) + for idx in range(num_examples): + scene_name = split_scene_map[config["split"]][idx] + self._make_scene_folder(root_dir=split_dir, scene_name=scene_name, split=config["split"]) + + if config["calibration"] == "both": + num_examples *= 2 + return num_examples + + def test_train_splits(self): + for split, calibration in itertools.product(["train", "additional"], ["perfect", "imperfect", "both"]): + with self.create_dataset(split=split, calibration=calibration) as (dataset, _): + for left, right, disparity, valid_mask in dataset: + left_array = np.array(left) + right_array = np.array(right) + h, w, c = left_array.shape + # check that left and right are the same size + assert left_array.shape == right_array.shape + # check general shapes + assert c == 3 + assert len(disparity.shape) == 3 + assert len(valid_mask.shape) == 2 + assert 
disparity.shape == (1, h, w) + # check that valid mask is the same size as the disparity + _, dh, dw = disparity.shape + mh, mw = valid_mask.shape + assert dh == mh + assert dw == mw + + def test_test_split(self): + for split in ["test"]: + with self.create_dataset(split=split, calibration=None) as (dataset, _): + for left, right, disparity, valid_mask in dataset: + left_array = np.array(left) + right_array = np.array(right) + h, w, c = left_array.shape + # check that left and right are the same size + assert left_array.shape == right_array.shape + # check general shapes + assert c == 3 + assert disparity is None + assert valid_mask is None + + def test_augmented_view_usage(self): + with self.create_dataset(split="train", use_ambient_views=True) as (dataset, _): + for left, right, _, _ in dataset: + left_array = np.array(left) + right_array = np.array(right) + # check that left and right are the same size + assert left_array.shape == right_array.shape + + def test_warnings_train(self): + # a train split with no calibration argument should warn and fall back to 'perfect' + split = "train" + calibration = None + with pytest.warns( + RuntimeWarning, + match=f"\nSplit '{split}' has calibration settings, however None was provided as an argument." + f"\nSetting calibration to 'perfect' for split '{split}'. Available calibration settings are: 'perfect', 'imperfect', 'both'.", + ): + with self.create_dataset(split=split, calibration=calibration): + pass + + def test_warnings_test(self): + # the test split has no calibration settings, so passing one should warn + split = "test" + calibration = "perfect" + with pytest.warns( + RuntimeWarning, + match="\nSplit 'test' has only no calibration settings, ignoring calibration argument." + ): + with self.create_dataset(split=split, calibration=calibration): + pass + + def test_bad_input(self): + with pytest.raises(ValueError, match="Unknown value 'bad' for argument split"): + with self.create_dataset(split="bad"): + pass + + +class StereoKitti2012TestCase(datasets_utils.ImageDatasetTestCase): + DATASET_CLASS = datasets.StereoKitti2012 + ADDITIONAL_CONFIGS = datasets_utils.combinations_grid(split=("train", "test")) + FEATURE_TYPES = (PIL.Image.Image, PIL.Image.Image, (np.ndarray, type(None)), (np.ndarray, type(None))) + + def inject_fake_data(self, tmpdir, config): + kitti_dir = pathlib.Path(tmpdir) / "Kitti2012" + os.makedirs(kitti_dir, exist_ok=True) + + split_dir = kitti_dir / (config["split"] + "ing") + os.makedirs(split_dir, exist_ok=True) + + num_examples = {"train": 4, "test": 3}.get(config["split"], 0) + + datasets_utils.create_image_folder( + root=split_dir, + name="colored_0", + file_name_fn=lambda i: f"{i:06d}_10.png", + num_examples=num_examples, + size=(3, 100, 200), + ) + datasets_utils.create_image_folder( + root=split_dir, + name="colored_1", + file_name_fn=lambda i: f"{i:06d}_10.png", + num_examples=num_examples, + size=(3, 100, 200), + ) + + if config["split"] == "train": + datasets_utils.create_image_folder( + root=split_dir, + name="disp_noc", + file_name_fn=lambda i: f"{i:06d}.png", + num_examples=num_examples, + # Kitti2012 uses a single channel image for disparities + size=(1, 100, 200), + ) + + return num_examples + + def test_train_splits(self): + for split in ["train"]: + with self.create_dataset(split=split) as (dataset, _): + for left, right, disparity, valid_mask in dataset: + left_array = np.array(left) + right_array = np.array(right) + h, w, c = left_array.shape + # check that left and right are the same size + assert left_array.shape == right_array.shape + # check general shapes + 
assert c == 3 + assert len(disparity.shape) == 3 + assert len(valid_mask.shape) == 2 + assert disparity.shape == (1, h, w) + # check that valid mask is the same size as the disparity + _, dh, dw = disparity.shape + mh, mw = valid_mask.shape + assert dh == mh + assert dw == mw + + def test_test_split(self): + for split in ["test"]: + with self.create_dataset(split=split) as (dataset, _): + for left, right, disparity, valid_mask in dataset: + left_array = np.array(left) + right_array = np.array(right) + # check that left and right are the same size + assert left_array.shape == right_array.shape + assert disparity is None + assert valid_mask is None - return len(image_ids_in_config) + def test_bad_input(self): + with pytest.raises(ValueError, match="Unknown value 'bad' for argument split"): + with self.create_dataset(split="bad"): + pass -class FER2013TestCase(datasets_utils.ImageDatasetTestCase): - DATASET_CLASS = datasets.FER2013 +class StereoKitti2015TestCase(datasets_utils.ImageDatasetTestCase): + DATASET_CLASS = datasets.StereoKitti2015 ADDITIONAL_CONFIGS = datasets_utils.combinations_grid(split=("train", "test")) - - FEATURE_TYPES = (PIL.Image.Image, (int, type(None))) + FEATURE_TYPES = (PIL.Image.Image, PIL.Image.Image, (np.ndarray, type(None)), (np.ndarray, type(None))) def inject_fake_data(self, tmpdir, config): - base_folder = os.path.join(tmpdir, "fer2013") - os.makedirs(base_folder) - - num_samples = 5 - with open(os.path.join(base_folder, f"{config['split']}.csv"), "w", newline="") as file: - writer = csv.DictWriter( - file, - fieldnames=("emotion", "pixels") if config["split"] == "train" else ("pixels",), - quoting=csv.QUOTE_NONNUMERIC, - quotechar='"', - ) - writer.writeheader() - for _ in range(num_samples): - row = dict( - pixels=" ".join( - str(pixel) for pixel in datasets_utils.create_image_or_video_tensor((48, 48)).view(-1).tolist() - ) - ) - if config["split"] == "train": - row["emotion"] = str(int(torch.randint(0, 7, ()))) - - writer.writerow(row) - - return num_samples - + kitti_dir = pathlib.Path(tmpdir) / "Kitti2015" + os.makedirs(kitti_dir, exist_ok=True) -class GTSRBTestCase(datasets_utils.ImageDatasetTestCase): - DATASET_CLASS = datasets.GTSRB - FEATURE_TYPES = (PIL.Image.Image, int) + split_dir = kitti_dir / (config["split"] + "ing") + os.makedirs(split_dir, exist_ok=True) - ADDITIONAL_CONFIGS = datasets_utils.combinations_grid(split=("train", "test")) + num_examples = {"train": 4, "test": 6}.get(config["split"], 0) - def inject_fake_data(self, tmpdir: str, config): - root_folder = os.path.join(tmpdir, "gtsrb") - os.makedirs(root_folder, exist_ok=True) + datasets_utils.create_image_folder( + root=split_dir, + name="image_2", + file_name_fn=lambda i: f"{i:06d}_10.png", + num_examples=num_examples, + size=(3, 100, 200), + ) + datasets_utils.create_image_folder( + root=split_dir, + name="image_3", + file_name_fn=lambda i: f"{i:06d}_10.png", + num_examples=num_examples, + size=(3, 100, 200), + ) - # Train data - train_folder = os.path.join(root_folder, "GTSRB", "Training") - os.makedirs(train_folder, exist_ok=True) + if config["split"] == "train": + datasets_utils.create_image_folder( + root=split_dir, + name="disp_occ_0", + file_name_fn=lambda i: f"{i:06d}.png", + num_examples=num_examples, + # Kitti2015 uses a single channel image for disparities + size=(1, 100, 200), + ) - num_examples = 3 if config["split"] == "train" else 4 - classes = ("00000", "00042", "00012") - for class_idx in classes: datasets_utils.create_image_folder( - train_folder, - name=class_idx, - 
file_name_fn=lambda image_idx: f"{class_idx}_{image_idx:05d}.ppm", + root=split_dir, + name="disp_occ_1", + file_name_fn=lambda i: f"{i:06d}.png", num_examples=num_examples, + # Kitti2015 uses a single channel image for disparities + size=(1, 100, 200), ) - total_number_of_examples = num_examples * len(classes) - # Test data - test_folder = os.path.join(root_folder, "GTSRB", "Final_Test", "Images") - os.makedirs(test_folder, exist_ok=True) + return num_examples - with open(os.path.join(root_folder, "GT-final_test.csv"), "w") as csv_file: - csv_file.write("Filename;Width;Height;Roi.X1;Roi.Y1;Roi.X2;Roi.Y2;ClassId\n") + def test_train_splits(self): + for split in ["train"]: + with self.create_dataset(split=split) as (dataset, _): + for left, right, disparity, valid_mask in dataset: + left_array = np.array(left) + right_array = np.array(right) + h, w, c = left_array.shape + # check that left and right are the same size + assert left_array.shape == right_array.shape + # check general shapes + assert c == 3 + assert len(disparity.shape) == 3 + assert len(valid_mask.shape) == 2 + assert disparity.shape == (1, h, w) + # check that valid mask is the same size as the disparity + _, dh, dw = disparity.shape + mh, mw = valid_mask.shape + assert dh == mh + assert dw == mw - for _ in range(total_number_of_examples): - image_file = datasets_utils.create_random_string(5, string.digits) + ".ppm" - datasets_utils.create_image_file(test_folder, image_file) - row = [ - image_file, - torch.randint(1, 100, size=()).item(), - torch.randint(1, 100, size=()).item(), - torch.randint(1, 100, size=()).item(), - torch.randint(1, 100, size=()).item(), - torch.randint(1, 100, size=()).item(), - torch.randint(1, 100, size=()).item(), - torch.randint(0, 43, size=()).item(), - ] - csv_file.write(";".join(map(str, row)) + "\n") + def test_test_split(self): + for split in ["test"]: + with self.create_dataset(split=split) as (dataset, _): + for left, right, disparity, valid_mask in dataset: + left_array = np.array(left) + right_array = np.array(right) + # check that left and right are the same size + assert left_array.shape == right_array.shape + # check general shapes + assert disparity is None + assert valid_mask is None - return total_number_of_examples + def test_bad_input(self): + with pytest.raises(ValueError, match="Unknown value 'bad' for argument split"): + with self.create_dataset(split="bad"): + pass -class CLEVRClassificationTestCase(datasets_utils.ImageDatasetTestCase): - DATASET_CLASS = datasets.CLEVRClassification - FEATURE_TYPES = (PIL.Image.Image, (int, type(None))) +class StereoSceneFlowTestCase(datasets_utils.ImageDatasetTestCase): + DATASET_CLASS = datasets.StereoSceneFlow + ADDITIONAL_CONFIGS = datasets_utils.combinations_grid( + split=("FlyingThings3D", "Driving", "Monkaa"), + pass_name=("clean", "final") + ) + FEATURE_TYPES = (PIL.Image.Image, PIL.Image.Image, (np.ndarray, type(None)), (np.ndarray, type(None))) - ADDITIONAL_CONFIGS = datasets_utils.combinations_grid(split=("train", "val", "test")) + @staticmethod + def _create_pfm_folder(root: str, name: str, file_name_fn: Callable[..., str], num_examples: int, size: Tuple[int, int]) -> List[str]: + root = pathlib.Path(root) / name + os.makedirs(root, exist_ok=True) - def inject_fake_data(self, tmpdir, config): - data_folder = pathlib.Path(tmpdir) / "clevr" / "CLEVR_v1.0" + paths = [] + for i in range(num_examples): + datasets_utils.make_fake_pfm_file(size[0], size[1], root / file_name_fn(i)) + paths.append(str(root / file_name_fn(i))) + return paths - 
images_folder = data_folder / "images" - image_files = datasets_utils.create_image_folder( - images_folder, config["split"], lambda idx: f"CLEVR_{config['split']}_{idx:06d}.png", num_examples=5 - ) + def inject_fake_data(self, tmpdir, config): + scene_flow_dir = pathlib.Path(tmpdir) / "SceneFlow" + os.makedirs(scene_flow_dir, exist_ok=True) - scenes_folder = data_folder / "scenes" - scenes_folder.mkdir() - if config["split"] != "test": - with open(scenes_folder / f"CLEVR_{config['split']}_scenes.json", "w") as file: - json.dump( - dict( - info=dict(), - scenes=[ - dict(image_filename=image_file.name, objects=[dict()] * int(torch.randint(10, ()))) - for image_file in image_files - ], - ), - file, - ) + split_dir = scene_flow_dir / config["split"] + os.makedirs(split_dir, exist_ok=True) - return len(image_files) + pass_dir_map = { + "clean": "frames_cleanpass", + "final": "frames_finalpass", + } + num_examples = 1 + pass_dir_name = pass_dir_map.get(config["pass_name"], None) -class OxfordIIITPetTestCase(datasets_utils.ImageDatasetTestCase): - DATASET_CLASS = datasets.OxfordIIITPet - FEATURE_TYPES = (PIL.Image.Image, (int, PIL.Image.Image, tuple, type(None))) + # create pass directories + pass_dir = split_dir / pass_dir_name + disp_dir = split_dir / "disparity" + os.makedirs(pass_dir, exist_ok=True) + os.makedirs(disp_dir, exist_ok=True) - ADDITIONAL_CONFIGS = datasets_utils.combinations_grid( - split=("trainval", "test"), - target_types=("category", "segmentation", ["category", "segmentation"], []), - ) + num_examples = {"FlyingThings3D": 4, "Driving": 6, "Monkaa": 5}.get(config["split"], 0) - def inject_fake_data(self, tmpdir, config): - base_folder = os.path.join(tmpdir, "oxford-iiit-pet") + for direction in ["left", "right"]: + for scene_idx in range(num_examples): + os.makedirs(pass_dir / f"scene_{scene_idx:06d}", exist_ok=True) + datasets_utils.create_image_folder( + root=pass_dir / f"scene_{scene_idx:06d}", + name=direction, + file_name_fn=lambda i: f"{i:06d}.png", + num_examples=1, + size=(3, 200, 100), + ) - classification_anns_meta = ( - dict(cls="Abyssinian", label=0, species="cat"), - dict(cls="Keeshond", label=18, species="dog"), - dict(cls="Yorkshire Terrier", label=37, species="dog"), - ) - split_and_classification_anns = [ - self._meta_to_split_and_classification_ann(meta, idx) - for meta, idx in itertools.product(classification_anns_meta, (1, 2, 10)) - ] - image_ids, *_ = zip(*split_and_classification_anns) + os.makedirs(disp_dir / f"scene_{scene_idx:06d}", exist_ok=True) + self._create_pfm_folder( + root=disp_dir / f"scene_{scene_idx:06d}", + name=direction, + file_name_fn=lambda i: f"{i:06d}.pfm", + num_examples=1, + size=(100, 200), + ) - image_files = datasets_utils.create_image_folder( - base_folder, "images", file_name_fn=lambda idx: f"{image_ids[idx]}.jpg", num_examples=len(image_ids) - ) + return num_examples - anns_folder = os.path.join(base_folder, "annotations") - os.makedirs(anns_folder) - split_and_classification_anns_in_split = random.choices(split_and_classification_anns, k=len(image_ids) // 2) - with open(os.path.join(anns_folder, f"{config['split']}.txt"), "w", newline="") as file: - writer = csv.writer(file, delimiter=" ") - for split_and_classification_ann in split_and_classification_anns_in_split: - writer.writerow(split_and_classification_ann) + def test_splits(self): + for split_name, pass_name in itertools.product(["FlyingThings3D", "Driving", "Monkaa"], ["clean", "final"]): + with self.create_dataset(split=split_name, pass_name=pass_name) as (dataset, 
_): + for left, right, disparity, valid_mask in dataset: + left_array = np.array(left) + right_array = np.array(right) + h, w, c = left_array.shape + # check that left and right are the same size + assert left_array.shape == right_array.shape + # check general shapes + assert c == 3 + assert len(disparity.shape) == 3 + assert len(valid_mask.shape) == 2 + assert disparity.shape == (1, h, w) + # check that valid mask is the same size as the disparity + _, dh, dw = disparity.shape + mh, mw = valid_mask.shape + assert dh == mh + assert dw == mw - segmentation_files = datasets_utils.create_image_folder( - anns_folder, "trimaps", file_name_fn=lambda idx: f"{image_ids[idx]}.png", num_examples=len(image_ids) - ) + def test_bad_input(self): + with pytest.raises(ValueError, match="Unknown value 'bad' for argument split"): + with self.create_dataset(split="bad"): + pass - # The dataset has some rogue files - for path in image_files[:2]: - path.with_suffix(".mat").touch() - for path in segmentation_files: - path.with_name(f".{path.name}").touch() - return len(split_and_classification_anns_in_split) +class StereoFallingThingsTestCase(datasets_utils.ImageDatasetTestCase): + DATASET_CLASS = datasets.StereoFallingThings + ADDITIONAL_CONFIGS = datasets_utils.combinations_grid(split=("single", "mixed")) + FEATURE_TYPES = (PIL.Image.Image, PIL.Image.Image, (np.ndarray, type(None)), (np.ndarray, type(None))) - def _meta_to_split_and_classification_ann(self, meta, idx): - image_id = "_".join( - [ - *[(str.title if meta["species"] == "cat" else str.lower)(part) for part in meta["cls"].split()], - str(idx), - ] - ) - class_id = str(meta["label"] + 1) - species = "1" if meta["species"] == "cat" else "2" - breed_id = "-1" - return (image_id, class_id, species, breed_id) + @staticmethod + def _make_dummy_depth_map(root: str, name: str, size: Tuple[int, int]): + file = pathlib.Path(root) / name + image = np.ones((size[0], size[1]), dtype=np.uint8) + PIL.Image.fromarray(image).save(file) + return file + + @staticmethod + def _make_scene_folder(root: str, scene_name: str, size: Tuple[int, int]) -> List[str]: + paths = [] + root = pathlib.Path(root) / scene_name + os.makedirs(root, exist_ok=True) + # jpg images + paths.append(datasets_utils.create_image_file(root, "image1.left.jpg", size=(3, size[1], size[0]))) + paths.append(datasets_utils.create_image_file(root, "image1.right.jpg", size=(3, size[1], size[0]))) + # single channel depth maps + paths.append(StereoFallingThingsTestCase._make_dummy_depth_map(root, "image1.left.depth.png", size=(size[0], size[1]))) + paths.append(StereoFallingThingsTestCase._make_dummy_depth_map(root, "image1.right.depth.png", size=(size[0], size[1]))) + # camera settings json. 
Minimal example for _read_disparity function testing + settings_json = {"camera_settings": [{"intrinsic_settings": {"fx": 1}}]} + with open(root / "_camera_settings.json", "w") as f: + json.dump(settings_json, f) -class StanfordCarsTestCase(datasets_utils.ImageDatasetTestCase): - DATASET_CLASS = datasets.StanfordCars - REQUIRED_PACKAGES = ("scipy",) - ADDITIONAL_CONFIGS = datasets_utils.combinations_grid(split=("train", "test")) + return paths def inject_fake_data(self, tmpdir, config): - import scipy.io as io - from numpy.core.records import fromarrays + fallingthings_dir = pathlib.Path(tmpdir) / "FallingThings" + os.makedirs(fallingthings_dir, exist_ok=True) - num_examples = {"train": 5, "test": 7}[config["split"]] - num_classes = 3 - base_folder = pathlib.Path(tmpdir) / "stanford_cars" + split_dir = pathlib.Path(fallingthings_dir) / config["split"] + os.makedirs(split_dir, exist_ok=True) - devkit = base_folder / "devkit" - devkit.mkdir(parents=True) + num_examples = {"single": 2, "mixed": 3}.get(config["split"], 0) - if config["split"] == "train": - images_folder_name = "cars_train" - annotations_mat_path = devkit / "cars_train_annos.mat" - else: - images_folder_name = "cars_test" - annotations_mat_path = base_folder / "cars_test_annos_withlabels.mat" + for i in range(num_examples): + self._make_scene_folder( + root=split_dir, + scene_name=f"scene_{i:06d}", + size=(100, 200), + ) - datasets_utils.create_image_folder( - root=base_folder, - name=images_folder_name, - file_name_fn=lambda image_index: f"{image_index:5d}.jpg", - num_examples=num_examples, - ) + return num_examples - classes = np.random.randint(1, num_classes + 1, num_examples, dtype=np.uint8) - fnames = [f"{i:5d}.jpg" for i in range(num_examples)] - rec_array = fromarrays( - [classes, fnames], - names=["class", "fname"], - ) - io.savemat(annotations_mat_path, {"annotations": rec_array}) + def test_splits(self): + for split_name in ["single", "mixed"]: + with self.create_dataset(split=split_name) as (dataset, _): + for left, right, disparity, valid_mask in dataset: + left_array = np.array(left) + right_array = np.array(right) + h, w, c = left_array.shape + # check that left and right are the same size + assert left_array.shape == right_array.shape + # check general shapes + assert c == 3 + assert len(disparity.shape) == 3 + assert len(valid_mask.shape) == 2 + assert disparity.shape == (1, h, w) + # check that valid mask is the same size as the disparity + _, dh, dw = disparity.shape + mh, mw = valid_mask.shape + assert dh == mh + assert dw == mw - random_class_names = ["random_name"] * num_classes - io.savemat(devkit / "cars_meta.mat", {"class_names": random_class_names}) + def test_bad_input(self): + with pytest.raises(ValueError, match="Unknown value 'bad' for argument split"): + with self.create_dataset(split="bad"): + pass - return num_examples +class StereoSintelTestCase(datasets_utils.ImageDatasetTestCase): + DATASET_CLASS = datasets.StereoSintel + FEATURE_TYPES = (PIL.Image.Image, PIL.Image.Image, (np.ndarray, type(None)), (np.ndarray, type(None))) -class Country211TestCase(datasets_utils.ImageDatasetTestCase): - DATASET_CLASS = datasets.Country211 + def inject_fake_data(self, tmpdir, config): + sintel_dir = pathlib.Path(tmpdir) / "Sintel" + os.makedirs(sintel_dir, exist_ok=True) - ADDITIONAL_CONFIGS = datasets_utils.combinations_grid(split=("train", "valid", "test")) + split_dir = pathlib.Path(sintel_dir) / "training" + os.makedirs(split_dir, exist_ok=True) - def inject_fake_data(self, tmpdir: str, config): - 
split_folder = pathlib.Path(tmpdir) / "country211" / config["split"] - split_folder.mkdir(parents=True, exist_ok=True) + # a single setting, since there are no splits + num_examples = 4 - num_examples = { - "train": 3, - "valid": 4, - "test": 5, - }[config["split"]] + for view in ["final_left", "final_right"]: + root = split_dir / view + os.makedirs(root, exist_ok=True) - classes = ("AD", "BS", "GR") - for cls in classes: datasets_utils.create_image_folder( - split_folder, - name=cls, - file_name_fn=lambda idx: f"{idx}.jpg", + root=root, + name="scene1", + file_name_fn=lambda i: f"{i:06d}.png", num_examples=num_examples, + size=(3, 100, 200), ) - return num_examples * len(classes) - - -class Flowers102TestCase(datasets_utils.ImageDatasetTestCase): - DATASET_CLASS = datasets.Flowers102 - - ADDITIONAL_CONFIGS = datasets_utils.combinations_grid(split=("train", "val", "test")) - REQUIRED_PACKAGES = ("scipy",) - - def inject_fake_data(self, tmpdir: str, config): - base_folder = pathlib.Path(tmpdir) / "flowers-102" - - num_classes = 3 - num_images_per_split = dict(train=5, val=4, test=3) - num_images_total = sum(num_images_per_split.values()) datasets_utils.create_image_folder( - base_folder, - "jpg", - file_name_fn=lambda idx: f"image_{idx + 1:05d}.jpg", - num_examples=num_images_total, + root=split_dir / "occlusions", + name="scene1", + file_name_fn=lambda i: f"{i:06d}.png", + num_examples=num_examples, + size=(1, 100, 200), ) - label_dict = dict( - labels=np.random.randint(1, num_classes + 1, size=(1, num_images_total), dtype=np.uint8), + datasets_utils.create_image_folder( + root=split_dir / "outofframe", + name="scene1", + file_name_fn=lambda i: f"{i:06d}.png", + num_examples=num_examples, + size=(1, 100, 200), ) - datasets_utils.lazy_importer.scipy.io.savemat(str(base_folder / "imagelabels.mat"), label_dict) - setid_mat = np.arange(1, num_images_total + 1, dtype=np.uint16) - np.random.shuffle(setid_mat) - setid_dict = dict( - trnid=setid_mat[: num_images_per_split["train"]].reshape(1, -1), - valid=setid_mat[num_images_per_split["train"] : -num_images_per_split["test"]].reshape(1, -1), - tstid=setid_mat[-num_images_per_split["test"] :].reshape(1, -1), + datasets_utils.create_image_folder( + root=split_dir / "disparities", + name="scene1", + file_name_fn=lambda i: f"{i:06d}.png", + num_examples=num_examples, + size=(3, 100, 200), ) - datasets_utils.lazy_importer.scipy.io.savemat(str(base_folder / "setid.mat"), setid_dict) - - return num_images_per_split[config["split"]] + return num_examples -class PCAMTestCase(datasets_utils.ImageDatasetTestCase): - DATASET_CLASS = datasets.PCAM + def test_splits(self): + with self.create_dataset() as (dataset, _): + for left, right, disparity, valid_mask in dataset: + left_array = np.array(left) + right_array = np.array(right) + h, w, c = left_array.shape + # check that left and right are the same size + assert left_array.shape == right_array.shape + # check general shapes + assert c == 3 + assert len(disparity.shape) == 3 + assert len(valid_mask.shape) == 2 + assert disparity.shape == (1, h, w) + # check that valid mask is the same size as the disparity + _, dh, dw = disparity.shape + mh, mw = valid_mask.shape + assert dh == mh + assert dw == mw - ADDITIONAL_CONFIGS = datasets_utils.combinations_grid(split=("train", "val", "test")) - REQUIRED_PACKAGES = ("h5py",) - def inject_fake_data(self, tmpdir: str, config): - base_folder = pathlib.Path(tmpdir) / "pcam" - base_folder.mkdir() +class InStereo2k(datasets_utils.ImageDatasetTestCase): + DATASET_CLASS = 
datasets.InStereo2k + FEATURE_TYPES = (PIL.Image.Image, PIL.Image.Image, (np.ndarray, type(None)), (np.ndarray, type(None))) + ADDITIONAL_CONFIGS = datasets_utils.combinations_grid(split=("train", "test")) - num_images = {"train": 2, "test": 3, "val": 4}[config["split"]] + @staticmethod + def _make_scene_folder(root: str, name: str, size: Tuple[int, int]): + root = pathlib.Path(root) / name + os.makedirs(root, exist_ok=True) - images_file = datasets.PCAM._FILES[config["split"]]["images"][0] - with datasets_utils.lazy_importer.h5py.File(str(base_folder / images_file), "w") as f: - f["x"] = np.random.randint(0, 256, size=(num_images, 10, 10, 3), dtype=np.uint8) + datasets_utils.create_image_file(root=root, name="left.png", size=(3, size[0], size[1])) + datasets_utils.create_image_file(root=root, name="right.png", size=(3, size[0], size[1])) + datasets_utils.create_image_file(root=root, name="left_disp.png", size=(1, size[0], size[1])) + datasets_utils.create_image_file(root=root, name="right_disp.png", size=(1, size[0], size[1])) - targets_file = datasets.PCAM._FILES[config["split"]]["targets"][0] - with datasets_utils.lazy_importer.h5py.File(str(base_folder / targets_file), "w") as f: - f["y"] = np.random.randint(0, 2, size=(num_images, 1, 1, 1), dtype=np.uint8) + def inject_fake_data(self, tmpdir, config): + in_stereo_dir = pathlib.Path(tmpdir) / "InStereo2k" + os.makedirs(in_stereo_dir, exist_ok=True) - return num_images + split_dir = pathlib.Path(in_stereo_dir) / config["split"] + os.makedirs(split_dir, exist_ok=True) + num_examples = {"train": 4, "test": 5}.get(config["split"], 0) -class RenderedSST2TestCase(datasets_utils.ImageDatasetTestCase): - DATASET_CLASS = datasets.RenderedSST2 - ADDITIONAL_CONFIGS = datasets_utils.combinations_grid(split=("train", "val", "test")) - SPLIT_TO_FOLDER = {"train": "train", "val": "valid", "test": "test"} + for i in range(num_examples): + self._make_scene_folder(split_dir, f"scene_{i:06d}", (100, 200)) - def inject_fake_data(self, tmpdir: str, config): - root_folder = pathlib.Path(tmpdir) / "rendered-sst2" - image_folder = root_folder / self.SPLIT_TO_FOLDER[config["split"]] + return num_examples - num_images_per_class = {"train": 5, "test": 6, "val": 7} - sampled_classes = ["positive", "negative"] - for cls in sampled_classes: - datasets_utils.create_image_folder( - image_folder, - cls, - file_name_fn=lambda idx: f"{idx}.png", - num_examples=num_images_per_class[config["split"]], - ) + def test_splits(self): + for split_name in ["train", "test"]: + with self.create_dataset(split=split_name) as (dataset, _): + for left, right, disparity, valid_mask in dataset: + left_array = np.array(left) + right_array = np.array(right) + h, w, c = left_array.shape + # check that left and right are the same size + assert left_array.shape == right_array.shape + # check general shapes + assert c == 3 + assert len(disparity.shape) == 3 + assert len(valid_mask.shape) == 2 + assert disparity.shape == (1, h, w) + # check that valid mask is the same size as the disparity + _, dh, dw = disparity.shape + mh, mw = valid_mask.shape + assert dh == mh + assert dw == mw - return len(sampled_classes) * num_images_per_class[config["split"]] + def test_bad_input(self): + with pytest.raises(ValueError, match="Unknown value 'bad' for argument split"): + with self.create_dataset(split="bad"): + pass if __name__ == "__main__": diff --git a/torchvision/datasets/__init__.py b/torchvision/datasets/__init__.py index a7dd8397bab..8b38ba73a85 100644 --- a/torchvision/datasets/__init__.py +++ 
b/torchvision/datasets/__init__.py @@ -1,5 +1,5 @@ from ._optical_flow import KittiFlow, Sintel, FlyingChairs, FlyingThings3D, HD1K -from ._stereo_matching import StereoETH3D, StereoFallingThings, StereoKitti2012, StereoKitti2015, StereoMiddlebury2014, StereoSceneFlow, StereoSintel, CREStereoSynthetic +from ._stereo_matching import StereoETH3D, StereoFallingThings, StereoKitti2012, StereoKitti2015, StereoMiddlebury2014, StereoSceneFlow, StereoSintel, CREStereo, InStereo2k from .caltech import Caltech101, Caltech256 from .celeba import CelebA from .cifar import CIFAR10, CIFAR100 @@ -106,4 +106,13 @@ "FGVCAircraft", "EuroSAT", "RenderedSST2", + "StereoETH3D", + "StereoFallingThings", + "StereoKitti2012", + "StereoKitti2015", + "StereoMiddlebury2014", + "StereoSceneFlow", + "StereoSintel", + "CREStereo", + "InStereo2k", ) diff --git a/torchvision/datasets/_stereo_matching.py b/torchvision/datasets/_stereo_matching.py index 702386b05bd..4de0b5b0532 100644 --- a/torchvision/datasets/_stereo_matching.py +++ b/torchvision/datasets/_stereo_matching.py @@ -1,31 +1,30 @@ from abc import ABC, abstractmethod from glob import glob from pathlib import Path -import pathlib import random import re import shutil -from typing import Callable, List, Optional, Tuple, Any +from typing import Callable, List, Optional, Tuple import warnings from jsonschema import ValidationError from torch import Tensor from .vision import VisionDataset -from .utils import download_and_extract_archive, download_url, verify_str_arg +from .utils import download_and_extract_archive, verify_str_arg import os import numpy as np from PIL import Image import json __all__ = ( - "CREStereo" # waiting for download / need to find valid mask procedure + "CREStereo" "StereoMiddlebury2014" "StereoETH3D" "StereoKitti2012" "StereoKitti2015" "StereoSintel" - "StereoSceneFlow" # need to find valid mask procedure + "StereoSceneFlow" "StereoFallingThings" - "InStereo2k" # need to find valid mask procedure + "InStereo2k" ) @@ -54,13 +53,38 @@ def read_pfm_file(file_path: str) -> np.array: data = np.reshape(data, (height, width, channels)) data = np.flipud(data) - return data + # PFM files for disparity maps should contain only a single channel + # they should also be returned in (C, H, W) format + return np.transpose(data[:, :, :1], (2, 0, 1)) class StereoMatchingDataset(ABC, VisionDataset): """Base interface for Stereo matching datasets""" def __init__(self, root: str, transforms: Optional[Callable] = None): + """ + + Args: + root(str): Root directory of the dataset. + transforms(callable, optional): A function/transform that takes in Tuples of + (images, disparities, valid_masks) and returns a transformed version of each of them. + images is a Tuple of (``PIL.Image``, ``PIL.Image``) + disparities is a Tuple of (``np.ndarray``, ``np.ndarray``) with shape (1, H, W) + valid_masks is a Tuple of (``np.ndarray``, ``np.ndarray``) with shape (H, W) + + In some cases, when a dataset does not provide disparties, the ``disparities`` and + ``valid_masks`` can be Tuples containing None values. 
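+
+            For illustration only (not part of the API), an identity ``transforms``
+            callable matching this contract would look like::
+
+                def identity_transform(images, disparities, valid_masks):
+                    # images: (PIL.Image, PIL.Image); disparities / valid_masks:
+                    # tuples of np.ndarray, or None on unannotated splits
+                    return images, disparities, valid_masks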
+ + For training splits generally the datasets provide a minimal guarantee of + images: (``PIL.Image``, ``PIL.Image``) + disparities: (``np.ndarray``, ``None``) with shape (1, H, W) + valid_masks: (``np.ndarray``, ``None``) with shape (H, W) + + For some test splits, the datasets provides outputs that look like: + imgaes: (``PIL.Image``, ``PIL.Image``) + disparities: (``None``, ``None``) + valid_masks: (``None``, ``None``) + """ super().__init__(root=root) self.transforms = transforms @@ -79,6 +103,18 @@ def _read_disparity(self, file_path: str) -> Tuple: pass def __getitem__(self, index: int) -> Tuple[Tensor, Tensor, Tensor, Tensor]: + """Return example at given index. + + Args: + index(int): The index of the example to retrieve + + Returns: + tuple: A 4-tuple with ``(img_left, img_right, disparity, valid_mask)`` where ``valid_mask`` + is a numpy boolean mask of shape (H, W) + indicating which disparity values are valid. The disparity is a numpy array of + shape (1, H, W) and the images are PIL images. ``disparity`` and ``valid_mask`` are None for + datasets on which for ``split="test"`` the authors did not provide annotations. + """ img_left = self._read_img(self._images[index][0]) img_right = self._read_img(self._images[index][1]) @@ -98,21 +134,59 @@ def __len__(self) -> int: return len(self._images) -class CREStereoSynthetic(StereoMatchingDataset): +class CREStereo(StereoMatchingDataset): """Synthetic dataset used in training the `CREStereo `_ architecture. - Ported from the download script in the paper github `repo `_. - """ - DOWNLOAD_SPACE = 4 * 1024 * 1024 * 1024 # dataset requires download requires about 400 GB of free space + Dataset details on the official paper `repo `_. - EXPERIMENTAL_RANGE = 1 # TODO: remove after validating dataset structure / flow + The dataset is expected to have the following structure: :: - MAX_DISP = 256. + root + CREStereo + tree + img1_left.jpg + img1_right.jpg + img1_left.disp.jpg + img1_right.disp.jpg + img2_left.jpg + img2_right.jpg + img2_left.disp.jpg + img2_right.disp.jpg + ... + shapenet + img1_left.jpg + img1_right.jpg + img1_left.disp.jpg + img1_right.disp.jpg + ... + reflective + img1_left.jpg + img1_right.jpg + img1_left.disp.jpg + img1_right.disp.jpg + ... + hole + img1_left.jpg + img1_right.jpg + img1_left.disp.jpg + img1_right.disp.jpg + ... - def __init__(self, root: str, split: str = "tree", transforms: Optional[Callable] = None, download: bool = False): + Args: + root (str): Root directory of the dataset. + split (str): The split of the dataset to use. One of ``"tree"``, ``"shapenet"``, ``"reflective"``, ``"hole"`` + or ``"all"``. The ``"all"`` split contains all of the above splits. + transforms (callable, optional): A function/transform that takes in a sample and returns a transformed version. + download (bool, optional): If true, downloads the dataset from the internet and puts it in the root directory. + max_disparity (int, optional): Maximum disparity value. Used to compute the valid mask. 
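+
+        Example (an illustrative sketch; assumes the dataset is already present under ``root``)::
+
+            dataset = CREStereo(root="datasets", split="tree")
+            img_left, img_right, disparity, valid_mask = dataset[0]
+            # disparity is a (1, H, W) np.ndarray; valid_mask is a boolean (H, W)
+            # np.ndarray marking disparities inside (0, max_disparity)
+            masked = disparity[0][valid_mask]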
+ """ + DOWNLOAD_SPACE = 400 * 1024 * 1024 * 1024 + + def __init__(self, root: str, split: str = "tree", transforms: Optional[Callable] = None, download: bool = False, max_disparity: float = 256.): super().__init__(root, transforms) root = Path(root) / "CREStereo" + self.max_disparity = max_disparity # if the API user requests a dataset download check that the user can download it if download: @@ -149,16 +223,23 @@ def __init__(self, root: str, split: str = "tree", transforms: Optional[Callable disparity_maps_left = list(p.replace("_left", "_left.disp") for p in imgs_left) disparity_maps_right = list(p.replace("_right", "_right.disp") for p in imgs_right) + if not any(os.path.exists(file_path) for file_path in disparity_maps_left): + raise FileNotFoundError("No disparity valid maps found in {}".format(root / s)) + + if not any(os.path.exists(file_path) for file_path in disparity_maps_right): + raise FileNotFoundError("No disparity valid maps found in {}".format(root / s)) + disparity_maps = list((l, r) for l, r in zip(disparity_maps_left, disparity_maps_right)) self._disparities += disparity_maps def _read_disparity(self, file_path: str) -> Tuple: disparity = np.array(Image.open(file_path), dtype=np.float32) - valid = (disparity < self.MAX_DISP) & (disparity > 0.) + valid = (disparity < self.max_disparity) & (disparity > 0.) + # unsqueeze the disparity map into (C, H, W) format + disparity = disparity[None, :, :] return disparity, valid def _download_dataset(self, root: str) -> None: - # TODO: remove before release, used only for testing purposes dirs = ["tree", "shapenet", "reflective", "hole"] # create directory subtree for the download for d in dirs: @@ -221,11 +302,11 @@ class StereoMiddlebury2014(StereoMatchingDataset): Args: root (string): Root directory of the Middleburry 2014 Dataset. - split (string, optional): The dataset split of scenes, either "train" (default), test, or "additional" - use_ambient_views (boolean, optional): Whether to use different expose or lightning views when possible. Sampled with equal probability. + split (string, optional): The dataset split of scenes, either "train" (default), "test", or "additional" + use_ambient_views (boolean, optional): Whether to use different expose or lightning views when possible. + The dataset samples with equal probability between ``[im1.png, im1E.png, im1L.png]``. calibration (string, optional): Wether or not to use the calibrated (default) or uncalibrated scenes. - transforms (callalbe, optional): A function/transform that takes in - ``left_img, right_img, left_disparity, right_disparity`` and returns a transformed version. + transforms (callable, optional): A function/transform that takes in a sample and returns a transformed version. download (boolean, optional): Wether or not to download the dataset in the ``root`` directory. 
""" @@ -268,7 +349,7 @@ def __init__( self._download_dataset(root) root = Path(root) / "Middlebury2014" - print(split) + if not os.path.exists(root / split): raise FileNotFoundError( f"The {split} directory was not found in the provided root directory" @@ -292,24 +373,23 @@ def __init__( for calibration_suffix in calibrartion_suffixes: scene_pattern = "*" + calibration_suffix - print(scene_pattern) imgs_left = sorted(glob(str(root / split / scene_pattern / "im0.png"))) imgs_right = sorted(glob(str(root / split / scene_pattern / "im1.png"))) - if not len(imgs_left) or not len(imgs_right): raise FileNotFoundError("No images found in {}".format(root)) self._images += list((l, r) for l, r in zip(imgs_left, imgs_right)) if split == "test": - dsp_maps_left, dsp_maps_right = list("" for _ in imgs_left), list("" for _ in imgs_right) + disparity_maps_left, disparity_maps_right = list("" for _ in imgs_left), list("" for _ in imgs_right) else: + disparity_maps_left = sorted(glob(str(root / split / "*" / "disp0.pfm"))) + disparity_maps_right = sorted(glob(str(root / split / "*" / "disp1.pfm"))) + if not len(disparity_maps_left) or not len(disparity_maps_right): + raise FileNotFoundError("No disparity maps found in {}".format(root / split)) - dsp_maps_left = sorted(glob(str(root / split / "*" / "disp0.pfm"))) - dsp_maps_right = sorted(glob(str(root / split / "*" / "disp1.pfm"))) - - self._disparities += list((l, r) for l, r in zip(dsp_maps_left, dsp_maps_right)) + self._disparities += list((l, r) for l, r in zip(disparity_maps_left, disparity_maps_right)) self.use_ambient_views = use_ambient_views @@ -317,6 +397,7 @@ def __getitem__(self, index: int) -> Tuple[Tensor, Tensor, Tensor, Tensor]: return super().__getitem__(index) def _read_img(self, file_path: str) -> Image.Image: + """Function that reads either the original right image or an augmented view when ``use_ambient_views`` is True.""" if os.path.basename(file_path) == "im1.png" and self.use_ambient_views: # initialize sampleable container ambient_file_paths = list(file_path.replace("im1.png", view_name) for view_name in ["im1E.png", "im1L.png"]) @@ -332,6 +413,8 @@ def _read_disparity(self, file_path: str) -> Tuple: return None, None disparity_map = read_pfm_file(file_path) valid_mask = disparity_map < 1e3 + # remove the channel dimension from the valid mask + valid_mask = valid_mask[0, :, :] return disparity_map, valid_mask def _download_dataset(self, root: str): @@ -357,10 +440,13 @@ def _download_dataset(self, root: str): download_and_extract_archive(url=test_set_url, download_root=root, remove_finished=True) for scene_dir, scene_names, _ in os.walk(str(root / "MiddEval3/testF")): for scene in scene_names: - shutil.move(os.path.join(scene_dir, scene), os.path.join(root, scene)) + scene_dst_dir = root / "test" / scene + scene_src_dir = scene_dir / scene + os.makedirs(scene_dst_dir, exist_ok=True) + shutil.move(str(scene_src_dir), str(scene_dst_dir)) # cleanup MiddEval3 directory - shutil.rmtree(os.path.join(root, "MiddEval3")) + shutil.rmtree(str(root / "MiddEval3")) class StereoETH3D(StereoMatchingDataset): @@ -411,8 +497,7 @@ class StereoETH3D(StereoMatchingDataset): root (string): Root directory of the ETH3D Dataset. split (string, optional): The dataset split of scenes, either "train" (default) or "test". calibration (string, optional): Wether or not to use the calibrated (default) or uncalibrated scenes. 
- transforms (callalbe, optional): A function/transform that takes in - ``left_img, right_img, left_disparity, right_disparity`` and returns a transformed version. + transforms (callable, optional): A function/transform that takes in a sample and returns a transformed version. """ def __init__(self, root: str, split: str = "train", transforms: Optional[Callable] = None): @@ -427,7 +512,6 @@ def __init__(self, root: str, split: str = "train", transforms: Optional[Callabl imgs_left = sorted(glob(str(root / img_dir / "*" / "*im0.png"))) imgs_right = sorted(glob(str(root / img_dir / "*" / "*im1.png"))) - if not len(imgs_left) or not len(imgs_right): raise FileNotFoundError("No images found in {}".format(root)) @@ -435,8 +519,9 @@ def __init__(self, root: str, split: str = "train", transforms: Optional[Callabl disparity_maps_left, disparity_maps_right = list("" for _ in imgs_left), list("" for _ in imgs_right) else: disparity_maps_left = sorted(glob(str(root / anot_dir / "*" / "*0GT.pfm"))) - # no masks for the right view, always using left as reference disparity_maps_right = list("" for _ in disparity_maps_left) + if not len(disparity_maps_left): + raise FileNotFoundError("No disparity maps found in {}".format(root / anot_dir)) self._images = list((l, r) for l, r in zip(imgs_left, imgs_right)) self._disparities = list((l, r) for l, r in zip(disparity_maps_left, disparity_maps_right)) @@ -447,10 +532,10 @@ def _read_disparity(self, file_path: str) -> Tuple: disparity_map = read_pfm_file(file_path) valid_mask = Image.open(file_path.replace("disp0GT.pfm", "mask0nocc.png")) - valid_mask = np.array(valid_mask) + valid_mask = np.array(valid_mask).astype(np.bool) return disparity_map, valid_mask - def __getitem__(self, index: int) -> Tuple[Tuple, Tuple, Tuple, Tuple]: + def __getitem__(self, index: int) -> Tuple: return super().__getitem__(index) @@ -474,8 +559,7 @@ class StereoKitti2012(StereoMatchingDataset): Args: root (string): Root directory where Kitti2012 is located. split (string, optional): The dataset split of scenes, either "train" (default), test, or "additional" - transforms (callalbe, optional): A function/transform that takes in - ``left_img, right_img, left_disparity, right_disparity`` and returns a transformed version. + transforms (callable, optional): A function/transform that takes in a sample and returns a transformed version. download (boolean, optional): Wether or not to download the dataset in the ``root`` directory. """ @@ -494,6 +578,9 @@ def __init__(self, root: str, split: str = "train", transforms: Optional[Callabl if split == "train": disparity_maps_left = sorted(glob(str(root / "disp_noc" / "*.png"))) disparity_maps_right = list("" for _ in disparity_maps_left) + if not len(disparity_maps_left): + raise FileNotFoundError("No disparity maps found in {}".format(root)) + else: disparity_maps_left, disparity_maps_right = list("" for _ in imgs_left), list("" for _ in imgs_right) @@ -506,7 +593,8 @@ def _read_disparity(self, file_path: str) -> Tuple: disparity_map = np.array(Image.open(file_path)) / 256.0 valid_mask = disparity_map > 0.0 - + # unsqueeze the disparity map into (C, H, W) format + disparity_map = disparity_map[None, :, :] return disparity_map, valid_mask def __getitem__(self, index: int) -> Tuple[Tuple, Tuple, Tuple]: @@ -533,8 +621,7 @@ class StereoKitti2015(StereoMatchingDataset): Args: root (string): Root directory where Kitti2015 is located. split (string, optional): The dataset split of scenes, either "train" (default) or test. 
- transforms (callalbe, optional): A function/transform that takes in - ``left_img, right_img, left_disparity, right_disparity`` and returns a transformed version. + transforms (callable, optional): A function/transform that takes in a sample and returns a transformed version. """ def __init__(self, root: str, split: str = "train", transforms: Optional[Callable] = None): @@ -552,6 +639,9 @@ def __init__(self, root: str, split: str = "train", transforms: Optional[Callabl if split == "train": disparity_maps_left = sorted(glob(str(root / "disp_occ_0" / "*.png"))) disparity_maps_right = sorted(glob(str(root / "disp_occ_1" / "*.png"))) + if not len(disparity_maps_left) or not len(disparity_maps_right): + raise FileNotFoundError("No disparity maps found in {}".format(root)) + else: disparity_maps_left, disparity_maps_right = list("" for _ in imgs_left), list("" for _ in imgs_right) @@ -564,7 +654,8 @@ def _read_disparity(self, file_path: str) -> Tuple: disparity_map = np.array(Image.open(file_path)) / 256.0 valid_mask = disparity_map < 0.0 - + # unsqueeze the disparity map into (C, H, W) format + disparity_map = disparity_map[None, :, :] return disparity_map, valid_mask def __getitem__(self, index: int) -> Tuple[Tuple, Tuple, Tuple]: @@ -574,10 +665,45 @@ def __getitem__(self, index: int) -> Tuple[Tuple, Tuple, Tuple]: class StereoSintel(StereoMatchingDataset): """"Sintel `Stereo Dataset `_. + The dataset is expected to have the following structure: :: + + root + Sintel + training + final_left + scene1 + img1.png + img2.png + ... + ... + final_right + scene2 + img1.png + img2.png + ... + ... + disparities + scene1 + img1.png + img2.png + ... + ... + occlusions + scene1 + img1.png + img2.png + ... + ... + outofframe + scene1 + img1.png + img2.png + ... + ... + Args: root (string): Root directory where Sintel Stereo is located. - transforms (callalbe, optional): A function/transform that takes in - ``left_img, right_img, left_disparity, right_disparity`` and returns a transformed version. + transforms (callable, optional): A function/transform that takes in a sample and returns a transformed version. """ def __init__(self, root: str, transforms: Optional[Callable] = None): @@ -587,11 +713,13 @@ def __init__(self, root: str, transforms: Optional[Callable] = None): imgs_left = sorted(glob(str(root / "training" / "final_left" / "*" / "*.png"))) imgs_right = sorted(glob(str(root / "training" / "final_right" / "*" / "*.png"))) - if not len(imgs_left) or not len(imgs_right): raise FileNotFoundError("No images found in {}".format(root)) dps_masks_left = sorted(glob(str(root / "training" / "disparities" / "*" / "*.png"))) + if not len(dps_masks_left): + raise FileNotFoundError("No disparity maps found in {}".format(root)) + disparity_maps_right = list("" for _ in dps_masks_left) self._images = list((l, r) for l, r in zip(imgs_left, imgs_right)) @@ -605,7 +733,8 @@ def _read_disparity(self, file_path: str) -> Tuple: disparity_map = np.array(Image.open(file_path), dtype=np.float32) r, g, b = np.split(disparity_map, 3, axis=-1) disparity_map = r * 4 + g / (2**6) + b / (2**14) - + # reshape into (C, H, W) format + disparity_map = np.transpose(disparity_map, (2, 0, 1)) # occlusion mask valid_mask = np.array(Image.open(file_path.replace("disparities", "occlusions"))) == 0 # out of frame mask @@ -662,6 +791,10 @@ class StereoSceneFlow(StereoMatchingDataset): FlyingThings3D ... ... + + Args: + root (string): Root directory where SceneFlow is located. 
+        transforms (callable, optional): A function/transform that takes in a sample and returns a transformed version.
     """

     def __init__(self, root: str, split: str = "FlyingThings3D", pass_name: str = "clean", transforms: Optional[Callable] = None):
@@ -683,7 +816,6 @@ def __init__(self, root: str, split: str = "FlyingThings3D", pass_name: str = "c
         for p in passes:
             imgs_left = sorted(glob(str(root / p / "*" / "left" / "*.png")))
             imgs_right = sorted(glob(str(root / p / "*" / "right" / "*.png")))
-
             if not len(imgs_left) or not len(imgs_right):
                 raise FileNotFoundError("No images found in {}".format(root / p))

@@ -693,15 +825,19 @@ def __init__(self, root: str, split: str = "FlyingThings3D", pass_name: str = "c
         disparity_maps_left = [file_path.replace(p, "disparity").replace(".png", ".pfm") for file_path in imgs_left]
         disparity_maps_right = [file_path.replace(p, "disparity").replace(".png", ".pfm") for file_path in imgs_right]

+        if not any(os.path.exists(file_path) for file_path in disparity_maps_left):
+            raise FileNotFoundError("No disparity valid maps found in {}".format(root / "disparity"))
+
+        if not any(os.path.exists(file_path) for file_path in disparity_maps_right):
+            raise FileNotFoundError("No disparity valid maps found in {}".format(root / "disparity"))
+
         disparity_maps = list((l, r) for l, r in zip(disparity_maps_left, disparity_maps_right))
         self._disparities += disparity_maps

     def _read_disparity(self, file_path: str) -> Tuple:
-        if not os.path.exists(file_path):
-            raise FileNotFoundError("Disparity map {} not found".format(file_path))
-
         disparity = read_pfm_file(file_path)
-        valid = np.ones_like(disparity)
+        # keep valid mask with shape (H, W)
+        valid = np.ones(disparity.shape[1:]).astype(np.bool)
         return disparity, valid

     def __getitem__(self, index: int) -> Tuple[Tensor, Tensor, Tensor, Tensor]:
@@ -745,13 +881,20 @@ class StereoFallingThings(StereoMatchingDataset):
             ...
         scene2
             ...
+
+    Args:
+        root (string): Root directory where FallingThings is located.
+        split (string): Either "single", "mixed", or "both".
+        transforms (callable, optional): A function/transform that takes in a sample and returns a transformed version.
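+
+        Note (illustrative): disparities are not shipped directly; they are derived
+        from the depth maps and the focal length ``fx`` read from each scene's
+        ``_camera_settings.json``, as done in ``_read_disparity`` below, roughly::
+
+            fx = intrinsics["camera_settings"][0]["intrinsic_settings"]["fx"]
+            disparity = (fx * 6.0 * 100) / depth.astype(np.float32)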
+ """ def __init__(self, root: str, split: str = "single", transforms: Optional[Callable] = None): super().__init__(root, transforms) + root = Path(root) / "FallingThings" + verify_str_arg(split, "split", valid_values=("single", "mixed", "both")) - split = split.upper() splits = { "single": ["single"], @@ -760,28 +903,35 @@ def __init__(self, root: str, split: str = "single", transforms: Optional[Callab }[split] for s in splits: - imgs_left = sorted(glob(str(root / s / "*.left.jpg"))) - imgs_right = sorted(glob(str(root / s / "*.right.jpg"))) - + imgs_left = sorted(glob(str(root / s / "*" / "*.left.jpg"))) + imgs_right = sorted(glob(str(root / s / "*" / "*.right.jpg"))) if not len(imgs_left) or not len(imgs_right): raise FileNotFoundError("No images found in {}".format(root)) imgs = list((l, r) for l, r in zip(imgs_left, imgs_right)) self._images += imgs - disparity_maps_left = sorted(glob(str(root / s / "*.left.depth.png"))) - disparity_maps_right = sorted(glob(str(root / s / "*.right.depth.png"))) + disparity_maps_left = sorted(glob(str(root / s / "*" / "*.left.depth.png"))) + disparity_maps_right = sorted(glob(str(root / s / "*" / "*.right.depth.png"))) + if not len(disparity_maps_left) or not len(disparity_maps_right): + raise FileNotFoundError("No disparity maps found in {}".format(root)) disparity_maps = list((l, r) for l, r in zip(disparity_maps_left, disparity_maps_right)) self._disparities += disparity_maps def _read_disparity(self, file_path: str) -> Tuple: - depth = Image.Open(file_path) - with open(os.path.split(file_path)[0] + '_camera_settings.json', 'r') as f: + # (H, W) image + depth = np.array(Image.open(file_path)) + # as per https://research.nvidia.com/sites/default/files/pubs/2018-06_Falling-Things/readme_0.txt + # in order to extract disparity from depth maps + with open(os.path.split(file_path)[0] + '/_camera_settings.json', 'r') as f: intrinsics = json.load(f) fx = intrinsics['camera_settings'][0]['intrinsic_settings']['fx'] + # inverse of depth-from-disparity equation disparity = (fx * 6.0 * 100) / depth.astype(np.float32) valid = disparity > 0 + # unsqueeze disparity to (C, H, W) + disparity = disparity[None, :, :] return disparity, valid def __getitem__(self, index: int) -> Tuple[Tensor, Tensor, Tensor, Tensor]: @@ -789,7 +939,7 @@ def __getitem__(self, index: int) -> Tuple[Tensor, Tensor, Tensor, Tensor]: class InStereo2k(StereoMatchingDataset): - """InStereo2k ``_ dataset + """InStereo2k ``_ dataset The dataset is expected to have the following structre: :: @@ -813,6 +963,11 @@ class InStereo2k(StereoMatchingDataset): ... scene2 ... + + Args: + root (string): Root directory where InStereo2k is located. + split (string): Either "train" or "test". + transforms (callable, optional): A function/transform that takes in a sample and returns a transformed version. 
""" def __init__(self, root: str, split: str = "train", transforms: Optional[Callable] = None): @@ -820,9 +975,10 @@ def __init__(self, root: str, split: str = "train", transforms: Optional[Callabl root = Path(root) / "InStereo2k" / split + verify_str_arg(split, "split", valid_values=("train", "test")) + imgs_left = sorted(glob(str(root / "*" / "left.png"))) imgs_right = list(p.replace("left", "right") for p in imgs_left) - if not len(imgs_left) or not len(imgs_right): raise FileNotFoundError("No images found in {}".format(root)) @@ -832,10 +988,18 @@ def __init__(self, root: str, split: str = "train", transforms: Optional[Callabl disparity_maps_left = list(p.replace("left", "left_disp") for p in imgs_left) disparity_maps_right = list(p.replace("right", "right_disp") for p in imgs_left) + if not any(os.path.exists(file_path) for file_path in disparity_maps_left): + raise FileNotFoundError("No disparity valid maps found in {}".format(root)) + + if not any(os.path.exists(file_path) for file_path in disparity_maps_right): + raise FileNotFoundError("No disparity valid maps found in {}".format(root)) + disparity_maps = list((l, r) for l, r in zip(disparity_maps_left, disparity_maps_right)) self._disparities = disparity_maps def _read_disparity(self, file_path: str) -> Tuple: disparity = np.array(Image.open(file_path), dtype=np.float32) - valid = np.ones_like(disparity) + valid = np.ones_like(disparity).astype(np.bool) + # unsqueeze disparity to (C, H, W) + disparity = disparity[None, :, :] return disparity, valid From de94c2c8acd7811cb272b05cc0f94ca77f965511 Mon Sep 17 00:00:00 2001 From: Ponku Date: Thu, 14 Jul 2022 10:12:06 +0100 Subject: [PATCH 20/35] Ran ufmt. (#6259) --- torchvision/datasets/__init__.py | 12 +- torchvision/datasets/_stereo_matching.py | 195 +++++++++++++++-------- 2 files changed, 138 insertions(+), 69 deletions(-) diff --git a/torchvision/datasets/__init__.py b/torchvision/datasets/__init__.py index 8b38ba73a85..973d5ca9f7e 100644 --- a/torchvision/datasets/__init__.py +++ b/torchvision/datasets/__init__.py @@ -1,5 +1,15 @@ from ._optical_flow import KittiFlow, Sintel, FlyingChairs, FlyingThings3D, HD1K -from ._stereo_matching import StereoETH3D, StereoFallingThings, StereoKitti2012, StereoKitti2015, StereoMiddlebury2014, StereoSceneFlow, StereoSintel, CREStereo, InStereo2k +from ._stereo_matching import ( + StereoETH3D, + StereoFallingThings, + StereoKitti2012, + StereoKitti2015, + StereoMiddlebury2014, + StereoSceneFlow, + StereoSintel, + CREStereo, + InStereo2k, +) from .caltech import Caltech101, Caltech256 from .celeba import CelebA from .cifar import CIFAR10, CIFAR100 diff --git a/torchvision/datasets/_stereo_matching.py b/torchvision/datasets/_stereo_matching.py index 4de0b5b0532..3edb0f639a5 100644 --- a/torchvision/datasets/_stereo_matching.py +++ b/torchvision/datasets/_stereo_matching.py @@ -1,19 +1,21 @@ -from abc import ABC, abstractmethod -from glob import glob -from pathlib import Path +import json +import os import random import re import shutil -from typing import Callable, List, Optional, Tuple import warnings +from abc import ABC, abstractmethod +from glob import glob +from pathlib import Path +from typing import Callable, List, Optional, Tuple + +import numpy as np from jsonschema import ValidationError +from PIL import Image from torch import Tensor -from .vision import VisionDataset + from .utils import download_and_extract_archive, verify_str_arg -import os -import numpy as np -from PIL import Image -import json +from .vision import VisionDataset 
__all__ = ( "CREStereo" @@ -35,7 +37,7 @@ def read_pfm_file(file_path: str) -> np.array: if not header in [b"PF", b"Pf"]: raise ValidationError(f"Not a valid PFM file: {file_path}") - dim_match = re.match(rb'^(\d+)\s(\d+)\s$', file.readline()) + dim_match = re.match(rb"^(\d+)\s(\d+)\s$", file.readline()) if not dim_match: raise ValidationError(f"Malformed PFM header: {file_path}") @@ -45,11 +47,11 @@ def read_pfm_file(file_path: str) -> np.array: # check for endian type if scale < 0: scale = -scale - endian = '<' + endian = "<" else: - endian = '>' + endian = ">" - data = np.fromfile(file, endian + 'f') + data = np.fromfile(file, endian + "f") data = np.reshape(data, (height, width, channels)) data = np.flipud(data) @@ -126,7 +128,11 @@ def __getitem__(self, index: int) -> Tuple[Tensor, Tensor, Tensor, Tensor]: valid_masks = (valid_mask_left, valid_mask_right) if self.transforms is not None: - imgs, dsp_maps, valid_masks, = self.transforms(imgs, dsp_maps, valid_masks) + ( + imgs, + dsp_maps, + valid_masks, + ) = self.transforms(imgs, dsp_maps, valid_masks) return imgs[0], imgs[1], dsp_maps[0], valid_masks[0] @@ -135,7 +141,7 @@ def __len__(self) -> int: class CREStereo(StereoMatchingDataset): - """Synthetic dataset used in training the `CREStereo `_ architecture. + """Synthetic dataset used in training the `CREStereo `_ architecture. Dataset details on the official paper `repo `_. @@ -179,10 +185,18 @@ class CREStereo(StereoMatchingDataset): transforms (callable, optional): A function/transform that takes in a sample and returns a transformed version. download (bool, optional): If true, downloads the dataset from the internet and puts it in the root directory. max_disparity (int, optional): Maximum disparity value. Used to compute the valid mask. - """ + """ + DOWNLOAD_SPACE = 400 * 1024 * 1024 * 1024 - def __init__(self, root: str, split: str = "tree", transforms: Optional[Callable] = None, download: bool = False, max_disparity: float = 256.): + def __init__( + self, + root: str, + split: str = "tree", + transforms: Optional[Callable] = None, + download: bool = False, + max_disparity: float = 256.0, + ): super().__init__(root, transforms) root = Path(root) / "CREStereo" @@ -234,7 +248,7 @@ def __init__(self, root: str, split: str = "tree", transforms: Optional[Callable def _read_disparity(self, file_path: str) -> Tuple: disparity = np.array(Image.open(file_path), dtype=np.float32) - valid = (disparity < self.max_disparity) & (disparity > 0.) + valid = (disparity < self.max_disparity) & (disparity > 0.0) # unsqueeze the disparity map into (C, H, W) format disparity = disparity[None, :, :] return disparity, valid @@ -261,33 +275,33 @@ class StereoMiddlebury2014(StereoMatchingDataset): Middlebury2014 train scene1-{ ,perfect,imperfect} - calib.txt - im{0,1}.png - im1E.png - im1L.png - disp{0,1}.pfm - disp{0,1}-n.png - disp{0,1}-sd.pfm + calib.txt + im{0,1}.png + im1E.png + im1L.png + disp{0,1}.pfm + disp{0,1}-n.png + disp{0,1}-sd.pfm disp{0,1}y.pfm scene2-{ ,perfect,imperfect} - calib.txt - im{0,1}.png - im1E.png - im1L.png - disp{0,1}.pfm - disp{0,1}-n.png - disp{0,1}-sd.pfm + calib.txt + im{0,1}.png + im1E.png + im1L.png + disp{0,1}.pfm + disp{0,1}-n.png + disp{0,1}-sd.pfm disp{0,1}y.pfm ... additional scene1-{ ,perfect,imperfect} - calib.txt - im{0,1}.png - im1E.png - im1L.png - disp{0,1}.pfm - disp{0,1}-n.png - disp{0,1}-sd.pfm + calib.txt + im{0,1}.png + im1E.png + im1L.png + disp{0,1}.pfm + disp{0,1}-n.png + disp{0,1}-sd.pfm disp{0,1}y.pfm ... 
test @@ -305,15 +319,56 @@ class StereoMiddlebury2014(StereoMatchingDataset): split (string, optional): The dataset split of scenes, either "train" (default), "test", or "additional" use_ambient_views (boolean, optional): Whether to use different expose or lightning views when possible. The dataset samples with equal probability between ``[im1.png, im1E.png, im1L.png]``. - calibration (string, optional): Wether or not to use the calibrated (default) or uncalibrated scenes. + calibration (string, optional): Wether or not to use the calibrated (default) or uncalibrated scenes. transforms (callable, optional): A function/transform that takes in a sample and returns a transformed version. - download (boolean, optional): Wether or not to download the dataset in the ``root`` directory. + download (boolean, optional): Wether or not to download the dataset in the ``root`` directory. """ splits = { - "train": ["Adirondack", "Jadeplant", "Motorcycle", "Piano", "Pipes", "Playroom", "Playtable", "Recycle", "Shelves", "Vintage"], - "additional": ["Backpack", "Bicycle1", "Cable", "Classroom1", "Couch", "Flowers", "Mask", "Shopvac", "Sticks", "Storage", "Sword1", "Sword2", "Umbrella"], - "test": ["Plants", "Classroom2E", "Classroom2", "Australia", "DjembeL", "CrusadeP", "Crusade", "Hoops", "Bicycle2", "Staircase", "Newkuba", "AustraliaP", "Djembe", "Livingroom", "Computer"] + "train": [ + "Adirondack", + "Jadeplant", + "Motorcycle", + "Piano", + "Pipes", + "Playroom", + "Playtable", + "Recycle", + "Shelves", + "Vintage", + ], + "additional": [ + "Backpack", + "Bicycle1", + "Cable", + "Classroom1", + "Couch", + "Flowers", + "Mask", + "Shopvac", + "Sticks", + "Storage", + "Sword1", + "Sword2", + "Umbrella", + ], + "test": [ + "Plants", + "Classroom2E", + "Classroom2", + "Australia", + "DjembeL", + "CrusadeP", + "Crusade", + "Hoops", + "Bicycle2", + "Staircase", + "Newkuba", + "AustraliaP", + "Djembe", + "Livingroom", + "Computer", + ], } def __init__( @@ -323,7 +378,7 @@ def __init__( calibration: Optional[str] = "perfect", use_ambient_views: bool = False, transforms: Optional[Callable] = None, - download: bool = False + download: bool = False, ): super().__init__(root, transforms) verify_str_arg(split, "split", valid_values=("train", "test", "additional")) @@ -333,8 +388,7 @@ def __init__( if split == "test": calibration = None warnings.warn( - "\nSplit 'test' has only no calibration settings, ignoring calibration argument.", - RuntimeWarning + "\nSplit 'test' has only no calibration settings, ignoring calibration argument.", RuntimeWarning ) else: if split != "test": @@ -342,7 +396,7 @@ def __init__( warnings.warn( f"\nSplit '{split}' has calibration settings, however None was provided as an argument." f"\nSetting calibration to 'perfect' for split '{split}'. 
Available calibration settings are: 'perfect', 'imperfect', 'both'.", - RuntimeWarning + RuntimeWarning, ) if download: @@ -351,15 +405,14 @@ def __init__( root = Path(root) / "Middlebury2014" if not os.path.exists(root / split): - raise FileNotFoundError( - f"The {split} directory was not found in the provided root directory" - ) + raise FileNotFoundError(f"The {split} directory was not found in the provided root directory") split_scenes = self.splits[split] # check that the provided root folder contains the scene splits if not any( # using startswith to account for perfect / imperfect calibrartion - scene.startswith(s) for scene in os.listdir(root / split) + scene.startswith(s) + for scene in os.listdir(root / split) for s in split_scenes ): raise FileNotFoundError(f"Provided root folder does not contain any scenes from the {split} split.") @@ -429,7 +482,9 @@ def _download_dataset(self, root: str): scene_name = f"{scene}-{calibration}" for calibration in ["perfect", "imperfect"]: scene_url = f"{base_url}/{scene_name}.zip" - download_and_extract_archive(url=scene_url, filename=scene_name, download_root=str(split_root), remove_finished=True) + download_and_extract_archive( + url=scene_url, filename=scene_name, download_root=str(split_root), remove_finished=True + ) if any(s not in os.listdir(root) for s in self.splits["test"]): # test split is downloaded from a different location @@ -450,7 +505,7 @@ def _download_dataset(self, root: str): class StereoETH3D(StereoMatchingDataset): - """"ETH3D `Low-Res Two-View `_ dataset. + """ "ETH3D `Low-Res Two-View `_ dataset. The dataset is expected to have the following structure: :: @@ -458,13 +513,13 @@ class StereoETH3D(StereoMatchingDataset): ETH3D two_view_training scene1 - im1.png + im1.png im0.png images.txt cameras.txt calib.txt scene2 - im1.png + im1.png im0.png images.txt cameras.txt @@ -480,13 +535,13 @@ class StereoETH3D(StereoMatchingDataset): ... two_view_testing scene1 - im1.png + im1.png im0.png images.txt cameras.txt calib.txt scene2 - im1.png + im1.png im0.png images.txt cameras.txt @@ -496,7 +551,7 @@ class StereoETH3D(StereoMatchingDataset): Args: root (string): Root directory of the ETH3D Dataset. split (string, optional): The dataset split of scenes, either "train" (default) or "test". - calibration (string, optional): Wether or not to use the calibrated (default) or uncalibrated scenes. + calibration (string, optional): Wether or not to use the calibrated (default) or uncalibrated scenes. transforms (callable, optional): A function/transform that takes in a sample and returns a transformed version. """ @@ -540,7 +595,7 @@ def __getitem__(self, index: int) -> Tuple: class StereoKitti2012(StereoMatchingDataset): - """"Kitti dataset from the `2012 `_ stereo evaluation benchmark. + """ "Kitti dataset from the `2012 `_ stereo evaluation benchmark. Uses the RGB images for consistency with Kitti 2015. The dataset is expected to have the following structure: :: @@ -560,7 +615,7 @@ class StereoKitti2012(StereoMatchingDataset): root (string): Root directory where Kitti2012 is located. split (string, optional): The dataset split of scenes, either "train" (default), test, or "additional" transforms (callable, optional): A function/transform that takes in a sample and returns a transformed version. - download (boolean, optional): Wether or not to download the dataset in the ``root`` directory. + download (boolean, optional): Wether or not to download the dataset in the ``root`` directory. 
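+
+        Example (an illustrative sketch of how the stored ground truth is decoded,
+        mirroring ``_read_disparity``)::
+
+            # KITTI disparities are uint16 PNGs scaled by 256; a stored value of
+            # 0 means no annotation, hence the strictly positive validity check
+            disparity_map = np.array(Image.open(file_path)) / 256.0
+            valid_mask = disparity_map > 0.0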
""" def __init__(self, root: str, split: str = "train", transforms: Optional[Callable] = None): @@ -602,7 +657,7 @@ def __getitem__(self, index: int) -> Tuple[Tuple, Tuple, Tuple]: class StereoKitti2015(StereoMatchingDataset): - """"Kitti dataset from the `2015 `_ stereo evaluation benchmark. + """ "Kitti dataset from the `2015 `_ stereo evaluation benchmark. The dataset is expected to have the following structure: :: @@ -663,7 +718,7 @@ def __getitem__(self, index: int) -> Tuple[Tuple, Tuple, Tuple]: class StereoSintel(StereoMatchingDataset): - """"Sintel `Stereo Dataset `_. + """ "Sintel `Stereo Dataset `_. The dataset is expected to have the following structure: :: @@ -732,7 +787,7 @@ def _read_disparity(self, file_path: str) -> Tuple: # disparity decoding as per Sintel instructions disparity_map = np.array(Image.open(file_path), dtype=np.float32) r, g, b = np.split(disparity_map, 3, axis=-1) - disparity_map = r * 4 + g / (2**6) + b / (2**14) + disparity_map = r * 4 + g / (2 ** 6) + b / (2 ** 14) # reshape into (C, H, W) format disparity_map = np.transpose(disparity_map, (2, 0, 1)) # occlusion mask @@ -797,7 +852,9 @@ class StereoSceneFlow(StereoMatchingDataset): transforms (callable, optional): A function/transform that takes in a sample and returns a transformed version. """ - def __init__(self, root: str, split: str = "FlyingThings3D", pass_name: str = "clean", transforms: Optional[Callable] = None): + def __init__( + self, root: str, split: str = "FlyingThings3D", pass_name: str = "clean", transforms: Optional[Callable] = None + ): super().__init__(root, transforms) root = Path(root) / "SceneFlow" @@ -823,7 +880,9 @@ def __init__(self, root: str, split: str = "FlyingThings3D", pass_name: str = "c self._images += imgs disparity_maps_left = [file_path.replace(p, "disparity").replace(".png", ".pfm") for file_path in imgs_left] - disparity_maps_right = [file_path.replace(p, "disparity").replace(".png", ".pfm") for file_path in imgs_right] + disparity_maps_right = [ + file_path.replace(p, "disparity").replace(".png", ".pfm") for file_path in imgs_right + ] if not any(os.path.exists(file_path) for file_path in disparity_maps_left): raise FileNotFoundError("No disparity valid maps found in {}".format(root / "disparity")) @@ -924,9 +983,9 @@ def _read_disparity(self, file_path: str) -> Tuple: depth = np.array(Image.open(file_path)) # as per https://research.nvidia.com/sites/default/files/pubs/2018-06_Falling-Things/readme_0.txt # in order to extract disparity from depth maps - with open(os.path.split(file_path)[0] + '/_camera_settings.json', 'r') as f: + with open(os.path.split(file_path)[0] + "/_camera_settings.json", "r") as f: intrinsics = json.load(f) - fx = intrinsics['camera_settings'][0]['intrinsic_settings']['fx'] + fx = intrinsics["camera_settings"][0]["intrinsic_settings"]["fx"] # inverse of depth-from-disparity equation disparity = (fx * 6.0 * 100) / depth.astype(np.float32) valid = disparity > 0 From 4256ca455917ef4e480aeb2a7a8ca65609ca4dd4 Mon Sep 17 00:00:00 2001 From: Ponku Date: Thu, 14 Jul 2022 10:48:42 +0100 Subject: [PATCH 21/35] Adressed CI/CD errors --- torchvision/datasets/_stereo_matching.py | 41 ++++++++++++------------ 1 file changed, 20 insertions(+), 21 deletions(-) diff --git a/torchvision/datasets/_stereo_matching.py b/torchvision/datasets/_stereo_matching.py index 3edb0f639a5..254d9d2624a 100644 --- a/torchvision/datasets/_stereo_matching.py +++ b/torchvision/datasets/_stereo_matching.py @@ -10,7 +10,6 @@ from typing import Callable, List, Optional, Tuple 
import numpy as np -from jsonschema import ValidationError from PIL import Image from torch import Tensor @@ -35,11 +34,11 @@ def read_pfm_file(file_path: str) -> np.array: with open(file_path, "rb") as file: header = file.readline().rstrip() if not header in [b"PF", b"Pf"]: - raise ValidationError(f"Not a valid PFM file: {file_path}") + raise ValueError(f"Not a valid PFM file: {file_path}") dim_match = re.match(rb"^(\d+)\s(\d+)\s$", file.readline()) if not dim_match: - raise ValidationError(f"Malformed PFM header: {file_path}") + raise ValueError(f"Malformed PFM header: {file_path}") width, height = map(int, dim_match.groups()) channels = 3 if header == b"PF" else 1 @@ -231,7 +230,7 @@ def __init__( if not len(imgs_left) or not len(imgs_right): raise FileNotFoundError("No images found in {}".format(root)) - imgs = list((l, r) for l, r in zip(imgs_left, imgs_right)) + imgs = list((left, right) for left, right in zip(imgs_left, imgs_right)) self._images += imgs disparity_maps_left = list(p.replace("_left", "_left.disp") for p in imgs_left) @@ -243,7 +242,7 @@ def __init__( if not any(os.path.exists(file_path) for file_path in disparity_maps_right): raise FileNotFoundError("No disparity valid maps found in {}".format(root / s)) - disparity_maps = list((l, r) for l, r in zip(disparity_maps_left, disparity_maps_right)) + disparity_maps = list((left, right) for left, right in zip(disparity_maps_left, disparity_maps_right)) self._disparities += disparity_maps def _read_disparity(self, file_path: str) -> Tuple: @@ -432,7 +431,7 @@ def __init__( if not len(imgs_left) or not len(imgs_right): raise FileNotFoundError("No images found in {}".format(root)) - self._images += list((l, r) for l, r in zip(imgs_left, imgs_right)) + self._images += list((left, right) for left, right in zip(imgs_left, imgs_right)) if split == "test": disparity_maps_left, disparity_maps_right = list("" for _ in imgs_left), list("" for _ in imgs_right) @@ -442,7 +441,7 @@ def __init__( if not len(disparity_maps_left) or not len(disparity_maps_right): raise FileNotFoundError("No disparity maps found in {}".format(root / split)) - self._disparities += list((l, r) for l, r in zip(disparity_maps_left, disparity_maps_right)) + self._disparities += list((left, right) for left, right in zip(disparity_maps_left, disparity_maps_right)) self.use_ambient_views = use_ambient_views @@ -578,8 +577,8 @@ def __init__(self, root: str, split: str = "train", transforms: Optional[Callabl if not len(disparity_maps_left): raise FileNotFoundError("No disparity maps found in {}".format(root / anot_dir)) - self._images = list((l, r) for l, r in zip(imgs_left, imgs_right)) - self._disparities = list((l, r) for l, r in zip(disparity_maps_left, disparity_maps_right)) + self._images = list((left, right) for left, right in zip(imgs_left, imgs_right)) + self._disparities = list((left, right) for left, right in zip(disparity_maps_left, disparity_maps_right)) def _read_disparity(self, file_path: str) -> Tuple: if not os.path.exists(file_path): @@ -639,8 +638,8 @@ def __init__(self, root: str, split: str = "train", transforms: Optional[Callabl else: disparity_maps_left, disparity_maps_right = list("" for _ in imgs_left), list("" for _ in imgs_right) - self._images = list((l, r) for l, r in zip(imgs_left, imgs_right)) - self._disparities = list((l, r) for l, r in zip(disparity_maps_left, disparity_maps_right)) + self._images = list((left, right) for left, right in zip(imgs_left, imgs_right)) + self._disparities = list((left, right) for left, right in 
zip(disparity_maps_left, disparity_maps_right)) def _read_disparity(self, file_path: str) -> Tuple: if not os.path.exists(file_path): @@ -700,8 +699,8 @@ def __init__(self, root: str, split: str = "train", transforms: Optional[Callabl else: disparity_maps_left, disparity_maps_right = list("" for _ in imgs_left), list("" for _ in imgs_right) - self._images = list((l, r) for l, r in zip(imgs_left, imgs_right)) - self._disparities = list((l, r) for l, r in zip(disparity_maps_left, disparity_maps_right)) + self._images = list((left, right) for left, right in zip(imgs_left, imgs_right)) + self._disparities = list((left, right) for left, right in zip(disparity_maps_left, disparity_maps_right)) def _read_disparity(self, file_path: str) -> Tuple: if not os.path.exists(file_path): @@ -777,8 +776,8 @@ def __init__(self, root: str, transforms: Optional[Callable] = None): disparity_maps_right = list("" for _ in dps_masks_left) - self._images = list((l, r) for l, r in zip(imgs_left, imgs_right)) - self._disparities = list((l, r) for l, r in zip(dps_masks_left, disparity_maps_right)) + self._images = list((left, right) for left, right in zip(imgs_left, imgs_right)) + self._disparities = list((left, right) for left, right in zip(dps_masks_left, disparity_maps_right)) def _read_disparity(self, file_path: str) -> Tuple: if not os.path.exists(file_path): @@ -876,7 +875,7 @@ def __init__( if not len(imgs_left) or not len(imgs_right): raise FileNotFoundError("No images found in {}".format(root / p)) - imgs = list((l, r) for l, r in zip(imgs_left, imgs_right)) + imgs = list((left, right) for left, right in zip(imgs_left, imgs_right)) self._images += imgs disparity_maps_left = [file_path.replace(p, "disparity").replace(".png", ".pfm") for file_path in imgs_left] @@ -890,7 +889,7 @@ def __init__( if not any(os.path.exists(file_path) for file_path in disparity_maps_right): raise FileNotFoundError("No disparity valid maps found in {}".format(root / "disparity")) - disparity_maps = list((l, r) for l, r in zip(disparity_maps_left, disparity_maps_right)) + disparity_maps = list((left, right) for left, right in zip(disparity_maps_left, disparity_maps_right)) self._disparities += disparity_maps def _read_disparity(self, file_path: str) -> Tuple: @@ -967,7 +966,7 @@ def __init__(self, root: str, split: str = "single", transforms: Optional[Callab if not len(imgs_left) or not len(imgs_right): raise FileNotFoundError("No images found in {}".format(root)) - imgs = list((l, r) for l, r in zip(imgs_left, imgs_right)) + imgs = list((left, right) for left, right in zip(imgs_left, imgs_right)) self._images += imgs disparity_maps_left = sorted(glob(str(root / s / "*" / "*.left.depth.png"))) @@ -975,7 +974,7 @@ def __init__(self, root: str, split: str = "single", transforms: Optional[Callab if not len(disparity_maps_left) or not len(disparity_maps_right): raise FileNotFoundError("No disparity maps found in {}".format(root)) - disparity_maps = list((l, r) for l, r in zip(disparity_maps_left, disparity_maps_right)) + disparity_maps = list((left, right) for left, right in zip(disparity_maps_left, disparity_maps_right)) self._disparities += disparity_maps def _read_disparity(self, file_path: str) -> Tuple: @@ -1041,7 +1040,7 @@ def __init__(self, root: str, split: str = "train", transforms: Optional[Callabl if not len(imgs_left) or not len(imgs_right): raise FileNotFoundError("No images found in {}".format(root)) - imgs = list((l, r) for l, r in zip(imgs_left, imgs_right)) + imgs = list((left, right) for left, right in zip(imgs_left, 
imgs_right)) self._images = imgs disparity_maps_left = list(p.replace("left", "left_disp") for p in imgs_left) @@ -1053,7 +1052,7 @@ def __init__(self, root: str, split: str = "train", transforms: Optional[Callabl if not any(os.path.exists(file_path) for file_path in disparity_maps_right): raise FileNotFoundError("No disparity valid maps found in {}".format(root)) - disparity_maps = list((l, r) for l, r in zip(disparity_maps_left, disparity_maps_right)) + disparity_maps = list((left, right) for left, right in zip(disparity_maps_left, disparity_maps_right)) self._disparities = disparity_maps def _read_disparity(self, file_path: str) -> Tuple: From d7882ca96175146c3e81424189f64f8cd4c4e8f4 Mon Sep 17 00:00:00 2001 From: Ponku Date: Thu, 14 Jul 2022 11:21:36 +0100 Subject: [PATCH 22/35] Ran formatting pre-commit hook --- test/datasets_utils.py | 16 ++--- test/test_datasets.py | 76 +++++++++++++----------- torchvision/datasets/_stereo_matching.py | 2 +- 3 files changed, 51 insertions(+), 43 deletions(-) diff --git a/test/datasets_utils.py b/test/datasets_utils.py index f051e325968..9afd8f741fd 100644 --- a/test/datasets_utils.py +++ b/test/datasets_utils.py @@ -561,9 +561,11 @@ def test_feature_types(self, config): @test_all_configs def test_num_examples(self, config): with self.create_dataset(config) as (dataset, info): - assert len(dataset) == info["num_examples"], f"The number of examples {len(dataset)} does not match the expected {info['num_examples']}" + assert ( + len(dataset) == info["num_examples"] + ), f"The number of examples {len(dataset)} does not match the expected {info['num_examples']}" - @ test_all_configs + @test_all_configs def test_transforms(self, config): mock = unittest.mock.Mock(wraps=lambda *args: args[0] if len(args) == 1 else args) for kwarg in self._TRANSFORM_KWARGS: @@ -587,7 +589,7 @@ class ImageDatasetTestCase(DatasetTestCase): FEATURE_TYPES = (PIL.Image.Image, int) - @ contextlib.contextmanager + @contextlib.contextmanager def create_dataset( self, config: Optional[Dict[str, Any]] = None, @@ -610,7 +612,7 @@ def create_dataset( with self._force_load_images(): yield dataset, info - @ contextlib.contextmanager + @contextlib.contextmanager def _force_load_images(self): open = PIL.Image.open @@ -649,7 +651,7 @@ def _set_default_frames_per_clip(self, inject_fake_data): args_without_default = argspec.args[1 : (-len(argspec.defaults) if argspec.defaults else None)] frames_per_clip_last = args_without_default[-1] == "frames_per_clip" - @ functools.wraps(inject_fake_data) + @functools.wraps(inject_fake_data) def wrapper(tmpdir, config): args = inject_fake_data(tmpdir, config) if frames_per_clip_last and len(args) == len(args_without_default) - 1: @@ -748,7 +750,7 @@ def size(idx: int) -> Tuple[int, int, int]: ] -@ requires_lazy_imports("av") +@requires_lazy_imports("av") def create_video_file( root: Union[pathlib.Path, str], name: Union[pathlib.Path, str], @@ -790,7 +792,7 @@ def create_video_file( return file -@ requires_lazy_imports("av") +@requires_lazy_imports("av") def create_video_folder( root: Union[str, pathlib.Path], name: Union[str, pathlib.Path], diff --git a/test/test_datasets.py b/test/test_datasets.py index dd3c89b9bdc..5db3be40b4f 100644 --- a/test/test_datasets.py +++ b/test/test_datasets.py @@ -10,10 +10,10 @@ import random import shutil import string -from typing import List, Callable, Tuple import unittest import xml.etree.ElementTree as ET import zipfile +from typing import List, Callable, Tuple import datasets_utils import numpy as np @@ -28,26 +28,26 
@@ class STL10TestCase(datasets_utils.ImageDatasetTestCase): DATASET_CLASS = datasets.STL10 ADDITIONAL_CONFIGS = datasets_utils.combinations_grid(split=("train", "test", "unlabeled", "train+unlabeled")) - @ staticmethod + @staticmethod def _make_binary_file(num_elements, root, name): file_name = os.path.join(root, name) np.zeros(num_elements, dtype=np.uint8).tofile(file_name) - @ staticmethod + @staticmethod def _make_image_file(num_images, root, name, num_channels=3, height=96, width=96): STL10TestCase._make_binary_file(num_images * num_channels * height * width, root, name) - @ staticmethod + @staticmethod def _make_label_file(num_images, root, name): STL10TestCase._make_binary_file(num_images, root, name) - @ staticmethod + @staticmethod def _make_class_names_file(root, name="class_names.txt"): with open(os.path.join(root, name), "w") as fh: for cname in ("airplane", "bird"): fh.write(f"{cname}\n") - @ staticmethod + @staticmethod def _make_fold_indices_file(root): num_folds = 10 offset = 0 @@ -59,7 +59,7 @@ def _make_fold_indices_file(root): return tuple(range(1, num_folds + 1)) - @ staticmethod + @staticmethod def _make_train_files(root, num_unlabeled_images=1): num_images_in_fold = STL10TestCase._make_fold_indices_file(root) num_train_images = sum(num_images_in_fold) @@ -70,7 +70,7 @@ def _make_train_files(root, num_unlabeled_images=1): return dict(train=num_train_images, unlabeled=num_unlabeled_images) - @ staticmethod + @staticmethod def _make_test_files(root, num_images=2): STL10TestCase._make_image_file(num_images, root, "test_X.bin") STL10TestCase._make_label_file(num_images, root, "test_y.bin") @@ -888,7 +888,7 @@ def inject_fake_data(self, tmpdir, config): return num_images - @ contextlib.contextmanager + @contextlib.contextmanager def create_dataset(self, *args, **kwargs): with super().create_dataset(*args, **kwargs) as output: yield output @@ -1294,7 +1294,7 @@ def _create_archive(self, root, name, *files): return archive - @ datasets_utils.test_all_configs + @datasets_utils.test_all_configs def test_feature_types(self, config): feature_types = self.FEATURE_TYPES self.FEATURE_TYPES = self._TRAIN_FEATURE_TYPES if config["train"] else self._TEST_FEATURE_TYPES @@ -1572,7 +1572,7 @@ def _file_name_fn(self, cls, ext, idx): def _is_valid_file_to_extensions(self, is_valid_file): return {ext for ext in self._EXTENSIONS if is_valid_file(f"foo.{ext}")} - @ datasets_utils.test_all_configs + @datasets_utils.test_all_configs def test_is_valid_file(self, config): extensions = config.pop("extensions") # We need to explicitly pass extensions=None here or otherwise it would be filled by the value from the @@ -1582,7 +1582,7 @@ def test_is_valid_file(self, config): ) as (dataset, info): assert len(dataset) == info["num_examples"] - @ datasets_utils.test_all_configs + @datasets_utils.test_all_configs def test_classes(self, config): with self.create_dataset(config) as (dataset, info): assert len(dataset.classes) == len(info["classes"]) @@ -1603,7 +1603,7 @@ def inject_fake_data(self, tmpdir, config): return dict(num_examples=num_examples_total, classes=classes) - @ datasets_utils.test_all_configs + @datasets_utils.test_all_configs def test_classes(self, config): with self.create_dataset(config) as (dataset, info): assert len(dataset.classes) == len(info["classes"]) @@ -1702,32 +1702,32 @@ class Places365TestCase(datasets_utils.ImageDatasetTestCase): *((f"{category}/Places365_train_00000001.png", idx) for category, idx in _CATEGORIES_CONTENT), ) - @ staticmethod + @staticmethod def 
_make_txt(root, name, seq): file = os.path.join(root, name) with open(file, "w") as fh: for text, idx in seq: fh.write(f"{text} {idx}\n") - @ staticmethod + @staticmethod def _make_categories_txt(root, name): Places365TestCase._make_txt(root, name, Places365TestCase._CATEGORIES_CONTENT) - @ staticmethod + @staticmethod def _make_file_list_txt(root, name): Places365TestCase._make_txt(root, name, Places365TestCase._FILE_LIST_CONTENT) - @ staticmethod + @staticmethod def _make_image(file_name, size): os.makedirs(os.path.dirname(file_name), exist_ok=True) PIL.Image.fromarray(np.zeros((*size, 3), dtype=np.uint8)).save(file_name) - @ staticmethod + @staticmethod def _make_devkit_archive(root, split): Places365TestCase._make_categories_txt(root, Places365TestCase._CATEGORIES) Places365TestCase._make_file_list_txt(root, Places365TestCase._FILE_LISTS[split]) - @ staticmethod + @staticmethod def _make_images_archive(root, split, small): folder_name = Places365TestCase._IMAGES[(split, small)] image_size = (256, 256) if small else (512, random.randint(512, 1024)) @@ -2042,7 +2042,7 @@ def inject_fake_data(self, tmpdir, config): return num_examples[config["split"]] - @ datasets_utils.test_all_configs + @datasets_utils.test_all_configs def test_flow(self, config): # Make sure flow always exists, and make sure there are as many flow values as (pairs of) images # Also make sure the flow is properly decoded @@ -2101,7 +2101,7 @@ def inject_fake_data(self, tmpdir, config): ) return num_examples - @ datasets_utils.test_all_configs + @datasets_utils.test_all_configs def test_flow(self, config): h, w = self.FLOW_H, self.FLOW_W expected_flow = np.arange(3 * h * w).reshape(h, w, 3).transpose(2, 0, 1) @@ -2726,7 +2726,9 @@ def inject_fake_data(self, tmpdir, config): def test_training_test_splits(self): with self.create_dataset(split="train") as (dataset, _): - assert dataset._images and len(dataset._images) == len(dataset._disparities), "Training images do not match with training disparities" + assert dataset._images and len(dataset._images) == len( + dataset._disparities + ), "Training images do not match with training disparities" for _, _, disparity, valid_mask in dataset: assert len(disparity.shape) == 3 assert len(valid_mask.shape) == 2 @@ -2813,10 +2815,10 @@ def _make_scene_folder(root_dir: str, scene_name: str, split: str) -> List[str]: scene_dir = os.path.join(root_dir, f"{scene_name}{c}") os.makedirs(scene_dir, exist_ok=True) # make normal images first - datasets_utils.create_image_file(root=scene_dir, name=f"im0.png", size=(3, 100, 100)) - datasets_utils.create_image_file(root=scene_dir, name=f"im1.png", size=(3, 100, 100)) - datasets_utils.create_image_file(root=scene_dir, name=f"im1E.png", size=(3, 100, 100)) - datasets_utils.create_image_file(root=scene_dir, name=f"im1L.png", size=(3, 100, 100)) + datasets_utils.create_image_file(root=scene_dir, name="im0.png", size=(3, 100, 100)) + datasets_utils.create_image_file(root=scene_dir, name="im1.png", size=(3, 100, 100)) + datasets_utils.create_image_file(root=scene_dir, name="im1E.png", size=(3, 100, 100)) + datasets_utils.create_image_file(root=scene_dir, name="im1L.png", size=(3, 100, 100)) # these are going to end up being gray scale images datasets_utils.make_fake_pfm_file(h=100, w=100, file_name=os.path.join(scene_dir, "disp0.pfm")) datasets_utils.make_fake_pfm_file(h=100, w=100, file_name=os.path.join(scene_dir, "disp1.pfm")) @@ -2827,7 +2829,7 @@ def inject_fake_data(self, tmpdir, config): split_scene_map = { "train": ["Adirondack", "Jadeplant", 
"Motorcycle", "Piano"], "additional": ["Backpack", "Bicycle1", "Cable", "Classroom1"], - "test": ["Plants", "Classroom2E", "Classroom2", "Australia"] + "test": ["Plants", "Classroom2E", "Classroom2", "Australia"], } middlebury_dir = pathlib.Path(tmpdir, "Middlebury2014") @@ -2895,7 +2897,7 @@ def test_warnings_train(self): with pytest.warns( RuntimeWarning, match=f"\nSplit '{split}' has calibration settings, however None was provided as an argument." - f"\nSetting calibration to 'perfect' for split '{split}'. Available calibration settings are: 'perfect', 'imperfect', 'both'.", + f"\nSetting calibration to 'perfect' for split '{split}'. Available calibration settings are: 'perfect', 'imperfect', 'both'.", ): with self.create_dataset(split=split, calibration=calibration): pass @@ -2905,8 +2907,7 @@ def test_warnings_test(self): split = "test" calibration = "perfect" with pytest.warns( - RuntimeWarning, - match="\nSplit 'test' has only no calibration settings, ignoring calibration argument." + RuntimeWarning, match="\nSplit 'test' has only no calibration settings, ignoring calibration argument." ): with self.create_dataset(split=split, calibration=calibration): pass @@ -3086,13 +3087,14 @@ def test_bad_input(self): class StereoSceneFlowTestCase(datasets_utils.ImageDatasetTestCase): DATASET_CLASS = datasets.StereoSceneFlow ADDITIONAL_CONFIGS = datasets_utils.combinations_grid( - split=("FlyingThings3D", "Driving", "Monkaa"), - pass_name=("clean", "final") + split=("FlyingThings3D", "Driving", "Monkaa"), pass_name=("clean", "final") ) FEATURE_TYPES = (PIL.Image.Image, PIL.Image.Image, (np.ndarray, type(None)), (np.ndarray, type(None))) @staticmethod - def _create_pfm_folder(root: str, name: str, file_name_fn: Callable[..., str], num_examples: int, size: Tuple[int, int]) -> List[str]: + def _create_pfm_folder( + root: str, name: str, file_name_fn: Callable[..., str], num_examples: int, size: Tuple[int, int] + ) -> List[str]: root = pathlib.Path(root) / name os.makedirs(root, exist_ok=True) @@ -3193,8 +3195,12 @@ def _make_scene_folder(root: str, scene_name: str, size: Tuple[int, int]) -> Lis paths.append(datasets_utils.create_image_file(root, "image1.left.jpg", size=(3, size[1], size[0]))) paths.append(datasets_utils.create_image_file(root, "image1.right.jpg", size=(3, size[1], size[0]))) # single channel depth maps - paths.append(StereoFallingThingsTestCase._make_dummy_depth_map(root, "image1.left.depth.png", size=(size[0], size[1]))) - paths.append(StereoFallingThingsTestCase._make_dummy_depth_map(root, "image1.right.depth.png", size=(size[0], size[1]))) + paths.append( + StereoFallingThingsTestCase._make_dummy_depth_map(root, "image1.left.depth.png", size=(size[0], size[1])) + ) + paths.append( + StereoFallingThingsTestCase._make_dummy_depth_map(root, "image1.right.depth.png", size=(size[0], size[1])) + ) # camera settings json. 
Minimal example for _read_disparity function testing settings_json = {"camera_settings": [{"intrinsic_settings": {"fx": 1}}]} with open(root / "_camera_settings.json", "w") as f: diff --git a/torchvision/datasets/_stereo_matching.py b/torchvision/datasets/_stereo_matching.py index 254d9d2624a..8ef5f3e6e1a 100644 --- a/torchvision/datasets/_stereo_matching.py +++ b/torchvision/datasets/_stereo_matching.py @@ -33,7 +33,7 @@ def read_pfm_file(file_path: str) -> np.array: # adapted from https://github.com/ucbdrive/hd3/blob/master/utils/pfm.py with open(file_path, "rb") as file: header = file.readline().rstrip() - if not header in [b"PF", b"Pf"]: + if header not in [b"PF", b"Pf"]: raise ValueError(f"Not a valid PFM file: {file_path}") dim_match = re.match(rb"^(\d+)\s(\d+)\s$", file.readline()) From 5f291c292ca2611c21e6b5ac2d12b79f51e3cab4 Mon Sep 17 00:00:00 2001 From: Ponku Date: Thu, 14 Jul 2022 16:12:41 +0100 Subject: [PATCH 23/35] Added reusable _pfm_read. Addressed CI issues. --- torchvision/datasets/_stereo_matching.py | 95 +++++++++--------------- 1 file changed, 34 insertions(+), 61 deletions(-) diff --git a/torchvision/datasets/_stereo_matching.py b/torchvision/datasets/_stereo_matching.py index 8ef5f3e6e1a..a8797d7d5c1 100644 --- a/torchvision/datasets/_stereo_matching.py +++ b/torchvision/datasets/_stereo_matching.py @@ -1,7 +1,7 @@ +import functools import json import os import random -import re import shutil import warnings from abc import ABC, abstractmethod @@ -11,9 +11,8 @@ import numpy as np from PIL import Image -from torch import Tensor -from .utils import download_and_extract_archive, verify_str_arg +from .utils import download_and_extract_archive, verify_str_arg, _read_pfm from .vision import VisionDataset __all__ = ( @@ -28,35 +27,7 @@ "InStereo2k" ) - -def read_pfm_file(file_path: str) -> np.array: - # adapted from https://github.com/ucbdrive/hd3/blob/master/utils/pfm.py - with open(file_path, "rb") as file: - header = file.readline().rstrip() - if header not in [b"PF", b"Pf"]: - raise ValueError(f"Not a valid PFM file: {file_path}") - - dim_match = re.match(rb"^(\d+)\s(\d+)\s$", file.readline()) - if not dim_match: - raise ValueError(f"Malformed PFM header: {file_path}") - - width, height = map(int, dim_match.groups()) - channels = 3 if header == b"PF" else 1 - scale = float(file.readline().rstrip()) - # check for endian type - if scale < 0: - scale = -scale - endian = "<" - else: - endian = ">" - - data = np.fromfile(file, endian + "f") - data = np.reshape(data, (height, width, channels)) - data = np.flipud(data) - - # PFM files for disparity maps should contain only a single channel - # they should also be returned in (C, H, W) format - return np.transpose(data[:, :, :1], (2, 0, 1)) +_read_pfm_file = functools.partial(_read_pfm, slice_channels=1) class StereoMatchingDataset(ABC, VisionDataset): @@ -103,7 +74,7 @@ def _read_disparity(self, file_path: str) -> Tuple: # function that returns a disparity map and an occlusion map pass - def __getitem__(self, index: int) -> Tuple[Tensor, Tensor, Tensor, Tensor]: + def __getitem__(self, index: int) -> Tuple: """Return example at given index. Args: @@ -111,10 +82,10 @@ def __getitem__(self, index: int) -> Tuple[Tensor, Tensor, Tensor, Tensor]: Returns: tuple: A 4-tuple with ``(img_left, img_right, disparity, valid_mask)`` where ``valid_mask`` - is a numpy boolean mask of shape (H, W) - indicating which disparity values are valid. The disparity is a numpy array of - shape (1, H, W) and the images are PIL images. 
``disparity`` and ``valid_mask`` are None for - datasets on which for ``split="test"`` the authors did not provide annotations. + is a numpy boolean mask of shape (H, W) + indicating which disparity values are valid. The disparity is a numpy array of + shape (1, H, W) and the images are PIL images. ``disparity`` and ``valid_mask`` are None for + datasets on which for ``split="test"`` the authors did not provide annotations. """ img_left = self._read_img(self._images[index][0]) img_right = self._read_img(self._images[index][1]) @@ -180,7 +151,7 @@ class CREStereo(StereoMatchingDataset): Args: root (str): Root directory of the dataset. split (str): The split of the dataset to use. One of ``"tree"``, ``"shapenet"``, ``"reflective"``, ``"hole"`` - or ``"all"``. The ``"all"`` split contains all of the above splits. + or ``"all"``. The ``"all"`` split contains all of the above splits. transforms (callable, optional): A function/transform that takes in a sample and returns a transformed version. download (bool, optional): If true, downloads the dataset from the internet and puts it in the root directory. max_disparity (int, optional): Maximum disparity value. Used to compute the valid mask. @@ -208,10 +179,10 @@ def __init__( available_space = statvfs.f_frsize * statvfs.f_bavail if available_space - self.DOWNLOAD_SPACE < 0: raise ValueError( - f"The storage device for {root} is too small to download the dataset), " - f"an additional {self.DOWNLOAD_SPACE - self.available_space:.2f} GB are required." + f"The storage device for {str(root)} is too small to download the dataset), " + f"an additional {self.DOWNLOAD_SPACE - available_space:.2f} GB are required." ) - self._download_dataset(root) + self._download_dataset(str(root)) verify_str_arg(split, "split", valid_values=("tree", "shapenet", "reflective", "hole", "all")) @@ -260,7 +231,7 @@ def _download_dataset(self, root: str) -> None: if not os.path.exists(d_path): os.makedirs(d_path) - for i in range(self.EXPERIMENTAL_RANGE): + for i in range(10): url = f"https://data.megengine.org.cn/research/crestereo/dataset/{d}/{i}.tar" download_and_extract_archive(url=url, download_root=d_path, remove_finished=True) @@ -317,7 +288,7 @@ class StereoMiddlebury2014(StereoMatchingDataset): root (string): Root directory of the Middleburry 2014 Dataset. split (string, optional): The dataset split of scenes, either "train" (default), "test", or "additional" use_ambient_views (boolean, optional): Whether to use different expose or lightning views when possible. - The dataset samples with equal probability between ``[im1.png, im1E.png, im1L.png]``. + The dataset samples with equal probability between ``[im1.png, im1E.png, im1L.png]``. calibration (string, optional): Wether or not to use the calibrated (default) or uncalibrated scenes. transforms (callable, optional): A function/transform that takes in a sample and returns a transformed version. download (boolean, optional): Wether or not to download the dataset in the ``root`` directory. 
@@ -380,10 +351,11 @@ def __init__( download: bool = False, ): super().__init__(root, transforms) + verify_str_arg(split, "split", valid_values=("train", "test", "additional")) if calibration: - verify_str_arg(calibration, "calibration", valid_values=("perfect", "imperfect", "both", None)) + verify_str_arg(calibration, "calibration", valid_values=("perfect", "imperfect", "both")) # type: ignore if split == "test": calibration = None warnings.warn( @@ -445,7 +417,7 @@ def __init__( self.use_ambient_views = use_ambient_views - def __getitem__(self, index: int) -> Tuple[Tensor, Tensor, Tensor, Tensor]: + def __getitem__(self, index: int) -> Tuple: return super().__getitem__(index) def _read_img(self, file_path: str) -> Image.Image: @@ -463,7 +435,7 @@ def _read_img(self, file_path: str) -> Image.Image: def _read_disparity(self, file_path: str) -> Tuple: if not os.path.exists(file_path): # case when dealing with the test split return None, None - disparity_map = read_pfm_file(file_path) + disparity_map = _read_pfm_file(file_path) valid_mask = disparity_map < 1e3 # remove the channel dimension from the valid mask valid_mask = valid_mask[0, :, :] @@ -478,8 +450,8 @@ def _download_dataset(self, root: str): continue split_root = root / split_name for scene in split_scenes: - scene_name = f"{scene}-{calibration}" for calibration in ["perfect", "imperfect"]: + scene_name = f"{scene}-{calibration}" scene_url = f"{base_url}/{scene_name}.zip" download_and_extract_archive( url=scene_url, filename=scene_name, download_root=str(split_root), remove_finished=True @@ -491,11 +463,11 @@ def _download_dataset(self, root: str): # the unzip is going to produce a directory MiddEval3 with two subdirectories trainingF and testF # we want to move the contents from testF into the directory - download_and_extract_archive(url=test_set_url, download_root=root, remove_finished=True) + download_and_extract_archive(url=test_set_url, download_root=str(root), remove_finished=True) for scene_dir, scene_names, _ in os.walk(str(root / "MiddEval3/testF")): for scene in scene_names: scene_dst_dir = root / "test" / scene - scene_src_dir = scene_dir / scene + scene_src_dir = Path(scene_dir) / scene os.makedirs(scene_dst_dir, exist_ok=True) shutil.move(str(scene_src_dir), str(scene_dst_dir)) @@ -584,9 +556,9 @@ def _read_disparity(self, file_path: str) -> Tuple: if not os.path.exists(file_path): return None, None - disparity_map = read_pfm_file(file_path) + disparity_map = _read_pfm_file(file_path) valid_mask = Image.open(file_path.replace("disp0GT.pfm", "mask0nocc.png")) - valid_mask = np.array(valid_mask).astype(np.bool) + valid_mask = np.array(valid_mask).astype(np.bool_) return disparity_map, valid_mask def __getitem__(self, index: int) -> Tuple: @@ -651,7 +623,7 @@ def _read_disparity(self, file_path: str) -> Tuple: disparity_map = disparity_map[None, :, :] return disparity_map, valid_mask - def __getitem__(self, index: int) -> Tuple[Tuple, Tuple, Tuple]: + def __getitem__(self, index: int) -> Tuple: return super().__getitem__(index) @@ -712,7 +684,7 @@ def _read_disparity(self, file_path: str) -> Tuple: disparity_map = disparity_map[None, :, :] return disparity_map, valid_mask - def __getitem__(self, index: int) -> Tuple[Tuple, Tuple, Tuple]: + def __getitem__(self, index: int) -> Tuple: return super().__getitem__(index) @@ -797,7 +769,7 @@ def _read_disparity(self, file_path: str) -> Tuple: valid_mask = np.logical_and(off_mask, valid_mask) return disparity_map, valid_mask - def __getitem__(self, index: int) -> Tuple[Tensor, 
Tensor, Tensor, Tensor]: + def __getitem__(self, index: int) -> Tuple: return super().__getitem__(index) @@ -893,12 +865,12 @@ def __init__( self._disparities += disparity_maps def _read_disparity(self, file_path: str) -> Tuple: - disparity = read_pfm_file(file_path) + disparity = _read_pfm_file(file_path) # keep valid mask with shape (H, W) - valid = np.ones(disparity.shape[1:]).astype(np.bool) + valid = np.ones(disparity.shape[1:]).astype(np.bool_) return disparity, valid - def __getitem__(self, index: int) -> Tuple[Tensor, Tensor, Tensor, Tensor]: + def __getitem__(self, index: int) -> Tuple: return super().__getitem__(index) @@ -992,7 +964,7 @@ def _read_disparity(self, file_path: str) -> Tuple: disparity = disparity[None, :, :] return disparity, valid - def __getitem__(self, index: int) -> Tuple[Tensor, Tensor, Tensor, Tensor]: + def __getitem__(self, index: int) -> Tuple: return super().__getitem__(index) @@ -1037,11 +1009,12 @@ def __init__(self, root: str, split: str = "train", transforms: Optional[Callabl imgs_left = sorted(glob(str(root / "*" / "left.png"))) imgs_right = list(p.replace("left", "right") for p in imgs_left) + if not len(imgs_left) or not len(imgs_right): raise FileNotFoundError("No images found in {}".format(root)) imgs = list((left, right) for left, right in zip(imgs_left, imgs_right)) - self._images = imgs + self._images = imgs # type: ignore disparity_maps_left = list(p.replace("left", "left_disp") for p in imgs_left) disparity_maps_right = list(p.replace("right", "right_disp") for p in imgs_left) @@ -1053,11 +1026,11 @@ def __init__(self, root: str, split: str = "train", transforms: Optional[Callabl raise FileNotFoundError("No disparity valid maps found in {}".format(root)) disparity_maps = list((left, right) for left, right in zip(disparity_maps_left, disparity_maps_right)) - self._disparities = disparity_maps + self._disparities = disparity_maps # type: ignore def _read_disparity(self, file_path: str) -> Tuple: disparity = np.array(Image.open(file_path), dtype=np.float32) - valid = np.ones_like(disparity).astype(np.bool) + valid = np.ones_like(disparity).astype(np.bool_) # unsqueeze disparity to (C, H, W) disparity = disparity[None, :, :] return disparity, valid From af6b343a019bfd1a9b158bf214f52c9bfea5a5cc Mon Sep 17 00:00:00 2001 From: Ponku Date: Thu, 14 Jul 2022 17:05:08 +0100 Subject: [PATCH 24/35] Removed duplicate test code for stereo dataset testcases --- test/datasets_utils.py | 33 +++++ test/test_datasets.py | 168 ++--------------------- torchvision/datasets/_stereo_matching.py | 2 +- 3 files changed, 49 insertions(+), 154 deletions(-) diff --git a/test/datasets_utils.py b/test/datasets_utils.py index 9afd8f741fd..a643c43685a 100644 --- a/test/datasets_utils.py +++ b/test/datasets_utils.py @@ -16,6 +16,7 @@ from collections import defaultdict from typing import Any, Callable, Dict, Iterator, List, Optional, Sequence, Tuple, Union +import numpy as np import PIL import PIL.Image import pytest @@ -933,6 +934,38 @@ def create_random_string(length: int, *digits: str) -> str: return "".join(random.choice(digits) for _ in range(length)) +def shape_test_for_stereo_disp( + left: PIL.Image.Image, right: PIL.Image.Image, disparity: np.ndarray, valid_mask: np.ndarray +): + left_array = np.array(left) + right_array = np.array(right) + h, w, c = left_array.shape + # check that left and right are the same size + assert left_array.shape == right_array.shape + # check general shapes + assert c == 3 + assert len(disparity.shape) == 3 + assert len(valid_mask.shape) == 
2 + assert disparity.shape == (1, h, w) + # check that valid mask is the same size as the disparity + _, dh, dw = disparity.shape + mh, mw = valid_mask.shape + assert dh == mh + assert dw == mw + + +def shape_test_for_stereo_none(left: PIL.Image.Image, right: PIL.Image.Image, disparity: None, valid_mask: None): + left_array = np.array(left) + right_array = np.array(right) + _, _, c = left_array.shape + # check that left and right are the same size + assert left_array.shape == right_array.shape + # check general shapes + assert c == 3 + assert disparity is None + assert valid_mask is None + + def make_fake_pfm_file(h, w, file_name): values = list(range(3 * h * w)) # Note: we pack everything in little endian: -1.0, and "<" diff --git a/test/test_datasets.py b/test/test_datasets.py index 5db3be40b4f..8ba77244c2f 100644 --- a/test/test_datasets.py +++ b/test/test_datasets.py @@ -2729,19 +2729,13 @@ def test_training_test_splits(self): assert dataset._images and len(dataset._images) == len( dataset._disparities ), "Training images do not match with training disparities" - for _, _, disparity, valid_mask in dataset: - assert len(disparity.shape) == 3 - assert len(valid_mask.shape) == 2 - _, dh, dw = disparity.shape - mh, mw = valid_mask.shape - assert dh == mh - assert dw == mw + for left, right, disparity, valid_mask in dataset: + datasets_utils.shape_test_for_stereo_disp(left, right, disparity, valid_mask) with self.create_dataset(split="test") as (dataset, _): assert all(d == ("", "") for d in dataset._disparities) - for _, _, disparity, valid_mask in dataset: - assert disparity is None - assert valid_mask is None + for left, right, disparity, valid_mask in dataset: + datasets_utils.shape_test_for_stereo_none(left, right, disparity, valid_mask) def test_bad_input(self): with pytest.raises(ValueError, match="Unknown value 'bad' for argument split"): @@ -2776,21 +2770,7 @@ def test_splits(self): for split in ("tree", "shapenet", "reflective", "hole"): with self.create_dataset(split=split) as (dataset, _): for left, right, disparity, valid_mask in dataset: - left_array = np.array(left) - right_array = np.array(right) - h, w, c = left_array.shape - # check that left and right are the same size - assert left_array.shape == right_array.shape - # check general shapes - assert c == 3 - assert len(disparity.shape) == 3 - assert len(valid_mask.shape) == 2 - assert disparity.shape == (1, h, w) - # check that valid mask is the same size as the disparity - _, dh, dw = disparity.shape - mh, mw = valid_mask.shape - assert dh == mh - assert dw == mw + datasets_utils.shape_test_for_stereo_disp(left, right, disparity, valid_mask) def test_bad_input(self): with pytest.raises(ValueError, match="Unknown value 'bad' for argument split"): @@ -2851,36 +2831,13 @@ def test_train_splits(self): for split, calibration in itertools.product(["train", "additional"], ["perfect", "imperfect", "both"]): with self.create_dataset(split=split, calibration=calibration) as (dataset, _): for left, right, disparity, valid_mask in dataset: - left_array = np.array(left) - right_array = np.array(right) - h, w, c = left_array.shape - # check that left and right are the same size - assert left_array.shape == right_array.shape - # check general shapes - assert c == 3 - assert len(disparity.shape) == 3 - assert len(valid_mask.shape) == 2 - assert disparity.shape == (1, h, w) - # check that valid mask is the same size as the disparity - _, dh, dw = disparity.shape - mh, mw = valid_mask.shape - print("disparities", disparity.shape, 
valid_mask.shape) - assert dh == mh - assert dw == mw + datasets_utils.shape_test_for_stereo_disp(left, right, disparity, valid_mask) def test_test_split(self): for split in ["test"]: with self.create_dataset(split=split, calibration=None) as (dataset, _): for left, right, disparity, valid_mask in dataset: - left_array = np.array(left) - right_array = np.array(right) - h, w, c = left_array.shape - # check that left and right are the same size - assert left_array.shape == right_array.shape - # check general shapes - assert c == 3 - assert disparity is None - assert valid_mask is None + datasets_utils.shape_test_for_stereo_none(left, right, disparity, valid_mask) def test_augmented_view_usage(self): with self.create_dataset(split="train", use_ambient_views=True) as (dataset, _): @@ -2963,32 +2920,13 @@ def test_train_splits(self): for split in ["train"]: with self.create_dataset(split=split) as (dataset, _): for left, right, disparity, valid_mask in dataset: - left_array = np.array(left) - right_array = np.array(right) - h, w, c = left_array.shape - # check that left and right are the same size - assert left_array.shape == right_array.shape - # check general shapes - assert c == 3 - assert len(disparity.shape) == 3 - assert len(valid_mask.shape) == 2 - assert disparity.shape == (1, h, w) - # check that valid mask is the same size as the disparity - _, dh, dw = disparity.shape - mh, mw = valid_mask.shape - assert dh == mh - assert dw == mw + datasets_utils.shape_test_for_stereo_disp(left, right, disparity, valid_mask) def test_test_split(self): for split in ["test"]: with self.create_dataset(split=split) as (dataset, _): for left, right, disparity, valid_mask in dataset: - left_array = np.array(left) - right_array = np.array(right) - # check that left and right are the same size - assert left_array.shape == right_array.shape - assert disparity is None - assert valid_mask is None + datasets_utils.shape_test_for_stereo_none(left, right, disparity, valid_mask) def test_bad_input(self): with pytest.raises(ValueError, match="Unknown value 'bad' for argument split"): @@ -3050,33 +2988,13 @@ def test_train_splits(self): for split in ["train"]: with self.create_dataset(split=split) as (dataset, _): for left, right, disparity, valid_mask in dataset: - left_array = np.array(left) - right_array = np.array(right) - h, w, c = left_array.shape - # check that left and right are the same size - assert left_array.shape == right_array.shape - # check general shapes - assert c == 3 - assert len(disparity.shape) == 3 - assert len(valid_mask.shape) == 2 - assert disparity.shape == (1, h, w) - # check that valid mask is the same size as the disparity - _, dh, dw = disparity.shape - mh, mw = valid_mask.shape - assert dh == mh - assert dw == mw + datasets_utils.shape_test_for_stereo_disp(left, right, disparity, valid_mask) def test_test_split(self): for split in ["test"]: with self.create_dataset(split=split) as (dataset, _): for left, right, disparity, valid_mask in dataset: - left_array = np.array(left) - right_array = np.array(right) - # check that left and right are the same size - assert left_array.shape == right_array.shape - # check general shapes - assert disparity is None - assert valid_mask is None + datasets_utils.shape_test_for_stereo_none(left, right, disparity, valid_mask) def test_bad_input(self): with pytest.raises(ValueError, match="Unknown value 'bad' for argument split"): @@ -3153,21 +3071,7 @@ def test_splits(self): for split_name, pass_name in itertools.product(["FlyingThings3D", "Driving", 
"Monkaa"], ["clean", "final"]): with self.create_dataset(split=split_name, pass_name=pass_name) as (dataset, _): for left, right, disparity, valid_mask in dataset: - left_array = np.array(left) - right_array = np.array(right) - h, w, c = left_array.shape - # check that left and right are the same size - assert left_array.shape == right_array.shape - # check general shapes - assert c == 3 - assert len(disparity.shape) == 3 - assert len(valid_mask.shape) == 2 - assert disparity.shape == (1, h, w) - # check that valid mask is the same size as the disparity - _, dh, dw = disparity.shape - mh, mw = valid_mask.shape - assert dh == mh - assert dw == mw + datasets_utils.shape_test_for_stereo_disp(left, right, disparity, valid_mask) def test_bad_input(self): with pytest.raises(ValueError, match="Unknown value 'bad' for argument split"): @@ -3230,21 +3134,7 @@ def test_splits(self): for split_name in ["single", "mixed"]: with self.create_dataset(split=split_name) as (dataset, _): for left, right, disparity, valid_mask in dataset: - left_array = np.array(left) - right_array = np.array(right) - h, w, c = left_array.shape - # check that left and right are the same size - assert left_array.shape == right_array.shape - # check general shapes - assert c == 3 - assert len(disparity.shape) == 3 - assert len(valid_mask.shape) == 2 - assert disparity.shape == (1, h, w) - # check that valid mask is the same size as the disparity - _, dh, dw = disparity.shape - mh, mw = valid_mask.shape - assert dh == mh - assert dw == mw + datasets_utils.shape_test_for_stereo_disp(left, right, disparity, valid_mask) def test_bad_input(self): with pytest.raises(ValueError, match="Unknown value 'bad' for argument split"): @@ -3307,21 +3197,7 @@ def inject_fake_data(self, tmpdir, config): def test_splits(self): with self.create_dataset() as (dataset, _): for left, right, disparity, valid_mask in dataset: - left_array = np.array(left) - right_array = np.array(right) - h, w, c = left_array.shape - # check that left and right are the same size - assert left_array.shape == right_array.shape - # check general shapes - assert c == 3 - assert len(disparity.shape) == 3 - assert len(valid_mask.shape) == 2 - assert disparity.shape == (1, h, w) - # check that valid mask is the same size as the disparity - _, dh, dw = disparity.shape - mh, mw = valid_mask.shape - assert dh == mh - assert dw == mw + datasets_utils.shape_test_for_stereo_disp(left, right, disparity, valid_mask) class InStereo2k(datasets_utils.ImageDatasetTestCase): @@ -3357,21 +3233,7 @@ def test_splits(self): for split_name in ["train", "test"]: with self.create_dataset(split=split_name) as (dataset, _): for left, right, disparity, valid_mask in dataset: - left_array = np.array(left) - right_array = np.array(right) - h, w, c = left_array.shape - # check that left and right are the same size - assert left_array.shape == right_array.shape - # check general shapes - assert c == 3 - assert len(disparity.shape) == 3 - assert len(valid_mask.shape) == 2 - assert disparity.shape == (1, h, w) - # check that valid mask is the same size as the disparity - _, dh, dw = disparity.shape - mh, mw = valid_mask.shape - assert dh == mh - assert dw == mw + datasets_utils.shape_test_for_stereo_disp(left, right, disparity, valid_mask) def test_bad_input(self): with pytest.raises(ValueError, match="Unknown value 'bad' for argument split"): diff --git a/torchvision/datasets/_stereo_matching.py b/torchvision/datasets/_stereo_matching.py index a8797d7d5c1..991ec71ef53 100644 --- 
a/torchvision/datasets/_stereo_matching.py +++ b/torchvision/datasets/_stereo_matching.py @@ -445,7 +445,7 @@ def _download_dataset(self, root: str): base_url = "https://vision.middlebury.edu/stereo/data/scenes2014/zip" # train and additional splits have 2 different calibration settings root = Path(root) / "Middlebury2014" - for split_name, split_scenes in self.splits.values(): + for split_name, split_scenes in self.splits.items(): if split_name == "test": continue split_root = root / split_name From 67eacf201265bd9fc00d997e312288a22f7e833e Mon Sep 17 00:00:00 2001 From: Ponku Date: Thu, 14 Jul 2022 18:08:51 +0100 Subject: [PATCH 25/35] Removed string replaces. Moved pattern matching in parent class. --- torchvision/datasets/_stereo_matching.py | 265 +++++++++++------------ 1 file changed, 121 insertions(+), 144 deletions(-) diff --git a/torchvision/datasets/_stereo_matching.py b/torchvision/datasets/_stereo_matching.py index 991ec71ef53..f7ced224bf6 100644 --- a/torchvision/datasets/_stereo_matching.py +++ b/torchvision/datasets/_stereo_matching.py @@ -69,6 +69,30 @@ def _read_img(self, file_path: str) -> Image.Image: img = img.convert("RGB") return img + def _scan_pairs(self, left_pattern: str, right_pattern: str, fill_empty: bool = False) -> List[Tuple[str, str]]: + left_paths = sorted(glob(left_pattern)) + right_paths = sorted(glob(right_pattern)) + + # used when dealing with inexistent disparity for the right image + if fill_empty: + right_paths = list("" for _ in left_paths) + + if not left_paths: + raise FileNotFoundError(f"Could not find any files matching the patterns: {left_pattern}") + + if not right_paths: + raise FileNotFoundError(f"Could not find any files matching the patterns: {right_pattern}") + + if len(left_paths) != len(right_paths): + raise ValueError( + f"Found {len(left_paths)} left files but {len(right_paths)} right files using:\n " + f"left pattern: {left_pattern}\n" + f"right pattern: {right_pattern}\n" + ) + + images = list((left, right) for left, right in zip(left_paths, right_paths)) + return images + @abstractmethod def _read_disparity(self, file_path: str) -> Tuple: # function that returns a disparity map and an occlusion map @@ -195,26 +219,15 @@ def __init__( }[split] for s in splits: - imgs_left = sorted(glob(str(root / s / "*_left.jpg"))) - imgs_right = list(p.replace("_left", "_right") for p in imgs_left) - - if not len(imgs_left) or not len(imgs_right): - raise FileNotFoundError("No images found in {}".format(root)) - - imgs = list((left, right) for left, right in zip(imgs_left, imgs_right)) + left_image_pattern = str(root / s / "*_left.jpg") + right_image_pattern = str(root / s / "*_right.jpg") + imgs = self._scan_pairs(left_image_pattern, right_image_pattern) self._images += imgs - disparity_maps_left = list(p.replace("_left", "_left.disp") for p in imgs_left) - disparity_maps_right = list(p.replace("_right", "_right.disp") for p in imgs_right) - - if not any(os.path.exists(file_path) for file_path in disparity_maps_left): - raise FileNotFoundError("No disparity valid maps found in {}".format(root / s)) - - if not any(os.path.exists(file_path) for file_path in disparity_maps_right): - raise FileNotFoundError("No disparity valid maps found in {}".format(root / s)) - - disparity_maps = list((left, right) for left, right in zip(disparity_maps_left, disparity_maps_right)) - self._disparities += disparity_maps + left_disparity_pattern = str(root / s / "*_left.disp.jpg") + right_disparity_pattern = str(root / s / "*_right.disp.jpg") + disparities = 
self._scan_pairs(left_disparity_pattern, right_disparity_pattern) + self._disparities += disparities def _read_disparity(self, file_path: str) -> Tuple: disparity = np.array(Image.open(file_path), dtype=np.float32) @@ -397,23 +410,16 @@ def __init__( for calibration_suffix in calibrartion_suffixes: scene_pattern = "*" + calibration_suffix - - imgs_left = sorted(glob(str(root / split / scene_pattern / "im0.png"))) - imgs_right = sorted(glob(str(root / split / scene_pattern / "im1.png"))) - if not len(imgs_left) or not len(imgs_right): - raise FileNotFoundError("No images found in {}".format(root)) - - self._images += list((left, right) for left, right in zip(imgs_left, imgs_right)) + left_img_pattern = str(root / split / scene_pattern / "im0.png") + right_img_pattern = str(root / split / scene_pattern / "im1.png") + self._images += self._scan_pairs(left_img_pattern, right_img_pattern) if split == "test": - disparity_maps_left, disparity_maps_right = list("" for _ in imgs_left), list("" for _ in imgs_right) + self._disparities += list(("", "") for _ in self._images) else: - disparity_maps_left = sorted(glob(str(root / split / "*" / "disp0.pfm"))) - disparity_maps_right = sorted(glob(str(root / split / "*" / "disp1.pfm"))) - if not len(disparity_maps_left) or not len(disparity_maps_right): - raise FileNotFoundError("No disparity maps found in {}".format(root / split)) - - self._disparities += list((left, right) for left, right in zip(disparity_maps_left, disparity_maps_right)) + left_dispartity_pattern = str(root / split / "*" / "disp0.pfm") + right_dispartity_pattern = str(root / split / "*" / "disp1.pfm") + self._disparities += self._scan_pairs(left_dispartity_pattern, right_dispartity_pattern) self.use_ambient_views = use_ambient_views @@ -424,7 +430,8 @@ def _read_img(self, file_path: str) -> Image.Image: """Function that reads either the original right image or an augmented view when ``use_ambient_views`` is True.""" if os.path.basename(file_path) == "im1.png" and self.use_ambient_views: # initialize sampleable container - ambient_file_paths = list(file_path.replace("im1.png", view_name) for view_name in ["im1E.png", "im1L.png"]) + base_path = os.path.basename(file_path)[0] + ambient_file_paths = list(os.path.join(base_path, view_name) for view_name in ["im1E.png", "im1L.png"]) # double check that we're not going to try to read from an invalid file path ambient_file_paths = list(filter(lambda p: os.path.exists(p), ambient_file_paths)) # keep the original image as an option as well for uniform sampling between base views @@ -454,7 +461,7 @@ def _download_dataset(self, root: str): scene_name = f"{scene}-{calibration}" scene_url = f"{base_url}/{scene_name}.zip" download_and_extract_archive( - url=scene_url, filename=scene_name, download_root=str(split_root), remove_finished=True + url=scene_url, filename=f"{scene_name}.zip", download_root=str(split_root), remove_finished=True ) if any(s not in os.listdir(root) for s in self.splits["test"]): @@ -536,28 +543,23 @@ def __init__(self, root: str, split: str = "train", transforms: Optional[Callabl img_dir = "two_view_training" if split == "train" else "two_view_test" anot_dir = "two_view_training_gt" - imgs_left = sorted(glob(str(root / img_dir / "*" / "*im0.png"))) - imgs_right = sorted(glob(str(root / img_dir / "*" / "*im1.png"))) - if not len(imgs_left) or not len(imgs_right): - raise FileNotFoundError("No images found in {}".format(root)) + left_img_pattern = str(root / img_dir / "*" / "im0.png") + right_img_pattern = str(root / img_dir / "*" 
/ "im1.png") + self._images = self._scan_pairs(left_img_pattern, right_img_pattern) if split == "test": - disparity_maps_left, disparity_maps_right = list("" for _ in imgs_left), list("" for _ in imgs_right) + self._disparities = list(("", "") for _ in self._images) else: - disparity_maps_left = sorted(glob(str(root / anot_dir / "*" / "*0GT.pfm"))) - disparity_maps_right = list("" for _ in disparity_maps_left) - if not len(disparity_maps_left): - raise FileNotFoundError("No disparity maps found in {}".format(root / anot_dir)) - - self._images = list((left, right) for left, right in zip(imgs_left, imgs_right)) - self._disparities = list((left, right) for left, right in zip(disparity_maps_left, disparity_maps_right)) + disparity_pattern = str(root / anot_dir / "*" / "disp0GT.pfm") + self._disparities = self._scan_pairs(disparity_pattern, "", fill_empty=True) def _read_disparity(self, file_path: str) -> Tuple: if not os.path.exists(file_path): return None, None disparity_map = _read_pfm_file(file_path) - valid_mask = Image.open(file_path.replace("disp0GT.pfm", "mask0nocc.png")) + mask_path = os.path.join(os.path.split(file_path)[0], "mask0nocc.png") + valid_mask = Image.open(mask_path) valid_mask = np.array(valid_mask).astype(np.bool_) return disparity_map, valid_mask @@ -595,23 +597,16 @@ def __init__(self, root: str, split: str = "train", transforms: Optional[Callabl verify_str_arg(split, "split", valid_values=("train", "test")) root = Path(root) / "Kitti2012" / (split + "ing") - imgs_left = sorted(glob(str(root / "colored_0" / "*_10.png"))) - imgs_right = sorted(glob(str(root / "colored_1" / "*_10.png"))) - if not len(imgs_left) or not len(imgs_right): - raise FileNotFoundError("No images found in {}".format(root)) + left_img_pattern = str(root / "colored_0" / "*_10.png") + right_img_pattern = str(root / "colored_1" / "*_10.png") + self._images = self._scan_pairs(left_img_pattern, right_img_pattern) if split == "train": - disparity_maps_left = sorted(glob(str(root / "disp_noc" / "*.png"))) - disparity_maps_right = list("" for _ in disparity_maps_left) - if not len(disparity_maps_left): - raise FileNotFoundError("No disparity maps found in {}".format(root)) - + disparity_pattern = str(root / "disp_noc" / "*.png") + self._disparities = self._scan_pairs(disparity_pattern, "", fill_empty=True) else: - disparity_maps_left, disparity_maps_right = list("" for _ in imgs_left), list("" for _ in imgs_right) - - self._images = list((left, right) for left, right in zip(imgs_left, imgs_right)) - self._disparities = list((left, right) for left, right in zip(disparity_maps_left, disparity_maps_right)) + self._disparities = list(("", "") for _ in self._images) def _read_disparity(self, file_path: str) -> Tuple: if not os.path.exists(file_path): @@ -636,12 +631,30 @@ class StereoKitti2015(StereoMatchingDataset): Kitti2015 testing image_2 + img1.png + img2.png + ... image_3 + img1.png + img2.png + ... training image_2 + img1.png + img2.png + ... image_3 + img1.png + img2.png + ... disp_occ_0 + img1.png + img2.png + ... disp_occ_1 + img1.png + img2.png + ... 
calib Args: @@ -656,23 +669,16 @@ def __init__(self, root: str, split: str = "train", transforms: Optional[Callabl verify_str_arg(split, "split", valid_values=("train", "test")) root = Path(root) / "Kitti2015" / (split + "ing") - imgs_left = sorted(glob(str(root / "image_2" / "*_10.png"))) - imgs_right = sorted(glob(str(root / "image_3" / "*_10.png"))) - - if not len(imgs_left) or not len(imgs_right): - raise FileNotFoundError("No images found in {}".format(root)) + left_img_pattern = str(root / "image_2" / "*.png") + right_img_pattern = str(root / "image_3" / "*.png") + self._images = self._scan_pairs(left_img_pattern, right_img_pattern) if split == "train": - disparity_maps_left = sorted(glob(str(root / "disp_occ_0" / "*.png"))) - disparity_maps_right = sorted(glob(str(root / "disp_occ_1" / "*.png"))) - if not len(disparity_maps_left) or not len(disparity_maps_right): - raise FileNotFoundError("No disparity maps found in {}".format(root)) - + left_disparity_pattern = str(root / "disp_occ_0" / "*.png") + right_disparity_pattern = str(root / "disp_occ_1" / "*.png") + self._disparities = self._scan_pairs(left_disparity_pattern, right_disparity_pattern) else: - disparity_maps_left, disparity_maps_right = list("" for _ in imgs_left), list("" for _ in imgs_right) - - self._images = list((left, right) for left, right in zip(imgs_left, imgs_right)) - self._disparities = list((left, right) for left, right in zip(disparity_maps_left, disparity_maps_right)) + self._disparities = list(("", "") for _ in self._images) def _read_disparity(self, file_path: str) -> Tuple: if not os.path.exists(file_path): @@ -737,19 +743,23 @@ def __init__(self, root: str, transforms: Optional[Callable] = None): root = Path(root) / "Sintel" - imgs_left = sorted(glob(str(root / "training" / "final_left" / "*" / "*.png"))) - imgs_right = sorted(glob(str(root / "training" / "final_right" / "*" / "*.png"))) - if not len(imgs_left) or not len(imgs_right): - raise FileNotFoundError("No images found in {}".format(root)) - - dps_masks_left = sorted(glob(str(root / "training" / "disparities" / "*" / "*.png"))) - if not len(dps_masks_left): - raise FileNotFoundError("No disparity maps found in {}".format(root)) - - disparity_maps_right = list("" for _ in dps_masks_left) - - self._images = list((left, right) for left, right in zip(imgs_left, imgs_right)) - self._disparities = list((left, right) for left, right in zip(dps_masks_left, disparity_maps_right)) + left_img_pattern = str(root / "training" / "final_left" / "*" / "*.png") + right_img_pattern = str(root / "training" / "final_right" / "*" / "*.png") + self._images = self._scan_pairs(left_img_pattern, right_img_pattern) + + disparity_pattern = str(root / "training" / "disparities" / "*" / "*.png") + self._disparities = self._scan_pairs(disparity_pattern, "", fill_empty=True) + + def _get_oclussion_mask_paths(self, file_path: str) -> List[str]: + path_tokens = file_path.split(os.sep) + for idx in range(len(path_tokens) - 1): + if path_tokens[idx] == "training" and path_tokens[idx + 1] == "disparities": + pre_tokens = path_tokens[: idx + 1] + post_tokens = path_tokens[idx + 2 :] + return ( + "/".join(pre_tokens + ["occlusions"] + post_tokens), + "/".join(pre_tokens + ["outofframe"] + post_tokens), + ) def _read_disparity(self, file_path: str) -> Tuple: if not os.path.exists(file_path): @@ -761,10 +771,12 @@ def _read_disparity(self, file_path: str) -> Tuple: disparity_map = r * 4 + g / (2 ** 6) + b / (2 ** 14) # reshape into (C, H, W) format disparity_map = 
np.transpose(disparity_map, (2, 0, 1)) - # occlusion mask - valid_mask = np.array(Image.open(file_path.replace("disparities", "occlusions"))) == 0 - # out of frame mask - off_mask = np.array(Image.open(file_path.replace("disparities", "outofframe"))) == 0 + # find the appropriate file paths + occluded_mask_path, out_of_frame_mask_path = self._get_oclussion_mask_paths(file_path) + # occlusion masks + valid_mask = np.array(Image.open(occluded_mask_path)) == 0 + # out of frame masks + off_mask = np.array(Image.open(out_of_frame_mask_path)) == 0 # combine the masks together valid_mask = np.logical_and(off_mask, valid_mask) return disparity_map, valid_mask @@ -842,27 +854,13 @@ def __init__( root = root / split for p in passes: - imgs_left = sorted(glob(str(root / p / "*" / "left" / "*.png"))) - imgs_right = sorted(glob(str(root / p / "*" / "right" / "*.png"))) - if not len(imgs_left) or not len(imgs_right): - raise FileNotFoundError("No images found in {}".format(root / p)) - - imgs = list((left, right) for left, right in zip(imgs_left, imgs_right)) - self._images += imgs - - disparity_maps_left = [file_path.replace(p, "disparity").replace(".png", ".pfm") for file_path in imgs_left] - disparity_maps_right = [ - file_path.replace(p, "disparity").replace(".png", ".pfm") for file_path in imgs_right - ] + left_img_pattern = str(root / p / "*" / "left" / "*.png") + right_img_pattern = str(root / p / "*" / "right" / "*.png") + self._images += self._scan_pairs(left_img_pattern, right_img_pattern) - if not any(os.path.exists(file_path) for file_path in disparity_maps_left): - raise FileNotFoundError("No disparity valid maps found in {}".format(root / "disparity")) - - if not any(os.path.exists(file_path) for file_path in disparity_maps_right): - raise FileNotFoundError("No disparity valid maps found in {}".format(root / "disparity")) - - disparity_maps = list((left, right) for left, right in zip(disparity_maps_left, disparity_maps_right)) - self._disparities += disparity_maps + left_disparity_pattern = str(root / "disparity" / "*" / "left" / "*.pfm") + right_disparity_pattern = str(root / "disparity" / "*" / "right" / "*.pfm") + self._disparities += self._scan_pairs(left_disparity_pattern, right_disparity_pattern) def _read_disparity(self, file_path: str) -> Tuple: disparity = _read_pfm_file(file_path) @@ -933,21 +931,13 @@ def __init__(self, root: str, split: str = "single", transforms: Optional[Callab }[split] for s in splits: - imgs_left = sorted(glob(str(root / s / "*" / "*.left.jpg"))) - imgs_right = sorted(glob(str(root / s / "*" / "*.right.jpg"))) - if not len(imgs_left) or not len(imgs_right): - raise FileNotFoundError("No images found in {}".format(root)) - - imgs = list((left, right) for left, right in zip(imgs_left, imgs_right)) - self._images += imgs - - disparity_maps_left = sorted(glob(str(root / s / "*" / "*.left.depth.png"))) - disparity_maps_right = sorted(glob(str(root / s / "*" / "*.right.depth.png"))) - if not len(disparity_maps_left) or not len(disparity_maps_right): - raise FileNotFoundError("No disparity maps found in {}".format(root)) + left_img_pattern = str(root / s / "*" / "*.left.jpg") + right_img_pattern = str(root / s / "*" / "*.right.jpg") + self._images += self._scan_pairs(left_img_pattern, right_img_pattern) - disparity_maps = list((left, right) for left, right in zip(disparity_maps_left, disparity_maps_right)) - self._disparities += disparity_maps + left_disparity_pattern = str(root / s / "*" / "*.left.depth.png") + right_disparity_pattern = str(root / s / "*" / 
"*.right.depth.png") + self._disparities += self._scan_pairs(left_disparity_pattern, right_disparity_pattern) def _read_disparity(self, file_path: str) -> Tuple: # (H, W) image @@ -1007,26 +997,13 @@ def __init__(self, root: str, split: str = "train", transforms: Optional[Callabl verify_str_arg(split, "split", valid_values=("train", "test")) - imgs_left = sorted(glob(str(root / "*" / "left.png"))) - imgs_right = list(p.replace("left", "right") for p in imgs_left) - - if not len(imgs_left) or not len(imgs_right): - raise FileNotFoundError("No images found in {}".format(root)) - - imgs = list((left, right) for left, right in zip(imgs_left, imgs_right)) - self._images = imgs # type: ignore - - disparity_maps_left = list(p.replace("left", "left_disp") for p in imgs_left) - disparity_maps_right = list(p.replace("right", "right_disp") for p in imgs_left) - - if not any(os.path.exists(file_path) for file_path in disparity_maps_left): - raise FileNotFoundError("No disparity valid maps found in {}".format(root)) - - if not any(os.path.exists(file_path) for file_path in disparity_maps_right): - raise FileNotFoundError("No disparity valid maps found in {}".format(root)) + left_img_pattern = str(root / "*" / "left.png") + right_img_pattern = str(root / "*" / "right.png") + self._images = self._scan_pairs(left_img_pattern, right_img_pattern) - disparity_maps = list((left, right) for left, right in zip(disparity_maps_left, disparity_maps_right)) - self._disparities = disparity_maps # type: ignore + left_disparity_pattern = str(root / "*" / "left_disp.png") + right_disparity_pattern = str(root / "*" / "right_disp.png") + self._disparities = self._scan_pairs(left_disparity_pattern, right_disparity_pattern) def _read_disparity(self, file_path: str) -> Tuple: disparity = np.array(Image.open(file_path), dtype=np.float32) From 07e00676f29de1f08d36466c08a3fc98271e80e1 Mon Sep 17 00:00:00 2001 From: Ponku Date: Thu, 14 Jul 2022 18:38:00 +0100 Subject: [PATCH 26/35] Addressed doc comments --- torchvision/datasets/_stereo_matching.py | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/torchvision/datasets/_stereo_matching.py b/torchvision/datasets/_stereo_matching.py index f7ced224bf6..053195459d5 100644 --- a/torchvision/datasets/_stereo_matching.py +++ b/torchvision/datasets/_stereo_matching.py @@ -44,7 +44,7 @@ def __init__(self, root: str, transforms: Optional[Callable] = None): disparities is a Tuple of (``np.ndarray``, ``np.ndarray``) with shape (1, H, W) valid_masks is a Tuple of (``np.ndarray``, ``np.ndarray``) with shape (H, W) - In some cases, when a dataset does not provide disparties, the ``disparities`` and + In some cases, when a dataset does not provide disparities, the ``disparities`` and ``valid_masks`` can be Tuples containing None values. For training splits generally the datasets provide a minimal guarantee of @@ -427,10 +427,14 @@ def __getitem__(self, index: int) -> Tuple: return super().__getitem__(index) def _read_img(self, file_path: str) -> Image.Image: - """Function that reads either the original right image or an augmented view when ``use_ambient_views`` is True.""" + """ + Function that reads either the original right image or an augmented view when ``use_ambient_views`` is True. + When ``use_ambient_views`` is True, the dataset will return at random one of ``[im1.png, im1E.png, im1L.png]`` + as the right image. 
+ """ if os.path.basename(file_path) == "im1.png" and self.use_ambient_views: # initialize sampleable container - base_path = os.path.basename(file_path)[0] + base_path = os.path.dirname(file_path) ambient_file_paths = list(os.path.join(base_path, view_name) for view_name in ["im1E.png", "im1L.png"]) # double check that we're not going to try to read from an invalid file path ambient_file_paths = list(filter(lambda p: os.path.exists(p), ambient_file_paths)) @@ -765,7 +769,7 @@ def _read_disparity(self, file_path: str) -> Tuple: if not os.path.exists(file_path): return None, None - # disparity decoding as per Sintel instructions + # disparity decoding as per Sintel instructions in the README provided with the dataset disparity_map = np.array(Image.open(file_path), dtype=np.float32) r, g, b = np.split(disparity_map, 3, axis=-1) disparity_map = r * 4 + g / (2 ** 6) + b / (2 ** 14) @@ -945,10 +949,11 @@ def _read_disparity(self, file_path: str) -> Tuple: # as per https://research.nvidia.com/sites/default/files/pubs/2018-06_Falling-Things/readme_0.txt # in order to extract disparity from depth maps with open(os.path.split(file_path)[0] + "/_camera_settings.json", "r") as f: + # inverse of depth-from-disparity equation: depth = (baseline * focal) / (disparity * pixel_constatnt) intrinsics = json.load(f) - fx = intrinsics["camera_settings"][0]["intrinsic_settings"]["fx"] - # inverse of depth-from-disparity equation - disparity = (fx * 6.0 * 100) / depth.astype(np.float32) + focal = intrinsics["camera_settings"][0]["intrinsic_settings"]["fx"] + baseline, pixel_constant = 6.0, 100.0 # pixel constant is inverted + disparity = (baseline * focal * pixel_constant) / depth.astype(np.float32) valid = disparity > 0 # unsqueeze disparity to (C, H, W) disparity = disparity[None, :, :] From ec550e84238f6e7d993469352fb0294d3dfdcedd Mon Sep 17 00:00:00 2001 From: Ponku Date: Thu, 14 Jul 2022 19:50:47 +0100 Subject: [PATCH 27/35] Middlebury disparity quickfix --- torchvision/datasets/_stereo_matching.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/torchvision/datasets/_stereo_matching.py b/torchvision/datasets/_stereo_matching.py index 053195459d5..6a4d2e48999 100644 --- a/torchvision/datasets/_stereo_matching.py +++ b/torchvision/datasets/_stereo_matching.py @@ -417,11 +417,12 @@ def __init__( if split == "test": self._disparities += list(("", "") for _ in self._images) else: - left_dispartity_pattern = str(root / split / "*" / "disp0.pfm") - right_dispartity_pattern = str(root / split / "*" / "disp1.pfm") + left_dispartity_pattern = str(root / split / scene_pattern / "disp0.pfm") + right_dispartity_pattern = str(root / split / scene_pattern / "disp1.pfm") self._disparities += self._scan_pairs(left_dispartity_pattern, right_dispartity_pattern) self.use_ambient_views = use_ambient_views + print(self._disparities[0], self._images[0]) def __getitem__(self, index: int) -> Tuple: return super().__getitem__(index) From 1dd17538d98f960bcdfa35cab7fd51efbc73cbbf Mon Sep 17 00:00:00 2001 From: Ponku Date: Thu, 14 Jul 2022 20:44:43 +0100 Subject: [PATCH 28/35] Fixed mypy errors. Addressed download checks. 
--- torchvision/datasets/_stereo_matching.py | 72 ++++++++++++++++-------- 1 file changed, 47 insertions(+), 25 deletions(-) diff --git a/torchvision/datasets/_stereo_matching.py b/torchvision/datasets/_stereo_matching.py index 6a4d2e48999..d40616cb835 100644 --- a/torchvision/datasets/_stereo_matching.py +++ b/torchvision/datasets/_stereo_matching.py @@ -60,8 +60,8 @@ def __init__(self, root: str, transforms: Optional[Callable] = None): super().__init__(root=root) self.transforms = transforms - self._images: List[Tuple] = [] - self._disparities: List[Tuple] = [] + self._images: List[Tuple[str, str]] = [] + self._disparities: List[Tuple[str, str]] = [] def _read_img(self, file_path: str) -> Image.Image: img = Image.open(file_path) @@ -69,25 +69,27 @@ def _read_img(self, file_path: str) -> Image.Image: img = img.convert("RGB") return img - def _scan_pairs(self, left_pattern: str, right_pattern: str, fill_empty: bool = False) -> List[Tuple[str, str]]: - left_paths = sorted(glob(left_pattern)) - right_paths = sorted(glob(right_pattern)) + def _scan_pairs( + self, paths_left_pattern: str, paths_right_pattern: str, fill_empty: bool = False + ) -> List[Tuple[str, str]]: + left_paths: List[str] = sorted(glob(paths_left_pattern)) + right_paths: List[str] = sorted(glob(paths_right_pattern)) # used when dealing with inexistent disparity for the right image if fill_empty: right_paths = list("" for _ in left_paths) if not left_paths: - raise FileNotFoundError(f"Could not find any files matching the patterns: {left_pattern}") + raise FileNotFoundError(f"Could not find any files matching the patterns: {paths_left_pattern}") if not right_paths: - raise FileNotFoundError(f"Could not find any files matching the patterns: {right_pattern}") + raise FileNotFoundError(f"Could not find any files matching the patterns: {paths_right_pattern}") if len(left_paths) != len(right_paths): raise ValueError( f"Found {len(left_paths)} left files but {len(right_paths)} right files using:\n " - f"left pattern: {left_pattern}\n" - f"right pattern: {right_pattern}\n" + f"left pattern: {paths_left_pattern}\n" + f"right pattern: {paths_right_pattern}\n" ) images = list((left, right) for left, right in zip(left_paths, right_paths)) @@ -387,6 +389,7 @@ def __init__( self._download_dataset(root) root = Path(root) / "Middlebury2014" + self.split = split if not os.path.exists(root / split): raise FileNotFoundError(f"The {split} directory was not found in the provided root directory") @@ -457,7 +460,9 @@ def _download_dataset(self, root: str): base_url = "https://vision.middlebury.edu/stereo/data/scenes2014/zip" # train and additional splits have 2 different calibration settings root = Path(root) / "Middlebury2014" - for split_name, split_scenes in self.splits.items(): + download_split = self.split + + for split_name, split_scenes in (download_split, self.splits[download_split]): if split_name == "test": continue split_root = root / split_name @@ -465,11 +470,16 @@ def _download_dataset(self, root: str): for calibration in ["perfect", "imperfect"]: scene_name = f"{scene}-{calibration}" scene_url = f"{base_url}/{scene_name}.zip" - download_and_extract_archive( - url=scene_url, filename=f"{scene_name}.zip", download_root=str(split_root), remove_finished=True - ) - - if any(s not in os.listdir(root) for s in self.splits["test"]): + # download the scene only if it doesn't exist + if not os.path.exists(split_root / scene_name): + download_and_extract_archive( + url=scene_url, + filename=f"{scene_name}.zip", + 
download_root=str(split_root), + remove_finished=True, + ) + + if any(s not in os.listdir(root / "test") for s in self.splits["test"]): # test split is downloaded from a different location test_set_url = "https://vision.middlebury.edu/stereo/submit3/zip/MiddEval3-data-F.zip" @@ -550,13 +560,13 @@ def __init__(self, root: str, split: str = "train", transforms: Optional[Callabl left_img_pattern = str(root / img_dir / "*" / "im0.png") right_img_pattern = str(root / img_dir / "*" / "im1.png") - self._images = self._scan_pairs(left_img_pattern, right_img_pattern) + self._images += self._scan_pairs(left_img_pattern, right_img_pattern) if split == "test": self._disparities = list(("", "") for _ in self._images) else: disparity_pattern = str(root / anot_dir / "*" / "disp0GT.pfm") - self._disparities = self._scan_pairs(disparity_pattern, "", fill_empty=True) + self._disparities += self._scan_pairs(disparity_pattern, "", fill_empty=True) def _read_disparity(self, file_path: str) -> Tuple: if not os.path.exists(file_path): @@ -605,11 +615,11 @@ def __init__(self, root: str, split: str = "train", transforms: Optional[Callabl left_img_pattern = str(root / "colored_0" / "*_10.png") right_img_pattern = str(root / "colored_1" / "*_10.png") - self._images = self._scan_pairs(left_img_pattern, right_img_pattern) + self._images += self._scan_pairs(left_img_pattern, right_img_pattern) if split == "train": disparity_pattern = str(root / "disp_noc" / "*.png") - self._disparities = self._scan_pairs(disparity_pattern, "", fill_empty=True) + self._disparities += self._scan_pairs(disparity_pattern, "", fill_empty=True) else: self._disparities = list(("", "") for _ in self._images) @@ -676,12 +686,12 @@ def __init__(self, root: str, split: str = "train", transforms: Optional[Callabl root = Path(root) / "Kitti2015" / (split + "ing") left_img_pattern = str(root / "image_2" / "*.png") right_img_pattern = str(root / "image_3" / "*.png") - self._images = self._scan_pairs(left_img_pattern, right_img_pattern) + self._images += self._scan_pairs(left_img_pattern, right_img_pattern) if split == "train": left_disparity_pattern = str(root / "disp_occ_0" / "*.png") right_disparity_pattern = str(root / "disp_occ_1" / "*.png") - self._disparities = self._scan_pairs(left_disparity_pattern, right_disparity_pattern) + self._disparities += self._scan_pairs(left_disparity_pattern, right_disparity_pattern) else: self._disparities = list(("", "") for _ in self._images) @@ -750,21 +760,33 @@ def __init__(self, root: str, transforms: Optional[Callable] = None): left_img_pattern = str(root / "training" / "final_left" / "*" / "*.png") right_img_pattern = str(root / "training" / "final_right" / "*" / "*.png") - self._images = self._scan_pairs(left_img_pattern, right_img_pattern) + self._images += self._scan_pairs(left_img_pattern, right_img_pattern) disparity_pattern = str(root / "training" / "disparities" / "*" / "*.png") - self._disparities = self._scan_pairs(disparity_pattern, "", fill_empty=True) + self._disparities += self._scan_pairs(disparity_pattern, "", fill_empty=True) - def _get_oclussion_mask_paths(self, file_path: str) -> List[str]: + def _get_oclussion_mask_paths(self, file_path: str) -> Tuple[str, str]: path_tokens = file_path.split(os.sep) + rets = None + for idx in range(len(path_tokens) - 1): if path_tokens[idx] == "training" and path_tokens[idx + 1] == "disparities": pre_tokens = path_tokens[: idx + 1] post_tokens = path_tokens[idx + 2 :] - return ( + rets = ( "/".join(pre_tokens + ["occlusions"] + post_tokens), 
"/".join(pre_tokens + ["outofframe"] + post_tokens), ) + break + + if rets is None: + raise ValueError("Malformed file path: {}".format(file_path)) + + for path in rets: + if not os.path.exists(path): + raise ValueError(f"Could not find file {path}") + + return rets def _read_disparity(self, file_path: str) -> Tuple: if not os.path.exists(file_path): From 9f70687e055a30cd91439f99c12503202c9f0c2e Mon Sep 17 00:00:00 2001 From: Ponku Date: Fri, 15 Jul 2022 12:30:07 +0100 Subject: [PATCH 29/35] Dataset renaming. Test changes. getitem removed. Warnings removed. Middlebury per split download. --- test/datasets_utils.py | 22 ++-- test/test_datasets.py | 80 ++++++------ torchvision/datasets/__init__.py | 14 +-- torchvision/datasets/_stereo_matching.py | 150 +++++++++-------------- 4 files changed, 114 insertions(+), 152 deletions(-) diff --git a/test/datasets_utils.py b/test/datasets_utils.py index a643c43685a..b0c31c71116 100644 --- a/test/datasets_utils.py +++ b/test/datasets_utils.py @@ -24,7 +24,7 @@ import torchvision.datasets import torchvision.io from common_utils import get_tmp_dir, disable_console_output - +from torchvision.transforms.functional import get_dimensions __all__ = [ "UsageError", @@ -937,15 +937,15 @@ def create_random_string(length: int, *digits: str) -> str: def shape_test_for_stereo_disp( left: PIL.Image.Image, right: PIL.Image.Image, disparity: np.ndarray, valid_mask: np.ndarray ): - left_array = np.array(left) - right_array = np.array(right) - h, w, c = left_array.shape + left_dims = get_dimensions(left) + right_dims = get_dimensions(right) + c, h, w = left_dims # check that left and right are the same size - assert left_array.shape == right_array.shape + assert left_dims == right_dims # check general shapes assert c == 3 - assert len(disparity.shape) == 3 - assert len(valid_mask.shape) == 2 + assert disparity.ndim == 3 + assert valid_mask.ndim == 2 assert disparity.shape == (1, h, w) # check that valid mask is the same size as the disparity _, dh, dw = disparity.shape @@ -955,11 +955,11 @@ def shape_test_for_stereo_disp( def shape_test_for_stereo_none(left: PIL.Image.Image, right: PIL.Image.Image, disparity: None, valid_mask: None): - left_array = np.array(left) - right_array = np.array(right) - _, _, c = left_array.shape + left_dims = get_dimensions(left) + right_dims = get_dimensions(right) + c, _, _ = left_dims # check that left and right are the same size - assert left_array.shape == right_array.shape + assert left_dims == right_dims # check general shapes assert c == 3 assert disparity is None diff --git a/test/test_datasets.py b/test/test_datasets.py index 8ba77244c2f..77f3ee4e019 100644 --- a/test/test_datasets.py +++ b/test/test_datasets.py @@ -2672,8 +2672,8 @@ def inject_fake_data(self, tmpdir: str, config): return len(sampled_classes) * num_images_per_class[config["split"]] -class StereoETH3DTestCase(datasets_utils.ImageDatasetTestCase): - DATASET_CLASS = datasets.StereoETH3D +class ETH3DTStereoestCase(datasets_utils.ImageDatasetTestCase): + DATASET_CLASS = datasets.ETH3DStereo ADDITIONAL_CONFIGS = datasets_utils.combinations_grid(split=("train", "test")) FEATURE_TYPES = (PIL.Image.Image, PIL.Image.Image, (np.ndarray, type(None)), (np.ndarray, type(None))) @@ -2745,41 +2745,37 @@ def test_bad_input(self): class CREStereoTestCase(datasets_utils.ImageDatasetTestCase): DATASET_CLASS = datasets.CREStereo - ADDITIONAL_CONFIGS = datasets_utils.combinations_grid(split=("tree", "shapenet", "reflective", "hole")) FEATURE_TYPES = (PIL.Image.Image, PIL.Image.Image, 
(np.ndarray, type(None)), (np.ndarray, type(None))) def inject_fake_data(self, tmpdir, config): crestereo_dir = pathlib.Path(tmpdir) / "CREStereo" os.makedirs(crestereo_dir, exist_ok=True) - split_dir = crestereo_dir / config["split"] - os.makedirs(split_dir, exist_ok=True) + examples = {"tree": 2, "shapenet": 3, "reflective": 6, "hole": 5} - num_examples = {"tree": 2, "shapenet": 3, "reflective": 6, "hole": 5}.get(config["split"], 0) + for category_name in ["shapenet", "reflective", "tree", "hole"]: + split_dir = crestereo_dir / category_name + os.makedirs(split_dir, exist_ok=True) + num_examples = examples[category_name] - for idx in range(num_examples): - datasets_utils.create_image_file(root=split_dir, name=f"{idx}_left.jpg", size=(100, 100)) - datasets_utils.create_image_file(root=split_dir, name=f"{idx}_right.jpg", size=(100, 100)) - # these are going to end up being gray scale images - datasets_utils.create_image_file(root=split_dir, name=f"{idx}_left.disp.jpg", size=(1, 100, 100)) - datasets_utils.create_image_file(root=split_dir, name=f"{idx}_right.disp.jpg", size=(1, 100, 100)) + for idx in range(num_examples): + p = datasets_utils.create_image_file(root=split_dir, name=f"{idx}_left.jpg", size=(100, 100)) + print(p) + datasets_utils.create_image_file(root=split_dir, name=f"{idx}_right.jpg", size=(100, 100)) + # these are going to end up being gray scale images + datasets_utils.create_image_file(root=split_dir, name=f"{idx}_left.disp.png", size=(1, 100, 100)) + datasets_utils.create_image_file(root=split_dir, name=f"{idx}_right.disp.png", size=(1, 100, 100)) - return num_examples + return sum(examples.values()) def test_splits(self): - for split in ("tree", "shapenet", "reflective", "hole"): - with self.create_dataset(split=split) as (dataset, _): - for left, right, disparity, valid_mask in dataset: - datasets_utils.shape_test_for_stereo_disp(left, right, disparity, valid_mask) - - def test_bad_input(self): - with pytest.raises(ValueError, match="Unknown value 'bad' for argument split"): - with self.create_dataset(split="bad"): - pass + with self.create_dataset() as (dataset, _): + for left, right, disparity, valid_mask in dataset: + datasets_utils.shape_test_for_stereo_disp(left, right, disparity, valid_mask) -class StereoMiddlebury2014TestCase(datasets_utils.ImageDatasetTestCase): - DATASET_CLASS = datasets.StereoMiddlebury2014 +class Middlebury2014StereoTestCase(datasets_utils.ImageDatasetTestCase): + DATASET_CLASS = datasets.Middlebury2014Stereo ADDITIONAL_CONFIGS = datasets_utils.combinations_grid( split=("train", "additional"), calibration=("perfect", "imperfect", "both"), @@ -2789,7 +2785,7 @@ class StereoMiddlebury2014TestCase(datasets_utils.ImageDatasetTestCase): @staticmethod def _make_scene_folder(root_dir: str, scene_name: str, split: str) -> List[str]: - calibrations = [""] if split == "test" else ["-perfect", "-imperfect"] + calibrations = [None] if split == "test" else ["-perfect", "-imperfect"] scene_dirs = [] for c in calibrations: scene_dir = os.path.join(root_dir, f"{scene_name}{c}") @@ -2851,9 +2847,9 @@ def test_warnings_train(self): # train set invalid split = "train" calibration = None - with pytest.warns( - RuntimeWarning, - match=f"\nSplit '{split}' has calibration settings, however None was provided as an argument." + with pytest.raises( + ValueError, + match=f"Split '{split}' has calibration settings, however None was provided as an argument." f"\nSetting calibration to 'perfect' for split '{split}'. 
Available calibration settings are: 'perfect', 'imperfect', 'both'.", ): with self.create_dataset(split=split, calibration=calibration): @@ -2863,8 +2859,8 @@ def test_warnings_test(self): # test set invalid split = "test" calibration = "perfect" - with pytest.warns( - RuntimeWarning, match="\nSplit 'test' has only no calibration settings, ignoring calibration argument." + with pytest.raises( + ValueError, match="Split 'test' has only no calibration settings, please set `calibration=None`." ): with self.create_dataset(split=split, calibration=calibration): pass @@ -2875,8 +2871,8 @@ def test_bad_input(self): pass -class StereoKitti2012TestCase(datasets_utils.ImageDatasetTestCase): - DATASET_CLASS = datasets.StereoKitti2012 +class Kitti2012StereoTestCase(datasets_utils.ImageDatasetTestCase): + DATASET_CLASS = datasets.Kitti2012Stereo ADDITIONAL_CONFIGS = datasets_utils.combinations_grid(split=("train", "test")) FEATURE_TYPES = (PIL.Image.Image, PIL.Image.Image, (np.ndarray, type(None)), (np.ndarray, type(None))) @@ -2934,8 +2930,8 @@ def test_bad_input(self): pass -class StereoKitti2015TestCase(datasets_utils.ImageDatasetTestCase): - DATASET_CLASS = datasets.StereoKitti2015 +class Kitti2015StereoTestCase(datasets_utils.ImageDatasetTestCase): + DATASET_CLASS = datasets.Kitti2015Stereo ADDITIONAL_CONFIGS = datasets_utils.combinations_grid(split=("train", "test")) FEATURE_TYPES = (PIL.Image.Image, PIL.Image.Image, (np.ndarray, type(None)), (np.ndarray, type(None))) @@ -3002,8 +2998,8 @@ def test_bad_input(self): pass -class StereoSceneFlowTestCase(datasets_utils.ImageDatasetTestCase): - DATASET_CLASS = datasets.StereoSceneFlow +class SceneFlowStereoTestCase(datasets_utils.ImageDatasetTestCase): + DATASET_CLASS = datasets.SceneFlowStereo ADDITIONAL_CONFIGS = datasets_utils.combinations_grid( split=("FlyingThings3D", "Driving", "Monkaa"), pass_name=("clean", "final") ) @@ -3079,8 +3075,8 @@ def test_bad_input(self): pass -class StereoFallingThingsTestCase(datasets_utils.ImageDatasetTestCase): - DATASET_CLASS = datasets.StereoFallingThings +class FallingThingsStereoTestCase(datasets_utils.ImageDatasetTestCase): + DATASET_CLASS = datasets.FallingThingsStereo ADDITIONAL_CONFIGS = datasets_utils.combinations_grid(split=("single", "mixed")) FEATURE_TYPES = (PIL.Image.Image, PIL.Image.Image, (np.ndarray, type(None)), (np.ndarray, type(None))) @@ -3100,10 +3096,10 @@ def _make_scene_folder(root: str, scene_name: str, size: Tuple[int, int]) -> Lis paths.append(datasets_utils.create_image_file(root, "image1.right.jpg", size=(3, size[1], size[0]))) # single channel depth maps paths.append( - StereoFallingThingsTestCase._make_dummy_depth_map(root, "image1.left.depth.png", size=(size[0], size[1])) + FallingThingsStereoTestCase._make_dummy_depth_map(root, "image1.left.depth.png", size=(size[0], size[1])) ) paths.append( - StereoFallingThingsTestCase._make_dummy_depth_map(root, "image1.right.depth.png", size=(size[0], size[1])) + FallingThingsStereoTestCase._make_dummy_depth_map(root, "image1.right.depth.png", size=(size[0], size[1])) ) # camera settings json. 
Minimal example for _read_disparity function testing settings_json = {"camera_settings": [{"intrinsic_settings": {"fx": 1}}]} @@ -3142,8 +3138,8 @@ def test_bad_input(self): pass -class StereoSintelTestCase(datasets_utils.ImageDatasetTestCase): - DATASET_CLASS = datasets.StereoSintel +class SintelStereoTestCase(datasets_utils.ImageDatasetTestCase): + DATASET_CLASS = datasets.SintelStereo FEATURE_TYPES = (PIL.Image.Image, PIL.Image.Image, (np.ndarray, type(None)), (np.ndarray, type(None))) def inject_fake_data(self, tmpdir, config): diff --git a/torchvision/datasets/__init__.py b/torchvision/datasets/__init__.py index 973d5ca9f7e..8e0e6f274d1 100644 --- a/torchvision/datasets/__init__.py +++ b/torchvision/datasets/__init__.py @@ -1,12 +1,12 @@ from ._optical_flow import KittiFlow, Sintel, FlyingChairs, FlyingThings3D, HD1K from ._stereo_matching import ( - StereoETH3D, - StereoFallingThings, - StereoKitti2012, - StereoKitti2015, - StereoMiddlebury2014, - StereoSceneFlow, - StereoSintel, + ETH3DStereo, + FallingThingsStereo, + Kitti2012Stereo, + Kitti2015Stereo, + Middlebury2014Stereo, + SceneFlowStereo, + SintelStereo, CREStereo, InStereo2k, ) diff --git a/torchvision/datasets/_stereo_matching.py b/torchvision/datasets/_stereo_matching.py index d40616cb835..474b82adcc0 100644 --- a/torchvision/datasets/_stereo_matching.py +++ b/torchvision/datasets/_stereo_matching.py @@ -3,7 +3,6 @@ import os import random import shutil -import warnings from abc import ABC, abstractmethod from glob import glob from pathlib import Path @@ -17,13 +16,13 @@ __all__ = ( "CREStereo" - "StereoMiddlebury2014" - "StereoETH3D" - "StereoKitti2012" - "StereoKitti2015" - "StereoSintel" - "StereoSceneFlow" - "StereoFallingThings" + "Middlebury2014Stereo" + "ETH3DStereo" + "Kitti2012Stereo" + "Kitti2015Stereo" + "SintelStereo" + "SceneFlowStereo" + "FallingThingsStereo" "InStereo2k" ) @@ -188,7 +187,6 @@ class CREStereo(StereoMatchingDataset): def __init__( self, root: str, - split: str = "tree", transforms: Optional[Callable] = None, download: bool = False, max_disparity: float = 256.0, @@ -210,29 +208,22 @@ def __init__( ) self._download_dataset(str(root)) - verify_str_arg(split, "split", valid_values=("tree", "shapenet", "reflective", "hole", "all")) + dirs = ["shapenet", "reflective", "tree", "hole"] - splits = { - "tree": ["tree"], - "shapenet": ["shapenet"], - "reflective": ["reflective"], - "hole": ["hole"], - "all": ["hole", "shapenet", "reflective", "hole"], - }[split] - - for s in splits: + for s in dirs: left_image_pattern = str(root / s / "*_left.jpg") right_image_pattern = str(root / s / "*_right.jpg") + print(left_image_pattern, right_image_pattern) imgs = self._scan_pairs(left_image_pattern, right_image_pattern) self._images += imgs - left_disparity_pattern = str(root / s / "*_left.disp.jpg") - right_disparity_pattern = str(root / s / "*_right.disp.jpg") + left_disparity_pattern = str(root / s / "*_left.disp.png") + right_disparity_pattern = str(root / s / "*_right.disp.png") disparities = self._scan_pairs(left_disparity_pattern, right_disparity_pattern) self._disparities += disparities def _read_disparity(self, file_path: str) -> Tuple: - disparity = np.array(Image.open(file_path), dtype=np.float32) + disparity = np.asarray(Image.open(file_path), dtype=np.float32) valid = (disparity < self.max_disparity) & (disparity > 0.0) # unsqueeze the disparity map into (C, H, W) format disparity = disparity[None, :, :] @@ -251,7 +242,7 @@ def _download_dataset(self, root: str) -> None: 
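A side note on the ``__all__`` tuple touched earlier in this patch: the renamed entries still have no commas between them, so Python's implicit string-literal concatenation collapses them into one long string rather than a tuple of export names. A comma-separated tuple, as presumably intended, would read:

__all__ = (
    "CREStereo",
    "Middlebury2014Stereo",
    "ETH3DStereo",
    "Kitti2012Stereo",
    "Kitti2015Stereo",
    "SintelStereo",
    "SceneFlowStereo",
    "FallingThingsStereo",
    "InStereo2k",
)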
download_and_extract_archive(url=url, download_root=d_path, remove_finished=True) -class StereoMiddlebury2014(StereoMatchingDataset): +class Middlebury2014Stereo(StereoMatchingDataset): """Publicly available scenes from the Middlebury dataset `2014 version `. The dataset mostly follows the original format, without containing the ambient subdirectories. : :: @@ -368,28 +359,23 @@ def __init__( super().__init__(root, transforms) verify_str_arg(split, "split", valid_values=("train", "test", "additional")) + self.split = split if calibration: - verify_str_arg(calibration, "calibration", valid_values=("perfect", "imperfect", "both")) # type: ignore + verify_str_arg(calibration, "calibration", valid_values=("perfect", "imperfect", "both", None)) # type: ignore if split == "test": - calibration = None - warnings.warn( - "\nSplit 'test' has only no calibration settings, ignoring calibration argument.", RuntimeWarning - ) + raise ValueError("Split 'test' has only no calibration settings, please set `calibration=None`.") else: if split != "test": - calibration = "perfect" - warnings.warn( - f"\nSplit '{split}' has calibration settings, however None was provided as an argument." + raise ValueError( + f"Split '{split}' has calibration settings, however None was provided as an argument." f"\nSetting calibration to 'perfect' for split '{split}'. Available calibration settings are: 'perfect', 'imperfect', 'both'.", - RuntimeWarning, ) if download: self._download_dataset(root) root = Path(root) / "Middlebury2014" - self.split = split if not os.path.exists(root / split): raise FileNotFoundError(f"The {split} directory was not found in the provided root directory") @@ -425,10 +411,6 @@ def __init__( self._disparities += self._scan_pairs(left_dispartity_pattern, right_dispartity_pattern) self.use_ambient_views = use_ambient_views - print(self._disparities[0], self._images[0]) - - def __getitem__(self, index: int) -> Tuple: - return super().__getitem__(index) def _read_img(self, file_path: str) -> Image.Image: """ @@ -460,16 +442,15 @@ def _download_dataset(self, root: str): base_url = "https://vision.middlebury.edu/stereo/data/scenes2014/zip" # train and additional splits have 2 different calibration settings root = Path(root) / "Middlebury2014" - download_split = self.split + split_name = self.split - for split_name, split_scenes in (download_split, self.splits[download_split]): - if split_name == "test": - continue - split_root = root / split_name - for scene in split_scenes: + if split_name != "test": + for split_scene in self.splits[split_name]: + split_root = root / split_name for calibration in ["perfect", "imperfect"]: - scene_name = f"{scene}-{calibration}" + scene_name = f"{split_scene}-{calibration}" scene_url = f"{base_url}/{scene_name}.zip" + print(f"Downloading {scene_url}") # download the scene only if it doesn't exist if not os.path.exists(split_root / scene_name): download_and_extract_archive( @@ -478,26 +459,26 @@ def _download_dataset(self, root: str): download_root=str(split_root), remove_finished=True, ) - - if any(s not in os.listdir(root / "test") for s in self.splits["test"]): - # test split is downloaded from a different location - test_set_url = "https://vision.middlebury.edu/stereo/submit3/zip/MiddEval3-data-F.zip" - - # the unzip is going to produce a directory MiddEval3 with two subdirectories trainingF and testF - # we want to move the contents from testF into the directory - download_and_extract_archive(url=test_set_url, download_root=str(root), remove_finished=True) - for 
scene_dir, scene_names, _ in os.walk(str(root / "MiddEval3/testF")): - for scene in scene_names: - scene_dst_dir = root / "test" / scene - scene_src_dir = Path(scene_dir) / scene - os.makedirs(scene_dst_dir, exist_ok=True) - shutil.move(str(scene_src_dir), str(scene_dst_dir)) - - # cleanup MiddEval3 directory - shutil.rmtree(str(root / "MiddEval3")) - - -class StereoETH3D(StereoMatchingDataset): + else: + os.makedirs(root / "test") + if any(s not in os.listdir(root / "test") for s in self.splits["test"]): + # test split is downloaded from a different location + test_set_url = "https://vision.middlebury.edu/stereo/submit3/zip/MiddEval3-data-F.zip" + # the unzip is going to produce a directory MiddEval3 with two subdirectories trainingF and testF + # we want to move the contents from testF into the directory + download_and_extract_archive(url=test_set_url, download_root=str(root), remove_finished=True) + for scene_dir, scene_names, _ in os.walk(str(root / "MiddEval3/testF")): + for scene in scene_names: + scene_dst_dir = root / "test" + scene_src_dir = Path(scene_dir) / scene + os.makedirs(scene_dst_dir, exist_ok=True) + shutil.move(str(scene_src_dir), str(scene_dst_dir)) + + # cleanup MiddEval3 directory + shutil.rmtree(str(root / "MiddEval3")) + + +class ETH3DStereo(StereoMatchingDataset): """ "ETH3D `Low-Res Two-View `_ dataset. The dataset is expected to have the following structure: :: @@ -575,14 +556,11 @@ def _read_disparity(self, file_path: str) -> Tuple: disparity_map = _read_pfm_file(file_path) mask_path = os.path.join(os.path.split(file_path)[0], "mask0nocc.png") valid_mask = Image.open(mask_path) - valid_mask = np.array(valid_mask).astype(np.bool_) + valid_mask = np.asarray(valid_mask).astype(np.bool_) return disparity_map, valid_mask - def __getitem__(self, index: int) -> Tuple: - return super().__getitem__(index) - -class StereoKitti2012(StereoMatchingDataset): +class Kitti2012Stereo(StereoMatchingDataset): """ "Kitti dataset from the `2012 `_ stereo evaluation benchmark. Uses the RGB images for consistency with Kitti 2015. @@ -627,17 +605,14 @@ def _read_disparity(self, file_path: str) -> Tuple: if not os.path.exists(file_path): return None, None - disparity_map = np.array(Image.open(file_path)) / 256.0 + disparity_map = np.asarray(Image.open(file_path)) / 256.0 valid_mask = disparity_map > 0.0 # unsqueeze the disparity map into (C, H, W) format disparity_map = disparity_map[None, :, :] return disparity_map, valid_mask - def __getitem__(self, index: int) -> Tuple: - return super().__getitem__(index) - -class StereoKitti2015(StereoMatchingDataset): +class Kitti2015Stereo(StereoMatchingDataset): """ "Kitti dataset from the `2015 `_ stereo evaluation benchmark. The dataset is expected to have the following structure: :: @@ -699,17 +674,14 @@ def _read_disparity(self, file_path: str) -> Tuple: if not os.path.exists(file_path): return None, None - disparity_map = np.array(Image.open(file_path)) / 256.0 + disparity_map = np.asarray(Image.open(file_path)) / 256.0 valid_mask = disparity_map < 0.0 # unsqueeze the disparity map into (C, H, W) format disparity_map = disparity_map[None, :, :] return disparity_map, valid_mask - def __getitem__(self, index: int) -> Tuple: - return super().__getitem__(index) - -class StereoSintel(StereoMatchingDataset): +class SintelStereo(StereoMatchingDataset): """ "Sintel `Stereo Dataset `_. 
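Sintel ships its ground-truth disparity as ordinary RGB PNGs; ``_read_disparity`` further down decodes them channel-wise, following the instructions in the README distributed with the dataset, as

    disparity = R * 4 + G / 2**6 + B / 2**14

so a single 8-bit image covers a disparity range of roughly 0 to 1024 pixels at sub-pixel precision.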
The dataset is expected to have the following structure: :: @@ -793,7 +765,7 @@ def _read_disparity(self, file_path: str) -> Tuple: return None, None # disparity decoding as per Sintel instructions in the README provided with the dataset - disparity_map = np.array(Image.open(file_path), dtype=np.float32) + disparity_map = np.asarray(Image.open(file_path), dtype=np.float32) r, g, b = np.split(disparity_map, 3, axis=-1) disparity_map = r * 4 + g / (2 ** 6) + b / (2 ** 14) # reshape into (C, H, W) format @@ -801,18 +773,15 @@ def _read_disparity(self, file_path: str) -> Tuple: # find the appropiate file paths occlued_mask_path, out_of_frame_mask_path = self._get_oclussion_mask_paths(file_path) # occlusion masks - valid_mask = np.array(Image.open(occlued_mask_path)) == 0 + valid_mask = np.asarray(Image.open(occlued_mask_path)) == 0 # out of frame masks - off_mask = np.array(Image.open(out_of_frame_mask_path)) == 0 + off_mask = np.asarray(Image.open(out_of_frame_mask_path)) == 0 # combine the masks together valid_mask = np.logical_and(off_mask, valid_mask) return disparity_map, valid_mask - def __getitem__(self, index: int) -> Tuple: - return super().__getitem__(index) - -class StereoSceneFlow(StereoMatchingDataset): +class SceneFlowStereo(StereoMatchingDataset): """Dataset interface for `Scene Flow `_ datasets. The dataset is expected to have the following structre: :: @@ -895,11 +864,8 @@ def _read_disparity(self, file_path: str) -> Tuple: valid = np.ones(disparity.shape[1:]).astype(np.bool_) return disparity, valid - def __getitem__(self, index: int) -> Tuple: - return super().__getitem__(index) - -class StereoFallingThings(StereoMatchingDataset): +class FallingThingsStereo(StereoMatchingDataset): """FallingThings ``_ dataset The dataset is expected to have the following structre: :: @@ -968,7 +934,7 @@ def __init__(self, root: str, split: str = "single", transforms: Optional[Callab def _read_disparity(self, file_path: str) -> Tuple: # (H, W) image - depth = np.array(Image.open(file_path)) + depth = np.asarray(Image.open(file_path)) # as per https://research.nvidia.com/sites/default/files/pubs/2018-06_Falling-Things/readme_0.txt # in order to extract disparity from depth maps with open(os.path.split(file_path)[0] + "/_camera_settings.json", "r") as f: @@ -1034,7 +1000,7 @@ def __init__(self, root: str, split: str = "train", transforms: Optional[Callabl self._disparities = self._scan_pairs(left_disparity_pattern, right_disparity_pattern) def _read_disparity(self, file_path: str) -> Tuple: - disparity = np.array(Image.open(file_path), dtype=np.float32) + disparity = np.asarray(Image.open(file_path), dtype=np.float32) valid = np.ones_like(disparity).astype(np.bool_) # unsqueeze disparity to (C, H, W) disparity = disparity[None, :, :] From 78f4a52a69605a6385d160fdef7814c63e4ccf3a Mon Sep 17 00:00:00 2001 From: Ponku Date: Fri, 15 Jul 2022 13:55:53 +0100 Subject: [PATCH 30/35] Forced disparity to be positive --- torchvision/datasets/_stereo_matching.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/torchvision/datasets/_stereo_matching.py b/torchvision/datasets/_stereo_matching.py index 474b82adcc0..66d8834c74c 100644 --- a/torchvision/datasets/_stereo_matching.py +++ b/torchvision/datasets/_stereo_matching.py @@ -433,6 +433,7 @@ def _read_disparity(self, file_path: str) -> Tuple: if not os.path.exists(file_path): # case when dealing with the test split return None, None disparity_map = _read_pfm_file(file_path) + disparity_map = np.abs(disparity_map) # ensure that 
the disparity is positive valid_mask = disparity_map < 1e3 # remove the channel dimension from the valid mask valid_mask = valid_mask[0, :, :] @@ -554,6 +555,7 @@ def _read_disparity(self, file_path: str) -> Tuple: return None, None disparity_map = _read_pfm_file(file_path) + disparity_map = np.abs(disparity_map) # ensure that the disparity is positive mask_path = os.path.join(os.path.split(file_path)[0], "mask0nocc.png") valid_mask = Image.open(mask_path) valid_mask = np.asarray(valid_mask).astype(np.bool_) @@ -675,7 +677,7 @@ def _read_disparity(self, file_path: str) -> Tuple: return None, None disparity_map = np.asarray(Image.open(file_path)) / 256.0 - valid_mask = disparity_map < 0.0 + valid_mask = disparity_map > 0.0 # unsqueeze the disparity map into (C, H, W) format disparity_map = disparity_map[None, :, :] return disparity_map, valid_mask @@ -859,10 +861,11 @@ def __init__( self._disparities += self._scan_pairs(left_disparity_pattern, right_disparity_pattern) def _read_disparity(self, file_path: str) -> Tuple: - disparity = _read_pfm_file(file_path) + disparity_map = _read_pfm_file(file_path) + disparity_map = np.abs(disparity_map) # ensure that the disparity is positive # keep valid mask with shape (H, W) - valid = np.ones(disparity.shape[1:]).astype(np.bool_) - return disparity, valid + valid = np.ones(disparity_map.shape[1:]).astype(np.bool_) + return disparity_map, valid class FallingThingsStereo(StereoMatchingDataset): From e2ad8d21b4c69237c4b2230d3ad920d5484227cb Mon Sep 17 00:00:00 2001 From: Ponku Date: Mon, 18 Jul 2022 15:53:56 +0100 Subject: [PATCH 31/35] Removed implicit mask creation. Added private built_in_mask flag similar to _optical_flow.py --- test/datasets_utils.py | 18 +++- test/test_datasets.py | 122 +++++++++++++---------- torchvision/datasets/_stereo_matching.py | 93 ++++++++++------- 3 files changed, 145 insertions(+), 88 deletions(-) diff --git a/test/datasets_utils.py b/test/datasets_utils.py index b0c31c71116..ea85a853824 100644 --- a/test/datasets_utils.py +++ b/test/datasets_utils.py @@ -934,7 +934,7 @@ def create_random_string(length: int, *digits: str) -> str: return "".join(random.choice(digits) for _ in range(length)) -def shape_test_for_stereo_disp( +def shape_test_for_stereo_gt_w_mask( left: PIL.Image.Image, right: PIL.Image.Image, disparity: np.ndarray, valid_mask: np.ndarray ): left_dims = get_dimensions(left) @@ -945,7 +945,6 @@ def shape_test_for_stereo_disp( # check general shapes assert c == 3 assert disparity.ndim == 3 - assert valid_mask.ndim == 2 assert disparity.shape == (1, h, w) # check that valid mask is the same size as the disparity _, dh, dw = disparity.shape @@ -954,7 +953,19 @@ def shape_test_for_stereo_disp( assert dw == mw -def shape_test_for_stereo_none(left: PIL.Image.Image, right: PIL.Image.Image, disparity: None, valid_mask: None): +def shape_test_for_stereo_gt_no_mask(left: PIL.Image.Image, right: PIL.Image.Image, disparity: np.ndarray): + left_dims = get_dimensions(left) + right_dims = get_dimensions(right) + c, h, w = left_dims + # check that left and right are the same size + assert left_dims == right_dims + # check general shapes + assert c == 3 + assert disparity.ndim == 3 + assert disparity.shape == (1, h, w) + + +def shape_test_for_stereo_no_gt(left: PIL.Image.Image, right: PIL.Image.Image, disparity: None): left_dims = get_dimensions(left) right_dims = get_dimensions(right) c, _, _ = left_dims @@ -963,7 +974,6 @@ def shape_test_for_stereo_none(left: PIL.Image.Image, right: PIL.Image.Image, di # check general shapes 
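    # (editor's note) splits without ground truth are expected to yield disparity=None,
    # so beyond the RGB image geometry there is nothing further to validate here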
assert c == 3 assert disparity is None - assert valid_mask is None def make_fake_pfm_file(h, w, file_name): diff --git a/test/test_datasets.py b/test/test_datasets.py index 77f3ee4e019..a75e597c049 100644 --- a/test/test_datasets.py +++ b/test/test_datasets.py @@ -2724,18 +2724,19 @@ def inject_fake_data(self, tmpdir, config): return num_examples - def test_training_test_splits(self): + def test_training_splits(self): with self.create_dataset(split="train") as (dataset, _): assert dataset._images and len(dataset._images) == len( dataset._disparities ), "Training images do not match with training disparities" for left, right, disparity, valid_mask in dataset: - datasets_utils.shape_test_for_stereo_disp(left, right, disparity, valid_mask) + datasets_utils.shape_test_for_stereo_gt_w_mask(left, right, disparity, valid_mask) + def test_testing_splits(self): with self.create_dataset(split="test") as (dataset, _): assert all(d == ("", "") for d in dataset._disparities) - for left, right, disparity, valid_mask in dataset: - datasets_utils.shape_test_for_stereo_none(left, right, disparity, valid_mask) + for left, right, disparity, _ in dataset: + datasets_utils.shape_test_for_stereo_no_gt(left, right, disparity) def test_bad_input(self): with pytest.raises(ValueError, match="Unknown value 'bad' for argument split"): @@ -2745,7 +2746,7 @@ def test_bad_input(self): class CREStereoTestCase(datasets_utils.ImageDatasetTestCase): DATASET_CLASS = datasets.CREStereo - FEATURE_TYPES = (PIL.Image.Image, PIL.Image.Image, (np.ndarray, type(None)), (np.ndarray, type(None))) + FEATURE_TYPES = (PIL.Image.Image, PIL.Image.Image, np.ndarray, type(None)) def inject_fake_data(self, tmpdir, config): crestereo_dir = pathlib.Path(tmpdir) / "CREStereo" @@ -2759,8 +2760,7 @@ def inject_fake_data(self, tmpdir, config): num_examples = examples[category_name] for idx in range(num_examples): - p = datasets_utils.create_image_file(root=split_dir, name=f"{idx}_left.jpg", size=(100, 100)) - print(p) + datasets_utils.create_image_file(root=split_dir, name=f"{idx}_left.jpg", size=(100, 100)) datasets_utils.create_image_file(root=split_dir, name=f"{idx}_right.jpg", size=(100, 100)) # these are going to end up being gray scale images datasets_utils.create_image_file(root=split_dir, name=f"{idx}_left.disp.png", size=(1, 100, 100)) @@ -2770,8 +2770,8 @@ def inject_fake_data(self, tmpdir, config): def test_splits(self): with self.create_dataset() as (dataset, _): - for left, right, disparity, valid_mask in dataset: - datasets_utils.shape_test_for_stereo_disp(left, right, disparity, valid_mask) + for left, right, disparity, mask in dataset: + datasets_utils.shape_test_for_stereo_gt_no_mask(left, right, disparity) class Middlebury2014StereoTestCase(datasets_utils.ImageDatasetTestCase): @@ -2781,7 +2781,7 @@ class Middlebury2014StereoTestCase(datasets_utils.ImageDatasetTestCase): calibration=("perfect", "imperfect", "both"), use_ambient_views=(True, False), ) - FEATURE_TYPES = (PIL.Image.Image, PIL.Image.Image, (np.ndarray, type(None)), (np.ndarray, type(None))) + FEATURE_TYPES = (PIL.Image.Image, PIL.Image.Image, (np.ndarray, type(None))) @staticmethod def _make_scene_folder(root_dir: str, scene_name: str, split: str) -> List[str]: @@ -2826,18 +2826,18 @@ def inject_fake_data(self, tmpdir, config): def test_train_splits(self): for split, calibration in itertools.product(["train", "additional"], ["perfect", "imperfect", "both"]): with self.create_dataset(split=split, calibration=calibration) as (dataset, _): - for left, right, disparity, 
valid_mask in dataset: - datasets_utils.shape_test_for_stereo_disp(left, right, disparity, valid_mask) + for left, right, disparity in dataset: + datasets_utils.shape_test_for_stereo_gt_no_mask(left, right, disparity) def test_test_split(self): for split in ["test"]: with self.create_dataset(split=split, calibration=None) as (dataset, _): - for left, right, disparity, valid_mask in dataset: - datasets_utils.shape_test_for_stereo_none(left, right, disparity, valid_mask) + for left, right, disparity in dataset: + datasets_utils.shape_test_for_stereo_no_gt(left, right, disparity) def test_augmented_view_usage(self): with self.create_dataset(split="train", use_ambient_views=True) as (dataset, _): - for left, right, _, _ in dataset: + for left, right, _ in dataset: left_array = np.array(left) right_array = np.array(right) # check that left and right are the same size @@ -2915,14 +2915,16 @@ def inject_fake_data(self, tmpdir, config): def test_train_splits(self): for split in ["train"]: with self.create_dataset(split=split) as (dataset, _): - for left, right, disparity, valid_mask in dataset: - datasets_utils.shape_test_for_stereo_disp(left, right, disparity, valid_mask) + for left, right, disparity, mask in dataset: + assert mask is None + datasets_utils.shape_test_for_stereo_gt_no_mask(left, right, disparity) def test_test_split(self): for split in ["test"]: with self.create_dataset(split=split) as (dataset, _): - for left, right, disparity, valid_mask in dataset: - datasets_utils.shape_test_for_stereo_none(left, right, disparity, valid_mask) + for left, right, disparity, mask in dataset: + assert mask is None + datasets_utils.shape_test_for_stereo_no_gt(left, right, disparity) def test_bad_input(self): with pytest.raises(ValueError, match="Unknown value 'bad' for argument split"): @@ -2983,14 +2985,16 @@ def inject_fake_data(self, tmpdir, config): def test_train_splits(self): for split in ["train"]: with self.create_dataset(split=split) as (dataset, _): - for left, right, disparity, valid_mask in dataset: - datasets_utils.shape_test_for_stereo_disp(left, right, disparity, valid_mask) + for left, right, disparity, mask in dataset: + assert mask is None + datasets_utils.shape_test_for_stereo_gt_no_mask(left, right, disparity) def test_test_split(self): for split in ["test"]: with self.create_dataset(split=split) as (dataset, _): - for left, right, disparity, valid_mask in dataset: - datasets_utils.shape_test_for_stereo_none(left, right, disparity, valid_mask) + for left, right, disparity, mask in dataset: + assert mask is None + datasets_utils.shape_test_for_stereo_no_gt(left, right, disparity) def test_bad_input(self): with pytest.raises(ValueError, match="Unknown value 'bad' for argument split"): @@ -3003,7 +3007,7 @@ class SceneFlowStereoTestCase(datasets_utils.ImageDatasetTestCase): ADDITIONAL_CONFIGS = datasets_utils.combinations_grid( split=("FlyingThings3D", "Driving", "Monkaa"), pass_name=("clean", "final") ) - FEATURE_TYPES = (PIL.Image.Image, PIL.Image.Image, (np.ndarray, type(None)), (np.ndarray, type(None))) + FEATURE_TYPES = (PIL.Image.Image, PIL.Image.Image, (np.ndarray, type(None))) @staticmethod def _create_pfm_folder( @@ -3066,8 +3070,8 @@ def inject_fake_data(self, tmpdir, config): def test_splits(self): for split_name, pass_name in itertools.product(["FlyingThings3D", "Driving", "Monkaa"], ["clean", "final"]): with self.create_dataset(split=split_name, pass_name=pass_name) as (dataset, _): - for left, right, disparity, valid_mask in dataset: - 
datasets_utils.shape_test_for_stereo_disp(left, right, disparity, valid_mask) + for left, right, disparity in dataset: + datasets_utils.shape_test_for_stereo_gt_no_mask(left, right, disparity) def test_bad_input(self): with pytest.raises(ValueError, match="Unknown value 'bad' for argument split"): @@ -3078,7 +3082,7 @@ def test_bad_input(self): class FallingThingsStereoTestCase(datasets_utils.ImageDatasetTestCase): DATASET_CLASS = datasets.FallingThingsStereo ADDITIONAL_CONFIGS = datasets_utils.combinations_grid(split=("single", "mixed")) - FEATURE_TYPES = (PIL.Image.Image, PIL.Image.Image, (np.ndarray, type(None)), (np.ndarray, type(None))) + FEATURE_TYPES = (PIL.Image.Image, PIL.Image.Image, (np.ndarray, type(None))) @staticmethod def _make_dummy_depth_map(root: str, name: str, size: Tuple[int, int]): @@ -3129,8 +3133,8 @@ def inject_fake_data(self, tmpdir, config): def test_splits(self): for split_name in ["single", "mixed"]: with self.create_dataset(split=split_name) as (dataset, _): - for left, right, disparity, valid_mask in dataset: - datasets_utils.shape_test_for_stereo_disp(left, right, disparity, valid_mask) + for left, right, disparity in dataset: + datasets_utils.shape_test_for_stereo_gt_no_mask(left, right, disparity) def test_bad_input(self): with pytest.raises(ValueError, match="Unknown value 'bad' for argument split"): @@ -3140,6 +3144,7 @@ def test_bad_input(self): class SintelStereoTestCase(datasets_utils.ImageDatasetTestCase): DATASET_CLASS = datasets.SintelStereo + ADDITIONAL_CONFIGS = datasets_utils.combinations_grid(pass_name=("final", "clean", "both")) FEATURE_TYPES = (PIL.Image.Image, PIL.Image.Image, (np.ndarray, type(None)), (np.ndarray, type(None))) def inject_fake_data(self, tmpdir, config): @@ -3150,25 +3155,31 @@ def inject_fake_data(self, tmpdir, config): os.makedirs(split_dir, exist_ok=True) # a single setting, since there are no splits - num_examples = 4 - - for view in ["final_left", "final_right"]: - root = split_dir / view - os.makedirs(root, exist_ok=True) + num_examples = {"final": 2, "clean": 2} + pass_names = { + "final": ["final"], + "clean": ["clean"], + "both": ["final", "clean"], + }.get(config["pass_name"], []) + + for p in pass_names: + for view in [f"{p}_left", f"{p}_right"]: + root = split_dir / view + os.makedirs(root, exist_ok=True) - datasets_utils.create_image_folder( - root=root, - name="scene1", - file_name_fn=lambda i: f"{i:06d}.png", - num_examples=num_examples, - size=(3, 100, 200), - ) + datasets_utils.create_image_folder( + root=root, + name="scene1", + file_name_fn=lambda i: f"{i:06d}.png", + num_examples=num_examples[p], + size=(3, 100, 200), + ) datasets_utils.create_image_folder( root=split_dir / "occlusions", name="scene1", file_name_fn=lambda i: f"{i:06d}.png", - num_examples=num_examples, + num_examples=2, size=(1, 100, 200), ) @@ -3176,7 +3187,7 @@ def inject_fake_data(self, tmpdir, config): root=split_dir / "outofframe", name="scene1", file_name_fn=lambda i: f"{i:06d}.png", - num_examples=num_examples, + num_examples=2, size=(1, 100, 200), ) @@ -3184,21 +3195,32 @@ def inject_fake_data(self, tmpdir, config): root=split_dir / "disparities", name="scene1", file_name_fn=lambda i: f"{i:06d}.png", - num_examples=num_examples, + num_examples=2, size=(3, 100, 200), ) + if config["pass_name"] == "both": + num_examples = sum(num_examples.values()) + else: + num_examples = num_examples.get(config["pass_name"], 0) + return num_examples def test_splits(self): - with self.create_dataset() as (dataset, _): - for left, right, disparity, 
valid_mask in dataset:
-                datasets_utils.shape_test_for_stereo_disp(left, right, disparity, valid_mask)
+        for pass_name in ["final", "clean", "both"]:
+            with self.create_dataset(pass_name=pass_name) as (dataset, _):
+                for left, right, disparity, valid_mask in dataset:
+                    datasets_utils.shape_test_for_stereo_gt_w_mask(left, right, disparity, valid_mask)
+
+    def test_bad_input(self):
+        with pytest.raises(ValueError, match="Unknown value 'bad' for argument pass_name"):
+            with self.create_dataset(pass_name="bad"):
+                pass
 
 
 class InStereo2k(datasets_utils.ImageDatasetTestCase):
     DATASET_CLASS = datasets.InStereo2k
-    FEATURE_TYPES = (PIL.Image.Image, PIL.Image.Image, (np.ndarray, type(None)), (np.ndarray, type(None)))
+    FEATURE_TYPES = (PIL.Image.Image, PIL.Image.Image, (np.ndarray, type(None)))
     ADDITIONAL_CONFIGS = datasets_utils.combinations_grid(split=("train", "test"))
 
     @staticmethod
@@ -3228,8 +3250,8 @@ def inject_fake_data(self, tmpdir, config):
     def test_splits(self):
         for split_name in ["train", "test"]:
             with self.create_dataset(split=split_name) as (dataset, _):
-                for left, right, disparity, valid_mask in dataset:
-                    datasets_utils.shape_test_for_stereo_disp(left, right, disparity, valid_mask)
+                for left, right, disparity in dataset:
+                    datasets_utils.shape_test_for_stereo_gt_no_mask(left, right, disparity)
 
     def test_bad_input(self):
         with pytest.raises(ValueError, match="Unknown value 'bad' for argument split"):

diff --git a/torchvision/datasets/_stereo_matching.py b/torchvision/datasets/_stereo_matching.py
index 66d8834c74c..14ce20a2d96 100644
--- a/torchvision/datasets/_stereo_matching.py
+++ b/torchvision/datasets/_stereo_matching.py
@@ -32,6 +32,8 @@
 class StereoMatchingDataset(ABC, VisionDataset):
     """Base interface for Stereo matching datasets"""
 
+    _has_built_in_disparity_mask = False
+
     def __init__(self, root: str, transforms: Optional[Callable] = None):
         """
@@ -49,11 +51,15 @@ def __init__(self, root: str, transforms: Optional[Callable] = None):
         For training splits generally the datasets provide a minimal guarantee of
         images: (``PIL.Image``, ``PIL.Image``)
         disparities: (``np.ndarray``, ``None``) with shape (1, H, W)
-        valid_masks: (``np.ndarray``, ``None``) with shape (H, W)
+
+        Optionally, based on the dataset, it can return a ``mask`` as well:
+        valid_masks: (``np.ndarray | None``, ``None``) with shape (H, W)
 
         For some test splits, the datasets provide outputs that look like:
         images: (``PIL.Image``, ``PIL.Image``)
         disparities: (``None``, ``None``)
+
+        Optionally, based on the dataset, it can return a ``mask`` as well:
         valid_masks: (``None``, ``None``)
         """
         super().__init__(root=root)
@@ -106,10 +112,10 @@ def __getitem__(self, index: int) -> Tuple:
             index(int): The index of the example to retrieve
 
         Returns:
-            tuple: A 4-tuple with ``(img_left, img_right, disparity, valid_mask)`` where ``valid_mask``
-                is a numpy boolean mask of shape (H, W)
-                indicating which disparity values are valid. The disparity is a numpy array of
-                shape (1, H, W) and the images are PIL images. ``disparity`` and ``valid_mask`` are None for
+            tuple: A 3- or 4-tuple with ``(img_left, img_right, disparity, Optional[valid_mask])`` where ``valid_mask``
+                can be a numpy boolean mask of shape (H, W) if the dataset provides a file
+                indicating which disparity pixels are valid. The disparity is a numpy array of
+                shape (1, H, W) and the images are PIL images. ``disparity`` is None for
+                datasets for which the authors did not provide ``split="test"`` annotations.
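            Example (editor's sketch, not part of the patch; ``Kitti2012Stereo`` is one
            subclass that sets ``_has_built_in_disparity_mask`` and therefore always
            returns the 4-tuple, though the mask may still be ``None``)::

                dataset = Kitti2012Stereo(root="datasets", split="train")
                left, right, disparity, valid_mask = dataset[0]
                # subclasses without the flag return a 3-tuple instead, unless the
                # transforms produce a valid_mask:
                # left, right, disparity = dataset[0]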
""" img_left = self._read_img(self._images[index][0]) @@ -129,7 +135,10 @@ def __getitem__(self, index: int) -> Tuple: valid_masks, ) = self.transforms(imgs, dsp_maps, valid_masks) - return imgs[0], imgs[1], dsp_maps[0], valid_masks[0] + if self._has_built_in_disparity_mask or valid_masks[0] is not None: + return imgs[0], imgs[1], dsp_maps[0], valid_masks[0] + else: + return imgs[0], imgs[1], dsp_maps[0] def __len__(self) -> int: return len(self._images) @@ -192,6 +201,7 @@ def __init__( max_disparity: float = 256.0, ): super().__init__(root, transforms) + self._has_built_in_disparity_mask = True root = Path(root) / "CREStereo" self.max_disparity = max_disparity @@ -213,7 +223,6 @@ def __init__( for s in dirs: left_image_pattern = str(root / s / "*_left.jpg") right_image_pattern = str(root / s / "*_right.jpg") - print(left_image_pattern, right_image_pattern) imgs = self._scan_pairs(left_image_pattern, right_image_pattern) self._images += imgs @@ -223,11 +232,11 @@ def __init__( self._disparities += disparities def _read_disparity(self, file_path: str) -> Tuple: - disparity = np.asarray(Image.open(file_path), dtype=np.float32) - valid = (disparity < self.max_disparity) & (disparity > 0.0) + disparity_map = np.asarray(Image.open(file_path), dtype=np.float32) # unsqueeze the disparity map into (C, H, W) format - disparity = disparity[None, :, :] - return disparity, valid + disparity_map = disparity_map[None, :, :] + valid_mask = None + return disparity_map, valid_mask def _download_dataset(self, root: str) -> None: dirs = ["tree", "shapenet", "reflective", "hole"] @@ -430,13 +439,13 @@ def _read_img(self, file_path: str) -> Image.Image: return super()._read_img(file_path) def _read_disparity(self, file_path: str) -> Tuple: - if not os.path.exists(file_path): # case when dealing with the test split + # test split has not disparity maps + if not os.path.exists(file_path): return None, None + disparity_map = _read_pfm_file(file_path) disparity_map = np.abs(disparity_map) # ensure that the disparity is positive - valid_mask = disparity_map < 1e3 - # remove the channel dimension from the valid mask - valid_mask = valid_mask[0, :, :] + valid_mask = None return disparity_map, valid_mask def _download_dataset(self, root: str): @@ -532,6 +541,9 @@ class ETH3DStereo(StereoMatchingDataset): def __init__(self, root: str, split: str = "train", transforms: Optional[Callable] = None): super().__init__(root, transforms) + # needed for output consistency, otherwise tests get fussy about + # variable sized FEATURE_TYPES based on dataset split + self._has_built_in_disparity_mask = True verify_str_arg(split, "split", valid_values=("train", "test")) @@ -551,6 +563,7 @@ def __init__(self, root: str, split: str = "train", transforms: Optional[Callabl self._disparities += self._scan_pairs(disparity_pattern, "", fill_empty=True) def _read_disparity(self, file_path: str) -> Tuple: + # test split has no disparity maps if not os.path.exists(file_path): return None, None @@ -588,6 +601,7 @@ class Kitti2012Stereo(StereoMatchingDataset): def __init__(self, root: str, split: str = "train", transforms: Optional[Callable] = None): super().__init__(root, transforms) + self._has_built_in_disparity_mask = True verify_str_arg(split, "split", valid_values=("train", "test")) @@ -604,13 +618,14 @@ def __init__(self, root: str, split: str = "train", transforms: Optional[Callabl self._disparities = list(("", "") for _ in self._images) def _read_disparity(self, file_path: str) -> Tuple: + # test split has no disparity maps if not 
os.path.exists(file_path): return None, None disparity_map = np.asarray(Image.open(file_path)) / 256.0 - valid_mask = disparity_map > 0.0 # unsqueeze the disparity map into (C, H, W) format disparity_map = disparity_map[None, :, :] + valid_mask = None return disparity_map, valid_mask @@ -657,6 +672,7 @@ class Kitti2015Stereo(StereoMatchingDataset): def __init__(self, root: str, split: str = "train", transforms: Optional[Callable] = None): super().__init__(root, transforms) + self._has_built_in_disparity_mask = True verify_str_arg(split, "split", valid_values=("train", "test")) @@ -673,13 +689,14 @@ def __init__(self, root: str, split: str = "train", transforms: Optional[Callabl self._disparities = list(("", "") for _ in self._images) def _read_disparity(self, file_path: str) -> Tuple: + # test split has no disparity maps if not os.path.exists(file_path): return None, None disparity_map = np.asarray(Image.open(file_path)) / 256.0 - valid_mask = disparity_map > 0.0 # unsqueeze the disparity map into (C, H, W) format disparity_map = disparity_map[None, :, :] + valid_mask = None return disparity_map, valid_mask @@ -724,20 +741,29 @@ class SintelStereo(StereoMatchingDataset): Args: root (string): Root directory where Sintel Stereo is located. + pass_name (string): The name of the pass to use, either "final" or "clean". transforms (callable, optional): A function/transform that takes in a sample and returns a transformed version. """ - def __init__(self, root: str, transforms: Optional[Callable] = None): + def __init__(self, root: str, pass_name: str = "final", transforms: Optional[Callable] = None): super().__init__(root, transforms) + verify_str_arg(pass_name, "pass_name", valid_values=("final", "clean", "both")) + root = Path(root) / "Sintel" + pass_names = { + "final": ["final"], + "clean": ["clean"], + "both": ["final", "clean"], + }[pass_name] - left_img_pattern = str(root / "training" / "final_left" / "*" / "*.png") - right_img_pattern = str(root / "training" / "final_right" / "*" / "*.png") - self._images += self._scan_pairs(left_img_pattern, right_img_pattern) + for p in pass_names: + left_img_pattern = str(root / "training" / f"{p}_left" / "*" / "*.png") + right_img_pattern = str(root / "training" / f"{p}_right" / "*" / "*.png") + self._images += self._scan_pairs(left_img_pattern, right_img_pattern) - disparity_pattern = str(root / "training" / "disparities" / "*" / "*.png") - self._disparities += self._scan_pairs(disparity_pattern, "", fill_empty=True) + disparity_pattern = str(root / "training" / "disparities" / "*" / "*.png") + self._disparities += self._scan_pairs(disparity_pattern, "", fill_empty=True) def _get_oclussion_mask_paths(self, file_path: str) -> Tuple[str, str]: path_tokens = file_path.split(os.sep) @@ -863,9 +889,8 @@ def __init__( def _read_disparity(self, file_path: str) -> Tuple: disparity_map = _read_pfm_file(file_path) disparity_map = np.abs(disparity_map) # ensure that the disparity is positive - # keep valid mask with shape (H, W) - valid = np.ones(disparity_map.shape[1:]).astype(np.bool_) - return disparity_map, valid + valid_mask = None + return disparity_map, valid_mask class FallingThingsStereo(StereoMatchingDataset): @@ -945,11 +970,11 @@ def _read_disparity(self, file_path: str) -> Tuple: intrinsics = json.load(f) focal = intrinsics["camera_settings"][0]["intrinsic_settings"]["fx"] baseline, pixel_constant = 6.0, 100.0 # pixel constant is inverted - disparity = (baseline * focal * pixel_constant) / depth.astype(np.float32) - valid = disparity > 0 + 
disparity_map = (baseline * focal * pixel_constant) / depth.astype(np.float32)
             # unsqueeze disparity to (C, H, W)
             disparity_map = disparity_map[None, :, :]
             valid_mask = None
             return disparity_map, valid_mask
 
     def __getitem__(self, index: int) -> Tuple:
         return super().__getitem__(index)
@@ -1003,8 +1028,8 @@ def __init__(self, root: str, split: str = "train", transforms: Optional[Callabl
         self._disparities = self._scan_pairs(left_disparity_pattern, right_disparity_pattern)
 
     def _read_disparity(self, file_path: str) -> Tuple:
-        disparity = np.asarray(Image.open(file_path), dtype=np.float32)
-        valid = np.ones_like(disparity).astype(np.bool_)
+        disparity_map = np.asarray(Image.open(file_path), dtype=np.float32)
         # unsqueeze disparity to (C, H, W)
-        disparity = disparity[None, :, :]
-        return disparity, valid
+        disparity_map = disparity_map[None, :, :]
+        valid_mask = None
+        return disparity_map, valid_mask

From 93f4b6c12800910ff77c12343c27905384339719 Mon Sep 17 00:00:00 2001
From: Ponku
Date: Mon, 18 Jul 2022 16:20:48 +0100
Subject: [PATCH 32/35] Added __getitem__ docs to document the supported
 multi-shape returns

---
 torchvision/datasets/_stereo_matching.py | 129 +++++++++++++++++++++++
 1 file changed, 129 insertions(+)

diff --git a/torchvision/datasets/_stereo_matching.py b/torchvision/datasets/_stereo_matching.py
index 14ce20a2d96..ff7d0183773 100644
--- a/torchvision/datasets/_stereo_matching.py
+++ b/torchvision/datasets/_stereo_matching.py
@@ -250,6 +250,20 @@ def _download_dataset(self, root: str) -> None:
                 url = f"https://data.megengine.org.cn/research/crestereo/dataset/{d}/{i}.tar"
                 download_and_extract_archive(url=url, download_root=d_path, remove_finished=True)
 
+    def __getitem__(self, index: int) -> Tuple:
+        """Return example at given index.
+
+        Args:
+            index(int): The index of the example to retrieve
+
+        Returns:
+            tuple: A 4-tuple with ``(img_left, img_right, disparity, valid_mask)``.
+                The disparity is a numpy array of shape (1, H, W) and the images are PIL images.
+ ``valid_mask`` is implicitly ``None`` if the ``transforms`` parameter does not + generate a valid mask. + + Both ``disparity`` and ``valid_mask`` are ``None`` if the dataset split is test. + """ + return super().__getitem__(index) + class Kitti2012Stereo(StereoMatchingDataset): """ "Kitti dataset from the `2012 `_ stereo evaluation benchmark. @@ -628,6 +672,22 @@ def _read_disparity(self, file_path: str) -> Tuple: valid_mask = None return disparity_map, valid_mask + def __getitem__(self, index: int) -> Tuple: + """Return example at given index. + + Args: + index(int): The index of the example to retrieve + + Returns: + tuple: A 4-tuple with ``(img_left, img_right, disparity, valid_mask)``. + The disparity is a numpy array of shape (1, H, W) and the images are PIL images. + ``valid_mask`` is implicitly ``None`` if the ``transforms`` parameter does not + generate a valid mask. + + Both ``disparity`` and ``valid_mask`` are ``None`` if the dataset split is test. + """ + return super().__getitem__(index) + class Kitti2015Stereo(StereoMatchingDataset): """ "Kitti dataset from the `2015 `_ stereo evaluation benchmark. @@ -699,6 +759,22 @@ def _read_disparity(self, file_path: str) -> Tuple: valid_mask = None return disparity_map, valid_mask + def __getitem__(self, index: int) -> Tuple: + """Return example at given index. + + Args: + index(int): The index of the example to retrieve + + Returns: + tuple: A 4-tuple with ``(img_left, img_right, disparity, valid_mask)``. + The disparity is a numpy array of shape (1, H, W) and the images are PIL images. + ``valid_mask`` is implicitly ``None`` if the ``transforms`` parameter does not + generate a valid mask. + + Both ``disparity`` and ``valid_mask`` are ``None`` if the dataset split is test. + """ + return super().__getitem__(index) + class SintelStereo(StereoMatchingDataset): """ "Sintel `Stereo Dataset `_. @@ -808,6 +884,20 @@ def _read_disparity(self, file_path: str) -> Tuple: valid_mask = np.logical_and(off_mask, valid_mask) return disparity_map, valid_mask + def __getitem__(self, index: int) -> Tuple: + """Return example at given index. + + Args: + index(int): The index of the example to retrieve + + Returns: + tuple: A 3-tuple with ``(img_left, img_right, disparity)``. + The disparity is a numpy array of shape (1, H, W) and the images are PIL images. + If a ``valid_mask`` is generated within the ``transforms`` parameter, + a 4-tuple with ``(img_left, img_right, disparity, valid_mask)`` is returned. + """ + return super().__getitem__(index) + class SceneFlowStereo(StereoMatchingDataset): """Dataset interface for `Scene Flow `_ datasets. @@ -892,6 +982,20 @@ def _read_disparity(self, file_path: str) -> Tuple: valid_mask = None return disparity_map, valid_mask + def __getitem__(self, index: int) -> Tuple: + """Return example at given index. + + Args: + index(int): The index of the example to retrieve + + Returns: + tuple: A 3-tuple with ``(img_left, img_right, disparity)``. + The disparity is a numpy array of shape (1, H, W) and the images are PIL images. + If a ``valid_mask`` is generated within the ``transforms`` parameter, + a 4-tuple with ``(img_left, img_right, disparity, valid_mask)`` is returned. + """ + return super().__getitem__(index) + class FallingThingsStereo(StereoMatchingDataset): """FallingThings ``_ dataset @@ -977,6 +1081,17 @@ def _read_disparity(self, file_path: str) -> Tuple: return disparity_map, valid_mask def __getitem__(self, index: int) -> Tuple: + """Return example at given index. 
+
+        Args:
+            index (int): The index of the example to retrieve
+
+        Returns:
+            tuple: A 3-tuple with ``(img_left, img_right, disparity)``.
+            The disparity is a numpy array of shape (1, H, W) and the images are PIL images.
+            If a ``valid_mask`` is generated within the ``transforms`` parameter,
+            a 4-tuple with ``(img_left, img_right, disparity, valid_mask)`` is returned.
+        """
         return super().__getitem__(index)

@@ -1033,3 +1148,17 @@ def _read_disparity(self, file_path: str) -> Tuple:
         disparity_map = disparity_map[None, :, :]
         valid_mask = None
         return disparity_map, valid_mask
+
+    def __getitem__(self, index: int) -> Tuple:
+        """Return example at given index.
+
+        Args:
+            index (int): The index of the example to retrieve
+
+        Returns:
+            tuple: A 3-tuple with ``(img_left, img_right, disparity)``.
+            The disparity is a numpy array of shape (1, H, W) and the images are PIL images.
+            If a ``valid_mask`` is generated within the ``transforms`` parameter,
+            a 4-tuple with ``(img_left, img_right, disparity, valid_mask)`` is returned.
+        """
+        return super().__getitem__(index)

From c83bc8059ccea8b3abe60567a928791dcb391f6d Mon Sep 17 00:00:00 2001
From: Ponku
Date: Tue, 19 Jul 2022 11:20:28 +0100
Subject: [PATCH 33/35] removed path returns from helper test functions

---
 test/test_datasets.py | 32 +++++++++-----------------------
 1 file changed, 9 insertions(+), 23 deletions(-)

diff --git a/test/test_datasets.py b/test/test_datasets.py
index a75e597c049..08c5be78649 100644
--- a/test/test_datasets.py
+++ b/test/test_datasets.py
@@ -2693,7 +2693,7 @@ def _create_scene_folder(num_examples: int, root_dir: str) -> List[pathlib.Path]
         return image_paths

     @staticmethod
-    def _create_annotation_folder(num_examples: int, root_dir: str) -> List[pathlib.Path]:
+    def _create_annotation_folder(num_examples: int, root_dir: str) -> None:
         paths = []
         # make the root_dir if it does not exist
         os.makedirs(root_dir, exist_ok=True)
@@ -2706,8 +2706,6 @@ def _create_annotation_folder(num_examples: int, root_dir: str) -> List[pathlib.
paths.append(datasets_utils.create_image_file(root=scene_dir, name="mask0nocc.png", size=(1, 100, 100)))
             pfm_path = os.path.join(scene_dir, "disp0GT.pfm")
             datasets_utils.make_fake_pfm_file(h=100, w=100, file_name=pfm_path)
-            paths.append(pfm_path)
-        return paths

     def inject_fake_data(self, tmpdir, config):
         eth3d_dir = os.path.join(tmpdir, "ETH3D")
@@ -2784,9 +2782,9 @@ class Middlebury2014StereoTestCase(datasets_utils.ImageDatasetTestCase):
     FEATURE_TYPES = (PIL.Image.Image, PIL.Image.Image, (np.ndarray, type(None)))

     @staticmethod
-    def _make_scene_folder(root_dir: str, scene_name: str, split: str) -> List[str]:
+    def _make_scene_folder(root_dir: str, scene_name: str, split: str) -> None:
         calibrations = [None] if split == "test" else ["-perfect", "-imperfect"]
-        scene_dirs = []
+
         for c in calibrations:
             scene_dir = os.path.join(root_dir, f"{scene_name}{c}")
             os.makedirs(scene_dir, exist_ok=True)
             # make normal images first
             datasets_utils.create_image_file(root=scene_dir, name="im0.png", size=(3, 100, 100))
             # these are going to end up being grayscale images
             datasets_utils.make_fake_pfm_file(h=100, w=100, file_name=os.path.join(scene_dir, "disp0.pfm"))
             datasets_utils.make_fake_pfm_file(h=100, w=100, file_name=os.path.join(scene_dir, "disp1.pfm"))
-            scene_dirs.append(scene_dir)
-        return scene_dirs

     def inject_fake_data(self, tmpdir, config):
         split_scene_map = {
@@ -3012,15 +3008,12 @@ class SceneFlowStereoTestCase(datasets_utils.ImageDatasetTestCase):
     @staticmethod
     def _create_pfm_folder(
         root: str, name: str, file_name_fn: Callable[..., str], num_examples: int, size: Tuple[int, int]
-    ) -> List[str]:
+    ) -> None:
         root = pathlib.Path(root) / name
         os.makedirs(root, exist_ok=True)

-        paths = []
         for i in range(num_examples):
             datasets_utils.make_fake_pfm_file(size[0], size[1], root / file_name_fn(i))
-            paths.append(str(root / file_name_fn(i)))
-        return paths

     def inject_fake_data(self, tmpdir, config):
         scene_flow_dir = pathlib.Path(tmpdir) / "SceneFlow"
@@ -3091,27 +3084,20 @@ def _make_dummy_depth_map(root: str, name: str, size: Tuple[int, int]):
         PIL.Image.fromarray(image).save(file)

     @staticmethod
-    def _make_scene_folder(root: str, scene_name: str, size: Tuple[int, int]) -> List[str]:
-        paths = []
+    def _make_scene_folder(root: str, scene_name: str, size: Tuple[int, int]) -> None:
         root = pathlib.Path(root) / scene_name
         os.makedirs(root, exist_ok=True)
         # jpg images
-        paths.append(datasets_utils.create_image_file(root, "image1.left.jpg", size=(3, size[1], size[0])))
-        paths.append(datasets_utils.create_image_file(root, "image1.right.jpg", size=(3, size[1], size[0])))
+        datasets_utils.create_image_file(root, "image1.left.jpg", size=(3, size[1], size[0]))
+        datasets_utils.create_image_file(root, "image1.right.jpg", size=(3, size[1], size[0]))
         # single channel depth maps
-        paths.append(
-            FallingThingsStereoTestCase._make_dummy_depth_map(root, "image1.left.depth.png", size=(size[0], size[1]))
-        )
-        paths.append(
-            FallingThingsStereoTestCase._make_dummy_depth_map(root, "image1.right.depth.png", size=(size[0], size[1]))
-        )
+        FallingThingsStereoTestCase._make_dummy_depth_map(root, "image1.left.depth.png", size=(size[0], size[1]))
+        FallingThingsStereoTestCase._make_dummy_depth_map(root, "image1.right.depth.png", size=(size[0], size[1]))
        # camera settings json.
Minimal example for _read_disparity function testing settings_json = {"camera_settings": [{"intrinsic_settings": {"fx": 1}}]} with open(root / "_camera_settings.json", "w") as f: json.dump(settings_json, f) - return paths - def inject_fake_data(self, tmpdir, config): fallingthings_dir = pathlib.Path(tmpdir) / "FallingThings" os.makedirs(fallingthings_dir, exist_ok=True) From 650bf67c47aa54748ff81e69cd5134e9a36a29df Mon Sep 17 00:00:00 2001 From: Ponku Date: Tue, 19 Jul 2022 11:36:30 +0100 Subject: [PATCH 34/35] replaced os.path.join with pathlib in tests --- test/test_datasets.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/test/test_datasets.py b/test/test_datasets.py index 08c5be78649..ff1a418fbac 100644 --- a/test/test_datasets.py +++ b/test/test_datasets.py @@ -2708,12 +2708,12 @@ def _create_annotation_folder(num_examples: int, root_dir: str) -> None: datasets_utils.make_fake_pfm_file(h=100, w=100, file_name=pfm_path) def inject_fake_data(self, tmpdir, config): - eth3d_dir = os.path.join(tmpdir, "ETH3D") + eth3d_dir = pathlib.Path(tmpdir) / "ETH3D" num_examples = 2 if config["split"] == "train" else 3 split_name = "two_view_training" if config["split"] == "train" else "two_view_test" - split_dir = os.path.join(eth3d_dir, split_name) + split_dir = eth3d_dir / split_name self._create_scene_folder(num_examples, split_dir) if config["split"] == "train": @@ -2784,9 +2784,10 @@ class Middlebury2014StereoTestCase(datasets_utils.ImageDatasetTestCase): @staticmethod def _make_scene_folder(root_dir: str, scene_name: str, split: str) -> None: calibrations = [None] if split == "test" else ["-perfect", "-imperfect"] + root_dir = pathlib.Path(root_dir) for c in calibrations: - scene_dir = os.path.join(root_dir, f"{scene_name}{c}") + scene_dir = root_dir / f"{scene_name}{c}" os.makedirs(scene_dir, exist_ok=True) # make normal images first datasets_utils.create_image_file(root=scene_dir, name="im0.png", size=(3, 100, 100)) From 39efae5772b8eba73e678d606a5f5720cbf7a977 Mon Sep 17 00:00:00 2001 From: Ponku Date: Mon, 25 Jul 2022 14:58:28 +0100 Subject: [PATCH 35/35] crestereo draft implementation --- test/test_prototype_models.py | 41 + .../models/depth/stereo/crestereo.py | 1007 +++++++++++++++++ 2 files changed, 1048 insertions(+) create mode 100644 torchvision/prototype/models/depth/stereo/crestereo.py diff --git a/test/test_prototype_models.py b/test/test_prototype_models.py index c76a84f8634..6ff1382010d 100644 --- a/test/test_prototype_models.py +++ b/test/test_prototype_models.py @@ -1,6 +1,7 @@ import pytest import test_models as TM import torch +import torchvision.prototype.models.depth.stereo.crestereo as crestereo import torchvision.prototype.models.depth.stereo.raft_stereo as raft_stereo from common_utils import set_rng_seed, cpu_and_gpu @@ -36,3 +37,43 @@ def test_raft_stereo(model_builder, model_mode, dev): # Test against expected file output TM._assert_expected(depth_pred, name=model_builder.__name__, atol=1e-2, rtol=1e-2) + + +@pytest.mark.parametrize("model_builder", (crestereo.crestereo_base,)) +@pytest.mark.parametrize("model_mode", ("standard", "scripted")) +@pytest.mark.parametrize("dev", cpu_and_gpu()) +def test_crestereo(model_builder, model_mode, dev): + set_rng_seed(0) + + model = model_builder().eval().to(dev) + + if model_mode == "scripted": + model = torch.jit.script(model) + + img1 = torch.rand(1, 3, 256, 256).to(dev) + img2 = torch.rand(1, 3, 256, 256).to(dev) + iterations = 3 + + preds = model(img1, img2, flow_init=None, iterations=iterations) 
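+    # the model returns one disparity estimate per refinement iteration; the last entry is the most refined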
+    disparity_pred = preds[-1]
+
+    # every pyramid level except the highest resolution runs only half the number of iterations
+    expected_iterations = (iterations // 2) * (len(model.resolutions) - 1)
+    expected_iterations += iterations
+    assert (
+        len(preds) == expected_iterations
+    ), "Number of predictions should match the total number of refinement iterations across all pyramid levels"
+
+    assert disparity_pred.shape == torch.Size(
+        [1, 2, 256, 256]
+    ), f"Predicted disparity should have the same spatial shape as the input. Inputs shape {img1.shape[2:]}, Prediction shape {disparity_pred.shape[2:]}"
+
+    assert all(
+        d.shape == torch.Size([1, 2, 256, 256]) for d in preds
+    ), "All predicted disparities are expected to have the same shape"
+
+    # test a backward pass with a dummy loss as well
+    preds = torch.stack(preds, dim=0)
+    targets = torch.ones_like(preds, requires_grad=False)
+    loss = torch.nn.functional.mse_loss(preds, targets)
+    loss.backward()
diff --git a/torchvision/prototype/models/depth/stereo/crestereo.py b/torchvision/prototype/models/depth/stereo/crestereo.py
new file mode 100644
index 00000000000..92a75d20ce3
--- /dev/null
+++ b/torchvision/prototype/models/depth/stereo/crestereo.py
@@ -0,0 +1,1007 @@
+import math
+from functools import partial
+from typing import Iterable, List, Optional, Callable, Tuple, Dict, Union
+
+import numpy as np
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import torchvision.models.optical_flow.raft as raft
+from torch import Tensor
+from torchvision.models._api import WeightsEnum
+from torchvision.models.optical_flow._utils import make_coords_grid, grid_sample, upsample_flow
+from torchvision.ops import Conv2dNormActivation
+
+
+class ResidualBlock(raft.ResidualBlock):
+    def __init__(self, in_channels, out_channels, *, norm_layer, stride=1):
+        super().__init__(in_channels, out_channels, norm_layer=norm_layer, stride=stride)
+
+        # the CREStereo base architecture changes the number of channels
+        # even on grids with the same spatial resolution
+        if in_channels != out_channels:
+            self.downsample = Conv2dNormActivation(
+                in_channels,
+                out_channels,
+                norm_layer=norm_layer,
+                kernel_size=1,
+                stride=stride,
+                bias=True,
+                activation_layer=None,
+            )
+
+
+class FeatureEncoder(raft.FeatureEncoder):
+    """Base encoder for Feature Encoder and Context Encoder"""
+
+    def __init__(
+        self,
+        *,
+        block: Callable[..., nn.Module] = ResidualBlock,
+        layers: Tuple[int, int, int, int, int] = (64, 64, 96, 128, 256),
+        strides: Tuple[int, int, int, int] = (2, 1, 2, 1),
+        norm_layer: Callable[..., nn.Module] = nn.InstanceNorm2d,
+    ):
+        super().__init__(block=block, layers=layers, strides=strides, norm_layer=norm_layer)
+        for s in strides:
+            if s not in [1, 2]:
+                raise ValueError(f"FeatureEncoder unsupported stride size {s}.
Supported values are one of ``[1, 2]``.")
+
+        self.output_dim = layers[-1]
+        num_downsamples = len(list(filter(lambda s: s == 2, strides)))
+        self.downsample_factor = 2 ** num_downsamples
+
+
+class ConvexMaskPredictor(nn.Module):
+    def __init__(
+        self,
+        *,
+        in_channels: int,
+        hidden_size: int,
+        upsample_factor: int,
+        multiplier: float = 0.25,
+    ) -> None:
+
+        super().__init__()
+        self.mask_head = nn.Sequential(
+            Conv2dNormActivation(in_channels, hidden_size, norm_layer=None, kernel_size=3),
+            nn.Conv2d(hidden_size, upsample_factor ** 2 * 9, 1, padding=0),
+        )
+
+        self.multiplier = multiplier
+
+    def forward(self, x: Tensor) -> Tensor:
+        x = self.mask_head(x) * self.multiplier
+        return x
+
+
+class AdaptiveGroupCorrelationLayer(nn.Module):
+    """
+    Container for computing various correlation types between a left and right feature map.
+    This module does not contain any optimisable parameters, it's solely a collection of ops.
+    We wrap it in an nn.Module for torch.jit.script compatibility.
+
+    Adaptive Group Correlation operations from: https://openaccess.thecvf.com/content/CVPR2022/papers/Li_Practical_Stereo_Matching_via_Cascaded_Recurrent_Network_With_Adaptive_Correlation_CVPR_2022_paper.pdf
+
+    Canonical reference implementation: https://github.com/megvii-research/CREStereo/blob/master/nets/corr.py
+    """
+
+    def __init__(
+        self,
+        attention_module: Optional[nn.Module] = None,
+        groups: int = 4,
+        search_window_1d: Tuple[int, int] = (1, 9),
+        search_dilate_1d: Tuple[int, int] = (1, 1),
+        search_window_2d: Tuple[int, int] = (3, 3),
+        search_dilate_2d: Tuple[int, int] = (1, 1),
+    ) -> None:
+        super().__init__()
+        self.attention_module = attention_module
+
+        assert np.prod(search_window_1d) == np.prod(search_window_2d), (
+            f"The 1D and 2D windows should contain the same number of elements. "
+            f"1D shape: {search_window_1d} 2D shape: {search_window_2d}"
+        )
+
+        assert np.prod(search_window_1d) % 2 == 1, (
+            f"Search windows should contain an odd number of elements. "
+            f"Window of shape {search_window_1d} has {np.prod(search_window_1d)} elements."
+        )
+
+        assert any(
+            size == 1 for size in search_window_1d
+        ), f"The 1D search window should have at least one size equal to 1. 1D shape: {search_window_1d}"
+
+        assert all(
+            size != 1 for size in search_window_2d
+        ), f"The 2D search window should have all dimensions greater than 1.
2D shape: {search_window_2d}"

+        self.search_window_1d = search_window_1d
+        self.search_window_2d = search_window_2d
+
+        self.search_dilate_1d = search_dilate_1d
+        self.search_dilate_2d = search_dilate_2d
+
+        self.groups = groups
+
+        # two selection tables for dealing with the small_patch argument in the forward function
+        self.patch_sizes = {
+            True: [self.search_window_2d for _ in range(self.groups)],
+            False: [self.search_window_1d for _ in range(self.groups)],
+        }
+
+        self.dilate_sizes = {
+            True: [self.search_dilate_2d for _ in range(self.groups)],
+            False: [self.search_dilate_1d for _ in range(self.groups)],
+        }
+
+    def forward(
+        self,
+        left_features: Tensor,
+        right_features: Tensor,
+        flow: torch.Tensor,
+        extra_offset: Union[torch.Tensor, None],
+        use_small_patch: bool = False,
+        iter_mode: bool = False,
+    ):
+        if iter_mode or extra_offset is None:
+            corr = self.iterative_correlation(left_features, right_features, flow, use_small_patch)
+        else:
+            corr = self.attention_offset_correlation(left_features, right_features, flow, extra_offset, use_small_patch)  # type: ignore
+        return corr
+
+    def _make_coords(self, feature_map: Tensor) -> Tensor:
+        return make_coords_grid(feature_map.shape[0], feature_map.shape[2], feature_map.shape[3]).to(feature_map.device)
+
+    def get_correlation(
+        self,
+        left_feature: Tensor,
+        right_feature: Tensor,
+        window_size: Tuple[int, int] = (3, 3),
+        dilate: Tuple[int, int] = (1, 1),
+    ) -> Tensor:
+        """Function that computes a correlation product between the left and right features.
+
+        The correlation is computed in a sliding window fashion, namely the left features are fixed
+        and for each ``(i, j)`` location we compute the correlation with a sliding window anchored in
+        ``(i, j)`` from the right feature map. The sliding window selects pixels in the range
+        ``(i - window_size // 2, i + window_size // 2)``, respectively ``(j - window_size // 2, j + window_size // 2)``.
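+
+        For example, a ``window_size`` of ``(1, 9)`` with ``dilate=(1, 1)`` compares each left pixel
+        against 9 horizontally neighbouring candidates on the right feature map, producing a
+        correlation volume of shape ``[B, 9, H, W]``.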
+        """
+
+        B, C, H, W = left_feature.shape
+
+        di_y, di_x = dilate[0], dilate[1]
+        pad_y, pad_x = window_size[0] // 2 * di_y, window_size[1] // 2 * di_x
+
+        right_padded = F.pad(right_feature, (pad_x, pad_x, pad_y, pad_y), mode="replicate")
+        right_padded = right_padded.detach()
+        # in order to vectorize the correlation computation over all pixel candidates
+        # we create multiple shifted right images which we stack on an extra dimension
+        right_padded = F.unfold(right_padded, kernel_size=(H, W), dilation=dilate)
+        # torch unfold returns a tensor of shape [B, flattened_values, n_selections]
+        right_padded = right_padded.permute(0, 2, 1)
+        # then we reshape back into [B, n_views, C, H, W]
+        right_padded = right_padded.reshape(B, (window_size[0] * window_size[1]), C, H, W)
+        # we expand the left features for broadcasting
+        left_feature = left_feature.unsqueeze(1)
+        # this computes an element-wise product between [B, 1, C, H, W] * [B, n_views, C, H, W];
+        # to obtain correlations over the pixel candidates we perform a mean on the C dimension
+        correlation = torch.mean(left_feature * right_padded, dim=2, keepdim=False)
+        # the final correlation tensor shape will be [B, n_views, H, W]
+        # where on the i-th position of the n_views dimension we will have
+        # the correlation value between the left pixel
+        # and the i-th candidate on the right feature map
+        return correlation
+
+    def iterative_correlation(
+        self, left_feature: Tensor, right_feature: Tensor, flow: Tensor, use_small_patch: bool = False
+    ) -> Tensor:
+        """Function that computes 1 pass of non-offset group-wise correlation"""
+        coords = self._make_coords(left_feature)
+
+        # we offset the coordinate grid in the flow direction
+        coords = coords + flow
+        coords = coords.permute(0, 2, 3, 1)
+        # resample right features according to the offset grid
+        right_feature = grid_sample(right_feature, coords, mode="bilinear", align_corners=True)
+
+        # use_small_patch is a flag by which we decide on how many axes
+        # we perform candidate search. See section 3.1 ``Deformable search window`` & Figure 4 in the paper.
+        patch_size_list = self.patch_sizes[use_small_patch]
+        dilate_size_list = self.dilate_sizes[use_small_patch]
+
+        # chunk the left and right features to perform group-wise correlation,
+        # a mechanism similar to GroupNorm. See section 3.1 ``Group-wise correlation``.
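+        # e.g. with the base configuration of 256 feature channels and groups=4,
+        # each correlation group covers a 64-channel subset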
+        left_groups = torch.chunk(left_feature, self.groups, dim=1)
+        right_groups = torch.chunk(right_feature, self.groups, dim=1)
+
+        correlations = []
+        # rather than performing the correlation product over the entire C dimension,
+        # we use subsets of C to obtain multiple correlation sets
+        for i in range(len(patch_size_list)):
+            correlation = self.get_correlation(left_groups[i], right_groups[i], patch_size_list[i], dilate_size_list[i])
+            correlations.append(correlation)
+        final_correlations = torch.cat(correlations, dim=1)
+        return final_correlations
+
+    def attention_offset_correlation(
+        self,
+        left_feature: Tensor,
+        right_feature: Tensor,
+        flow: Tensor,
+        extra_offset: Tensor,
+        use_small_patch: bool = False,
+    ):
+        """Function that computes 1 pass of offset group-wise correlation
+
+        If the class was provided with an attention layer, the left and right feature maps
+        are passed through a transformer first
+        """
+        B, C, H, W = left_feature.shape
+
+        if self.attention_module is not None:
+            # prepare the input shapes required by the transformer
+            left_feature = left_feature.permute(0, 2, 3, 1).reshape(B, H * W, C)
+            right_feature = right_feature.permute(0, 2, 3, 1).reshape(B, H * W, C)
+            # this can be either self attention or cross attention, hence the tuple return
+            left_feature, right_feature = self.attention_module(left_feature, right_feature)
+            left_feature = left_feature.reshape(B, H, W, C).permute(0, 3, 1, 2)
+            right_feature = right_feature.reshape(B, H, W, C).permute(0, 3, 1, 2)
+
+        left_groups = torch.chunk(left_feature, self.groups, dim=1)
+        right_groups = torch.chunk(right_feature, self.groups, dim=1)
+
+        num_search_candidates = 9
+        # for each pixel (i, j) we have a number of search candidates
+        # thus, for each candidate we should have an X-axis and Y-axis offset value
+        extra_offset = extra_offset.reshape(B, num_search_candidates, 2, H, W).permute(0, 1, 3, 4, 2)
+
+        # see ``iterative_correlation`` for details on the patch / dilation selection
+        patch_size_list = self.patch_sizes[use_small_patch]
+        dilate_size_list = self.dilate_sizes[use_small_patch]
+
+        group_channels = C // self.groups
+        correlations = []
+
+        for i in range(len(patch_size_list)):
+            left_group, right_group = left_groups[i], right_groups[i]
+            patch_size, dilate = patch_size_list[i], dilate_size_list[i]
+
+            di_y, di_x = dilate
+            ps_y, ps_x = patch_size
+            # define the search range based on the window patch shape
+            ry, rx = ps_y // 2 * di_y, ps_x // 2 * di_x
+
+            # base offsets for search (i.e. where to look on the search index)
+            x_grid, y_grid = torch.meshgrid(
+                torch.arange(-rx, rx + 1, di_x), torch.arange(-ry, ry + 1, di_y), indexing="xy"
+            )
+            x_grid, y_grid = x_grid.to(flow.device), y_grid.to(flow.device)
+            offsets = torch.stack((x_grid, y_grid))
+            offsets = offsets.reshape(2, -1).permute(1, 0)
+
+            for d in (0, 2, 3):
+                offsets = offsets.unsqueeze(d)
+            # extra offsets for search (i.e. deformed search indexes.
Similar concept to deformable convolutions)
+            offsets = offsets + extra_offset
+
+            coords = self._make_coords(left_feature) + flow
+            coords = coords.permute(0, 2, 3, 1).unsqueeze(1)
+            coords = coords + offsets
+            coords = coords.reshape(B, -1, W, 2)
+
+            right_group = grid_sample(right_group, coords, mode="bilinear", align_corners=True)
+            # we do not need to perform any window shifting because the grid sample op
+            # will return a multi-view right based on the num_search_candidates dimension in the offsets
+            right_group = right_group.reshape(B, -1, group_channels, H, W)
+            left_group = left_group.reshape(B, -1, group_channels, H, W)
+            correlation = torch.mean(left_group * right_group, dim=2)
+            correlations.append(correlation)
+
+        final_correlation = torch.cat(correlations, dim=1)
+        return final_correlation
+
+
+def elu_feature_map(x: Tensor) -> Tensor:
+    """Elu feature map operation from: https://arxiv.org/pdf/2006.16236.pdf"""
+    return F.elu(x) + 1
+
+
+class LinearAttention(nn.Module):
+    """
+    Linear attention operation from: https://arxiv.org/pdf/2006.16236.pdf
+    Canonical implementation reference: https://github.com/idiap/fast-transformers/blob/master/fast_transformers/attention/linear_attention.py
+    LoFTR implementation reference: https://github.com/zju3dv/LoFTR/blob/2122156015b61fbb650e28b58a958e4d632b1058/src/loftr/loftr_module/linear_attention.py
+    """
+
+    def __init__(self, eps: float = 1e-6, feature_map_fn: Callable[[Tensor], Tensor] = elu_feature_map) -> None:
+        super().__init__()
+        self.eps = eps
+        self.feature_map_fn = feature_map_fn
+
+    def forward(
+        self,
+        queries: Tensor,
+        keys: Tensor,
+        values: Tensor,
+        q_mask: Optional[Tensor] = None,
+        kv_mask: Optional[Tensor] = None,
+    ):
+        """
+        Args:
+            queries (torch.Tensor): [N, S1, H, D]
+            keys (torch.Tensor): [N, S2, H, D]
+            values (torch.Tensor): [N, S2, H, D]
+            q_mask (torch.Tensor): [N, S1] (optional)
+            kv_mask (torch.Tensor): [N, S2] (optional)
+        Returns:
+            queried_values (torch.Tensor): [N, S1, H, D]
+        """
+        # per the linear attention formulation, the kernel feature map is applied to queries and keys
+        queries = self.feature_map_fn(queries)
+        keys = self.feature_map_fn(keys)
+
+        if q_mask is not None:
+            queries = queries * q_mask[:, :, None, None]
+        if kv_mask is not None:
+            keys = keys * kv_mask[:, :, None, None]
+            values = values * kv_mask[:, :, None, None]
+
+        # mitigates fp16 overflows
+        values_length = values.shape[1]
+        values = values / values_length
+        kv = torch.einsum("NSHD, NSHV -> NHDV", keys, values)
+        z = 1 / (torch.einsum("NLHD, NHD -> NLH", queries, keys.sum(dim=1)) + self.eps)
+        # rescale at the end to account for the fp16 mitigation
+        queried_values = torch.einsum("NLHD, NHDV, NLH -> NLHV", queries, kv, z) * values_length
+        return queried_values
+
+
+class SoftmaxAttention(nn.Module):
+    """
+    A simple softmax attention operation
+    LoFTR implementation reference: https://github.com/zju3dv/LoFTR/blob/2122156015b61fbb650e28b58a958e4d632b1058/src/loftr/loftr_module/linear_attention.py
+    """
+
+    def __init__(self, dropout: float = 0.0) -> None:
+        super().__init__()
+        self.dropout = nn.Dropout(dropout) if dropout else nn.Identity()
+
+    def forward(
+        self,
+        queries: Tensor,
+        keys: Tensor,
+        values: Tensor,
+        q_mask: Optional[Tensor] = None,
+        kv_mask: Optional[Tensor] = None,
+    ):
+        """
+        Computes classical softmax full-attention between all queries and keys.
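+        Unlike ``LinearAttention``, the full ``S1 x S2`` attention matrix is materialized here,
+        so time and memory scale quadratically with the sequence lengths.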
+
+        Args:
+            queries (torch.Tensor): [N, S1, H, D]
+            keys (torch.Tensor): [N, S2, H, D]
+            values (torch.Tensor): [N, S2, H, D]
+            q_mask (torch.Tensor): [N, S1] (optional)
+            kv_mask (torch.Tensor): [N, S2] (optional)
+        Returns:
+            queried_values: [N, S1, H, D]
+        """
+
+        scale_factor = 1.0 / queries.shape[3] ** 0.5  # 1 / sqrt(D) scaling
+        queries = queries * scale_factor
+
+        qk = torch.einsum("NLHD, NSHD -> NLSH", queries, keys)
+        if kv_mask is not None and q_mask is not None:
+            qk.masked_fill_(~(q_mask[:, :, None, None] * kv_mask[:, None, :, None]), float("-inf"))
+
+        attention = torch.softmax(qk, dim=2)
+        attention = self.dropout(attention)
+
+        queried_values = torch.einsum("NLSH, NSHD -> NLHD", attention, values)
+        return queried_values
+
+
+class PositionalEncodingSine(nn.Module):
+    """
+    Sinusoidal positional encodings
+
+    Using the scaling term from https://github.com/megvii-research/CREStereo/blob/master/nets/attention/position_encoding.py
+    Reference implementation from https://github.com/facebookresearch/detr/blob/8a144f83a287f4d3fece4acdf073f387c5af387d/models/position_encoding.py#L28-L48
+    """
+
+    def __init__(self, dim_model: int) -> None:
+        super().__init__()
+        self.dim_model = dim_model
+        self.scale_factor = -math.log(10_000) / (dim_model // 2)
+
+    def forward(self, x: Tensor) -> Tensor:
+        """
+        Args:
+            x: [B, C, H, W]
+        """
+        torch._assert(
+            len(x.shape) == 4,
+            f"PositionalEncodingSine requires a 4-dimensional input. Provided tensor is of shape {x.shape}",
+        )
+
+        coords = torch.ones(size=x.shape[2:], dtype=x.dtype, device=x.device)
+        positions_y = coords.cumsum(0).unsqueeze(0).unsqueeze(-1)
+        positions_x = coords.cumsum(1).unsqueeze(0).unsqueeze(-1)
+
+        div_term = torch.exp(torch.arange(0, self.dim_model // 2, dtype=x.dtype, device=x.device) * self.scale_factor)
+        positions_x = positions_x * div_term
+        positions_y = positions_y * div_term
+
+        positions_x = torch.stack((positions_x[..., 0::2].sin(), positions_x[..., 1::2].cos()), dim=4).flatten(3)
+        positions_y = torch.stack((positions_y[..., 0::2].sin(), positions_y[..., 1::2].cos()), dim=4).flatten(3)
+
+        positional_embeddings = torch.cat((positions_x, positions_y), dim=3).permute(0, 3, 1, 2)
+        return x + positional_embeddings
+
+
+class LocalFeatureEncoderLayer(nn.Module):
+    """
+    LoFTR transformer module from: https://arxiv.org/pdf/2104.00680.pdf
+    Canonical implementation at: https://github.com/zju3dv/LoFTR/blob/master/src/loftr/loftr_module/transformer.py
+    """
+
+    def __init__(
+        self,
+        *,
+        dim_model: int,
+        num_heads: int,
+        attention_type: str = "linear",
+    ) -> None:
+        super().__init__()
+
+        if attention_type not in ["linear", "softmax"]:
+            raise ValueError(
+                f"Unsupported attention type {attention_type}.
LocalFeatureEncoderLayer supports one of ``[linear, softmax]``"
+            )
+
+        self.dim_head = dim_model // num_heads
+        self.num_heads = num_heads
+
+        # multi-head attention
+        self.query_proj = nn.Linear(dim_model, dim_model, bias=False)
+        self.key_proj = nn.Linear(dim_model, dim_model, bias=False)
+        self.value_proj = nn.Linear(dim_model, dim_model, bias=False)
+        self.attention_op = LinearAttention() if attention_type == "linear" else SoftmaxAttention()
+        self.merge = nn.Linear(dim_model, dim_model, bias=False)
+
+        # feed forward network
+        self.ffn = nn.Sequential(
+            nn.Linear(dim_model * 2, dim_model * 2, bias=False),
+            nn.ReLU(),
+            nn.Linear(dim_model * 2, dim_model, bias=False),
+        )
+
+        # norm layers
+        self.attention_norm = nn.LayerNorm(dim_model)
+        self.ffn_norm = nn.LayerNorm(dim_model)
+
+    def forward(self, x: Tensor, source: Tensor, x_mask: Optional[Tensor] = None, source_mask: Optional[Tensor] = None):
+        """
+        Args:
+            x (torch.Tensor): [B, S1, D]
+            source (torch.Tensor): [B, S2, D]
+            x_mask (torch.Tensor): [B, S1] (optional)
+            source_mask (torch.Tensor): [B, S2] (optional)
+        """
+        B, S, D = x.shape
+        queries, keys, values = x, source, source
+
+        queries = self.query_proj(queries).reshape(B, S, self.num_heads, self.dim_head)
+        keys = self.key_proj(keys).reshape(B, S, self.num_heads, self.dim_head)
+        values = self.value_proj(values).reshape(B, S, self.num_heads, self.dim_head)
+
+        # attention operation
+        message = self.attention_op(queries, keys, values, x_mask, source_mask)
+        # concatenate the attention heads before passing through the projection layer
+        message = self.merge(message.reshape(B, S, D))
+        message = self.attention_norm(message)
+
+        # ffn operation; the feed forward output goes through its own norm layer
+        message = self.ffn(torch.cat([x, message], dim=2))
+        message = self.ffn_norm(message)
+
+        return x + message
+
+
+class LocalFeatureTransformer(nn.Module):
+    """
+    LoFTR transformer module from: https://arxiv.org/pdf/2104.00680.pdf
+    Canonical implementation at: https://github.com/zju3dv/LoFTR/blob/master/src/loftr/loftr_module/transformer.py
+    """
+
+    def __init__(
+        self,
+        *,
+        dim_model: int,
+        num_heads: int,
+        attention_directions: List[str],
+        attention_type: str = "linear",
+    ) -> None:
+        super().__init__()
+
+        self.attention_directions = attention_directions
+        for direction in attention_directions:
+            if direction not in ["self", "cross"]:
+                raise ValueError(
+                    f"Attention direction {direction} unsupported. LocalFeatureTransformer accepts only ``attention_directions`` in ``[self, cross]``."
+                )
+
+        self.layers = nn.ModuleList(
+            [
+                LocalFeatureEncoderLayer(dim_model=dim_model, num_heads=num_heads, attention_type=attention_type)
+                for _ in attention_directions
+            ]
+        )
+
+    def forward(
+        self,
+        left_features: Tensor,
+        right_features: Tensor,
+        left_mask: Optional[Tensor] = None,
+        right_mask: Optional[Tensor] = None,
+    ):
+        """
+        Args:
+            left_features (torch.Tensor): [N, S1, D]
+            right_features (torch.Tensor): [N, S2, D]
+            left_mask (torch.Tensor): [N, S1] (optional)
+            right_mask (torch.Tensor): [N, S2] (optional)
+        Returns:
+            left_features (torch.Tensor): [N, S1, D]
+            right_features (torch.Tensor): [N, S2, D]
+        """
+
+        torch._assert(
+            left_features.shape[2] == right_features.shape[2],
+            f"left_features and right_features should have the same embedding dimensions.
left_features: {left_features.shape[2]} right_features: {right_features.shape[2]}",
+        )
+
+        # zip() over the ModuleList is not torch.jit.script friendly, so we index the directions instead
+        for idx, layer in enumerate(self.layers):
+            attention_direction = self.attention_directions[idx]
+
+            if attention_direction == "self":
+                left_features = layer(left_features, left_features, left_mask, left_mask)
+                right_features = layer(right_features, right_features, right_mask, right_mask)
+
+            elif attention_direction == "cross":
+                left_features = layer(left_features, right_features, left_mask, right_mask)
+                right_features = layer(right_features, left_features, right_mask, left_mask)
+
+        return left_features, right_features
+
+
+class PyramidDownsample(nn.Module):
+    """
+    A simple wrapper that returns an average-pooled feature pyramid based on the provided scales.
+    The input itself is implicitly returned as the first pyramid level.
+    """
+
+    def __init__(self, factors: Iterable[int]) -> None:
+        super().__init__()
+        self.factors = factors
+
+    def forward(self, x: torch.Tensor) -> List[Tensor]:
+        results = [x]
+        for factor in self.factors:
+            results.append(F.avg_pool2d(x, kernel_size=factor, stride=factor))
+        return results
+
+
+class CREStereo(nn.Module):
+    """
+    CREStereo network from: https://openaccess.thecvf.com/content/CVPR2022/papers/Li_Practical_Stereo_Matching_via_Cascaded_Recurrent_Network_With_Adaptive_Correlation_CVPR_2022_paper.pdf
+
+    Canonical implementation: https://github.com/megvii-research/CREStereo/blob/master/nets/crestereo.py
+    """
+
+    def __init__(
+        self,
+        *,
+        feature_encoder: FeatureEncoder,
+        update_block: raft.UpdateBlock,
+        flow_head: raft.FlowHead,
+        self_attn_block: LocalFeatureTransformer,
+        cross_attn_block: LocalFeatureTransformer,
+        feature_downsample_rates: Tuple[int, ...] = (2, 4),
+        correlation_groups: int = 4,
+        search_window_1d: Tuple[int, int] = (1, 9),
+        search_dilate_1d: Tuple[int, int] = (1, 1),
+        search_window_2d: Tuple[int, int] = (3, 3),
+        search_dilate_2d: Tuple[int, int] = (1, 1),
+    ) -> None:
+        super().__init__()
+
+        self.feature_encoder = feature_encoder
+        self.update_block = update_block
+        self.flow_head = flow_head
+        self.self_attn_block = self_attn_block
+
+        # average pooling for the feature encoder outputs
+        self.downsampling_pyramid = PyramidDownsample(feature_downsample_rates)
+        self.downsampling_factors: List[int] = [feature_encoder.downsample_factor]
+        base_downsample_factor: int = self.downsampling_factors[0]
+        for rate in feature_downsample_rates:
+            self.downsampling_factors.append(base_downsample_factor * rate)
+
+        # output resolution tracking
+        self.resolutions: List[str] = [f"1 / {factor}" for factor in self.downsampling_factors]
+        self.search_pixels = int(np.prod(search_window_1d))
+
+        # flow convex upsampling mask predictor
+        self.mask_predictor = ConvexMaskPredictor(
+            in_channels=feature_encoder.output_dim // 2,
+            hidden_size=feature_encoder.output_dim,
+            upsample_factor=4,
+            multiplier=0.25,
+        )
+
+        # offset modules for offset-based feature selection
+        self.offset_convs = nn.ModuleDict()
+        self.correlation_layers = nn.ModuleDict()
+
+        offset_conv_layer = partial(
+            Conv2dNormActivation,
+            in_channels=feature_encoder.output_dim,
+            out_channels=self.search_pixels * 2,
+            norm_layer=None,
+            activation_layer=None,
+        )
+
+        correlation_layer = partial(
+            AdaptiveGroupCorrelationLayer,
+            groups=correlation_groups,
+            search_window_1d=search_window_1d,
+            search_dilate_1d=search_dilate_1d,
+            search_window_2d=search_window_2d,
+            search_dilate_2d=search_dilate_2d,
+        )
+
+        # populate the dicts in top
to bottom order,
+        # which is the iteration order required by the forward pass under torch.jit.script
+        #
+        # Ignore the largest resolution. We handle that separately due to torch.jit.script
+        # not being able to access runtime-generated keys in ModuleDicts.
+        # This way, we can keep a generic way of processing all pyramid levels except
+        # the final one
+
+        for idx, resolution in enumerate(reversed(self.resolutions[1:])):
+            # the largest resolution does not use offset convolutions for sampling grid coords
+            offset_conv = None if idx == len(self.resolutions) - 1 else offset_conv_layer()
+            if offset_conv:
+                self.offset_convs[resolution] = offset_conv
+            # only the lowest resolution uses the cross attention module when computing correlation scores
+            self.correlation_layers[resolution] = (
+                correlation_layer(attention_module=cross_attn_block) if idx == 0 else correlation_layer()
+            )
+
+        # correlation layer for the largest resolution
+        self.max_res_correlation_layer = correlation_layer()
+
+        # simple 2D Positional Encodings
+        self.positional_encodings = PositionalEncodingSine(feature_encoder.output_dim)
+
+    def freeze_bn(self):
+        for m in self.modules():
+            if isinstance(m, nn.BatchNorm2d):
+                m.eval()
+
+    def unfreeze_bn(self):
+        for m in self.modules():
+            if isinstance(m, nn.BatchNorm2d):
+                m.train()
+
+    def forward(self, left_image: Tensor, right_image: Tensor, flow_init: Optional[Tensor], iterations: int = 10):
+        features = torch.cat([left_image, right_image], dim=0)
+        features = self.feature_encoder(features)
+        left_features, right_features = features.chunk(2, dim=0)
+
+        # update block network state and input context are derived from the left feature map
+        net, ctx = left_features.chunk(2, dim=1)
+        net = torch.tanh(net)
+        ctx = torch.relu(ctx)
+
+        # each call below outputs a list of tensors, one entry per pyramid level, finest first.
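+        # e.g. with the base configuration (1/4-resolution encoder features, downsample rates (2, 4))
+        # the pyramid holds the 1/4, 1/8 and 1/16 resolution feature maps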
+        l_pyramid = self.downsampling_pyramid(left_features)
+        r_pyramid = self.downsampling_pyramid(right_features)
+        net_pyramid = self.downsampling_pyramid(net)
+        ctx_pyramid = self.downsampling_pyramid(ctx)
+
+        # we re-key the pyramid levels by their resolution string for easier lookup
+        l_pyramid: Dict[str, Tensor] = {res: l_pyramid[idx] for idx, res in enumerate(self.resolutions)}
+        r_pyramid: Dict[str, Tensor] = {res: r_pyramid[idx] for idx, res in enumerate(self.resolutions)}
+        net_pyramid: Dict[str, Tensor] = {res: net_pyramid[idx] for idx, res in enumerate(self.resolutions)}
+        ctx_pyramid: Dict[str, Tensor] = {res: ctx_pyramid[idx] for idx, res in enumerate(self.resolutions)}
+
+        # offsets for sampling pixel candidates in the correlation ops
+        offsets: Dict[str, Tensor] = {}
+        for resolution, offset_conv in self.offset_convs.items():
+            feature_map = l_pyramid[resolution]
+            offset = offset_conv(feature_map)
+            offsets[resolution] = (torch.sigmoid(offset) - 0.5) * 2.0
+
+        # the smallest resolution is prepared for passing through self attention
+        min_res = self.resolutions[-1]
+        max_res = self.resolutions[0]
+
+        B, C, MIN_H, MIN_W = l_pyramid[min_res].shape
+        # add positional encodings
+        l_pyramid[min_res] = self.positional_encodings(l_pyramid[min_res])
+        r_pyramid[min_res] = self.positional_encodings(r_pyramid[min_res])
+        # reshaping for the transformer
+        l_pyramid[min_res] = l_pyramid[min_res].permute(0, 2, 3, 1).reshape(B, MIN_H * MIN_W, C)
+        r_pyramid[min_res] = r_pyramid[min_res].permute(0, 2, 3, 1).reshape(B, MIN_H * MIN_W, C)
+        # perform self attention
+        l_pyramid[min_res], r_pyramid[min_res] = self.self_attn_block(l_pyramid[min_res], r_pyramid[min_res])
+        # now we need to reshape back into [B, C, H, W] format
+        l_pyramid[min_res] = l_pyramid[min_res].reshape(B, MIN_H, MIN_W, C).permute(0, 3, 1, 2)
+        r_pyramid[min_res] = r_pyramid[min_res].reshape(B, MIN_H, MIN_W, C).permute(0, 3, 1, 2)
+
+        predictions: List[Tensor] = []
+        flow_estimates: Dict[str, Tensor] = {}
+        # we pre-allocate this because of torch.jit.script;
+        # also, the prediction prior always has the
+        # spatial size of the features output by the feature encoder
+        flow_pred_prior: Tensor = torch.empty(
+            size=(B, 2, left_features.shape[2], left_features.shape[3]),
+            dtype=l_pyramid[max_res].dtype,
+            device=l_pyramid[max_res].device,
+        )
+
+        if flow_init is not None:
+            scale = l_pyramid[max_res].shape[2] // flow_init.shape[2]
+            # the CREStereo implementation multiplies with -scale instead of scale;
+            # upsample_flow multiplies with scale, therefore we add the - in front
+            flow_estimates[max_res] = -upsample_flow(flow_init, up_mask=None, factor=scale)
+        # when not provided with a flow prior, we construct one using the lower resolution maps
+        else:
+            # initialize a zero flow at the smallest resolution
+            flow = torch.zeros(size=(B, 2, MIN_H, MIN_W), device=left_features.device, dtype=left_features.dtype)
+
+            # flows from coarse resolutions are refined similarly;
+            # we always need to fetch the next pyramid feature map as well
+            # when updating coarse resolutions, therefore we create a reversed
+            # view which has its order synced with the ModuleDict keys iterator
+            coarse_resolutions: List[str] = self.resolutions[::-1]  # using slicing because of torch.jit.script
+            fine_grained_resolution = max_res
+
+            # set the coarsest flow to the zero flow
+            flow_estimates[coarse_resolutions[0]] = flow
+
+            # the correlation layers in the ModuleDict are ordered from coarse to fine resolution
+            # and cover every resolution except the finest one,
+            # i.e. {"1 / 16": Module, "1 / 8": Module}
+            # for these resolutions we perform only half the number of refinement iterations
+            for idx, (resolution, correlation_layer) in enumerate(self.correlation_layers.items()):
+                # compute the scale difference between the first pyramid scale and the current pyramid scale
+                scale_to_base = l_pyramid[fine_grained_resolution].shape[2] // l_pyramid[resolution].shape[2]
+                for it in range(iterations // 2):
+                    # decide whether we search for correlation candidates on both (X, Y) axes or just on the X axis
+                    use_small_search_patch = (it % 2) == 1
+                    # we consider this a prior, therefore we do not want to back-propagate through it
+                    flow_estimates[resolution] = flow_estimates[resolution].detach()
+
+                    correlations = correlation_layer(
+                        l_pyramid[resolution],  # left
+                        r_pyramid[resolution],  # right
+                        flow_estimates[resolution],
+                        offsets[resolution],
+                        use_small_search_patch,
+                    )
+
+                    # update the recurrent network state and the flow deltas
+                    net_pyramid[resolution], delta_flow = self.update_block(
+                        net_pyramid[resolution], ctx_pyramid[resolution], correlations, flow_estimates[resolution]
+                    )
+
+                    # the convex upsampling weights are computed w.r.t.
+                    # the recurrent update state
+                    up_mask = self.mask_predictor(net_pyramid[resolution])
+                    flow_estimates[resolution] = flow_estimates[resolution] + delta_flow
+                    # convex upsampling with the initial feature encoder downsampling rate
+                    flow_pred_prior = upsample_flow(
+                        flow_estimates[resolution], up_mask, factor=self.downsampling_factors[0]
+                    )
+                    # we then bilinearly upsample to the final resolution,
+                    # using a factor equivalent to the difference between
+                    # the current downsample resolution and the base downsample resolution
+                    #
+                    # i.e. if a 1 / 16 flow is upsampled by 4 (base downsampling) we get a 1 / 4 flow;
+                    # therefore we have to further upscale it by the difference between
+                    # the current level 1 / 16 and the base level 1 / 4.
+                    flow_pred = -upsample_flow(flow_pred_prior, None, factor=scale_to_base)
+                    predictions.append(flow_pred)
+
+                # when constructing the next resolution prior, we resample w.r.t.
+                # the scale of the next level in the pyramid
+                next_resolution = coarse_resolutions[idx + 1]
+                scale_to_next = l_pyramid[next_resolution].shape[2] / flow_pred_prior.shape[2]
+                # we use flow_pred_prior because it is a more accurate estimate of the true flow
+                # due to the convex upsample, which resembles a learned super-resolution module.
+                # this is not necessarily an upsample, it can be a downsample, based on the provided configuration
+                flow_estimates[next_resolution] = -scale_to_next * F.interpolate(
+                    input=flow_pred_prior,
+                    size=l_pyramid[next_resolution].shape[2:],
+                    mode="bilinear",
+                    align_corners=True,
+                )
+
+        # finally we do a full pass through the fine-grained resolution,
+        # which coincides with the maximum resolution
+
+        # we keep a separate loop here in order to avoid Python control flow
+        # deciding how many iterations to run based on the current resolution;
+        # furthermore, if provided with an initial flow, there is no need to generate
+        # a prior estimate when moving into the final refinement stage
+
+        for it in range(iterations):
+            use_small_search_patch = (it % 2) == 1
+
+            flow_estimates[max_res] = flow_estimates[max_res].detach()
+            # we run the fine-grained resolution correlations in iterative mode,
+            # meaning that we use the fixed window pixel selections
+            # instead of the deformed ones as in the previous steps
+            correlations = self.max_res_correlation_layer(
+                l_pyramid[max_res],
+                r_pyramid[max_res],
+                flow_estimates[max_res],
+                extra_offset=None,
+                use_small_patch=use_small_search_patch,
+                iter_mode=True,
+            )
+
+            net_pyramid[max_res], delta_flow = self.update_block(
+                net_pyramid[max_res], ctx_pyramid[max_res], correlations, flow_estimates[max_res]
+            )
+
+            up_mask = self.mask_predictor(net_pyramid[max_res])
+            flow_estimates[max_res] = flow_estimates[max_res] + delta_flow
+            # at the final resolution we simply do a convex upsample using the base downsample rate
+            flow_pred = -upsample_flow(flow_estimates[max_res], up_mask, factor=self.downsampling_factors[0])
+            predictions.append(flow_pred)
+
+        return predictions
+
+
+def _crestereo(
+    *,
+    weights: Optional[WeightsEnum],
+    progress: bool,
+    # Feature Encoder
+    feature_encoder_layers: Tuple[int, int, int, int, int],
+    feature_encoder_strides: Tuple[int, int, int, int],
+    feature_encoder_block: Callable[..., nn.Module],
+    # Average Pooling Pyramid
+    feature_downsample_rates: Tuple[int, ...],
+    # Adaptive Correlation Layer
+    corr_groups: int,
+    corr_search_window_2d: Tuple[int, int],
+    corr_search_dilate_2d: Tuple[int, int],
+    corr_search_window_1d: Tuple[int, int],
+    corr_search_dilate_1d: Tuple[int, int],
+    # Flow head
+    flow_head_hidden_size: int,
+    # Recurrent block
+    recurrent_block_hidden_state_size: int,
+    recurrent_block_kernel_size: Tuple[Tuple[int, int], Tuple[int, int]],
+    recurrent_block_padding: Tuple[Tuple[int, int], Tuple[int, int]],
+    # Motion Encoder
+    motion_encoder_corr_layers: Tuple[int, int],
+    motion_encoder_flow_layers: Tuple[int, int],
+    motion_encoder_out_channels: int,
+    # Transformer Blocks
+    num_attention_heads: int,
+    num_self_attention_layers: int,
+    num_cross_attention_layers: int,
+    self_attention_type: str,
+    cross_attention_type: str,
+    **kwargs,
+) -> CREStereo:
+
+    feature_encoder = kwargs.pop("feature_encoder", None) or FeatureEncoder(
+        block=feature_encoder_block,
+        layers=feature_encoder_layers,
+        strides=feature_encoder_strides,
+        norm_layer=nn.InstanceNorm2d,
+    )
+
+    assert feature_encoder.output_dim % corr_groups == 0, (
+        f"Final ``feature_encoder_layers`` size should be divisible by the ``corr_groups`` argument. "
+        f"Feature encoder output size: {feature_encoder.output_dim}, Correlation groups: {corr_groups}."
+    )
+
+    motion_encoder = kwargs.pop("motion_encoder", None) or raft.MotionEncoder(
+        in_channels_corr=corr_groups * int(np.prod(corr_search_window_1d)),
+        corr_layers=motion_encoder_corr_layers,
+        flow_layers=motion_encoder_flow_layers,
+        out_channels=motion_encoder_out_channels,
+    )
+
+    out_channels_context = feature_encoder_layers[-1] - recurrent_block_hidden_state_size
+    recurrent_block = kwargs.pop("recurrent_block", None) or raft.RecurrentBlock(
+        input_size=motion_encoder.out_channels + out_channels_context,
+        hidden_size=recurrent_block_hidden_state_size,
+        kernel_size=recurrent_block_kernel_size,
+        padding=recurrent_block_padding,
+    )
+
+    flow_head = kwargs.pop("flow_head", None) or raft.FlowHead(
+        in_channels=out_channels_context, hidden_size=flow_head_hidden_size
+    )
+
+    update_block = raft.UpdateBlock(motion_encoder=motion_encoder, recurrent_block=recurrent_block, flow_head=flow_head)
+
+    self_attn_block = LocalFeatureTransformer(
+        dim_model=feature_encoder.output_dim,
+        num_heads=num_attention_heads,
+        attention_directions=["self"] * num_self_attention_layers,
+        attention_type=self_attention_type,
+    )
+
+    cross_attn_block = LocalFeatureTransformer(
+        dim_model=feature_encoder.output_dim,
+        num_heads=num_attention_heads,
+        attention_directions=["cross"] * num_cross_attention_layers,
+        attention_type=cross_attention_type,
+    )
+
+    model = CREStereo(
+        feature_encoder=feature_encoder,
+        update_block=update_block,
+        flow_head=flow_head,
+        self_attn_block=self_attn_block,
+        cross_attn_block=cross_attn_block,
+        feature_downsample_rates=feature_downsample_rates,
+        correlation_groups=corr_groups,
+        search_window_1d=corr_search_window_1d,
+        search_window_2d=corr_search_window_2d,
+        search_dilate_1d=corr_search_dilate_1d,
+        search_dilate_2d=corr_search_dilate_2d,
+    )
+
+    if weights is not None:
+        model.load_state_dict(weights.get_state_dict(progress=progress))
+
+    return model
+
+
+def crestereo_base(*, weights: Optional[WeightsEnum] = None, progress: bool = True, **kwargs) -> CREStereo:
+    return _crestereo(
+        weights=weights,
+        progress=progress,
+        # Feature encoder
+        feature_encoder_layers=(64, 64, 96, 128, 256),
+        feature_encoder_strides=(2, 1, 2, 1),
+        feature_encoder_block=ResidualBlock,
+        # Average pooling pyramid
+        feature_downsample_rates=(2, 4),
+        # Motion encoder
+        motion_encoder_corr_layers=(256, 192),
+        motion_encoder_flow_layers=(128, 64),
+        motion_encoder_out_channels=256,
+        # Recurrent block
+        recurrent_block_hidden_state_size=128,
+        recurrent_block_kernel_size=((1, 5), (5, 1)),
+        recurrent_block_padding=((0, 2), (2, 0)),
+        # Flow head
+        flow_head_hidden_size=256,
+        # Transformer blocks
+        num_attention_heads=8,
+        num_self_attention_layers=1,
+        num_cross_attention_layers=1,
+        self_attention_type="linear",
+        cross_attention_type="linear",
+        # Adaptive Correlation layer
+        corr_groups=4,
+        corr_search_window_2d=(3, 3),
+        corr_search_dilate_2d=(1, 1),
+        corr_search_window_1d=(1, 9),
+        corr_search_dilate_1d=(1, 1),
+    )
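A minimal usage sketch for the new builder, mirroring ``test_crestereo`` above (the output shape and the ``iterations`` argument are taken from that test; ``weights=None`` since this draft ships no pretrained weights):

    import torch
    from torchvision.prototype.models.depth.stereo.crestereo import crestereo_base

    model = crestereo_base(weights=None).eval()
    left = torch.rand(1, 3, 256, 256)
    right = torch.rand(1, 3, 256, 256)
    with torch.no_grad():
        # one prediction per refinement iteration, the last being the most refined
        preds = model(left, right, flow_init=None, iterations=3)
    print(preds[-1].shape)  # torch.Size([1, 2, 256, 256])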