In [None]:
# default_exp data.read

In [None]:
# hide
%load_ext autoreload
%autoreload 2

# Reading data 
> Defines utility functions for reading and manipulating data.

In [None]:
# hide
from nbdev.showdoc import *

In [None]:
# export
from grade_classif.core import ifnone
from grade_classif.imports import *
from fastcore.foundation import L, setify

In [None]:
# export
def _check_include(obj: Path, include: Sequence[str]) -> bool:
    return include is None or obj.name in include

In [None]:
# export
def _check_exclude(obj: Path, exclude: Sequence[str]) -> bool:
    return exclude is None or obj.name not in exclude

In [None]:
# export
def _check_valid(obj: Path, include: Sequence[str], exclude: Sequence[str]) -> bool:
    """
    Tell whether `obj` should be explored.
    Arguments:
        obj (Path): path to test (only `obj.name` is looked at).
        include (sequence of str): accepted names (if None accept all).
        exclude (sequence of str): rejected names (if None reject none).
    Returns:
        bool: True if `obj` passes both filters and its name does not start with a dot.
    """
    if not _check_include(obj, include):
        return False
    if not _check_exclude(obj, exclude):
        return False
    # names starting with a dot (hidden entries) are never valid
    return not obj.name.startswith(".")

In [None]:
# export
def _get_files(p, fs, extensions=None):
    p = Path(p)
    res = [
        p / f
        for f in fs
        if not f.startswith(".")
        and ((not extensions) or f'.{f.split(".")[-1].lower()}' in extensions)
    ]
    return res


def get_files(path, extensions=None, recurse=True, folders=None, followlinks=True):
    """
    List the files of a folder, optionally exploring its subfolders.
    Arguments:
        path (str or Path): input folder.
        extensions (collection of str): accepted extensions, with the leading dot;
            matching is case-insensitive. If None accept all files.
        recurse (bool): whether to explore subfolders or only `path` itself.
        folders (list of str): names of the direct subfolders to explore (if None
            explore all). When given, files located directly in `path` are only
            returned if "." is one of the names. Ignored if `recurse` is False.
        followlinks (bool): whether `os.walk` follows symlinked folders.
    Returns:
        L: paths to the found files, each one prefixed by `path`. Hidden files
            (name starting with a dot) are never returned.
    """
    root = Path(path)
    wanted = L(folders)
    exts = {ext.lower() for ext in setify(extensions)}

    if not recurse:
        names = [entry.name for entry in os.scandir(root) if entry.is_file()]
        return L(_get_files(root, names, exts))

    restrict = len(wanted) != 0
    found = []
    # os.walk yields (dirpath, dirnames, filenames); editing dirnames in place
    # prunes the folders that will be visited next
    for step, (dirpath, dirnames, filenames) in enumerate(
        os.walk(root, followlinks=followlinks)
    ):
        if restrict and step == 0:
            # first step is `path` itself: only keep the requested subfolders
            dirnames[:] = [name for name in dirnames if name in wanted]
            if "." not in wanted:
                continue
        else:
            dirnames[:] = [name for name in dirnames if not name.startswith(".")]
        found.extend(_get_files(dirpath, filenames, exts))
    return L(found)

In [None]:
# export
def get_leaf_folders(path, folders=None, followlinks=True):
    """
    Find all leaf folders (folders that contain no subfolder) under a folder.
    Arguments:
        path (str or Path): input folder.
        folders (list of str): names of the direct subfolders to explore (if None
            explore all non-hidden ones).
        followlinks (bool): whether `os.walk` follows symlinked folders.
    Returns:
        L: leaf folder paths, as the `str` values yielded by `os.walk` (each one
            prefixed by `path`).
    """
    path = Path(path)
    folders = L(folders)
    res = []
    for i, (p, d, _) in enumerate(
        os.walk(path, followlinks=followlinks)
    ):  # returns (dirpath, dirnames, filenames)
        # `p` is the walked path prefixed by `path`, so testing whether it starts
        # with "." would reject every folder when `path` is "." or "../x".
        # Hidden folders are already pruned from the walk below.
        # The leaf test runs before pruning: a folder whose only subfolders are
        # hidden (or not requested) is not reported as a leaf.
        if len(d) == 0:
            res.append(p)
            continue
        if len(folders) != 0 and i == 0:
            # first step is `path` itself: only keep the requested subfolders
            d[:] = [o for o in d if o in folders]
        else:
            d[:] = [o for o in d if not o.startswith(".")]
    return L(res)

In [None]:
# Sample patches used by the examples below, located next to this notebook.
data = Path.cwd() / "sample_data" / "Patches_MGI_256_7"

In [None]:
# export
def get_items(
    folder: Union[Path, str],
    label_func: Callable[[Path], bool],
    recurse: bool = True,
    extensions: Optional[Sequence[str]] = None,
    include: Optional[Sequence[str]] = None,
    exclude: Optional[Sequence[str]] = None,
    filterfunc: Optional[Callable[[Path], bool]] = None,
) -> Tuple[List[Path], List[Path]]:
    """
    Collect the files of `folder` together with their labels.
    Arguments:
        folder (Path or str): folder to explore.
        label_func (callable): called on each accepted file path; its result is
            stored as the label of that file.
        recurse (bool): whether to explore subfolders as well.
        extensions (sequence of str): accepted suffixes, compared as-is to
            `Path.suffix` (leading dot, case-sensitive). If None accept all files.
        include (sequence of str): names of the direct subfolders to explore (if
            None explore all).
        exclude (sequence of str): names of the direct subfolders to skip.
        filterfunc (callable): predicate on the file path; only files for which
            it returns a truthy value are kept. If None keep all files.
    Returns:
        tuple: (items, labels), two lists of the same length holding the accepted
            file paths and the matching `label_func` results.
    """
    items = []
    labels = []
    folder = Path(folder)
    filterfunc = ifnone(filterfunc, lambda x: True)
    for obj in folder.iterdir():
        if obj.is_file():
            # parentheses are required: `and` binds tighter than `or`, so without
            # them `filterfunc` was skipped whenever `extensions` was None
            if (extensions is None or obj.suffix in extensions) and filterfunc(obj):
                items.append(obj)
                labels.append(label_func(obj))
        elif recurse and _check_valid(obj, include, exclude):
            # `include`/`exclude` are not forwarded: they only apply to the
            # direct subfolders of the top-level `folder`
            items_r, labels_r = get_items(
                obj, label_func, extensions=extensions, filterfunc=filterfunc
            )
            items += items_r
            labels += labels_r
    return items, labels

Loads all items and labels in `folder`. Items are stored as `Path` objects. Labels are computed for each item using `label_func`. By default, the search will happen recursively in all subfolders. To disable this behaviour, use `recurse=False`. You can also specify a list of `extensions` to restrict the accepted files, as well as `include` and `exclude` folders (as `str`, these are direct subfolders of `folder`). `filterfunc` can be used to only accept objects for which it returns `True`. 

In [None]:
# The label of a patch is the third path component from the end.
items, labels = get_items(data, label_func=lambda patch: patch.parts[-3])
# show every third item/label only
(items[::3], labels[::3])

([PosixPath('/home/DeepLearning/grade_classif/nbs/sample_data/Patches_MGI_256_7/1/CF_PACS04rescan_07P0201/CF_PACS04rescan_07P0201_32768_98304.png'),
  PosixPath('/home/DeepLearning/grade_classif/nbs/sample_data/Patches_MGI_256_7/1/CF_PACS05HE_03026-04H2669/CF_PACS05HE_03026-04H2669_0_131072.png'),
  PosixPath('/home/DeepLearning/grade_classif/nbs/sample_data/Patches_MGI_256_7/1/CF_PACS04rescan_07P0208/CF_PACS04rescan_07P0208_32768_98304.png'),
  PosixPath('/home/DeepLearning/grade_classif/nbs/sample_data/Patches_MGI_256_7/1/CF_PACS04rescan_06P1306/CF_PACS04rescan_06P1306_0_98304.png'),
  PosixPath('/home/DeepLearning/grade_classif/nbs/sample_data/Patches_MGI_256_7/3/CF_PACS04rescan_06P242/CF_PACS04rescan_06P242_0_98304.png'),
  PosixPath('/home/DeepLearning/grade_classif/nbs/sample_data/Patches_MGI_256_7/3/CF_PACS05HE_08034-A03.11622B3/CF_PACS05HE_08034-A03.11622B3_32768_65536.png'),
  PosixPath('/home/DeepLearning/grade_classif/nbs/sample_data/Patches_MGI_256_7/3/CF_PACS05HE_06003-172

In [None]:
# export
def get_scan(
    folder: Union[Path, str],
    scan_name: str,
    include: Optional[Sequence[str]] = None,
    exclude: Optional[Sequence[str]] = None,
) -> Path:
    """
    Look for an entry named `scan_name` in `folder` and its subfolders.
    Arguments:
        folder (Path or str): folder where the search starts.
        scan_name (str): exact name of the entry to find.
        include (sequence of str): names of the direct subfolders to explore (if
            None explore all).
        exclude (sequence of str): names of the direct subfolders to skip.
    Returns:
        Path: path to the first match found, or None if there is no match.
    """
    root = Path(folder)
    pending = []
    for entry in root.iterdir():
        if entry.name == scan_name:
            return entry
        if entry.is_dir() and _check_valid(entry, include, exclude):
            pending.append(entry)
    # No match at this level: search the retained subfolders one after the
    # other (lazily) and stop at the first hit. `include`/`exclude` are not
    # forwarded, so they only apply to the direct subfolders of `folder`.
    sub_results = (get_scan(subfolder, scan_name) for subfolder in pending)
    return next((hit for hit in sub_results if hit is not None), None)

Returns a `Path` object pointing to the folder named `scan_name`, searched for recursively inside `folder`. Direct subfolders of `folder` can be included or excluded using `include` and `exclude`, respectively.

In [None]:
# Locate the folder of one scan inside the sample data.
get_scan(folder=data, scan_name="CF_PACS05HE_08034-A03.11622B3")

PosixPath('/home/DeepLearning/grade_classif/nbs/sample_data/Patches_MGI_256_7/3/CF_PACS05HE_08034-A03.11622B3')

In [None]:
# export
def split(
    scans: Sequence[str], grades: Sequence[str], valid_pct: float = 0.2
) -> List[str]:
    """
    Randomly assign each scan to the training or the validation set, grade by grade.
    Arguments:
        scans (sequence of str): scan names.
        grades (sequence of str): grade of each scan, in the same order as `scans`.
            Any set of grade values is supported.
        valid_pct (float): fraction of the scans of each grade sent to validation.
    Returns:
        list of str: "train" or "valid" for each scan, in the same order as `scans`.
    """
    # number of scans per grade (previously hard-coded to grades "1" and "3")
    n = {}
    for grade in grades:
        n[grade] = n.get(grade, 0) + 1
    # number of scans of each grade already assigned
    k = dict.fromkeys(n, 0)
    order = np.random.permutation(len(scans))
    splits = ["" for _ in scans]
    for o in order:
        grade = grades[o]
        # in random order, the first `valid_pct * n[grade]` scans of a grade go
        # to validation and the remaining ones to training
        subset = "train" if k[grade] >= valid_pct * n[grade] else "valid"
        k[grade] += 1
        splits[o] = subset
    return splits

Using a list of scan names `scans`, a list of grades `grades` (both as strings) and a ratio `valid_pct` for the validation set's size, randomly splits the dataset between training and validation sets. Returns a list of `'train'` and `'valid'` strings so that each scan is associated with a set.

In [None]:
# Toy dataset: 100 scans, the first 30 of grade "1" and the other 70 of grade "3".
scans = list(map(str, range(100)))
grades = ["1"] * 30 + ["3"] * 70
splits = split(scans, grades)

In [None]:
# One row per scan, with its grade and the subset it was assigned to.
df = pd.DataFrame(
    {
        "scan": scans,
        "grade": grades,
        "split": splits,
    }
)
df.iloc[:10]

Unnamed: 0,scan,grade,split
0,0,1,valid
1,1,1,train
2,2,1,valid
3,3,1,train
4,4,1,train
5,5,1,train
6,6,1,train
7,7,1,train
8,8,1,train
9,9,1,train


In [None]:
# number of scans assigned to the validation set
int((df["split"] == "valid").sum())

20

Proportions of grades 1 and 3 are preserved in each subset as you can see below.

In [None]:
# share of grade "3" scans among the validation scans
is_valid = df["split"] == "valid"
is_valid_grade3 = is_valid & (df["grade"] == "3")
len(df.loc[is_valid_grade3]) / len(df.loc[is_valid])

0.7

In [None]:
# export
def _remove_doubles(
    scans: Sequence[str], grades: Sequence[str]
) -> Tuple[List[str], List[str]]:
    scans_res = []
    grades_res = []
    for scan, grade in zip(scans, grades):
        if scan not in scans_res:
            scans_res.append(scan)
            grades_res.append(grade)
    return scans_res, grades_res

In [None]:
# export
def create_csv(
    csv_path: Path, data_path: Path, label_func: Optional[Callable[[Path], bool]] = None
) -> pd.DataFrame:
    """
    Build the scan/grade/split table of a dataset and save it as a csv file.
    Arguments:
        csv_path (Path): where the csv file is written.
        data_path (Path): folder searched recursively for ".png" patches.
        label_func (callable): computes the grade from the path to a patch. If
            None, use the third path component from the end.
    Returns:
        pd.DataFrame: one row per scan with columns "scan", "grade" and "split".
    """
    label_func = ifnone(label_func, lambda x: x.parts[-3])
    patches, patch_grades = get_items(data_path, label_func, extensions=[".png"])
    # a scan is named after the folder that directly contains its patches
    scan_names = [patch.parent.name for patch in patches]
    scans, grades = _remove_doubles(scan_names, patch_grades)
    df = pd.DataFrame(
        {"scan": scans, "grade": grades, "split": split(scans, grades)}
    )
    df.to_csv(csv_path, index=False)
    return df

Creates a csv file listing all scan names, each with its grade and the subset it belongs to. Scans are looked up in the folder `data_path` (`Path` object). Duplicate scans are removed. The file is then saved to `csv_path`. You can specify a `label_func` to control how the grade is extracted from the path to a patch. Returns the corresponding dataframe.

In [None]:
# Write the table next to the sample data folder and display it.
df = create_csv(csv_path=data.parent / "sample.csv", data_path=data)
df

Unnamed: 0,scan,grade,split
0,CF_PACS04rescan_07P0208,1,train
1,CF_PACS05HE_03026-04H2669,1,valid
2,CF_PACS04rescan_06P1306,1,train
3,CF_PACS04rescan_07P0201,1,train
4,CF_PACS05HE_06003-172.608I,3,train
5,CF_PACS05HE_08034-A03.11622B3,3,train
6,CF_PACS04rescan_06P242,3,valid
7,CF_PACS04rescan_06P0912,3,train


In [None]:
# hide
from nbdev.export import notebook2script

# Run nbdev's notebook export; it logs one "Converted ..." line per notebook.
notebook2script()

Converted 00_core.ipynb.
Converted 01_train.ipynb.
Converted 02_predict.ipynb.
Converted 10_data.read.ipynb.
Converted 11_data.loaders.ipynb.
Converted 12_data.dataset.ipynb.
Converted 13_data.utils.ipynb.
Converted 14_data.transforms.ipynb.
Converted 15_data.color.ipynb.
Converted 16_data.modules.ipynb.
Converted 20_models.plmodules.ipynb.
Converted 21_models.modules.ipynb.
Converted 22_models.utils.ipynb.
Converted 23_models.hooks.ipynb.
Converted 24_models.metrics.ipynb.
Converted 25_models.losses.ipynb.
Converted 80_params.defaults.ipynb.
Converted 81_params.parser.ipynb.
Converted 99_index.ipynb.
