In [None]:
# default_exp data.read

# Reading data 
> Defines utility functions for reading and manipulating data.

In [None]:
# hide
from nbdev.showdoc import *
from grade_classif.params.defaults import *

In [None]:
# export
from grade_classif.core import ifnone
import pandas as pd
import numpy as np

In [None]:
# export
def _check_include(obj, include):
    return include is None or obj.name in include

In [None]:
# export
def _check_exclude(obj, exclude):
    return exclude is None or obj.name not in exclude

In [None]:
# export
def _check_valid(obj, include, exclude):
    """Tell whether `obj` may be visited.

    An object is valid when it passes the `include` whitelist, passes the
    `exclude` blacklist and its name does not start with a dot.
    """
    if not _check_include(obj, include):
        return False
    if not _check_exclude(obj, exclude):
        return False
    # Dot-prefixed entries are always rejected.
    return not obj.name.startswith('.')

In [None]:
# export
def get_items(folder, label_func, recurse=True, extensions=None, include=None, exclude=None, filterfunc=None):
    """Collect the files found in `folder` together with their labels.

    Parameters
    ----------
    folder : object exposing `iterdir()` (e.g. `pathlib.Path`)
        Directory to scan.
    label_func : callable
        Called on every accepted file; its result is stored as the label.
    recurse : bool
        When true, valid subfolders are scanned as well.
    extensions : container of str, optional
        Accepted file suffixes (compared with `obj.suffix`); `None` accepts
        every suffix.
    include, exclude : container of str, optional
        Names of direct subfolders of `folder` to keep / to skip. They are
        not forwarded to deeper levels.
    filterfunc : callable, optional
        Predicate called on each file; only files for which it returns a
        truthy value are kept. `None` keeps every file.

    Returns
    -------
    (items, labels) : tuple of two lists of equal length.
    """
    items = []
    labels = []
    for obj in folder.iterdir():
        if obj.is_file():
            # The two filters are evaluated separately on purpose: writing
            # `extensions is None or suffix_ok and filterfunc(obj)` binds as
            # `A or (B and C)` and would skip `filterfunc` whenever no
            # extension list is given.
            extension_ok = extensions is None or obj.suffix in extensions
            if extension_ok and (filterfunc is None or filterfunc(obj)):
                items.append(obj)
                labels.append(label_func(obj))
        elif recurse and _check_valid(obj, include, exclude):
            # include/exclude only concern the direct subfolders of the
            # top-level folder, so the recursive call does not receive them.
            items_r, labels_r = get_items(obj, label_func, extensions=extensions, filterfunc=filterfunc)
            items.extend(items_r)
            labels.extend(labels_r)
    return items, labels

Loads all items and labels in `folder`. Items are stored as `Path` objects. Labels are computed for each item using `label_func`. By default, the search will happen recursively in all subfolders. To disable this behaviour, use `recurse=False`. You can also specify a list of `extensions` to restrict the accepted files, as well as `include` and `exclude` folders (as `str`, these are direct subfolders of `folder`). `filterfunc` can be used to only accept objects for which it returns `True`. 

In [None]:
# export
def get_scan(folder, scan_name, include=None, exclude=None):
    """Find the entry named `scan_name` inside `folder` or its subfolders.

    Parameters
    ----------
    folder : object exposing `iterdir()` (e.g. `pathlib.Path`)
        Directory where the search starts.
    scan_name : str
        Name of the entry to look for.
    include, exclude : container of str, optional
        Names of direct subfolders of `folder` to keep / to skip. They apply
        to the first level only; deeper levels are searched without them.

    Returns
    -------
    The first matching entry, or `None` when nothing matches. Entries of
    the current level are checked before any subfolder is searched.
    """
    dirs = []
    for item in folder.iterdir():
        # An entry rejected by include/exclude (or dot-prefixed) is neither a
        # candidate nor searched; previously the filters were only tested on
        # the entry that already carried the scan name, so excluded folders
        # were still traversed.
        if not _check_valid(item, include, exclude):
            continue
        if item.name == scan_name:
            return item
        if item.is_dir():
            dirs.append(item)
    for item in dirs:
        # include/exclude are deliberately not forwarded to deeper levels.
        obj = get_scan(item, scan_name)
        if obj is not None:
            return obj
    return None

Returns a `Path` object pointing to the folder named `scan_name` inside a specific `folder` (searched recursively), or `None` if no such folder is found. Direct subfolders can be included or excluded using `include` and `exclude` respectively.

In [None]:
# Example: look up the folder of one scan under DATA, passing include=['1', '3'].
# NOTE(review): DATA is presumably provided by the star import from
# grade_classif.params.defaults - confirm.
get_scan(DATA, 'CF_PACS04rescan_10F0051', include=['1', '3'])

PosixPath('/work/stages/schwob/Patches_240/Patches_MGI_240_2/3/CF_PACS04rescan_10F0051')

In [None]:
# export
def split(scans, grades, valid_pct=0.2):
    """Randomly assign every scan to the `'train'` or `'valid'` subset.

    Parameters
    ----------
    scans : sequence
        Scan names; only its length is used.
    grades : sequence
        Grade of each scan, aligned with `scans`. Any hashable label is
        accepted.
    valid_pct : float
        Fraction of each grade that goes to the validation subset.

    Returns
    -------
    list of str
        `'train'` or `'valid'` for each scan, in the same order as `scans`.
        For every grade, `ceil(valid_pct * count)` scans are marked
        `'valid'`, so grade proportions are kept in both subsets.
    """
    # Number of scans per grade.
    n = {}
    for grade in grades:
        n[grade] = n.get(grade, 0) + 1
    # Validation quota per grade. The small tolerance keeps floating point
    # noise (e.g. 0.2 * 15 == 3.0000000000000004) from adding an extra scan.
    n_valid = {grade: int(np.ceil(valid_pct * count - 1e-9)) for grade, count in n.items()}
    # Scans of each grade already assigned.
    k = dict.fromkeys(n, 0)
    splits = ['' for _ in scans]
    # Visiting indices in random order makes the assignment random while the
    # returned list stays aligned with the input order.
    for o in np.random.permutation(len(scans)):
        grade = grades[o]
        splits[o] = 'valid' if k[grade] < n_valid[grade] else 'train'
        k[grade] += 1
    return splits

Using a list of scan names `scans`, a list of grades `grades` (both as strings) and a ratio for the validation set's size `valid_pct`, splits the dataset between training and validation sets. Returns a list of `'train'` and `'valid'` strings so that each scan is associated to a set. Scans are visited in a random order inside the function (the input lists themselves are not modified) so that the splitting is random. Proportions of grades 1 and 3 are preserved in each subset.

In [None]:
# export
def _remove_doubles(scans, grades):
    scans_res = []
    grades_res = []
    for scan, grade in zip(scans, grades):
        if scan not in scans_res:
            scans_res.append(scan)
            grades_res.append(grade)
    return scans_res, grades_res

In [None]:
# export
def create_csv(csv_path, data_path, label_func=None):
    """Build the scan table and write it to `csv_path`.

    Files are gathered from `data_path` (only the direct subfolders `'1'`
    and `'3'` are visited), each one is labelled with `label_func` (by
    default the third-to-last component of its path), and the name of the
    parent folder is used as scan name. Repeated scans are dropped, every
    remaining scan is assigned to a subset, and the resulting table with
    columns `scan`, `grade` and `split` is saved without index and returned.
    """
    grade_of = ifnone(label_func, lambda x: x.parts[-3])
    patches, patch_grades = get_items(data_path, grade_of, include=['1', '3'])
    # One entry per file at this point: reduce to one entry per scan folder.
    scan_names = [patch.parent.name for patch in patches]
    scan_names, scan_grades = _remove_doubles(scan_names, patch_grades)
    table = pd.DataFrame({
        'scan': scan_names,
        'grade': scan_grades,
        'split': split(scan_names, scan_grades),
    })
    table.to_csv(csv_path, index=False)
    return table

Creates a csv file that contains all scan names, with the associated grade and the subset they are in. Scans are searched for in the folder `data_path` (a `Path` object). Duplicates are removed. The file is then stored at `csv_path`. You can specify a `label_func` to define how the grade is extracted from the path to a patch. Returns the corresponding dataframe.

In [None]:
#hide
from nbdev.export import notebook2script
# Run nbdev's notebook-to-script conversion; it reports one "Converted ..."
# line per notebook in the cell output.
notebook2script()

Converted 00_core.ipynb.
Converted 01_train.ipynb.
Converted 02_predict.ipynb.
Converted 10_data_read.ipynb.
Converted 11_data_loaders.ipynb.
Converted 12_data_dataset.ipynb.
Converted 13_data_utils.ipynb.
Converted 14_data_transforms.ipynb.
Converted 20_models_plmodules.ipynb.
Converted 21_models_modules.ipynb.
Converted 22_models_utils.ipynb.
Converted 23_models_hooks.ipynb.
Converted 24_models_metrics.ipynb.
Converted 80_params_defaults.ipynb.
Converted 81_params_parser.ipynb.
Converted 99_index.ipynb.
