In [1]:
#default_exp utils

In [2]:
#export
from pathlib import Path
from typing import Union,Dict,List,Tuple,Any,Optional,Collection,Iterable

import shutil
import re
import json
import os
import mimetypes

In [3]:
#export
# Type aliases for annotating the helpers defined in this notebook.
PathLike      = Union[str,Path]   # a location given either as a plain string or as a `Path`
ListOfPaths   = List[Path]        # a list of `Path` objects
ListOfStrings = List[str]         # a list of plain strings

### List Files 

In [4]:
#export
def ls(path:Path, list_hidden:bool=False) -> List[Path]:
    "Return the entries directly inside `path`; dot-prefixed names are dropped unless `list_hidden` is set"
    entries = list(path.iterdir())
    if not list_hidden:
        # Hidden means "name starts with a dot", for files and directories alike.
        entries = [entry for entry in entries if not entry.name.startswith('.')]
    return entries

def listdirs(path:Path, sort=True) -> List[Path]:
    "Return the directories among `path.ls()`, in sorted order unless `sort` is falsy"
    # Goes through the `ls` method patched onto `Path`, so hidden entries are already filtered out.
    subdirs = [entry for entry in path.ls() if entry.is_dir()]
    return sorted(subdirs) if sort else subdirs
    
# Monkey-patch `pathlib.Path` so both helpers can be called as methods
# (`path.ls()`, `path.listdirs()`); `listdirs` itself depends on `Path.ls` being set.
Path.ls = ls
Path.listdirs = listdirs

In [5]:
# Demo: list the visible entries of a directory through the patched `Path.ls` method.
# NOTE(review): absolute, machine-specific path — this cell will not reproduce elsewhere.
path = Path("/Users/rahulsomani/Desktop/test-images/")
path.ls()

[PosixPath('/Users/rahulsomani/Desktop/test-images/2.jpg'),
 PosixPath('/Users/rahulsomani/Desktop/test-images/3.jpg'),
 PosixPath('/Users/rahulsomani/Desktop/test-images/1.jpg')]

In [6]:
path.parent.listdirs() # list all visible directories in Desktop, sorted

[PosixPath('/Users/rahulsomani/Desktop/DATA-SCRAPE'),
 PosixPath('/Users/rahulsomani/Desktop/DATASETS'),
 PosixPath('/Users/rahulsomani/Desktop/Synopsis'),
 PosixPath('/Users/rahulsomani/Desktop/cinemanet-tests'),
 PosixPath('/Users/rahulsomani/Desktop/data-heist'),
 PosixPath('/Users/rahulsomani/Desktop/isabella-koshy.github.io'),
 PosixPath('/Users/rahulsomani/Desktop/label-studio'),
 PosixPath('/Users/rahulsomani/Desktop/location-workspace'),
 PosixPath('/Users/rahulsomani/Desktop/nsfw-workspace'),
 PosixPath('/Users/rahulsomani/Desktop/shotdeck-cropped'),
 PosixPath('/Users/rahulsomani/Desktop/test-images')]

### Get Files From Directory

In [7]:
#export
def get_files(path:PathLike,
              extensions:Optional[Collection[str]]=None,
              recurse:bool=False,
              exclude:Optional[Collection[str]]=None,
              include:Optional[Collection[str]]=None,
              presort:bool=False,
              followlinks:bool=False) -> List[Path]:
    """
    Return list of files in `path` that have a suffix in `extensions`; optionally `recurse`.
    Use `include` and `exclude` for including/excluding folder names, `presort` to sort.

    - `extensions`: suffixes handed to `_get_files` for filtering; `None` keeps every file.
    - `recurse`: walk the tree with `os.walk`; otherwise only files directly inside `path` are considered.
    - `include` / `exclude`: only consulted when recursing, and only for the folders directly
      under `path`. When both are given, `include` wins and `exclude` is ignored.
    - `presort`: sort the result on its string form with path separators normalised
      (see `_path_to_same_str`).
    - `followlinks`: forwarded to `os.walk`; has no effect when `recurse` is False.

    Folders whose name starts with '.' are not descended into, except at the top level when
    `include` or `exclude` is given: that filter then replaces the hidden-folder check.
    """
    if recurse:
        res = []
        for i,(p,d,f) in enumerate(os.walk(path, followlinks=followlinks)):
            # Prune `d` in place: `os.walk` only descends into the names left in the list.
            if include is not None and i==0:   d[:] = [o for o in d if o in include]
            elif exclude is not None and i==0: d[:] = [o for o in d if o not in exclude]
            else:                              d[:] = [o for o in d if not o.startswith('.')]
            res += _get_files(path, p, f, extensions)
    else:
        # Context manager releases the directory handle even if iteration raises.
        with os.scandir(path) as entries:
            f = [o.name for o in entries if o.is_file()]
        res = _get_files(path, path, f, extensions)
    if presort: res = sorted(res, key=_path_to_same_str)
    return res

def _path_to_same_str(p_fn:PathLike) -> str:
    "path -> str, but same on nt+posix, for alpha-sort only"
    s_fn = str(p_fn)
    s_fn = s_fn.replace('\\','.')
    s_fn = s_fn.replace('/','.')
    return s_fn

def _get_files(parent, p, f, extensions) -> list:
    p = Path(p)#.relative_to(parent)
    if isinstance(extensions,str): extensions = [extensions]
    low_extensions = [e.lower() for e in extensions] if extensions is not None else None
    res = [p/o for o in f if not o.startswith('.')
           and (extensions is None or f'.{o.split(".")[-1].lower()}' in low_extensions)]
    return res

In [8]:
# Non-recursive, no extension filter: only visible files directly inside the folder are returned.
get_files("/Users/rahulsomani/Desktop")

[PosixPath('/Users/rahulsomani/Desktop/2020_10_13__creatures.json'),
 PosixPath('/Users/rahulsomani/Desktop/2020_10_13__tied_up.json'),
 PosixPath('/Users/rahulsomani/Desktop/2020_10_13__explosions_accidents.json')]

In [9]:
# Recursive search restricted to ".html" files; hidden folders are skipped while walking.
get_files("/Users/rahulsomani/Desktop", recurse=True, extensions=[".html"])

[PosixPath('/Users/rahulsomani/Desktop/label-studio/location-config-pretty.html'),
 PosixPath('/Users/rahulsomani/Desktop/label-studio/nsfw-config.html'),
 PosixPath('/Users/rahulsomani/Desktop/isabella-koshy.github.io/index.html')]

In [10]:
#export
# Suffixes (with leading dot) that the stdlib `mimetypes` table maps to an image/* or video/* type.
image_extensions = {ext for ext, mime in mimetypes.types_map.items() if mime.startswith('image/')}
# '.mkv' is added by hand on top of whatever the table provides.
video_extensions = {ext for ext, mime in mimetypes.types_map.items() if mime.startswith('video/')} | {'.mkv'}

def get_image_files(path:PathLike,
                    include:Optional[ListOfStrings]=None,
                    exclude:Optional[ListOfStrings]=None,
                    recurse:bool=True) -> List[Path]:
    "Files under `path` whose suffix is in `image_extensions`; thin wrapper over `get_files` that recurses by default"
    return get_files(path,
                     extensions=image_extensions,
                     recurse=recurse,
                     exclude=exclude,
                     include=include)

def get_video_files(path:PathLike,
                    include:Optional[ListOfStrings]=None,
                    exclude:Optional[ListOfStrings]=None,
                    recurse:bool=True) -> List[Path]:
    "Files under `path` whose suffix is in `video_extensions`; thin wrapper over `get_files` that recurses by default"
    return get_files(path,
                     extensions=video_extensions,
                     recurse=recurse,
                     exclude=exclude,
                     include=include)

In [11]:
# Inspect the video suffixes taken from `mimetypes`, plus the manually added '.mkv'.
video_extensions

{'.avi',
 '.m1v',
 '.mkv',
 '.mov',
 '.movie',
 '.mp4',
 '.mpa',
 '.mpe',
 '.mpeg',
 '.mpg',
 '.qt',
 '.webm'}

In [12]:
# Inspect the image suffixes taken from `mimetypes`.
image_extensions

{'.bmp',
 '.gif',
 '.ico',
 '.ief',
 '.jpe',
 '.jpeg',
 '.jpg',
 '.pbm',
 '.pgm',
 '.png',
 '.pnm',
 '.ppm',
 '.ras',
 '.rgb',
 '.svg',
 '.tif',
 '.tiff',
 '.xbm',
 '.xpm',
 '.xwd'}

### Flatten List, List of Lists

In [13]:
#export
def flatten(x:Iterable[Any]) -> List[Any]:
    """
    Flatten `x` by exactly one level and return the result as a new list.

    Items of `x` that are lists or tuples are unpacked into the result; every other item is
    appended unchanged. Deeper nesting is left as is, and because `x` is simply iterated,
    a string argument is split into its characters.
    """
    flattened_list = []
    for item in x:
        if isinstance(item, (tuple,list)): flattened_list.extend(item)
        else:                              flattened_list.append(item)
    return flattened_list

In [14]:
# A string is iterated character by character, so a single character yields a one-item list.
flatten("a")

['a']

In [15]:
# A flat tuple has nothing to unpack; its items come back as a list.
flatten(("a", "b"))

['a', 'b']

In [16]:
# A nested list is unpacked one level into the result.
sub_list_items = ["sub_list_item1", "sub_list_item2"]
flatten(['list_item', sub_list_items])

['list_item', 'sub_list_item1', 'sub_list_item2']

In [17]:
#export
def mkdir(x:Union[str,Path], parents:bool=False) -> None:
    """
    Create directory `x`; do nothing if it already exists.

    `x` may be a `Path` or a plain string. With `parents=True` missing intermediate
    directories are created as well; with the default a missing parent raises
    `FileNotFoundError`.
    """
    Path(x).mkdir(parents=parents, exist_ok=True)

def uniqueify(x:Collection) -> Collection:
    "Drop duplicate items from `x` and return the remaining ones as a sorted list"
    distinct = set(x)
    return sorted(distinct)

In [18]:
# Duplicates collapse to a single entry.
uniqueify(['a', 'a', 'a'])

['a']

In [19]:
# Distinct items are all kept and come back in sorted order.
uniqueify(["a", "aa"])

['a', 'aa']

### Clean Filename

In [20]:
#export
def clean_filename(fname:str, truncate:bool=True, prefix:Optional[str]=None, to_lower:bool=True) -> str:
    """
    Clean a string to contain only ASCII letters, digits, and/or underscores.

    Only the stem of `fname` is used: any directory part and the final suffix are discarded
    (via `Path(fname).stem`). Every other character (whitespace, punctuation, superscripts,
    fractions, non-ASCII letters) is replaced by '_', then leading underscores are stripped.
    Trailing underscores are kept.

    - `truncate`: cut the cleaned stem to at most 200 characters. This happens before the
      prefix is added, so the returned string can be longer than 200.
    - `prefix`: if truthy, prepended as "<prefix>_".
    - `to_lower`: lower-case the final string, prefix included.
    """
    stem = Path(fname).stem
    # One pass is enough: everything outside [A-Za-z0-9_] becomes an underscore.
    fname_new = re.sub(r'[^A-Za-z0-9_]', '_', stem)
    # Only word characters remain, so the leading "special characters" are all underscores.
    fname_new = fname_new.lstrip('_')

    if truncate: fname_new = fname_new[:200]
    if prefix:   fname_new = f"{prefix}_{fname_new}"
    if to_lower: fname_new = fname_new.lower()

    return fname_new

In [21]:
# Defaults: non-alphanumeric characters become underscores and the result is lower-cased.
clean_filename("this is A HoRRIBLE FILENEME !!!! --+")

'this_is_a_horrible_fileneme_________'

In [22]:
# Same input, keeping the original casing.
clean_filename("this is A HoRRIBLE FILENEME !!!! --+", to_lower=False)

'this_is_A_HoRRIBLE_FILENEME_________'

In [23]:
# The prefix is joined with an underscore and lower-cased too, because lowering happens last.
clean_filename("this is A HoRRIBLE FILENEME !!!! --+", to_lower=True, prefix="HMM")

'hmm_this_is_a_horrible_fileneme_________'

### Export

In [26]:
# Export this notebook to its Python module with nbdev.
# Import the one name that is used instead of `*`, so nothing else leaks into the namespace.
from nbdev.export import notebook2script
notebook2script('utils.ipynb')

Converted utils.ipynb.
