In [1]:
import csv
import time
import os
from materials_io.utils.interface import get_available_parsers, get_parser
import multiprocessing as mp

def run_parser(file_parser):
    """Worker function for the multiprocessing pool: run one parse task.

    Parameters:
    file_parser (tuple): 2-tuple ``(parser_name, file)``. ``parser_name``
    is looked up with ``get_parser``; ``file`` is the file (or group of
    files) passed to that parser's ``parse`` method.

    Returns:
    dict or None: ``{file: {parser_name: metadata}}`` when parsing
    succeeds, or ``None`` when building that result raises an
    ``Exception``.
    """
    parser_name = file_parser[0]
    extractor = get_parser(parser_name)
    target = file_parser[1]

    try:
        return {target: {parser_name: extractor.parse(target)}}
    except Exception:
        # Best effort: any failure while parsing this input is reported to
        # the caller as None rather than raised.
        return None

def run_all_parsers_mp(directory, exclude_parsers=None, processes=mp.cpu_count()):
    """Run all available parsers over a directory tree with a process pool.

    Parameters:
    directory (str): Root directory; walked recursively with ``os.walk``.
    exclude_parsers (list): Names of parsers to skip. ``None`` (the
    default) runs every parser from ``get_available_parsers``.
    processes (int): Number of worker processes in the pool. The default
    is the CPU count evaluated once, when this function is defined.

    Returns:
    file_metadata (list): List of ``{file: {parser_name: metadata_dict}}``
    dictionaries as returned by ``run_parser``. Tasks for which
    ``run_parser`` returned ``None`` are left out.

    Raises:
    Any exception raised while iterating results (including
    ``KeyboardInterrupt``) is re-raised after the pool's workers have been
    terminated.
    """
    start_time = time.time()
    parsers = get_available_parsers()
    task_queue = []

    if exclude_parsers is not None:
        # Filter while keeping the original iteration order so the task
        # queue is built in the same order on every run.
        excluded = set(exclude_parsers)
        parsers = [name for name in parsers if name not in excluded]

    print("starting...")

    for parser in parsers:
        parser_obj = get_parser(parser)

        for root, dirs, files in os.walk(directory):
            # Generate the full paths
            dirs = [os.path.join(root, d) for d in dirs]
            files = [os.path.join(root, f) for f in files]

            for group in parser_obj.group(files, dirs):
                task_queue.append((parser, group))

    print("It took {} seconds to generate the queue. {} jobs in queue".format(time.time() - start_time,
                                                                              len(task_queue)))
    print("starting job processing...")

    file_metadata = []
    processed = 0  # counts every finished task, including failed parses
    pools = mp.Pool(processes)

    try:
        for metadata in pools.imap_unordered(run_parser, task_queue):
            processed += 1
            if metadata is not None:
                file_metadata.append(metadata)
            if (processed % 1000) == 0:
                print("{} out of {} files processed".format(processed, len(task_queue)))
                print("{} seconds have passed\n".format(time.time() - start_time))
        pools.close()
    except BaseException:
        # On any failure or interrupt, stop the workers right away instead
        # of leaving them running with a large queue still pending.
        pools.terminate()
        raise
    finally:
        # close() or terminate() has been called by now, so join() is valid.
        pools.join()

    print("Finished in {} seconds".format(time.time() - start_time))
    print("{} number of metadata".format(len(file_metadata)))
    return file_metadata

def matio_label_gen(directory, label_file=None, exclude_parsers=None):
    """Write a CSV of (path, size, parser name) for every parsed file.

    Metadata is generated with ``run_all_parsers_mp``; each entry it
    returns represents a successful parse. One CSV row is written per
    file path in each entry.

    Parameters:
    directory (str): Directory of files to write labels for.
    label_file (str): Name of the .csv to write labels to. Defaults to
    the last component of ``directory`` with ".csv" appended.
    exclude_parsers (list): List of parsers to not run.

    Raises:
    OSError: propagated from ``os.path.getsize`` or ``open``.
    """
    if label_file is None:
        # normpath drops a trailing separator, so "data/set/" still yields
        # "set.csv" rather than ".csv".
        label_file = os.path.basename(os.path.normpath(directory)) + ".csv"

    file_metadata = run_all_parsers_mp(directory, exclude_parsers=exclude_parsers)

    file_row = []
    for metadata in file_metadata:
        for file_path, parsed in metadata.items():
            # A dict key is either a single path string or a hashable group
            # of paths (e.g. a tuple). Emit one row per path in both cases.
            if isinstance(file_path, str):
                paths = [file_path]
            else:
                paths = list(file_path)

            for file_label in parsed:
                for path in paths:
                    file_row.append([path, os.path.getsize(path), file_label])

    with open(label_file, 'w', newline='') as f:
        csv_writer = csv.writer(f)
        csv_writer.writerow(["path", "size", "file_label"])
        csv_writer.writerows(file_row)

    print("Done writing labels")
    

In [2]:
# Generate labels for the NIST dataset, skipping the 'generic', 'noop'
# and 'csv' parsers.
matio_label_gen(
    '/Users/ryan/Documents/CS/CDAC/xtract_autoencoder/datasets/nist_dataset',
    exclude_parsers=['generic', 'noop', 'csv'],
)

  warn('The libmagic library is not installed. '


starting...
It took 20.25339102745056 seconds to generate the queue. 975964 jobs in queue
starting job processing...
1000 out of 975964 files processed
20.769722938537598 seconds have passed

2000 out of 975964 files processed
20.93887186050415 seconds have passed

3000 out of 975964 files processed
21.09473490715027 seconds have passed

4000 out of 975964 files processed
21.254810094833374 seconds have passed

5000 out of 975964 files processed
21.410362005233765 seconds have passed

6000 out of 975964 files processed
21.571824073791504 seconds have passed

7000 out of 975964 files processed
21.749927043914795 seconds have passed

8000 out of 975964 files processed
21.89958095550537 seconds have passed

9000 out of 975964 files processed
22.055094003677368 seconds have passed

10000 out of 975964 files processed
22.206357955932617 seconds have passed

11000 out of 975964 files processed
22.367774963378906 seconds have passed

12000 out of 975964 files processed
22.516807079315186 seco

Process ForkPoolWorker-7:
Process ForkPoolWorker-4:
Process ForkPoolWorker-8:
Process ForkPoolWorker-3:


57000 out of 975964 files processed
33.55553388595581 seconds have passed



Process ForkPoolWorker-5:
Process ForkPoolWorker-9:
Process ForkPoolWorker-11:
Process ForkPoolWorker-1:
Process ForkPoolWorker-10:
Process ForkPoolWorker-2:
Process ForkPoolWorker-12:
Process ForkPoolWorker-6:
Traceback (most recent call last):
  File "/Users/ryan/anaconda3/envs/xtract/lib/python3.7/multiprocessing/process.py", line 297, in _bootstrap
    self.run()
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
  File "/Users/ryan/anaconda3/envs/xtract/lib/python3.7/multiprocessing/process.py", line 99, in run
    self._target(*self._args, **self._kwargs)
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
  File "/Users/ryan/anaconda3/envs/xtract/lib/python3.7/multiprocessing/process.py", line 297, in _bootstrap
    self.run()
Traceback (most recent call last):
Traceback (most recent call last):
  File "/Users/ryan/anaconda3/envs/xtract/li

  File "<ipython-input-1-649162bf6cbb>", line 22, in run_parser
    metadata = {file: {file_parser[0]: parser.parse(file)}}
  File "/Users/ryan/Documents/CS/CDAC/xtract_autoencoder/MaterialsIO/materials_io/base.py", line 150, in parse
    return self._parse_file(group[0], context)
  File "/Users/ryan/Documents/CS/CDAC/xtract_autoencoder/MaterialsIO/materials_io/electron_microscopy.py", line 10, in _parse_file
    data = hs.load(file_path).metadata.as_dictionary()
  File "/Users/ryan/anaconda3/envs/xtract/lib/python3.7/site-packages/hyperspy/io.py", line 215, in <listcomp>
    if os.path.isfile(f)])
  File "/Users/ryan/Documents/CS/CDAC/xtract_autoencoder/MaterialsIO/materials_io/electron_microscopy.py", line 10, in _parse_file
    data = hs.load(file_path).metadata.as_dictionary()
  File "/Users/ryan/Documents/CS/CDAC/xtract_autoencoder/MaterialsIO/materials_io/electron_microscopy.py", line 10, in _parse_file
    data = hs.load(file_path).metadata.as_dictionary()
  File "/Users/ryan/Do

  File "/Users/ryan/anaconda3/envs/xtract/lib/python3.7/site-packages/imageio/core/functions.py", line 221, in imread
    reader = read(uri, format, "i", **kwargs)
  File "/Users/ryan/anaconda3/envs/xtract/lib/python3.7/site-packages/imageio/core/functions.py", line 221, in imread
    reader = read(uri, format, "i", **kwargs)
  File "/Users/ryan/anaconda3/envs/xtract/lib/python3.7/site-packages/imageio/core/functions.py", line 221, in imread
    reader = read(uri, format, "i", **kwargs)
  File "/Users/ryan/anaconda3/envs/xtract/lib/python3.7/site-packages/imageio/core/functions.py", line 221, in imread
    reader = read(uri, format, "i", **kwargs)
  File "/Users/ryan/anaconda3/envs/xtract/lib/python3.7/site-packages/hyperspy/io_plugins/image.py", line 85, in _read_data
    dc = imread(filename)
  File "/Users/ryan/anaconda3/envs/xtract/lib/python3.7/site-packages/imageio/core/functions.py", line 136, in get_reader
    format = formats.search_read_format(request)
  File "<ipython-input-

  File "/Users/ryan/anaconda3/envs/xtract/lib/python3.7/site-packages/hyperspy/io_plugins/image.py", line 65, in file_reader
    dc = _read_data(filename)
  File "/Users/ryan/anaconda3/envs/xtract/lib/python3.7/site-packages/imageio/plugins/simpleitk.py", line 99, in _can_read
    if has_module("itk.ImageIOBase") or has_module("SimpleITK"):
  File "/Users/ryan/anaconda3/envs/xtract/lib/python3.7/site-packages/hyperspy/io.py", line 325, in load_with_reader
    **kwds)
  File "/Users/ryan/anaconda3/envs/xtract/lib/python3.7/site-packages/hyperspy/io_plugins/image.py", line 85, in _read_data
    dc = imread(filename)
  File "/Users/ryan/anaconda3/envs/xtract/lib/python3.7/site-packages/hyperspy/io_plugins/image.py", line 85, in _read_data
    dc = imread(filename)
  File "/Users/ryan/anaconda3/envs/xtract/lib/python3.7/site-packages/hyperspy/io_plugins/image.py", line 85, in _read_data
    dc = imread(filename)
  File "/Users/ryan/anaconda3/envs/xtract/lib/python3.7/site-packages/hyperspy

  File "<frozen importlib._bootstrap>", line 906, in _find_spec
KeyboardInterrupt
  File "/Users/ryan/anaconda3/envs/xtract/lib/python3.7/site-packages/imageio/plugins/pillow.py", line 109, in _can_read
    if request.firstbytes and accept(request.firstbytes):
  File "/Users/ryan/anaconda3/envs/xtract/lib/python3.7/site-packages/imageio/core/request.py", line 499, in read_n_bytes
    extra_bytes = f.read(N - len(bb))
  File "/Users/ryan/anaconda3/envs/xtract/lib/python3.7/site-packages/imageio/plugins/_freeimage.py", line 443, in lib
    if self._lib is None:
  File "<frozen importlib._bootstrap_external>", line 1252, in _get_spec
  File "<frozen importlib._bootstrap_external>", line 1280, in find_spec
  File "/Users/ryan/anaconda3/envs/xtract/lib/python3.7/importlib/util.py", line 103, in find_spec
    return _find_spec(fullname, parent_path)
KeyboardInterrupt
  File "<frozen importlib._bootstrap_external>", line 1252, in _get_spec
KeyboardInterrupt
  File "<frozen importlib._bootstra

KeyboardInterrupt: 