In [1]:
# default_exp untarify

# Untarify

> Untarify provides the `untar()` method to untar zipped json files. An additional decorator `@powerup_untar`is provided which endows multiprocessign superpowers to the humble unzipping processes.

In [2]:
#hide
from nbdev.showdoc import *

In [3]:
#todo: write simple tests for this
#export
from pathlib import Path
import concurrent.futures, gzip

In [4]:
#export
def untar(source: Path, verbose=False) -> Path:
    """
    Desc: creates a file with the same extension as the 
    file contained within the zipped archive and 
    then writes the contents from the zipped file 
    into the new file created
    Args:
     source - Path object for the source from
              where the files needs to be fetched
     verbose - Let the method know if you want to
               see progress messages
    Returns: Path object for the uncompressed file
    """

    if verbose: print(f'source:{source}')
    dest_file = source.parent/source.stem
    if dest_file.exists(): dest_file.unlink()
    if not dest_file.exists():
        if verbose: print("extracting..")
        dest_file.touch()
        with gzip.open(source, "rb") as file:
            for line in file:
                dest_file.write_bytes(line)
                #dest_file.close()
        if verbose: print('File extracted successfully.')
    return dest_file

def powerup_untar(func):
    """
    Desc: Decorator which endows untar() with superpowers.
          Divides the untar process to multiple cores
    Args: Function to be decorated.
    Returns: Superpowers
    """
    def wrapper(*args):
        source = func(*args)
        with concurrent.futures.ProcessPoolExecutor() as executor:
            executor.map(untar, source) 
    return wrapper

In [5]:
show_doc(untar)
show_doc(powerup_untar)

<h4 id="untar" class="doc_header"><code>untar</code><a href="__main__.py#L2" class="source_link" style="float:right">[source]</a></h4>

> <code>untar</code>(**`source`**:`Path`, **`verbose`**=*`False`*)

Desc: creates a file with the same extension as the 
file contained within the zipped archive and 
then writes the contents from the zipped file 
into the new file created
Args:
 source - Path object for the source from
          where the files needs to be fetched
 verbose - Let the method know if you want to
           see progress messages
Returns: Path object for the uncompressed file

<h4 id="powerup_untar" class="doc_header"><code>powerup_untar</code><a href="__main__.py#L29" class="source_link" style="float:right">[source]</a></h4>

> <code>powerup_untar</code>(**`func`**)

Desc: Decorator which endows untar() with superpowers.
      Divides the untar process to multiple cores
Args: Function to be decorated.
Returns: Superpowers

### Example

The below is an example of using the `untar()` method

In [6]:
path = Path("./test/input/test")
outputPath = Path("./test/working")

In [7]:
files = [file for file in path.iterdir() if file.suffix == '.gz']
files

[PosixPath('test/input/test/coronavirus-tweet-id-2020-01-21-22.jsonl.gz'),
 PosixPath('test/input/test/coronavirus-tweet-id-2020-01-21-23.jsonl.gz')]

In [8]:
import shutil
test_files = files[:3]
test_target = [outputPath/test_file.stem for test_file in test_files]
for i in range(len(test_files)):
    shutil.copy(str(test_files[i]),str(outputPath/test_files[i].name))

In [9]:
for file in files:
    untar(file)

## Example
use `untar()` to power up the untaring processes. This uses multiple processers.

In [10]:
@powerup_untar
def find_source(outputPath, source):
    test_source = [outputPath/source.name for source in source]
    return test_source

In [11]:
find_source(outputPath, test_files)