In [1]:
import shutil
import tarfile
from os import remove, path, listdir
from re import search, sub

**Exploring the `tarfile` module**

In [27]:
# Open test3_dir.tar and print the TarInfo entry of every member it contains.
# The context manager closes the archive as soon as the listing is printed.
with tarfile.TarFile('../data/test_archives/test3_dir.tar') as zipper:
    print(zipper.getmembers())

[<TarInfo 'test2.tar' at 0x234783abf20>, <TarInfo 'test3.txt' at 0x234783c4048>]


In [5]:
# Print only those members of test3.tar that are regular files.
with tarfile.TarFile('../data/test_archives/test3.tar') as zipper:
    print(list(filter(lambda member: member.isfile(), zipper.getmembers())))

[<TarInfo 'test2b.tar' at 0x1d9eae78c00>, <TarInfo 'test3.txt' at 0x1d9eb0278e0>]


In [8]:
# Open the archive without a context manager so `tar` stays available to
# later cells (it is closed explicitly further down), then show its first member.
tar = tarfile.open('../data/test_archives/test3.tar')
tar.getmembers()[0]

<TarInfo 'test2b.tar' at 0x1d9eb027c00>

In [38]:
# Close the archive handle that was opened with tarfile.open() in an earlier cell.
tar.close()

**Attempting to extract nested archives recursively**

In [46]:
def extract_and_delete(tar_location, orig_path, delete=True, dest='../data/test_archives_contents/dir1'):
    """Recursively unpack a tar archive into one flat directory, then optionally delete it.

    Every regular file in the archive is written directly into ``dest`` as
    ``<last component of orig_path>__<archive name>__<member name>``, with any
    ``/`` in the member name replaced by ``__``. Members whose name ends in
    ``.tar`` are staged in ``dest`` under the same naming scheme, unpacked by a
    recursive call and then removed again (the recursive call always deletes
    its staged copy, whatever ``delete`` is for the outermost archive).

    Directory entries need no handling of their own: files stored inside a
    directory appear as separate members whose names already contain the
    directory, and the output is flattened anyway. Links, devices and other
    non-regular members are skipped.

    Parameters
    ----------
    tar_location : str
        Path of the archive to unpack.
    orig_path : str
        Logical path of the archive's parent; used as the prefix of the
        returned entries and (its last component) of the output file names.
        Pass ``''`` for a top-level archive.
    delete : bool
        If true, remove ``tar_location`` once it has been unpacked.
    dest : str
        Existing directory that receives the extracted files.

    Returns
    -------
    list of str
        One logical path per extracted file, of the form
        ``<orig_path>/<archive file name>/<member name>``; nested archives
        appear in the prefix without their extension.

    Raises
    ------
    tarfile.ReadError
        If ``tar_location`` (or a nested ``.tar`` member) is not a readable tar.
    OSError
        If the archive cannot be opened or ``dest`` cannot be written to.
    """
    extracted = []

    # Staged nested archives are named '<prefix>__<archive>.tar', so the
    # archive's own file name is whatever follows the last '__' of the base
    # name. Working on the base name keeps a '__' in a parent directory from
    # being mistaken for that separator.
    archive_file = path.basename(tar_location).split('__')[-1]
    tar_name = archive_file.split('.')[0]

    # tarfile.open() is the documented entry point and also reads compressed archives.
    with tarfile.open(tar_location) as tarry:
        for child in tarry.getmembers():
            if not child.isfile():
                # Only regular files carry data; a directory that merely
                # happens to be called '*.tar' must not be treated as an archive.
                continue

            name = child.name
            new_filename = '{}/{}__{}__{}'.format(dest, orig_path.split('/')[-1],
                                                tar_name, name.replace('/', '__'))
            with open(new_filename, "wb") as fout:
                fout.write(tarry.extractfile(child).read())

            if name.endswith('.tar'):  # lazily assume all tar files end in .tar
                extracted.extend(extract_and_delete(new_filename, '{}/{}'.format(orig_path, tar_name), dest=dest))
            else:
                extracted.append('{}/{}/{}'.format(orig_path, archive_file, name))

    # The archive is closed at this point, so it can be removed on every platform.
    if delete:
        remove(tar_location)

    return extracted

In [47]:
# Unpack test3.tar into the default destination and keep the source archive on disk.
extract_and_delete('../data/test_archives/test3.tar', orig_path='', delete=False)

['/test3/test2b/test2/test1.tar/test1.txt',
 '/test3/test2b/test2.tar/test2.txt',
 '/test3.tar/test3.txt']

In [25]:
def process_directory(dir_location, orig_path, delete=True, dest='../data/test_archives_contents/dir2'):
    """Flatten a directory tree into ``dest``, unpacking any tar archives found on the way.

    Entries of ``dir_location`` are visited in sorted order:

    * sub-directories are processed recursively;
    * files that ``tarfile.is_tarfile`` recognises are handed to
      ``extract_and_delete``;
    * every other regular file is copied byte-for-byte to
      ``<dest>/<last component of orig_path>__<directory name>__<file name>``.

    Parameters
    ----------
    dir_location : str
        Directory to process.
    orig_path : str
        Logical path of the directory's parent; used as the prefix of the
        returned entries and (its last component) of the output file names.
        Pass ``''`` for a top-level directory.
    delete : bool
        If true, remove ``dir_location`` and everything in it once processed.
        The flag is passed down unchanged, so ``delete=False`` leaves
        sub-directories and archives in the source tree untouched.
    dest : str
        Existing directory that receives the copied and extracted files.

    Returns
    -------
    list of str
        One logical path per produced file: ``<orig_path>/<directory name>/<file name>``
        for plain files, plus whatever the nested ``process_directory`` /
        ``extract_and_delete`` calls return.

    Raises
    ------
    OSError
        If ``dir_location`` cannot be listed or ``dest`` cannot be written to.
    """
    extracted = []
    dir_name = path.basename(dir_location.rstrip('/'))
    # Logical location of this directory; children use it as their orig_path,
    # the same convention extract_and_delete uses for nested archives.
    logical_path = '{}/{}'.format(orig_path, dir_name)

    # Sorted so that the returned listing is reproducible across platforms.
    for name in sorted(listdir(dir_location)):
        child_location = '{}/{}'.format(dir_location, name)

        if path.isdir(child_location):
            extracted.extend(process_directory(child_location, logical_path, delete=delete, dest=dest))
        elif path.isfile(child_location):
            # is_tarfile() is a module-level function; it returns False for
            # ordinary files instead of raising.
            if tarfile.is_tarfile(child_location):
                extracted.extend(extract_and_delete(child_location, logical_path,
                                                    delete=delete, dest=dest))
            else:
                new_filename = '{}/{}__{}__{}'.format(dest, orig_path.split('/')[-1],
                                                    dir_name, name)
                # Binary-safe copy; the source may be any kind of file.
                shutil.copyfile(child_location, new_filename)
                extracted.append('{}/{}'.format(logical_path, name))

    if delete:
        # os.remove() cannot delete a directory; rmtree also clears the plain
        # files that were copied rather than moved.
        shutil.rmtree(dir_location)

    return extracted

In [48]:
# Unpack test3_dir.tar into the dir2 output folder and keep the source archive on disk.
extract_and_delete(
    '../data/test_archives/test3_dir.tar',
    orig_path='',
    delete=False,
    dest='../data/test_archives_contents/dir2',
)

['/test3_dir/test2.tar/test 2/test2.txt',
 '/test3_dir/test2/test1.tar/test1.txt',
 '/test3_dir.tar/test3.txt']