## No, I can't

In [5]:
from __future__ import print_function
from sys import getsizeof, stderr
from itertools import chain
from collections import deque
try:
    from reprlib import repr
except ImportError:
    pass

def total_size(o, handlers=None, verbose=False):
    """Return the approximate memory footprint of an object and all of its contents.

    Automatically finds the contents of the following builtin containers and
    their subclasses:  tuple, list, deque, dict, set and frozenset.
    To search other containers, add handlers to iterate over their contents:

        handlers = {SomeContainerClass: iter,
                    OtherContainerClass: OtherContainerClass.get_elements}

    Args:
        o: The object to measure.
        handlers: Optional mapping of ``type -> callable``; the callable
            receives an instance and returns an iterable of the objects it
            contains.  User handlers are consulted before the builtin ones,
            so a handler registered for a subclass of a builtin container
            (or for the builtin type itself) overrides the default traversal.
        verbose: When true, print the size, type and repr of every object
            visited to stderr.

    Returns:
        The sum of ``sys.getsizeof`` over ``o`` and every distinct object
        reachable through the handlers.  Objects are identified by ``id`` and
        counted once, however many times they are referenced.

    Note:
        Traversal is recursive, so extremely deep nesting can hit the
        interpreter's recursion limit.
    """
    def dict_handler(d):
        # A dict contributes both its keys and its values.
        return chain.from_iterable(d.items())

    builtin_handlers = {tuple: iter,
                        list: iter,
                        deque: iter,
                        dict: dict_handler,
                        set: iter,
                        frozenset: iter,
                       }
    user_handlers = dict(handlers) if handlers else {}

    # Dispatch order matters because matching uses isinstance(): user handlers
    # go first so that a handler for e.g. a list subclass is not shadowed by
    # the builtin ``list`` entry.  Builtins overridden by the user are dropped.
    dispatch = list(user_handlers.items())
    dispatch.extend((typ, handler)
                    for typ, handler in builtin_handlers.items()
                    if typ not in user_handlers)

    seen = set()                      # ids of objects already counted
    default_size = getsizeof(0)       # estimate for objects without __sizeof__

    def sizeof(obj):
        if id(obj) in seen:           # do not double count the same object
            return 0
        seen.add(id(obj))
        s = getsizeof(obj, default_size)

        if verbose:
            print(s, type(obj), repr(obj), file=stderr)

        for typ, handler in dispatch:
            if isinstance(obj, typ):
                # Only the first matching handler is applied.
                s += sum(map(sizeof, handler(obj)))
                break
        return s

    return sizeof(o)


##### Example call #####

if __name__ == '__main__':
    # Small mixed container: int values, one nested list and one string.
    d = {'a': 1, 'b': 2, 'c': 3, 'd': [4, 5, 6, 7], 'e': 'a string of chars'}
    # verbose=True also lists every visited object on stderr.
    footprint = total_size(d, verbose=True)
    print(footprint)

848


240 <class 'dict'> {'a': 1, 'b': 2, 'c': 3, 'd': [4, 5, 6, 7], ...}
50 <class 'str'> 'a'
28 <class 'int'> 1
50 <class 'str'> 'b'
28 <class 'int'> 2
50 <class 'str'> 'c'
28 <class 'int'> 3
50 <class 'str'> 'd'
96 <class 'list'> [4, 5, 6, 7]
28 <class 'int'> 4
28 <class 'int'> 5
28 <class 'int'> 6
28 <class 'int'> 7
50 <class 'str'> 'e'
66 <class 'str'> 'a string of chars'


In [6]:
import time, pickle, glob, os, sys, resource
from matching_functions import *

# Folder that is searched for '*.pickle' files.
EEBO_SHINGLE_FOLDER = '/home/spenteco/0/eebo_shingled/'

# Only the first 100 pickle files (in sorted path order) are measured.
pickle_paths = sorted(glob.glob(EEBO_SHINGLE_FOLDER + '*.pickle'))[:100]

all_shingles = []

for path_to_eebo_pickle in pickle_paths:
    # load_pickle_file is presumably provided by the matching_functions
    # star import -- confirm.
    eebo_data = load_pickle_file(path_to_eebo_pickle)
    all_shingles.append(eebo_data['shingles'])

# Number of files loaded, then the approximate in-memory size of their
# 'shingles' entries as reported by total_size.
print(len(all_shingles))
print(total_size(all_shingles))

100
304992260


In [8]:
import time, pickle, glob, os, sys, resource
from matching_functions import *

# Folder that is searched for '*.pickle' files.
EEBO_SHINGLE_FOLDER = '/home/spenteco/0/eebo_shingled/'

all_shingles = []
shingle_lookup = {}     # shingle key -> integer id, shared across all files
shingle_n = 0           # next unused integer id

# Only the first 100 pickle files (in sorted path order) are measured.
pickle_paths = sorted(glob.glob(EEBO_SHINGLE_FOLDER + '*.pickle'))[:100]

for path_to_eebo_pickle in pickle_paths:
    eebo_data = load_pickle_file(path_to_eebo_pickle)

    # Same mapping as eebo_data['shingles'], re-keyed by integer id.
    translated_shingles = {}

    for shingle, positions in eebo_data['shingles'].items():
        shingle_id = shingle_lookup.get(shingle)
        if shingle_id is None:
            # First time this shingle is seen: hand out the next id.
            shingle_id = shingle_n
            shingle_lookup[shingle] = shingle_id
            shingle_n += 1
        translated_shingles[shingle_id] = positions

    all_shingles.append(translated_shingles)

# Number of files loaded, then the approximate in-memory size of the
# re-keyed data (the lookup table itself is not included in this figure).
print(len(all_shingles))
print(total_size(all_shingles))

100
231113112


In [9]:
!df

Filesystem           1K-blocks       Used  Available Use% Mounted on
udev                   8118268          0    8118268   0% /dev
tmpfs                  1628052     165224    1462828  11% /run
/dev/sda3            207356528  115993408   80806912  59% /
tmpfs                  8140244      61292    8078952   1% /dev/shm
tmpfs                     5120          4       5116   1% /run/lock
tmpfs                  8140244          0    8140244   0% /sys/fs/cgroup
/dev/sda1               763904      35576     728328   5% /boot/efi
/dev/sdb             961303584  625734272  286714800  69% /data
//172.20.93.145/hdw 5799562832 4412487312 1387075520  77% /mnt/hdw
tmpfs                  1628052        152    1627900   1% /run/user/1001


In [11]:
import time, pickle, glob, os, sys, resource
from matching_functions import *

# Folder that is searched for '*.pickle' files.
EEBO_SHINGLE_FOLDER = '/home/spenteco/0/eebo_shingled/'

all_shingles = []
shingle_lookup = {}
shingle_n = 0

# Only the first pickle file (in sorted path order) is dumped.
first_paths = sorted(glob.glob(EEBO_SHINGLE_FOLDER + '*.pickle'))[:1]

for path_to_eebo_pickle in first_paths:

    # The id is the file name up to its first '.'.
    file_name = path_to_eebo_pickle.split('/')[-1]
    tcp_id = file_name.split('.')[0]

    eebo_data = load_pickle_file(path_to_eebo_pickle)

    # One tab-separated line per shingle: key, file id, stored value.
    for shingle, positions in eebo_data['shingles'].items():
        print('\t'.join((str(shingle), tcp_id, str(positions))))

('passionate', 'morris', 'london')	A00001	[[0, 2]]
('morris', 'london', 'imprint')	A00001	[[1, 3]]
('london', 'imprint', 'richard')	A00001	[[2, 4]]
('imprint', 'richard', 'jones')	A00001	[[3, 5]]
('richard', 'jones', 'gentlewoman')	A00001	[[4, 6]]
('jones', 'gentlewoman', 'england')	A00001	[[5, 7]]
('gentlewoman', 'england', 'beautiful')	A00001	[[6, 8]]
('england', 'beautiful', 'damsel')	A00001	[[7, 9]]
('beautiful', 'damsel', 'bold')	A00001	[[8, 10]]
('damsel', 'bold', 'presume')	A00001	[[9, 11]]
('bold', 'presume', 'wont')	A00001	[[10, 12]]
('presume', 'wont', 'favour')	A00001	[[11, 13]]
('wont', 'favour', 'thereby')	A00001	[[12, 14]]
('favour', 'thereby', 'lead')	A00001	[[13, 15]]
('thereby', 'lead', 'performance')	A00001	[[14, 16]]
('lead', 'performance', 'vow')	A00001	[[15, 17]]
('performance', 'vow', 'duty')	A00001	[[16, 18]]
('vow', 'duty', 'kind')	A00001	[[17, 19]]
('duty', 'kind', 'zeal')	A00001	[[18, 20]]
('kind', 'zeal', 'bind')	A00001	[[19, 21]]
('zeal', 'bind', 'offer')	A0

('use', 'courteous', 'answer')	A00001	[[3515, 3517]]
('courteous', 'answer', 'order')	A00001	[[3516, 3518]]
('answer', 'order', 'hope')	A00001	[[3517, 3519]]
('order', 'hope', 'gentle')	A00001	[[3518, 3520]]
('hope', 'gentle', 'sir')	A00001	[[3519, 3521]]
('gentle', 'sir', 'quoth')	A00001	[[3520, 3522]]
('sir', 'quoth', 'give')	A00001	[[3521, 3523]]
('quoth', 'give', 'leave')	A00001	[[3522, 3524]]
('give', 'leave', 'answer')	A00001	[[3523, 3525]]
('leave', 'answer', 'speedy')	A00001	[[3524, 3526]]
('answer', 'speedy', 'blunt')	A00001	[[3525, 3527]]
('speedy', 'blunt', 'ask')	A00001	[[3526, 3528]]
('blunt', 'ask', 'question')	A00001	[[3527, 3529]]
('ask', 'question', 'heart')	A00001	[[3528, 3530]]
('question', 'heart', 'reply')	A00001	[[3529, 3531]]
('heart', 'reply', 'desire')	A00001	[[3530, 3532]]
('reply', 'desire', 'assure')	A00001	[[3531, 3533]]
('desire', 'assure', 'thus')	A00001	[[3532, 3534]]
('assure', 'thus', 'much')	A00001	[[3533, 3535]]
('thus', 'much', 'say')	A00001	[[3534,

('certainty', 'everlasting', 'happiness')	A00001	[[7528, 7530]]
('everlasting', 'happiness', 'assurance')	A00001	[[7529, 7531]]
('happiness', 'assurance', 'continual')	A00001	[[7530, 7532]]
('assurance', 'continual', 'earthly')	A00001	[[7531, 7533]]
('continual', 'earthly', 'pleasure')	A00001	[[7532, 7534]]
('earthly', 'pleasure', 'come')	A00001	[[7533, 7535]]
('pleasure', 'come', 'many')	A00001	[[7534, 7536]]
('come', 'many', 'fair')	A00001	[[7535, 7537]]
('many', 'fair', 'horse')	A00001	[[7536, 7538]]
('fair', 'horse', 'smith')	A00001	[[7537, 7539]]
('horse', 'smith', 'field')	A00001	[[7538, 7540]]
('smith', 'field', 'twelve')	A00001	[[7539, 7541]]
('field', 'twelve', 'month')	A00001	[[7540, 7542]]
('twelve', 'month', 'make')	A00001	[[7541, 7543]]
('month', 'make', 'many')	A00001	[[7542, 7544]]
('make', 'many', 'speed')	A00001	[[7543, 7545]]
('many', 'speed', 'already')	A00001	[[7544, 7546]]
('speed', 'already', 'wish')	A00001	[[7545, 7547]]
('already', 'wish', 'unprovide')	A00001	[[