Based on Raymond Hettinger's talk: https://www.youtube.com/watch?v=OSGv2VnC0go

In [107]:
import os

In [108]:
# set() drops the duplicate characters, sorted() returns the survivors as an ordered list
sorted(set('qqqqabracadabra'))

['a', 'b', 'c', 'd', 'q', 'r']

In [109]:
# generator expression: the cubes are handed to sum() one at a time, no intermediate list
sum(x**3 for x in xrange(10))

2025

In [110]:
# build a set of all file extensions in the current dir (set comprehension, not a dict);
# entries without an extension contribute the empty string
{os.path.splitext(filename)[1] for filename in os.listdir('.')}

{'', '.ipynb', '.txt'}

In [111]:
# dict comprehension: map every entry of the current dir to its size as reported by os.path.getsize
{filename : os.path.getsize(filename) for filename in os.listdir('.')}

{'.git': 4096L,
 '.ipynb_checkpoints': 0L,
 'Untitled.ipynb': 5832L,
 'deduplicate_files_in_dir.ipynb': 4587L,
 'test.ipynb': 0L,
 'test.txt': 108L}

## How to call a function until a sentinel value

In [112]:
# test.txt is the sample file that the next cells read in blocks of 10
print 'Size of the file is: ', os.path.getsize('test.txt')

Size of the file is:  108


In [113]:
blocks = []
with open('test.txt', 'r') as f:
    while True:
        block = f.read(10)
        if block == '':
            break
        blocks.append(block)

print blocks

['Memory pal', 'aces are a', 'wesome!\n\nl', 'ocation = ', 'word\nimage', ' = pronoun', 'ciation\nac', 'tion = mea', 'ning\n\npriv', 'et kak del', 'a!']


In [114]:
from functools import partial
with open('test.txt', 'r') as f:
    blocks = []
    for block in iter(partial(f.read, 10), ''):
        blocks.append(block)

print blocks

['Memory pal', 'aces are a', 'wesome!\n\nl', 'ocation = ', 'word\nimage', ' = pronoun', 'ciation\nac', 'tion = mea', 'ning\n\npriv', 'et kak del', 'a!']


## How to avoid flag variables

In [128]:
from random import shuffle
# Sample data for the find() demos: the numbers 0..9 in random order.
# NOTE: no seed is set, so the order (and the position printed below) differs per run.
seq = range(10)
shuffle(seq)  # shuffles the list in place
look_for = 5

In [129]:
def find(seq, target):
    """Return the index of the first item equal to target, or -1 if absent.

    This is the flag-variable version: a boolean records whether the loop
    stopped because of a match, and is checked after the loop.
    """
    found = False
    position = -1
    for position, item in enumerate(seq):
        if item == target:
            found = True
            break
    if found:
        return position
    return -1

In [130]:
print seq
print ('Number %s is on the %s position') % (look_for, find(seq, look_for))

[8, 3, 1, 4, 5, 6, 2, 0, 7, 9]
Number 5 is on the 4 position


In [131]:
def find(seq, target):
    """Return the index of the first item equal to target, or -1 if absent.

    Uses the for/else construct instead of a flag: the else branch runs only
    when the loop finishes without executing break.
    """
    for position, item in enumerate(seq):
        if item == target:
            break
    else:  # loop body ran to completion, nothing left to scan: no match
        position = -1
    return position

In [132]:
print seq
print ('Number %s is on the %s position') % (look_for, find(seq, look_for))

[8, 3, 1, 4, 5, 6, 2, 0, 7, 9]
Number 5 is on the 4 position


## Dictionary skills

*If you mutate something while you're iterating over it, you're living in a state of sin and you deserve whatever happens to you.*

In [134]:
# sample dictionary for the looping demos: entry name in the current dir -> size from os.path.getsize
d = {filename : os.path.getsize(filename) for filename in os.listdir('.')}

In [137]:
# Loop over the keys and look each value up with d[key].
# Is it fast? No: every d[key] has to re-hash the key and do a lookup on it.
for key in d:
    print key, '-->', d[key]

test.ipynb --> 0
.ipynb_checkpoints --> 0
deduplicate_files_in_dir.ipynb --> 4587
.git --> 4096
Untitled.ipynb --> 6788
test.txt --> 108


In [139]:
# iteritems() hands back each (key, value) pair directly, so the value does not
# have to be fetched by re-hashing the key and looking it up as in the loop above
for key, value in d.iteritems():
    print key, '-->', value

test.ipynb --> 0
.ipynb_checkpoints --> 0
deduplicate_files_in_dir.ipynb --> 4587
.git --> 4096
Untitled.ipynb --> 6788
test.txt --> 108


In [146]:
from string import capwords
# Actor names are capitalised word by word; movies[i] belongs to names[i].
names = [
    capwords(name)
    for name in ['christian bale', 'brad pitt', 'charlize therone', 'john travolta']
]
movies = [
    'Dark Knight',
    'Fight Club',
    'Prometheus',
    'Pulp Fiction',
]

In [150]:
dict(zip(names, movies)) # itertools.izip is faster in 2.7; in 3.x it's just zip

{'Brad Pitt': 'Fight Club',
 'Charlize Therone': 'Prometheus',
 'Christian Bale': 'Dark Knight',
 'John Travolta': 'Pulp Fiction'}

In [151]:
# enumerate() yields (index, item) pairs, so dict() maps each position to its movie
dict(enumerate(movies))

{0: 'Dark Knight', 1: 'Fight Club', 2: 'Prometheus', 3: 'Pulp Fiction'}

## Counting with dictionaries

In [155]:
# sample data for the counting demos below (contains repeated values on purpose)
colors = ['red', 'yellow', 'red', 'blue', 'red', 'blue']

In [159]:
d = {}
for color in colors:
    if color not in d:
        d[color] = 0
    d[color] += 1

print d

{'blue': 2, 'yellow': 1, 'red': 3}


In [160]:
# dict.get() supplies 0 for a color that has not been seen yet,
# so the explicit "if color not in d" test is no longer needed
d = {}
for color in colors:
    d[color] = d.get(color, 0) + 1

print d

{'blue': 2, 'yellow': 1, 'red': 3}


In [162]:
from collections import defaultdict
# defaultdict(int) creates a missing key with int() == 0 on first access,
# so += 1 works without any initialisation step
d = defaultdict(int)
for color in colors:
    d[color] += 1
    
print d

defaultdict(<type 'int'>, {'blue': 2, 'yellow': 1, 'red': 3})


## Grouping with dictionaries

In [166]:
# sample data for the grouping demos below: words of several different lengths
words = ['test', 'film', 'dict', 'movie', 'picture', 'ghastly', 'ostentatious']

In [167]:
d = {}
for word in words:
    key = len(word)
    if key not in d:
        d[key] = []
    d[key].append(word)
    
print d

{12: ['ostentatious'], 4: ['test', 'film', 'dict'], 5: ['movie'], 7: ['picture', 'ghastly']}


In [170]:
# setdefault() returns d[key], inserting an empty list first when the key is
# missing, which replaces the explicit "if key not in d" test
d = {} # there is a better way than the explicit membership test!
for word in words:
    key = len(word)
    d.setdefault(key, []).append(word)
    
print d

{12: ['ostentatious'], 4: ['test', 'film', 'dict'], 5: ['movie'], 7: ['picture', 'ghastly']}


The modern way is to use `defaultdict`. It is also faster.

In [228]:
from collections import defaultdict
# defaultdict(list) creates an empty list for a key on first access,
# so append() can be called without preparing the entry
d = defaultdict(list)
for word in words:
    key = len(word)
    d[key].append(word)
    
print d

defaultdict(<type 'list'>, {12: ['ostentatious'], 4: ['test', 'film', 'dict'], 5: ['movie'], 7: ['picture', 'ghastly']})


Is a dictionary's popitem() atomic? Yes. You don't have to put locks around it, so it can be used between threads to pull out tasks.

In [257]:
# Drain a dictionary with popitem(): each call removes and returns one
# arbitrary (key, value) pair, and the loop ends when the dict is empty.
#
# Work on a snapshot. Binding d to os.environ itself would make the loop
# empty the live environment mapping for the rest of the kernel session.
# NOTE(review): this prints every environment variable; clear the output
# before sharing the notebook if the environment holds secrets.
d = dict(os.environ)
while d:
    key, value = d.popitem()
    # single-argument print(...) gives the same "key --> value" line on 2.x and 3.x
    print('%s --> %s' % (key, value))

## Roman numerals

In [98]:
# Roman numeral symbols and their values, paired column by column and listed
# from the largest value down. Each two-letter subtractive form (CM, CD, XC,
# XL, IX, IV) comes before the single letter it starts with; rom_to_int walks
# this table in order, so the two-letter forms get the first chance to match.
ints = (1000, 900,  500, 400, 100,  90, 50,  40, 10,  9,   5,  4,   1)
nums = ('M',  'CM', 'D', 'CD','C', 'XC','L','XL','X','IX','V','IV','I')
lst = (zip(nums, ints))
print lst

[('M', 1000), ('CM', 900), ('D', 500), ('CD', 400), ('C', 100), ('XC', 90), ('L', 50), ('XL', 40), ('X', 10), ('IX', 9), ('V', 5), ('IV', 4), ('I', 1)]


In [101]:
def rom_to_int(string):
    """Convert a Roman numeral string to its integer value.

    Walks the module-level table `lst` of (numeral, value) pairs in order and,
    for each numeral, consumes as many consecutive copies as appear at the
    current position of `string`, adding the numeral's value for each copy.

    No validation is done: text that never matches a numeral is simply not
    consumed, and the sum accumulated so far is returned.
    """
    total = 0
    pos = 0
    for roman, value in lst:
        width = len(roman)
        # Repeat while the next `width` characters are exactly this numeral.
        while string[pos:pos + width] == roman:
            total += value
            pos += width
    return total

In [102]:
# M + CM + XC + IX = 1000 + 900 + 90 + 9
print rom_to_int('MCMXCIX')

1999


## Updating multiple state variables

In [106]:
def fibonacci(n):
    """Return the n-th Fibonacci number, counting from fibonacci(0) == 0.

    The two state variables are advanced together with a single tuple
    assignment, so no temporary is needed and neither variable is ever
    observed in a half-updated state.

    n -- number of steps to advance; values <= 0 give 0 because the loop
         body never runs.
    """
    x, y = 0, 1
    for _ in range(n):
        # The right-hand side is evaluated in full before either name is rebound.
        x, y = y, x + y
    return x

In [107]:

# Illustration only: updating four state variables in one tuple assignment.
# None of x, y, dx, dy, t, m or func is defined at top level anywhere in the
# visible notebook, so running this cell as-is raises NameError.
x, y, dx, dy = (x+dx+t, 
                y+dy+t, 
                func(m, x, y, dx, dy, partial='x'), 
                func(m, x, y, dx, dy, partial='y'))

NameError: name 'x' is not defined

## Updating sequences

In [110]:
# Replace the first element of a plain list by deleting index 0 and
# inserting at index 0: the approach the deque cell below improves on.
colors = ['red', 'blue', 'yellow']
del colors[0]
colors.insert(0, 'black')
print colors

['black', 'blue', 'yellow']


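This is too slow: deleting from or inserting at the front of a list has to shift every remaining element. What's the correct data structure? `collections.deque`.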

In [113]:
from collections import deque
# Same front-of-sequence update as the list version, done with the deque's
# dedicated left-end operations.
colors = deque(['red', 'blue', 'yellow'])
colors.popleft()  # removes 'red'; the returned value is not used
colors.appendleft('black')
print colors

deque(['black', 'blue', 'yellow'])


## Use `with` for teardown logic

In [135]:
from decimal import localcontext, Context, Decimal
# localcontext() installs the 50-digit context for the body of the with block
# and restores the previous decimal context on exit, so no manual save and
# restore is needed.
with localcontext(Context(prec=50)):
    print Decimal(355)/Decimal(113)

3.1415929203539823008849557522123893805309734513274


AttributeError: 'list' object has no attribute 'cells'