In [62]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

import os

import numpy as np
import itertools

### PEP8

In [12]:
a = []
not a # preferred
len(a) == 0 # avoid


d = {}
not d
len(d) == 0 # avoid

s = ''
not s # preferred
len(s) # avoid

True

True

True

True

True

0

In [13]:
open??

### string and bytes

* bytes contain raw, unsigned 8-bit values(often displayed in the ASCII encoding). 
* ASCII: each byte can store up to 255 numbers. 2**8 -1. for example all english letters and other special important characters can be represented by  numbers 32-127.
* Unicode: but we have more than 255 characters in other languages, so ASCII is not enough. use code-points to represent characters, like U+00639, hexdecimal
* uft-8: encoding standard, use at least 8-bits to store unicode points. 0-127 using 1 byte(8 bits), 128 and above using 2,3,4 bytes 
* str contain unicode code points

In [279]:
s = 'this is a test! \x65, \u0300'
b = b'this is a test! \x65\u0300'
s
b
print(repr(s))
print(repr(b))
print(list(b))
print(list(s))

'this is a test! e, ̀'

b'this is a test! e\\u0300'

'this is a test! e, ̀'
b'this is a test! e\\u0300'
[116, 104, 105, 115, 32, 105, 115, 32, 97, 32, 116, 101, 115, 116, 33, 32, 101, 92, 117, 48, 51, 48, 48]
['t', 'h', 'i', 's', ' ', 'i', 's', ' ', 'a', ' ', 't', 'e', 's', 't', '!', ' ', 'e', ',', ' ', '̀']


In [288]:
# string encode to bytes and bytes decode to string
a = b'this is byte data!'.decode('utf-8')
a;type(a)
b = 'this is string data type.'.encode('utf-8')
b; type(b)

'this is byte data!'

str

b'this is string data type.'

bytes

In [289]:
b'foo' == 'foo' # bytes == string always evaluated to Falseb


False

### string formatting F-string

In [301]:
'aaa'.title()

'Aaa'

In [316]:
pantry = [('avocado', 1.25), ('banana', 2.5), ('cherries', 15),]
# multiple unpacking preferred over indexing
for i, (item, count) in enumerate(pantry):
    #  print(item.title(), item)
    # < left align, 10 spaces
    # !r repr 
    # 6.3f, numbers occupy 6 spaces, 3 digits after decimal
    f_string = f'#{i+1}: {item.title():<10s} = {round(count):6.3f}, {item!r:>15s}'
    print(f_string)

#1: Avocado    =  1.000,       'avocado'
#2: Banana     =  2.000,        'banana'
#3: Cherries   = 15.000,      'cherries'


### enumerate over range

In [320]:
flavor_list = ['vanilla', 'pecan', 'strawberry']

it = enumerate(flavor_list) # generator
next(it)
next(it)

for i, flavor in enumerate(flavor_list, 3): # start from 3 to enumerate
    print(f'{i}: {flavor}')

(0, 'vanilla')

(1, 'pecan')

3: vanilla
4: pecan
5: strawberry


### zip and itertools.zip_longest

In [329]:
import itertools
max_count = 0
names = ['Sonya', "Kathryne", 'Alisa']
counts = [len(n) for n in names]
names.append('Owen')
names
counts
it = zip(names, counts) # lazy generator, stop when any iterators is exhansted
next(it)
next(it)

for name, count in zip(names, counts):
    print(name)
    if count > max_count:
        longest_name = name
        max_count = count
print(longest_name)


for name, count in itertools.zip_longest(names, counts, fillvalue='kkkkk'):
    print(f'{name}: {count}')

['Sonya', 'Kathryne', 'Alisa', 'Owen']

[5, 8, 5]

('Sonya', 5)

('Kathryne', 8)

Sonya
Kathryne
Alisa
Kathryne
Sonya: 5
Kathryne: 8
Alisa: 5
Owen: kkkkk


### walrus assignment expression

In [334]:
fresh_fruit = {'apple': 10, 'banana': 8, 'lemon': 5}
def make_lemonade(count):
    pass

if count := fresh_fruit.get('lemon', 0):
    print(f'there are still {count} lemons!')
    make_lemonade(count)
else:
    out_of_stock()
        

there are still 5 lemons!


In [338]:
class OutOfBananas(Exception):
    pass


def slice_bananas(count):
    pass

def make_smoothies(pieces):
    pass

pieces = 0
if (count := fresh_fruit.get('banana', 0)) >= 2:
    pieces = slice_bananas(count)
try:
    smoothies = make_smoothies(pieces)
except OutOfBananas:
    out_of_stock()

In [342]:
def pick_fruit():
    pass 

bottles = []
while fresh_fruit := pick_fruit():
    for fruit, count in fresh_fruit.items():
        print(f'fruit: count, {fruit}: {count}')
        batch = make_juice(fruit, count)
        bottles.extend(batch)

In [349]:
# no out of range exception here
list('hello')[-40:]
list('hello')[:40]
list('hello')[20]

['h', 'e', 'l', 'l', 'o']

['h', 'e', 'l', 'l', 'o']

IndexError: list index out of range

In [369]:
a = list('hello')
a
# assign a list to a slice, does not need to be the same length
a[2:4] = '12345'
a
a[2:7] = '3'
a
a[::-1]

['h', 'e', 'l', 'l', 'o']

['h', 'e', '1', '2', '3', '4', '5', 'o']

['h', 'e', '3', 'o']

['o', '3', 'e', 'h']

In [367]:
# slicing produces a new list
a = list('hello')
b = a[:]
a == b
a is b

# assigning without start and end index, b is a copy of a.
a = list('hello')
b = a
a == b
a is b

a
b
a[3] = '1000'
a 
b

True

False

True

True

['h', 'e', 'l', 'l', 'o']

['h', 'e', 'l', 'l', 'o']

['h', 'e', 'l', '1000', 'o']

['h', 'e', 'l', '1000', 'o']

### starred expression, catch-all unpacking

In [372]:
car_ages = [9, 0, 8, 4, 0, 7]
oldest, second_old, *others = car_ages
others
second_old
oldest, *other, youngest = car_ages
other 
youngest

[8, 4, 0, 7]

0

[0, 8, 4, 0]

7

### sort list using key parameter

list.sort() sort string, interger, float etc by natural order, but it does not work for object unless you implement special comparison method

In [392]:
class Tool:
    def __init__(self, name, weight):
        self.name = name
        self.weight = weight
        
    def __repr__(self):
        return f'Tool({self.name!r}, {self.weight})'
    
tools = [
    Tool('level', 3.5),
    Tool('hammer', 1.25),
    Tool('screwdriver', 0.5), 
    Tool('plier', 0.3), 
    Tool('chisel', 0.5)
]

# tools.sort()
print(repr(tools))
tools.sort(key=lambda x: x.name)
print(repr(tools))

tools.sort(key=lambda x: x.weight)
print(repr(tools))

# sort first by  weight and then by name
# tuple comparison, compare first position and then the next position in the tuples
tools.sort(key=lambda x: (x.weight, x.name), reverse=True)
tools

# negate numerical value, sort name and weight in different order
tools.sort(key=lambda x: (-x.weight, x.name), reverse=False)
tools

# stable sort preserve the input order when key function return the same value
# sort by weight descending and then name ascending, two step sort needs to sort on name first then weight
tools.sort(key=lambda x: x.name) # name ascending
tools.sort(key=lambda x: x.weight, reverse=True) # wight descending
tools

[Tool('level', 3.5), Tool('hammer', 1.25), Tool('screwdriver', 0.5), Tool('plier', 0.3), Tool('chisel', 0.5)]
[Tool('chisel', 0.5), Tool('hammer', 1.25), Tool('level', 3.5), Tool('plier', 0.3), Tool('screwdriver', 0.5)]
[Tool('plier', 0.3), Tool('chisel', 0.5), Tool('screwdriver', 0.5), Tool('hammer', 1.25), Tool('level', 3.5)]


[Tool('level', 3.5),
 Tool('hammer', 1.25),
 Tool('screwdriver', 0.5),
 Tool('chisel', 0.5),
 Tool('plier', 0.3)]

[Tool('level', 3.5),
 Tool('hammer', 1.25),
 Tool('chisel', 0.5),
 Tool('screwdriver', 0.5),
 Tool('plier', 0.3)]

[Tool('level', 3.5),
 Tool('hammer', 1.25),
 Tool('chisel', 0.5),
 Tool('screwdriver', 0.5),
 Tool('plier', 0.3)]

In [384]:
places = ['home', 'work', 'New York', 'Paris']
places.sort()
places

places.sort(key=lambda x: x.lower())
places

['New York', 'Paris', 'home', 'work']

['home', 'New York', 'Paris', 'work']

### type annotation

In [409]:
from typing import Dict

def get_winner(ranks: Dict[str, int]) -> str:
    return next(iter(ranks))

ranks = {'cat': 40, 'dog': 3}
names = list(ranks.keys())
names.sort(key=ranks.get)
sorted_ranks = {k:ranks[k] for k in names}
get_winner(sorted_ranks)


# type annotation is not doing what I expected
sorted_ranks['dog'] = 'xxx'
sorted_ranks
get_winner(sorted_ranks)


'dog'

{'dog': 'xxx', 'cat': 40}

'dog'

In [419]:
def careful_divide(a: float, b: int) -> float:
    return a / b

In [420]:
careful_divide(3.4, 0.88)

3.8636363636363633

In [400]:
ranks.get('dog')

3

### dictionary: handle missing keys

In [424]:
counters = {'apple':2, 'orange':3}
counters.get('orange', 0)
counters.get('banana', 0)
counters


key = 'carrot'
count = counters.get(key, 0)
counters[key] = count + 1
counters

3

0

{'apple': 2, 'orange': 3}

{'apple': 2, 'orange': 3, 'carrot': 1}

use collections.Counter to maintain counts

In [432]:
from collections import Counter
# Counter?
c = Counter('i am litter teapot!')
c
c.most_common(3)


Counter({'i': 2,
         ' ': 3,
         'a': 2,
         'm': 1,
         'l': 1,
         't': 4,
         'e': 2,
         'r': 1,
         'p': 1,
         'o': 1,
         '!': 1})

[('t', 4), (' ', 3), ('i', 2)]

### triple assignment

In [446]:
votes = {'Biden': ['A', 'B'], 'Warren': ['C'], 'Sanders': ['H', 'I', 'J']}
votes

# populate key in one line


def populate_dict(key, who):
    if (names := votes.get(key)) is None:
        print(names)
        votes[key] = names = []
    names.append(who)
    print(names)
    return votes

key = 'Obama'
who = 'X'
populate_dict(key, who)

key = 'Sanders'
who = 'Y'
populate_dict(key, who)

# use setdefault be careful, you need to contruct a default value for each key
key = 'Trump'
who = 'X'
names = votes.setdefault(key, [])
names.append(who)
votes

{'Biden': ['A', 'B'], 'Warren': ['C'], 'Sanders': ['H', 'I', 'J']}

None
['X']


{'Biden': ['A', 'B'],
 'Warren': ['C'],
 'Sanders': ['H', 'I', 'J'],
 'Obama': ['X']}

['H', 'I', 'J', 'Y']


{'Biden': ['A', 'B'],
 'Warren': ['C'],
 'Sanders': ['H', 'I', 'J', 'Y'],
 'Obama': ['X']}

{'Biden': ['A', 'B'],
 'Warren': ['C'],
 'Sanders': ['H', 'I', 'J', 'Y'],
 'Obama': ['X'],
 'Trump': ['X']}

### defaultdict is much better than setdefault if you control the creation of the dictionary

In [454]:
from collections import defaultdict
class Visits:
    def __init__(self):
        self.data = defaultdict(set)
        
    def add(self, country, city):
        self.data[country].add(city)
        
visits = Visits()
visits.data
visits.add('England', 'Batch')
visits.add('England', 'London')
visits.add('China', 'Beijing')
visits.add('England', 'London')
visits.data

visits.data['China']
visits.data['France']

defaultdict(set, {})

defaultdict(set, {'England': {'Batch', 'London'}, 'China': {'Beijing'}})

{'Beijing'}

set()

### ITEM 20: PREFER RAISING EXCEPTIONS TO RETURNING NONE

In [9]:
def careful_divide(a, b):
    try:
        return a / b
    except ZeroDivisionError:
        return None

In [11]:
def careful_divide(a, b):
    try:
        return a / b
    except ZeroDivisionError as e:
        raise ValueError('Invalid inputs')

In [12]:
x, y = 5, 0
try:
    result = careful_divide(x, y)
except ValueError:
    print('Invalid inputs')
else:
    print('Result is %.1f' % result)

Invalid inputs


### ITEM 21: KNOW HOW CLOSURES INTERACT WITH VARIABLE SCOPE

In [5]:
def sort_priority(values, group):
    def helper(x):
        print(x)
        if x in group:
            return (0, x)
        return (1, x)
    values.sort(key=helper)

In [4]:
# numbers.sort??
# if key is a function, it is applied to each item in the list and then sort the list

In [7]:
numbers = [8, 3, 1, 2, 5, 4, 7, 6]
group = {2, 3, 5, 7}
sort_priority(numbers, group)
print(numbers)

8
3
1
2
5
4
7
6
[2, 3, 5, 7, 1, 4, 6, 8]


it is important to understand the scop. the outside def is called enclosing function, the inside function is called closure function. the closure can access variables in the enclosing scope. when you reference a variable, it looks the inner function scope first, then the enclosing scope, and then the module where this function is define, and lastly the build-in function scope. if it is not found, it raises a nameerror.
for variable assignment, if you assign a value to a variable in the inside function, it will not affect the variable with the same name in the outside function scope unless you use nonlocal variable. this is to prevent polluting the scope.

In [11]:
def sort_priority(values, group):
    found = False
    def helper(x):
        print(x)
        if x in group:
            found = True
            print(f'found in inside scope is {found}')
            return (0, x)
        return (1, x)
    
    values.sort(key=helper)
    print(f'found in outside scope is {found}')
    
sort_priority(numbers, group)

2
found in inside scope is True
3
found in inside scope is True
5
found in inside scope is True
7
found in inside scope is True
1
4
6
8
found in outside scope is False


### this is easier to understand

In [27]:
class Sorter:
    def __init__(self, group):
        self.group = group
        self.found = False

    def __call__(self, x):
        if x in self.group:
            self.found = True
            return (0, x)
        return (1, x)

sorter = Sorter(group)
print(sorter)
numbers.sort(key=sorter)
assert sorter.found is True
print(numbers)

<__main__.Sorter object at 0x7f439b2787b8>
[2, 3, 5, 7, 1, 4, 6, 8]


When you reference a variable in an expression, the Python interpreter traverses the scope to resolve the reference in this order:

The current function’s scope.

Any enclosing scopes (such as other containing functions).

The scope of the module that contains the code (also called the global scope).

The built-in scope (that contains functions like len and str).

In [256]:
not []
not None
not 0
not False
not {}
not ()
not ''

True

True

True

True

True

True

True

In [33]:
def print_parameters(**kwargs):
    for key, value in kwargs.items():
        print(f'{key} = {value}')

print_parameters(alpha=1.5, beta=9, gamma=4)
print_parameters(alpha=1.5, beta=9, gamma=4, sigma=66)

alpha = 1.5
beta = 9
gamma = 4
alpha = 1.5
beta = 9
gamma = 4
sigma = 66


### item 26: define function decorators with functools.wraps

functools wraps helper function copies all of the important metadata about inner function to the outer function. so you actually get the doc string and the correct function name etc.

In [61]:
from functools import wraps
def deco(func):
    @wraps(func)
    def print_details(*args):
        result = func(*args)
        print(f'{func.__name__}({args!r}) -> {result!r}')
        return result
    return print_details

@deco
def sumup(a, b):
    """calculate sum of two values"""
    return a + b

# sumpup just return 7 initially, with the decorator, it is modified at runtime, it prints out the input arguments as well
sumup(3, 4)

deco(sumup(3, 4))
help(sumup)

sumup((3, 4)) -> 7


7

sumup((3, 4)) -> 7


<function __main__.deco.<locals>.print_details>

Help on function sumup in module __main__:

sumup(a, b)
    calculate sum of two values



#### a more complicated example with a recursive function

In [66]:
def trace(func):
    def wrapper(*args, **kwargs):
        result = func(*args, **kwargs)
        print(f'{func.__name__}({args!r}, {kwargs!r}) '
              f'-> {result!r}')
        return result
    return wrapper

# @trace # equivalent to fibonacci = trace(fibonacci)
def fibonacci(n):
    """Return the n-th Fibonacci number"""
    print(f'calculate fibonacci number of n={n}')
    if n in (0, 1):
        return n
    return (fibonacci(n - 2) + fibonacci(n - 1))

In [68]:
# when there is no decorator, it just find out fibonacci number of 4
# decorator can modify a function at runtime
# i still don't understand the execution sequence
fibonacci(3)

calculate fibonacci number of n=3
calculate fibonacci number of n=1
calculate fibonacci number of n=2
calculate fibonacci number of n=0
calculate fibonacci number of n=1


2

In [64]:
fibonacci(4)

calculate fibonacci number of n=4
calculate fibonacci number of n=2
calculate fibonacci number of n=0
fibonacci((0,), {}) -> 0
calculate fibonacci number of n=1
fibonacci((1,), {}) -> 1
fibonacci((2,), {}) -> 1
calculate fibonacci number of n=3
calculate fibonacci number of n=1
fibonacci((1,), {}) -> 1
calculate fibonacci number of n=2
calculate fibonacci number of n=0
fibonacci((0,), {}) -> 0
calculate fibonacci number of n=1
fibonacci((1,), {}) -> 1
fibonacci((2,), {}) -> 1
fibonacci((3,), {}) -> 2
fibonacci((4,), {}) -> 3


3

In [65]:
trace(fibonacci(4))

calculate fibonacci number of n=4
calculate fibonacci number of n=2
calculate fibonacci number of n=0
fibonacci((0,), {}) -> 0
calculate fibonacci number of n=1
fibonacci((1,), {}) -> 1
fibonacci((2,), {}) -> 1
calculate fibonacci number of n=3
calculate fibonacci number of n=1
fibonacci((1,), {}) -> 1
calculate fibonacci number of n=2
calculate fibonacci number of n=0
fibonacci((0,), {}) -> 0
calculate fibonacci number of n=1
fibonacci((1,), {}) -> 1
fibonacci((2,), {}) -> 1
fibonacci((3,), {}) -> 2
fibonacci((4,), {}) -> 3


<function __main__.trace.<locals>.wrapper(*args, **kwargs)>

### position and keyword arguments, position-only and keyword-only argument

In [33]:
nums = [1, 2, 4]
print(nums)
print(*nums) #unpack a sequence

[1, 2, 4]
1 2 4


In [None]:
# *args, unknown number of position argumeents
# **kargs, unknown number of keyword arguments
def some_func(*args, **kargs):
    print(args[0]+args[1], args, kargs)
    
some_func(1, 2, 'arg1', kargs1='arg2')

In [26]:
def some_func(*args, **kargs):
    print(args, kargs)
    return args, kargs
    
nums = [1, 2, 4]
players = {'John':'forward', 'Mike': 'defense'}
# some_func(nums, players) # this is not correct
# some_func(nums, **players) # this is not correct
# * operator instructs python to pass values from a sequence as positional arguments to the function
# ** operator instructs python to pass values in the dictionary as corresponding keyword arguments.
a, b = some_func(*nums, **players) # this unpacks both the list and dictionary 
a
b
a[1]

(1, 2, 4) {'John': 'forward', 'Mike': 'defense'}


(1, 2, 4)

{'John': 'forward', 'Mike': 'defense'}

2

In [60]:
def some_func(a, b, k1=10, k2=100):
    print(a+b+k1+k2)
    
some_func(1, 2)
some_func(1, 2, 3)
some_func(1, 2, k2=3)

113
106
16


In [4]:
!python --version

Python 3.8.1


In [10]:
# you can provide positional using keyword, you can also provide keyword argument by position
def some_func(a, b=3):
    return a + b
some_func(2, 3)
some_func(a=2)
some_func(a=2, b=3)
# some_func(a=2, 3) this does not work, positional argument has to be before keyword argument

5

5

5

In [27]:
# arguments before / are position-only
# arguments after * are keyword-only
# in between can be either
def some_func(a, b, /, c, *, k1=10, k2=100):
    print(a+b+k1+k2)
    

some_func(1, 2, 3)
some_func(1, 2, c=2, k2=3)
some_func(1, 2, 2, k2=3)

113
16
16


### list, set, dictionary comprehension and generator expression

In [1]:
stock = {
    'nails': 125,
    'screws': 35,
    'wingnuts': 8,
    'washers': 24,
}

order = ['screws', 'wingnuts', 'clips']

def get_batches(count, size):
    return count // size

In [7]:
# list comprehesion
nums = [1, 3, 4, 5, 6]
squares = [x**2 for x in nums if x % 3 == 0]
squares

[9, 36]

In [14]:
# this can also be achieved by using map and filter, but visually noisy
list(map(lambda x: x**2, filter(lambda x: x % 3 == 0, nums)))

[9, 36]

In [6]:
# set comprehension
{x**3 for x in nums if x % 2 == 0}

{64, 216}

In [36]:
stock.get??

In [4]:
stock.items()

dict_items([('nails', 125), ('screws', 35), ('wingnuts', 8), ('washers', 24)])

In [3]:
# dictionary comprehesion
# get value by key, default to 0
{name: stock.get(name, 0) for name in order}

{'screws': 35, 'wingnuts': 8, 'clips': 0}

In [17]:
get_batches??

In [16]:
# walrus assignment :=
found = {name: batches for name in order
         if (batches := get_batches(stock.get(name, 0), 8))}

found

{'screws': 4, 'wingnuts': 1}

In [45]:
# why it does not output clips: 0, because if 0 evaluated to False
{name: get_batches(stock.get(name, 0), 8) for name in order}

{'screws': 4, 'wingnuts': 1, 'clips': 0}

In [87]:
nums = [[1, 2, 3], [4, 5, 6], [7, 8, 9]]
nums

# subexpression runs from left to right
[x**2 for row in nums for x in row]
[[x**2 for x in row] for row in nums]

[[1, 2, 3], [4, 5, 6], [7, 8, 9]]

[1, 4, 9, 16, 25, 36, 49, 64, 81]

[[1, 4, 9], [16, 25, 36], [49, 64, 81]]

In [119]:
nums = [1, 4, 7]
# generator expression is same as list comprehension, but with () instead of [], it is evaluated to an iterator
it = (x for x in nums)
it
# list(it)
# next(it)

# generators can be composed together
# it = (x**2 for x in it)
# list(it)

# this composition can also be achieved by using yield from
nums = [1, 4, 7]
# it = (x for x in nums)
def it():
    for i in range(1_000_000):
        yield i**2
        
def square(x):
    for i in x:
        yield i**2
    
def yf():    
    yield from it()
    
print('----')
# list(square(it))

yf()

<generator object <genexpr> at 0x109117f90>

----


<generator object yf at 0x109117270>

### Item 30: consider generators instead of returning list

In [35]:
def index_words_iter(text):
    if text:
        yield 0
    for index, letter in enumerate(text):
        if letter == ' ':
             yield index + 1

In [38]:
# yield produces generator, it returns a item at a time when you call next build-in method
address = 'Four score and seven years ago...'
it = index_words_iter(address)
print(next(it))
print(next(it))

list(it)

0
5


[11, 15, 21, 27]

In [67]:
list(enumerate(address))[:2]

[(0, 'F'), (1, 'o')]

In [44]:
# generator is stateful and can not be resued. It is exhausted after you read each element once,
# does not consume huge amount of memories, one a line in the memory
# good for data streaming
# generator can be composed together

nums = list(range(100))
it = (len(x) for x in open('wages.txt'))
# next(it)
# next(it)
# next(it)

double = ((x, x*2) for x in it)
# next(double)
# next(double)
# next(double)

In [46]:
def yf(it):
    yield from it
    
# list(yf(it))

In [47]:
 
# help(itertools)

### Item 31: be defensive when iterating over arguments

difference between iterator and container, iterator and generator raise stopiteration exception when they are exhausted, you can not repeated access the content of a iterator or generator. On the other hand, list, sequence or container type can be accessed multiple times.

In [53]:
nums = [1, 2, 5]
nums[1]
nums[1]
it = iter(nums)
list(it)
next(it)

2

2

[1, 2, 5]

StopIteration: 

In [55]:
# iter??

Object `get_iter` not found.


In [68]:
# implement a generator
def read_visit(data_path):
    with open(data_path) as f:
        for line in f:
            yield int(line)
            
f = '/Users/stuartzong/data_science/visits.txt'
gr = read_visit(f)
gr
list(gr)

<generator object read_visit at 0x1090a2820>

[12, 22, 34, 6612, 22, 34, 66]

In [73]:
# implement a container
class ReadVisits:
    def __init__(self, data_path):
        self.data_path = data_path
        
    def __iter__(self):
        with open(self.data_path) as f:
            for line in f:
                yield int(line)

In [79]:
from collections.abc import Iterator
visits = ReadVisits(f)
type(visits)
isinstance(visits, Iterator)
isinstance(iter(visits), Iterator)
visits
list(visits)
list(visits)

__main__.ReadVisits

False

True

<__main__.ReadVisits at 0x1090c2a30>

[12, 22, 34, 6612, 22, 34, 66]

[12, 22, 34, 6612, 22, 34, 66]

### class

In [16]:
class SimpleGradebook:
    def __init__(self):
        self._grades = {}

    def add_student(self, name):
      self._grades[name] = []

    def report_grade(self, name, score):
      self._grades[name].append(score)

    def average_grade(self, name):
      grades = self._grades[name]
      return sum(grades) / len(grades)

In [18]:
book = SimpleGradebook()
book.add_student('Isaac Newton')
book.report_grade('Isaac Newton', 90)
book.report_grade('Isaac Newton', 95)
book.report_grade('Isaac Newton', 85)

print(book.average_grade('Isaac Newton'))

90.0


In [20]:
book.__dict__
book.__getattr__

{'_grades': {'Isaac Newton': [90, 95, 85]}}

AttributeError: 'SimpleGradebook' object has no attribute '__getattr__'

### item 39: use @classmethod polymorphism to construct object generically

In python, both object and class support polymorphism. polymorphism allows multiple class in a hiearchy to implement their own unique version of a method. so many of the these class can have the same interface or abstraction base class while providing different functionality.

still not completely understand how this whole thing works

In [120]:
# this class provide standard interface, child class have to implemet these methods
class GenericInputData:
    def read(self):
        raise NotImplementedError
        
    # different subclasses implement this method differently
    @classmethod
    def generate_inputs(cls, config):
        raise NotImplementedError


In [131]:
class PathInputData(GenericInputData):
    def __init__(self, path):
        super().__init__()
        self.path = path
        
    def read(self):
        with open(self.path) as f:
            return f.read()
        
    @classmethod
    def generate_inputs(cls, config):
        data_dir = config['data_dir']
        for name in os.listdir(data_dir):
            yield cls(os.path.join(data_dir, name))

In [137]:
class GenericWorker:
    def __init__(self, input_data):
        self.input_data = input_data
        self.result = None
        
    def map(self):
        raise NotImplementedError
        
    def reduce(self, other):
        raise NotImplementedError
        
    @classmethod
    def create_workers(cls, input_class, cofig):
        workers = []
        for input_data in input_class.generate_inputs(config):
            workers.append(cls(input_data))
        return workers        

In [138]:
class LineCountWorker(GenericWorker):
    def map(self):
        data = self.input_data.read()
        self.result = data.count('\n')
        
    def reduce(self, other):
        self.result += other.result

In [141]:
from threading import Thread

def execute(workers):
    threads = [Thread(target=w.map) for w in workers]
    for thread in threads: thread.start()
    for thread in threads: thread.join()
    
    first, *rest = workers
    for worker in rest:
        first.reduce(worker)
    return first.result
        
        

In [139]:
def mapreduce(worker_class, input_class, config):
    workers = worker_class.create_workers(input_class, config)
    return execute(workers)


In [146]:
pwd

'/Users/stuartzong/data_science/python_general'

In [147]:

tmpdir = 'test_inputs'
config = {'data_dir': tmpdir}
result = mapreduce(LineCountWorker, PathInputData, config)
print(f'There are {result} lines!')

There are 9 lines!


### metaclass and attribute

In [25]:
# no need to implement getter and setter
class Resistor:
    def __init__(self, ohms):
        self.ohms = ohms
        self.voltage = 0
        self.current = 0

r1 = Resistor(50e3)
r1.ohms = 10e3
r1.__dict__
r1.ohms

{'ohms': 10000.0, 'voltage': 0, 'current': 0}

10000.0

In [26]:
# subclass Resistor, @property to implement getter and setter
class VoltageResistance(Resistor):
    def __init__(self, ohms):
        super().__init__(ohms)
        self._voltage = 0

    @property
    def voltage(self):
        return self._voltage

    @voltage.setter
    def voltage(self, voltage):
        self._voltage = voltage
        self.current = self._voltage / self.ohms

In [27]:
r2 = VoltageResistance(1e3)
print(f'Before: {r2.current:.2f} amps')
r2.voltage = 10
print(f'After:  {r2.current:.2f} amps')

Before: 0.00 amps
After:  0.01 amps


In [28]:
class BoundedResistance(Resistor):
    def __init__(self, ohms):
        super().__init__(ohms)

    @property
    def ohms(self):
        return self._ohms

    @ohms.setter
    def ohms(self, ohms):
        if ohms <= 0:
            raise ValueError(f'ohms must be > 0; got {ohms}')
        self._ohms = ohms

In [34]:
r3 = BoundedResistance(1e3)
r3.ohms
r3.ohms = 0

1000.0

ValueError: ohms must be > 0; got 0

In [33]:
BoundedResistance(-5)

ValueError: ohms must be > 0; got -5

In [35]:
class FixedResistance(Resistor):
    def __init__(self, ohms):
        super().__init__(ohms)

    @property
    def ohms(self):
        return self._ohms

    @ohms.setter
    def ohms(self, ohms):
        if hasattr(self, '_ohms'):
            raise AttributeError("Ohms is immutable")
        self._ohms = ohms

In [36]:
r4 = FixedResistance(1e3)
r4.ohms = 2e3

AttributeError: Ohms is immutable

In [37]:
class MysteriousResistor(Resistor):
    @property
    def ohms(self):
        self.voltage = self._ohms * self.current
        return self._ohms

    @ohms.setter
    def ohms(self, ohms):
        self._ohms = ohms

In [38]:
r7 = MysteriousResistor(10)
r7.current = 0.01
print(f'Before: {r7.voltage:.2f}')
r7.ohms
print(f'After:  {r7.voltage:.2f}')

Before: 0.00


10

After:  0.10


In [44]:
from datetime import datetime, timedelta

In [45]:
class NewBucket:
    def __init__(self, period):
        self.period_delta = timedelta(seconds=period)
        self.reset_time = datetime.now()
        self.max_quota = 0
        self.quota_consumed = 0

    def __repr__(self):
        return (f'NewBucket(max_quota={self.max_quota}, '
                f'quota_consumed={self.quota_consumed})')
    

    @property
    def quota(self):
        return self.max_quota - self.quota_consumed
    
    @quota.setter
    def quota(self, amount):
        delta = self.max_quota - amount
        if amount == 0:
            # Quota being reset for a new period
            self.quota_consumed = 0
            self.max_quota = 0
        elif delta < 0:
            # Quota being filled for the new period
            assert self.quota_consumed == 0
            self.max_quota = amount
        else:
           # Quota being consumed during the period
           assert self.max_quota >= self.quota_consumed
           self.quota_consumed += delta

In [51]:
def fill(bucket, amount):
    now = datetime.now()
    if (now - bucket.reset_time) > bucket.period_delta:
        bucket.quota = 0
        bucket.reset_time = now
    bucket.quota += amount

def deduct(bucket, amount):
    now = datetime.now()
    if (now - bucket.reset_time) > bucket.period_delta:
        return False  # Bucket hasn't been filled this period
    if bucket.quota - amount < 0:
        return False  # Bucket was filled, but not enough

    bucket.quota -= amount
    return True       # Bucket had enough, quota consumed


In [46]:
bucket = NewBucket(60)

In [48]:
bucket.__dict__
bucket

{'period_delta': datetime.timedelta(seconds=60),
 'reset_time': datetime.datetime(2020, 2, 26, 14, 7, 49, 139056),
 'max_quota': 0,
 'quota_consumed': 0}

NewBucket(max_quota=0, quota_consumed=0)

In [52]:
bucket = NewBucket(60)
print('Initial', bucket)
fill(bucket, 100)
print('Filled', bucket)

if deduct(bucket, 99):
    print('Had 99 quota')
else:
    print('Not enough for 99 quota')
print('Now', bucket)

if deduct(bucket, 3):
    print('Had 3 quota')
else:
    print('Not enough for 3 quota')

print('Still', bucket)

Initial NewBucket(max_quota=0, quota_consumed=0)
Filled NewBucket(max_quota=100, quota_consumed=0)
Had 99 quota
Now NewBucket(max_quota=100, quota_consumed=99)
Not enough for 3 quota
Still NewBucket(max_quota=100, quota_consumed=99)


### concurrency and parallelism

In [53]:
import subprocess

result = subprocess.run(
    ['echo', 'Hello from the child!'],
    capture_output=True,
    encoding='utf-8')

result.check_returncode()  # No exception means clean exit
print(result.stdout)

Hello from the child!



In [59]:
proc = subprocess.Popen(['sleep', '0.001'])
while proc.poll() is None:
    print('Working...')

    # Some time-consuming work here


print('Exit status', proc.poll())

Working...
Working...
Working...
Working...
Working...
Working...
Working...
Working...
Working...
Working...
Working...
Working...
Working...
Working...
Working...
Working...
Working...
Working...
Working...
Exit status 0


In [63]:
import time

start = time.time()
sleep_procs = []
for _ in range(10):
    proc = subprocess.Popen(['sleep', '1'])
    sleep_procs.append(proc)
for proc in sleep_procs:
    proc.communicate()

end = time.time()
delta = end - start
print(f'Finished in {delta:.3} seconds')

(None, None)

(None, None)

(None, None)

(None, None)

(None, None)

(None, None)

(None, None)

(None, None)

(None, None)

(None, None)

Finished in 1.06 seconds


In [65]:
import os
def run_encrypt(data):
    env = os.environ.copy()

    env['password'] = 'zf7ShyBhZOraQDdE/FiZpm/m/8f9X+M1'
    proc = subprocess.Popen(
        ['openssl', 'enc', '-des3', '-pass', 'env:password'],
        env=env,
        stdin=subprocess.PIPE,
        stdout=subprocess.PIPE)
    proc.stdin.write(data)
    proc.stdin.flush()  # Ensure that the child gets input
    return proc

procs = []
for _ in range(3):
    data = os.urandom(10)
    proc = run_encrypt(data)
    procs.append(proc)
    
for proc in procs:
    out, _ = proc.communicate()
    print(out[-10:])

b'\x86\xba\xf5[\xa7{\xaf\x12\xe3\x06'
b'lT[\x9d\t_\xb9\x1c\x8fk'
b'8\x18:ea6h:UZ'


In [66]:
def run_hash(input_stdin):
    return subprocess.Popen(
        ['openssl', 'dgst', '-whirlpool', '-binary'],
        stdin=input_stdin,
        stdout=subprocess.PIPE)

In [67]:
encrypt_procs = []
hash_procs = []
for _ in range(3):
    data = os.urandom(100)

    encrypt_proc = run_encrypt(data)
    encrypt_procs.append(encrypt_proc)

    hash_proc = run_hash(encrypt_proc.stdout)
    hash_procs.append(hash_proc)

    # Ensure that the child consumes the input stream and
    # the communicate() method doesn't inadvertently steal
    # input from the child. Also lets SIGPIPE propagate to
    # the upstream process if the downstream process dies.
    encrypt_proc.stdout.close()
    encrypt_proc.stdout = None

In [68]:
for proc in encrypt_procs:
    proc.communicate()
    assert proc.returncode == 0

for proc in hash_procs:
    out, _ = proc.communicate()
    print(out[-10:])
    assert proc.returncode == 0

(None, None)

(None, None)

(None, None)

b'\xda\xcf\xa1\xc0]\x9d\x91tF\xae'
b'\x05\xb1&\r\xa11\xca#\x9cu'
b'P4F=\xf0\xd9\xd3\r\x99\xd3'


In [69]:
proc = subprocess.Popen(['sleep', '10'])
try:
    proc.communicate(timeout=0.1)

except subprocess.TimeoutExpired:
    proc.terminate()
    proc.wait()

print('Exit status', proc.poll())

-15

Exit status -15


### threading

In [74]:
def factorize(number):
    for i in range(1, number + 1):
        if number % i == 0:
            yield i

In [75]:
import time

numbers = [2139079, 1214759, 1516637, 1852285]
start = time.time()

for number in numbers:
    list(factorize(number))

end = time.time()
delta = end - start
print(f'Took {delta:.3f} seconds')

[1, 101, 21179, 2139079]

[1, 7, 13, 49, 91, 637, 1907, 13349, 24791, 93443, 173537, 1214759]

[1, 19, 79823, 1516637]

[1, 5, 271, 1355, 1367, 6835, 370457, 1852285]

Took 0.811 seconds


In [76]:
from threading import Thread

class FactorizeThread(Thread):
    def __init__(self, number):
        super().__init__()
        self.number = number

    def run(self):
        self.factors = list(factorize(self.number))
        
        
start = time.time()

threads = []
for number in numbers:
    thread = FactorizeThread(number)
    thread.start()
    threads.append(thread)
    
for thread in threads:
    thread.join()

end = time.time()
delta = end - start
print(f'Took {delta:.3f} seconds')


Took 1.229 seconds


### use cprofiler to profile your program before optimizing

In [130]:
def insertion_sort(data):
    results = [9]
    for value in data:
        print('aaa', value)
        insert_value(result, value)
    return result

def insert_value(arr, value):
    print('bbb', arr)
    for i, existing in enumerate(arr):
        print('aaaaaaaaa')
        if existing > value:
            print('xxxxxxxxx')
            arr.insert(i, value)
            return arr
    arr.append(value)
    return arr

In [131]:
insert_value([9], 12)

bbb [9]
aaaaaaaaa


[9, 12]

In [132]:
data = [3, 5, 2]
insertion_sort(data)

aaa 3
bbb CompletedProcess(args=['echo', 'Hello from the child!'], returncode=0, stdout='Hello from the child!\n', stderr='')


TypeError: 'CompletedProcess' object is not iterable

In [91]:
from random import randint
max_size = 10 ** 4
data = [randint(0, max_size) for _ in range(max_size)]
test = lambda: insertion_sort(data)

from cProfile import Profile
profiler = Profile()
profiler.runcall(test)

TypeError: 'CompletedProcess' object is not iterable

### heapq (priority queue)

In [157]:
class Book:
    def __init__(self, title, due_date):
        self.title = title
        self.due_date = due_date
        
class NoOverdueBooks(Exception):
    pass
        
def add_book(queue, book):
    queue.append(book)
    queue.sort(key=lambda x: x.due_date, reverse=True)

queue = []
add_book(queue, Book('aaaaaaa', '2019-06-07'))
add_book(queue, Book('bbbbbbb', '2019-06-05'))
add_book(queue, Book('cccccccc', '2019-06-08'))
add_book(queue, Book('ddddddd', '2019-06-03'))

queue

[<__main__.Book at 0x7f861768e070>,
 <__main__.Book at 0x7f861768e250>,
 <__main__.Book at 0x7f861768eee0>,
 <__main__.Book at 0x7f861768e5b0>]

In [145]:
def next_overdue_book(queue, now):
    if queue:
        book = queue[-1]
        if book.due_date < now:
            queue.pop()
            return book
    raise NoOverdueBooks

In [146]:
now = '2019-06-10'
found = next_overdue_book(queue, now)
print(found.title)
found = next_overdue_book(queue, now)
print(found.title)

ddddddd
bbbbbbb


In [147]:
def return_book(queue, book):
    queue.remove(book)

In [153]:
queue = []
book = Book('xxxx', '2019-06-04')
add_book(queue, book)
print('before return', [(x.title, x.due_date) for x in queue])
return_book(queue, book)
print('after return', [(x.title, x.due_date) for x in queue])

before return [('xxxx', '2019-06-04')]
after return []


In [163]:
try:
    next_overdue_book(queue, now)
except NoOverdueBooks:
    pass
else:
    assert False

In [164]:
def print_result(count, tests):
    avg_iteration = sum(tests) / len(tests)
    print(f'Count {count:>5,} takes {avg_iteration:.6f}s')
    return count, avg_iteration

In [165]:
def print_delta(before, after):
    before_count, before_time = before
    after_count, after_time = after
    growth = 1 + (after_count - before_count) / before_count
    slowdown = 1 + (after_time - before_time) / before_time
    print(f'{growth:>4.1f}x data size, {slowdown:>4.1f}x time')

In [180]:
import random
import timeit


In [175]:
a = 2.3333344
f'{a:>4.1f}' # width is 4, and 1 decimal place

' 2.3'

In [198]:
def list_overdue_benchmark(count):
    def prepare():
        to_add = list(range(count))
        random.shuffle(to_add)
        return [], to_add
    
    def run(queue, to_add):
        for n, i in enumerate(to_add):
#             if n % 100 == 0:
#                 print(f'added {n} items')
            queue.append(i)
            queue.sort(reverse=True)
            
        while queue:
            queue.pop()
            
    tests = timeit.repeat(
        setup='queue, to_add = prepare()',
        stmt=f'run(queue, to_add)', 
        globals=locals(),
        repeat=100,
        number=1)
    return print_result(count, tests)

In [199]:
baseline = list_overdue_benchmark(500)
for count in (1_000, 1_500, 2_000):
    comparison = list_overdue_benchmark(count)
    print_delta(baseline, comparison)

Count   500 takes 0.002432s
Count 1,000 takes 0.004674s
 2.0x data size,  1.9x time
Count 1,500 takes 0.007972s
 3.0x data size,  3.3x time
Count 2,000 takes 0.013013s
 4.0x data size,  5.4x time


In [204]:
from heapq import heappush
def add_book(queue, book):
    heappush(queue, book)
    
queue = []
# will fail, need total ordering on priority queue
add_book(queue, Book('xxxx', '2019-06-04'))
add_book(queue, Book('ddddddd', '2019-06-03'))

TypeError: '<' not supported between instances of 'Book' and 'Book'

In [219]:
import functools
@functools.total_ordering
class Book:
    def __init__(self, title, due_date):
        self.title = title
        self.due_date = due_date
    def __lt__(self, other):
        return self.due_date < other.due_date

queue = []
add_book(queue, Book('aaaaaaa', '2019-06-07'))
add_book(queue, Book('bbbbbbb', '2019-06-05'))
add_book(queue, Book('cccccccc', '2019-06-08'))
add_book(queue, Book('ddddddd', '2019-06-03'))

In [210]:
queue[-1].title

'aaaaaaa'

heapify is faster than sort

In [221]:
from heapq import heapify
# %timeit queue.sort()
# [i.title for i in queue]

%timeit heapify(queue)
[i.title for i in queue]

626 ns ± 5.13 ns per loop (mean ± std. dev. of 7 runs, 1000000 loops each)


['ddddddd', 'bbbbbbb', 'cccccccc', 'aaaaaaa']

### memoryview, CPython high-performance buffer protocol

In [222]:
data = b'shave and a haircut, two bits'
view = memoryview(data)
chunk = view[12:19]
chunk

<memory at 0x7f86178acf40>

In [228]:
chunk.obj
chunk.nbytes
chunk.tobytes()

b'shave and a haircut, two bits'

7

b'haircut'

In [239]:
a = '5'
b = 5
repr(a)
repr(b)
a
b
print(repr(a))  # representation
print(repr(b))

print(f'{a!r}') # !r printable version representation
print(f'{a!s}') # !r human-readable version string

"'5'"

'5'

'5'

5

'5'
5
'5'
5


### unitest

In [240]:
# utils.py
def to_str(data):
    if isinstance(data, str):
        return data
    elif isinstance(data, bytes):
        return data.decode('utf-8')
    else:
        raise TypeError('Must supply str or bytes, found: %r' % data)
    

In [243]:
# run this using python command line: python3 utils_test.py 
# or test a specific test method: python3 utils_test.py UtilsTestCase.test_to_str_bytes
from unittest import TestCase, main
from utils import to_str

class UtilsTestCase(TestCase):
    def test_to_str_bytes(self):
        self.assertEqual('hello', to_str(b'hello'))
    
    def test_to_str_str(self):
        self.assertEqual('hello', to_str('hello'))
        
    def test_failing(self):
        self.assertEqual('incorrect', to_str('hello'))
        
if __name__ == '__main__':
    main()

E
ERROR: /home/szong/ (unittest.loader._FailedTest)
----------------------------------------------------------------------
AttributeError: module '__main__' has no attribute '/home/szong/'

----------------------------------------------------------------------
Ran 1 test in 0.001s

FAILED (errors=1)


SystemExit: True

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)


In [244]:
help(TestCase)

Help on class TestCase in module unittest.case:

class TestCase(builtins.object)
 |  TestCase(methodName='runTest')
 |  
 |  A class whose instances are single test cases.
 |  
 |  By default, the test code itself should be placed in a method named
 |  'runTest'.
 |  
 |  If the fixture may be used for many test cases, create as
 |  many test methods as are needed. When instantiating such a TestCase
 |  subclass, specify in the constructor arguments the name of the test method
 |  that the instance is to execute.
 |  
 |  Test authors should subclass TestCase for their own tests. Construction
 |  and deconstruction of the test's environment ('fixture') can be
 |  implemented by overriding the 'setUp' and 'tearDown' methods respectively.
 |  
 |  If it is necessary to override the __init__ method, the base class
 |  __init__ method must always be called. It is important that subclasses
 |  should not change the signature of their __init__ method, since instances
 |  of the classes are i

find bugs and troubleshoot program using print, unittest, and interactive python debugger, when the program reaches to breakpoint, it returns a pdb shell, which is a full python shell. it allows you to inpsect all variables, and step through the rest of the program by using step, next, continue, return, quit. you can also do post mortem pdb.

In [254]:
import math

def compute_rmse(observed, ideal):
    total_err_2 = 0
    count =0
    for got, wanted in zip(observed, ideal):
        err_2 = (got - wanted) ** 2
#         breakpoint() # start the debugger here
        total_err_2 += err_2
        count += 1
        
    mean_err = total_err_2 / count
    rmse = math.sqrt(mean_err)
    return rmse

result = compute_rmse([1.8, 1.7, 3.2, 6], 
                      [2, 1.5, 3, 5])
print(result)

0.5291502622129182
