## Python Containers

In [1]:
n = 1_000_000  # _ lets you separate thousands

### List vs string concatenation

In [2]:
def s_concat():
    """Concatenate the decimal strings of 0..n-1 by growing one string with ``+=``.

    Reads the module-level ``n`` and returns the resulting string.
    """
    text = ""
    for piece in map(str, range(n)):
        text += piece
    return text

In [3]:
def s_concat2():
    """Concatenate the decimal strings of 0..n-1 by collecting them in a list and joining once.

    Reads the module-level ``n`` and returns the resulting string.
    """
    pieces = [str(number) for number in range(n)]
    return "".join(pieces)

In [4]:
s_concat() == s_concat2()

True

In [5]:
%timeit -r 10 s_concat()

357 ms ± 52 ms per loop (mean ± std. dev. of 10 runs, 1 loop each)


In [6]:
%timeit -r 10 s_concat2()

288 ms ± 26.7 ms per loop (mean ± std. dev. of 10 runs, 1 loop each)


### Tuple unpacking

In [7]:
a = 1
b = 2
a, b = b, a
a, b

(2, 1)

### Sets vs lists for looking up values

In [9]:
numbers = list(range(1, n+1))
numbers_set = set(numbers)

In [10]:
len(numbers) == len(numbers_set)

True

In [11]:
def list_lookup(val) -> bool:
    """Return whether ``val`` is a member of the module-level ``numbers``."""
    return val in numbers

In [12]:
def set_lookup(val) -> bool:
    """Return whether ``val`` is a member of the module-level ``numbers_set``."""
    return val in numbers_set

In [13]:
list_lookup(99)

True

In [14]:
set_lookup(99)

True

In [15]:
%timeit -r 10 list_lookup(256)

3.07 µs ± 211 ns per loop (mean ± std. dev. of 10 runs, 100,000 loops each)


In [16]:
%timeit -r 10 set_lookup(256)

92.4 ns ± 0.937 ns per loop (mean ± std. dev. of 10 runs, 10,000,000 loops each)


In [17]:
numbers = [1, 9, 2, 3, 4, 3, 4, 4, 5, 9]

### Sets are unordered + unique / set operations

In [18]:
set(numbers)

{1, 2, 3, 4, 5, 9}

In [19]:
a = {1, 2, 3}
b = {3, 4, 5}

In [20]:
a & b

{3}

In [21]:
a ^ b

{1, 2, 4, 5}

In [22]:
a - b

{1, 2}

In [24]:
dir(b)

['__and__',
 '__class__',
 '__class_getitem__',
 '__contains__',
 '__delattr__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__iand__',
 '__init__',
 '__init_subclass__',
 '__ior__',
 '__isub__',
 '__iter__',
 '__ixor__',
 '__le__',
 '__len__',
 '__lt__',
 '__ne__',
 '__new__',
 '__or__',
 '__rand__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__ror__',
 '__rsub__',
 '__rxor__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__sub__',
 '__subclasshook__',
 '__xor__',
 'add',
 'clear',
 'copy',
 'difference',
 'difference_update',
 'discard',
 'intersection',
 'intersection_update',
 'isdisjoint',
 'issubset',
 'issuperset',
 'pop',
 'remove',
 'symmetric_difference',
 'symmetric_difference_update',
 'union',
 'update']

In [25]:
c = set()  # empty set != {} - that would be dict

In [26]:
c.add(1)

In [27]:
c

{1}

In [28]:
c.pop()

1

In [29]:
c

set()

In [30]:
# immutable set
c = frozenset([1, 2, 2])

In [31]:
c

frozenset({1, 2})

In [32]:
1 in c

True

In [33]:
c.add(3)

AttributeError: 'frozenset' object has no attribute 'add'

In [34]:
c.pop()

AttributeError: 'frozenset' object has no attribute 'pop'

In [35]:
dir(c)

['__and__',
 '__class__',
 '__class_getitem__',
 '__contains__',
 '__delattr__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__iter__',
 '__le__',
 '__len__',
 '__lt__',
 '__ne__',
 '__new__',
 '__or__',
 '__rand__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__ror__',
 '__rsub__',
 '__rxor__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__sub__',
 '__subclasshook__',
 '__xor__',
 'copy',
 'difference',
 'intersection',
 'isdisjoint',
 'issubset',
 'issuperset',
 'symmetric_difference',
 'union']

In [36]:
c.difference({3, 2})

frozenset({1})

### Dictionaries

In [37]:
names = 'bob julian tim sara'.split()
ages = '11 22 33 44'.split()
d = dict(zip(names, ages))

In [38]:
d

{'bob': '11', 'julian': '22', 'tim': '33', 'sara': '44'}

In [39]:
d["bob"]

'11'

In [40]:
d.keys()

dict_keys(['bob', 'julian', 'tim', 'sara'])

In [41]:
d.values()

dict_values(['11', '22', '33', '44'])

In [42]:
d.items()

dict_items([('bob', '11'), ('julian', '22'), ('tim', '33'), ('sara', '44')])

In [43]:
d["erik"]

KeyError: 'erik'

In [44]:
d.get("erik", "not found")

'not found'

In [45]:
d["jones"] = 55

In [46]:
# dicts keep insertion order (a language guarantee since Python 3.7);
# collections.OrderedDict makes that intent more explicit
d

{'bob': '11', 'julian': '22', 'tim': '33', 'sara': '44', 'jones': 55}

## collections

### namedtuple

In [47]:
!pip install feedparser

You should consider upgrading via the '/Users/bbelderbos/code/notebooks/venv/bin/python3.10 -m pip install --upgrade pip' command.


In [56]:
from collections import namedtuple

import feedparser

# cached version to have predictable results for testing
FEED_URL = "https://bites-data.s3.us-east-2.amazonaws.com/steam_gaming.xml"

Game = namedtuple('Game', 'title link')


def get_games():
    """Parse the feed at FEED_URL and return one Game(title, link) per entry, as a list."""
    parsed = feedparser.parse(FEED_URL)
    games = []
    for entry in parsed.entries:
        games.append(Game(entry.title, entry.link))
    return games

In [57]:
games = get_games()

In [58]:
first = games[0]

In [61]:
first.link

'http://store.steampowered.com/news/31695/'

In [62]:
from typing import NamedTuple

class Karma(NamedTuple):
    """Typed named tuple with three fields: giver, receiver and score."""
    # Field order below is also the positional order of the constructor.
    giver: str
    receiver: str
    score: int

In [63]:
karma = Karma('bob', 'julian', 5)

In [64]:
karma.giver, karma.receiver, karma.score

('bob', 'julian', 5)

In [65]:
karma.__annotations__

{'giver': str, 'receiver': str, 'score': int}

### defaultdict

In [66]:
from collections import defaultdict

In [67]:
data = """Tim,ID
Sara,BR
Thelma,CN
Chris,RU
Fina,ID
Juliana,SE
Roberto,CN
Mario,PL
Paul,CN""".splitlines()

In [68]:
data

['Tim,ID',
 'Sara,BR',
 'Thelma,CN',
 'Chris,RU',
 'Fina,ID',
 'Juliana,SE',
 'Roberto,CN',
 'Mario,PL',
 'Paul,CN']

In [69]:
people_by_country = {}

In [73]:
dir(people_by_country)

['__class__',
 '__class_getitem__',
 '__contains__',
 '__delattr__',
 '__delitem__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getitem__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__ior__',
 '__iter__',
 '__le__',
 '__len__',
 '__lt__',
 '__ne__',
 '__new__',
 '__or__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__reversed__',
 '__ror__',
 '__setattr__',
 '__setitem__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 'clear',
 'copy',
 'fromkeys',
 'get',
 'items',
 'keys',
 'pop',
 'popitem',
 'setdefault',
 'update',
 'values']

In [77]:
# Group names by country code using a plain dict.
people_by_country = {}
for row in data:
    # each row is "name,country"
    name, country = row.split(",")
    # setdefault returns the list already stored for this country,
    # or stores a new empty list under that key and returns it
    bucket = people_by_country.setdefault(country, [])
    bucket.append(name)

In [81]:
people_by_country = {}

In [83]:
people_by_country = defaultdict(list)

In [84]:
for row in data:
    name, country = row.split(",")
    people_by_country[country].append(name)

In [85]:
people_by_country

defaultdict(list,
            {'ID': ['Tim', 'Fina'],
             'BR': ['Sara'],
             'CN': ['Thelma', 'Roberto', 'Paul'],
             'RU': ['Chris'],
             'SE': ['Juliana'],
             'PL': ['Mario']})

In [89]:
import random
numbers = [random.randint(1, 10) for _ in range(1000)]

In [90]:
from collections import Counter

In [91]:
cnt = Counter(numbers)

In [92]:
cnt

Counter({10: 90,
         7: 113,
         6: 107,
         8: 88,
         3: 94,
         4: 109,
         5: 94,
         2: 95,
         9: 97,
         1: 113})

In [93]:
cnt.most_common(5)

[(7, 113), (1, 113), (4, 109), (6, 107), (9, 97)]

In [94]:
from collections import Counter
import os
import re
import urllib.request

# Download targets live in the directory named by the TMP environment
# variable, falling back to /tmp when it is not set.
tmp = os.getenv("TMP", "/tmp")
stopwords_file = os.path.join(tmp, 'stopwords')
harry_text = os.path.join(tmp, 'harry')
# Fetch both files; this runs (and downloads again) every time the cell executes.
urllib.request.urlretrieve(
    'https://bites-data.s3.us-east-2.amazonaws.com/stopwords.txt',
    stopwords_file
)
urllib.request.urlretrieve(
    'https://bites-data.s3.us-east-2.amazonaws.com/harry.txt',
    harry_text
)

def get_harry_most_common_word(n=10):
    """Return the ``n`` most frequent non-stopword words of the downloaded text.

    The stopword list is read from ``stopwords_file`` and the text from
    ``harry_text``. The text is lowercased and split on whitespace, every
    run of non-word characters is removed from each token, and empty
    tokens and stopwords are dropped before counting.

    Returns a list of ``(word, count)`` tuples from ``Counter.most_common``.
    """
    with open(stopwords_file) as stop_fh:
        stopwords = set(stop_fh.read().strip().lower().split('\n'))

    with open(harry_text) as text_fh:
        raw_tokens = text_fh.read().lower().split()

    non_word = re.compile(r'\W+')  # runs of non-word characters
    counts = Counter()
    for token in raw_tokens:
        cleaned = non_word.sub(r'', token)
        # skip tokens that were pure punctuation, and stopwords
        if cleaned.strip() and cleaned not in stopwords:
            counts[cleaned] += 1
    return counts.most_common(n)

In [95]:
get_harry_most_common_word()

[('dursley', 45),
 ('dumbledore', 36),
 ('said', 32),
 ('mr', 30),
 ('professor', 30),
 ('mcgonagall', 25),
 ('didnt', 22),
 ('mrs', 21),
 ('people', 21),
 ('cat', 20)]

### ChainMap

In [97]:
# https://docs.python.org/3/library/collections.html#collections.ChainMap
baseline = {'music': 'bach', 'art': 'rembrandt'}
adjustments = {'art': 'van gogh', 'opera': 'carmen'}

In [98]:
baseline | adjustments  # new dict merge syntax >= 3.9

{'music': 'bach', 'art': 'van gogh', 'opera': 'carmen'}

In [99]:
from collections import ChainMap

In [100]:
ChainMap(baseline, adjustments)

ChainMap({'music': 'bach', 'art': 'rembrandt'}, {'art': 'van gogh', 'opera': 'carmen'})

In [101]:
# Lookups search the maps left to right, so baseline's 'art' wins over adjustments'.
# Typical real-world layering: ChainMap(command_line_args, os.environ, defaults)
ChainMap(baseline, adjustments)['art']

'rembrandt'

In [102]:
### deque
from collections import deque

In [103]:
s = "this is a string"

In [104]:
d = deque(s)

In [105]:
d.rotate(3)

In [107]:
d

deque(['i',
       'n',
       'g',
       't',
       'h',
       'i',
       's',
       ' ',
       'i',
       's',
       ' ',
       'a',
       ' ',
       's',
       't',
       'r'])

In [108]:
d.rotate(-3)

In [109]:
d

deque(['t',
       'h',
       'i',
       's',
       ' ',
       'i',
       's',
       ' ',
       'a',
       ' ',
       's',
       't',
       'r',
       'i',
       'n',
       'g'])

In [110]:
s[3:] + s[:3]

's is a stringthi'

### deque vs list: removing and inserting near the front

In [111]:
lst = list(range(1_000_000))
deq = deque(range(1_000_000))

In [112]:
len(deq)

1000000

In [113]:
def insert_and_delete(ds):
    """Ten times over, remove a random value in 0..99 from ``ds`` and insert it back.

    The value picked doubles as the index it is re-inserted at. ``ds`` must
    provide ``remove(value)`` and ``insert(index, value)``; it is modified in
    place and nothing is returned.
    """
    candidates = range(100)
    for _ in range(10):
        value = random.choice(candidates)
        ds.remove(value)
        ds.insert(value, value)

In [114]:
%timeit -r 10 insert_and_delete(lst)

26.2 ms ± 4.16 ms per loop (mean ± std. dev. of 10 runs, 10 loops each)


In [115]:
%timeit -r 10 insert_and_delete(deq)

18.2 µs ± 1.08 µs per loop (mean ± std. dev. of 10 runs, 100,000 loops each)


### array

In [121]:
import sys
from array import array

In [122]:
numbers = list(range(1, 21))

In [123]:
a_numbers = array("b", numbers)

In [124]:
sys.getsizeof(numbers)

216

In [125]:
sys.getsizeof(a_numbers)

100

### enum

In [126]:
from enum import Enum, IntEnum

In [127]:
class BiteLevel(Enum):
    """Bite levels as a plain Enum; members are not ints, so ``BiteLevel.INTRO == 1`` is False."""
    INTRO = 1
    BEGINNER = 2
    INTERMEDIATE = 3
    ADVANCED = 4

In [128]:
BiteLevel.INTRO

<BiteLevel.INTRO: 1>

In [129]:
BiteLevel.INTRO == 1

False

In [131]:
class BiteLevel(IntEnum):
    """Bite levels as an IntEnum; members are ints, so ``BiteLevel.INTRO == 1`` is True."""
    INTRO = 1
    BEGINNER = 2
    INTERMEDIATE = 3
    ADVANCED = 4

In [132]:
BiteLevel.INTRO == 1

True

### bisect

In [133]:
from bisect import insort

In [134]:
items = [3, 5, 7]

In [135]:
insort(items, 6)

In [137]:
items

[3, 5, 6, 7]

In [138]:
insort(items, 4)

In [139]:
items

[3, 4, 5, 6, 7]

### heapq

In [141]:
import heapq

In [142]:
# https://realpython.com/python-data-structures/#heapq-list-based-binary-heaps
q = []

In [143]:
heapq.heappush(q, (2, "code"))

In [144]:
heapq.heappush(q, (1, "eat"))

In [145]:
heapq.heappush(q, (3, "sleep"))

In [146]:
while q:
    next_item = heapq.heappop(q)
    print(next_item)

(1, 'eat')
(2, 'code')
(3, 'sleep')


In [147]:
numbers = random.sample(range(50), 10)

In [148]:
numbers

[41, 5, 18, 3, 15, 16, 22, 2, 37, 30]

In [149]:
heapq.nlargest(3, numbers)

[41, 37, 30]

In [150]:
heapq.nsmallest(3, numbers)

[2, 3, 5]

In [151]:
# sorted() builds a new list, so `numbers` keeps the order displayed earlier
sorted(numbers)[:3]

[2, 3, 5]

### copy

In [152]:
from copy import copy, deepcopy

In [153]:
items = [dict(id=1, name='laptop')]

In [154]:
items2 = copy(items)

In [155]:
items[0][1] = "mac"

In [156]:
items

[{'id': 1, 'name': 'laptop', 1: 'mac'}]

In [157]:
items2  # oops

[{'id': 1, 'name': 'laptop', 1: 'mac'}]

In [158]:
items = [dict(id=1, name='laptop')]
items2 = deepcopy(items)

In [159]:
items[0][1] = "mac"

In [160]:
items

[{'id': 1, 'name': 'laptop', 1: 'mac'}]

In [161]:
items2

[{'id': 1, 'name': 'laptop'}]

### pprint

In [162]:
from pprint import pprint as pp

In [163]:
feed = feedparser.parse(FEED_URL)

In [164]:
feed

{'bozo': True,
 'entries': [{'title': 'Midweek Madness - RiME, 33% Off',
   'title_detail': {'type': 'text/plain',
    'language': None,
    'base': 'https://bites-data.s3.us-east-2.amazonaws.com/steam_gaming.xml',
    'value': 'Midweek Madness - RiME, 33% Off'},
   'links': [{'rel': 'alternate',
     'type': 'text/html',
     'href': 'http://store.steampowered.com/news/31695/'}],
   'link': 'http://store.steampowered.com/news/31695/',
   'id': 'http://store.steampowered.com/news/31695/',
   'guidislink': False,
   'published': 'Tue, 15 Aug 2017 10:51:00 -0700',
   'published_parsed': time.struct_time(tm_year=2017, tm_mon=8, tm_mday=15, tm_hour=17, tm_min=51, tm_sec=0, tm_wday=1, tm_yday=227, tm_isdst=0),
   'summary': 'Save 33% on <a href="http://store.steampowered.com/app/493200/">RiME</a> during this week\'s Midweek Madness*!<br /><br />A land of discovery stretches out before you. Explore the beautiful yet rugged world of RiME, a single-player puzzle adventure. <br /><br />*Offer e

In [165]:
pp(feed)

{'bozo': True,
 'bozo_exception': CharacterEncodingOverride('document declared as us-ascii, but parsed as utf-8'),
 'encoding': 'utf-8',
 'entries': [{'guidislink': False,
              'id': 'http://store.steampowered.com/news/31695/',
              'link': 'http://store.steampowered.com/news/31695/',
              'links': [{'href': 'http://store.steampowered.com/news/31695/',
                         'rel': 'alternate',
                         'type': 'text/html'}],
              'published': 'Tue, 15 Aug 2017 10:51:00 -0700',
              'published_parsed': time.struct_time(tm_year=2017, tm_mon=8, tm_mday=15, tm_hour=17, tm_min=51, tm_sec=0, tm_wday=1, tm_yday=227, tm_isdst=0),
              'summary': 'Save 33% on <a '
                         'href="http://store.steampowered.com/app/493200/">RiME</a> '
                         "during this week's Midweek Madness*!<br /><br />A "
                         'land of discovery stretches out before you. Explore '
                   