In [2]:
%run ../talktools.py

# Data Structures/Manipulation in Pure Python

Before we get into the use of 3rd-party tooling for data, let's quickly look at some useful data structures in the language itself.

### Variables

Duck-typed variables are extremely useful in Python programming, but mutability is not preferred for data containers:

In [None]:
var = 'a'  # the name `var` first refers to a str ...
var = [1, 2, "oski"]  # ... is then rebound to a list ...
def var(x):  # ... and is finally rebound to a function
    """Return ``x*2`` (doubles a number, repeats a sequence such as a str)."""
    return x*2

In [None]:
var("yo")  # is this really what we want?

### Tuples (and Lists)

Tuples are the most basic immutable data stores:

In [None]:
tup = (1, 2, "oski")
tup[2]

In [None]:
tup[2] = "bear"

In [None]:
not_tup = [1, 2, "oski"]
not_tup[2] = "bear"
not_tup

In [None]:
import math


def print_list_plus_one(inlist):
    """Add one to every element of ``inlist`` in place, then print it.

    Elements are reassigned by index, so the caller's object is modified.
    Passing an immutable sequence such as a tuple raises ``TypeError``.
    """
    indices = range(len(inlist))
    for idx in indices:
        inlist[idx] += 1
    print(f"inlist plus one is {inlist}")


mylist = [-1,  math.pi]
print_list_plus_one(mylist)

In [None]:
mylist  # Python passes references to Objects

In [None]:
mytup = (-1,  math.pi)
print_list_plus_one(mytup)

This is probably the sort of error you WANT to raise when dealing with datasets. What's a better way to accomplish the goal of the function?

In [None]:
def print_list_plus_one(inlist):
    """Print a new list holding each element of ``inlist`` plus one.

    A fresh list is built for printing, so the caller's ``inlist`` is not
    modified.
    """
    incremented = [element + 1 for element in inlist]
    print(f"inlist plus one is {incremented}")


mylist = [-1,  math.pi]
print_list_plus_one(mylist)

In [None]:
mylist

<div class="alert alert-success">
Of course when constructing a function (or method, inside classes) that we and others are going to use a lot, we should do careful type checking and provide documentation (and write unit tests!)
 </div>

In [None]:
import numbers  # built-in abstraction numerical types https://docs.python.org/3/library/numbers.html


def print_list_or_tuple_plus_one(initer):
    """
    Print a new list holding each element of `initer` incremented by one.

    The input itself is never modified.

    Parameters
    ----------
    initer : list or tuple
        A list or a tuple with zero or more elements, each of which must
        be an instance of ``numbers.Number``.

    Returns
    -------
    None

    Raises
    ------
    TypeError
        If `initer` is not a list or tuple, or if any of its elements is
        not an instance of ``numbers.Number``.

    """
    if not isinstance(initer, (list, tuple)):
        raise TypeError(
            f"Must provide a list or a tuple. Instead you provided {type(initer)}")

    # Position of the first non-numeric element, or None when all are numbers.
    first_bad_index = next(
        (position for position, element in enumerate(initer)
         if not isinstance(element, numbers.Number)),
        None,
    )
    if first_bad_index is not None:
        raise TypeError(
            "Not all elements can have the value of 1 added to them. "
            f"For example, initer[{first_bad_index}]={initer[first_bad_index]} (type={type(initer[first_bad_index])})"
        )

    print(f"inlist plus one is {[x + 1 for x in initer]}")

In [None]:
print_list_or_tuple_plus_one([-1,  math.pi])
print_list_or_tuple_plus_one([-1,  math.pi, "oski"])

<div class="alert alert-warning">
When passing data around your codebase, it's a good idea to ensure that the original data cannot be manipulated by downstream processes.
</div>


### Dictionaries 

Dictionaries provide fast (hashed) access as a key-value store, not just for numbers and lists but for any Python object (we'll use dictionaries a lot):

In [None]:
d = {"my_λ": var, "x": [1, 2, 3]}

In [None]:
list(map(d["my_λ"], d["x"]))  # call the object stored under "my_λ" on each element of d["x"]

<font color="grey">(see section [5.14 Deitel & Deitel from O'Reilly](https://learning.oreilly.com/library/view/intro-to-python/9780135404799/xhtml/fileP70010164470000000000000000024AD.xhtml#P70010164470000000000000000024AD) for more on map, reduce, filter).</font>

### Arrays (built-in module `array`)


> defines an object type which can compactly represent an array of basic values: characters, integers, floating point numbers. Arrays are sequence types and **behave very much like lists**, except that the type of objects stored in them is constrained.

| code | meaning | minimum size in bytes |
|------|---------|-----------------------|
|  b   |  signed char | 1  |
|  l   |  signed long int | 4  |
|  L   |  unsigned long int | 4  |
|  d   |  double | 8  |
|  f   |  float | 4  |
|  i   |  signed int | 2  |

-- https://docs.python.org/3/library/array.html#module-array

In [None]:
from array import array

my_array = array('l', [1, 2, 3, 4, 5])
my_array

In [None]:
[x + 1 for x in my_array]  # acts mostly like lists

In [None]:
from sys import getsizeof

# get the memory size in bytes
getsizeof(array('b',list(range(128))))

In [None]:
getsizeof(list(range(128)))

In [None]:
array('b',list(range(128))).tolist() == list(range(128))

In [None]:
# Write the values 0..127 as signed bytes (type code 'b') in raw binary form.
# The context manager closes the file even if tofile() raises.
with open("my_array_file.dat", "wb") as f:
    array('b',list(range(128))).tofile(f)

In [None]:
!cat my_array_file.dat

Reading it back in, we need to know the format we wish to use:

In [None]:
# An empty array whose type code ('b') tells fromfile() how to interpret
# the raw bytes it reads.
my_bytes = array("b")
# The context manager closes the file even if fromfile() raises.
with open("my_array_file.dat", "rb") as f:
    my_bytes.fromfile(f, 5)  # read only the first 5 items
print(my_bytes)

In [None]:
my_bytes *  2  # not really what we'd want...

### dataclasses

> Data Classes can be thought of as "mutable namedtuples with defaults".

-- https://www.python.org/dev/peps/pep-0557/

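We use `dataclass` as a **decorator** on a class, which gives us a number of methods out of the box. It allows us to (loosely) type a complex object: the annotations declare the intended type of each field, but Python does not enforce them at runtime.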

In [None]:
from dataclasses import dataclass, field
from datetime import date

In [None]:
@dataclass
class Series:
    """A TV series record whose ``total_shows`` is derived from other fields."""

    name: str
    release: date
    networks: list[str]
    episodes_per_season: int = 13
    seasons: int = 1
    # Not an __init__ argument; assigned in __post_init__.
    total_shows: int = field(init=False)
    have_binged: bool = False

    def __post_init__(self):
        """Fill in ``total_shows`` once the generated ``__init__`` has run."""
        episode_total = self.episodes_per_season * self.seasons
        self.total_shows = episode_total

We get a `__repr__` for free:

In [None]:
station_11 = Series(name="Station 11", release=date(2022, 1, 1), networks=["HBO Max"], episodes_per_season=10)
station_11

In [None]:
station_11.have_binged = True

In [None]:
station_11.scary = False

In [None]:
station_11

In [None]:
st = Series(name="Stranger Things", release=date(2012, 1, 1), 
            networks=["Netflix"], episodes_per_season=13, seasons=3, have_binged=True)

In [None]:
@dataclass(frozen=True)
class Series:
    """A TV series record whose fields cannot be reassigned after creation.

    ``total_shows`` is derived from ``episodes_per_season`` and ``seasons``
    and is not accepted by ``__init__``. Assigning to any field on an
    instance raises ``dataclasses.FrozenInstanceError``.
    """

    name: str
    release: date
    # frozen=True only blocks rebinding the attribute; the list object
    # itself can still be mutated in place.
    networks: list[str]
    episodes_per_season: int = 13
    seasons: int = 1
    # Not an __init__ argument; assigned in __post_init__.
    total_shows: int = field(init=False)
    have_binged: bool = False

    def __post_init__(self):
        """Set the derived ``total_shows`` field.

        Without this the attribute would never exist, and ``repr()``,
        ``==`` and ``.total_shows`` would raise ``AttributeError``.
        """
        # Normal assignment is blocked on a frozen dataclass, so go through
        # object.__setattr__ to initialise the derived field.
        object.__setattr__(
            self, "total_shows", self.episodes_per_season * self.seasons)

In [None]:
station_11 = Series(name="Station 11", release=date(2022, 1, 1), networks=["HBO Max"], episodes_per_season=10)
station_11.have_binged = True

In [None]:
%load_ext watermark

In [None]:
%watermark --iv -g -r