# Bonus lecture Python Coding

1. Part 1: Classes and configs
    1. Use (yaml) configuration files
    1. Organize code in classes
    1. Side point: Use hashing for data and configs
    1. Side point: Copying
1. Part 2: Data in python
    1. Sparse matrices
    1. Numpy vs pandas
    1. Use dicts
1. Part 3: Coding ~guidelines~ opinions

<br>
<br>

## Importing modules

<img src="https://raw.githubusercontent.com/sbstn-gbl/learning-from-big-data/master/source/_static/img/libs.png" width="900"/>

In [None]:
import copy
import hashlib
import json

import bonus_lib
import dict_hash
import numpy as np  # this is the standard way to abbreviate numpy
import pandas as pd  # this is the standard way to abbreviate pandas
import scipy.sparse  # import a submodule
import yaml

In [None]:
import importlib

importlib.reload(bonus_lib)

In [None]:
bonus_lib.read_yaml

In [None]:
bonus_lib

In [None]:
bonus_lib.my_variable

In [None]:
bonus_lib.my_other_variable

In [None]:
import bonus_lib

In [None]:
bonus_lib.my_third_variable

<br>
<br>

## 1. Classes and configs

### YAML files

In [None]:
def read_yaml(f):
    """Parse the YAML file at path ``f`` and return the loaded object."""
    with open(f, "r") as yaml_file:
        return yaml.safe_load(yaml_file)

In [None]:
config = read_yaml("bonus-config.yaml")
config  # prints the dict

In [None]:
print(yaml.dump(config, sort_keys=False))  # prints in yaml format

### Classes

In [None]:
class Config:
    """Project configuration read from a file or wrapped around a dict.

    Parameters
    ----------
    x : str or dict
        A ``str`` is treated as the path of a config file that is parsed with
        the reader for ``method``. Any other value is used as the config
        mapping as-is (no copy is made).
    method : str
        Serialization format, ``"yaml"`` or ``"json"``. Selects the reader,
        writer and printer functions.
    indent : int
        Indentation passed to the printer in ``print_config``.

    Raises
    ------
    TypeError
        If ``x`` is ``None``.
    KeyError
        If ``method`` is unknown, or if the config lacks one of the keys
        ``"author"``, ``"project"`` or ``"version"``.
    """

    def __init__(self, x=None, method="yaml", indent=4):
        self.method = method
        self.indent = indent
        self.reader = self._reader()  # define reader, this is a function!
        self.writer = self._writer()  # define writer, this is a function!
        self.printer = self._printer()  # define printer, this is a function!

        if x is None:
            # Fail early with a readable message instead of the opaque
            # "'NoneType' object is not subscriptable" further down.
            raise TypeError("Config needs a file path (str) or a config dict, got None")

        if isinstance(x, str):
            self.file = x
            self.read_config()
        else:
            self.file = None
            self.config = x
            # The hash must also exist for dict-based configs, otherwise
            # write_config() fails with AttributeError.
            self.hash = dict_hash.sha256(self.config)

        self.author = self.config["author"]
        self.project = self.config["project"]
        self.version = self.config["version"]

    def __repr__(self):
        return (
            f"{type(self).__name__}(file={self.file!r}, "
            f"method={self.method!r}, hash={self.hash!r})"
        )

    def __getitem__(self, x):
        """Return the config entry stored under key ``x``."""
        return self.config[x]

    def read_config(self):
        """Load ``self.file`` into ``self.config`` and refresh ``self.hash``."""
        with open(self.file, "r") as con:
            config = self.reader(con)
        self.config = config
        self.hash = dict_hash.sha256(self.config)

    def write_config(self, path=""):
        """Write the config to ``{path}{hash}.{method}``.

        ``path`` is used as a plain string prefix, so a directory needs its
        trailing separator.
        """
        with open(f"{path}{self.hash}.{self.method}", "w") as con:
            self.writer(self.config, con, sort_keys=False)

    def print_config(self):
        """Print the config serialized by the printer for ``self.method``."""
        print(self.printer(self.config, indent=self.indent, sort_keys=False))

    def _reader(self):
        """Return the function that parses an open file for ``self.method``."""
        reader_methods = {
            "yaml": yaml.safe_load,
            "json": json.load,
        }
        return reader_methods[self.method]

    def _writer(self):
        """Return the function that dumps a config to an open file."""
        writer_methods = {
            "yaml": yaml.dump,
            "json": json.dump,
        }
        return writer_methods[self.method]

    def _printer(self):
        """Return the function that serializes a config to a string."""
        printer_methods = {
            "yaml": yaml.dump,
            "json": json.dumps,
        }
        return printer_methods[self.method]

In [None]:
my_config = Config("bonus-config.yaml")

In [None]:
my_config

In [None]:
my_config.author

In [None]:
my_config.version

In [None]:
my_config.project

In [None]:
my_config.print_config()

In [None]:
my_config.write_config()

### Hashing

Trace content of configs (and data) files in a single string!

In [None]:
my_config.hash

In [None]:
my_config.hash

In [None]:
my_config.config["data"]

In [None]:
my_config["data"]  # using __getitem__

### Copying

Beware of side effects! Read more about references vs. copies in [this article](https://levelup.gitconnected.com/understanding-reference-and-copy-in-python-c681341a0cd8).

<div class="alert-danger">BAD</div>

In [None]:
second_config = my_config.config
second_config["author"] = "gui"

In [None]:
my_config.config  # that's bad!

In [None]:
id(second_config) == id(my_config.config), id(second_config), id(my_config.config)

<div class="alert-success">GOOD</div>

In [None]:
my_config.config["author"] = "sebastian"  # fix the mistake
second_config = copy.deepcopy(my_config.config)  # copy
second_config["author"] = "gui"

In [None]:
id(second_config) == id(my_config.config), id(second_config), id(my_config.config)

In [None]:
my_config.config  # that's good :)

In [None]:
my_config_2 = Config(second_config)

In [None]:
my_config_2.author

<br>
<br>

## 2. Data in python

In [None]:
my_config["dataset"]

In [None]:
my_config["data"]

In [None]:
data_config = my_config["data"][my_config["dataset"]]
data_config

In [None]:
np.random.seed(1234)

In [None]:
data_df = pd.DataFrame(
    {
        "i": np.random.choice(data_config["D1"], data_config["N"], replace=False),
        "j": np.random.choice(data_config["D2"], data_config["N"], replace=False),
        "value": 1,
    }
).drop_duplicates()

In [None]:
# Dense D1 x D2 indicator matrix: 1 at every sampled (i, j) pair, 0 elsewhere.
# Uses the builtin int as dtype; the np.int alias was removed in NumPy 1.24.
data_arr = np.zeros((data_config["D1"], data_config["D2"]), dtype=int)
data_arr[data_df["i"], data_df["j"]] = data_df["value"]
data_arr

<div class="alert-danger">BAD</div>

In [None]:
def generate_data(config, seed):
    """Return a random dense 0/1 matrix described by a config dict.

    The function reads ``config["D1"]`` (rows), ``config["D2"]`` (columns) and
    ``config["N"]`` (number of sampled cells), which ties it to the layout of
    the config dict.

    Parameters
    ----------
    config : dict
        Mapping with the integer entries ``"D1"``, ``"D2"`` and ``"N"``.
    seed : int
        Seed for NumPy's global random number generator.

    Returns
    -------
    numpy.ndarray
        Integer array of shape ``(D1, D2)`` with 1 at the sampled cells.
    """
    np.random.seed(seed)
    data_df = pd.DataFrame(
        {
            "i": np.random.choice(config["D1"], config["N"]),
            "j": np.random.choice(config["D2"], config["N"]),
            "value": 1,
        }
    ).drop_duplicates()
    # builtin int instead of np.int, which was removed in NumPy 1.24
    data_arr = np.zeros((config["D1"], config["D2"]), dtype=int)
    data_arr[data_df["i"], data_df["j"]] = data_df["value"]
    return data_arr


generate_data(my_config["data"][my_config["dataset"]], seed=1234)

<div class="alert-success">GOOD</div>

In [None]:
def generate_data(D1, D2, N, seed=1234, *args, **kwargs):
    """Return a random dense 0/1 matrix of shape ``(D1, D2)``.

    Parameters
    ----------
    D1, D2 : int
        Number of rows and columns.
    N : int
        Number of (row, column) pairs to sample; duplicates are dropped, so
        the matrix holds at most ``N`` ones.
    seed : int
        Seed for NumPy's global random number generator.
    *args, **kwargs
        Accepted and ignored, so a config dict with additional entries can be
        unpacked into the call.

    Returns
    -------
    numpy.ndarray
        Integer array with 1 at the sampled cells and 0 elsewhere.
    """
    np.random.seed(seed)
    data_df = pd.DataFrame(
        {
            "i": np.random.choice(D1, N),
            "j": np.random.choice(D2, N),
            "value": 1,
        }
    ).drop_duplicates()
    # builtin int instead of np.int, which was removed in NumPy 1.24
    data_arr = np.zeros((D1, D2), dtype=int)
    data_arr[data_df["i"], data_df["j"]] = data_df["value"]
    return data_arr


generate_data(**my_config["data"]["small"])

In [None]:
my_array = generate_data(**my_config["data"]["small"])
my_array.T.dot(my_array)

### Sparse matrices

In [None]:
my_array_large = generate_data(**my_config["data"]["large"])
my_array_large.shape

In [None]:
def generate_data_sparse(D1, D2, N, seed=1234):
    """Return a random sparse 0/1 matrix of shape ``(D1, D2)`` in CSR format.

    Parameters
    ----------
    D1, D2 : int
        Number of rows and columns.
    N : int
        Number of (row, column) pairs to sample; duplicates are dropped, so
        the matrix stores at most ``N`` ones.
    seed : int
        Seed for NumPy's global random number generator.

    Returns
    -------
    scipy.sparse.csr_matrix
        Sparse matrix with 1 at the sampled cells.
    """
    # Seed the generator so the ``seed`` argument actually takes effect and
    # repeated calls return the same matrix.
    np.random.seed(seed)
    data_df = pd.DataFrame(
        {
            "i": np.random.choice(D1, N),
            "j": np.random.choice(D2, N),
            "value": 1,
        }
    ).drop_duplicates()
    my_array_sparse = scipy.sparse.csr_matrix(
        (
            data_df["value"].to_numpy(),
            (data_df["i"].to_numpy(), data_df["j"].to_numpy()),
        ),
        shape=(D1, D2),
    )
    return my_array_sparse


my_array_sparse = generate_data_sparse(**my_config["data"]["large"])
my_array_sparse

In [None]:
%%timeit
my_array_large.T.dot(my_array_large)

In [None]:
%%timeit
my_array_sparse.T.dot(my_array_sparse)

## Do we like pandas?

### Comparison pandas vs. numpy 

In [None]:
# Column "a": each key 0..999 repeated a random number of times drawn from 4..9.
df = pd.DataFrame({"a": np.repeat(range(1000), np.random.choice(range(4, 10), 1000))})
# Column "b": one random value in 0..499 per row.
df["b"] = np.random.choice(range(500), df.shape[0])
# Drop repeated (a, b) pairs, then sort so rows with equal "a" are contiguous.
df = df.drop_duplicates()
df = df.sort_values(["a", "b"])

# Sanity checks on group sizes and on the value range of "b".
# NOTE(review): the first check only looks at the largest group; use .min()
# if every group is meant to have at least 4 rows after de-duplication.
assert df.groupby("a")["b"].count().max() >= 4
assert df.groupby("a")["b"].count().max() < 10
assert df["b"].max() < 500

In [None]:
df

In [None]:
def df_to_list_pandas(x):
    """Collect the "b" values of every "a" group into a list of lists."""
    b_by_key = x.groupby("a")["b"]
    return b_by_key.agg(list).to_list()

In [None]:
def df_to_list_numpy(x):
    """Collect the "b" values of every "a" group into a list of lists.

    Splits the "b" column at the first occurrence of each distinct "a" value.
    """
    group_keys, group_values = x[["a", "b"]].values.T
    _, first_positions = np.unique(group_keys, return_index=True)
    chunks = np.split(group_values, first_positions)
    # The chunk before the first split position is dropped.
    return [list(chunk) for chunk in chunks[1:]]

In [None]:
list_1 = df_to_list_pandas(df)
list_2 = df_to_list_numpy(df)
assert list_1 == list_2

### Runtime

In [None]:
%%timeit
df_to_list_pandas(df)

In [None]:
%%timeit
df_to_list_numpy(df)

### Line profiler

In [None]:
%load_ext line_profiler

In [None]:
%lprun -f df_to_list_pandas df_to_list_pandas(df)

In [None]:
%lprun -f df_to_list_numpy df_to_list_numpy(df)

## dicts

<img src="https://raw.githubusercontent.com/sbstn-gbl/learning-from-big-data/master/source/_static/img/dict.png" width="400"/>

In [None]:
def generate_data(N):
    """Build the same random key/value data as a DataFrame and as a dict.

    The keys are the integers ``0..N-1`` in shuffled order and the values are
    uniform draws from [0, 1). The generator is seeded with 1234 on each call.

    Returns
    -------
    tuple
        ``(sample_df, sample_dict)``: a DataFrame with the columns ``"key"``
        and ``"value"``, and a dict mapping each key to its value.
    """
    np.random.seed(1234)
    keys = list(range(N))
    np.random.shuffle(keys)
    values = np.random.uniform(0, 1, N)

    lookup = dict(zip(keys, values))
    frame = pd.DataFrame({"key": keys, "value": values})
    return frame, lookup

In [None]:
sample_df_small, sample_dict_small = generate_data(N=10000)

In [None]:
sample_df_large, sample_dict_large = generate_data(N=10000000)

### Runtime

#### `pandas`

In [None]:
%%timeit
sample_df_small.loc[sample_df_small["key"] == 1234, "value"].values[0]

In [None]:
%%timeit
sample_df_large.loc[sample_df_large["key"] == 1234, "value"].values[0]

#### `dict`

In [None]:
%%timeit
sample_dict_small[1234]

In [None]:
%%timeit
sample_dict_large[1234]

Some more (theoretical) background:

<img src="https://raw.githubusercontent.com/sbstn-gbl/learning-from-big-data/master/source/_static/img/big-o.png" width="800"/>

Based on this [article](https://nedbatchelder.com/text/bigo.html), read it for some more details.

<br>
<br>

## 3. Python coding ~guidelines~ opinions

This is just an overview, and much of the content is only a personal opinion :) Feel free to disagree or to make up your own guidelines. I added some additional resources at the end of the document that will help you form your opinion.

### Some general thoughts

#### Good vs. bad code

Martin Fowler

> Any fool can write code that a computer can understand. Good programmers write code that humans can understand.

Good code and bad code may both compile and run correctly. But bad code causes problems during development, debugging, and modification. In your projects, no matter how well your program runs, someone will have to read or alter your code at some point.

- Good code is not clever. It does things in straightforward, obvious ways.
- Good code is developed in small, easy to read units of computation.
- Good code is well-organized.
- Good code is well-tested.

#### Goals for our (Python) code

- Do the right thing
- Useful
- Pythonic
- Readable
- "Beautiful"

### Version control

This is a ___must___. Put your code on GitHub (or any other online tool for version control). This way it’s accessible by anyone at any time. Use `git` often to track changes in small increments.

Content of your repository
- a readme (e.g., `.md`)
- a `Makefile`  # convenience
- requirements.txt  # required modules
- setup.py
- a license
- src
- tests
- docs
- a change log/release docs
- ...

### READMEs

A README is a ___must___ for any project. It should contain

- The project goal
- Repository content
- Dependencies/requirements
- Install instructions
- Instructions on how to run tests
- A list of contributors/codeowners
- Instructions on how others can contribute
- ...

### Documentation and comments

Choose your docstring style and stick to it. Adding docstrings to all your module, function, class, or method definitions is an extremely good idea. To make your code as maintainable and readable as possible, you’ll want to choose one docstring style and stick to that.

Use comments when necessary. Good code comments explain ___why___ things are done, not ___what___ is done. The code itself explains what is done. Often, comments can be used sparingly.

### Code reviews

Code reviews are a must. Use pull requests!

Benefits include:
- Find bugs early
- Better code quality
- Share knowledge and learn new things
- Keep standards and maintain consistency
- Compliance
- Team cohesion
- Enforce tests
- Keep `main` branch clean and executable, i.e., only update `main` when task is 100% done (and reviewed), avoid partial work results that break code in `main`

### Use virtual environments

Advantages:
- You can use any version of Python/modules you want for a specific environment without having to worry about collisions
- You can `freeze` your environment and let other users create an identical environment on their machine
- You can organize your packages much better and know exactly the packages you need to run your code in case someone else needs to run it on their machine
- Your main python package directory does not get flooded with unnecessary python packages

### Write readable code

- You should use line breaks and indent your code
- Use naming conventions for identifiers, this makes it easier to understand the code
- Use comments, and whitespaces around operators and assignments
- Maybe: Keep the maximum line length 79 characters (or comparable)

Much of the above that relates to code formatting can be automated, see sections PEP 8 and black.

And please read books! Learn about and follow established best practices. For example, there is no “private” keyword in Python, and the main convention for private properties and implementation details is to prefix all “internals” with an underscore.

### PEP 8

[PEP 8](https://www.python.org/dev/peps/pep-0008/) is the de facto code style guide for Python (also see [pep8.org](https://pep8.org)). The entire Python community does their best to adhere to the guidelines laid out within this document.

#### `black` code formatter

The auto-formatter `black` offers an opinionated and deterministic reformatting of your code base. Its main focus lies in providing a uniform code style without the need of configuration. Due to the deterministic approach minimal git diffs with only the relevant changes are guaranteed. Install with `pip install black`, and use with `black my_file.py`.

#### `isort` code formatter

The auto-formatter `isort` sorts the imports in your python code. Install with `pip install isort`, and use with `isort my_file.py`.

#### Jupyter Lab code formatter

The JupyterLab plugin `jupyterlab-code-formatter` can be used in JupyterLab to format scripts and notebooks. It requires `black` and `isort`. Install with `pip install jupyterlab-code-formatter`. Use by clicking the icon in the menu bar. See the [readthedocs](https://jupyterlab-code-formatter.readthedocs.io/en/latest) for more information.

### PEP 20 guiding Python design principles

See https://github.com/hblanks/zen-of-python-by-example/blob/master/pep20_by_example.pdf

In [None]:
import this

## Let's look at some good and some bad code

### Naming

<div class="alert-danger">BAD</div>

In [None]:
n = 10000

<div class="alert-success">GOOD</div>

In [None]:
n_customers = 10000

### Multi-line statements
Some compound statements (e.g., list comprehensions) are appreciated. Other than that, it is bad practice to put two separate statements on the same line of code.

<div class="alert-danger">BAD</div>

In [None]:
print("print 1"); print("print 2")

if 1 == 1: print("print 1")

if (4 > 2) and (9 < 123): print("print true")

Try to fix the cell above by using the jupyterlab code formatter.

<div class="alert-success">GOOD</div>

In [None]:
print("print 1")
print("print 2")

if 1 == 1:
    print("print 1")

cond1 = 4 > 2
cond2 = 9 < 123
if cond1 and cond2:
    print("print true")

### Use f-strings

<div class="alert-success">GOOD</div>

In [None]:
name = "Sebastian"
favorite_language = "Python"
f"Hi, I'm {name}. My favorite programming language is {favorite_language}."

In [None]:
my_number = 0.2
f"Hi, I'm a number, my value is {my_number:.2f}. I found two more digits: {my_number:.4f}"

### Loops

#### Base example

<div class="alert-danger">BAD</div>

In [None]:
x = [1, 2, 4, 8, 16]
for i in range(len(x)):
    print(x[i])

<div class="alert-success">GOOD</div>

In [None]:
x = [1, 2, 4, 8, 16]
for xi in x:
    print(xi)

#### Use `enumerate`

<div class="alert-danger">BAD</div>

In [None]:
for i in range(len(x)):
    print(i, x[i])

<div class="alert-success">GOOD</div>

In [None]:
for i, xi in enumerate(x):
    print(i, xi)

### Write code in the most explicit and straightforward way

<div class="alert-danger">BAD</div>

In [None]:
def make_complex(*args):
    # The signature does not reveal that exactly two positional values are
    # required; any other count makes the unpacking raise ValueError.
    x, y = args
    # locals() holds args, x and y at this point, so the returned dict also
    # contains an "args" entry next to "x" and "y".
    return dict(**locals())

<div class="alert-success">GOOD</div>

In [None]:
def make_complex(x, y):
    """Return a dict holding the two arguments under the keys "x" and "y"."""
    return {"x": x, "y": y}

### Avoid superfluous code

<div class="alert-danger">BAD</div>

In [None]:
num = 4

if num >= 0:
    print(True)
else:
    print(False)

<div class="alert-success">GOOD</div>

In [None]:
print(num >= 0)

### Use list comprehensions

<div class="alert-danger">BAD</div>

In [None]:
numbers_to_square = list(range(5))
square_numbers = []
for x in numbers_to_square:
    square_numbers.append(x ** 2)
square_numbers

<div class="alert-success">GOOD</div>

In [None]:
# [expression for item in iterable (if conditional)]
[x ** 2 for x in range(5)]

### List comprehensions and side effects

Never use a list comprehension just for its side effects.

<div class="alert-danger">BAD</div>

In [None]:
[print(x) for x in range(5)]

<div class="alert-success">GOOD</div>

In [None]:
for x in range(5):
    print(x)

#### Use `zip`

<div class="alert-danger">BAD</div>

In [None]:
x = [1, 2, 4, 8, 16]
y = "abcde"
for i in range(len(x)):
    print(x[i], y[i])

<div class="alert-success">GOOD</div>

In [None]:
for x_item, y_item in zip(x, y):
    print(x_item, y_item)

### Using `with`

The `with` block means that the special methods `__enter__()` and `__exit__()` are called; `__exit__()` runs even if an exception is raised inside the block.

<div class="alert-danger">BAD</div>

In [None]:
my_file = open("filename.csv", "w")
my_file
my_file.close()

<div class="alert-success">GOOD</div>

In [None]:
with open("filename.csv", "w") as my_file:
    my_file

### Dictionaries

#### Default values

<div class="alert-danger">BAD</div>

In [None]:
d = {"key1": "value1"}

if "key1" in d:
    print(d["key1"])
else:
    print("default_value")

<div class="alert-success">GOOD</div>

In [None]:
d = {"key1": "value1"}

print(d.get("key1", "default_value"))
print(d.get("key2", "default_value"))

#### Looping over key, value pairs

<div class="alert-danger">BAD</div>

In [None]:
z = {"a": 0, "b": 1}
for k in z:
    print(k, z[k])

<div class="alert-success">GOOD</div>

In [None]:
for k, v in z.items():
    print(k, v)

### Additional Resources

- [python.org](https://www.python.org/)
- [realpython.com](https://www.realpython.com/)
- [python.org/dev/peps](https://www.python.org/dev/peps/)
- https://docs.python-guide.org/writing/style/ (basis for some examples in this notebook)
- https://towardsdatascience.com/the-ultimate-guide-to-writing-better-python-code-1362a1209e5a (basis for some examples in this notebook)
- Python Crash Course
- Python Tricks: A Buffet of Awesome Python Features
- Fluent Python: Clear, Concise, and Effective Programming
- Python Cookbook
- ...