# Recap of the vanilla `python`

In [None]:
# importing external libraries
import math

# can do all kinds of math inline
943.0 + (8**2 + 2 / (3 * math.sqrt(2)))
#                        ---------
#                            ↑
#                    built-in math functions

# ============================================================

# use of variables: a is a list
# ↓
a = [6, 0, 2, 4, 5, 8, 9, 1, 2]
#   ---------------------------
#               ↑
#         lists of values

# index
# ↓
a[4] == 5

# ============================================================

# dictionaries
foo = {
# key: value
  "a": 1,
  "b": 2,
  "c": 3
}
# can also be done this way:
# foo = dict(a = 1, b = 2, c = 3)

# now access:
foo['a']

# ============================================================

# loops
for i in range(-20, 20):
#        --------------
#              ↑
#   any iterable (list, range, ...) can go here
  print (math.sqrt(i * i) == i)   # QUESTION: what's the result of this?

# ============================================================

# conditional statements
if (len(a) > 2):
#   ---
#    ↑
#  take the length of the list
  print ("nah, the list is too long")
else:
  print ("nice")

# ============================================================

# functions
def divide(a, b):
#          ----
#           ↑
#       arguments
  """Divide a by b; a zero divisor yields None instead of raising."""
  if b == 0:
    # there is no quotient to report for a zero divisor
    return None
  return a / b

c = divide(-23.0, 43.0)
# c is -0.534883720930233

d = divide(1, 0)
# d is None

# Motivation behind `numpy`

### Operations with vectors
> so you said python is intuitive, right?

In [None]:
42 + 95

In [None]:
"foo" + "bar"

#### operations with vectors (arrays) in vanilla `python`

In [None]:
[1, 2, 3] + [2, 3, 4]

In [None]:
[2, 3, 4] * 42

In [None]:
[34, 31, 32] + 2

In [None]:
[54, 34, 20]**2

In [None]:
# take some algebraic operation
def pleaseDoSomethingUseful(x):
  """Evaluate the nested fraction x / (1 + x / (1 + ... / (1 + x))), six divisions deep."""
  # start from the innermost denominator and wrap it one level at a time
  denominator = 1 + x
  for _ in range(5):
    denominator = 1 + x / denominator
  return x / denominator

I want to get the result of this function applied to every single element of the list

  `list (in)` -> `pleaseDoSomethingUseful` -> `list (out)`

  `[a1, a2, a3, a4, a5]` -> `[f(a1), f(a2), f(a3), f(a4), f(a5)]`
  
a/k/a function mapping

In [None]:
pleaseDoSomethingUseful([54, 34, 20, 23, 4, 521, 231, 23])

In [None]:
# * in practice you can map a function if you want to
list(map(pleaseDoSomethingUseful, [54, 34, 20, 23, 4, 521, 231, 23]))

To be clear, this is also not possible in some of the lower-level programming languages:

```c++
/*
 * example C++ code
 */

[23, 34, 54] + 2;       // <-- # compile error

void func(int a) {      // <-- function "expects" a single 4-byte integer
  // ...
}

func([2, 3, 4]);        // <-- # compile error
```

But you said `python` is **not** like `c/c++`, that `python` is intuitive and all...

![just no](https://static.wikia.nocookie.net/5586dcfa-2630-4a57-a5d3-3ab995aede6b "just no")


## My dream `python`

Wouldn't it be nice to simplify vector operations?

$$
[a_1, ~a_2, ~a_3] + [b_1, ~b_2, ~b_3] ~ = ~ ?
$$

or be able to solve systems of linear equations thinking in terms of vectors and matrices?

> this happens quite often in applications like data analysis, machine learning, solving differential equations, etc.

$$
\begin{bmatrix}
a_{11} & a_{12} & a_{13} & \dots & a_{1n} \\
    a_{21} & a_{22} & a_{23} & \dots & a_{2n} \\
    \vdots & \vdots & \vdots & \ddots & \vdots  \\
    a_{m1} & a_{m2} & a_{m3} & \dots & a_{mn}
\end{bmatrix}\cdot
\begin{bmatrix}
x_{1} \\
    x_{2} \\
    \vdots \\
    x_{n} \\
    \end{bmatrix}
=
\begin{bmatrix}
b_{1} \\
    b_{2} \\
    \vdots \\
    b_{m} \\
    \end{bmatrix}
$$


# `numpy` saves the day

#### *disclaimer: `numpy` module is so much more than just what will (can) be discussed here*

In [None]:
# there are two kinds of people:
import numpy as np
# vs
# import numpy

`np.array` instead of vanilla lists

In [None]:
np.array([1, 2, 3]) + np.array([2, 3, 4])

In [None]:
np.array([2, 3, 4]) * 42

In [None]:
np.array([34, 31, 32]) + 2

In [None]:
np.array([54, 34, 20])**2

In [None]:
def pleaseDoSomethingUseful(x):
  """Evaluate the nested fraction x / (1 + x / (1 + ... / (1 + x))), six divisions deep.

  Only `+` and `/` are applied to x, so anything supporting them with
  numbers (e.g. the numpy array passed below) is handled the same way.
  """
  # build the denominator from the inside out
  nested = 1 + x
  for _ in range(5):
    nested = 1 + x / nested
  return x / nested

pleaseDoSomethingUseful(np.array([54, 34, 20, 23, 4, 521, 231, 23]))

lists vs `numpy` arrays

In [None]:
def foo():
  """Placeholder function (used below as a list element); always returns 1."""
  value = 1
  return value
#          text                   dictionary
#           ↓                        ↓
#        -------              ----------------
[foo, 2, "dasda", [3, 45, 4], {'a': 1, 'b': 2}]    # <- this is a list (it can store any type of data)
#---  -           ----------
# ↑   ↑               ↑
# |   number          |
# function          list

In [None]:
np.array([foo, 2, "dasda", [3, 45, 4], {'a': 1, 'b': 2}])    # <- this is a numpy array (and it doesn't appreciate diverse data types)

# you can do this in theory (but, please, don't do it)

In [None]:
# two-dimensional list (with uneven rows):
[[1, 2, 3], [4, 5, 6, 7, 8], ["foo", "bar"], 123.3]

In [None]:
np.array([[1, 2, 3], [4, 5, 6, 7, 8], [3, 4], 453])   # <- not going to work as expected


instead `numpy` arrays are designed to work on uniform data types

In [None]:
# everything converted into 64-bit float:
print(
  np.array([1.5, 3.4, 7.5, 1, 95, 102.4])
)

# the dtype attribute shows which element type the array ended up with
print(
  np.array([1.5, 3.4, 7.5, 1, 95, 102.4]).dtype
)

In [None]:
# everything converted into integer (dtype='int' is the platform default, typically 64-bit)
print (
  np.array([1.5, 3.4, 7.5, 1, 95, 102.4], dtype='int')
)

# everything converted into text:
print (
  np.array([1.5, 3.4, 7.5, 1, 95, 102.4], dtype='str')
)


index tricks with `numpy` arrays are similar to those with the lists

In [None]:
my_lovely_array = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13])

print(
    my_lovely_array
)

print(
    # skip the first 2 elements:
    my_lovely_array[2:]
)

print(
    # take only the last element:
    my_lovely_array[-1]
)

print(
    # skip the last 4 elements:
    my_lovely_array[:-4]
)                 # ↑ this is the same as [0:-4]

print(
    # skip the first 2 elements and take the next 4:
    my_lovely_array[2:6]
)

print(
    # skip the first element and take every 3rd element:
    my_lovely_array[1::3]
)

print(
    # reverse the array (QUESTION TO THE AUDIENCE)
    # my_lovely_array[???]
)


## Fun things to do with `numpy`
### *f.u.f. = frequently used functionality*
> 90% of the time you will be using ~5-10% of what `numpy` can offer

* array masking

In [None]:
A = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13])
B = np.array(["a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m"])

print(
    # take all the values of A for which a (element of A) is > 10
    A[A > 10]
)

print(
    # take all the values of B for which b (element of B) is NOT equal to "d"
    B[B != "d"]
)

print(
    # take all the values of A for which the corresponding value of b (element of B) is equal to "j"
    A[B == "j"]
)

print(
    # masks can be combined: take the values of A where b is equal to "j" OR a squared is equal to 9
    A[(B == "j") | (A**2 == 9)]
)

* `np.where`

In [None]:
A = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13])
B = np.array(["a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m"])

print (
    # turns all the a-s (elements of A) into either a (if A < 5) or 0 (if A >= 5)
    np.where(A < 5, A, 0)
    # can also do (rewrites A):
    #   A[A >= 5] = 0
)

print (
    # where a is even -- return the corresponding element of B, otherwise return "-"
    np.where(A % 2 == 0, B, "-")
    # could also do (rewrites B):
    #   B[A % 2 != 0] = "-"
)

* make an array of **uniformly spaced** numbers within a given interval **with a given step**

In [None]:
print(
    np.arange(100)
)

print(
    np.arange(-5, 100)
)

print(
    np.arange(-10.5, 10.5, 0.5)
)

print(
    np.arange(100, 0, -1)
)

* make an array of **uniformly spaced** numbers within a given interval **of a given length (number of elements)**

In [None]:
print(
    # interval is [0, 100] (the endpoint is included by default) with 50 elements (default)
    np.linspace(0, 100)
)

print(
    # interval is [0, 2pi] (the endpoint is included by default) with 500 elements
    np.linspace(0, 2 * np.pi, 500)
)

* playing around with randomness

In [None]:
print(
    # random integer from [0, 100) -- the upper bound is excluded
    np.random.randint(100)
)
print(
    # array of 100 random integers from [-10, 10) -- the upper bound is excluded
    np.random.randint(-10, 10, 100)
)
print(
    # ??? (QUESTION TO THE AUDIENCE: what does this do?)
    np.random.random(100)
    # same as      : np.random.rand(100)
    # can also be  : np.random.rand(4, 6)
)
# ??? (QUESTION TO THE AUDIENCE: how to generate any floating point number from, say, 0 to pi ?)


In [None]:
# getting help with `numpy`
np.random.random?

In [None]:
# sample city names to draw from (spelling of "Melbourne" corrected)
cities = ["Melbourne", "New York", "London", "Paris", "Tokyo", "Sydney"]

print(
    # make an array with random choices from the list
    np.random.choice(cities, 100)
)

print(
    # randomly shuffle the list
    np.random.permutation(cities)
)

# there is a whole lot more in numpy random: https://numpy.org/doc/1.16/reference/routines.random.html

* other useful functions

In [None]:
print(
    np.zeros(100)            # array of 100 zeros
)
print(
    np.zeros((10, 10))       # array of 10 x 10 zeros
)

In [None]:
# + all the math functions are present: https://docs.scipy.org/doc/numpy/reference/routines.math.html
# sin/cos, sqrt, floor/ceil, sum, prod, diff

A = 10 * np.sin(np.linspace(-20, 20, 34))

print(
    A
)

print(
    # find the max element
    np.max(A)
)

print(
    # find the index of the max element
    np.argmax(A)
)

# sanity check
i_max = np.argmax(A)
print(
    A[i_max]
)


In [None]:
B = 10 * np.sin(np.linspace(-20, 20, 42).reshape(6, 7))
# 6 x 7 array

print (
    "shape:", B.shape, "\n",
    "B^2:", B**2, "\n",
    "max:", np.max(B), "\n",
    "sum:", np.sum(B), "\n",
    "mean:", np.mean(B)
)

# Performance of `numpy` vs vanilla `python`

In [None]:
# vanilla python

import math

def vanillaPython():
  """Call math.sqrt on every integer from 0 to 9999, one at a time; the results are discarded."""
  for number in range(10000):
    math.sqrt(number)

%timeit vanillaPython()

In [None]:
# numpy

def numpyPython():
  """Call np.sqrt once on the whole array of integers 0..9999; the result is discarded."""
  np.sqrt(np.arange(10000))

%timeit numpyPython()

In [None]:
%timeit np.sqrt(123.5)

In [None]:
%timeit math.sqrt(123.5)

> `numpy` is implicitly vectorized!

![](https://lappweb.in2p3.fr/~paubert/ASTERICS_HPC/images/vectorization.png)

### *Punchline: prefer `numpy` on large datasets when possible vs regular for loops*

In [None]:
# Approximate pi from the first N terms of sum(1/n^2): 6 * sum approaches pi^2.
# N is defined here because no earlier cell sets it; without it this cell
# fails with a NameError when the notebook is run from top to bottom.
N = int(1e6)

# float dtype keeps n**2 in floating point, so it cannot wrap around on
# platforms where numpy's default integer is 32-bit
s = np.arange(1, N + 1, dtype=float)
s = 1.0 / s**2
s = np.sum(s)

print(math.sqrt(s * 6))

---

Myth: `python` is slow.

Fact: `python` is *mostly* slow, but only *because* it's so generic and can be easily misused. If used properly, it can actually be (almost) as fast as C/C++/Fortran (trust me).

Example: calculate the sum of $1/n^2$ for $n\in(0; N]$

```python
N = int(1e6)

# novice python (300 ms)
s = 0
for n in range(1, N + 1):
  s += 1 / n**2
  
print (math.sqrt(s * 6))     # prints 3.1415916986604673

# pro python (7 ms: 40x faster)
s = np.arange(1, N + 1)
s = 1.0 / s**2
s = np.sum(s)

print (math.sqrt(s * 6))     # prints 3.1415916986604673
```

(C code does this in 1 ms, see the attached `pi_c.c` file)

---

### examples of `numpy` performance test

In [None]:
def func_vanilla(A, B):
  """Add A and B element by element in pure Python and return the sums as a list.

  The pairs come from zip, so the result is as long as the shorter input.
  """
  return [left + right for left, right in zip(A, B)]

N = 1e6
A = range(1, int(N) + 1)
B = range(1, int(N) + 1)

%timeit func_vanilla(A, B)

In [None]:
def func_numpy(A, B):
  """Return A + B; with numpy arrays this is one element-wise addition, no Python loop."""
  total = A + B
  return total

N = 1e6
A = np.arange(1, int(N) + 1)
B = np.arange(1, int(N) + 1)

%timeit func_numpy(A, B)


In [None]:
def pi_vanilla(N):
  """Approximate pi with a plain Python loop over the first N terms of sum(1/n^2).

  N may be a float such as 1e6; it is truncated with int(). The partial sum
  times 6 approaches pi^2, so its square root is returned.
  """
  upper = int(N)
  total = 0
  for k in range(1, upper + 1):
    total += 1 / (k * k)
  return math.sqrt(6 * total)

%timeit pi_vanilla(1e6)


In [None]:
def pi_numpy(N):
  """Approximate pi from the first N terms of sum(1/n^2), vectorized with numpy.

  Mirrors pi_vanilla: the partial sum times 6 approaches pi^2, so its square
  root is returned. N may be a float such as 1e6; it is truncated with int().
  """
  # float dtype keeps n**2 in floating point, so it cannot wrap around on
  # platforms where numpy's default integer is 32-bit
  n = np.arange(1, int(N) + 1, dtype=float)
  s = np.sum(1.0 / n**2)
  # return the estimate (previously the result was dropped and the call gave None)
  return math.sqrt(s * 6)

%timeit pi_numpy(1e6)


## `numpy` and other modules

In [None]:
import pandas as pd
# pandas array is (almost) the same as numpy array

> as a backend `pandas` almost certainly relies on `numpy` in its core

In [None]:
N = 100
# sample city names to draw from (spelling of "Melbourne" corrected)
cities = ["Melbourne", "New York", "London", "Paris", "Tokyo", "Sydney"]

city_names = np.random.choice(cities, N)
city_values = np.random.random(N)

# converting numpy arrays to a dataframe
df = pd.DataFrame({
  "name": city_names,
  "value": city_values
})

# show the first 5 rows
df.head()

In [None]:
# can use the same functions as numpy
print(
    np.mean(df['value'])
)

print(
    # compare with the output of just df['name']
    np.unique(df['name'])
)