In [None]:
# Download the course bootstrap script from GitHub and save it as init.py
# (-q: quiet output, --no-cache: ask for a fresh copy, -O: output file name).
!wget --no-cache -O init.py -q https://raw.githubusercontent.com/rramosp/20192.ai4eng/master/init.py
# Run the bootstrap. NOTE(review): presumably this prepares local course resources
# (e.g. files under local/) and displays a link -- confirm against init.py.
import init; init.init(force_download=False); init.get_weblink()

In [1]:
import numpy as np

# `numpy` is mostly about matrix data manipulation

see this cheat sheet: https://s3.amazonaws.com/assets.datacamp.com/blog_assets/Numpy_Python_Cheat_Sheet.pdf

Python **lists** do not implement matrix semantics

In [2]:
a = [ 1, 2, 3]
b = [10,20,30]
a + b

In [3]:
a = np.array([1,2,3])
b = np.array([10,20,30])
a + b

## Many ways of creating arrays

In [4]:
# manually
a = np.array([[1,2,3],[4,5,6]])
a

In [5]:
# random creation
a = np.random.random(size=(3,5))
a

In [6]:
a = np.random.normal(size=(2,3,4))
a

In [7]:
a = np.random.randint(100, size=(4,10))
a

In [8]:
# deterministic
np.eye(3)

In [9]:
np.linspace(-3,20,10)

In [10]:
np.arange(-3,20)

In [11]:
np.arange(-3,2)

In [12]:
np.zeros((5,10))

In [13]:
np.ones((5,10))

## Info on arrays

In [14]:
a = np.random.randint(100, size=(3,4))
a

In [15]:
a.shape, len(a)

In [16]:
len(a.shape)

In [17]:
a.size

In [18]:
a.dtype

In [19]:
b = a.astype(np.float32)
b

## Operations with arrays

**element by element**

In [20]:
a = np.array([3,5,4])
b = np.array([10,5,30])

In [21]:
a + b

In [22]:
a * b

In [23]:
b ** a

In [24]:
np.sin(a)

In [25]:
a == b

**matrix operations**

In [26]:
a.sum()

In [27]:
# Whole-array reductions. np.prod is used instead of the np.product alias,
# which was deprecated in NumPy 1.25 and removed in NumPy 2.0.
np.sum(a), np.max(a), np.min(a), np.mean(a), np.std(a), np.prod(a)

In [28]:
a.dot(b)

In [29]:
np.sum(a*b)

In [30]:
a = np.random.randint(100, size=(3,4))
a

In [31]:
a.T

In [32]:
a = np.array([3,5,4])
b = np.array([10,5,30])

In [33]:
np.allclose(a,b)

In [34]:
np.any(a==b)

## Indexing


In [35]:
a = np.random.randint(100, size=(6,10))
a

In [36]:
a[1]

In [37]:
a[:,1]

In [38]:
a[1,:]

In [39]:
a[:3]

In [40]:
a[:,3:8]

In [41]:
a[2:-1,3:-2]

boolean indices

In [42]:
a = np.random.randint(100, size=(10))
a

In [43]:
a[[ True,  True, False, False, False, False,  True, False, False, False]]

In [44]:
a<50

In [45]:
a[a<50]

In [46]:
a[(a<50)&(a%2==0)]

## Axis operations

In [47]:
a = np.random.randint(100, size=(6,10))
a

In [48]:
np.max(a, axis=0)

In [49]:
np.max(a, axis=1)

In [50]:
np.mean(a, axis=1)

In [51]:
np.argmax(a, axis=1)

**reshaping is very useful**

In [52]:
a.shape

In [53]:
a.reshape(5,12)

In [54]:
a.reshape(5,-1)

## Many things can be represented by matrices. For instance, images

In [55]:
from skimage import io
import matplotlib.pyplot as plt
%matplotlib inline

In [56]:
img = io.imread("local/imgs/sample_img.jpg")
type(img)

In [57]:
img.shape

In [58]:
np.min(img), np.max(img)

convert it to standard [0,1] range

In [59]:
img = img/255

In [60]:
np.min(img), np.max(img)

In [61]:
plt.imshow(img)

observe the channel composition of the image: lighter $\rightarrow$ greater color presence

In [62]:
cnames = ["red", "green", "blue"]
plt.figure(figsize=(20, 4))
# one panel per colour channel, each drawn as a grayscale intensity map
for i, cname in enumerate(cnames):
    plt.subplot(1, 3, i + 1)
    plt.imshow(img[:, :, i], cmap=plt.cm.Greys_r)
    plt.title("channel %i: %s" % (i, cname))

grayscale version

In [63]:
plt.imshow(np.mean(img, axis=2), plt.cm.Greys_r)

In [64]:
plt.imshow(img[:img.shape[0]//2,:,:])

In [65]:
plt.imshow(img[:,img.shape[1]//2:,:])

In [66]:
plt.imshow(img[90:220, 150:330,:])

**copy an array**

In [67]:
img2 = img.copy()
id(img), id(img2)

increase luminosity

In [68]:
# Doubling can push values above 1.0, which is outside the valid range for a
# float RGB image; saturate at 1.0 explicitly instead of relying on imshow to
# clip the data (which it does with a warning).
img2[90:220, 150:330, :] = np.clip(img2[90:220, 150:330, :] * 2, 0, 1)
plt.imshow(img2)

remove channel

In [69]:
img2[30:120, 280:, 0] = 0
plt.imshow(img2)

In [70]:
plt.imshow(img[::-1,:,:])

understand luminosity on each channel

In [71]:
img.flatten().shape

In [72]:
img[:,:,0].flatten().shape

In [73]:
cnames = ["red", "green", "blue"]
plt.figure(figsize=(20, 4))
# one histogram per colour channel, built from that channel's flattened pixel values
for i, cname in enumerate(cnames):
    plt.subplot(1, 3, i + 1)
    channel_values = img[:, :, i].flatten()
    plt.hist(channel_values, bins=20);
    plt.title("channel %i: %s" % (i, cname))


reduce luminosity on red channel

In [74]:
img3 = img.copy()
# basic slicing returns a view, so assigning through `red` modifies img3 itself
red = img3[:, :, 0]
red[red > 0.8] = 0.5
plt.hist(red.flatten(), bins=20);

In [75]:
plt.figure(figsize=(15,4))
plt.subplot(121); plt.imshow(img); plt.title("original")
plt.subplot(122); plt.imshow(img3); plt.title("red reduced")

shift and overlap

In [76]:
# average two copies of the image that are offset from each other by `shift` rows
shift = 5
img4 = (img[shift:] + img[:-shift]) / 2
plt.imshow(img4)

In [77]:
img3.shape, img.shape

## Vectorization

exploit `numpy` vectorized operations, avoid **for** loops as much as possible

In [78]:
a = np.random.randint(100, size=(6,10))
a

In [79]:
np.mean(a, axis=1)

In [80]:
np.array([np.mean(a[i,:]) for i in range(a.shape[0])])

In [81]:
%timeit np.mean(a, axis=1)

In [82]:
%timeit np.array([np.mean(a[i,:]) for i in range(a.shape[0])])

always think about whether you can vectorize. For instance, for two (**large**) matrices

In [83]:
a = np.random.randint(100, size=(1000,100))
b = np.random.randint(200, size=(1000,100))

the proportion of elements which are equal

In [84]:
np.mean(a==b)

The mean of the elements of `a` that are greater than the element at the corresponding position in `b`

In [85]:
np.mean(a[a>b])

The mean of the elements of `b` that are greater than the element at the corresponding position in `a`


In [86]:
np.mean(b[b>a])

with smaller matrices

In [87]:
a = np.random.randint(100, size=(10))
b = np.random.randint(200, size=(10))
print (a)
print (b)

the element in `b` corresponding to the position of the greatest element in `a`

In [88]:
b[np.argmax(a)]

## Broadcasting

usually `numpy` needs matrix dimensions to match when doing operations among them

In [89]:
a = np.random.randint(100, size=(3,5))
b = np.random.randint(10, size=(3,4))
print (a)
print (b)
a + b

but `numpy` _tries_ to expand the operations if some dimensions match

In [90]:
a

In [91]:
a*10

observe the `reshape` in the following operation

In [92]:
a + b[:,1].reshape(-1,1)

In [93]:
b[:,1].reshape(-1,1)

In [94]:
b[:,1]

In [95]:
# Without the reshape, b[:,1] has shape (3,), which numpy aligns with the LAST
# axis of `a` (length 5), so broadcasting fails. Show the error message without
# aborting a full run of the notebook.
try:
    a + b[:,1]
except ValueError as err:
    print ("ValueError:", err)

observe row wise

In [96]:
a + b.flatten()[:a.shape[1]]

In [97]:
print (a)
print (b)

In [98]:
b.flatten()

In [99]:
b.flatten()[:a.shape[1]]

## Functions args by reference

except for scalars, function arguments are always passed **by reference**

- if you modify it within a function it will change
- the name within a function can be different, but will point to the same object

observe the difference in the following expressions (shown with `numpy` arrays; this does not hold in general in Python)

In [100]:
a = np.round(np.random.random(size=5),3)
print (a)
id(a)

In [101]:
a = a + 1
print (a)
id(a)

this operation is semantically the same, but it produces a different memory footprint (faster, no copy, modifies in place)

In [102]:
a += 1
print (a)
id(a)

now in functions

In [103]:
a = np.round(np.random.random(size=5),3)
print (a)
id(a)

In [104]:
def getmax(x):
    """Print the id of the received object and return its largest element.

    The printed id can be compared with `id()` of the object at the call site.
    """
    address = id(x)
    print ("mem address in function", address)
    largest = np.max(x)
    return largest

In [105]:
getmax(a)

In [106]:
def getmax_after_sinplus1(x):
    """Return the maximum of sin(x + 1) without modifying the caller's array.

    `x + 1` and `np.sin` each build a new array, so the id printed after the
    operation belongs to a new object and the argument is left untouched.
    """
    print ("mem address in function before op", id(x))
    transformed = np.sin(x + 1)
    print ("mem address in function after op", id(transformed))
    return np.max(transformed)

In [107]:
getmax_after_sinplus1(a)

In [108]:
print (a)

however, the following implementation changes `a` **outside** the function

In [109]:
def getmax_after_sinplus1(x):
    """Return the maximum of sin(x + 1), incrementing the caller's array in place.

    The augmented assignment `x += 1` writes into the array that was passed
    in, so the caller sees its data changed after the call. Only the
    subsequent `np.sin` produces a new object.
    """
    print ("mem address in function before op", id(x))
    x += 1  # in-place: same object, caller's data is modified
    print ("mem address in function after +1", id(x))
    sines = np.sin(x)  # new array; the caller's object is not touched here
    print ("mem address in function after sin", id(sines))
    return np.max(sines)

In [110]:
getmax_after_sinplus1(a)

In [111]:
print (a)

Expressions like `+= 1` are **faster** and use **less memory** but may have side effects. We will see this in `pandas`

## Matplotlib

plotting naturally exploits vectorization

see https://matplotlib.org/gallery.html for examples and guides.

In [112]:
x = np.linspace(-1,1,20)
x

In [113]:
x**2

In [114]:
plt.plot(x, x**2, label="$x^2$")
# cosmetics
plt.grid();
plt.title("plotting function")
plt.xlabel("$x$")
plt.ylabel("$x^2$")
plt.legend();

all plotting happens in the same figure until we create a new one

In [115]:
# the exact curve
plt.plot(x, x**2, color="red", label="$x^2$")

# noisy observations at the original sample points
x2_with_noise = x**2 + np.random.normal(size=x.shape)*.1

# a denser grid over the same interval, with its own noisy observations
xdense = np.linspace(np.min(x), np.max(x), 200)
xdense2_with_noise = xdense**2 + np.random.normal(size=xdense.shape)*.1


plt.scatter(x, x**2, s=50, color="red", alpha=.5, label="actual data points")
# distinct labels so the two noisy series can be told apart in the legend
plt.scatter(x, x2_with_noise, s=50, color="black", label="data with noise")
plt.scatter(xdense, xdense2_with_noise, s=5, color="blue", alpha=.5, label="dense data with noise")
plt.grid();

# legend placed outside the axes, to the right; the trailing ';' hides the Legend repr
plt.legend(loc='center left', bbox_to_anchor=(1, 0.5));


some statistical plots

In [116]:
x1 = np.random.normal(loc=0, scale=1, size=10000)
x2 = np.random.normal(loc=2, scale=2, size=10000)
plt.hist(x1, bins=30, alpha=.5, density=True, label="x1");
plt.hist(x2, bins=30, alpha=.5, density=True, label="x2");
plt.grid(); plt.legend();

In [117]:
plt.boxplot([x1, x2]);
plt.grid();
plt.xticks(range(1,3), ["x1", "x2"]);