In [230]:
import numpy as np
import pandas as pd

* Make a simple 2x3 numpy array

In [168]:
x = np.array([[4,5,6],[7,8,9]])
x
x.shape

(2, 3)

* Create a np array out of a sequence and then alter its dimensions

In [169]:
n = np.arange(0,20,2)
n

array([ 0,  2,  4,  6,  8, 10, 12, 14, 16, 18])

In [170]:
n = n.reshape(2,5)
n

array([[ 0,  2,  4,  6,  8],
       [10, 12, 14, 16, 18]])

In [171]:
type(n)

numpy.ndarray

In [172]:
 o = np.linspace(0,4,9)
 o

array([ 0. ,  0.5,  1. ,  1.5,  2. ,  2.5,  3. ,  3.5,  4. ])

In [173]:
type(o)

numpy.ndarray

In [174]:
# NOTE: reshape returns a new array and leaves o itself unchanged, so the
# result of the call below is discarded and o stays 1-d. To actually change
# o, assign the result (o = o.reshape(3,3)) or call o.resize(3,3), which
# modifies o in place.
o.reshape(3,3)
o

array([ 0. ,  0.5,  1. ,  1.5,  2. ,  2.5,  3. ,  3.5,  4. ])

In [175]:
x = np.ones((2,3), int)
x

array([[1, 1, 1],
       [1, 1, 1]])

In [176]:
np.vstack([x, 2*x])

array([[1, 1, 1],
       [1, 1, 1],
       [2, 2, 2],
       [2, 2, 2]])

In [177]:
np.hstack([x, 2*x])

array([[1, 1, 1, 2, 2, 2],
       [1, 1, 1, 2, 2, 2]])

In [178]:
v = np.linspace(1,5,5)
v

array([ 1.,  2.,  3.,  4.,  5.])

In [179]:
w = np.linspace(2,6,5)
w

array([ 2.,  3.,  4.,  5.,  6.])

In [180]:
v*w #element-wise product

array([  2.,   6.,  12.,  20.,  30.])

In [181]:
v.dot(w) #dot product

70.0

In [182]:
print(v)
print(v.dtype) #prints datatype of the numpy array

[ 1.  2.  3.  4.  5.]
float64


In [183]:
v = v.astype('int')
print(v)
print(v.dtype)

[1 2 3 4 5]
int32


In [184]:
print(v.min())
print(v.max())
print(v.mean())
print(v.std())

1
5
3.0
1.41421356237


* Now we shall generate the y values for the equation y = 2x^2 for x in the range [-2, 2] in steps of 0.5

In [185]:
# Vectorized form: x takes the values -2.0, -1.5, ..., 2.0 (integers -4..4 scaled by 0.5), and y = 2*x**2
w = 2 * (0.5 * np.arange(-4, 5)) ** 2


Now let's see which index contains the minimum value and which contains the maximum value.

In [186]:
w.argmin()

4

In [187]:
w.argmax() 

0

**Notice in the above that there are two maximum values of 8 (at indices 0 and 8), but only the lowest of the indices bearing that value is returned**

## Indexing and slicing of numpy arrays

In [188]:
s = np.arange(13)**2
s

array([  0,   1,   4,   9,  16,  25,  36,  49,  64,  81, 100, 121, 144])

In [189]:
s[0], s[4], s[0:3]

(0, 16, array([0, 1, 4]))

In [190]:
s[1:5]

array([ 1,  4,  9, 16])

In [191]:
s[-4:]

array([ 81, 100, 121, 144])

In [192]:
s[-5:-2]

array([ 64,  81, 100])

In [193]:
r = np.arange(36)
r.resize((6,6))
r

array([[ 0,  1,  2,  3,  4,  5],
       [ 6,  7,  8,  9, 10, 11],
       [12, 13, 14, 15, 16, 17],
       [18, 19, 20, 21, 22, 23],
       [24, 25, 26, 27, 28, 29],
       [30, 31, 32, 33, 34, 35]])

In [194]:
r[3,2] #uses zero-based indices

20

In [195]:
r[3,3:6] #slice stop 6 is one past the last valid column index (5); stops are exclusive, so this is legal and selects columns 3-5

array([21, 22, 23])

In [196]:
r[-1, ::2] #select every other element from the last row

array([30, 32, 34])

In [197]:
r[r>30]

array([31, 32, 33, 34, 35])

In [198]:
v = r[r>27] #boolean-mask indexing returns a flat 1-d array: the elements
# exceeding 27 span two rows of r, but the result has only a single axis
v

array([28, 29, 30, 31, 32, 33, 34, 35])

In [199]:
v.shape 

(8,)

**Let's reprint r**

In [200]:
r

array([[ 0,  1,  2,  3,  4,  5],
       [ 6,  7,  8,  9, 10, 11],
       [12, 13, 14, 15, 16, 17],
       [18, 19, 20, 21, 22, 23],
       [24, 25, 26, 27, 28, 29],
       [30, 31, 32, 33, 34, 35]])

**We'll now cap the maximum element at 30**

In [201]:
r[r>30]=30
r

array([[ 0,  1,  2,  3,  4,  5],
       [ 6,  7,  8,  9, 10, 11],
       [12, 13, 14, 15, 16, 17],
       [18, 19, 20, 21, 22, 23],
       [24, 25, 26, 27, 28, 29],
       [30, 30, 30, 30, 30, 30]])

## On copying numpy arrays

**We'll first assign the slice of r comprised of its first 3 rows and first 3 columns to s**

In [202]:
s = r[:3, :3]
s

array([[ 0,  1,  2],
       [ 6,  7,  8],
       [12, 13, 14]])

**We'll now change all the values in s to zero and see if that change also happened to the correspondingly positioned values in r**

In [203]:
s[:] = 0
s

array([[0, 0, 0],
       [0, 0, 0],
       [0, 0, 0]])

In [204]:
r

array([[ 0,  0,  0,  3,  4,  5],
       [ 0,  0,  0,  9, 10, 11],
       [ 0,  0,  0, 15, 16, 17],
       [18, 19, 20, 21, 22, 23],
       [24, 25, 26, 27, 28, 29],
       [30, 30, 30, 30, 30, 30]])

**Sadly, the correspondingly positioned values in r changed. Here's how we can prevent that from happening**

In [205]:
r = np.arange(0, 36)
r.resize(6,6) #restore pre-modification state of r
r

array([[ 0,  1,  2,  3,  4,  5],
       [ 6,  7,  8,  9, 10, 11],
       [12, 13, 14, 15, 16, 17],
       [18, 19, 20, 21, 22, 23],
       [24, 25, 26, 27, 28, 29],
       [30, 31, 32, 33, 34, 35]])

In [206]:
s = r[:3,:3].copy()
s

array([[ 0,  1,  2],
       [ 6,  7,  8],
       [12, 13, 14]])

In [207]:
s[:] = 0 
s

array([[0, 0, 0],
       [0, 0, 0],
       [0, 0, 0]])

In [208]:
r

array([[ 0,  1,  2,  3,  4,  5],
       [ 6,  7,  8,  9, 10, 11],
       [12, 13, 14, 15, 16, 17],
       [18, 19, 20, 21, 22, 23],
       [24, 25, 26, 27, 28, 29],
       [30, 31, 32, 33, 34, 35]])

**So r remained untouched**

## Iterating over numpy arrays

**A 2-d numpy array is an iterable of its rows**

In [209]:
v = np.arange(1,26)
v.resize(5,5)
v

array([[ 1,  2,  3,  4,  5],
       [ 6,  7,  8,  9, 10],
       [11, 12, 13, 14, 15],
       [16, 17, 18, 19, 20],
       [21, 22, 23, 24, 25]])

In [210]:
for item in v:
    print(item)

[1 2 3 4 5]
[ 6  7  8  9 10]
[11 12 13 14 15]
[16 17 18 19 20]
[21 22 23 24 25]


**To iterate over all elements of a 2-d numpy array, use np.nditer (an iterator object that visits every element)**

In [211]:
for item in np.nditer(v):
    print(item, end =" ")
print()

1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 


**Standard 2-d iteration paradigm**

In [212]:
for line in v:
    for item in line:
        print(item, end=" ")
    print()
    

1 2 3 4 5 
6 7 8 9 10 
11 12 13 14 15 
16 17 18 19 20 
21 22 23 24 25 


**Retrieving both row number and row contents using enumerate fn**

In [213]:
for index, row in enumerate(v):
    print("row "+str(index)+":", row)

row 0: [1 2 3 4 5]
row 1: [ 6  7  8  9 10]
row 2: [11 12 13 14 15]
row 3: [16 17 18 19 20]
row 4: [21 22 23 24 25]


**<em>Using zip to pair up corresponding rows, which are then added element-wise</em>**

In [214]:
w = np.arange(26,26+25).reshape(5,5)
for i,j in zip(v,w):
    print(i, "+", j, '=', i+j)

[1 2 3 4 5] + [26 27 28 29 30] = [27 29 31 33 35]
[ 6  7  8  9 10] + [31 32 33 34 35] = [37 39 41 43 45]
[11 12 13 14 15] + [36 37 38 39 40] = [47 49 51 53 55]
[16 17 18 19 20] + [41 42 43 44 45] = [57 59 61 63 65]
[21 22 23 24 25] + [46 47 48 49 50] = [67 69 71 73 75]


**<em>The same thing can be done directly with an element-wise computation on the whole arrays</em>**

In [215]:
v+w

array([[27, 29, 31, 33, 35],
       [37, 39, 41, 43, 45],
       [47, 49, 51, 53, 55],
       [57, 59, 61, 63, 65],
       [67, 69, 71, 73, 75]])

### Numpy Array stacking and concatenation

*Let's illustrate horizontal/vertical stacking of numpy arrays as well as concatenation of numpy arrays. To do so, we'll first create three numpy arrays:
A (a 2x4 array), B (a 2x3 array), and C (a 3x4 array)*

In [216]:
A = np.arange(8).reshape(2,4) + 0.1
A

array([[ 0.1,  1.1,  2.1,  3.1],
       [ 4.1,  5.1,  6.1,  7.1]])

In [217]:
B = np.arange(6).reshape(2,3) + 0.2
B

array([[ 0.2,  1.2,  2.2],
       [ 3.2,  4.2,  5.2]])

In [218]:
C = np.arange(12).reshape(3,4) + 0.3
C

array([[  0.3,   1.3,   2.3,   3.3],
       [  4.3,   5.3,   6.3,   7.3],
       [  8.3,   9.3,  10.3,  11.3]])

Horizontally stacking the 2x4 array A and the 2x3 array B returns a 2x7 array

In [219]:
np.hstack([A,B])

array([[ 0.1,  1.1,  2.1,  3.1,  0.2,  1.2,  2.2],
       [ 4.1,  5.1,  6.1,  7.1,  3.2,  4.2,  5.2]])

In [220]:
%%HTML
<style>
b em {
    color: red;
}
</style>

This output can be reproduced by horizontally concatenating 2x4 array, A, and 2x3 array, B. <b>Note: the numpy concatenation function is <em>numpy.concatenate</em> while the pandas concatenation function is <em>pandas.concat</em></b>

In [221]:
np.concatenate([A,B], axis = 1)

array([[ 0.1,  1.1,  2.1,  3.1,  0.2,  1.2,  2.2],
       [ 4.1,  5.1,  6.1,  7.1,  3.2,  4.2,  5.2]])

Vertically stacking the 2x4 array and the 3x4 array returns a 5x4 array.

In [222]:
np.vstack([A,C])

array([[  0.1,   1.1,   2.1,   3.1],
       [  4.1,   5.1,   6.1,   7.1],
       [  0.3,   1.3,   2.3,   3.3],
       [  4.3,   5.3,   6.3,   7.3],
       [  8.3,   9.3,  10.3,  11.3]])

This output can be reproduced by vertically concatenating 2x4 array, A, and 3x4 array, C.

In [227]:
np.concatenate([A,C], axis = 0)

array([[  0.1,   1.1,   2.1,   3.1],
       [  4.1,   5.1,   6.1,   7.1],
       [  0.3,   1.3,   2.3,   3.3],
       [  4.3,   5.3,   6.3,   7.3],
       [  8.3,   9.3,  10.3,  11.3]])

<em>Now we'll read two csv files related to census 2010 data; one file contains data related to population and the other contains data related to unemployment. Both data files have a zip code column that'll be used as the index of the associated pandas DataFrame. However, only one zip code (2860) is common to both files.</em>

In [244]:
fn = 'census_2010_pop_tiny.csv'
population = pd.read_csv(fn, index_col = 0)
population

Unnamed: 0_level_0,2010 Census Population
Zip Code ZCTA,Unnamed: 1_level_1
57538,322
59916,130
37660,40038
2860,45199


Now that we've read the file into a pandas DataFrame, we can easily convert it to a numpy array using the np.array function, passing in the DataFrame as input. Notice how the index is discarded.

In [247]:
population_array = np.array(population)
population_array

array([[  322],
       [  130],
       [40038],
       [45199]], dtype=int64)

In [245]:
fn = 'census_2010_unemployment_tiny.csv'
unemployment = pd.read_csv(fn, index_col = 0)
unemployment

Unnamed: 0_level_0,unemployment,participants
Zip,Unnamed: 1_level_1,Unnamed: 2_level_1
2860,0.11,34447
46167,0.02,4800
1097,0.33,42
80808,0.07,4310


<b>Notice how 2860 is the only common index value between the data frames <em>population</em> and <em>unemployment. </em></b>

In [248]:
unemployment_array = np.array(unemployment)
unemployment_array

array([[  1.10000000e-01,   3.44470000e+04],
       [  2.00000000e-02,   4.80000000e+03],
       [  3.30000000e-01,   4.20000000e+01],
       [  7.00000000e-02,   4.31000000e+03]])

Notice once again (above) that the index is discarded when converting a pandas DataFrame to a numpy array by passing the DataFrame to the np.array function.

<b>We could horizontally concatenate the numpy arrays <em>population_array</em> and <em>unemployment_array</em> (performed below), but in doing so, we'll get a meaningless result: the rows are paired purely by position, and the two original data frames only had one index value in common.</b>

In [249]:
np.concatenate([population_array,unemployment_array], axis = 1)

array([[  3.22000000e+02,   1.10000000e-01,   3.44470000e+04],
       [  1.30000000e+02,   2.00000000e-02,   4.80000000e+03],
       [  4.00380000e+04,   3.30000000e-01,   4.20000000e+01],
       [  4.51990000e+04,   7.00000000e-02,   4.31000000e+03]])