# Agenda

1. `nan`
2. 2D arrays and shapes
3. sorting
4. Pandas

In [2]:
import numpy as np


In [3]:
a = np.array([10, 20, 30, 40, 50])
a.dtype

dtype('int64')

In [4]:
a[2] = 12.34
a

array([10, 20, 12, 40, 50])

In [5]:
a = np.array([10, 20, 30, 40, 50], dtype=np.float64)
a.dtype

dtype('float64')

In [6]:
a[2] = 12.34
a

array([10.  , 20.  , 12.34, 40.  , 50.  ])

In [12]:
a = np.array([10, 20, 30, 40, 50])
a.dtype

dtype('int64')

In [13]:
a[2] = '123'

In [14]:
a

array([ 10,  20, 123,  40,  50])

In [15]:
a.dtype

dtype('int64')

In [16]:
a[3] = 98.76
a.dtype

dtype('int64')

In [10]:
a[3] = 'hello'

ValueError: invalid literal for int() with base 10: 'hello'

In [11]:
a.dtype

dtype('int64')

In [17]:
a = np.array('hello out there'.split())

In [18]:
a

array(['hello', 'out', 'there'], dtype='<U5')

In [19]:
a[0] = 'xyz'
a

array(['xyz', 'out', 'there'], dtype='<U5')

In [20]:
a[1] = 'hijklmnop'

In [21]:
a

array(['xyz', 'hijkl', 'there'], dtype='<U5')

In [23]:
a = a.astype('<U30')
a

array(['xyz', 'hijkl', 'there'], dtype='<U30')

In [24]:
a[1] = 'rstuvwxyz'
a

array(['xyz', 'rstuvwxyz', 'there'], dtype='<U30')

In [25]:
a = np.array([{'a':10, 'b':20}])

In [26]:
a

array([{'a': 10, 'b': 20}], dtype=object)

In [27]:
a = np.array([27, 26, 27, 26, 27])
a.mean()

26.6

In [28]:
# what if we have a problem getting a reading?
a = np.array([27, 26, 0, 26, 27])
a.mean()

21.2

# `nan` -- not a number

`nan` is a special float value that marks a reading as missing or invalid, so it can't be mistaken for a real number such as 0.

In [29]:
np.nan

nan

In [30]:
type(np.nan)

float

In [31]:
np.nan == 0

False

In [32]:
np.nan == np.nan

False

In [33]:
a = np.array([10, 20, 30, np.nan, 50, 60])
a

array([10., 20., 30., nan, 50., 60.])

In [34]:
a.dtype

dtype('float64')

In [35]:
a.mean()

nan

In [37]:
# option 1: find nan, remove it, and calculate with the rest
# this doesn't work: nan compares unequal to everything, including itself,
# so `a != np.nan` is True for every element and nothing gets filtered out

a[a != np.nan].mean()

nan

In [38]:
# use np.isnan, which returns True for nan

np.isnan(a)

array([False, False, False,  True, False, False])

In [41]:
# use ~ to flip the logic
a[~np.isnan(a)].mean()

34.0

In [42]:
a = np.array([10, 20, 30])
a/0

  a/0


array([inf, inf, inf])

In [43]:
type(np.inf)

float

In [44]:
np.inf == np.inf

True

In [45]:
a

array([10, 20, 30])

In [46]:
a = np.array([10, 20, 30, np.nan, np.nan, 50, 60])
a

array([10., 20., 30., nan, nan, 50., 60.])

In [47]:
a.size

7

In [48]:
a.isnan()

AttributeError: 'numpy.ndarray' object has no attribute 'isnan'

In [49]:
np.isnan(a).sum()

2

In [50]:
# nan is not equal to 0, so count_nonzero counts the nan entries as well
np.count_nonzero(a)

7

In [51]:
a

array([10., 20., 30., nan, nan, 50., 60.])

In [54]:
# another technique: replace nan with something

a[np.isnan(a)] = 999

In [55]:
a

array([ 10.,  20.,  30., 999., 999.,  50.,  60.])

In [57]:
# similar technique: replace nan with the mean of the remaining values
# (this cell only re-creates the array with its nan entries, because the
# earlier assignment overwrote them with 999; the replacement is not done here)

a = np.array([10., 20., 30., np.nan, np.nan, 50., 60.])

# Exercise: Working with `nan`

1. Define a NumPy array with 20 integers between 0-1,000.
2. Change all even numbers to `nan`.
3. Calculate the mean of the numbers that remained.
4. Replace all `nan` values with the non-`nan` mean.

In [58]:
# fix the seed so the "random" numbers are the same on every run
np.random.seed(0)

# 20 ints drawn from 0 up to (but not including) 1000
a = np.random.randint(low=0, high=1000, size=20)
a

array([684, 559, 629, 192, 835, 763, 707, 359,   9, 723, 277, 754, 804,
       599,  70, 472, 600, 396, 314, 705])

In [62]:
a = a.astype(np.float64)
a

array([684., 559., 629., 192., 835., 763., 707., 359.,   9., 723., 277.,
       754., 804., 599.,  70., 472., 600., 396., 314., 705.])

In [63]:
# find the even numbers
a[a % 2 == 0] = np.nan

In [64]:
a

array([ nan, 559., 629.,  nan, 835., 763., 707., 359.,   9., 723., 277.,
        nan,  nan, 599.,  nan,  nan,  nan,  nan,  nan, 705.])

In [70]:
# find the mean of these numbers, and assign to the nans

a[np.isnan(a)] = a[~np.isnan(a)].mean()

In [71]:
a

array([560.45454545, 559.        , 629.        , 560.45454545,
       835.        , 763.        , 707.        , 359.        ,
         9.        , 723.        , 277.        , 560.45454545,
       560.45454545, 599.        , 560.45454545, 560.45454545,
       560.45454545, 560.45454545, 560.45454545, 705.        ])

In [73]:
np.nanmean(a)

560.4545454545455

In [75]:
a = np.array([10, 20, 30, np.nan, 40, np.nan])
a

array([10., 20., 30., nan, 40., nan])

In [76]:
np.nan_to_num(a)

array([10., 20., 30.,  0., 40.,  0.])

In [77]:
np.nan_to_num(a, nan=999)

array([ 10.,  20.,  30., 999.,  40., 999.])

In [78]:
np.nan_to_num?

In [80]:
a.astype(float).dtype

dtype('float64')

In [81]:
a.astype(int).dtype

  a.astype(int).dtype


dtype('int64')

In [82]:
a

array([10., 20., 30., nan, 40., nan])

In [83]:
a.astype('float64')

array([10., 20., 30., nan, 40., nan])

# 2D arrays

NumPy arrays are actually "ndarray" -- n-dimensional arrays. 

I can use a list of lists to create a 2D array.

In [85]:
# each inner list becomes one row of the 2D array
a = np.array([
    [10, 20, 30, 40],
    [50, 60, 70, 80],
    [90, 100, 110, 120],
])
a

array([[ 10,  20,  30,  40],
       [ 50,  60,  70,  80],
       [ 90, 100, 110, 120]])

In [86]:
a[0] 

array([10, 20, 30, 40])

In [87]:
a[0] + 5

array([15, 25, 35, 45])

In [88]:
# what happens with broadcasting on the whole 2D array?
a + 5

array([[ 15,  25,  35,  45],
       [ 55,  65,  75,  85],
       [ 95, 105, 115, 125]])

In [89]:
a.shape

(3, 4)

In [90]:
# how can I change the shape?
# option 1: assign to .shape

a.shape = (2, 6)
a

array([[ 10,  20,  30,  40,  50,  60],
       [ 70,  80,  90, 100, 110, 120]])

In [91]:
a.shape = (12,)   # tuple with one element still needs ,

In [92]:
a

array([ 10,  20,  30,  40,  50,  60,  70,  80,  90, 100, 110, 120])

In [93]:
a.shape = (3, 4)
a

array([[ 10,  20,  30,  40],
       [ 50,  60,  70,  80],
       [ 90, 100, 110, 120]])

In [94]:
a.shape = (3, 5)

ValueError: cannot reshape array of size 12 into shape (3,5)

In [95]:
a.shape = (3, :)

SyntaxError: invalid syntax (1867200925.py, line 1)

In [96]:
# option 2: use the "reshape" method

# 
a.reshape(3, 4)

array([[ 10,  20,  30,  40],
       [ 50,  60,  70,  80],
       [ 90, 100, 110, 120]])

In [98]:
# ':' is slice syntax and is only valid inside [], so this is a SyntaxError;
# to let NumPy work out the remaining dimension, pass -1: a.reshape(3, -1)
a.reshape(3, :)

SyntaxError: invalid syntax (1741208929.py, line 1)

In [100]:
# get random numbers

np.random.randint(0, 1000, 12).reshape(3, 4)

array([[115, 976, 755, 709],
       [847, 431, 448, 850],
       [ 99, 984, 177, 755]])

In [101]:
# even better!
np.random.randint(0, 1000, [3,4])

array([[797, 659, 147, 910],
       [423, 288, 961, 265],
       [697, 639, 544, 543]])

In [102]:
np.random.seed(0)
a = np.random.randint(0, 1000, [5, 6])
a

array([[684, 559, 629, 192, 835, 763],
       [707, 359,   9, 723, 277, 754],
       [804, 599,  70, 472, 600, 396],
       [314, 705, 486, 551,  87, 174],
       [600, 849, 677, 537, 845,  72]])

In [103]:
a[1]

array([707, 359,   9, 723, 277, 754])

In [104]:
a[3]

array([314, 705, 486, 551,  87, 174])

In [105]:
# what if I want both index 1 and 3?
# fancy indexing!
a[ [1,3] ]

array([[707, 359,   9, 723, 277, 754],
       [314, 705, 486, 551,  87, 174]])

In [106]:
# using a slice is a bit different...
a[1:3]   # from 1, up to and not including 3

array([[707, 359,   9, 723, 277, 754],
       [804, 599,  70, 472, 600, 396]])

In [107]:
a[1:5:2]

array([[707, 359,   9, 723, 277, 754],
       [314, 705, 486, 551,  87, 174]])

In [108]:
a

array([[684, 559, 629, 192, 835, 763],
       [707, 359,   9, 723, 277, 754],
       [804, 599,  70, 472, 600, 396],
       [314, 705, 486, 551,  87, 174],
       [600, 849, 677, 537, 845,  72]])

In [110]:
# what if I want from row index 2, and column index 4

a[2][4]

600

In [113]:
# better!
a[2,4]   # I'm really passing a[(2,4)]

600

In [114]:
# what if I want a column?
# we want column index 3

a[:, 3]

array([192, 723, 472, 551, 537])

In [115]:
a

array([[684, 559, 629, 192, 835, 763],
       [707, 359,   9, 723, 277, 754],
       [804, 599,  70, 472, 600, 396],
       [314, 705, 486, 551,  87, 174],
       [600, 849, 677, 537, 845,  72]])

In [116]:
# rows: from 2, until (not including) 4
# columns: 3

a[2:4, 3]

array([472, 551])

In [117]:
# we can slice the columns, too!
# rows: all
# columns: 1-4
a[:, 1:5]

array([[559, 629, 192, 835],
       [359,   9, 723, 277],
       [599,  70, 472, 600],
       [705, 486, 551,  87],
       [849, 677, 537, 845]])

In [118]:
# we can also use lists for fancy indexing!
# rows: 1,2
# columns: 3
a[[1,2], 3]

array([723, 472])

In [119]:
# rows: 3
# columns: 2, 4

a[3, [2,4]]

array([486,  87])

In [121]:
# check this out...
# if you use lists for both, then it creates coordinates
a[[1,2], [3,4]]    # a[1,3] and a[2,4]

array([723, 600])

In [122]:
a[1:3, 3:5]

array([[723, 277],
       [472, 600]])

In [123]:
a = np.array([10, 20, 30, 40, 50])
a

array([10, 20, 30, 40, 50])

In [124]:
a.shape

(5,)

In [126]:
a = a.reshape(1,5)
a.shape

(1, 5)

In [127]:
a

array([[10, 20, 30, 40, 50]])

In [128]:
a = np.random.randint(0, 1000, [3,4])
a

array([[777, 916, 115, 976],
       [755, 709, 847, 431],
       [448, 850,  99, 984]])

In [129]:
a.sum()

7907

In [130]:
a.sum(axis=0)

array([1980, 2475, 1061, 2391])

In [131]:
a.sum(axis=1)

array([2784, 2742, 2381])

# Exercise: 2D NumPy

1. Create a 2D array with numbers from 0-100, 45 random ints, 5x9.
2. Get all of the values from row index 2.
3. Get all of the values from column index 3.
4. Get all of the values from row indexes 1 and 4.
5. Get all values from column indexes 2 and 5.
6. Get the mean of even numbers in row 4.
7. Get the mean of odd numbers in columns 1 and 3.

In [132]:
np.random.seed(0)

a = np.random.randint(0, 100, [5,9])
a

array([[44, 47, 64, 67, 67,  9, 83, 21, 36],
       [87, 70, 88, 88, 12, 58, 65, 39, 87],
       [46, 88, 81, 37, 25, 77, 72,  9, 20],
       [80, 69, 79, 47, 64, 82, 99, 88, 49],
       [29, 19, 19, 14, 39, 32, 65,  9, 57]])

In [133]:
a[2]   # row index 2

array([46, 88, 81, 37, 25, 77, 72,  9, 20])

In [134]:
# column index 3
a[:, 3]

array([67, 88, 37, 47, 14])

In [135]:
# rows 1 + 4

a[ [1,4]  ]

array([[87, 70, 88, 88, 12, 58, 65, 39, 87],
       [29, 19, 19, 14, 39, 32, 65,  9, 57]])

In [137]:
a[1::3]  # slice -- starting from 1, until the end, step size 3

array([[87, 70, 88, 88, 12, 58, 65, 39, 87],
       [29, 19, 19, 14, 39, 32, 65,  9, 57]])

In [138]:
# columns 2 + 5

a[:, [2,5]]

array([[64,  9],
       [88, 58],
       [81, 77],
       [79, 82],
       [19, 32]])

In [140]:
a[:, 2:6:3]

array([[64,  9],
       [88, 58],
       [81, 77],
       [79, 82],
       [19, 32]])

In [144]:
# mean of even numbers in row index 4

a[4][a[4] % 2 == 0].mean()

23.0

In [147]:
# odd numbers in columns 1 + 3 -- the boolean mask pools both columns into
# a single 1D array (not one result per column), and the mean asked for in
# the exercise still needs .mean() on this result

a[:, [1,3]][a[:, [1,3]] % 2 == 1]

array([47, 67, 37, 69, 47, 19])

In [148]:
c = a[:, [1,3]]

In [149]:
c

array([[47, 67],
       [70, 88],
       [88, 37],
       [69, 47],
       [19, 14]])

In [150]:
np.random.seed(0)
a = np.random.randint(0, 100, [4,5])
a

array([[44, 47, 64, 67, 67],
       [ 9, 83, 21, 36, 87],
       [70, 88, 88, 12, 58],
       [65, 39, 87, 46, 88]])

In [151]:
b = a.reshape(2, 10)
b

array([[44, 47, 64, 67, 67,  9, 83, 21, 36, 87],
       [70, 88, 88, 12, 58, 65, 39, 87, 46, 88]])

In [152]:
b[0, 2]

64

In [153]:
b[0, 2] = 999
b

array([[ 44,  47, 999,  67,  67,   9,  83,  21,  36,  87],
       [ 70,  88,  88,  12,  58,  65,  39,  87,  46,  88]])

In [154]:
a

array([[ 44,  47, 999,  67,  67],
       [  9,  83,  21,  36,  87],
       [ 70,  88,  88,  12,  58],
       [ 65,  39,  87,  46,  88]])

In [155]:
c = a.reshape(2, 10).copy()
c

array([[ 44,  47, 999,  67,  67,   9,  83,  21,  36,  87],
       [ 70,  88,  88,  12,  58,  65,  39,  87,  46,  88]])

In [156]:
c[0, 4] = 888
c

array([[ 44,  47, 999,  67, 888,   9,  83,  21,  36,  87],
       [ 70,  88,  88,  12,  58,  65,  39,  87,  46,  88]])

In [157]:
a

array([[ 44,  47, 999,  67,  67],
       [  9,  83,  21,  36,  87],
       [ 70,  88,  88,  12,  58],
       [ 65,  39,  87,  46,  88]])

In [158]:
a.flags

  C_CONTIGUOUS : True
  F_CONTIGUOUS : False
  OWNDATA : True
  WRITEABLE : True
  ALIGNED : True
  WRITEBACKIFCOPY : False

In [159]:
b.flags

  C_CONTIGUOUS : True
  F_CONTIGUOUS : False
  OWNDATA : False
  WRITEABLE : True
  ALIGNED : True
  WRITEBACKIFCOPY : False

In [160]:
np.shares_memory(a, b)

True

In [161]:
np.shares_memory(a, c)

False

In [162]:
a

array([[ 44,  47, 999,  67,  67],
       [  9,  83,  21,  36,  87],
       [ 70,  88,  88,  12,  58],
       [ 65,  39,  87,  46,  88]])

In [163]:
a.sort()

In [164]:
a

array([[ 44,  47,  67,  67, 999],
       [  9,  21,  36,  83,  87],
       [ 12,  58,  70,  88,  88],
       [ 39,  46,  65,  87,  88]])

In [166]:
a.sort?

In [167]:
np.random.seed(0)
a = np.random.randint(0, 100, [4,5])
a.sort(axis=0)

In [168]:
a

array([[ 9, 39, 21, 12, 58],
       [44, 47, 64, 36, 67],
       [65, 83, 87, 46, 87],
       [70, 88, 88, 67, 88]])

In [169]:
# which sort algorithm works fastest?
# let's check! 

np.random.seed(0)
a = np.random.randint(0, 100, [4,5])


In [170]:
# %timeit is an IPython magic built on the timeit module from the Python
# standard library
# NOTE: a.sort() sorts in place, so after the first loop the array being
# timed is already sorted

%timeit a.sort()

275 ns ± 2.18 ns per loop (mean ± std. dev. of 7 runs, 1,000,000 loops each)


In [171]:
%timeit a.sort(kind='mergesort')

314 ns ± 6.34 ns per loop (mean ± std. dev. of 7 runs, 1,000,000 loops each)


In [183]:
# flatten to 1D without hard-coding the element count, then build a plain list
mylist = list(a.ravel())
%timeit sorted(mylist)

1.01 µs ± 16.5 ns per loop (mean ± std. dev. of 7 runs, 1,000,000 loops each)


In [185]:
# this fails: %timeit runs the statement inside a function of its own, so
# assigning to mylist makes it a local name there, and sorted(mylist) then
# reads that local before it has a value
%timeit mylist = sorted(mylist)

UnboundLocalError: cannot access local variable 'mylist' where it is not associated with a value

In [186]:
import pandas as pd

In [187]:
from pandas import Series, DataFrame

# Pandas data structures

1. Series -- 1D data
2. Data frame -- 2D data

In [188]:
s = Series([10, 20, 30, 40, 50, 60])
s

0    10
1    20
2    30
3    40
4    50
5    60
dtype: int64

In [189]:
# a Series works similarly to a NumPy array

In [190]:
s[0]

10

In [191]:
s[0:2]

0    10
1    20
dtype: int64

In [193]:
# methods on our series
s.sum()

210

In [194]:
s.mean()

35.0

In [195]:
s.std()

18.708286933869708

In [196]:
# what is behind the scenes?
s.values

array([10, 20, 30, 40, 50, 60])

In [197]:
s = Series([10, 20, 30, 40, 50],
          index=[2,4,6,8,10])

In [198]:
s

2     10
4     20
6     30
8     40
10    50
dtype: int64

In [199]:
s[6]

30

In [200]:
s[0]

KeyError: 0

In [201]:
s = Series([10, 20, 30, 40, 50],
          index=list('abcde'))

In [202]:
s

a    10
b    20
c    30
d    40
e    50
dtype: int64

In [203]:
s['a']

10

In [204]:
s['c']

30

In [205]:
s[['a', 'c']]   # fancy index

a    10
c    30
dtype: int64

In [207]:
# label-based slices go up to *and* including the end label!
s['a':'c']

a    10
b    20
c    30
dtype: int64

In [208]:
s

a    10
b    20
c    30
d    40
e    50
dtype: int64

# Exercise: Simple series

1. Create a series containing 10 random ints, with index a-j.
2. Get the item at index `b`.
3. Retrieve items at indexes `c`, `d`, and `f`.
4. What is the mean of items at indexes `a`, `e`, `g`, and `h`?
5. What is the sum of the even values?

In [209]:
list('abcde')

['a', 'b', 'c', 'd', 'e']

In [210]:
np.random.seed(0)
s = Series(np.random.randint(0, 100, 10),
          index=list('abcdefghij'))
s

a    44
b    47
c    64
d    67
e    67
f     9
g    83
h    21
i    36
j    87
dtype: int64

In [211]:
import string

# seed first so the "random" values are the same on every run
np.random.seed(0)
values = np.random.randint(0, 100, 10)
labels = list(string.ascii_lowercase[:10])   # 'a' through 'j'
s = Series(values, index=labels)
s

a    44
b    47
c    64
d    67
e    67
f     9
g    83
h    21
i    36
j    87
dtype: int64

In [212]:
s['b']

47

In [213]:
s[['c', 'd', 'f']]

c    64
d    67
f     9
dtype: int64

In [216]:
s[list('cdf')]

c    64
d    67
f     9
dtype: int64

In [218]:
# What is the mean of items at indexes a, e, g, and h?

s[['a', 'e', 'g', 'h']].mean()

53.75

In [219]:
# What is the sum of the even values?

s % 2 == 0

a     True
b    False
c     True
d    False
e    False
f    False
g    False
h    False
i     True
j    False
dtype: bool

In [221]:
s[s%2==0].sum()

144

In [222]:
s

a    44
b    47
c    64
d    67
e    67
f     9
g    83
h    21
i    36
j    87
dtype: int64

In [223]:
# better ways to retrieve data

# 1. .loc[index]
# 2. .iloc[position]

In [224]:
s.loc['b']

47

In [225]:
s.loc[['c', 'd', 'f']]

c    64
d    67
f     9
dtype: int64

In [226]:
s.iloc[0]

44

In [227]:
s.iloc[3]

67

In [228]:
s.iloc[::2]

a    44
c    64
e    67
g    83
i    36
dtype: int64

In [229]:
s.index

Index(['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j'], dtype='object')

In [230]:
z = Series([10, 20, 30])
z.index

RangeIndex(start=0, stop=3, step=1)

In [231]:
# two series carrying the same labels, listed in opposite orders
letters = list('abcde')
s1 = Series([10, 20, 30, 40, 50], index=letters)
s2 = Series([100, 200, 300, 400, 500], index=letters[::-1])

# addition pairs values up by index label, not by position
s1 + s2

a    510
b    420
c    330
d    240
e    150
dtype: int64

In [232]:
s3 = Series([111, 222, 333, 444, 555],
           index=list('abcab'))

In [233]:
s3.loc['a']

a    111
a    444
dtype: int64

In [234]:
s3.loc[['a', 'b']]

a    111
a    444
b    222
b    555
dtype: int64

In [235]:
s3.loc['c']

333

In [236]:
# add to itself
s3 + s3

a     222
b     444
c     666
a     888
b    1110
dtype: int64

In [237]:
s1 + s3

a    121.0
a    454.0
b    242.0
b    575.0
c    363.0
d      NaN
e      NaN
dtype: float64

In [238]:
s3

a    111
b    222
c    333
a    444
b    555
dtype: int64

In [239]:
s1

a    10
b    20
c    30
d    40
e    50
dtype: int64

In [240]:
s1 + s3

a    121.0
a    454.0
b    242.0
b    575.0
c    363.0
d      NaN
e      NaN
dtype: float64

In [242]:
# how can I treat a label with no match on the other side as 0, instead of getting NaN?
# background: the + operator is really a method call on the left operand

x, y = 1, 2

x.__add__(y)   # 1 + 2

3

In [244]:
s1.add(s3)

a    121.0
a    454.0
b    242.0
b    575.0
c    363.0
d      NaN
e      NaN
dtype: float64

In [246]:
s1.add(s3, fill_value=0)

a    121.0
a    454.0
b    242.0
b    575.0
c    363.0
d     40.0
e     50.0
dtype: float64

In [247]:
s1.add?

# Math operators

- `add`
- `sub`
- `mul`
- `truediv` (/)
- `floordiv` (//)
- `pow` (**)
- `mod` (%)