# Agenda

1. Series
2. NumPy array vs. series
3. Indexing
4. dtypes
5. `NaN`
6. Methods

In [3]:
import numpy as np
import pandas as pd
from pandas import Series, DataFrame

In [4]:
# no index is passed, so the series is labelled with the integers 0..4
s = Series([10, 20, 30, 40, 50])
s

0    10
1    20
2    30
3    40
4    50
dtype: int64

In [5]:
s.values

array([10, 20, 30, 40, 50])

In [6]:
s.values.dtype

dtype('int64')

In [7]:
# request a 16-bit integer dtype explicitly instead of the int64 we got above
s = Series([10, 20, 30, 40, 50],
          dtype='int16')
s

0    10
1    20
2    30
3    40
4    50
dtype: int16

In [8]:
s[0]

np.int16(10)

In [9]:
s[1]

np.int16(20)

In [10]:
# plain [] on this integer-labelled series looks up the *label* -1, which
# does not exist here (labels are 0..4) -- hence the KeyError below
s[-1]

KeyError: -1

In [11]:
# .loc -- we use [] to get values

s.loc[0]

np.int16(10)

In [12]:
s.loc[1]

np.int16(20)

In [13]:
s.iloc[0]

np.int16(10)

In [14]:
s.iloc[1]

np.int16(20)

In [15]:
s

0    10
1    20
2    30
3    40
4    50
dtype: int16

# Some attributes

- `.values` -- returns the NumPy array used for storage
- `.loc` -- uses `[]`, returns one or more values from our series based on the index
- `.iloc` -- uses `[]`, returns one or more values from our series based on the *position*

In [17]:
s.iloc[-1]

np.int16(50)

In [18]:
s.loc[0]

np.int16(10)

In [19]:
s.values

array([10, 20, 30, 40, 50], dtype=int16)

In [20]:
s.values[0]

np.int16(10)

In [21]:
# label-based slice: there is no label 5 in this index, so we get labels 2..4
s.loc[2:5]   

2    30
3    40
4    50
dtype: int16

In [22]:
s.iloc[2:5]

2    30
3    40
4    50
dtype: int16

In [23]:
# fancy indexing: pass a list of labels, results come back in the order asked
s.loc[[2, 4, 1]]

2    30
4    50
1    20
dtype: int16

In [24]:
s.iloc[[2, 4, 1]]

2    30
4    50
1    20
dtype: int16

In [25]:
# we can set the index!

s = Series([10, 20, 30, 40, 50],
          index=list('abcde'))
s

a    10
b    20
c    30
d    40
e    50
dtype: int64

In [26]:
s.loc['a']

np.int64(10)

In [27]:
s.loc['b']

np.int64(20)

In [28]:
s.loc['b':'d']

b    20
c    30
d    40
dtype: int64

In [29]:
s.iloc[0]

np.int64(10)

In [31]:
s.iloc[1]

np.int64(20)

In [33]:
s.iloc[1:3]

b    20
c    30
dtype: int64

In [34]:
s.loc[['a', 'c', 'b']]

a    10
c    30
b    20
dtype: int64

In [37]:
%timeit s.loc[['b', 'd', 'c']]

49.5 μs ± 910 ns per loop (mean ± std. dev. of 7 runs, 10,000 loops each)


In [38]:
%timeit s.iloc[[1, 3, 2]]

16.1 μs ± 127 ns per loop (mean ± std. dev. of 7 runs, 100,000 loops each)


In [39]:
s

a    10
b    20
c    30
d    40
e    50
dtype: int64

In [40]:
s + s

a     20
b     40
c     60
d     80
e    100
dtype: int64

In [42]:
s + 3   # broadcast

a    13
b    23
c    33
d    43
e    53
dtype: int64

In [43]:
s2 = Series([10, 20, 30, 40, 50],
           index=list('edcba'))
s2

e    10
d    20
c    30
b    40
a    50
dtype: int64

In [48]:
s

a    10
b    20
c    30
d    40
e    50
dtype: int64

In [44]:
# arithmetic aligns on index labels, not positions: 'a' pairs 10 with 50, etc.
s + s2

a    60
b    60
c    60
d    60
e    60
dtype: int64

In [45]:
# s3 gets the default integer index 0..4, which shares no labels with s (a..e)
s3 = Series([10, 20, 30, 40, 50])

# no label appears in both series, so every entry of the result is NaN
s + s3

a   NaN
b   NaN
c   NaN
d   NaN
e   NaN
0   NaN
1   NaN
2   NaN
3   NaN
4   NaN
dtype: float64

In [46]:
s.add(s3)

a   NaN
b   NaN
c   NaN
d   NaN
e   NaN
0   NaN
1   NaN
2   NaN
3   NaN
4   NaN
dtype: float64

In [49]:
# s4 shares only the labels c, d and e with s
s4 = Series([100, 200, 300, 400, 500],
           index=list('cdefg'))

# labels found in just one series (a, b, f, g) come out as NaN,
# and the result is float64 rather than int64
s + s4

a      NaN
b      NaN
c    130.0
d    240.0
e    350.0
f      NaN
g      NaN
dtype: float64

In [52]:
s.add(s4, fill_value=0)    # use 0 where there is no corresponding value

a     10.0
b     20.0
c    130.0
d    240.0
e    350.0
f    400.0
g    500.0
dtype: float64

In [53]:
help(s.add)

Help on method add in module pandas.core.series:

add(other, level=None, fill_value=None, axis: Axis = 0) -> Series method of pandas.core.series.Series instance
    Return Addition of series and other, element-wise (binary operator `add`).

    Equivalent to ``series + other``, but with support to substitute a fill_value for
    missing data in either one of the inputs.

    Parameters
    ----------
    other : Series or scalar value
    level : int or name
        Broadcast across a level, matching Index values on the
        passed MultiIndex level.
    fill_value : None or float value, default None (NaN)
        Fill existing missing (NaN) values, and any new element needed for
        successful Series alignment, with this value before computation.
        If data in both corresponding Series locations is missing
        the result of filling (at that location) will be missing.
    axis : {0 or 'index'}
        Unused. Parameter needed for compatibility with DataFrame.

    Return

In [56]:
s.mean()

np.float64(30.0)

In [57]:
s.std()

np.float64(15.811388300841896)

In [58]:
s.min()


np.int64(10)

In [59]:
s.max()

np.int64(50)

In [60]:
s.sum()

np.int64(150)

In [61]:
s < s.mean()

a     True
b     True
c    False
d    False
e    False
dtype: bool

In [62]:
s.loc[ s < s.mean() ]  # boolean mask: keep only the rows where the mask is True

a    10
b    20
dtype: int64

In [63]:
s.iloc[ s < s.mean() ]  # .iloc rejects a boolean Series as a mask -- raises ValueError

ValueError: iLocation based boolean indexing cannot use an indexable as a mask

In [64]:
s

a    10
b    20
c    30
d    40
e    50
dtype: int64

In [65]:
# what's the biggest number?
s.max()

np.int64(50)

In [66]:
# what is the index of the biggest number?
s.idxmax()

'e'

In [69]:
s.idxmin()

'a'

# Exercises: Series

1. Create a series with 10 random ints from 0-100, index a-j.
2. Retrieve the value at index b.
3. Retrieve the values at indexes c, d, and f.
4. Calculate the mean of indexes a, e, g, and h.
5. Calculate the mean of items with even positional indexes.
6. Calculate the mean of the even numbers.

In [70]:
# 1. Create a series with 10 random ints from 0-100, index a-j.
# (numpy is imported with the other imports at the top of the notebook)

np.random.seed(0)   # fixed seed so the values shown below are reproducible
s = Series(np.random.randint(0, 100, 10),   # randint's upper bound is exclusive
          index=list('abcdefghij'))
s

a    44
b    47
c    64
d    67
e    67
f     9
g    83
h    21
i    36
j    87
dtype: int64

In [71]:
# 2. Retrieve the value at index b.

s.loc['b']   # s.loc.__getitem__('b')

np.int64(47)

In [73]:
# 3. Retrieve the values at indexes c, d, f

s.loc[ ['c', 'd', 'f' ]]     # s.loc.__getitem__(['c', 'd', 'f'])

c    64
d    67
f     9
dtype: int64

In [74]:
s.loc[list('cdf')]

c    64
d    67
f     9
dtype: int64

In [75]:
s.loc[['c', 'd', 'd']]

c    64
d    67
d    67
dtype: int64

In [77]:
# 4. Calculate the mean of indexes a, e, g, and h

s.loc[['a', 'e', 'g', 'h']].mean()

np.float64(53.75)

In [79]:
# 5. Calculate the mean of items with even positional indexes.

s.iloc[::2].mean()

np.float64(58.8)

In [83]:
# 6. Calculate the mean of the even numbers.

s.loc[s % 2 == 0].mean()

np.float64(48.0)

In [84]:
# the same boolean Series that worked with .loc is rejected by .iloc (ValueError)
s.iloc[s % 2 == 0].mean()

ValueError: iLocation based boolean indexing cannot use an indexable as a mask

In [85]:
s.index

Index(['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j'], dtype='object')

In [86]:
mylist = [10, 20, 30, 40, 50, 60, 70, 80, 90, 100]

# slice
mylist[2:5]   # from 2 until (not including) 5

[30, 40, 50]

In [87]:
mylist[:5]   # from the start

[10, 20, 30, 40, 50]

In [88]:
mylist[5:]  # from 5 through the end

[60, 70, 80, 90, 100]

In [89]:
# if we have a third part in our slice, it's the step size

mylist[2:7:3]

[30, 60]

In [90]:
mylist[::3]

[10, 40, 70, 100]

In [92]:
s.loc[::2]

a    44
c    64
e    67
g    83
i    36
dtype: int64

In [93]:
s.iloc[::2]

a    44
c    64
e    67
g    83
i    36
dtype: int64

In [94]:
slice(None, None, 2)

slice(None, None, 2)

In [95]:
# an explicit slice object can be passed to .iloc just like the `::2` syntax
every_other = slice(None, None, 2)
s.iloc[every_other]

a    44
c    64
e    67
g    83
i    36
dtype: int64

In [96]:
s

a    44
b    47
c    64
d    67
e    67
f     9
g    83
h    21
i    36
j    87
dtype: int64

In [101]:
# 20 random ints labelled with the capital letters A through T
s = Series(np.random.randint(0, 100, 20),
           index=list('ABCDEFGHIJKLMNOPQRST'))

In [102]:
s

A    19
B    72
C    71
D    87
E    13
F    58
G    81
H    55
I    64
J    75
K    92
L    36
M    25
N    32
O    42
P    14
Q    86
R    28
S    20
T    82
dtype: int64

In [103]:
# reseed, then build a series whose index deliberately repeats the labels a-d
np.random.seed(0)
s = Series(np.random.randint(0, 100, 10),
           index=list('abcdef' + 'abcd'))

In [104]:
s

a    44
b    47
c    64
d    67
e    67
f     9
a    83
b    21
c    36
d    87
dtype: int64

In [105]:
s.loc['f']

np.int64(9)

In [106]:
s.loc['a']

a    44
a    83
dtype: int64

In [107]:
s.iloc[[0]]

a    44
dtype: int64

In [108]:
s.index

Index(['a', 'b', 'c', 'd', 'e', 'f', 'a', 'b', 'c', 'd'], dtype='object')

In [109]:
s

a    44
b    47
c    64
d    67
e    67
f     9
a    83
b    21
c    36
d    87
dtype: int64

In [110]:
# 'c' appears twice and the index is not sorted, so the slice's starting
# point is ambiguous -- this raises KeyError
s.loc['c':'f']

KeyError: "Cannot get left slice bound for non-unique label: 'c'"

In [111]:
s

a    44
b    47
c    64
d    67
e    67
f     9
a    83
b    21
c    36
d    87
dtype: int64

In [112]:
s.loc['e':'f']

e    67
f     9
dtype: int64

In [114]:
s.sort_index().loc['c':'f']

c    64
c    36
d    67
d    87
e    67
f     9
dtype: int64

In [115]:
s.index.is_monotonic_increasing

False

In [116]:
s.sort_index().index.is_monotonic_increasing

True

In [117]:
s.loc['f':'c':-1]

KeyError: "Cannot get left slice bound for non-unique label: 'c'"

In [119]:
s.sort_index().loc['f':'c':-1]

f     9
e    67
d    87
d    67
c    36
c    64
dtype: int64

In [120]:
s.sort_index().loc['f':'c']

Series([], dtype: int64)

In [121]:
s

a    44
b    47
c    64
d    67
e    67
f     9
a    83
b    21
c    36
d    87
dtype: int64

In [122]:
# series are mutable!
# 'b' appears twice in the index, so both of its rows are overwritten

s.loc['b'] = 999
s

a     44
b    999
c     64
d     67
e     67
f      9
a     83
b    999
c     36
d     87
dtype: int64

In [123]:
# assign one value to several labels at once; repeated labels are all updated
s.loc[['d', 'f', 'a']] = 888
s

a    888
b    999
c     64
d    888
e     67
f    888
a    888
b    999
c     36
d    888
dtype: int64

In [126]:
# assign through a boolean mask: every value above the mean becomes 543
s.loc[s > s.mean()] = 543
s

a    543
b    543
c     64
d    543
e     67
f    543
a    543
b    543
c     36
d    543
dtype: int64

In [128]:
# assign a list through the mask: one value per selected row, in order
# (seven values are above the mean here, so the list has seven items)
s.loc[s > s.mean()] = [0,1,2,3,4,5,6]

In [129]:
s

a     0
b     1
c    64
d     2
e    67
f     3
a     4
b     5
c    36
d     6
dtype: int64