# Agenda

1. Series
2. Comparing NumPy arrays and series
3. Indexing (`.loc` and `.iloc`)
4. Dtypes
5. `NaN`
6. Methods

In [1]:
import numpy as np
import pandas as pd
from pandas import Series, DataFrame

In [2]:
# Build a Series from six integers. No dtype or index is given, so pandas
# infers the dtype and supplies default positional labels.
s = Series(data=range(10, 70, 10))
s

0    10
1    20
2    30
3    40
4    50
5    60
dtype: int64

In [3]:
# Same six values, but with an explicit 16-bit integer dtype instead of the
# inferred default.
s = Series(data=range(10, 70, 10), dtype='int16')
s

0    10
1    20
2    30
3    40
4    50
5    60
dtype: int16

In [4]:
s[0]

np.int16(10)

In [5]:
s[1]

np.int16(20)

In [6]:
s.loc[0]

np.int16(10)

In [7]:
s.loc[1]

np.int16(20)

In [8]:
s

0    10
1    20
2    30
3    40
4    50
5    60
dtype: int16

In [10]:
s.loc[[2, 3, 4]]   # fancy indexing

2    30
3    40
4    50
dtype: int16

In [11]:
s.mean()

np.float64(35.0)

In [12]:
s.max()

np.int16(60)

In [13]:
s.std()

np.float64(18.708286933869708)

In [14]:
s

0    10
1    20
2    30
3    40
4    50
5    60
dtype: int16

In [15]:
# .loc also works on the left-hand side of an assignment: overwrite the
# element labelled 3 in place.
s.loc[3] = 999
s

0     10
1     20
2     30
3    999
4     50
5     60
dtype: int16

In [16]:
# Assigning one scalar through a list of labels writes that value to every
# listed element (labels 1, 3 and 5 here).
s.loc[[1, 3, 5]] = 888
s

0     10
1    888
2     30
3    888
4     50
5    888
dtype: int16

In [17]:
# Build the Series from label -> value pairs, so that each element is
# addressed by a letter rather than a default integer position.
s = Series(dict(zip('abcdef', range(10, 70, 10))))

In [18]:
s

a    10
b    20
c    30
d    40
e    50
f    60
dtype: int64

In [19]:
s.loc['a']

np.int64(10)

In [20]:
s.loc[['a', 'd', 'f']]

a    10
d    40
f    60
dtype: int64

In [21]:
# Label-based slice: unlike a Python slice, the end label ('d') is part of
# the result.
s.loc['a':'d']

a    10
b    20
c    30
d    40
dtype: int64

In [22]:
# what if I want by position?
# .loc selects by label; .iloc selects by integer position, counting from 0.

s.iloc[0]

np.int64(10)

In [23]:
s.iloc[4]

np.int64(50)

In [24]:
s.iloc[[2,3,4]]

c    30
d    40
e    50
dtype: int64

In [25]:
s

a    10
b    20
c    30
d    40
e    50
f    60
dtype: int64

In [26]:
%timeit s.loc[['b', 'd', 'f']]

205 µs ± 4.18 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


In [27]:
# Positions 1, 3 and 5 are the same elements as labels 'b', 'd' and 'f', so
# this timing is directly comparable with the .loc timing of that selection.
%timeit s.iloc[[1, 3, 5]]

75.9 µs ± 4.65 µs per loop (mean ± std. dev. of 7 runs, 10,000 loops each)


In [28]:
s

a    10
b    20
c    30
d    40
e    50
f    60
dtype: int64

In [29]:
# A list of booleans, one per element, acts as a mask: only the elements
# whose entry is True are returned.
s.loc[[True, False, True, True, False, True]]

a    10
c    30
d    40
f    60
dtype: int64

In [30]:
s > s.mean()

a    False
b    False
c    False
d     True
e     True
f     True
dtype: bool

In [31]:
s.loc[s > s.mean()]

d    40
e    50
f    60
dtype: int64

In [32]:
s.loc['b':'e']

b    20
c    30
d    40
e    50
dtype: int64

In [34]:
# Positional slices follow normal Python rules: position 4 ('e') is the
# stop and is excluded, whereas the label slice 'b':'e' includes 'e'.
s.iloc[1:4]

b    20
c    30
d    40
dtype: int64

In [35]:
# Demonstrates a limitation: .iloc does not accept a boolean Series as a
# mask and raises ValueError, while the same mask works with .loc.
# NOTE(review): passing the mask's underlying array instead, e.g.
# (s > s.mean()).to_numpy(), is the usual workaround -- confirm.
s.iloc[s > s.mean()]

ValueError: iLocation based boolean indexing cannot use an indexable as a mask

In [36]:
s

a    10
b    20
c    30
d    40
e    50
f    60
dtype: int64

In [37]:
s.index[3]

'd'

In [38]:
s.min()

np.int64(10)

In [39]:
s.max()

np.int64(60)

In [40]:
s.idxmin()

'a'

In [41]:
s.idxmax()

'f'

# Exercises:

1. Create a series with 10 random integers, index a-j.
2. Retrieve the value at index b.
3. Retrieve the values at indexes c, d, and f.
4. What is the mean of indexes a, e, g, and h?
5. What is the mean of items with even (positional) indexes?
6. What is the mean of the even numbers?

In [42]:
# Seed NumPy's global random generator so the same ten integers are drawn
# on every run, then label them 'a' through 'j'.
np.random.seed(0)
s = Series(data=np.random.randint(low=0, high=100, size=10),
           index=[*'abcdefghij'])
s

a    44
b    47
c    64
d    67
e    67
f     9
g    83
h    21
i    36
j    87
dtype: int64

In [43]:
# Retrieve from index b.

s.loc['b']

np.int64(47)

In [44]:
# Retrieve from indexes c, d, and f

s.loc[['c', 'd', 'f']]

c    64
d    67
f     9
dtype: int64

In [46]:
# What is the mean of indexes a, e, g, and h?

s.loc[['a', 'e', 'g', 'h']].mean()

np.float64(53.75)

In [47]:
# What is the mean of items with even (positional) indexes?
# First attempt: list the even positions explicitly.
# NOTE: this only displays the selected items; chaining .mean() onto the
# selection gives the answer, (44 + 64 + 67 + 83 + 36) / 5 = 58.8.

s.iloc[[0,2,4,6,8]]

a    44
c    64
e    67
g    83
i    36
dtype: int64

In [48]:
# Same selection, generating the even positions with range() instead of
# typing them out (the stop of 9 is excluded, so 8 is the last position).
s.iloc[range(0,9,2)]

a    44
c    64
e    67
g    83
i    36
dtype: int64

In [53]:
# Most compact form of the same selection: a positional slice with step 2.
s.iloc[::2]

a    44
c    64
e    67
g    83
i    36
dtype: int64

In [50]:
# What is the mean of the even numbers?
# The comparison builds a boolean mask that is True where the value is even.
# NOTE: this only displays the even values; chaining .mean() onto the
# selection gives the answer, (44 + 64 + 36) / 3 = 48.0.

s.loc[s % 2 == 0]

a    44
c    64
i    36
dtype: int64

In [54]:
# Two Series sharing the same five labels, stored in opposite order
# ('abcde' versus 'edcba').
s1 = Series(data=range(10, 60, 10), index=[*'abcde'])
s2 = Series(data=range(100, 600, 100), index=[*'abcde'[::-1]])



In [55]:
s1 + 5   # broadcast

a    15
b    25
c    35
d    45
e    55
dtype: int64

In [59]:
s1 + 1_000_000

a    1000010
b    1000020
c    1000030
d    1000040
e    1000050
dtype: int64

In [61]:
# Demonstrates that the scalar must fit the Series' dtype: 1000 cannot be
# represented as int8, so the addition raises OverflowError rather than
# switching to a wider dtype.
s = Series([10, 20, 30, 40, 50], dtype=np.int8)

s + 1000

OverflowError: Python integer 1000 out of bounds for int8

In [62]:
s1

a    10
b    20
c    30
d    40
e    50
dtype: int64

In [63]:
s1 + s1

a     20
b     40
c     60
d     80
e    100
dtype: int64

In [64]:
s2

e    100
d    200
c    300
b    400
a    500
dtype: int64

In [65]:
s1 + s2

a    510
b    420
c    330
d    240
e    150
dtype: int64

In [66]:
# Index labels do not have to be unique: 'a' and 'b' each label two values.
s3 = Series(data=[123, 456, 789, 135, 246],
            index=['a', 'b', 'c', 'a', 'b'])
s3

a    123
b    456
c    789
a    135
b    246
dtype: int64

In [67]:
# A label that occurs more than once returns all of its matches as a Series.
s3.loc['a']

a    123
a    135
dtype: int64

In [69]:
# A label that occurs exactly once returns a single scalar value instead.
s3.loc['c']

np.int64(789)

In [70]:
s1

a    10
b    20
c    30
d    40
e    50
dtype: int64

In [71]:
s3

a    123
b    456
c    789
a    135
b    246
dtype: int64

In [72]:
# Addition aligns on labels, not positions. Each duplicate 'a'/'b' in s3 is
# paired with s1's single 'a'/'b'; 'd' and 'e' exist only in s1, so they
# come out as NaN, and the presence of NaN makes the result float64.
s1 + s3

a    133.0
a    145.0
b    476.0
b    266.0
c    819.0
d      NaN
e      NaN
dtype: float64

In [73]:
# how can we get a default of 0 if there is no index match?
# Start from the method form of `+`: called with no extra arguments,
# .add() gives the same result as `s1 + s3` (NaN for 'd' and 'e').

s1.add(s3)

a    133.0
a    145.0
b    476.0
b    266.0
c    819.0
d      NaN
e      NaN
dtype: float64

In [74]:
help(s1.add)

Help on method add in module pandas.core.series:

add(other, level=None, fill_value=None, axis: 'Axis' = 0) -> 'Series' method of pandas.core.series.Series instance
    Return Addition of series and other, element-wise (binary operator `add`).

    Equivalent to ``series + other``, but with support to substitute a fill_value for
    missing data in either one of the inputs.

    Parameters
    ----------
    other : Series or scalar value
    level : int or name
        Broadcast across a level, matching Index values on the
        passed MultiIndex level.
    fill_value : None or float value, default None (NaN)
        Fill existing missing (NaN) values, and any new element needed for
        successful Series alignment, with this value before computation.
        If data in both corresponding Series locations is missing
        the result of filling (at that location) will be missing.
    axis : {0 or 'index'}
        Unused. Parameter needed for compatibility with DataFrame.

    Re

In [75]:
# fill_value=0 stands in for the missing side of an unmatched label, so 'd'
# and 'e' keep s1's values (40 and 50) instead of becoming NaN.
s1.add(s3, fill_value=0)

a    133.0
a    145.0
b    476.0
b    266.0
c    819.0
d     40.0
e     50.0
dtype: float64

In [77]:
# For multiplication the neutral fill is 1: unmatched labels 'd' and 'e'
# are multiplied by 1 and keep s1's values.
s1.mul(s3, fill_value=1)

a     1230.0
a     1350.0
b     9120.0
b     4920.0
c    23670.0
d       40.0
e       50.0
dtype: float64

In [79]:
s3

a    123
b    456
c    789
a    135
b    246
dtype: int64

In [80]:
# Demonstrates a limit of label slicing: 'b' appears twice in s3's index, so
# pandas cannot determine where the slice starts and raises KeyError.
s3.loc['b':'c']

KeyError: "Cannot get left slice bound for non-unique label: 'b'"

In [81]:
s1.index

Index(['a', 'b', 'c', 'd', 'e'], dtype='object')

In [82]:
s3.index

Index(['a', 'b', 'c', 'a', 'b'], dtype='object')

In [83]:
# With no index argument the Series gets a default RangeIndex, which the
# second line displays.
s = Series(data=range(10, 60, 10))
s.index

RangeIndex(start=0, stop=5, step=1)

In [84]:
# we can assign to the index
# split() yields five words, one label for each of the five values.
s.index = 'this is also an index'.split()

In [85]:
s

this     10
is       20
also     30
an       40
index    50
dtype: int64

In [86]:
# Mixing floats and ints in the input gives a float64 Series: the integers
# 40 and 50 are stored as 40.0 and 50.0.
s = Series([10.5, 20.3, 30.2, 40, 50])
s

0    10.5
1    20.3
2    30.2
3    40.0
4    50.0
dtype: float64

In [87]:
# we can use astype to change dtypes, if we want
# Converting float to int drops the fractional part (10.5 -> 10, 20.3 -> 20).
# astype returns a new Series; s itself is left as float64.

s.astype(np.int64)

0    10
1    20
2    30
3    40
4    50
dtype: int64

In [88]:
# A Series can be used as the new index: the values of s3 (123, 456, ...)
# become the labels of s, while s3's own labels are not used.
s.index = s3

In [89]:
s

123    10.5
456    20.3
789    30.2
135    40.0
246    50.0
dtype: float64

In [90]:
s.index

Index([123, 456, 789, 135, 246], dtype='int64')

# Exercise: Indexes

1. Define a series of 10 random ints (1-1,000) with index a-j.
2. Assign a-e to a new series, and f-j to a second series, but with the index a-e.
3. What happens when we add them together?
4. Change the index in the second to be cabab.
5. Add them together, using 0 as a default.
6. Change both to int16, and then multiply them by each other. Does this work?

In [91]:
# Fix the seed so the draw is reproducible, then label the ten integers a-j.
# NOTE: randint's upper bound is exclusive, so this draws from 0-999 rather
# than the 1-1,000 range stated in the exercise text.
np.random.seed(0)
s = Series(data=np.random.randint(low=0, high=1000, size=10),
           index=[*'abcdefghij'])
s

a    684
b    559
c    629
d    192
e    835
f    763
g    707
h    359
i      9
j    723
dtype: int64

In [93]:
# Assign a-e to a new series, and f-j to a second series, but with the index a-e.
# NOTE: this rebinds s2 and s3, replacing the Series of the same names that
# were defined earlier in the notebook.

s2 = s.loc['a':'e']   # label slice: both end labels are included
s3 = s.loc['f':'j']
s3.index=s2.index     # relabel f-j as a-e so the two halves share labels

In [94]:
s2

a    684
b    559
c    629
d    192
e    835
dtype: int64

In [95]:
s3

a    763
b    707
c    359
d      9
e    723
dtype: int64

In [96]:
s2 + s3

a    1447
b    1266
c     988
d     201
e    1558
dtype: int64

In [None]:
# What happens when we add them together?
# (Answered by the `s2 + s3` cell above: with identical indexes the values
# are added label by label and the result stays int64.)

In [97]:
# Change the index in the second to be cabab.
# The new labels repeat 'a' and 'b' and omit 'd' and 'e', so s3 no longer
# lines up one-to-one with s2.
s3.index = list('cabab')
s3

c    763
a    707
b    359
a      9
b    723
dtype: int64

In [98]:
# Add them together, using 0 as a default.
# Baseline first: plain `+` has no default, so labels present in only one
# Series ('d' and 'e') come out as NaN.

s2 + s3

a    1391.0
a     693.0
b     918.0
b    1282.0
c    1392.0
d       NaN
e       NaN
dtype: float64

In [99]:
# Operand order does not matter: alignment gives the same result as s2 + s3.
s3 + s2

a    1391.0
a     693.0
b     918.0
b    1282.0
c    1392.0
d       NaN
e       NaN
dtype: float64

In [100]:
s2.add(s3, fill_value=0)

a    1391.0
a     693.0
b     918.0
b    1282.0
c    1392.0
d     192.0
e     835.0
dtype: float64

In [101]:
# Change them to both be int16, and then multiply them by each other.
# Does this work?
# The unmatched labels 'd' and 'e' produce NaN, which turns the result into
# float64, so the products (e.g. 684 * 707 = 483588) are shown in full even
# though they would not fit in int16.

s2.astype(np.int16) * s3.astype(np.int16)

a    483588.0
a      6156.0
b    200681.0
b    404157.0
c    479927.0
d         NaN
e         NaN
dtype: float64

In [103]:
# With identical indexes no NaN appears, so the result stays int16 and the
# products overflow silently: 684 * 684 = 467856 does not fit in 16 bits
# and wraps around to 9104 (467856 - 7 * 65536).
s2.astype(np.int16) * s2.astype(np.int16)

a     9104
b   -15199
c     2425
d   -28672
e   -23671
dtype: int16

# Next up

1. `NaN` and friends
2. Methods with series



In [104]:
s2 + s3

a    1391.0
a     693.0
b     918.0
b    1282.0
c    1392.0
d       NaN
e       NaN
dtype: float64

In [105]:
np.nan

nan

In [106]:
# Demonstrates that the capitalised alias no longer exists: NumPy 2.0
# removed np.NaN, leaving np.nan as the spelling to use.
np.NaN

AttributeError: `np.NaN` was removed in the NumPy 2.0 release. Use `np.nan` instead.

In [107]:
# NaN is a float value, so a Series that contains it is stored as float64
# and the integers are displayed as 10.0, 20.0, ...
s = Series([10, 20, 30, np.nan, 50])
s

0    10.0
1    20.0
2    30.0
3     NaN
4    50.0
dtype: float64

In [108]:
# Demonstrates that NaN has no integer representation: converting a Series
# that contains NaN to int64 raises IntCastingNaNError.
s.astype(np.int64)

IntCastingNaNError: Cannot convert non-finite values (NA or inf) to integer

In [109]:
s = Series([10, 20, 30, 40, 50])
s

0    10
1    20
2    30
3    40
4    50
dtype: int64

In [110]:
# Demonstrates the NumPy behaviour for comparison: an integer array keeps
# its dtype, so storing NaN in it raises ValueError.
a = np.array([10, 20, 30, 40, 50])
a[2] = np.nan

ValueError: cannot convert float NaN to integer

In [111]:
# Unlike the NumPy array, the Series accepts the assignment by changing its
# own dtype from int64 to float64.
# NOTE(review): recent pandas releases may warn about this implicit dtype
# change on assignment -- confirm against the installed version.
s.loc[2] = np.nan

In [112]:
s

0    10.0
1    20.0
2     NaN
3    40.0
4    50.0
dtype: float64

In [113]:
s

0    10.0
1    20.0
2     NaN
3    40.0
4    50.0
dtype: float64

In [114]:
s.sum()

np.float64(120.0)

In [115]:
# NaN is skipped rather than counted: the mean is taken over the four
# remaining values, (10 + 20 + 40 + 50) / 4 = 30.0.
s.mean()

np.float64(30.0)

In [116]:
s.std()

np.float64(18.257418583505537)

In [124]:
s

0    10.0
1    20.0
2     NaN
3    40.0
4    50.0
dtype: float64

In [126]:
(10+20+40+50) / 4

30.0

In [127]:
# we can remove NaN values
# dropna returns a new Series and keeps the surviving labels (0, 1, 3, 4);
# s itself still contains the NaN.

s.dropna()


0    10.0
1    20.0
3    40.0
4    50.0
dtype: float64

In [128]:
help(s.dropna)

Help on method dropna in module pandas.core.series:

dropna(*, axis: 'Axis' = 0, inplace: 'bool' = False, how: 'AnyAll | None' = None, ignore_index: 'bool' = False) -> 'Series | None' method of pandas.core.series.Series instance
    Return a new Series with missing values removed.

    See the :ref:`User Guide <missing_data>` for more on which values are
    considered missing, and how to work with missing data.

    Parameters
    ----------
    axis : {0 or 'index'}
        Unused. Parameter needed for compatibility with DataFrame.
    inplace : bool, default False
        If True, do operation inplace and return None.
    how : str, optional
        Not in use. Kept for compatibility.
    ignore_index : bool, default ``False``
        If ``True``, the resulting axis will be labeled 0, 1, …, n - 1.

        .. versionadded:: 2.0.0

    Returns
    -------
    Series or None
        Series with NA entries dropped from it or None if ``inplace=True``.

    See Also
    --------
    Seri

In [129]:
s

0    10.0
1    20.0
2     NaN
3    40.0
4    50.0
dtype: float64

In [131]:
# fillna replaces each NaN with the given value in the Series it returns.
s.fillna(999)

0     10.0
1     20.0
2    999.0
3     40.0
4     50.0
dtype: float64

# Exercise: `nan` stuff

1. Create a series with 15 random ints from 0-1,000. Give it an index of a-e, repeated 3 times.
2. Replace all of the items at 'b' and 'd' with `np.nan`.
3. What are the mean + std now? What about the dtype?
4. Replace the `nan` values with the mean of the non-`nan` values.
5. What are the mean + std now?

In [132]:
# Fix the seed so the draw is reproducible, then label the fifteen integers
# with 'abcde' repeated three times.
# NOTE: randint's upper bound is exclusive, so the values fall in 0-999.
np.random.seed(0)
s = Series(data=np.random.randint(low=0, high=1000, size=15),
           index=[*('abcde' * 3)])
s

a    684
b    559
c    629
d    192
e    835
a    763
b    707
c    359
d      9
e    723
a    277
b    754
c    804
d    599
e     70
dtype: int64

In [133]:
# Replace all of the items at 'b' and 'd' with np.nan.
# Each label occurs three times, so six elements are replaced, and storing
# NaN changes the dtype from int64 to float64.
s.loc[['b', 'd']] = np.nan
s

a    684.0
b      NaN
c    629.0
d      NaN
e    835.0
a    763.0
b      NaN
c    359.0
d      NaN
e    723.0
a    277.0
b      NaN
c    804.0
d      NaN
e     70.0
dtype: float64

In [134]:
# What are the mean + std now? What about the dtype?

s.mean()

np.float64(571.5555555555555)

In [135]:
s.std()

np.float64(269.82962731653055)

In [136]:
# np.std gives a smaller value than s.std() for the same data.
# NOTE(review): this matches a difference in default degrees of freedom
# (population std, ddof=0, here versus sample std, ddof=1, for s.std()) --
# confirm against the NumPy and pandas documentation.
np.std(s)

np.float64(254.39781232074353)

In [138]:
# Replace the nan values with the mean of the non-nan values.
# s.mean() skips NaN, so every NaN is filled with 571.555556; filling with
# the mean leaves the overall mean unchanged.
s = s.fillna(s.mean())
s

a    684.000000
b    571.555556
c    629.000000
d    571.555556
e    835.000000
a    763.000000
b    571.555556
c    359.000000
d    571.555556
e    723.000000
a    277.000000
b    571.555556
c    804.000000
d    571.555556
e     70.000000
dtype: float64

In [139]:
# What are the mean + std now?

s.mean()

np.float64(571.5555555555555)