# Pandas series

1. Creating a series
2. Indexes (Pandas vs. NumPy)
3. `.loc` and `.iloc`
4. dtypes
5. `nan` and Pandas
6. Methods, mask indexes

In [1]:
import numpy as np
import pandas as pd
from pandas import Series, DataFrame

In [2]:
# Five multiples of ten stored as 8-bit signed integers
a = np.arange(10, 60, 10, dtype=np.int8)
a

array([10, 20, 30, 40, 50], dtype=int8)

In [3]:
# Create a series in a similar way: pass the values as a list.
# No index is given, so the labels default to 0..4 (the left-hand column of
# the output), and the integers are stored as int64.

s = Series([10, 20, 30, 40, 50])
s

0    10
1    20
2    30
3    40
4    50
dtype: int64

In [4]:
# A Series supports the familiar NumPy-style aggregation methods (min, max, mean, std, ...)

In [5]:
s.min()

np.int64(10)

In [6]:
s.max()

np.int64(50)

In [7]:
s.mean()

np.float64(30.0)

In [8]:
# Sample standard deviation: pandas defaults to ddof=1, whereas NumPy's
# ndarray.std() defaults to ddof=0, so the same data gives a different number there.
s.std()

np.float64(15.811388300841896)

In [9]:
# Retrieve values from our series with [].
# An integer inside [] is matched against the index labels; with the default
# 0..4 index, label 2 also happens to be the third position.
s[2]

np.int64(30)

In [10]:
# fancy indexing

s[ [2, 4] ]

2    30
4    50
dtype: int64

In [11]:
s[ [2 ]]

2    30
dtype: int64

In [12]:
# comparisons

s == 30

0    False
1    False
2     True
3    False
4    False
dtype: bool

In [13]:
# mask index / boolean index
s[ s == 30  ]

2    30
dtype: int64

In [15]:
s[ s <= 30] 

0    10
1    20
2    30
dtype: int64

In [16]:
# Building the series from the NumPy array `a` keeps the array's int8 dtype
# instead of the int64 used when the values came from a plain list.
s = Series(a)
s

0    10
1    20
2    30
3    40
4    50
dtype: int8

# Exercise: Pandas series

1. Create a series for the expected high temperatures in the next 10 days.
2. What are the min and max high temperatures?
3. On how many days will we have temperatures below the mean?
4. What is the mean in the first 5 days? In the last 5 days?

In [17]:
s = Series([18, 15, 12, 11, 10, 9, 12, 15, 17, 18])
s

0    18
1    15
2    12
3    11
4    10
5     9
6    12
7    15
8    17
9    18
dtype: int64

In [18]:
# 2. What are the min and max high temperatures?

s.min()

np.int64(9)

In [20]:
s.max()

np.int64(18)

In [21]:
help(s.max)  

Help on method max in module pandas.core.series:

max(
    axis: 'Axis | None' = 0,
    skipna: 'bool' = True,
    numeric_only: 'bool' = False,
    **kwargs
) method of pandas.core.series.Series instance
    Return the maximum of the values over the requested axis.

    If you want the *index* of the maximum, use ``idxmax``. This is the equivalent of the ``numpy.ndarray`` method ``argmax``.

    Parameters
    ----------
    axis : {index (0)}
        Axis for the function to be applied on.
        For `Series` this parameter is unused and defaults to 0.

        For DataFrames, specifying ``axis=None`` will apply the aggregation
        across both axes.

        .. versionadded:: 2.0.0

    skipna : bool, default True
        Exclude NA/null values when computing the result.
    numeric_only : bool, default False
        Include only float, int, boolean columns. Not implemented for Series.

    **kwargs
        Additional keyword arguments to be passed to the function.

    Returns


In [22]:
s.describe()

count    10.000
mean     13.700
std       3.335
min       9.000
25%      11.250
50%      13.500
75%      16.500
max      18.000
dtype: float64

In [23]:
s.agg(['min', 'max'])

min     9
max    18
dtype: int64

In [25]:
s.describe()[['min', 'max']]

min     9.0
max    18.0
dtype: float64

In [29]:
# 3. On how many days will we have temperatures below the mean?

# Each True in the boolean mask counts as 1 when summed
(s < s.mean()).sum()

np.int64(5)

In [31]:
# 4. What is the mean in the first 5 days? In the last 5 days?

# head(5) selects the first five rows by position
s.head(5).mean()

np.float64(13.2)

In [32]:
# tail(5) selects the last five rows of the ten-day series by position
s.tail(5).mean()

np.float64(14.2)

In [34]:
s.describe()

count    10.000
mean     13.700
std       3.335
min       9.000
25%      11.250
50%      13.500
75%      16.500
max      18.000
dtype: float64

In [35]:
s.describe()['count']

np.float64(10.0)

In [36]:
s.describe()['mean']

np.float64(13.7)

In [37]:
s.describe()[['count', 'mean']]

count    10.0
mean     13.7
dtype: float64

In [38]:
n = s.describe()['mean']

In [39]:
n

np.float64(13.7)

In [41]:
int(n) + 5

18

In [43]:
# Several aggregations must be passed together as ONE list.
# Written as s.agg('mean', 'std'), the second positional argument is taken as
# `axis`, which raised "ValueError: No axis named std for object type Series".
s.agg(['mean', 'std'])

In [44]:
# Ten multiples of ten, labelled 'a' through 'j' instead of the default 0..9
s = Series(
    range(10, 101, 10),
    index=list('abcdefghij'),
)
s

a     10
b     20
c     30
d     40
e     50
f     60
g     70
h     80
i     90
j    100
dtype: int64

# `.loc` and `.iloc`

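- `.loc` retrieves by index label; a slice such as `'d':'g'` includes its end point
- `.iloc` retrieves by integer position (starting at 0); a slice such as `2:7` excludes its end point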

In [46]:
s

a     10
b     20
c     30
d     40
e     50
f     60
g     70
h     80
i     90
j    100
dtype: int64

In [47]:
s.loc['a']

np.int64(10)

In [48]:
s.loc[['b', 'd', 'c']]

b    20
d    40
c    30
dtype: int64

In [50]:
# .loc + slices -- the end point is included!
s.loc['d':'g']

d    40
e    50
f    60
g    70
dtype: int64

In [51]:
# Assigning through .loc replaces the value stored under label 'b' in place;
# displaying s afterwards shows the changed series.
s.loc['b'] = 999
s

a     10
b    999
c     30
d     40
e     50
f     60
g     70
h     80
i     90
j    100
dtype: int64

In [52]:
s.iloc[4]

np.int64(50)

In [53]:
s.iloc[5]

np.int64(60)

In [54]:
s.iloc[[4, 5]]

e    50
f    60
dtype: int64

In [55]:
s.iloc[2:7]

c    30
d    40
e    50
f    60
g    70
dtype: int64

In [56]:
s.iloc[7] = 888

In [57]:
s

a     10
b    999
c     30
d     40
e     50
f     60
g     70
h    888
i     90
j    100
dtype: int64

In [58]:
s.tail(1)

j    100
dtype: int64

In [59]:
# Multiples of 100 whose labels run backwards, from 'j' down to 'a'
s2 = Series(
    range(100, 1001, 100),
    index=list('jihgfedcba'),
)

In [60]:
s + s

a      20
b    1998
c      60
d      80
e     100
f     120
g     140
h    1776
i     180
j     200
dtype: int64

In [61]:
s + 10   # broadcast

a      20
b    1009
c      40
d      50
e      60
f      70
g      80
h     898
i     100
j     110
dtype: int64

In [62]:
# Values are paired by index label, not by position: 'a' is 10 + 1000 even
# though s2 lists its labels in the reverse order.
s + s2

a    1010
b    1899
c     830
d     740
e     650
f     560
g     470
h    1188
i     290
j     200
dtype: int64

In [63]:
s / s2

a    0.010000
b    1.110000
c    0.037500
d    0.057143
e    0.083333
f    0.120000
g    0.175000
h    2.960000
i    0.450000
j    1.000000
dtype: float64

In [64]:
s * s2

a     10000
b    899100
c     24000
d     28000
e     30000
f     30000
g     28000
h    266400
i     18000
j     10000
dtype: int64

In [65]:
# Caution: these powers are far too large for int64, and integer overflow wraps
# around without any error, so the displayed values are NOT the true results
# (for example 10 ** 1000 is shown as 0).
s ** s2

a                      0
b    1697842202855098849
c                      0
d                      0
e                      0
f                      0
g                      0
h                      0
i                      0
j                      0
dtype: int64

# Exercise: Series + index

1. Define two series, `highs` and `lows`, with 10 days of expected temperatures. Each index label should be a string in MMDD format (so `0218` would be today).
2. What is the predicted mean difference between the high and low temperatures?
3. What is the predicted mean difference between the high and low temperatures over the first 5 days? Do this in two ways, with `.loc` and `.iloc`.


In [66]:
# Both series must carry exactly the same MMDD labels, so build the labels once
# and share them. Two hand-typed copies could drift apart, and then
# `highs - lows` would no longer line up label-for-label.
dates = [f'02{day}' for day in range(18, 28)]   # '0218' .. '0227'

highs = Series([18, 15, 12, 11, 10, 9, 11, 15, 17, 18], index=dates)
lows = Series([10, 8, 7, 5, 3, 2, 3, 6, 6, 7], index=dates)

In [67]:
# The MMDD labels can also be generated instead of typed out by hand:
# each day number from 18 to 27 is appended to the month prefix '02'.
[f'02{day}'
for day in range(18, 28)]

['0218',
 '0219',
 '0220',
 '0221',
 '0222',
 '0223',
 '0224',
 '0225',
 '0226',
 '0227']

In [69]:
# 2. What is the predicted mean difference in temperature?

(highs - lows).mean()

np.float64(7.9)

In [70]:
# 3. What is the predicted mean difference in temperature on the first 5 days? Do this in two ways, with `.loc` and `.iloc`.

# .loc slices by label and includes the end label, so :'0222' selects the
# first five days ('0218' through '0222').
(highs.loc[:'0222'] - lows.loc[:'0222']).mean()

np.float64(6.6)

In [72]:
# .iloc slices by position and excludes the stop, so :5 selects positions 0-4
# (the same first five days).
(highs.iloc[:5] - lows.iloc[:5]).mean()

np.float64(6.6)

In [73]:
(highs - lows).iloc[:5].mean()

np.float64(6.6)

In [75]:
# method chaining

(
    (highs - lows)
    .iloc[:5]
    .mean()
)

np.float64(6.6)