# Agenda: 
* Mask indexes
* Broadcasts and comparisons
* Boolean series - what can we do with it?
Using that to filter our series with a "boolean index" or a "mask index"
Complex comparisons with "and" and "or"

In [1]:
import numpy as np
import pandas as pd
from pandas import Series, DataFrame

In [2]:
s = Series([10, 20, 30, 40, 50, 60, 70],
           index=list('abcdefg'))

In [3]:
s

a    10
b    20
c    30
d    40
e    50
f    60
g    70
dtype: int64

In [4]:
# I can retrieve any element of the series with either .loc (based on the index) or .iloc (based on the position)


In [5]:
s.loc['d']

40

In [6]:
s.iloc[4]

50

In [7]:
# Inside of the [], I can put a list of locations that I want to retrieve
# this is known as "fancy indexing"

s.loc[['a', 'd']]

a    10
d    40
dtype: int64

In [8]:
s.iloc[[2, 5]]

c    30
f    60
dtype: int64

In [9]:
# there is another way that we can retrieve values, though
# we can pass a list of boolean values (True and False)

s.loc[ [True, False, False, True, True, False, True] ]

a    10
d    40
e    50
g    70
dtype: int64

# Boolean/mask index
The idea here is:

Pass, inside of [], a list of booleans
Wherever there is a True value, we get the value from the original series
Wherever there is a False value, the original value is ignored
This is used all of the time, but you will alm


In [10]:
s > 30    # this is a comparison operation, broadcast across all values of s

a    False
b    False
c    False
d     True
e     True
f     True
g     True
dtype: bool

In [11]:
# I can take this boolean series and use it as a mask index with .loc

s.loc[ s > 30]   # only have [] once here, because we're getting a series back from s>30

# say this as: show the values of s where s > 30

d    40
e    50
f    60
g    70
dtype: int64

In [12]:
# Let's find all of the values that are greater than the mean

s > s.mean()


a    False
b    False
c    False
d    False
e     True
f     True
g     True
dtype: bool

# Exercise: Find extreme temperatures
1. Create two series, highs and lows, containing the forecast high and low temps for the coming 10 days.
2. Find all high temps greater than the mean.
3. Are any high temps greater than the mean + 1 standard deviation?
4. Calculate the difference in temp between highs and lows.
5. Show all days in which the difference was greater than the median difference.

In [30]:
sLow = Series([12, 11, 11, 11, 10, 10, 9, 11, 9, 10],
           index='Mon Tue Wed Thu Fri Sat Sun Mon Tue Wed'.split())
sHigh = Series([19, 17, 20, 18, 18, 18, 15, 17, 14, 14],
           index='Mon Tue Wed Thu Fri Sat Sun Mon Tue Wed'.split())

In [31]:
sLow

Mon    12
Tue    11
Wed    11
Thu    11
Fri    10
Sat    10
Sun     9
Mon    11
Tue     9
Wed    10
dtype: int64

In [32]:
sHigh.describe()

count    10.000000
mean     17.000000
std       2.054805
min      14.000000
25%      15.500000
50%      17.500000
75%      18.000000
max      20.000000
dtype: float64

In [33]:
sLow.describe()


count    10.000000
mean     10.400000
std       0.966092
min       9.000000
25%      10.000000
50%      10.500000
75%      11.000000
max      12.000000
dtype: float64

In [34]:
sLow > sLow.mean()

Mon     True
Tue     True
Wed     True
Thu     True
Fri    False
Sat    False
Sun    False
Mon     True
Tue    False
Wed    False
dtype: bool

In [35]:
sHigh > sHigh.mean()+1

Mon     True
Tue    False
Wed     True
Thu    False
Fri    False
Sat    False
Sun    False
Mon    False
Tue    False
Wed    False
dtype: bool

In [36]:
sHigh.loc[sHigh > sHigh.mean()]

Mon    19
Wed    20
Thu    18
Fri    18
Sat    18
dtype: int64

In [37]:
diffs = sHigh - sLow

In [38]:
diffs

Mon    7
Tue    6
Wed    9
Thu    7
Fri    8
Sat    8
Sun    6
Mon    6
Tue    5
Wed    4
dtype: int64

In [39]:
diffs.loc[diffs > diffs.median()]

Mon    7
Wed    9
Thu    7
Fri    8
Sat    8
dtype: int64

In [None]:

sRand = Series(np.random.randint(0,1000,20), 
        index=list('abcdefghijklmnopqrst'))

In [None]:
sRand

In [91]:
sRand.mean() - sRand.std()

230.99333416273885

In [92]:
sRand.loc[sRand > sRand.mean() + sRand.std()]

b    901
l    863
q    944
dtype: int64

In [93]:
sRand.loc[sRand > sRand.mean() - sRand.std()]

b    901
c    258
d    721
e    368
g    581
h    628
j    381
k    589
l    863
m    682
n    289
p    384
q    944
r    519
s    711
t    536
dtype: int64

In [94]:
sRand.loc[sRand > sRand.mean() - sRand.std()]

b    901
c    258
d    721
e    368
g    581
h    628
j    381
k    589
l    863
m    682
n    289
p    384
q    944
r    519
s    711
t    536
dtype: int64

In [96]:
sRand.loc[sRand > sRand.mean() - sRand.std()]




b    901
c    258
d    721
e    368
g    581
h    628
j    381
k    589
l    863
m    682
n    289
p    384
q    944
r    519
s    711
t    536
dtype: int64

# Exericse: More complex comparisons
1. Create a series of 20 random ints between 0 and 1,000. The index should contain unique letters (a-t). You can generate these random numbers with np.random.randint(0, 1000, 20).
2. Find all of the values that are < mean - 1 std.
3. Find all of the values that are > mean + 1 std.
4. Find all values that are either < mean - 1 std or > mean + 1 std.
5. Find even numbers greater than the mean.
6. Find even numbers greater than the mean, and also odd numbers less than the mean.

In [110]:
import numpy as np
import pandas as pd
from pandas import Series, DataFrame

sRand = Series(np.random.randint(0,1000,20), 
        index=list('abcdefghijklmnopqrst'))


In [100]:
sRand.describe()

count     20.000000
mean     501.100000
std      285.283589
min       33.000000
25%      285.750000
50%      478.000000
75%      692.750000
max      950.000000
dtype: float64

In [103]:
sRand > sRand.mean()

a     True
b     True
c     True
d    False
e    False
f     True
g    False
h    False
i    False
j     True
k     True
l    False
m    False
n     True
o    False
p    False
q    False
r     True
s     True
t    False
dtype: bool

In [104]:
sRand.loc[sRand > sRand.mean() - sRand.std()]

a    734
b    590
c    638
d    264
e    316
f    950
h    387
i    412
j    867
k    679
l    472
m    484
n    674
o    293
r    907
s    886
dtype: int64

In [105]:
sRand.loc[sRand > sRand.mean() + sRand.std()]

f    950
j    867
r    907
s    886
dtype: int64

In [116]:
# Find all values that are either < mean - 1 std *or* > mean + 1 std.

sRand.loc[(
    (sRand < sRand.mean() - sRand.std())
    |
    (sRand > sRand.mean() + sRand.std())
)]

a    801
b     54
e     47
g    809
i    938
l     94
n    788
p    810
t      5
dtype: int64

In [117]:
# Find even numbers greater than the mean, and also odd numbers less than the mean.

s.loc[

    (((sRand%2 == 0)
     &
    (sRand > sRand.mean())))

    | 

    (((sRand%2 == 1)
     &
    (sRand < sRand.mean())))

]


d    40
e    50
f    60
dtype: int64