# <u>Using the same DataFrame as last time.

In [2]:
import pandas as pd
import numpy as np

In [3]:
from numpy.random import randn
# Seed NumPy's global random state so randn() yields the same values on every run.
np.random.seed(101)

In [4]:
# Build the 5x4 demo DataFrame of random normal values:
# rows are labelled A-E and columns are labelled W-Z.
df = pd.DataFrame(
    data=randn(5, 4),
    index=['A', 'B', 'C', 'D', 'E'],
    columns=['W', 'X', 'Y', 'Z'],
)

In [5]:
df

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


---

# <u>Conditional Selection.

An important feature of pandas is conditional selection using bracket notation, very similar to NumPy:

In [8]:
df

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [9]:
# Compares every value in the DataFrame with the scalar using the comparison operators.
# Comparing with a scalar gives a DataFrame of bool values.
# Similar to NumPy.

df > 0

Unnamed: 0,W,X,Y,Z
A,True,True,True,True
B,True,False,False,True
C,False,True,True,False
D,True,False,False,True
E,True,True,True,True


In [10]:
booldf = df > 0

In [11]:
booldf

Unnamed: 0,W,X,Y,Z
A,True,True,True,True
B,True,False,False,True
C,False,True,True,False
D,True,False,False,True
E,True,True,True,True


In [12]:
# Conditional Selection.
# Returns the original values where the mask is True and NaN where the mask is False.

df[booldf]

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,,,0.605965
C,,0.740122,0.528813,
D,0.188695,,,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [13]:
# Doing it in a single step.

df[df > 0]

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,,,0.605965
C,,0.740122,0.528813,
D,0.188695,,,0.955057
E,0.190794,1.978757,2.605967,0.683509


---

## <u>Instead of just passing in a condition on the entire DataFrame, we'll pass in a condition on a row or a column.

In [16]:
df

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


### 1.<u>Using conditional selection on a column:

In [18]:
df['W']

A    2.706850
B    0.651118
C   -2.018168
D    0.188695
E    0.190794
Name: W, dtype: float64

In [19]:
# Returns a Series of bool values, one per row (True where W is greater than 0).

df['W'] > 0

A     True
B     True
C    False
D     True
E     True
Name: W, dtype: bool

We can now use this Series of boolean values (one per row) to filter the rows based on a column's values:

In [21]:
# Returns only the rows where the condition is True.
# Whole rows are kept or dropped, so no NaN values are introduced.

df[df['W'] > 0]

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


- <u>NOTE:

    - Since we're now passing in a Series (a condition on a single column), whole rows are kept or dropped, so we don't get those NaN values anymore.
    - We only get those NaN values when we apply the condition to the entire DataFrame.

### 2.<u>Example: Grab all the rows in the DataFrame where Z is less than zero.

In [24]:
df

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [25]:
df[df['Z'] < 0]

Unnamed: 0,W,X,Y,Z
C,-2.018168,0.740122,0.528813,-0.589001


### 3.<u>We can save these operations like</u> df[df['W'] > 0] <u>to a variable and use conditional selection on them too:

In [27]:
df

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [28]:
resultdf = df[df['W'] > 0]

In [29]:
# Subset of the DataFrame df (only the rows where W is greater than 0).

resultdf

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [30]:
# Grab the X column values.

resultdf['X']

A    0.628133
B   -0.319318
D   -0.758872
E    1.978757
Name: X, dtype: float64

In [31]:
# Grab the X column values using loc method.

resultdf.loc[['A', 'B', 'D', 'E'], 'X']

A    0.628133
B   -0.319318
D   -0.758872
E    1.978757
Name: X, dtype: float64

In [32]:
# Grab the X column values using the loc method.
# NOTE We can use : notation (start:stop; a bare : selects every row) instead of passing every index label in the list.

resultdf.loc[:, 'X']

A    0.628133
B   -0.319318
D   -0.758872
E    1.978757
Name: X, dtype: float64

In [33]:
# Grab the X column values using iloc method.

resultdf.iloc[:, 1]

A    0.628133
B   -0.319318
D   -0.758872
E    1.978757
Name: X, dtype: float64

In [34]:
# Grab the X column values using the iloc method.
# NOTE Passing every index position in a list gives the same result as the : notation (start:stop) used in the previous cell.

resultdf.iloc[[0, 1, 2, 3], 1]

A    0.628133
B   -0.319318
D   -0.758872
E    1.978757
Name: X, dtype: float64

In [35]:
# Grab the X and Y column values using the loc method.
# Columns come back in the order they are listed (Y first, then X).

resultdf.loc[:, ['Y', 'X']]

Unnamed: 0,Y,X
A,0.907969,0.628133
B,-0.848077,-0.319318
D,-0.933237,-0.758872
E,2.605967,1.978757


In [36]:
# Grab the X and Y column values using the iloc method.
# A slice (start:stop) only grabs adjacent columns in their existing order;
# 1:3 selects positions 1 and 2 because the stop position is excluded.

resultdf.iloc[:, 1:3]

Unnamed: 0,X,Y
A,0.628133,0.907969
B,-0.319318,-0.848077
D,-0.758872,-0.933237
E,1.978757,2.605967


In [37]:
# Grab the X and Y column values using iloc method.
# Preferred if we don't want to grab things in order.
# [2, 1] Will grab specified columns.

resultdf.iloc[:, [2, 1]]

Unnamed: 0,Y,X
A,0.907969,0.628133
B,-0.848077,-0.319318
D,-0.933237,-0.758872
E,2.605967,1.978757


- **Do it in a single step.**

In [39]:
# Grab the X column values.

df[df['W'] > 0]['X']

A    0.628133
B   -0.319318
D   -0.758872
E    1.978757
Name: X, dtype: float64

In [40]:
# Grab the Y and X column values.

df[df['W'] > 0][['Y', 'X']]

Unnamed: 0,Y,X
A,0.907969,0.628133
B,-0.848077,-0.319318
D,-0.933237,-0.758872
E,2.605967,1.978757


In [41]:
# Grab the Y and X column values using loc method.

df[df['W'] > 0].loc[:, ['Y', 'X']]

Unnamed: 0,Y,X
A,0.907969,0.628133
B,-0.848077,-0.319318
D,-0.933237,-0.758872
E,2.605967,1.978757


In [42]:
# Grab the Y and X column values using iloc method.

df[df['W'] > 0].iloc[:, [2, 1]]

Unnamed: 0,Y,X
A,0.907969,0.628133
B,-0.848077,-0.319318
D,-0.933237,-0.758872
E,2.605967,1.978757


- **Further breaking down df[df['W'] > 0][['Y', 'X']] to better understand it:**

In [44]:
boolser = df['W'] > 0
boolser

A     True
B     True
C    False
D     True
E     True
Name: W, dtype: bool

In [45]:
result = df[boolser]
result

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [46]:
result[['Y', 'X']]

Unnamed: 0,Y,X
A,0.907969,0.628133
B,-0.848077,-0.319318
D,-0.933237,-0.758872
E,2.605967,1.978757


- <u>NOTE:

    - As we get more practice we will use more and more one liners like this: df[df['W'] > 0][['Y', 'X']] instead of doing it in multiple steps.

---

## <u>For two conditions we can use | (pipe operator) and & (ampersand) with parentheses:

In [50]:
df

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [51]:
df[df['W'] > 0]

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


What if we wanted multiple conditions, such as where W is greater than 0 and Y is greater than 1?

In [53]:
# We might try wrapping each condition in () and joining them with Python's `and` operator.
# That raises:
# ValueError: The truth value of a Series is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().
# The failing line is left commented out so the notebook still runs from top to bottom.

# df[(df['W'] > 0) and (df['Y'] > 1)]

- <u>NOTE:

    - What this error is saying is that Python's normal `and` operator cannot combine one Series of boolean values with another Series of boolean values; the `and` operator can only work with single boolean values at a time.

In [55]:
# Single booleans at a time using and operator.

True and True

True

In [56]:
# Single booleans at a time using and operator.

True and False

False

In [57]:
# Single booleans at a time using and operator.

False and False

False

In [58]:
# Single booleans at a time using or operator.

True or True

True

In [59]:
# Single booleans at a time using or operator.

True or False

True

In [60]:
# Single booleans at a time using or operator.

False or False

False

That means when we pass an entire Series of boolean values, such as the one below, the `and` operator cannot reduce it to a single True or False, so the error is raised.

In [62]:
# Entire series of boolean values.

df['W'] > 0

A     True
B     True
C    False
D     True
E     True
Name: W, dtype: bool

That's why we use | (pipe operator) and & (Ampersand) symbol:

In [64]:
df[(df['W'] > 0) & (df['Y'] > 1)]

Unnamed: 0,W,X,Y,Z
E,0.190794,1.978757,2.605967,0.683509


In [65]:
df[(df['W'] > 0) | (df['Y'] > 1)]

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


---

# <u>More Index Details.

Let's discuss some more features of indexing, including resetting the index or setting it to something else. We'll also talk about index hierarchy in Part 3.

In [68]:
df

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [69]:
# Resets to the default 0, 1, ..., n-1 integer index.
# Doesn't occur in place. Set inplace = True to affect the DataFrame.
# The old index becomes a column named 'index'.

df.reset_index(inplace = False)

Unnamed: 0,index,W,X,Y,Z
0,A,2.70685,0.628133,0.907969,0.503826
1,B,0.651118,-0.319318,-0.848077,0.605965
2,C,-2.018168,0.740122,0.528813,-0.589001
3,D,0.188695,-0.758872,-0.933237,0.955057
4,E,0.190794,1.978757,2.605967,0.683509


In [70]:
df

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [71]:
# State abbreviations.
# split() with no arguments splits at whitespace to create a list.

newind = 'CA NY WY OR CO'.split()

In [72]:
# The dimensions match: there are five items here and five rows in the DataFrame,
# so the items will line up with rows 0 to 4 of the DataFrame.

newind

['CA', 'NY', 'WY', 'OR', 'CO']

In [73]:
# Creating a new column.

df['States'] = newind

In [74]:
df

Unnamed: 0,W,X,Y,Z,States
A,2.70685,0.628133,0.907969,0.503826,CA
B,0.651118,-0.319318,-0.848077,0.605965,NY
C,-2.018168,0.740122,0.528813,-0.589001,WY
D,0.188695,-0.758872,-0.933237,0.955057,OR
E,0.190794,1.978757,2.605967,0.683509,CO


**If we have a column in our DataFrame that we want to be the index, we can go ahead and use the set_index method instead of reset_index.**

In [76]:
# inplace defaults to False, so this returns a new DataFrame indexed by 'States'
# and leaves df itself unchanged (see the next display of df).

df.set_index('States')

Unnamed: 0_level_0,W,X,Y,Z
States,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
CA,2.70685,0.628133,0.907969,0.503826
NY,0.651118,-0.319318,-0.848077,0.605965
WY,-2.018168,0.740122,0.528813,-0.589001
OR,0.188695,-0.758872,-0.933237,0.955057
CO,0.190794,1.978757,2.605967,0.683509


- <u>NOTE:

    - Overwrites the old index with the column specified in the set_index method. Unlike reset_index, the old index is not kept as a column; it is discarded from the returned DataFrame.

In [78]:
df

Unnamed: 0,W,X,Y,Z,States
A,2.70685,0.628133,0.907969,0.503826,CA
B,0.651118,-0.319318,-0.848077,0.605965,NY
C,-2.018168,0.740122,0.528813,-0.589001,WY
D,0.188695,-0.758872,-0.933237,0.955057,OR
E,0.190794,1.978757,2.605967,0.683509,CO


---