# More Indexing in Pandas

In [1]:
import pandas as pd

In [2]:
# Build the example data frame from one record per person
column_names = ['Name', 'Age', 'City', 'Occupation']
records = [
    ('Mary', 20, 'Albuquerque', 'student'),
    ('Joe', 45, 'Rio Rancho', 'doctor'),
    ('Roger', 32, 'Santa Fe', 'data scientist'),
    ('Sarah', 58, 'Taos', 'general contractor'),
    ('Mike', 39, 'Albuquerque', 'student'),
    ('Robert', 42, 'Placitas', 'manager'),
]
# Transpose the records into a column-name -> list-of-values mapping
my_dict = {name: list(values) for name, values in zip(column_names, zip(*records))}
people_df = pd.DataFrame(my_dict)
people_df

Unnamed: 0,Name,Age,City,Occupation
0,Mary,20,Albuquerque,student
1,Joe,45,Rio Rancho,doctor
2,Roger,32,Santa Fe,data scientist
3,Sarah,58,Taos,general contractor
4,Mike,39,Albuquerque,student
5,Robert,42,Placitas,manager


## Review of column and row indexing

### Using column names

In [3]:
# Column indexing
people_df['Name']

Unnamed: 0,Name
0,Mary
1,Joe
2,Roger
3,Sarah
4,Mike
5,Robert


In [4]:
# Column indexing
people_df[['Name']]

Unnamed: 0,Name
0,Mary
1,Joe
2,Roger
3,Sarah
4,Mike
5,Robert


In [5]:
# Multiple column indexing
people_df[['Name', 'Age']]

Unnamed: 0,Name,Age
0,Mary,20
1,Joe,45
2,Roger,32
3,Sarah,58
4,Mike,39
5,Robert,42


In [9]:
# Multiple column indexing
people_df[['Name', 'Age','Name']]

Unnamed: 0,Name,Age,Name.1
0,Mary,20,Mary
1,Joe,45,Joe
2,Roger,32,Roger
3,Sarah,58,Sarah
4,Mike,39,Mike
5,Robert,42,Robert


In [17]:
# Slice the column labels with a step of -2: start at the last column and
# take every second label going backwards
name_index = people_df.columns[::-2]
name_index

Index(['Occupation', 'Age'], dtype='object')

In [18]:
people_df[ name_index ]

Unnamed: 0,Occupation,Age
0,student,20
1,doctor,45
2,data scientist,32
3,general contractor,58
4,student,39
5,manager,42


### Columns with iloc indices

In [6]:
# Column indexing using .iloc - indexing
people_df.iloc[:,0]

Unnamed: 0,Name
0,Mary
1,Joe
2,Roger
3,Sarah
4,Mike
5,Robert


In [7]:
# Column indexing using .iloc - slicing
people_df.iloc[:,0::2]

Unnamed: 0,Name,City
0,Mary,Albuquerque
1,Joe,Rio Rancho
2,Roger,Santa Fe
3,Sarah,Taos
4,Mike,Albuquerque
5,Robert,Placitas


In [8]:
# Column indexing using .iloc - list
people_df.iloc[:,[1,0,3,2,3]]

Unnamed: 0,Age,Name,Occupation,City,Occupation.1
0,20,Mary,student,Albuquerque,student
1,45,Joe,doctor,Rio Rancho,doctor
2,32,Roger,data scientist,Santa Fe,data scientist
3,58,Sarah,general contractor,Taos,general contractor
4,39,Mike,student,Albuquerque,student
5,42,Robert,manager,Placitas,manager


### Rows with indices


In [19]:
# Row indexing using .loc
people_df.loc[1]

Unnamed: 0,1
Name,Joe
Age,45
City,Rio Rancho
Occupation,doctor


In [20]:
# Row indexing using .iloc
people_df.iloc[1] # .loc and .iloc are the same for this data frame because the index has not been changed

Unnamed: 0,1
Name,Joe
Age,45
City,Rio Rancho
Occupation,doctor


In [21]:
# Multiple rows - slicing
people_df.loc[0:2]

Unnamed: 0,Name,Age,City,Occupation
0,Mary,20,Albuquerque,student
1,Joe,45,Rio Rancho,doctor
2,Roger,32,Santa Fe,data scientist


In [22]:
# .iloc slices by position and excludes the stop, so this returns rows 0 and 1 only
# (.loc[0:2] slices by label and includes the row labeled 2)
people_df.iloc[0:2]

Unnamed: 0,Name,Age,City,Occupation
0,Mary,20,Albuquerque,student
1,Joe,45,Rio Rancho,doctor


In [23]:
# Selecting rows in any order - list
people_df.loc[[0,3,3,2,1]]

Unnamed: 0,Name,Age,City,Occupation
0,Mary,20,Albuquerque,student
3,Sarah,58,Taos,general contractor
3,Sarah,58,Taos,general contractor
2,Roger,32,Santa Fe,data scientist
1,Joe,45,Rio Rancho,doctor


### Indexing for a single entry

In [24]:
# Row and column indexing
people_df['Name'][0]

'Mary'

In [25]:
# Another way
people_df['Name'].loc[0]

'Mary'

In [26]:
# Another way
people_df.loc[0]['Name']

'Mary'

In [27]:
# Another way
people_df.loc[0].loc['Name']

'Mary'

In [28]:
# Another way
people_df.loc[0,'Name']

'Mary'

In [29]:
# Another way
people_df.iloc[0,0]

'Mary'

In [30]:
# Another way
people_df.iloc[0].iloc[0]

'Mary'

In [31]:
# Another way
people_df.iloc[0].loc["Name"]

'Mary'

## Attribute Access

> WARNING: Do Not Use This

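You can access a column of a DataFrame as an attribute (e.g., `people_df.Name`), but bracket indexing (`people_df['Name']`) is preferred. Attribute access does not work when the column name is not a valid Python identifier (e.g., a number such as 1, or a name containing a space) or when it collides with an existing DataFrame attribute or method (e.g., `min`).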

In [32]:
people_df

Unnamed: 0,Name,Age,City,Occupation
0,Mary,20,Albuquerque,student
1,Joe,45,Rio Rancho,doctor
2,Roger,32,Santa Fe,data scientist
3,Sarah,58,Taos,general contractor
4,Mike,39,Albuquerque,student
5,Robert,42,Placitas,manager


In [33]:
people_df.Name

Unnamed: 0,Name
0,Mary
1,Joe
2,Roger
3,Sarah
4,Mike
5,Robert


In [34]:
people_df.Name.loc[0]

'Mary'

## Boolean Indexing

In [35]:
# NOTE: the name `filter` shadows Python's built-in filter() for the rest of the session;
# a descriptive name such as `over_40` would avoid that
filter = ( people_df['Age'] > 40 ) # Create a filter
filter

Unnamed: 0,Age
0,False
1,True
2,False
3,True
4,False
5,True


In [36]:
people_df[ filter ] # Use that filter to index

Unnamed: 0,Name,Age,City,Occupation
1,Joe,45,Rio Rancho,doctor
3,Sarah,58,Taos,general contractor
5,Robert,42,Placitas,manager


In [37]:
# Another example - people from Albuquerque
filter = ( people_df['City'] == 'Albuquerque' )
filter


Unnamed: 0,City
0,True
1,False
2,False
3,False
4,True
5,False


In [38]:
people_df[ filter ] # Use that filter to index

Unnamed: 0,Name,Age,City,Occupation
0,Mary,20,Albuquerque,student
4,Mike,39,Albuquerque,student


In [39]:
# A third example! People from Albuquerque or people who are doctors
filter1 = ( people_df['City'] == 'Albuquerque' )
filter1


Unnamed: 0,City
0,True
1,False
2,False
3,False
4,True
5,False


In [40]:
filter2 = ( people_df['Occupation'] == 'doctor' )
filter2


Unnamed: 0,Occupation
0,False
1,True
2,False
3,False
4,False
5,False


In [41]:
filter = filter1 | filter2
filter


Unnamed: 0,0
0,True
1,True
2,False
3,False
4,True
5,False


In [42]:
# verify
pd.concat([filter1, filter2, filter, people_df], axis='columns')


Unnamed: 0,City,Occupation,0,Name,Age,City.1,Occupation.1
0,True,False,True,Mary,20,Albuquerque,student
1,False,True,True,Joe,45,Rio Rancho,doctor
2,False,False,False,Roger,32,Santa Fe,data scientist
3,False,False,False,Sarah,58,Taos,general contractor
4,True,False,True,Mike,39,Albuquerque,student
5,False,False,False,Robert,42,Placitas,manager


In [43]:
people_df[ filter ] # Use that filter to index


Unnamed: 0,Name,Age,City,Occupation
0,Mary,20,Albuquerque,student
1,Joe,45,Rio Rancho,doctor
4,Mike,39,Albuquerque,student


In [44]:
# Negating
filter = ( people_df['Name'] == 'Mary' )
filter


Unnamed: 0,Name
0,True
1,False
2,False
3,False
4,False
5,False


In [45]:
# verify
pd.concat([filter, ~filter, people_df], axis='columns')


Unnamed: 0,Name,Name.1,Name.2,Age,City,Occupation
0,True,False,Mary,20,Albuquerque,student
1,False,True,Joe,45,Rio Rancho,doctor
2,False,True,Roger,32,Santa Fe,data scientist
3,False,True,Sarah,58,Taos,general contractor
4,False,True,Mike,39,Albuquerque,student
5,False,True,Robert,42,Placitas,manager


In [46]:
people_df[ ~filter ] # Use that filter to index


Unnamed: 0,Name,Age,City,Occupation
1,Joe,45,Rio Rancho,doctor
2,Roger,32,Santa Fe,data scientist
3,Sarah,58,Taos,general contractor
4,Mike,39,Albuquerque,student
5,Robert,42,Placitas,manager


In [47]:
people_df[people_df['Name'] != 'Mary']

Unnamed: 0,Name,Age,City,Occupation
1,Joe,45,Rio Rancho,doctor
2,Roger,32,Santa Fe,data scientist
3,Sarah,58,Taos,general contractor
4,Mike,39,Albuquerque,student
5,Robert,42,Placitas,manager


## Indexing with `isin`

In [48]:
filter = ( people_df['City'].isin(['Rio Rancho', 'Santa Fe', 'Taos']) ) # Create a filter
filter

Unnamed: 0,City
0,False
1,True
2,True
3,True
4,False
5,False


In [49]:
# verify
pd.concat( [ filter, people_df ], axis='columns')


Unnamed: 0,City,Name,Age,City.1,Occupation
0,False,Mary,20,Albuquerque,student
1,True,Joe,45,Rio Rancho,doctor
2,True,Roger,32,Santa Fe,data scientist
3,True,Sarah,58,Taos,general contractor
4,False,Mike,39,Albuquerque,student
5,False,Robert,42,Placitas,manager


In [50]:
people_df[ filter ] # Use that filter to index


Unnamed: 0,Name,Age,City,Occupation
1,Joe,45,Rio Rancho,doctor
2,Roger,32,Santa Fe,data scientist
3,Sarah,58,Taos,general contractor


## Numpy Where
NumPy's `np.where` function is useful for finding the integer positions of the rows that satisfy a condition.

In [51]:
import numpy as np


In [52]:
filter = ( people_df['City'] == 'Albuquerque' )
filter


Unnamed: 0,City
0,True
1,False
2,False
3,False
4,True
5,False


In [53]:
# Called with only a condition, np.where returns a tuple of index arrays (one per dimension);
# element [0] holds the integer positions where the filter is True
where_index = np.where( filter )
where_index[0]

array([0, 4])

In [54]:
people_df

Unnamed: 0,Name,Age,City,Occupation
0,Mary,20,Albuquerque,student
1,Joe,45,Rio Rancho,doctor
2,Roger,32,Santa Fe,data scientist
3,Sarah,58,Taos,general contractor
4,Mike,39,Albuquerque,student
5,Robert,42,Placitas,manager


In [55]:
people_df.iloc[ where_index[0] ]

Unnamed: 0,Name,Age,City,Occupation
0,Mary,20,Albuquerque,student
4,Mike,39,Albuquerque,student


In [56]:
# NOTE: .drop() removes rows by index *label*, while where_index[0] holds integer *positions*;
# the two coincide here only because people_df still has its default 0..n-1 index
people_df.drop( where_index[0] )

Unnamed: 0,Name,Age,City,Occupation
1,Joe,45,Rio Rancho,doctor
2,Roger,32,Santa Fe,data scientist
3,Sarah,58,Taos,general contractor
5,Robert,42,Placitas,manager


In [57]:
# Drop the Albuquerque rows and keep the result.
# np.where returns integer positions, but .drop() expects index labels, so translate
# positions to labels through people_df.index; this stays correct even when the
# index is not the default 0..n-1 range.
people_df_no_abq = people_df.drop( people_df.index[ where_index[0] ] )
people_df_no_abq

Unnamed: 0,Name,Age,City,Occupation
1,Joe,45,Rio Rancho,doctor
2,Roger,32,Santa Fe,data scientist
3,Sarah,58,Taos,general contractor
5,Robert,42,Placitas,manager


In [58]:
# where_index was computed on people_df; after the drop, position 0 of
# people_df_no_abq is a different row (label 1), so the old positions are stale
people_df_no_abq.iloc[ where_index[0][0] ]

Unnamed: 0,1
Name,Joe
Age,45
City,Rio Rancho
Occupation,doctor


In [65]:
people_df_no_abq.reset_index()

Unnamed: 0,index,Name,Age,City,Occupation
0,1,Joe,45,Rio Rancho,doctor
1,2,Roger,32,Santa Fe,data scientist
2,3,Sarah,58,Taos,general contractor
3,5,Robert,42,Placitas,manager


In [60]:
people_df_no_abq

Unnamed: 0,Name,Age,City,Occupation
1,Joe,45,Rio Rancho,doctor
2,Roger,32,Santa Fe,data scientist
3,Sarah,58,Taos,general contractor
5,Robert,42,Placitas,manager


In [61]:
people_df_no_abq.reset_index( drop = True )

Unnamed: 0,Name,Age,City,Occupation
0,Joe,45,Rio Rancho,doctor
1,Roger,32,Santa Fe,data scientist
2,Sarah,58,Taos,general contractor
3,Robert,42,Placitas,manager


In [66]:
# inplace=True modifies people_df_no_abq directly (the call itself returns None),
# so the frame is displayed on its own line afterwards
people_df_no_abq.reset_index(drop = True, inplace = True)
people_df_no_abq

Unnamed: 0,Name,Age,City,Occupation
0,Joe,45,Rio Rancho,doctor
1,Roger,32,Santa Fe,data scientist
2,Sarah,58,Taos,general contractor
3,Robert,42,Placitas,manager


## Random Sampling in Pandas

In [67]:
people_df

Unnamed: 0,Name,Age,City,Occupation
0,Mary,20,Albuquerque,student
1,Joe,45,Rio Rancho,doctor
2,Roger,32,Santa Fe,data scientist
3,Sarah,58,Taos,general contractor
4,Mike,39,Albuquerque,student
5,Robert,42,Placitas,manager


In [73]:
# Take a random sample of size 3
people_df.sample( n = 3, )

Unnamed: 0,Name,Age,City,Occupation
1,Joe,45,Rio Rancho,doctor
0,Mary,20,Albuquerque,student
3,Sarah,58,Taos,general contractor


In [79]:
# Take a random proportion
people_df.sample( frac = 1/3 )

Unnamed: 0,Name,Age,City,Occupation
1,Joe,45,Rio Rancho,doctor
5,Robert,42,Placitas,manager


In [100]:
# Draw one random row from each City group
people_df.groupby('City').sample(n = 1)

Unnamed: 0,Name,Age,City,Occupation
4,Mike,39,Albuquerque,student
5,Robert,42,Placitas,manager
1,Joe,45,Rio Rancho,doctor
2,Roger,32,Santa Fe,data scientist
3,Sarah,58,Taos,general contractor


In [101]:
# Take a random sample but set the seed for reproducibility
people_df.sample(n = 3, random_state = 0)

Unnamed: 0,Name,Age,City,Occupation
5,Robert,42,Placitas,manager
2,Roger,32,Santa Fe,data scientist
1,Joe,45,Rio Rancho,doctor
