# Pandas Filtering Walkthrough

In [1]:
import numpy as np
import pandas as pd

In [21]:
# Draw from a seeded, local generator so the random frame (and every output
# derived from it) is identical after Restart Kernel -> Run All.
rng = np.random.RandomState(0)

# Six consecutive daily timestamps, used as the row index of df.
dates = pd.date_range('20130101', periods=6)

# 6x4 frame of standard-normal draws with columns A-D.
df = pd.DataFrame(rng.randn(6, 4), index=dates, columns=list('ABCD'))

# Small frame with one column per dtype; the scalar entries are broadcast to
# the four rows given by the index of column C.
df2 = pd.DataFrame({
    'A': 1.,
    'B': pd.Timestamp('20130102'),
    'C': pd.Series(1, index=list(range(4)), dtype='float32'),
    'D': np.array([3]*4, dtype='int32'),
    'E': pd.Categorical(['test', 'train', 'test', 'train']),
    'F': 'foo'
})

### Getting

In [3]:
# Select one column with [] and its label; the result below is a Series named A.
df['A']

2013-01-01   -0.631963
2013-01-02   -0.520628
2013-01-03   -1.623383
2013-01-04   -0.102989
2013-01-05   -1.696310
2013-01-06   -0.099953
Freq: D, Name: A, dtype: float64

In [4]:
# Attribute access to column A; the output is the same Series as df['A'].
df.A

2013-01-01   -0.631963
2013-01-02   -0.520628
2013-01-03   -1.623383
2013-01-04   -0.102989
2013-01-05   -1.696310
2013-01-06   -0.099953
Freq: D, Name: A, dtype: float64

In [5]:
# An integer slice inside [] selects rows by position: here the first three rows.
df[0:3]

Unnamed: 0,A,B,C,D
2013-01-01,-0.631963,-1.454939,0.725221,0.281909
2013-01-02,-0.520628,-1.142348,0.028926,0.339182
2013-01-03,-1.623383,-0.833818,-0.127153,1.384191


In [6]:
# A slice of date strings inside [] selects rows by index label;
# both endpoints are included in the result.
df['20130102':'20130104']

Unnamed: 0,A,B,C,D
2013-01-02,-0.520628,-1.142348,0.028926,0.339182
2013-01-03,-1.623383,-0.833818,-0.127153,1.384191
2013-01-04,-0.102989,-0.534526,0.437519,0.556764


### Selection by label

In [7]:
# .loc with a single row label returns that row as a Series indexed by the column names.
df.loc['2013-01-01']

A   -0.631963
B   -1.454939
C    0.725221
D    0.281909
Name: 2013-01-01 00:00:00, dtype: float64

In [8]:
# All rows (:), restricted to the listed column labels.
df.loc[:, ['A', 'B']]

Unnamed: 0,A,B
2013-01-01,-0.631963,-1.454939
2013-01-02,-0.520628,-1.142348
2013-01-03,-1.623383,-0.833818
2013-01-04,-0.102989,-0.534526
2013-01-05,-1.69631,1.074822
2013-01-06,-0.099953,-0.056545


In [9]:
# Label slice on the rows (both endpoints included) combined with a list of columns.
df.loc['20130102':'20130104', ['A', 'B']]

Unnamed: 0,A,B
2013-01-02,-0.520628,-1.142348
2013-01-03,-1.623383,-0.833818
2013-01-04,-0.102989,-0.534526


In [10]:
# A single row label plus a column list reduces the result to a Series.
df.loc['20130102', ['A', 'B']]

A   -0.520628
B   -1.142348
Name: 2013-01-02 00:00:00, dtype: float64

In [11]:
# A row label and a column label together return one scalar value.
df.loc[dates[0], 'A']

-0.631962864551791

### Selection by position

In [12]:
# .iloc with a single integer returns the row at that 0-based position (here the fourth row).
df.iloc[3]

A   -0.102989
B   -0.534526
C    0.437519
D    0.556764
Name: 2013-01-04 00:00:00, dtype: float64

In [13]:
# Positional slices exclude the stop index: rows 3-4 and columns 0-1.
df.iloc[3:5, 0:2]

Unnamed: 0,A,B
2013-01-04,-0.102989,-0.534526
2013-01-05,-1.69631,1.074822


In [14]:
# Rows at positions 1 and 2, every column.
df.iloc[1:3, :]

Unnamed: 0,A,B,C,D
2013-01-02,-0.520628,-1.142348,0.028926,0.339182
2013-01-03,-1.623383,-0.833818,-0.127153,1.384191


### Selection by dtype

In [19]:
# One column per dtype, so that select_dtypes has several kinds to choose from.
# Note: the 'dates' column starts at the current time, so its values differ per run.
df3 = pd.DataFrame(
    {
        'string': ['a', 'b', 'c'],
        'int64': [1, 2, 3],
        'uint8': np.array([3, 4, 5], dtype='uint8'),
        'float64': np.array([4.0, 5.0, 6.0]),
        'bool1': [True, False, True],
        'bool2': [False, True, False],
        'dates': pd.date_range('now', periods=3),
        'category': pd.Series(['A', 'B', 'C']).astype('category'),
    }
)

In [20]:
# Keep only the columns whose dtype is bool (bool1 and bool2 in the output below).
df3.select_dtypes(include=[bool])

Unnamed: 0,bool1,bool2
0,True,False
1,False,True
2,True,False


### Boolean indexing

In [22]:
# Boolean mask: keep only the rows where column A is positive.
df[df['A'] > 0]

Unnamed: 0,A,B,C,D
2013-01-02,2.578148,-0.857857,-1.161789,1.492685
2013-01-03,0.398778,1.825516,-0.093756,-0.4384


In [23]:
# Build a copy of df with an extra label column E, leaving df itself untouched.
# Note: this rebinds the name df2 to the new frame.
df2 = df.assign(E=['one', 'one', 'two', 'three', 'four', 'three'])

In [24]:
# isin() builds a boolean mask that is True where E is one of the listed values.
df2[df2['E'].isin(['two', 'four'])]

Unnamed: 0,A,B,C,D,E
2013-01-03,0.398778,1.825516,-0.093756,-0.4384,two
2013-01-05,-0.764497,-0.178624,-0.4634,1.421112,four


In [25]:
# Set a single cell by integer position: row 0, column 1 (column B).
df.iat[0, 1] = 0

In [26]:
# Display df; row 2013-01-01 of column B now reads 0.0.
df

Unnamed: 0,A,B,C,D
2013-01-01,-1.355904,0.0,0.565386,2.391247
2013-01-02,2.578148,-0.857857,-1.161789,1.492685
2013-01-03,0.398778,1.825516,-0.093756,-0.4384
2013-01-04,-1.585992,-1.232289,0.853673,1.064691
2013-01-05,-0.764497,-0.178624,-0.4634,1.421112
2013-01-06,-0.154666,-0.150389,0.383298,0.224872


In [27]:
# Write to row 0, column 1 by integer position through .iloc, then show the result.
df.iloc[0, 1] = 2
df

Unnamed: 0,A,B,C,D
2013-01-01,-1.355904,2.0,0.565386,2.391247
2013-01-02,2.578148,-0.857857,-1.161789,1.492685
2013-01-03,0.398778,1.825516,-0.093756,-0.4384
2013-01-04,-1.585992,-1.232289,0.853673,1.064691
2013-01-05,-0.764497,-0.178624,-0.4634,1.421112
2013-01-06,-0.154666,-0.150389,0.383298,0.224872


In [28]:
# Set a single cell by row label and column label.
df.at[dates[0], 'A'] = 0

In [29]:
# Display df; the first row of column A now reads 0.0.
df

Unnamed: 0,A,B,C,D
2013-01-01,0.0,2.0,0.565386,2.391247
2013-01-02,2.578148,-0.857857,-1.161789,1.492685
2013-01-03,0.398778,1.825516,-0.093756,-0.4384
2013-01-04,-1.585992,-1.232289,0.853673,1.064691
2013-01-05,-0.764497,-0.178624,-0.4634,1.421112
2013-01-06,-0.154666,-0.150389,0.383298,0.224872


In [30]:
# Label-based scalar assignment through .loc, followed by the updated frame.
df.loc[dates[0], 'A'] = 2
df

Unnamed: 0,A,B,C,D
2013-01-01,2.0,2.0,0.565386,2.391247
2013-01-02,2.578148,-0.857857,-1.161789,1.492685
2013-01-03,0.398778,1.825516,-0.093756,-0.4384
2013-01-04,-1.585992,-1.232289,0.853673,1.064691
2013-01-05,-0.764497,-0.178624,-0.4634,1.421112
2013-01-06,-0.154666,-0.150389,0.383298,0.224872


In [34]:
# Replace column D with an integer array of the same length as the frame.
# Plain df['D'] = ... swaps in the new column, so D becomes an integer column
# (shown as 5 in the output below) on every pandas version.
# df.loc[:, 'D'] = ... on pandas >= 2.0 writes into the existing float64 column
# instead, which would display 5.0.
df['D'] = np.array([5] * len(df))

In [35]:
# Display df; every value in column D is now 5.
df

Unnamed: 0,A,B,C,D
2013-01-01,2.0,2.0,0.565386,5
2013-01-02,2.578148,-0.857857,-1.161789,5
2013-01-03,0.398778,1.825516,-0.093756,5
2013-01-04,-1.585992,-1.232289,0.853673,5
2013-01-05,-0.764497,-0.178624,-0.4634,5
2013-01-06,-0.154666,-0.150389,0.383298,5
