## Selecting Columns

In [1]:
import pandas as pd
import janitor
import numpy as np
import datetime
import re
from janitor import patterns
from pandas.api.types import is_datetime64_dtype

In [2]:
# Demo frame mixing integer, float, string, categorical and datetime columns,
# so that every selection mechanism shown below has something to match.
df = pd.DataFrame(
    dict(
        id=[0, 1],
        Name=["ABC", "XYZ"],
        code=[1, 2],
        code1=[4, np.nan],
        code2=["8", 5],
        type=["S", "R"],
        type1=["E", np.nan],
        type2=["T", "U"],
        code3=pd.Series(["a", "b"], dtype="category"),
        # The same day given twice: once as numpy datetime64, once as a
        # standard-library datetime.
        type3=pd.to_datetime(
            [np.datetime64("2018-01-01"), datetime.datetime(2018, 1, 1)]
        ),
    )
)

df



Unnamed: 0,id,Name,code,code1,code2,type,type1,type2,code3,type3
0,0,ABC,1,4.0,8,S,E,T,a,2018-01-01
1,1,XYZ,2,,5,R,,U,b,2018-01-01


- Select by string:

In [3]:
# A single label returns a one-column DataFrame holding just "id".
df.select_columns("id")

Unnamed: 0,id
0,0
1,1


- Selecting via shell-like glob strings (`*`) is also possible:

In [4]:
# "type*" matches type, type1, type2 and type3.
df.select_columns("type*")

Unnamed: 0,type,type1,type2,type3
0,S,E,T,2018-01-01
1,R,,U,2018-01-01


- Select by slice:

In [5]:
# Label-based slice: both endpoints are included (code1 through type1).
df.select_columns(slice("code1", "type1"))

Unnamed: 0,code1,code2,type,type1
0,4.0,8,S,E
1,,5,R,


- Select by `Callable` (the callable is applied to every column and should return a single `True` or `False` per column):

In [6]:
# The dtype predicate is evaluated per column; only type3 passes.
df.select_columns(is_datetime64_dtype)

Unnamed: 0,type3
0,2018-01-01
1,2018-01-01


In [7]:
# `x` is a column, so `x.name` is its label: keep columns whose label starts
# with "code" or ends with "1".
df.select_columns(lambda x: x.name.startswith("code") or
                            x.name.endswith("1"))

Unnamed: 0,code,code1,code2,type1,code3
0,1,4.0,8,E,a
1,2,,5,,b


In [8]:
# Keep only the columns that contain at least one missing value.
df.select_columns(lambda x: x.isna().any())

Unnamed: 0,code1,type1
0,4.0,E
1,,


- Select by regular expression:

In [9]:
# The pattern matches one or more digits; per the output below, any label
# containing a digit is selected.
df.select_columns(re.compile("\\d+"))

Unnamed: 0,code1,code2,type1,type2,code3,type3
0,4.0,8,E,T,a,2018-01-01
1,,5,,U,b,2018-01-01


In [10]:
# Same selection as the previous cell, but the pattern is built with
# janitor.patterns, which is simply a wrapper around re.compile.
# NOTE(review): the saved output echoes this call before the table, which looks
# like the tail of a warning — confirm whether `patterns` is deprecated in the
# installed pyjanitor and, if so, prefer re.compile.

df.select_columns(patterns("\\d+"))


  df.select_columns(patterns("\\d+"))


Unnamed: 0,code1,code2,type1,type2,code3,type3
0,4.0,8,E,T,a,2018-01-01
1,,5,,U,b,2018-01-01


- Select a combination of the above (you can combine any of the previous options):

In [11]:
# The three selectors overlap (code, code1, code2 match more than one), yet
# each column appears only once in the result.
df.select_columns("id", "code*", slice("code", "code2"))

Unnamed: 0,id,code,code1,code2,code3
0,0,1,4.0,8,a
1,1,2,,5,b


- You can also pass a sequence of booleans:

In [12]:
# One flag per column, in column order (ten flags for ten columns).
df.select_columns([True, False, True, True, True,
                   False, False, False, True, False])

Unnamed: 0,id,code,code1,code2,code3
0,0,1,4.0,8,a
1,1,2,,5,b


- Setting `invert` to `True` returns the complement of the columns provided:

In [13]:
# Same selectors as the combination example; invert=True returns every
# column they do NOT match.
df.select_columns("id", "code*", slice("code", "code2"),
                  invert = True)

Unnamed: 0,Name,type,type1,type2,type3
0,ABC,S,E,T,2018-01-01
1,XYZ,R,,U,2018-01-01


In [14]:
# Three-row frame whose columns are then replaced by a two-level header:
# A/B/C on the outer level paired with D/E/F on the inner level.
df = pd.DataFrame(
    {
        "A": dict(enumerate("abc")),
        "B": dict(enumerate([1, 3, 5])),
        "C": dict(enumerate([2, 4, 6])),
    }
)

outer_labels, inner_labels = list("ABC"), list("DEF")
df.columns = [outer_labels, inner_labels]

df


Unnamed: 0_level_0,A,B,C
Unnamed: 0_level_1,D,E,F
0,a,1,2
1,b,3,4
2,c,5,6


In [15]:
# Slice endpoints are full column tuples. The start label comes after the
# stop label, and the output lists the columns in that reversed order (C, B).
df.select_columns(slice(('C','F'),('B','E')))

Unnamed: 0_level_0,C,B
Unnamed: 0_level_1,F,E
0,2,1
1,4,3
2,6,5


In [16]:
# Bounds for the date range: calendar year 2011.
start = datetime.datetime(2011, 1, 1)
end = datetime.datetime(2012, 1, 1)

# Business-month-end dates. The offset object is passed instead of the "BM"
# string alias, which recent pandas releases deprecate in favour of "BME";
# the object form is accepted by old and new versions alike.
rng = pd.date_range(start, end, freq=pd.offsets.BMonthEnd())

# Seeded generator so the values are identical on every Restart & Run All.
ts = pd.Series(np.random.default_rng(0).standard_normal(len(rng)), index=rng)

In [17]:
# Display the series built in the previous cell.
ts

2011-01-31    0.335450
2011-02-28   -0.337035
2011-03-31    0.040042
2011-04-29    0.304694
2011-05-31   -1.010589
2011-06-30    0.645795
2011-07-29   -0.150588
2011-08-31    1.612465
2011-09-30    0.208183
2011-10-31   -2.101096
2011-11-30    0.843838
2011-12-30    0.403690
Freq: BM, dtype: float64

In [18]:
# Spread the series into a wide frame with one column per timestamp.
# `columns` is passed by keyword because positional arguments to
# DataFrame.pivot are deprecated and rejected by pandas 2.x. `index` is left
# out so the existing row index is kept (an explicit None is treated as a
# label by newer pandas). droplevel removes the outer column level that pivot
# adds for the value column.
dummy = ts.reset_index().pivot(columns="index").droplevel(level=0, axis=1)
dummy


index,2011-01-31,2011-02-28,2011-03-31,2011-04-29,2011-05-31,2011-06-30,2011-07-29,2011-08-31,2011-09-30,2011-10-31,2011-11-30,2011-12-30
0,0.33545,,,,,,,,,,,
1,,-0.337035,,,,,,,,,,
2,,,0.040042,,,,,,,,,
3,,,,0.304694,,,,,,,,
4,,,,,-1.010589,,,,,,,
5,,,,,,0.645795,,,,,,
6,,,,,,,-0.150588,,,,,
7,,,,,,,,1.612465,,,,
8,,,,,,,,,0.208183,,,
9,,,,,,,,,,-2.101096,,


In [19]:
# Month-resolution strings slice the datetime column labels; the stop month is
# included in full (2011-11-30 is kept, 2011-12-30 is dropped).
dummy.loc[:, '2011-01':'2011-11']

index,2011-01-31,2011-02-28,2011-03-31,2011-04-29,2011-05-31,2011-06-30,2011-07-29,2011-08-31,2011-09-30,2011-10-31,2011-11-30
0,0.33545,,,,,,,,,,
1,,-0.337035,,,,,,,,,
2,,,0.040042,,,,,,,,
3,,,,0.304694,,,,,,,
4,,,,,-1.010589,,,,,,
5,,,,,,0.645795,,,,,
6,,,,,,,-0.150588,,,,
7,,,,,,,,1.612465,,,
8,,,,,,,,,0.208183,,
9,,,,,,,,,,-2.101096,


In [20]:
# 100,000 rows of noise at one-minute spacing, starting 2013-01-01.
# "min" is the minute alias accepted by both old and new pandas; the
# single-letter "T" alias is deprecated in recent releases.
# The generator is seeded so the values are the same on every run.
dft = pd.DataFrame(
    np.random.default_rng(0).standard_normal((100000, 1)),
    columns=["A"],
    index=pd.date_range("20130101", periods=100000, freq="min"),
)

dft

Unnamed: 0,A
2013-01-01 00:00:00,-2.711903
2013-01-01 00:01:00,-0.043594
2013-01-01 00:02:00,0.708947
2013-01-01 00:03:00,0.828016
2013-01-01 00:04:00,0.949109
...,...
2013-03-11 10:35:00,-0.208556
2013-03-11 10:36:00,-2.074058
2013-03-11 10:37:00,0.175706
2013-03-11 10:38:00,1.271932


In [21]:
# Keep a handle on the minute-frequency index for the lookups that follow.
ind = dft.index
ind

DatetimeIndex(['2013-01-01 00:00:00', '2013-01-01 00:01:00',
               '2013-01-01 00:02:00', '2013-01-01 00:03:00',
               '2013-01-01 00:04:00', '2013-01-01 00:05:00',
               '2013-01-01 00:06:00', '2013-01-01 00:07:00',
               '2013-01-01 00:08:00', '2013-01-01 00:09:00',
               ...
               '2013-03-11 10:30:00', '2013-03-11 10:31:00',
               '2013-03-11 10:32:00', '2013-03-11 10:33:00',
               '2013-03-11 10:34:00', '2013-03-11 10:35:00',
               '2013-03-11 10:36:00', '2013-03-11 10:37:00',
               '2013-03-11 10:38:00', '2013-03-11 10:39:00'],
              dtype='datetime64[ns]', length=100000, freq='T')

In [22]:
# A year-only string resolves to a slice; here it spans every row because the
# whole index falls in 2013.
ind.get_loc('2013')

slice(0, 100000, None)

In [45]:
# Label slice with a step: from the start of January 2013 up to the moment the
# cell runs, keeping every second row.
dft.loc[slice('2013-1',pd.Timestamp.today(), 2)]

Unnamed: 0,A
2013-01-01 00:00:00,-2.711903
2013-01-01 00:02:00,0.708947
2013-01-01 00:04:00,0.949109
2013-01-01 00:06:00,-0.276914
2013-01-01 00:08:00,-0.487298
...,...
2013-03-11 10:30:00,0.859029
2013-03-11 10:32:00,0.599693
2013-03-11 10:34:00,0.019025
2013-03-11 10:36:00,-2.074058


In [49]:
# '2015-01' is past the end of the index (March 2013), so the lookup raises
# KeyError. The error is the point of this cell: catch it and print it so that
# Restart & Run All does not stop here.
try:
    dft.index.get_loc('2015-01')
except KeyError as err:
    print(f"KeyError: {err}")

KeyError: '2015-01'

In [43]:
# A month-resolution string resolves to the slice of rows in March 2013.
dft.index.get_loc('2013-3')

slice(84960, 100000, None)

In [26]:
# Show the index again before checking its type.
ind

DatetimeIndex(['2013-01-01 00:00:00', '2013-01-01 00:01:00',
               '2013-01-01 00:02:00', '2013-01-01 00:03:00',
               '2013-01-01 00:04:00', '2013-01-01 00:05:00',
               '2013-01-01 00:06:00', '2013-01-01 00:07:00',
               '2013-01-01 00:08:00', '2013-01-01 00:09:00',
               ...
               '2013-03-11 10:30:00', '2013-03-11 10:31:00',
               '2013-03-11 10:32:00', '2013-03-11 10:33:00',
               '2013-03-11 10:34:00', '2013-03-11 10:35:00',
               '2013-03-11 10:36:00', '2013-03-11 10:37:00',
               '2013-03-11 10:38:00', '2013-03-11 10:39:00'],
              dtype='datetime64[ns]', length=100000, freq='T')

In [41]:
# dtype-based check: True for this datetime64[ns] index.
pd.api.types.is_datetime64_any_dtype(ind)

True

In [27]:
# Class-based check on the same index.
isinstance(ind, pd.DatetimeIndex)

True

In [28]:
# 20-row frame indexed by a two-level MultiIndex: ten timestamps spaced twelve
# hours apart, each crossed with the labels "a" and "b".
# "12h" replaces "12H": the upper-case hour alias is deprecated in recent
# pandas, while the lower-case form is accepted by old and new versions.
# The generator is seeded so the values are the same on every run.
dft2 = pd.DataFrame(
    np.random.default_rng(0).standard_normal((20, 1)),
    columns=["A"],
    index=pd.MultiIndex.from_product(
        [pd.date_range("20130101", periods=10, freq="12h"), ["a", "b"]]
    ),
)

dft2

Unnamed: 0,Unnamed: 1,A
2013-01-01 00:00:00,a,1.634207
2013-01-01 00:00:00,b,-0.429777
2013-01-01 12:00:00,a,-0.422313
2013-01-01 12:00:00,b,-0.288744
2013-01-02 00:00:00,a,0.929457
2013-01-02 00:00:00,b,-0.833717
2013-01-02 12:00:00,a,0.452186
2013-01-02 12:00:00,b,-1.879699
2013-01-03 00:00:00,a,0.153699
2013-01-03 00:00:00,b,-0.51885


In [29]:
# Keep a handle on the MultiIndex to compare it with the plain datetime index.
mu = dft2.index

mu

MultiIndex([('2013-01-01 00:00:00', 'a'),
            ('2013-01-01 00:00:00', 'b'),
            ('2013-01-01 12:00:00', 'a'),
            ('2013-01-01 12:00:00', 'b'),
            ('2013-01-02 00:00:00', 'a'),
            ('2013-01-02 00:00:00', 'b'),
            ('2013-01-02 12:00:00', 'a'),
            ('2013-01-02 12:00:00', 'b'),
            ('2013-01-03 00:00:00', 'a'),
            ('2013-01-03 00:00:00', 'b'),
            ('2013-01-03 12:00:00', 'a'),
            ('2013-01-03 12:00:00', 'b'),
            ('2013-01-04 00:00:00', 'a'),
            ('2013-01-04 00:00:00', 'b'),
            ('2013-01-04 12:00:00', 'a'),
            ('2013-01-04 12:00:00', 'b'),
            ('2013-01-05 00:00:00', 'a'),
            ('2013-01-05 00:00:00', 'b'),
            ('2013-01-05 12:00:00', 'a'),
            ('2013-01-05 12:00:00', 'b')],
           )

In [30]:
# A MultiIndex is not a DatetimeIndex, even though its first level holds
# timestamps.
isinstance(mu, pd.DatetimeIndex)

False

In [31]:
# Show the minute-frequency index again for reference.
ind

DatetimeIndex(['2013-01-01 00:00:00', '2013-01-01 00:01:00',
               '2013-01-01 00:02:00', '2013-01-01 00:03:00',
               '2013-01-01 00:04:00', '2013-01-01 00:05:00',
               '2013-01-01 00:06:00', '2013-01-01 00:07:00',
               '2013-01-01 00:08:00', '2013-01-01 00:09:00',
               ...
               '2013-03-11 10:30:00', '2013-03-11 10:31:00',
               '2013-03-11 10:32:00', '2013-03-11 10:33:00',
               '2013-03-11 10:34:00', '2013-03-11 10:35:00',
               '2013-03-11 10:36:00', '2013-03-11 10:37:00',
               '2013-03-11 10:38:00', '2013-03-11 10:39:00'],
              dtype='datetime64[ns]', length=100000, freq='T')

In [32]:
# Position of the index entry closest to a date that precedes the whole index.
# Index.get_loc(..., method=...) was deprecated in pandas 1.4 and removed in
# 2.0; get_indexer takes a list of targets and still supports
# method="nearest", so take the first (only) element of its result.
# `datetime` is already imported in the notebook's first cell, so the repeated
# import that used to sit here is dropped.
ind.get_indexer([datetime.datetime(2011, 12, 25)], method="nearest")[0]

0

In [33]:
# Show the minute-frequency index once more.
ind

DatetimeIndex(['2013-01-01 00:00:00', '2013-01-01 00:01:00',
               '2013-01-01 00:02:00', '2013-01-01 00:03:00',
               '2013-01-01 00:04:00', '2013-01-01 00:05:00',
               '2013-01-01 00:06:00', '2013-01-01 00:07:00',
               '2013-01-01 00:08:00', '2013-01-01 00:09:00',
               ...
               '2013-03-11 10:30:00', '2013-03-11 10:31:00',
               '2013-03-11 10:32:00', '2013-03-11 10:33:00',
               '2013-03-11 10:34:00', '2013-03-11 10:35:00',
               '2013-03-11 10:36:00', '2013-03-11 10:37:00',
               '2013-03-11 10:38:00', '2013-03-11 10:39:00'],
              dtype='datetime64[ns]', length=100000, freq='T')

In [34]:
# Show the business-month-end series again before indexing into it.
ts

2011-01-31    0.335450
2011-02-28   -0.337035
2011-03-31    0.040042
2011-04-29    0.304694
2011-05-31   -1.010589
2011-06-30    0.645795
2011-07-29   -0.150588
2011-08-31    1.612465
2011-09-30    0.208183
2011-10-31   -2.101096
2011-11-30    0.843838
2011-12-30    0.403690
Freq: BM, dtype: float64

In [35]:
# The slice start need not be present in the index: it lies before the first
# row, so every row is returned.
dft.loc[datetime.datetime(2011, 12, 25):]

Unnamed: 0,A
2013-01-01 00:00:00,-2.711903
2013-01-01 00:01:00,-0.043594
2013-01-01 00:02:00,0.708947
2013-01-01 00:03:00,0.828016
2013-01-01 00:04:00,0.949109
...,...
2013-03-11 10:35:00,-0.208556
2013-03-11 10:36:00,-2.074058
2013-03-11 10:37:00,0.175706
2013-03-11 10:38:00,1.271932


In [36]:
# Wrap the standard-library datetime in a pandas Timestamp for the lookup
# in the next cell.
r = pd.Timestamp(datetime.datetime(2011, 12, 25))
r

Timestamp('2011-12-25 00:00:00')

In [37]:
# Unlike the open-ended slice on `dft` above, a scalar lookup needs an exact
# match. 2011-12-25 is not one of the business-month-end dates, so this raises
# KeyError. Catch it and print it so that Restart & Run All does not stop here.
try:
    ts.loc[r]
except KeyError as err:
    print(f"KeyError: {err}")

KeyError: Timestamp('2011-12-25 00:00:00')