## Selecting Columns

In [1]:
import pandas as pd
import janitor
import numpy as np
import datetime
import re
from janitor import patterns
from pandas.api.types import is_datetime64_dtype

In [34]:
# Column labels: the business-month-end dates falling in calendar year 2011.
start, end = datetime.datetime(2011, 1, 1), datetime.datetime(2012, 1, 1)
rng = pd.date_range(start=start, end=end, freq="BM")

# A single row of standard-normal draws, one value per date column.
values = np.random.randn(len(rng)).reshape(1, -1)
df = pd.DataFrame(values, columns=rng)
df

Unnamed: 0,2011-01-31,2011-02-28,2011-03-31,2011-04-29,2011-05-31,2011-06-30,2011-07-29,2011-08-31,2011-09-30,2011-10-31,2011-11-30,2011-12-30
0,-1.047557,0.532579,0.949826,0.697671,0.85052,-0.52548,-1.529388,0.628935,-1.492307,-0.731681,-0.0014,-0.611675


In [40]:
# Partial-string lookup on the datetime column index: '2011-01' resolves to
# the columns dated in January 2011 and comes back as a positional slice.
# The `method` keyword is intentionally not passed: it was deprecated in
# pandas 1.4 and removed in pandas 2.0, and None was its default anyway.
df.columns.get_loc('2011-01')

slice(0, 1, None)

In [41]:
# Label-based column slice with an open start: keeps everything up to and
# including January 2011 (a single column in this frame).
df.loc[:, :'2011-01']

Unnamed: 0,2011-01-31
0,-1.047557


In [4]:
# Small mixed-dtype frame used by the selection examples: integer, float,
# string/object, categorical and datetime columns, with a couple of NaNs.
df = pd.DataFrame(
    dict(
        id=[0, 1],
        Name=["ABC", "XYZ"],
        code=[1, 2],
        code1=[4, np.nan],
        code2=["8", 5],  # deliberately mixes a string and an int
        type=["S", "R"],
        type1=["E", np.nan],
        type2=["T", "U"],
        code3=pd.Series(["a", "b"], dtype="category"),
        type3=pd.to_datetime(
            [np.datetime64("2018-01-01"), datetime.datetime(2018, 1, 1)]
        ),
    )
)

df



Unnamed: 0,id,Name,code,code1,code2,type,type1,type2,code3,type3
0,0,ABC,1,4.0,8,S,E,T,a,2018-01-01
1,1,XYZ,2,,5,R,,U,b,2018-01-01


- Select by string:

In [5]:
# A plain string selects the column with exactly that name.
df.select_columns("id")

na here we dey


[['id']]

- Select with shell-style glob patterns (`*` matches any run of characters):

In [6]:
# `*` acts as a wildcard: "type*" matches type, type1, type2 and type3 here.
df.select_columns("type*")

na here we dey


[['type', 'type1', 'type2', 'type3']]

- Select by slice:

In [7]:
# Label-based slice over the column order; both endpoints are included
# (code1 through type1 in the recorded output).
df.select_columns(slice("code1", "type1"))

how we take reach here?


[Index(['code1', 'code2', 'type', 'type1'], dtype='object')]

- Select by `Callable` (the callable is applied to every column and should return a single `True` or `False` per column):

In [8]:
# is_datetime64_dtype is evaluated per column; only type3 passes in this frame.
df.select_columns(is_datetime64_dtype)

[Index(['type3'], dtype='object')]

In [9]:
# The callable can inspect the column's name: keep names that start with
# "code" or end with "1".
df.select_columns(
    lambda col: col.name.startswith("code") or col.name.endswith("1")
)

[Index(['code', 'code1', 'code2', 'type1', 'code3'], dtype='object')]

In [10]:
# Keep only the columns that contain at least one missing value.
df.select_columns(lambda col: col.isna().any())

[Index(['code1', 'type1'], dtype='object')]

- Select by regular expression:

In [11]:
# A compiled regex is tested against the column names; this one keeps every
# name that contains a digit (see the recorded output).
digit_pattern = re.compile("\\d+")
df.select_columns(digit_pattern)

[Index(['code1', 'code2', 'type1', 'type2', 'code3', 'type3'], dtype='object')]

In [12]:
# Same selection as the re.compile example, written with janitor.patterns,
# which is simply a wrapper around re.compile.

df.select_columns(patterns("\\d+"))


  df.select_columns(patterns("\\d+"))


[Index(['code1', 'code2', 'type1', 'type2', 'code3', 'type3'], dtype='object')]

- Select a combination of the above (you can combine any of the previous options):

In [13]:
# Several selector kinds in one call: an exact name, a glob and a label slice.
selectors = ("id", "code*", slice("code", "code2"))
df.select_columns(*selectors)

na here we dey
na here we dey
how we take reach here?


[['id'],
 ['code', 'code1', 'code2', 'code3'],
 Index(['code', 'code1', 'code2'], dtype='object')]

- You can also pass a sequence of booleans:

In [14]:
# One flag per column, in column order; True keeps the column.
keep = [True, False, True, True, True, False, False, False, True, False]
df.select_columns(keep)

Index(['id', 'code', 'code1', 'code2', 'code3'], dtype='object')

- Setting `invert` to `True` returns the complement of the columns provided:

In [15]:
# invert=True asks for the complement of the columns matched by the selectors.
# NOTE(review): the recorded output below is identical to the non-inverted
# call, so the inversion is not visible there - confirm against the
# installed janitor version.
selectors = ("id", "code*", slice("code", "code2"))
df.select_columns(*selectors, invert=True)

na here we dey
na here we dey
how we take reach here?


[['id'],
 ['code', 'code1', 'code2', 'code3'],
 Index(['code', 'code1', 'code2'], dtype='object')]