## Selecting Columns

In [1]:
import pandas as pd
import janitor
import numpy as np
import datetime
import re
from janitor import patterns
from janitor.functions.utils import _select_index
from pandas.api.types import is_datetime64_dtype

In [2]:
start = datetime.datetime(2011, 1, 1)
end = datetime.datetime(2012, 1, 1)
rng = pd.date_range(start, end, freq="BM")
dates = pd.DataFrame({"numbers": np.random.randn(len(rng))}, index=rng)
dates

Unnamed: 0,numbers
2011-01-31,-0.125897
2011-02-28,-1.017592
2011-03-31,1.416618
2011-04-29,-0.222489
2011-05-31,-0.512718
2011-06-30,1.059305
2011-07-29,-1.300161
2011-08-31,-0.640599
2011-09-30,-1.126297
2011-10-31,1.392858


In [3]:
arr = pd.Index(['2011-01-31'])
_select_index(arr, dates, 'index')

array([0])

In [4]:
dates.select_rows(arr)

Unnamed: 0,numbers
2011-01-31,-0.125897


In [5]:
df = pd.DataFrame(
        {
            "id": [0, 1],
            "Name": ["ABC", "XYZ"],
            "code": [1, 2],
            "code1": [4, np.nan],
            "code2": ["8", 5],
            "type": ["S", "R"],
            "type1": ["E", np.nan],
            "type2": ["T", "U"],
            "code3": pd.Series(["a", "b"], dtype="category"),
            "type3": pd.to_datetime([np.datetime64("2018-01-01"),
                                    datetime.datetime(2018, 1, 1)]),
        }
    )

df



Unnamed: 0,id,Name,code,code1,code2,type,type1,type2,code3,type3
0,0,ABC,1,4.0,8,S,E,T,a,2018-01-01
1,1,XYZ,2,,5,R,,U,b,2018-01-01


- Select by string:

In [6]:
df.select_columns("id")

Unnamed: 0,id
0,0
1,1


- Select via shell-like glob strings (`*`) is possible:

In [7]:
df.select_columns("type*")

Unnamed: 0,type,type1,type2,type3
0,S,E,T,2018-01-01
1,R,,U,2018-01-01


- Select by slice:

In [8]:
df.select_columns(slice("code1", "type1"))

Unnamed: 0,code1,code2,type,type1
0,4.0,8,S,E
1,,5,R,


- Select by `Callable` (the callable is applied to every column  and should return a single `True` or `False` per column):

In [9]:
df.select_columns(is_datetime64_dtype)



TypeError: len() of unsized object

In [None]:
df.select_columns(lambda x: x.name.startswith("code") or
                            x.name.endswith("1"))

['code', 'code1', 'code2', 'type1', 'code3']

In [None]:
df.select_columns(lambda x: x.isna().any())

['code1', 'type1']

- Select by regular expression:

In [None]:
df.select_columns(re.compile("\\d+"))

['code1', 'code2', 'type1', 'type2', 'code3', 'type3']

In [None]:
# same as above, with janitor.patterns
# simply a wrapper around re.compile

df.select_columns(patterns("\\d+"))


  df.select_columns(patterns("\\d+"))


['code1', 'code2', 'type1', 'type2', 'code3', 'type3']

 - Select a combination of the above (you can combine any of the previous options):

In [None]:
df.select_columns("id", "code*", slice("code", "code2"))

array(['id', 'code', 'code1', 'code2', 'code3'], dtype=object)

- You can also pass a sequence of booleans:

In [None]:
df.select_columns([True, False, True, True, True,
                   False, False, False, True, False])

Index(['id', 'code', 'code1', 'code2', 'code3'], dtype='object')

- Setting `invert` to `True` returns the complement of the columns provided:

In [None]:
df.select_columns("id", "code*", slice("code", "code2"),
                  invert = True)

array(['id', 'code', 'code1', 'code2', 'code3'], dtype=object)