# Binning

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Sample ages used by the explicit-edge binning examples below.
ages = [
    20, 22, 25, 27, 21, 23,
    37, 31, 61, 45, 41, 32,
]

In [3]:
# Five edges delimit four consecutive age ranges between 18 and 100.
bins = [
    18,
    25,
    35,
    60,
    100,
]

The pandas **cut()** function assigns each value to a bin delimited by the given edges; by default each bin is open on the left and closed on the right.

In [4]:
# Map every age onto the interval (delimited by `bins`) that contains it.
agegroup = pd.cut(x=ages, bins=bins)

In [5]:
# Show the result of pd.cut: one interval per age, followed by the ordered
# list of the four interval categories.
agegroup

[(18, 25], (18, 25], (18, 25], (25, 35], (18, 25], ..., (25, 35], (60, 100], (35, 60], (35, 60], (25, 35]]
Length: 12
Categories (4, interval[int64, right]): [(18, 25] < (25, 35] < (35, 60] < (60, 100]]

In [6]:
# Inspect the container type pd.cut returned for the plain-list input.
type(agegroup)


pandas.core.arrays.categorical.Categorical

In [7]:
# Count how many ages fall in each bin.
# The top-level pd.value_counts() is deprecated since pandas 2.1 and emits a
# FutureWarning; wrapping the Categorical in a Series and calling its
# value_counts() method yields the same counts, sorted by frequency.
pd.Series(agegroup).value_counts()

(18, 25]     5
(25, 35]     3
(35, 60]     3
(60, 100]    1
Name: count, dtype: int64

In [8]:
# right=False flips the closure: each bin now includes its left edge and
# excludes its right edge, e.g. [18, 26).
pd.cut(x=ages, bins=[18, 26, 36, 61, 100], right=False)

[[18, 26), [18, 26), [18, 26), [26, 36), [18, 26), ..., [26, 36), [61, 100), [36, 61), [36, 61), [26, 36)]
Length: 12
Categories (4, interval[int64, left]): [[18, 26) < [26, 36) < [36, 61) < [61, 100)]

You can give the bins your own names by passing a list of labels to the **labels** argument of the pandas **cut()** function.

In [9]:
# One label per bin, listed in the same order as the bin intervals.
group_names = [
    'Youth',
    'YoungAdult',
    'MiddleAged',
    'Senior',
]

In [10]:
# Same binning as before, but each interval is reported under its name from
# `group_names` instead of interval notation.
agegroup = pd.cut(x=ages, bins=bins, labels=group_names)


In [11]:
# Show the relabelled result: values and categories now use the names from
# group_names, and the categories remain ordered.
agegroup

['Youth', 'Youth', 'Youth', 'YoungAdult', 'Youth', ..., 'YoungAdult', 'Senior', 'MiddleAged', 'MiddleAged', 'YoungAdult']
Length: 12
Categories (4, object): ['Youth' < 'YoungAdult' < 'MiddleAged' < 'Senior']

If you pass **cut()** an integer number of bins instead of explicit bin edges, it will compute
equal-length bins based on the minimum and maximum values in the data.

In [12]:
# 20 uniform samples from [0, 1). A seeded generator makes the sample, and
# therefore the bin edges pd.cut derives from its min/max, identical on every
# Restart & Run All. Re-run the dependent cells after changing the seed.
data = np.random.default_rng(42).random(20)

In [13]:
# An integer `bins` asks for that many equal-width bins; precision=2 limits
# the number of decimal digits used for the displayed bin edges.
pd.cut(x=data, bins=4, precision=2)

[(0.094, 0.32], (0.77, 0.99], (0.54, 0.77], (0.094, 0.32], (0.54, 0.77], ..., (0.77, 0.99], (0.54, 0.77], (0.32, 0.54], (0.54, 0.77], (0.32, 0.54]]
Length: 20
Categories (4, interval[float64, right]): [(0.094, 0.32] < (0.32, 0.54] < (0.54, 0.77] < (0.77, 0.99]]

In [14]:
# 1000 samples from the standard normal distribution. A seeded generator keeps
# the quantile edges computed below stable across kernel restarts. Re-run the
# dependent cells after changing the seed.
data = np.random.default_rng(42).standard_normal(1000)

In [15]:
# Cut into quartiles: qcut places the edges at sample quantiles, so each of
# the four bins receives the same number of observations.
cats = pd.qcut(x=data, q=4)

In [16]:
# Show the quartile bin assigned to each sample and the four ordered
# interval categories.
cats

[(-3.139, -0.665], (-0.665, 0.0229], (-0.665, 0.0229], (-0.665, 0.0229], (0.68, 3.021], ..., (0.68, 3.021], (-0.665, 0.0229], (-0.665, 0.0229], (-3.139, -0.665], (0.68, 3.021]]
Length: 1000
Categories (4, interval[float64, right]): [(-3.139, -0.665] < (-0.665, 0.0229] < (0.0229, 0.68] < (0.68, 3.021]]

In [17]:
# Verify that the quartile bins are equally populated.
# The top-level pd.value_counts() is deprecated since pandas 2.1 and emits a
# FutureWarning; the Series method returns the same counts.
pd.Series(cats).value_counts()

(-3.139, -0.665]    250
(-0.665, 0.0229]    250
(0.0229, 0.68]      250
(0.68, 3.021]       250
Name: count, dtype: int64

In [18]:
# Instead of a bin count, pass explicit quantiles (fractions between 0 and 1):
# bottom 10%, 10-50%, 50-90% and top 10%.
pd.qcut(x=data, q=[0.0, 0.1, 0.5, 0.9, 1.0])

[(-1.309, 0.0229], (-1.309, 0.0229], (-1.309, 0.0229], (-1.309, 0.0229], (0.0229, 1.267], ..., (0.0229, 1.267], (-1.309, 0.0229], (-1.309, 0.0229], (-1.309, 0.0229], (0.0229, 1.267]]
Length: 1000
Categories (4, interval[float64, right]): [(-3.139, -1.309] < (-1.309, 0.0229] < (0.0229, 1.267] < (1.267, 3.021]]