# Discretization

Continuous data is often discretized or otherwise separated into “bins” for analysis.

In [136]:
import pandas as pd
import numpy as np

In [137]:
# Sample of 12 ages that the following cells sort into age-range bins
ages = [20, 22, 25, 27, 21, 23, 37, 31, 61, 45, 41, 32]

In [138]:
# Bin edges: consecutive pairs define the intervals
# (18, 25], (25, 35], (35, 60] and (60, 100]
bins = [18, 25, 35, 60, 100]

In [139]:
# Assign each age to the bin it falls in; the result is a Categorical of intervals
cats = pd.cut(ages, bins)

In [140]:
# Show the bin assigned to each age; by default each interval is open on the
# left and closed on the right
cats

[(18, 25], (18, 25], (18, 25], (25, 35], (18, 25], ..., (25, 35], (60, 100], (35, 60], (35, 60], (25, 35]]
Length: 12
Categories (4, interval[int64, right]): [(18, 25] < (25, 35] < (35, 60] < (60, 100]]

In [141]:
# Given a plain list, pd.cut returns a pandas Categorical
type(cats)

pandas.core.arrays.categorical.Categorical

In [142]:
# Count how many ages fall into each bin.
# The top-level pd.value_counts function is deprecated (pandas 2.1+); wrapping
# the Categorical in a Series and calling .value_counts() gives the same result
# without a FutureWarning.
pd.Series(cats).value_counts()

(18, 25]     5
(25, 35]     3
(35, 60]     3
(60, 100]    1
Name: count, dtype: int64

In [143]:
# right=False makes each interval closed on the left and open on the right,
# e.g. [18, 26) contains 18 but not 26
pd.cut(ages, [18, 26, 36, 61, 100], right=False)

[[18, 26), [18, 26), [18, 26), [26, 36), [18, 26), ..., [26, 36), [61, 100), [36, 61), [36, 61), [26, 36)]
Length: 12
Categories (4, interval[int64, left]): [[18, 26) < [26, 36) < [36, 61) < [61, 100)]

In [144]:
# One label per bin, listed in the same order as the bin intervals
group_names = ['Youth', 'YoungAdult', 'MiddleAged', 'Senior']

In [145]:
# labels= replaces the interval notation with the supplied name for each bin
pd.cut(ages, bins, labels=group_names)

['Youth', 'Youth', 'Youth', 'YoungAdult', 'Youth', ..., 'YoungAdult', 'Senior', 'MiddleAged', 'MiddleAged', 'YoungAdult']
Length: 12
Categories (4, object): ['Youth' < 'YoungAdult' < 'MiddleAged' < 'Senior']

In [146]:
# 20 random floats drawn uniformly from [0, 1); no seed is set, so the values
# (and the output below) change on every run
data = np.random.rand(20)

In [147]:
# Passing an integer instead of explicit edges requests that many equal-width
# bins spanning the range of the data; precision sets the precision at which
# the bin edges are stored and displayed
pd.cut(data, 4, precision=2)

[(0.034, 0.28], (0.034, 0.28], (0.28, 0.52], (0.76, 1.0], (0.76, 1.0], ..., (0.28, 0.52], (0.034, 0.28], (0.76, 1.0], (0.76, 1.0], (0.52, 0.76]]
Length: 20
Categories (4, interval[float64, right]): [(0.034, 0.28] < (0.28, 0.52] < (0.52, 0.76] < (0.76, 1.0]]

In [148]:
data = np.random.randn(1000) # 1000 samples from the standard normal distribution (unseeded)

In [149]:
cats = pd.qcut(data, 4) # Cut into quartiles: bin edges come from sample quantiles, so bins are equal-sized

In [150]:
# Show the quartile interval assigned to each of the 1000 samples
cats

[(-0.674, 0.00259], (0.649, 3.43], (0.649, 3.43], (0.649, 3.43], (-2.866, -0.674], ..., (-0.674, 0.00259], (0.00259, 0.649], (-0.674, 0.00259], (-2.866, -0.674], (0.00259, 0.649]]
Length: 1000
Categories (4, interval[float64, right]): [(-2.866, -0.674] < (-0.674, 0.00259] < (0.00259, 0.649] < (0.649, 3.43]]

In [151]:
# Count the samples in each quartile bin.
# The top-level pd.value_counts function is deprecated (pandas 2.1+); wrapping
# the Categorical in a Series and calling .value_counts() gives the same result
# without a FutureWarning.
pd.Series(cats).value_counts()

(-2.866, -0.674]     250
(-0.674, 0.00259]    250
(0.00259, 0.649]     250
(0.649, 3.43]        250
Name: count, dtype: int64

In [152]:
# Pass explicit quantiles (numbers from 0 to 1) instead of a bin count:
# here the bottom 10%, 10-50%, 50-90% and top 10% of the samples
pd.qcut(data, [0, 0.1, 0.5, 0.9, 1.])

[(-1.367, 0.00259], (1.215, 3.43], (0.00259, 1.215], (1.215, 3.43], (-2.866, -1.367], ..., (-1.367, 0.00259], (0.00259, 1.215], (-1.367, 0.00259], (-1.367, 0.00259], (0.00259, 1.215]]
Length: 1000
Categories (4, interval[float64, right]): [(-2.866, -1.367] < (-1.367, 0.00259] < (0.00259, 1.215] < (1.215, 3.43]]