In [1]:
from typing import List
import numpy as np
import pandas as pd

In [2]:
ages : List[int] = [20, 22, 25, 27, 21, 23, 37, 31, 61, 45, 41, 32]

In [3]:
bins : List[int] = [18, 25, 35, 60, 100]

In [4]:
age_categories = pd.cut(ages, bins) #pd.cut divides the list of bins into discrete intervals

In [5]:
age_categories

[(18, 25], (18, 25], (18, 25], (25, 35], (18, 25], ..., (25, 35], (60, 100], (35, 60], (35, 60], (25, 35]]
Length: 12
Categories (4, interval[int64, right]): [(18, 25] < (25, 35] < (35, 60] < (60, 100]]

In [6]:
#this returns a special categorical object. 
#each bin is represented by an interval range from the lower and upper elements
#of each bin
age_categories.codes

array([0, 0, 0, 1, 0, 0, 2, 1, 3, 2, 2, 1], dtype=int8)

In [7]:
#view the categories:
age_categories.categories

IntervalIndex([(18, 25], (25, 35], (35, 60], (60, 100]], dtype='interval[int64, right]')

In [8]:
#view the first category
age_categories.categories[0]

Interval(18, 25, closed='right')

In [13]:
#bin counts for pd.cut
pd.value_counts(age_categories)

(18, 25]     5
(25, 35]     3
(35, 60]     3
(60, 100]    1
dtype: int64

In [14]:
#in the string representation of an interval, a parenthesis means the side is open
pd.cut(ages, bins, right=False)

[[18, 25), [18, 25), [25, 35), [25, 35), [18, 25), ..., [25, 35), [60, 100), [35, 60), [35, 60), [25, 35)]
Length: 12
Categories (4, interval[int64, left]): [[18, 25) < [25, 35) < [35, 60) < [60, 100)]

In [15]:
#we can override default interval-based bin labelling by passing a list or array to
#the labels option
group_names = ['Youth', 'YoungAdult', 'MiddleAged', 'Senior']

In [17]:
pd.cut(ages, bins, labels=group_names)

['Youth', 'Youth', 'Youth', 'YoungAdult', 'Youth', ..., 'YoungAdult', 'Senior', 'MiddleAged', 'MiddleAged', 'YoungAdult']
Length: 12
Categories (4, object): ['Youth' < 'YoungAdult' < 'MiddleAged' < 'Senior']

In [19]:
#if we pass an integer number of bins, then it computes equal-length bins based
#on the minimum and maximum in the data
data = np.random.uniform(size=20)

In [20]:
data #before we cut the data

array([0.41375549, 0.60762815, 0.88094573, 0.94098458, 0.681435  ,
       0.76410091, 0.63889364, 0.72400269, 0.19926197, 0.37893033,
       0.43771491, 0.44052635, 0.81701145, 0.52447935, 0.848409  ,
       0.25615554, 0.09289381, 0.36225321, 0.81360891, 0.35079566])

In [22]:
pd.cut(data, bins=4) #with the bins keyword argument

[(0.305, 0.517], (0.517, 0.729], (0.729, 0.941], (0.729, 0.941], (0.517, 0.729], ..., (0.092, 0.305], (0.092, 0.305], (0.305, 0.517], (0.729, 0.941], (0.305, 0.517]]
Length: 20
Categories (4, interval[float64, right]): [(0.092, 0.305] < (0.305, 0.517] < (0.517, 0.729] < (0.729, 0.941]]