In [1]:
from typing import List, Dict
import pandas as pd
import numpy as np

In [2]:
#we can separate continuous data into bins for analysis
#here we have ages and we want to separate them into age buckets
ages : List[int] = [20, 22, 25, 27, 21, 23, 37, 31, 61, 45, 41, 32]

In [3]:
#we will divide the ages into bins of 18-25, 26-35, 36-60, 60-100
bins : List[int] = [18, 25, 35, 60, 100]

In [4]:
age_categories = pd.cut(ages, bins)

In [5]:
age_categories #a categorical object

[(18, 25], (18, 25], (18, 25], (25, 35], (18, 25], ..., (25, 35], (60, 100], (35, 60], (35, 60], (25, 35]]
Length: 12
Categories (4, interval[int64, right]): [(18, 25] < (25, 35] < (35, 60] < (60, 100]]

In [6]:
#each bin has a special value that contains the lower and upper limits of the bin. The output
#is information that depicts the Categorical object.
#The square bracket is a hard boundary, while the curly bracket is a soft boundary.
age_categories.codes

array([0, 0, 0, 1, 0, 0, 2, 1, 3, 2, 2, 1], dtype=int8)

In [7]:
age_categories.categories

IntervalIndex([(18, 25], (25, 35], (35, 60], (60, 100]], dtype='interval[int64, right]')

In [8]:
age_categories.categories[0]

Interval(18, 25, closed='right')

In [9]:
pd.value_counts(age_categories) #bin counts for the result of pandas.cut

(18, 25]     5
(25, 35]     3
(35, 60]     3
(60, 100]    1
dtype: int64

In [10]:
np.unique(age_categories.codes, return_counts=True)

(array([0, 1, 2, 3], dtype=int8), array([5, 3, 3, 1]))

In [11]:
#we can change which side of the intervals are exclusive with the "right=False" argument
pd.cut(ages, bins, right=False)

[[18, 25), [18, 25), [25, 35), [25, 35), [18, 25), ..., [25, 35), [60, 100), [35, 60), [35, 60), [25, 35)]
Length: 12
Categories (4, interval[int64, left]): [[18, 25) < [25, 35) < [35, 60) < [60, 100)]

In [12]:
pd.cut(ages, bins, right=True) #same as the original

[(18, 25], (18, 25], (18, 25], (25, 35], (18, 25], ..., (25, 35], (60, 100], (35, 60], (35, 60], (25, 35]]
Length: 12
Categories (4, interval[int64, right]): [(18, 25] < (25, 35] < (35, 60] < (60, 100]]

In [15]:
#override interval-based labelling: pass a list or array of labels to the labels option
group_names : List[str] = ['Youth', 'YoungAdult', 'MiddleAged', 'Senior']

In [16]:
pd.cut(ages, bins, labels=group_names)

['Youth', 'Youth', 'Youth', 'YoungAdult', 'Youth', ..., 'YoungAdult', 'Senior', 'MiddleAged', 'MiddleAged', 'YoungAdult']
Length: 12
Categories (4, object): ['Youth' < 'YoungAdult' < 'MiddleAged' < 'Senior']

In [18]:
#display the categories
pd.cut(ages, bins, labels=group_names).categories

Index(['Youth', 'YoungAdult', 'MiddleAged', 'Senior'], dtype='object')

In [19]:
pd.cut(ages, bins, labels=group_names).codes

array([0, 0, 0, 1, 0, 0, 2, 1, 3, 2, 2, 1], dtype=int8)

In [20]:
#if we pass an integer number of bins, pandas will automatically compute equal length bins
#make uniformly distributed data
data = np.random.uniform(size=20)

In [21]:
pd.cut(data, 4, precision=2)

[(0.085, 0.31], (0.53, 0.75], (0.085, 0.31], (0.085, 0.31], (0.53, 0.75], ..., (0.31, 0.53], (0.085, 0.31], (0.75, 0.97], (0.75, 0.97], (0.75, 0.97]]
Length: 20
Categories (4, interval[float64, right]): [(0.085, 0.31] < (0.31, 0.53] < (0.53, 0.75] < (0.75, 0.97]]

In [22]:
#pd.qcut: similar to pd.cut, but automatically calculates equally sized bins
data = np.random.standard_normal(1000)

In [23]:
quartiles = pd.qcut(data, 4, precision=2)
quartiles

[(0.71, 3.49], (0.71, 3.49], (0.71, 3.49], (0.031, 0.71], (-0.67, 0.031], ..., (-0.67, 0.031], (0.031, 0.71], (0.031, 0.71], (-3.4299999999999997, -0.67], (-0.67, 0.031]]
Length: 1000
Categories (4, interval[float64, right]): [(-3.4299999999999997, -0.67] < (-0.67, 0.031] < (0.031, 0.71] < (0.71, 3.49]]

In [24]:
pd.value_counts(quartiles)

(-3.4299999999999997, -0.67]    250
(-0.67, 0.031]                  250
(0.031, 0.71]                   250
(0.71, 3.49]                    250
dtype: int64

In [25]:
#We can also pass our own quantiles to pandas.qcut. This is similar to pandas.cut.
#first let's type the code
pd.qcut(data, [0, 0.1, 0.5, 0.9, 1.])

[(1.188, 3.486], (0.0313, 1.188], (0.0313, 1.188], (0.0313, 1.188], (-1.284, 0.0313], ..., (-1.284, 0.0313], (0.0313, 1.188], (0.0313, 1.188], (-3.425, -1.284], (-1.284, 0.0313]]
Length: 1000
Categories (4, interval[float64, right]): [(-3.425, -1.284] < (-1.284, 0.0313] < (0.0313, 1.188] < (1.188, 3.486]]

In [26]:
#We can also pass our own quantiles to pandas.qcut. This is similar to pandas.cut.
#type the code then get the value_counts
pd.qcut(data, [0, 0.1, 0.5, 0.9, 1.]).value_counts()

(-3.425, -1.284]    100
(-1.284, 0.0313]    400
(0.0313, 1.188]     400
(1.188, 3.486]      100
dtype: int64

In [27]:
#We can also pass our own quantiles to pandas.qcut. This is similar to pandas.cut.
#we can also use the standalone pd.value_counts() method
pd.value_counts(pd.qcut(data, [0, 0.1, 0.5, 0.9, 1.]))

(-1.284, 0.0313]    400
(0.0313, 1.188]     400
(-3.425, -1.284]    100
(1.188, 3.486]      100
dtype: int64

Note: we will return to using pd.cut and pd.qcut later for aggregation and group operations.

In [29]:
data = pd.DataFrame(np.random.standard_normal((1000, 4)))
data.describe()

Unnamed: 0,0,1,2,3
count,1000.0,1000.0,1000.0,1000.0
mean,-0.020188,0.03743,0.022943,-0.029026
std,0.98859,1.040346,1.011265,1.01326
min,-3.001212,-3.169716,-2.707933,-3.879819
25%,-0.68957,-0.689296,-0.720052,-0.715882
50%,-0.004829,0.030883,0.022692,-0.010995
75%,0.600622,0.748121,0.738377,0.640114
max,2.940414,3.249725,3.350035,2.878894


In [30]:
#Task: find values in one of the columns exceeding 3 in absolute value
col = data[2]
col

0     -0.253225
1      0.642069
2      0.251830
3      0.021578
4      0.874424
         ...   
995    0.161863
996   -0.219551
997   -0.126874
998    1.475531
999    0.658000
Name: 2, Length: 1000, dtype: float64

In [31]:
#filter the column such that the absolute value is greater than three
col[col.abs() > 3]

323    3.350035
765    3.198088
Name: 2, dtype: float64

In [32]:
#to select rows with a value greater than 3 or less than -3, use the any method on a boolean dataframe
data[(data.abs() > 3)] #we are using all so it is all np.Nan values

Unnamed: 0,0,1,2,3
0,,,,
1,,,,
2,,,,
3,,,,
4,,,,
...,...,...,...,...
995,,,,
996,,,,
997,,,,
998,,,,


In [41]:
data[(data.abs() > 3).all(axis=1)]

Unnamed: 0,0,1,2,3


In [42]:
data[(data.abs() > 3).all(axis='columns')]

Unnamed: 0,0,1,2,3
