In [1]:
# Loading a Sample Pandas DataFrame
import pandas as pd

df = pd.DataFrame.from_dict({
    'Name': ['Ray', 'Jane', 'Kate', 'Nik', 'Autumn', 'Kasi', 'Mandeep', 'Evan', 'Kyra', 'Jim'],
    'Age': [12, 7, 33, 34, 45, 65, 77, 11, 32, 55]
})

print(df.head())

     Name  Age
0     Ray   12
1    Jane    7
2    Kate   33
3     Nik   34
4  Autumn   45


In [None]:
# Parameters of the Pandas .qcut() method
pd.qcut(
    x,                      # Column to bin
    q,                      # Number of quantiles
    labels=None,            # List of labels to include
    retbins=False,          # Whether to return the bins/labels or not
    precision=3,            # The precision to store and display the bins labels
    duplicates='raise'      # If bin edges are not unique, raise a ValueError
)

In [2]:
# Splitting Age Column into Four Quantiles
df['Age Groups'] = pd.qcut(df['Age'], 4)
print(df.head())

     Name  Age     Age Groups
0     Ray   12  (6.999, 17.0]
1    Jane    7  (6.999, 17.0]
2    Kate   33   (17.0, 33.5]
3     Nik   34   (33.5, 52.5]
4  Autumn   45   (33.5, 52.5]


In [3]:
# Checking the data type of the qcut column
df['Age Groups'] = pd.qcut(df['Age'], 4)
print(df['Age Groups'].dtype)

category


In [4]:
# Splitting Age Column into Four Quantiles
df['Age Groups'] = pd.qcut(
   df['Age'], 
   [0, 0.25, 0.5, 0.75, 1]
)
print(df.head())

     Name  Age     Age Groups
0     Ray   12  (6.999, 17.0]
1    Jane    7  (6.999, 17.0]
2    Kate   33   (17.0, 33.5]
3     Nik   34   (33.5, 52.5]
4  Autumn   45   (33.5, 52.5]


In [5]:
# Adding Labels to Pandas .qcut()
df['Age Groups'] = pd.qcut(
   df['Age'], 
   [0, 0.25, 0.5, 0.75, 1], 
   labels=['0-25%', '26-49%', '51-75%', '76-100%']
)
print(df.head())

     Name  Age Age Groups
0     Ray   12      0-25%
1    Jane    7      0-25%
2    Kate   33     26-49%
3     Nik   34     51-75%
4  Autumn   45     51-75%


In [6]:
# Modifying Precision in Categories
df['Age Groups'] = pd.qcut(
   df['Age'], 
   4, 
   precision=1
)
print(df.head())

     Name  Age    Age Groups
0     Ray   12   (6.9, 17.0]
1    Jane    7   (6.9, 17.0]
2    Kate   33  (17.0, 33.5]
3     Nik   34  (33.5, 52.5]
4  Autumn   45  (33.5, 52.5]


In [None]:
# Parameters of the .cut() Function
pd.cut(
    x,                          # The input array to be binned
    bins,                       # The bins to use: int (# of bins) or sequence (widths) 
    right=True,                 # Whether to include right-most edge
    labels=None,                # Labels to be used for bins
    retbins=False,              # Whether to return bins or not
    precision=3,                # Precision to store and display bins
    include_lowest=False,       # Whether first interval should be left inclusive or not
    duplicates='raise',         # What to do if bins edges are not unique
    ordered=True                # Whether labels are ordered or not
)

In [7]:
df['Age Group'] = pd.cut(
   df['Age'], 
   [0, 17, 64, 100]
)
print(df.head())

     Name  Age    Age Groups Age Group
0     Ray   12   (6.9, 17.0]   (0, 17]
1    Jane    7   (6.9, 17.0]   (0, 17]
2    Kate   33  (17.0, 33.5]  (17, 64]
3     Nik   34  (33.5, 52.5]  (17, 64]
4  Autumn   45  (33.5, 52.5]  (17, 64]


In [10]:
# Adding labels to the groupings
df['Age Group'] = pd.cut(
    df['Age'], 
    [0, 17, 64, 100], 
    labels=['0-18 years old', '18-65 years old', '65+ years old']
)
print(df.head())
print()
print(df.tail())

     Name  Age    Age Groups        Age Group
0     Ray   12   (6.9, 17.0]   0-18 years old
1    Jane    7   (6.9, 17.0]   0-18 years old
2    Kate   33  (17.0, 33.5]  18-65 years old
3     Nik   34  (33.5, 52.5]  18-65 years old
4  Autumn   45  (33.5, 52.5]  18-65 years old

      Name  Age    Age Groups        Age Group
5     Kasi   65  (52.5, 77.0]    65+ years old
6  Mandeep   77  (52.5, 77.0]    65+ years old
7     Evan   11   (6.9, 17.0]   0-18 years old
8     Kyra   32  (17.0, 33.5]  18-65 years old
9      Jim   55  (52.5, 77.0]  18-65 years old


In [14]:
# Using the right= argument to modify binning behavior
df['Age Group'] = pd.cut(
    df['Age'], 
    [0, 18, 65, 100], 
    labels=['0-18 years old', '18-65 years old', '65+ years old'],
    right=False
)
print(df.head())
print()
print(df.tail())

     Name  Age    Age Groups        Age Group
0     Ray   12   (6.9, 17.0]   0-18 years old
1    Jane    7   (6.9, 17.0]   0-18 years old
2    Kate   33  (17.0, 33.5]  18-65 years old
3     Nik   34  (33.5, 52.5]  18-65 years old
4  Autumn   45  (33.5, 52.5]  18-65 years old

      Name  Age    Age Groups        Age Group
5     Kasi   65  (52.5, 77.0]    65+ years old
6  Mandeep   77  (52.5, 77.0]    65+ years old
7     Evan   11   (6.9, 17.0]   0-18 years old
8     Kyra   32  (17.0, 33.5]  18-65 years old
9      Jim   55  (52.5, 77.0]  18-65 years old


In [15]:
# Including left-most values
df['Age Group'] = pd.cut(
    df['Age'], 
    [0, 18, 65, 100], 
    labels=['0-18 years old', '18-65 years old', '65+ years old'],
    include_lowest=True
)
print(df.head())

     Name  Age    Age Groups        Age Group
0     Ray   12   (6.9, 17.0]   0-18 years old
1    Jane    7   (6.9, 17.0]   0-18 years old
2    Kate   33  (17.0, 33.5]  18-65 years old
3     Nik   34  (33.5, 52.5]  18-65 years old
4  Autumn   45  (33.5, 52.5]  18-65 years old


In [16]:
# Creating Ordered Categories
print(pd.cut(
    df['Age'], 
    [0, 18, 65, 100], 
    labels=['0-18 years old', '18-65 years old', '65+ years old'],
    ordered=True
))

0     0-18 years old
1     0-18 years old
2    18-65 years old
3    18-65 years old
4    18-65 years old
5    18-65 years old
6      65+ years old
7     0-18 years old
8    18-65 years old
9    18-65 years old
Name: Age, dtype: category
Categories (3, object): ['0-18 years old' < '18-65 years old' < '65+ years old']


**Exercise 1.3B6**

Question 1: Answer

Since the .qcut() function doesn’t allow you to specify including the lowest value of the range, the cut() function needs to be used.



In [18]:
df['Age Group'] = pd.cut(
    df['Age'], 
    [0, 0.25, 0.5, 0.75, 1], 
    include_lowest=True,
    right=False
)

print(df)

      Name  Age    Age Groups Age Group
0      Ray   12   (6.9, 17.0]       NaN
1     Jane    7   (6.9, 17.0]       NaN
2     Kate   33  (17.0, 33.5]       NaN
3      Nik   34  (33.5, 52.5]       NaN
4   Autumn   45  (33.5, 52.5]       NaN
5     Kasi   65  (52.5, 77.0]       NaN
6  Mandeep   77  (52.5, 77.0]       NaN
7     Evan   11   (6.9, 17.0]       NaN
8     Kyra   32  (17.0, 33.5]       NaN
9      Jim   55  (52.5, 77.0]       NaN


Question 2: Answer

Because categories, though they look like strings, aren’t strings, their sorting might not work correctly. By including order in your categories, these values can be sorted appropriately.

Question 3: Answer

The cut function allows you to define your own numeric ranges, while the qcut function enforces an equal distribution of the items in the bins.