# Pandas provides two functions for binning data: `cut` and `qcut`. `cut` performs equal-width binning (every bin spans the same range of values), while `qcut` performs equal-frequency binning (every bin holds roughly the same number of observations).

In [31]:
import pandas as pd

In [32]:
# Load the diabetes dataset from the working directory and preview its first rows.
df = pd.read_csv(filepath_or_buffer='diabetes.csv')
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [33]:
import numpy as np

# Keep the age range in dedicated names: assigning to `min` / `max` would shadow
# Python's built-in functions for every cell that runs afterwards in this kernel.
age_min = df['Age'].min()
age_max = df['Age'].max()
# Three evenly spaced edge values from the lowest to the highest age.
# Note: 3 edges delimit 2 equal-width intervals (use n + 1 edges for n bins).
bins = np.linspace(age_min, age_max, 3)
bins

array([21., 51., 81.])

In [34]:
# Split Age into equal-width intervals and give each interval a readable label.
# The number of labels must match the number of bins.
num_bins = 3
age_labels = ["Young", "middle", "Old"]
df['bin'] = pd.cut(x=df['Age'], bins=num_bins, labels=age_labels)
df['bin']

0      middle
1       Young
2       Young
3       Young
4       Young
        ...  
763       Old
764     Young
765     Young
766    middle
767     Young
Name: bin, Length: 768, dtype: category
Categories (3, object): ['Young' < 'middle' < 'Old']

In [35]:
# Without explicit labels, pd.cut names each of the three equal-width bins
# by the interval it covers.
age_column = df['Age']
categorical_object = pd.cut(x=age_column, bins=3)
print(categorical_object)


0       (41.0, 61.0]
1      (20.94, 41.0]
2      (20.94, 41.0]
3      (20.94, 41.0]
4      (20.94, 41.0]
           ...      
763     (61.0, 81.0]
764    (20.94, 41.0]
765    (20.94, 41.0]
766     (41.0, 61.0]
767    (20.94, 41.0]
Name: Age, Length: 768, dtype: category
Categories (3, interval[float64, right]): [(20.94, 41.0] < (41.0, 61.0] < (61.0, 81.0]]


In [36]:
# Count how many ages fall into each interval, most populated interval first.
# Series.value_counts() is used because the top-level pd.value_counts() helper
# is deprecated since pandas 2.1; the result is the same.
print(categorical_object.value_counts())

(20.94, 41.0]    596
(41.0, 61.0]     147
(61.0, 81.0]      25
Name: Age, dtype: int64


In [37]:
import pandas as pd

# Ten consecutive integers serve as a small, easy-to-check sample.
data = pd.Series(list(range(1, 11)))
# Number of quantile-based groups to form.
num_bins = 3
# qcut places the cut points at quantiles, so the groups hold roughly equal counts.
bins = pd.qcut(x=data, q=num_bins)
print(bins)

0    (0.999, 4.0]
1    (0.999, 4.0]
2    (0.999, 4.0]
3    (0.999, 4.0]
4      (4.0, 7.0]
5      (4.0, 7.0]
6      (4.0, 7.0]
7     (7.0, 10.0]
8     (7.0, 10.0]
9     (7.0, 10.0]
dtype: category
Categories (3, interval[float64, right]): [(0.999, 4.0] < (4.0, 7.0] < (7.0, 10.0]]


In [38]:
import numpy as np

# Equal-width binning with NumPy: split the age range into three bins and
# count how many values land in each one.
num_bins = 3
ages = df['Age']
data = np.array(ages)
# np.histogram returns the per-bin counts and the num_bins + 1 bin edges.
# Its bins are half-open [left, right) except the last, whereas pd.cut uses
# right-closed intervals, so ages equal to an inner edge are counted in
# different bins by the two functions.
counts, bins = np.histogram(data, bins=num_bins)
print(f"Bins: {bins}")
print(f"Counts: {counts}")

Bins: [21. 41. 61. 81.]
Counts: [574 167  27]


# Equal-frequency binning of a DataFrame column with `qcut`

In [39]:
import pandas as pd

# Small example table of player statistics.
player_stats = {
    'points': [4, 4, 7, 8, 12, 13, 15, 18, 22, 23, 23, 25],
    'assists': [2, 5, 4, 7, 7, 8, 5, 4, 5, 11, 13, 8],
    'rebounds': [7, 7, 4, 6, 3, 8, 9, 9, 12, 11, 8, 9],
}

# NOTE: this rebinds `df`, replacing the diabetes frame loaded from the CSV;
# cells that read df['Age'] need the CSV reloaded before they are re-run.
# Build the frame and attach an equal-frequency (tercile) bin for `points`.
df = pd.DataFrame(player_stats).assign(
    points_bin=lambda frame: pd.qcut(frame['points'], q=3)
)

# Show the frame together with the new bin column.
print(df)

    points  assists  rebounds        points_bin
0        4        2         7   (3.999, 10.667]
1        4        5         7   (3.999, 10.667]
2        7        4         4   (3.999, 10.667]
3        8        7         6   (3.999, 10.667]
4       12        7         3  (10.667, 19.333]
5       13        8         8  (10.667, 19.333]
6       15        5         9  (10.667, 19.333]
7       18        4         9  (10.667, 19.333]
8       22        5        12    (19.333, 25.0]
9       23       11        11    (19.333, 25.0]
10      23       13         8    (19.333, 25.0]
11      25        8         9    (19.333, 25.0]
