In [1]:
import pandas as pd
import numpy as np

In [2]:
# Limit DataFrame display to 8 columns and 20 rows. Fully qualified option names
# are used because the bare patterns 'max_columns' / 'max_rows' match more than
# one option on newer pandas releases (e.g. styler.render.max_columns) and
# would then raise an OptionError.
pd.set_option('display.max_columns', 8, 'display.max_rows', 20)
# Make IPython echo the value of every top-level expression in a cell,
# not only the last one.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [3]:
# Load the flights dataset from a path relative to the working directory
# and preview the first rows.
flights = pd.read_csv('data/flights.csv')
flights.head()

Unnamed: 0,MONTH,DAY,WEEKDAY,AIRLINE,...,SCHED_ARR,ARR_DELAY,DIVERTED,CANCELLED
0,1,1,4,WN,...,1905,65.0,0,0
1,1,1,4,UA,...,1333,-13.0,0,0
2,1,1,4,MQ,...,1453,35.0,0,0
3,1,1,4,AA,...,1935,-7.0,0,0
4,1,1,4,WN,...,2225,39.0,0,0


### Grouping/aggregate on single column

In [4]:
# Mean arrival delay per airline, using the dict form of agg
# ({column: function}); the result is a one-column DataFrame.
(
    flights
    .groupby('AIRLINE')
    .agg({'ARR_DELAY': 'mean'})
    .head()
)

Unnamed: 0_level_0,ARR_DELAY
AIRLINE,Unnamed: 1_level_1
AA,5.542661
AS,-0.833333
B6,8.692593
DL,0.339691
EV,7.03458


In [5]:
# Same statistic, but selecting the column first and passing the function
# name as a string; the result is a Series.
(
    flights
    .groupby('AIRLINE')['ARR_DELAY']
    .agg('mean')
    .head()
)

AIRLINE
AA    5.542661
AS   -0.833333
B6    8.692593
DL    0.339691
EV    7.034580
Name: ARR_DELAY, dtype: float64

In [6]:
# Confirm the result type: aggregating a single selected column gives a Series.
type(flights.groupby('AIRLINE')['ARR_DELAY'].agg('mean'))

pandas.core.series.Series

In [7]:
# Calling the aggregation method directly gives the same values as agg('mean').
flights.groupby('AIRLINE')['ARR_DELAY'].mean().head()

AIRLINE
AA    5.542661
AS   -0.833333
B6    8.692593
DL    0.339691
EV    7.034580
Name: ARR_DELAY, dtype: float64

In [8]:
# Keep the GroupBy object itself so its type can be inspected.
grouped = flights.groupby('AIRLINE')
type(grouped)

pandas.core.groupby.generic.DataFrameGroupBy

In [9]:
# np.sqrt returns one value per element rather than one value per group,
# so agg rejects it with a ValueError.
try:
    flights.groupby('AIRLINE')['ARR_DELAY'].agg(np.sqrt)
except ValueError as error:
    print('Only aggregate methods are allowed for agg')
    # NOTE(review): this bare expression appears to be echoed only because
    # ast_node_interactivity was set to "all" earlier in the notebook.
    error

Only aggregate methods are allowed for agg


  result = getattr(ufunc, method)(*inputs, **kwargs)


ValueError('Must produce aggregated value')

### Grouping by multiple columns

In [10]:
# Total cancellations for every (AIRLINE, WEEKDAY) pair; the result is a
# Series with a two-level index.
flights.groupby(['AIRLINE', 'WEEKDAY'])['CANCELLED'].sum().head(20)

AIRLINE  WEEKDAY
AA       1          41
         2           9
         3          16
         4          20
         5          18
         6          21
         7          29
AS       1           0
         2           0
         3           0
         4           0
         5           0
         6           0
         7           0
B6       1           0
         2           1
         3           0
         4           0
         5           0
         6           0
Name: CANCELLED, dtype: int64

In [11]:
# Select several columns with a list (double brackets). Indexing a GroupBy with
# a bare tuple of column names is deprecated and rejected by newer pandas.
# Each selected column gets both a 'sum' and a 'mean' sub-column.
flights.groupby(['AIRLINE', 'WEEKDAY'])[['CANCELLED', 'DIVERTED']].agg(['sum', 'mean']).head()

  flights.groupby(['AIRLINE', 'WEEKDAY'])['CANCELLED', 'DIVERTED'].agg(['sum', 'mean']).head()


Unnamed: 0_level_0,Unnamed: 1_level_0,CANCELLED,CANCELLED,DIVERTED,DIVERTED
Unnamed: 0_level_1,Unnamed: 1_level_1,sum,mean,sum,mean
AIRLINE,WEEKDAY,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
AA,1,41,0.032106,6,0.004699
AA,2,9,0.007341,2,0.001631
AA,3,16,0.011949,2,0.001494
AA,4,20,0.015004,5,0.003751
AA,5,18,0.014151,1,0.000786


In [12]:
# Select several columns with a list (double brackets). Indexing a GroupBy with
# a bare tuple of column names is deprecated and rejected by newer pandas.
# Each selected column gets both a 'sum' and a 'min' sub-column.
flights.groupby(['AIRLINE', 'WEEKDAY'])[['SCHED_ARR', 'DIVERTED']].agg(['sum', 'min']).head(10)

  flights.groupby(['AIRLINE', 'WEEKDAY'])['SCHED_ARR', 'DIVERTED'].agg(['sum', 'min']).head(10)


Unnamed: 0_level_0,Unnamed: 1_level_0,SCHED_ARR,SCHED_ARR,DIVERTED,DIVERTED
Unnamed: 0_level_1,Unnamed: 1_level_1,sum,min,sum,min
AIRLINE,WEEKDAY,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
AA,1,2018786,3,6,0
AA,2,1906451,4,2,0
AA,3,2082989,1,2,0
AA,4,2044815,3,5,0
AA,5,1965188,1,1,0
AA,6,1734700,6,9,0
AA,7,2015023,2,1,0
AS,1,185654,9,0,0
AS,2,155258,9,0,0
AS,3,180524,27,0,0


In [13]:
# Grouping keys (origin/destination columns) and, per column, the list of
# aggregations that agg should compute.
groupin_cols = ['ORG_AIR', 'DEST_AIR']
agg_dict = dict(
    CANCELLED=['sum', 'mean', 'size'],
    AIR_TIME=['mean', 'var'],
)

In [14]:
# Apply a different set of aggregations to each column; the result has a
# two-level row index and two-level column labels.
flights.groupby(groupin_cols).agg(agg_dict)

Unnamed: 0_level_0,Unnamed: 1_level_0,CANCELLED,CANCELLED,CANCELLED,AIR_TIME,AIR_TIME
Unnamed: 0_level_1,Unnamed: 1_level_1,sum,mean,size,mean,var
ORG_AIR,DEST_AIR,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
ATL,ABE,0,0.000000,31,96.387097,45.778495
ATL,ABQ,0,0.000000,16,170.500000,87.866667
ATL,ABY,0,0.000000,19,28.578947,6.590643
ATL,ACY,0,0.000000,6,91.333333,11.466667
ATL,AEX,0,0.000000,40,78.725000,47.332692
...,...,...,...,...,...,...
SFO,SNA,4,0.032787,122,64.059322,11.338331
SFO,STL,0,0.000000,20,198.900000,101.042105
SFO,SUN,0,0.000000,10,78.000000,25.777778
SFO,TUS,0,0.000000,20,100.200000,35.221053


In [15]:
# With no column selection, the aggregation is applied to all remaining columns.
# numeric_only=True makes the exclusion of non-numeric columns (e.g. AIRLINE)
# explicit instead of relying on pandas silently dropping them, which newer
# pandas versions no longer do.
flights.groupby(groupin_cols).sum(numeric_only=True)

Unnamed: 0_level_0,Unnamed: 1_level_0,MONTH,DAY,WEEKDAY,SCHED_DEP,...,SCHED_ARR,ARR_DELAY,DIVERTED,CANCELLED
ORG_AIR,DEST_AIR,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
ATL,ABE,167,519,132,46567,...,52916,185.0,0,0
ATL,ABQ,98,189,62,25426,...,27960,4.0,0,0
ATL,ABY,136,345,87,31253,...,32758,391.0,0,0
ATL,ACY,40,72,11,8972,...,10096,64.0,0,0
ATL,AEX,275,627,160,59674,...,62361,301.0,0,0
...,...,...,...,...,...,...,...,...,...,...
SFO,SNA,780,1863,419,172954,...,191323,389.0,0,4
SFO,STL,144,307,72,24558,...,36147,362.0,0,0
SFO,SUN,59,141,47,14429,...,17203,139.0,0,0
SFO,TUS,117,254,79,30202,...,35549,311.0,0,0


### removing the multi index after groupby

In [16]:
# Distance totals/means and arrival-delay extremes for each
# (AIRLINE, WEEKDAY) pair, with every result cast to int.
airline_info = (
    flights
    .groupby(['AIRLINE', 'WEEKDAY'])
    .agg({'DIST': ['sum', 'mean'], 'ARR_DELAY': ['min', 'max']})
    .astype(int)
)

In [17]:
# Preview the aggregated frame; its column labels have two levels.
airline_info.head(7)

Unnamed: 0_level_0,Unnamed: 1_level_0,DIST,DIST,ARR_DELAY,ARR_DELAY
Unnamed: 0_level_1,Unnamed: 1_level_1,sum,mean,min,max
AIRLINE,WEEKDAY,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
AA,1,1455386,1139,-60,551
AA,2,1358256,1107,-52,725
AA,3,1496665,1117,-45,473
AA,4,1452394,1089,-46,349
AA,5,1427749,1122,-41,732
AA,6,1265340,1124,-50,858
AA,7,1461906,1100,-49,626


In [18]:
# Inspect the column labels produced by the dict-of-lists aggregation.
airline_info.columns

MultiIndex([(     'DIST',  'sum'),
            (     'DIST', 'mean'),
            ('ARR_DELAY',  'min'),
            ('ARR_DELAY',  'max')],
           )

In [19]:
# Outer column level: the name of the aggregated column.
level0 = airline_info.columns.get_level_values(0)

In [20]:
# Inner column level: the name of the aggregation function.
level1 = airline_info.columns.get_level_values(1)

In [21]:
# Show the outer-level labels (one entry per column, so names repeat).
level0

Index(['DIST', 'DIST', 'ARR_DELAY', 'ARR_DELAY'], dtype='object')

In [22]:
# Show the inner-level labels (the aggregation names).
level1

Index(['sum', 'mean', 'min', 'max'], dtype='object')

In [23]:
# Flatten the two column levels into single names such as 'DIST_sum' by
# joining them with an underscore. This overwrites airline_info.columns.
airline_info.columns = level0 + '_' + level1

In [24]:
# Verify the columns are now a flat Index.
airline_info.columns

Index(['DIST_sum', 'DIST_mean', 'ARR_DELAY_min', 'ARR_DELAY_max'], dtype='object')

In [25]:
# Preview again: single-level column names, rows still indexed by AIRLINE and WEEKDAY.
airline_info.head(7)

Unnamed: 0_level_0,Unnamed: 1_level_0,DIST_sum,DIST_mean,ARR_DELAY_min,ARR_DELAY_max
AIRLINE,WEEKDAY,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
AA,1,1455386,1139,-60,551
AA,2,1358256,1107,-52,725
AA,3,1496665,1117,-45,473
AA,4,1452394,1089,-46,349
AA,5,1427749,1122,-41,732
AA,6,1265340,1124,-50,858
AA,7,1461906,1100,-49,626


In [26]:
# Move the AIRLINE/WEEKDAY index levels back into ordinary columns.
airline_info.reset_index().head(7)

Unnamed: 0,AIRLINE,WEEKDAY,DIST_sum,DIST_mean,ARR_DELAY_min,ARR_DELAY_max
0,AA,1,1455386,1139,-60,551
1,AA,2,1358256,1107,-52,725
2,AA,3,1496665,1117,-45,473
3,AA,4,1452394,1089,-46,349
4,AA,5,1427749,1122,-41,732
5,AA,6,1265340,1124,-50,858
6,AA,7,1461906,1100,-49,626


In [27]:
# Mean distance per airline, rounded to whole numbers. as_index=False keeps
# the grouping key as a regular column instead of making it the index.
groupby_without_index = (
    flights
    .groupby(['AIRLINE'], as_index=False)['DIST']
    .agg('mean')
    .round(0)
)

In [28]:
# as_index=False leaves a default integer index; AIRLINE stays a regular column.
groupby_without_index.index

Int64Index([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13], dtype='int64')

In [29]:
# Preview: AIRLINE appears as an ordinary column next to the aggregated DIST.
groupby_without_index.head(7)

Unnamed: 0,AIRLINE,DIST
0,AA,1114.0
1,AS,1066.0
2,B6,1772.0
3,DL,866.0
4,EV,460.0
5,F9,970.0
6,HA,2615.0


### Using customized aggregate functions

In [30]:
# Load the college dataset from a path relative to the working directory.
college = pd.read_csv('data/college.csv')

In [31]:
# Mean and standard deviation of UGDS per state, rounded to whole numbers.
(
    college
    .groupby('STABBR')['UGDS']
    .agg(['mean', 'std'])
    .round(0)
    .head(10)
)

Unnamed: 0_level_0,mean,std
STABBR,Unnamed: 1_level_1,Unnamed: 2_level_1
AK,2493.0,4052.0
AL,2790.0,4658.0
AR,1644.0,3143.0
AS,1276.0,
AZ,4130.0,14894.0
CA,3518.0,6709.0
CO,2325.0,4670.0
CT,1874.0,2871.0
DC,2645.0,3225.0
DE,2491.0,4503.0


In [32]:
def max_deviation(s):
    """Return the largest absolute z-score found in ``s``.

    Each value is standardized by subtracting the mean of ``s`` and dividing
    by its standard deviation (as returned by ``s.std()``); the maximum of the
    absolute standardized values is returned. The result is NaN when the
    standard deviation is NaN, e.g. for a single-element input.
    """
    centered = s - s.mean()
    z_scores = centered / s.std()
    return z_scores.abs().max()

In [33]:
# A user-defined function can be passed to agg; it is called per group and
# returns one value for each state.
college.groupby('STABBR')['UGDS'].agg(max_deviation).round(1).head(10)

STABBR
AK    2.6
AL    5.8
AR    6.3
AS    NaN
AZ    9.9
CA    6.1
CO    5.0
CT    5.6
DC    2.4
DE    3.5
Name: UGDS, dtype: float64

In [34]:
# The same custom aggregation applied to two columns at once.
(
    college
    .groupby('STABBR')[['UGDS', 'SATVRMID']]
    .agg(max_deviation)
    .round(1)
    .head(5)
)

Unnamed: 0_level_0,UGDS,SATVRMID
STABBR,Unnamed: 1_level_1,Unnamed: 2_level_1
AK,2.6,
AL,5.8,1.6
AR,6.3,2.2
AS,,
AZ,9.9,1.9


In [35]:
# Mix a custom function with built-in aggregations. The columns are selected
# with a list (double brackets): selecting with a bare tuple of names is
# deprecated and rejected by newer pandas versions.
(
    college
    .groupby(['STABBR', 'RELAFFIL'])[['UGDS', 'SATVRMID', 'SATMTMID']]
    .agg([max_deviation, 'mean', 'std'])
    .round(1)
    .head()
)

  college.groupby(['STABBR', 'RELAFFIL'])['UGDS', 'SATVRMID', 'SATMTMID'] \


Unnamed: 0_level_0,Unnamed: 1_level_0,UGDS,UGDS,UGDS,SATVRMID,SATVRMID,SATVRMID,SATMTMID,SATMTMID,SATMTMID
Unnamed: 0_level_1,Unnamed: 1_level_1,max_deviation,mean,std,max_deviation,...,std,max_deviation,mean,std
STABBR,RELAFFIL,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2
AK,0,2.1,3508.9,4539.5,,...,,,,
AK,1,1.1,123.3,132.9,,...,,,503.0,
AL,0,5.2,3248.8,5102.4,1.6,...,56.5,1.7,515.8,56.7
AL,1,2.4,979.7,870.8,1.5,...,53.0,1.4,485.6,61.4
AR,0,5.8,1793.7,3401.6,1.9,...,37.9,2.0,503.6,39.0


In [36]:
# The function's __name__ is what agg used as the column label above.
max_deviation.__name__

'max_deviation'

In [37]:
# Changing __name__ changes the label agg generates for this function.
max_deviation.__name__ = 'Max deviation'

In [38]:
# Re-run the aggregation to see the renamed column label. The columns are
# selected with a list (double brackets): selecting with a bare tuple of names
# is deprecated and rejected by newer pandas versions.
(
    college
    .groupby(['STABBR', 'RELAFFIL'])[['UGDS', 'SATVRMID', 'SATMTMID']]
    .agg([max_deviation, 'mean', 'std'])
    .round(1)
    .head()
)

  college.groupby(['STABBR', 'RELAFFIL'])['UGDS', 'SATVRMID', 'SATMTMID'] \


Unnamed: 0_level_0,Unnamed: 1_level_0,UGDS,UGDS,UGDS,SATVRMID,SATVRMID,SATVRMID,SATMTMID,SATMTMID,SATMTMID
Unnamed: 0_level_1,Unnamed: 1_level_1,Max deviation,mean,std,Max deviation,...,std,Max deviation,mean,std
STABBR,RELAFFIL,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2
AK,0,2.1,3508.9,4539.5,,...,,,,
AK,1,1.1,123.3,132.9,,...,,,503.0,
AL,0,5.2,3248.8,5102.4,1.6,...,56.5,1.7,515.8,56.7
AL,1,2.4,979.7,870.8,1.5,...,53.0,1.4,485.6,61.4
AR,0,5.8,1793.7,3401.6,1.9,...,37.9,2.0,503.6,39.0


### Passing multiple arguments to a customized aggregate function

In [39]:
# GroupBy object whose agg signature is inspected next.
groupby_obj = college.groupby(['STABBR', 'RELAFFIL'])

In [40]:
# Show the signature of agg: besides func it accepts *args and **kwargs.
import inspect
inspect.signature(groupby_obj.agg)

<Signature (func=None, *args, engine=None, engine_kwargs=None, **kwargs)>

In [41]:
def pct_between_1_3k(s):
    """Return the fraction of values in ``s`` lying in [1000, 3000].

    Both bounds are inclusive. Despite the ``pct`` prefix, the result is a
    fraction between 0 and 1 (the mean of a boolean mask), not a percentage.
    """
    in_range = s.between(1000, 3000)
    return in_range.mean()

In [42]:
# Fraction of UGDS values between 1000 and 3000 within each
# (STABBR, RELAFFIL) group.
(
    college
    .groupby(['STABBR', 'RELAFFIL'])['UGDS']
    .agg(pct_between_1_3k)
    .head(5)
)

STABBR  RELAFFIL
AK      0           0.142857
        1           0.000000
AL      0           0.236111
        1           0.333333
AR      0           0.279412
Name: UGDS, dtype: float64

In [43]:
def pct_between(s, low, high):
    """Return the fraction of values in ``s`` lying in [low, high].

    Generalizes the fixed 1000-3000 range to caller-supplied bounds; both
    bounds are inclusive. The result is a fraction between 0 and 1.
    """
    inside = (s >= low) & (s <= high)
    return inside.mean()

In [44]:
# Arguments given after the function are passed on to it by agg
# (here low=1000, high=3000).
(
    college
    .groupby(['STABBR', 'RELAFFIL'])['UGDS']
    .agg(pct_between, 1000, 3000)
    .head(5)
)

STABBR  RELAFFIL
AK      0           0.142857
        1           0.000000
AL      0           0.236111
        1           0.333333
AR      0           0.279412
Name: UGDS, dtype: float64

#### How to use multiple argument methods combined with other aggregate methods

In [45]:
try:
    college.groupby(['STABBR', 'RELAFFIL'])['UGDS'] \
            .agg(['mean', pct_between], low=1000, high=3000)
except TypeError as error:
    error

TypeError("pct_between() missing 2 required positional arguments: 'low' and 'high'")

#### This problem can be solved with a function factory that returns a customized aggregate function (a closure, the same mechanism Python decorators are built on)

In [46]:
def make_agg_func(func, name, *args, **kwargs):
    """Build a one-argument function with extra arguments baked in.

    The returned callable takes a single argument ``x`` and evaluates
    ``func(x, *args, **kwargs)``. Its ``__name__`` is set to ``name``.
    """
    def bound(x):
        return func(x, *args, **kwargs)

    bound.__name__ = name
    return bound

In [47]:
# Build two named aggregators from pct_between; the bounds may be supplied
# by keyword (my_agg1) or positionally (my_agg2).
my_agg1 = make_agg_func(pct_between, 'pct_1_3k', low=1000, high=3000)
my_agg2 = make_agg_func(pct_between, 'pct_10_30k', 10000, 30000)

In [48]:
# The pre-bound aggregators can now sit in a list alongside built-in
# aggregations; their custom names become the column labels.
(
    college
    .groupby(['STABBR', 'RELAFFIL'])['UGDS']
    .agg(['mean', my_agg1, my_agg2])
    .head()
)

Unnamed: 0_level_0,Unnamed: 1_level_0,mean,pct_1_3k,pct_10_30k
STABBR,RELAFFIL,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
AK,0,3508.857143,0.142857,0.142857
AK,1,123.333333,0.0,0.0
AL,0,3248.774648,0.236111,0.083333
AL,1,979.722222,0.333333,0.0
AR,0,1793.691176,0.279412,0.014706


### GroupBy object details

In [49]:
# Quick look at the first two rows of the college data.
college.head(2)

Unnamed: 0,INSTNM,CITY,STABBR,HBCU,...,PCTFLOAN,UG25ABV,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
0,Alabama A & M University,Normal,AL,1.0,...,0.8284,0.1049,30300,33888.0
1,University of Alabama at Birmingham,Birmingham,AL,0.0,...,0.5214,0.2422,39700,21941.5


In [50]:
# Rebind `grouped` to a GroupBy over STABBR and RELAFFIL and check its type.
grouped = college.groupby(['STABBR', 'RELAFFIL'])
type(grouped)

pandas.core.groupby.generic.DataFrameGroupBy

In [51]:
# Count the public (non-underscore) attributes exposed by the GroupBy object.
sum(1 for attr in dir(grouped) if not attr.startswith('_'))

93

In [52]:
# Number of distinct (STABBR, RELAFFIL) combinations present in the data.
grouped.ngroups

112

In [53]:
# Collect the group keys; iterating the groups mapping yields its keys.
groups = [key for key in grouped.groups.keys()]

In [54]:
# First six group keys; each is a (STABBR, RELAFFIL) tuple.
groups[:6]

[('AK', 0), ('AK', 1), ('AL', 0), ('AL', 1), ('AR', 0), ('AR', 1)]

In [55]:
# Pull out the rows of one specific group: STABBR 'FL' with RELAFFIL 1.
grouped.get_group(('FL', 1)).head(2)

Unnamed: 0,INSTNM,CITY,STABBR,HBCU,...,PCTFLOAN,UG25ABV,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
712,The Baptist College of Florida,Graceville,FL,0.0,...,0.5602,0.3531,30800,20052
713,Barry University,Miami,FL,0.0,...,0.6733,0.4361,44100,28250


In [56]:
# Iterating a GroupBy yields (key, sub-DataFrame) pairs; print only the
# first pair and stop.
for name, group in grouped:
    print(name, group.head(2))
    break

('AK', 0)                             INSTNM       CITY STABBR  HBCU  ...  PCTFLOAN  \
60  University of Alaska Anchorage  Anchorage     AK   0.0  ...    0.2647   
62  University of Alaska Fairbanks  Fairbanks     AK   0.0  ...    0.2550   

    UG25ABV  MD_EARN_WNE_P10  GRAD_DEBT_MDN_SUPP  
60   0.4386            42500             19449.5  
62   0.4519            36200               19355  

[2 rows x 27 columns]


In [57]:
# GroupBy.head() returns the first rows of every group combined in one frame;
# show the first three rows of that result.
grouped.head().head(3)

Unnamed: 0,INSTNM,CITY,STABBR,HBCU,...,PCTFLOAN,UG25ABV,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
0,Alabama A & M University,Normal,AL,1.0,...,0.8284,0.1049,30300,33888.0
1,University of Alabama at Birmingham,Birmingham,AL,0.0,...,0.5214,0.2422,39700,21941.5
2,Amridge University,Montgomery,AL,0.0,...,0.7795,0.854,40100,23370.0


### Finding the states where non-white (minority) students make up the majority

In [58]:
# Rebind `grouped` to a GroupBy over STABBR only.
grouped = college.groupby('STABBR')

In [59]:
# Number of STABBR groups.
grouped.ngroups

59

In [60]:
# Cross-check: the number of unique STABBR values equals the group count.
college['STABBR'].nunique()

59

In [61]:
def check_minority(df, threshold):
    """Tell whether the UGDS-weighted non-white share of ``df`` exceeds ``threshold``.

    Each row's ``UGDS`` is multiplied by ``1 - UGDS_WHITE``; the sum of those
    products is divided by the sum of ``UGDS``, and the resulting share is
    compared with ``threshold`` using a strict ``>``.

    Returns a boolean-like value (True when the share is above the threshold).
    """
    non_white_share = 1 - df['UGDS_WHITE']
    minority_count = (df['UGDS'] * non_white_share).sum()
    overall_share = minority_count / df['UGDS'].sum()
    return overall_share > threshold

In [62]:
# Keep only the rows of states for which check_minority returns True at a
# 0.5 threshold; the keyword argument is passed through to the function.
college_filtered = grouped.filter(check_minority, threshold=.5)

In [63]:
# Preview the filtered rows; the original row labels are kept.
college_filtered.head(2)

Unnamed: 0,INSTNM,CITY,STABBR,HBCU,...,PCTFLOAN,UG25ABV,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
68,Everest College-Phoenix,Phoenix,AZ,0.0,...,0.7151,0.67,28600,9500
69,Collins College,Phoenix,AZ,0.0,...,0.8228,0.4764,25700,47000


In [64]:
# Shape before filtering.
college.shape

(7535, 27)

In [65]:
# Shape after filtering: fewer rows, same columns.
college_filtered.shape

(3028, 27)

In [66]:
college['STABBR'].nunique()   # number of states before filtering

59

In [67]:
college_filtered['STABBR'].nunique()  # number of states whose minority share exceeds 50%

20

In [68]:
# Same filter with a lower threshold of 0.2.
college_filtered_20 = grouped.filter(check_minority, threshold=.2)

In [69]:
# Shape and number of states remaining at the 0.2 threshold.
college_filtered_20.shape, college_filtered_20['STABBR'].nunique()

((7461, 27), 57)

In [70]:
# Same filter with a higher threshold of 0.7.
college_filtered_70 = grouped.filter(check_minority, threshold=.7)

In [71]:
# Shape and number of states remaining at the 0.7 threshold.
college_filtered_70.shape, college_filtered_70['STABBR'].nunique()

((957, 27), 10)

In [72]:
# Display the college frame; the output is truncated by the row limit set
# at the top of the notebook.
college

Unnamed: 0,INSTNM,CITY,STABBR,HBCU,...,PCTFLOAN,UG25ABV,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
0,Alabama A & M University,Normal,AL,1.0,...,0.8284,0.1049,30300,33888
1,University of Alabama at Birmingham,Birmingham,AL,0.0,...,0.5214,0.2422,39700,21941.5
2,Amridge University,Montgomery,AL,0.0,...,0.7795,0.8540,40100,23370
3,University of Alabama in Huntsville,Huntsville,AL,0.0,...,0.4596,0.2640,45500,24097
4,Alabama State University,Montgomery,AL,1.0,...,0.7554,0.1270,26600,33118.5
...,...,...,...,...,...,...,...,...,...
7530,SAE Institute of Technology San Francisco,Emeryville,CA,,...,,,,9500
7531,Rasmussen College - Overland Park,Overland Park,KS,,...,,,,21163
7532,National Personal Training Institute of Cleveland,Highland Heights,OH,,...,,,,6333
7533,Bay Area Medical Academy - San Jose Satellite ...,San Jose,CA,,...,,,,PrivacySuppressed


In [73]:
# Re-read the college dataset from the CSV, rebinding `college` to a fresh frame.
college = pd.read_csv('data/college.csv')

In [77]:
# Columns that must be non-missing in the rows kept below.
subset = ['UGDS', 'SATMTMID', 'SATVRMID']

In [78]:
# Drop every row that has a missing value in any of the `subset` columns.
college2 = college.dropna(subset=subset)

In [79]:
# Compare shapes before and after dropping incomplete rows.
college.shape, college2.shape

((7535, 27), (1184, 27))

In [80]:
def weighted_math_average(df):
    """Return the UGDS-weighted average of SATMTMID, truncated to an int.

    Multiplies ``SATMTMID`` by ``UGDS`` row by row, divides the sum of the
    products by the sum of ``UGDS``, and converts the quotient with ``int()``
    (truncation, not rounding).
    """
    enrollment = df['UGDS']
    weighted_total = (enrollment * df['SATMTMID']).sum()
    return int(weighted_total / enrollment.sum())

In [81]:
# apply hands each state's sub-DataFrame to the function, which needs two
# columns (UGDS and SATMTMID) at the same time.
college2.groupby('STABBR').apply(weighted_math_average).head()

STABBR
AK    503
AL    536
AR    529
AZ    569
CA    564
dtype: int64

In [84]:
# Re-read the flights dataset from the CSV (rebinding `flights`) and preview it.
flights = pd.read_csv('data/flights.csv')
flights.head()

Unnamed: 0,MONTH,DAY,WEEKDAY,AIRLINE,...,SCHED_ARR,ARR_DELAY,DIVERTED,CANCELLED
0,1,1,4,WN,...,1905,65.0,0,0
1,1,1,4,UA,...,1333,-13.0,0,0
2,1,1,4,MQ,...,1453,35.0,0,0
3,1,1,4,AA,...,1935,-7.0,0,0
4,1,1,4,WN,...,2225,39.0,0,0


In [86]:
# Frequency of each distinct DIST value, most common first.
flights['DIST'].value_counts()

337     1035
862      668
236      565
414      503
1235     489
        ... 
1679       1
274        1
463        1
360        1
1304       1
Name: DIST, Length: 850, dtype: int64

In [87]:
# Bin edges for DIST; the infinite end points leave the first and last bins open-ended.
bins = [-np.inf, 200, 500, 1000, 2000, np.inf]

In [89]:
# Assign each flight's DIST to one of the intervals defined by `bins`.
cuts = pd.cut(flights['DIST'], bins=bins)

In [90]:
# Each row is now labelled with the interval its DIST falls into.
cuts.head(15)

0      (500.0, 1000.0]
1     (1000.0, 2000.0]
2      (500.0, 1000.0]
3     (1000.0, 2000.0]
4     (1000.0, 2000.0]
5     (1000.0, 2000.0]
6       (200.0, 500.0]
7      (500.0, 1000.0]
8       (200.0, 500.0]
9     (1000.0, 2000.0]
10     (500.0, 1000.0]
11     (500.0, 1000.0]
12      (200.0, 500.0]
13      (200.0, 500.0]
14     (500.0, 1000.0]
Name: DIST, dtype: category
Categories (5, interval[float64]): [(-inf, 200.0] < (200.0, 500.0] < (500.0, 1000.0] < (1000.0, 2000.0] < (2000.0, inf]]

In [91]:
# Number of flights in each distance bin.
cuts.value_counts()

(500.0, 1000.0]     20659
(200.0, 500.0]      15874
(1000.0, 2000.0]    14186
(2000.0, inf]        4054
(-inf, 200.0]        3719
Name: DIST, dtype: int64

In [94]:
# NOTE(review): duplicate of the previous cell; it produces the same output.
cuts.value_counts()

(500.0, 1000.0]     20659
(200.0, 500.0]      15874
(1000.0, 2000.0]    14186
(2000.0, inf]        4054
(-inf, 200.0]        3719
Name: DIST, dtype: int64

In [95]:
# Share of flights flown by each airline within every distance bin,
# rounded to three decimals.
airline_time_splits = (
    flights
    .groupby(cuts)['AIRLINE']
    .value_counts(normalize=True)
    .round(3)
)

In [100]:
# Check the result type (the original line was missing its closing parenthesis).
type(airline_time_splits)

pandas.core.series.Series

In [102]:
# Quartiles (25%, 50%, 75%) of AIR_TIME within each distance bin, divided by
# 60 (presumably minutes to hours) and rounded to two decimals.
airline_time_splits_quantile = (
    flights
    .groupby(cuts)['AIR_TIME']
    .quantile(q=[.25, .5, .75])
    .div(60)
    .round(2)
)

In [106]:
# Preview the per-bin AIR_TIME quartiles.
airline_time_splits_quantile.head(20)

DIST                  
(-inf, 200.0]     0.25    0.43
                  0.50    0.50
                  0.75    0.57
(200.0, 500.0]    0.25    0.77
                  0.50    0.92
                  0.75    1.05
(500.0, 1000.0]   0.25    1.43
                  0.50    1.65
                  0.75    1.92
(1000.0, 2000.0]  0.25    2.50
                  0.50    2.93
                  0.75    3.40
(2000.0, inf]     0.25    4.30
                  0.50    4.70
                  0.75    5.03
Name: AIR_TIME, dtype: float64

In [107]:
# Replace the interval categories with readable labels, one per bin.
# NOTE(review): the labels describe durations while the bins are distances;
# they presumably follow the AIR_TIME quartiles computed above.
labels=['Under an Hour', '1 Hour', '1-2 Hours','2-4 Hours', '4+ Hours']
cuts2 = pd.cut(flights['DIST'], bins=bins, labels=labels)

In [108]:
# Airline share within each labelled distance bin, pivoted so airlines become
# columns, with the largest share in every row highlighted.
(
    flights
    .groupby(cuts2)['AIRLINE']
    .value_counts(normalize=True)
    .round(3)
    .unstack()
    .style.highlight_max(axis=1)
)

AIRLINE,AA,AS,B6,DL,EV,F9,HA,MQ,NK,OO,UA,US,VX,WN
DIST,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
Under an Hour,0.052,,,0.086,0.289,,,0.211,,0.326,0.027,,,0.009
1 Hour,0.071,0.001,0.007,0.189,0.156,0.005,,0.1,0.012,0.159,0.062,0.016,0.028,0.194
1-2 Hours,0.144,0.023,0.003,0.206,0.101,0.038,,0.051,0.03,0.106,0.131,0.025,0.004,0.138
2-4 Hours,0.264,0.016,0.003,0.165,0.016,0.031,,0.003,0.045,0.046,0.199,0.04,0.012,0.16
4+ Hours,0.212,0.012,0.08,0.171,,0.004,0.028,,0.019,,0.289,0.065,0.074,0.046
