## Topics:
    
1. Defining an aggregation
2. Grouping and aggregating with multiple columns and functions
3. Removing the MultiIndex after grouping
4. Customizing an aggregation function
5. Customizing aggregating functions with *args and **kwargs
6. Examining the groupby object
7. Filtering for states with a minority majority
8. Transforming through a weight loss bet
9. Calculating weighted mean SAT scores per state with apply
10. Grouping by continuous variables
11. Counting the total number of flights between cities
12. Finding the longest streak of on-time flights


In [414]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt 
%matplotlib inline

### Split - Apply - Combine

* Pandas will just check if groupby is feasible, other than that it wont work much


* Only by applying methods on the group by object, we can unleash its power

### 1. Defining an aggregation

In [415]:
flights = pd.read_csv('data/flights.csv')
flights.head()

Unnamed: 0,MONTH,DAY,WEEKDAY,AIRLINE,ORG_AIR,DEST_AIR,SCHED_DEP,DEP_DELAY,AIR_TIME,DIST,SCHED_ARR,ARR_DELAY,DIVERTED,CANCELLED
0,1,1,4,WN,LAX,SLC,1625,58.0,94.0,590,1905,65.0,0,0
1,1,1,4,UA,DEN,IAD,823,7.0,154.0,1452,1333,-13.0,0,0
2,1,1,4,MQ,DFW,VPS,1305,36.0,85.0,641,1453,35.0,0,0
3,1,1,4,AA,DFW,DCA,1555,7.0,126.0,1192,1935,-7.0,0,0
4,1,1,4,WN,LAX,MCI,1720,48.0,166.0,1363,2225,39.0,0,0


In [416]:
 flights.groupby('AIRLINE').agg({'ARR_DELAY':'mean'}).head()

Unnamed: 0_level_0,ARR_DELAY
AIRLINE,Unnamed: 1_level_1
AA,5.542661
AS,-0.833333
B6,8.692593
DL,0.339691
EV,7.03458


In [417]:
flights.groupby('AIRLINE')['ARR_DELAY'].agg('mean').head()

AIRLINE
AA    5.542661
AS   -0.833333
B6    8.692593
DL    0.339691
EV    7.034580
Name: ARR_DELAY, dtype: float64

In [418]:
flights.groupby('AIRLINE')['ARR_DELAY'].agg(np.mean).head()

AIRLINE
AA    5.542661
AS   -0.833333
B6    8.692593
DL    0.339691
EV    7.034580
Name: ARR_DELAY, dtype: float64

In [419]:
flights.groupby('AIRLINE')['ARR_DELAY'].mean().head() #cause, we are aggregating only on one column

AIRLINE
AA    5.542661
AS   -0.833333
B6    8.692593
DL    0.339691
EV    7.034580
Name: ARR_DELAY, dtype: float64

In [420]:
#flights.groupby('AIRLINE')['ARR_DELAY'].agg(np.sqrt) #error

In [421]:
flights.groupby('AIRLINE')['ARR_DELAY']

<pandas.core.groupby.generic.SeriesGroupBy object at 0x00000238C2E71130>

### 2. Grouping and aggregating with multiple columns and functions

Grouoping the Columns\
Aggregating the columns\
Apply Aggregate Functions

In [422]:
flights.groupby(['AIRLINE', 'WEEKDAY'])['CANCELLED'].agg('sum').head(7)

AIRLINE  WEEKDAY
AA       1          41
         2           9
         3          16
         4          20
         5          18
         6          21
         7          29
Name: CANCELLED, dtype: int64

In [423]:
flights.groupby(['AIRLINE', 'WEEKDAY'])['CANCELLED', 'DIVERTED'].agg(['sum', 'mean']).head(7)

  flights.groupby(['AIRLINE', 'WEEKDAY'])['CANCELLED', 'DIVERTED'].agg(['sum', 'mean']).head(7)


Unnamed: 0_level_0,Unnamed: 1_level_0,CANCELLED,CANCELLED,DIVERTED,DIVERTED
Unnamed: 0_level_1,Unnamed: 1_level_1,sum,mean,sum,mean
AIRLINE,WEEKDAY,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
AA,1,41,0.032106,6,0.004699
AA,2,9,0.007341,2,0.001631
AA,3,16,0.011949,2,0.001494
AA,4,20,0.015004,5,0.003751
AA,5,18,0.014151,1,0.000786
AA,6,21,0.018667,9,0.008
AA,7,29,0.021837,1,0.000753


In [424]:
group_cols = ['ORG_AIR', 'DEST_AIR']
agg_dict = {'CANCELLED':['sum', 'mean', 'size'], 'AIR_TIME':['mean', 'var']}
flights.groupby(group_cols).agg(agg_dict).head()

Unnamed: 0_level_0,Unnamed: 1_level_0,CANCELLED,CANCELLED,CANCELLED,AIR_TIME,AIR_TIME
Unnamed: 0_level_1,Unnamed: 1_level_1,sum,mean,size,mean,var
ORG_AIR,DEST_AIR,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
ATL,ABE,0,0.0,31,96.387097,45.778495
ATL,ABQ,0,0.0,16,170.5,87.866667
ATL,ABY,0,0.0,19,28.578947,6.590643
ATL,ACY,0,0.0,6,91.333333,11.466667
ATL,AEX,0,0.0,40,78.725,47.332692


size : returns the total occurences\
count: only non-NULL values

In [425]:
#df.groupby(['grouping', 'columns']).agg({'agg_cols1':['list', 'of', 'functions'], 'agg_cols2':['other', 'functions']})

In [426]:
#df.groupby(['grouping', 'columns'])['aggregating', 'columns'].agg([aggregating, functions])

In [427]:
#df.groupby(['grouping', 'columns'])['aggregating', 'columns'].aggregating_method()

In [428]:
#df.groupby(['grouping', 'columns']).aggregating_method()

### 3. Removing the MultiIndex after grouping

In [429]:
flights = pd.read_csv('data/flights.csv')
airline_info = flights.groupby(['AIRLINE', 'WEEKDAY'])\
 .agg({'DIST':['sum', 'mean'],
 'ARR_DELAY':['min', 'max']}) \
 .astype(int)
airline_info.head(7)

Unnamed: 0_level_0,Unnamed: 1_level_0,DIST,DIST,ARR_DELAY,ARR_DELAY
Unnamed: 0_level_1,Unnamed: 1_level_1,sum,mean,min,max
AIRLINE,WEEKDAY,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
AA,1,1455386,1139,-60,551
AA,2,1358256,1107,-52,725
AA,3,1496665,1117,-45,473
AA,4,1452394,1089,-46,349
AA,5,1427749,1122,-41,732
AA,6,1265340,1124,-50,858
AA,7,1461906,1100,-49,626


In [430]:
level0 = airline_info.columns.get_level_values(0)

In [431]:
level0

Index(['DIST', 'DIST', 'ARR_DELAY', 'ARR_DELAY'], dtype='object')

In [432]:
level1 = airline_info.columns.get_level_values(1)

In [433]:
level1

Index(['sum', 'mean', 'min', 'max'], dtype='object')

In [434]:
airline_info.columns = level0 + '_' + level1

In [435]:
airline_info.head(7)

Unnamed: 0_level_0,Unnamed: 1_level_0,DIST_sum,DIST_mean,ARR_DELAY_min,ARR_DELAY_max
AIRLINE,WEEKDAY,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
AA,1,1455386,1139,-60,551
AA,2,1358256,1107,-52,725
AA,3,1496665,1117,-45,473
AA,4,1452394,1089,-46,349
AA,5,1427749,1122,-41,732
AA,6,1265340,1124,-50,858
AA,7,1461906,1100,-49,626


In [436]:
airline_info.reset_index().head(7)

Unnamed: 0,AIRLINE,WEEKDAY,DIST_sum,DIST_mean,ARR_DELAY_min,ARR_DELAY_max
0,AA,1,1455386,1139,-60,551
1,AA,2,1358256,1107,-52,725
2,AA,3,1496665,1117,-45,473
3,AA,4,1452394,1089,-46,349
4,AA,5,1427749,1122,-41,732
5,AA,6,1265340,1124,-50,858
6,AA,7,1461906,1100,-49,626


In [437]:
flights.groupby(['AIRLINE'], as_index=False)['DIST'].agg('mean').round(0)

Unnamed: 0,AIRLINE,DIST
0,AA,1114.0
1,AS,1066.0
2,B6,1772.0
3,DL,866.0
4,EV,460.0
5,F9,970.0
6,HA,2615.0
7,MQ,404.0
8,NK,1047.0
9,OO,511.0


### 4. Customizing an aggregation function

In [438]:
college = pd.read_csv('data/college.csv')
college.groupby('STABBR')['UGDS'].agg(['mean', 'std']).round(0).head()

Unnamed: 0_level_0,mean,std
STABBR,Unnamed: 1_level_1,Unnamed: 2_level_1
AK,2493.0,4052.0
AL,2790.0,4658.0
AR,1644.0,3143.0
AS,1276.0,
AZ,4130.0,14894.0


In [439]:
#STANDARDIZATION
def max_deviation(s):
    std_score = (s - s.mean()) / s.std()
    return std_score.abs().max()

In [440]:
college.groupby('STABBR')['UGDS'].agg(max_deviation).round(1).head()

STABBR
AK    2.6
AL    5.8
AR    6.3
AS    NaN
AZ    9.9
Name: UGDS, dtype: float64

In [441]:
college.groupby('STABBR')['UGDS', 'SATVRMID', 'SATMTMID'].agg(max_deviation).round(1).head()

  college.groupby('STABBR')['UGDS', 'SATVRMID', 'SATMTMID'].agg(max_deviation).round(1).head()


Unnamed: 0_level_0,UGDS,SATVRMID,SATMTMID
STABBR,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
AK,2.6,,
AL,5.8,1.6,1.8
AR,6.3,2.2,2.3
AS,,,
AZ,9.9,1.9,1.4


In [442]:
this = college.groupby(['STABBR', 'RELAFFIL'])['UGDS', 'SATVRMID', 'SATMTMID'].agg([max_deviation, 'mean', 'std']).round(1).head()

  this = college.groupby(['STABBR', 'RELAFFIL'])['UGDS', 'SATVRMID', 'SATMTMID'].agg([max_deviation, 'mean', 'std']).round(1).head()


In [443]:
this.columns.get_level_values(0)

Index(['UGDS', 'UGDS', 'UGDS', 'SATVRMID', 'SATVRMID', 'SATVRMID', 'SATMTMID',
       'SATMTMID', 'SATMTMID'],
      dtype='object')

In [444]:
this.columns.get_level_values(1)

Index(['max_deviation', 'mean', 'std', 'max_deviation', 'mean', 'std',
       'max_deviation', 'mean', 'std'],
      dtype='object')

In [445]:
max_deviation.__name__

'max_deviation'

In [446]:
max_deviation.__name__ = 'Max Deviation'
college.groupby(['STABBR', 'RELAFFIL'])['UGDS', 'SATVRMID', 'SATMTMID'].agg([max_deviation, 'mean', 'std']).round(1).head()


  college.groupby(['STABBR', 'RELAFFIL'])['UGDS', 'SATVRMID', 'SATMTMID'].agg([max_deviation, 'mean', 'std']).round(1).head()


Unnamed: 0_level_0,Unnamed: 1_level_0,UGDS,UGDS,UGDS,SATVRMID,SATVRMID,SATVRMID,SATMTMID,SATMTMID,SATMTMID
Unnamed: 0_level_1,Unnamed: 1_level_1,Max Deviation,mean,std,Max Deviation,mean,std,Max Deviation,mean,std
STABBR,RELAFFIL,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2
AK,0,2.1,3508.9,4539.5,,,,,,
AK,1,1.1,123.3,132.9,,555.0,,,503.0,
AL,0,5.2,3248.8,5102.4,1.6,514.9,56.5,1.7,515.8,56.7
AL,1,2.4,979.7,870.8,1.5,498.0,53.0,1.4,485.6,61.4
AR,0,5.8,1793.7,3401.6,1.9,481.1,37.9,2.0,503.6,39.0


### 5. Customizing aggregating functions with *args and **kwargs

**args : Non Keyword Arguments\
**kwargs : Keywword Arguments

Keyword Arguments come last

In [447]:
college = pd.read_csv('data/college.csv')
grouped = college.groupby(['STABBR', 'RELAFFIL'])

In [448]:
import inspect
inspect.signature(grouped.agg)

<Signature (func=None, *args, **kwargs)>

In [449]:
def pct_between_1_3k(s):
    return s.between(1000, 3000).mean()

In [450]:
college.groupby(['STABBR', 'RELAFFIL'])['UGDS'].agg(pct_between_1_3k).head(9)

STABBR  RELAFFIL
AK      0           0.142857
        1           0.000000
AL      0           0.236111
        1           0.333333
AR      0           0.279412
        1           0.111111
AS      0           1.000000
AZ      0           0.096774
        1           0.000000
Name: UGDS, dtype: float64

In [451]:
def pct_between(s, low, high):
    return s.between(low, high).mean()

In [452]:
college.groupby(['STABBR', 'RELAFFIL'])['UGDS'].agg(pct_between, 1000, 10000).head(9)

STABBR  RELAFFIL
AK      0           0.428571
        1           0.000000
AL      0           0.458333
        1           0.375000
AR      0           0.397059
        1           0.166667
AS      0           1.000000
AZ      0           0.233871
        1           0.111111
Name: UGDS, dtype: float64

In [453]:
college.groupby(['STABBR', 'RELAFFIL'])['UGDS'].agg(pct_between, 1000, high=10000).head(9)

STABBR  RELAFFIL
AK      0           0.428571
        1           0.000000
AL      0           0.458333
        1           0.375000
AR      0           0.397059
        1           0.166667
AS      0           1.000000
AZ      0           0.233871
        1           0.111111
Name: UGDS, dtype: float64

In [454]:
college.groupby(['STABBR', 'RELAFFIL'])['UGDS'].agg(pct_between, high=10000, low = 100).head(9)

STABBR  RELAFFIL
AK      0           0.857143
        1           0.333333
AL      0           0.791667
        1           0.625000
AR      0           0.691176
        1           0.722222
AS      0           1.000000
AZ      0           0.669355
        1           0.444444
Name: UGDS, dtype: float64

In [455]:
def make_agg_func(func, name, *args, **kwargs):
    def wrapper(x):
        return func(x, *args, **kwargs)
    wrapper.__name__ = name
    return wrapper

In [456]:
my_agg1 = make_agg_func(pct_between, 'pct_1_3k', low=1000, high=3000)
my_agg2 = make_agg_func(pct_between, 'pct_10_30k', 10000, 30000)

In [457]:
college.groupby(['STABBR', 'RELAFFIL'])['UGDS'].agg(['mean', my_agg1, my_agg2]).head()

Unnamed: 0_level_0,Unnamed: 1_level_0,mean,pct_1_3k,pct_10_30k
STABBR,RELAFFIL,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
AK,0,3508.857143,0.142857,0.142857
AK,1,123.333333,0.0,0.0
AL,0,3248.774648,0.236111,0.083333
AL,1,979.722222,0.333333,0.0
AR,0,1793.691176,0.279412,0.014706


### 6. Examining the groupby object

In [458]:
college = pd.read_csv('data/college.csv')
grouped = college.groupby(['STABBR', 'RELAFFIL'])
type(grouped)

pandas.core.groupby.generic.DataFrameGroupBy

In [459]:
print([attr for attr in dir(grouped) if not attr.startswith('_')])

['CITY', 'CURROPER', 'DISTANCEONLY', 'GRAD_DEBT_MDN_SUPP', 'HBCU', 'INSTNM', 'MD_EARN_WNE_P10', 'MENONLY', 'PCTFLOAN', 'PCTPELL', 'PPTUG_EF', 'RELAFFIL', 'SATMTMID', 'SATVRMID', 'STABBR', 'UG25ABV', 'UGDS', 'UGDS_2MOR', 'UGDS_AIAN', 'UGDS_ASIAN', 'UGDS_BLACK', 'UGDS_HISP', 'UGDS_NHPI', 'UGDS_NRA', 'UGDS_UNKN', 'UGDS_WHITE', 'WOMENONLY', 'agg', 'aggregate', 'all', 'any', 'apply', 'backfill', 'bfill', 'boxplot', 'corr', 'corrwith', 'count', 'cov', 'cumcount', 'cummax', 'cummin', 'cumprod', 'cumsum', 'describe', 'diff', 'dtypes', 'expanding', 'ffill', 'fillna', 'filter', 'first', 'get_group', 'groups', 'head', 'hist', 'idxmax', 'idxmin', 'indices', 'last', 'mad', 'max', 'mean', 'median', 'min', 'ndim', 'ngroup', 'ngroups', 'nth', 'nunique', 'ohlc', 'pad', 'pct_change', 'pipe', 'plot', 'prod', 'quantile', 'rank', 'resample', 'rolling', 'sem', 'shift', 'size', 'skew', 'std', 'sum', 'tail', 'take', 'transform', 'tshift', 'var']


In [460]:
grouped.ngroups

112

In [461]:
len(grouped.groups)

112

In [462]:
grouped.groups.keys()

dict_keys([('AK', 0), ('AK', 1), ('AL', 0), ('AL', 1), ('AR', 0), ('AR', 1), ('AS', 0), ('AZ', 0), ('AZ', 1), ('CA', 0), ('CA', 1), ('CO', 0), ('CO', 1), ('CT', 0), ('CT', 1), ('DC', 0), ('DC', 1), ('DE', 0), ('DE', 1), ('FL', 0), ('FL', 1), ('FM', 0), ('GA', 0), ('GA', 1), ('GU', 0), ('GU', 1), ('HI', 0), ('HI', 1), ('IA', 0), ('IA', 1), ('ID', 0), ('ID', 1), ('IL', 0), ('IL', 1), ('IN', 0), ('IN', 1), ('KS', 0), ('KS', 1), ('KY', 0), ('KY', 1), ('LA', 0), ('LA', 1), ('MA', 0), ('MA', 1), ('MD', 0), ('MD', 1), ('ME', 0), ('ME', 1), ('MH', 0), ('MI', 0), ('MI', 1), ('MN', 0), ('MN', 1), ('MO', 0), ('MO', 1), ('MP', 0), ('MS', 0), ('MS', 1), ('MT', 0), ('MT', 1), ('NC', 0), ('NC', 1), ('ND', 0), ('ND', 1), ('NE', 0), ('NE', 1), ('NH', 0), ('NH', 1), ('NJ', 0), ('NJ', 1), ('NM', 0), ('NM', 1), ('NV', 0), ('NV', 1), ('NY', 0), ('NY', 1), ('OH', 0), ('OH', 1), ('OK', 0), ('OK', 1), ('OR', 0), ('OR', 1), ('PA', 0), ('PA', 1), ('PR', 0), ('PR', 1), ('PW', 0), ('RI', 0), ('RI', 1), ('SC', 0),

In [463]:
grouped.get_group(('FL', 1)).head()

Unnamed: 0,INSTNM,CITY,STABBR,HBCU,MENONLY,WOMENONLY,RELAFFIL,SATVRMID,SATMTMID,DISTANCEONLY,...,UGDS_2MOR,UGDS_NRA,UGDS_UNKN,PPTUG_EF,CURROPER,PCTPELL,PCTFLOAN,UG25ABV,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
712,The Baptist College of Florida,Graceville,FL,0.0,0.0,0.0,1,545.0,465.0,0.0,...,0.0308,0.0,0.0507,0.2291,1,0.5878,0.5602,0.3531,30800.0,20052
713,Barry University,Miami,FL,0.0,0.0,0.0,1,470.0,462.0,0.0,...,0.0164,0.0741,0.0841,0.1518,1,0.5045,0.6733,0.4361,44100.0,28250
714,Gooding Institute of Nurse Anesthesia,Panama City,FL,0.0,0.0,0.0,1,,,0.0,...,,,,,0,,,,,PrivacySuppressed
715,Bethune-Cookman University,Daytona Beach,FL,1.0,0.0,0.0,1,405.0,395.0,0.0,...,0.0198,0.0205,0.019,0.0523,1,0.7758,0.8867,0.0647,29400.0,36250
724,Johnson University Florida,Kissimmee,FL,0.0,0.0,0.0,1,480.0,470.0,0.0,...,0.0045,0.0045,0.0136,0.1636,1,0.6689,0.7384,0.2185,26300.0,20199


In [464]:
from IPython.display import display
for name, group in grouped:
    print(name)
    display(group.head(3))
    print("============================")
    break

('AK', 0)


Unnamed: 0,INSTNM,CITY,STABBR,HBCU,MENONLY,WOMENONLY,RELAFFIL,SATVRMID,SATMTMID,DISTANCEONLY,...,UGDS_2MOR,UGDS_NRA,UGDS_UNKN,PPTUG_EF,CURROPER,PCTPELL,PCTFLOAN,UG25ABV,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
60,University of Alaska Anchorage,Anchorage,AK,0.0,0.0,0.0,0,,,0.0,...,0.098,0.0181,0.0457,0.4539,1,0.2385,0.2647,0.4386,42500,19449.5
62,University of Alaska Fairbanks,Fairbanks,AK,0.0,0.0,0.0,0,,,0.0,...,0.0401,0.011,0.306,0.3887,1,0.2263,0.255,0.4519,36200,19355.0
63,University of Alaska Southeast,Juneau,AK,0.0,0.0,0.0,0,,,0.0,...,0.0686,0.0049,0.2241,0.5112,1,0.1769,0.1996,0.555,37400,16875.0




In [465]:
grouped.head(1).head(4)

Unnamed: 0,INSTNM,CITY,STABBR,HBCU,MENONLY,WOMENONLY,RELAFFIL,SATVRMID,SATMTMID,DISTANCEONLY,...,UGDS_2MOR,UGDS_NRA,UGDS_UNKN,PPTUG_EF,CURROPER,PCTPELL,PCTFLOAN,UG25ABV,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
0,Alabama A & M University,Normal,AL,1.0,0.0,0.0,0,424.0,420.0,0.0,...,0.0,0.0059,0.0138,0.0656,1,0.7356,0.8284,0.1049,30300,33888.0
2,Amridge University,Montgomery,AL,0.0,0.0,0.0,1,,,1.0,...,0.0,0.0,0.2715,0.4536,1,0.6801,0.7795,0.854,40100,23370.0
43,Prince Institute-Southeast,Elmhurst,IL,0.0,0.0,0.0,0,,,0.0,...,0.0,0.0,0.0,0.0,1,0.7857,0.9375,0.6569,PrivacySuppressed,20992.0
60,University of Alaska Anchorage,Anchorage,AK,0.0,0.0,0.0,0,,,0.0,...,0.098,0.0181,0.0457,0.4539,1,0.2385,0.2647,0.4386,42500,19449.5


In [466]:
 grouped.nth([1, -1]).head(8)

Unnamed: 0_level_0,Unnamed: 1_level_0,INSTNM,CITY,HBCU,MENONLY,WOMENONLY,SATVRMID,SATMTMID,DISTANCEONLY,UGDS,UGDS_WHITE,...,UGDS_2MOR,UGDS_NRA,UGDS_UNKN,PPTUG_EF,CURROPER,PCTPELL,PCTFLOAN,UG25ABV,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
STABBR,RELAFFIL,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
AK,0,University of Alaska Fairbanks,Fairbanks,0.0,0.0,0.0,,,0.0,5536.0,0.4259,...,0.0401,0.011,0.306,0.3887,1,0.2263,0.255,0.4519,36200.0,19355
AK,0,Ilisagvik College,Barrow,0.0,0.0,0.0,,,0.0,109.0,0.1376,...,0.0,0.0183,0.0,0.6239,1,0.1323,0.0,0.6498,24900.0,PrivacySuppressed
AK,1,Alaska Pacific University,Anchorage,0.0,0.0,0.0,555.0,503.0,0.0,275.0,0.5309,...,0.0945,0.0,0.0873,0.3745,1,0.3152,0.5297,0.491,47000.0,23250
AK,1,Alaska Christian College,Soldotna,0.0,0.0,0.0,,,0.0,68.0,0.0588,...,0.0147,0.0,0.1324,0.0735,1,0.8868,0.6792,0.2264,,PrivacySuppressed
AL,0,University of Alabama at Birmingham,Birmingham,0.0,0.0,0.0,570.0,565.0,0.0,11383.0,0.5922,...,0.0368,0.0179,0.01,0.2607,1,0.346,0.5214,0.2422,39700.0,21941.5
AL,0,Alabama College of Osteopathic Medicine,Dothan,0.0,0.0,0.0,,,0.0,,,...,,,,,1,,,,,PrivacySuppressed
AL,1,Birmingham Southern College,Birmingham,0.0,0.0,0.0,560.0,560.0,0.0,1180.0,0.7983,...,0.0051,0.0,0.0051,0.0017,1,0.192,0.4809,0.0152,44200.0,27000
AL,1,Strayer University-Huntsville Campus,Huntsville,,,,,,,,,...,,,,,1,,,,49200.0,36173.5


### 7. Filtering for states with a minority majority

In [467]:
college = pd.read_csv('data/college.csv', index_col='INSTNM')
grouped = college.groupby('STABBR')
grouped.ngroups

59

In [468]:
def check_minority(df, threshold):
    minority_pct = 1 - df['UGDS_WHITE']
    total_minority = (df['UGDS'] * minority_pct).sum()
    total_ugds = df['UGDS'].sum()
    total_minority_pct = total_minority / total_ugds
    return total_minority_pct > threshold

In [469]:
college_filtered = grouped.filter(check_minority, threshold=.5)
college_filtered.head()

Unnamed: 0_level_0,CITY,STABBR,HBCU,MENONLY,WOMENONLY,RELAFFIL,SATVRMID,SATMTMID,DISTANCEONLY,UGDS,...,UGDS_2MOR,UGDS_NRA,UGDS_UNKN,PPTUG_EF,CURROPER,PCTPELL,PCTFLOAN,UG25ABV,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
INSTNM,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Everest College-Phoenix,Phoenix,AZ,0.0,0.0,0.0,1,,,0.0,4102.0,...,0.0373,0.0,0.1026,0.4749,0,0.8291,0.7151,0.67,28600,9500
Collins College,Phoenix,AZ,0.0,0.0,0.0,0,,,0.0,83.0,...,0.0241,0.0,0.3855,0.3373,0,0.7205,0.8228,0.4764,25700,47000
Empire Beauty School-Paradise Valley,Phoenix,AZ,0.0,0.0,0.0,1,,,0.0,25.0,...,0.04,0.0,0.0,0.16,0,0.6349,0.5873,0.4651,17800,9588
Empire Beauty School-Tucson,Tucson,AZ,0.0,0.0,0.0,0,,,0.0,126.0,...,0.0,0.0,0.0079,0.2222,1,0.7962,0.6615,0.4229,18200,9833
Thunderbird School of Global Management,Glendale,AZ,0.0,0.0,0.0,0,,,0.0,1.0,...,0.0,0.0,0.0,1.0,0,0.0,0.0,0.0,118900,PrivacySuppressed


In [470]:
 college.shape

(7535, 26)

In [471]:
college_filtered.shape

(3028, 26)

In [472]:
college_filtered['STABBR'].nunique()

20

In [473]:
college_filtered_20 = grouped.filter(check_minority, threshold=.2)
college_filtered_20.shape

(7461, 26)

In [474]:
college_filtered_20['STABBR'].nunique()

57

In [475]:
college_filtered_70 = grouped.filter(check_minority, threshold=.7)
college_filtered_70

Unnamed: 0_level_0,CITY,STABBR,HBCU,MENONLY,WOMENONLY,RELAFFIL,SATVRMID,SATMTMID,DISTANCEONLY,UGDS,...,UGDS_2MOR,UGDS_NRA,UGDS_UNKN,PPTUG_EF,CURROPER,PCTPELL,PCTFLOAN,UG25ABV,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
INSTNM,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Academy of Art University,San Francisco,CA,0.0,0.0,0.0,0,,,0.0,9885.0,...,0.0249,0.2523,0.2098,0.4334,1,0.4008,0.5524,0.4043,36000,35093
ITT Technical Institute-Rancho Cordova,Rancho Cordova,CA,0.0,0.0,0.0,0,,,0.0,500.0,...,0.0400,0.0000,0.1780,0.2540,0,0.7137,0.7667,0.7235,38800,25827.5
Academy of Chinese Culture and Health Sciences,Oakland,CA,0.0,0.0,0.0,0,,,0.0,,...,,,,,1,,,,,PrivacySuppressed
The Academy of Radio and TV Broadcasting,Huntington Beach,CA,0.0,0.0,0.0,0,,,0.0,14.0,...,0.0000,0.0000,0.0000,0.0000,1,0.9579,1.0000,0.4545,28400,9500
Avalon School of Cosmetology-Alameda,Alameda,CA,0.0,0.0,0.0,0,,,0.0,253.0,...,0.0553,0.0000,0.0435,0.5099,1,0.7407,0.6768,0.3387,21600,9860
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Xtreme Career Institute -,Arecibo,PR,,,,1,,,,,...,,,,,1,,,,,PrivacySuppressed
Blake Austin College Beauty Academy,Vacaville,CA,,,,1,,,,,...,,,,,1,,,,PrivacySuppressed,9500
WestMed College - Merced,Merced,CA,,,,1,,,,,...,,,,,1,,,,,15623.5
SAE Institute of Technology San Francisco,Emeryville,CA,,,,1,,,,,...,,,,,1,,,,,9500


In [476]:
 college_filtered_70['STABBR'].nunique()

10

### 8. Transforming through a weight loss bet

In [477]:
weight_loss = pd.read_csv('data/weight_loss.csv')
weight_loss.query('Month == "Jan"')

Unnamed: 0,Name,Month,Week,Weight
0,Bob,Jan,Week 1,291
1,Amy,Jan,Week 1,197
2,Bob,Jan,Week 2,288
3,Amy,Jan,Week 2,189
4,Bob,Jan,Week 3,283
5,Amy,Jan,Week 3,189
6,Bob,Jan,Week 4,283
7,Amy,Jan,Week 4,190


In [478]:
def find_perc_loss(s):
    return (s - s.iloc[0]) / s.iloc[0]

In [479]:
bob_jan = weight_loss.query('Name=="Bob" and Month=="Jan"')
bob_jan

Unnamed: 0,Name,Month,Week,Weight
0,Bob,Jan,Week 1,291
2,Bob,Jan,Week 2,288
4,Bob,Jan,Week 3,283
6,Bob,Jan,Week 4,283


In [480]:
find_perc_loss(bob_jan['Weight'])

0    0.000000
2   -0.010309
4   -0.027491
6   -0.027491
Name: Weight, dtype: float64

In [481]:
pcnt_loss = weight_loss.groupby(['Name', 'Month'])['Weight'].transform(find_perc_loss)
pcnt_loss

0     0.000000
1     0.000000
2    -0.010309
3    -0.040609
4    -0.027491
5    -0.040609
6    -0.027491
7    -0.035533
8     0.000000
9     0.000000
10   -0.028269
11   -0.031579
12   -0.053004
13   -0.068421
14   -0.053004
15   -0.089474
16    0.000000
17    0.000000
18    0.011194
19    0.000000
20   -0.011194
21   -0.017341
22   -0.026119
23   -0.017341
24    0.000000
25    0.000000
26   -0.011494
27   -0.035294
28   -0.030651
29   -0.035294
30   -0.042146
31   -0.052941
Name: Weight, dtype: float64

In [482]:
weight_loss['Perc Weight Loss'] = pcnt_loss.round(3)
weight_loss.query('Name=="Bob" and Month in ["Jan", "Feb"]')

Unnamed: 0,Name,Month,Week,Weight,Perc Weight Loss
0,Bob,Jan,Week 1,291,0.0
2,Bob,Jan,Week 2,288,-0.01
4,Bob,Jan,Week 3,283,-0.027
6,Bob,Jan,Week 4,283,-0.027
8,Bob,Feb,Week 1,283,0.0
10,Bob,Feb,Week 2,275,-0.028
12,Bob,Feb,Week 3,268,-0.053
14,Bob,Feb,Week 4,268,-0.053


In [483]:
week4 = weight_loss.query('Week == "Week 4"')
week4

Unnamed: 0,Name,Month,Week,Weight,Perc Weight Loss
6,Bob,Jan,Week 4,283,-0.027
7,Amy,Jan,Week 4,190,-0.036
14,Bob,Feb,Week 4,268,-0.053
15,Amy,Feb,Week 4,173,-0.089
22,Bob,Mar,Week 4,261,-0.026
23,Amy,Mar,Week 4,170,-0.017
30,Bob,Apr,Week 4,250,-0.042
31,Amy,Apr,Week 4,161,-0.053


In [484]:
winner = week4.pivot(index='Month', columns='Name', values='Perc Weight Loss')
winner

Name,Amy,Bob
Month,Unnamed: 1_level_1,Unnamed: 2_level_1
Apr,-0.053,-0.042
Feb,-0.089,-0.053
Jan,-0.036,-0.027
Mar,-0.017,-0.026


In [485]:
winner['Winner'] = np.where(winner['Amy'] < winner['Bob'],
'Amy', 'Bob')
winner.style.highlight_min(axis=1)

Name,Amy,Bob,Winner
Month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Apr,-0.053,-0.042,Amy
Feb,-0.089,-0.053,Amy
Jan,-0.036,-0.027,Amy
Mar,-0.017,-0.026,Bob


In [486]:
winner.Winner.value_counts()

Amy    3
Bob    1
Name: Winner, dtype: int64

In [487]:
#pivot_table can help when pivot raises an error

In [488]:
week4a = week4.copy()
month_chron = week4a['Month'].unique() # or use drop_duplicates
month_chron

array(['Jan', 'Feb', 'Mar', 'Apr'], dtype=object)

In [489]:
week4a['Month'] = pd.Categorical(week4a['Month'], categories=month_chron,
        ordered=True)
week4a.pivot(index='Month', columns='Name', values='Perc Weight Loss')

Name,Amy,Bob
Month,Unnamed: 1_level_1,Unnamed: 2_level_1
Jan,-0.036,-0.027
Feb,-0.089,-0.053
Mar,-0.017,-0.026
Apr,-0.053,-0.042


### 9. Calculating weighted mean SAT scores per state with apply

In [490]:
college = pd.read_csv('data/college.csv')
subset = ['UGDS', 'SATMTMID', 'SATVRMID']
college.head(2)

Unnamed: 0,INSTNM,CITY,STABBR,HBCU,MENONLY,WOMENONLY,RELAFFIL,SATVRMID,SATMTMID,DISTANCEONLY,...,UGDS_2MOR,UGDS_NRA,UGDS_UNKN,PPTUG_EF,CURROPER,PCTPELL,PCTFLOAN,UG25ABV,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
0,Alabama A & M University,Normal,AL,1.0,0.0,0.0,0,424.0,420.0,0.0,...,0.0,0.0059,0.0138,0.0656,1,0.7356,0.8284,0.1049,30300,33888.0
1,University of Alabama at Birmingham,Birmingham,AL,0.0,0.0,0.0,0,570.0,565.0,0.0,...,0.0368,0.0179,0.01,0.2607,1,0.346,0.5214,0.2422,39700,21941.5


In [491]:
college.shape

(7535, 27)

In [492]:
college2 = college.dropna(subset=subset)

In [493]:
college2.head(3)

Unnamed: 0,INSTNM,CITY,STABBR,HBCU,MENONLY,WOMENONLY,RELAFFIL,SATVRMID,SATMTMID,DISTANCEONLY,...,UGDS_2MOR,UGDS_NRA,UGDS_UNKN,PPTUG_EF,CURROPER,PCTPELL,PCTFLOAN,UG25ABV,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
0,Alabama A & M University,Normal,AL,1.0,0.0,0.0,0,424.0,420.0,0.0,...,0.0,0.0059,0.0138,0.0656,1,0.7356,0.8284,0.1049,30300,33888.0
1,University of Alabama at Birmingham,Birmingham,AL,0.0,0.0,0.0,0,570.0,565.0,0.0,...,0.0368,0.0179,0.01,0.2607,1,0.346,0.5214,0.2422,39700,21941.5
3,University of Alabama in Huntsville,Huntsville,AL,0.0,0.0,0.0,0,595.0,590.0,0.0,...,0.0172,0.0332,0.035,0.2146,1,0.3072,0.4596,0.264,45500,24097.0


In [494]:
college2.shape

(1184, 27)

In [495]:
def weighted_math_average(df):
    weighted_math = df['UGDS'] * df['SATMTMID']
    return int(weighted_math.sum() / df['UGDS'].sum())

In [496]:
college2.groupby('STABBR').apply(weighted_math_average).head()

STABBR
AK    503
AL    536
AR    529
AZ    569
CA    564
dtype: int64

In [497]:
college2.groupby('STABBR').agg(weighted_math_average).head()

KeyError: 'UGDS'

In [None]:
college2.groupby('STABBR')['SATMTMID'].agg(weighted_math_average)

In [None]:
from collections import OrderedDict
def weighted_average(df):
    data = OrderedDict()
    weight_m = df['UGDS'] * df['SATMTMID']
    weight_v = df['UGDS'] * df['SATVRMID']
    wm_avg = weight_m.sum() / df['UGDS'].sum()
    wv_avg = weight_v.sum() / df['UGDS'].sum()
    data['weighted_math_avg'] = wm_avg
    data['weighted_verbal_avg'] = wv_avg
    data['math_avg'] = df['SATMTMID'].mean()
    data['verbal_avg'] = df['SATVRMID'].mean()
    data['count'] = len(df)
    return pd.Series(data, dtype='int')

In [None]:
college2.dtypes

In [None]:
college2.groupby('STABBR').apply(weighted_average).head(10)

### 10. Grouping by continuous variables

In [None]:
flights = pd.read_csv('data/flights.csv')
flights.head()

In [None]:
bins = [-np.inf, 200, 500, 1000, 2000, np.inf]

In [None]:
flights['DIST']

In [None]:
cuts = pd.cut(flights['DIST'], bins=bins)

In [None]:
cuts.head()

In [None]:
cuts.value_counts()

In [None]:
flights.groupby(cuts)['AIRLINE'].value_counts(normalize=True).round(3).head(15)

In [None]:
flights.groupby(cuts)['AIR_TIME'].quantile(q=[.25, .5, .75]).div(60).round(2)

In [None]:
labels=['Under an Hour', '1 Hour', '1-2 Hours',
'2-4 Hours', '4+ Hours']

In [None]:
flights['DIST']

In [None]:
cuts2 = pd.cut(flights['DIST'], bins=bins, labels=labels)

In [None]:
cuts2

In [None]:
flights.groupby(cuts2)['AIRLINE'].value_counts(normalize=True).round(3).unstack().style.highlight_max(axis=1)

### 11. Counting the total number of flights between cities

In [None]:
flights = pd.read_csv('data/flights.csv')
flights_ct = flights.groupby(['ORG_AIR', 'DEST_AIR']).size()
flights_ct.head()

In [None]:
flights_ct.loc[[('ATL', 'IAH'), ('IAH', 'ATL')]]

In [None]:
flights[['ORG_AIR', 'DEST_AIR']]

In [None]:
flights[['ORG_AIR', 'DEST_AIR']].apply(sorted, axis=0)

In [None]:
flights[['ORG_AIR', 'DEST_AIR']].apply(sorted, axis=1)

In [None]:
flights_sort = flights[['ORG_AIR', 'DEST_AIR']].apply(sorted, axis=1)
flights_sort

In [None]:
len(flights_sort)

In [None]:
len(flights)

In [509]:
sorted(flights.loc[0, ['ORG_AIR', 'DEST_AIR']])

['LAX', 'SLC']

In [512]:
rename_dict = {'ORG_AIR':'AIR1', 'DEST_AIR':'AIR2'}
flights_sort = flights_sort.rename({'ORG_AIR':'AIR1', 'DEST_AIR':'AIR2'})
flights_sort = flights_sort.rename(columns = rename_dict)
flights_ct2 = flights_sort.groupby(['AIR1', 'AIR2']).size()
flights_ct2.head()
#error version problem for rename columns, why

TypeError: rename() got an unexpected keyword argument 'columns'

In [513]:
flights_ct2.loc[('ATL', 'IAH')]

NameError: name 'flights_ct2' is not defined

In [514]:
flights_ct2.loc[('IAH', 'ATL')]

NameError: name 'flights_ct2' is not defined

In [515]:
data_sorted = np.sort(flights[['ORG_AIR', 'DEST_AIR']])
data_sorted[:10]

array([['LAX', 'SLC'],
       ['DEN', 'IAD'],
       ['DFW', 'VPS'],
       ['DCA', 'DFW'],
       ['LAX', 'MCI'],
       ['IAH', 'SAN'],
       ['DFW', 'MSY'],
       ['PHX', 'SFO'],
       ['ORD', 'STL'],
       ['IAH', 'SJC']], dtype=object)

In [522]:
flights_sort2 = pd.DataFrame(data_sorted, columns=['AIR1', 'AIR2'])
#fs_orig = flights_sort.rename(columns={'ORG_AIR':'AIR1', 'DEST_AIR':'AIR2'}) #error why
fs_orig = flights_sort.rename(columns={'ORG_AIR':'AIR1', 'DEST_AIR':'AIR2'})
flights_sort2.equals(fs_orig)

TypeError: rename() got an unexpected keyword argument 'columns'

In [529]:
# %%timeit
flights_sort = flights[['ORG_AIR', 'DEST_AIR']].apply(sorted, axis=1)

In [528]:
%%timeit
data_sorted = np.sort(flights[['ORG_AIR', 'DEST_AIR']])
flights_sort2 = pd.DataFrame(data_sorted, columns=['AIR1', 'AIR2'])

7.13 ms ± 42.1 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


### 12. Finding the longest streak of on-time flights

In [531]:
s = pd.Series([0, 1, 1, 0, 1, 1, 1, 0])
s

0    0
1    1
2    1
3    0
4    1
5    1
6    1
7    0
dtype: int64

In [534]:
s1 = s.cumsum()
s1

0    0
1    1
2    2
3    2
4    3
5    4
6    5
7    5
dtype: int64

In [535]:
s.mul(s1)

0    0
1    1
2    2
3    0
4    3
5    4
6    5
7    0
dtype: int64

In [536]:
s.mul(s1).diff()

0    NaN
1    1.0
2    1.0
3   -2.0
4    3.0
5    1.0
6    1.0
7   -5.0
dtype: float64

In [537]:
s.mul(s1).diff().where(lambda x: x < 0)

0    NaN
1    NaN
2    NaN
3   -2.0
4    NaN
5    NaN
6    NaN
7   -5.0
dtype: float64

In [538]:
s.mul(s1).diff().where(lambda x: x < 0).ffill()

0    NaN
1    NaN
2    NaN
3   -2.0
4   -2.0
5   -2.0
6   -2.0
7   -5.0
dtype: float64

In [539]:
s.mul(s1).diff().where(lambda x: x < 0).ffill().add(s1, fill_value=0)

0    0.0
1    1.0
2    2.0
3    0.0
4    1.0
5    2.0
6    3.0
7    0.0
dtype: float64

In [540]:
flights = pd.read_csv('data/flights.csv')
flights['ON_TIME'] = flights['ARR_DELAY'].lt(15).astype(int)
flights[['AIRLINE', 'ORG_AIR', 'ON_TIME']].head(10)

Unnamed: 0,AIRLINE,ORG_AIR,ON_TIME
0,WN,LAX,0
1,UA,DEN,1
2,MQ,DFW,0
3,AA,DFW,1
4,WN,LAX,0
5,UA,IAH,1
6,AA,DFW,0
7,F9,SFO,1
8,AA,ORD,1
9,UA,IAH,1


In [542]:
def max_streak(s):
    s1 = s.cumsum()
    return s.mul(s1).diff().where(lambda x: x < 0).ffill().add(s1, fill_value=0).max()

In [543]:
flights.sort_values(['MONTH', 'DAY', 'SCHED_DEP']).groupby(['AIRLINE', 'ORG_AIR'])['ON_TIME'] \
.agg(['mean', 'size', max_streak]).round(2).head()

Unnamed: 0_level_0,Unnamed: 1_level_0,mean,size,max_streak
AIRLINE,ORG_AIR,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
AA,ATL,0.82,233,15
AA,DEN,0.74,219,17
AA,DFW,0.78,4006,64
AA,IAH,0.8,196,24
AA,LAS,0.79,374,29


In [544]:
def max_delay_streak(df):
    df = df.reset_index(drop=True)
    s = 1 - df['ON_TIME']
    s1 = s.cumsum()
    streak = s.mul(s1).diff().where(lambda x: x < 0) \
    .ffill().add(s1, fill_value=0)
    last_idx = streak.idxmax()
    first_idx = last_idx - streak.max() + 1
    df_return = df.loc[[first_idx, last_idx], ['MONTH', 'DAY']]
    df_return['streak'] = streak.max()
    df_return.index = ['first', 'last']
    df_return.index.name='type'
    return df_return

In [548]:
# flights.sort_values(['MONTH', 'DAY', 'SCHED_DEP']) \
#  .groupby(['AIRLINE', 'ORG_AIR']) \
#  .apply(max_delay_streak) \
#  .sort_values('streak', ascending=False).head(10)
#error why

In [549]:
# Lot of questions, have to revisit