# Summarising DataFrames

In [1]:
import pandas as pd
import seaborn as sns

In [7]:
# Load seaborn's example "flights" dataset and preview the first five rows.
passengers = sns.load_dataset("flights")
passengers.head()

Unnamed: 0,year,month,passengers
0,1949,Jan,112
1,1949,Feb,118
2,1949,Mar,132
3,1949,Apr,129
4,1949,May,121


In [8]:
# Inspect each column's dtype and non-null count, plus the frame's memory usage.
passengers.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 144 entries, 0 to 143
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype   
---  ------      --------------  -----   
 0   year        144 non-null    int64   
 1   month       144 non-null    category
 2   passengers  144 non-null    int64   
dtypes: category(1), int64(2)
memory usage: 2.9 KB


In [12]:
# Summary statistics are exposed as Series methods; print mean, median,
# max and min of the passenger counts, one value per line.
print(
    passengers["passengers"].mean(),
    passengers["passengers"].median(),
    passengers["passengers"].max(),
    passengers["passengers"].min(),
    sep="\n",
)

280.2986111111111
265.5
622
104


In [13]:
# Cumulative methods such as cumsum return one value per row (a whole column)
# rather than a single summary number, so the result can be stored as a new column.
passengers["cumulative_passengers"] = passengers["passengers"].cumsum()
print(passengers.head())

   year month  passengers  cumulative_passengers
0  1949   Jan         112                    112
1  1949   Feb         118                    230
2  1949   Mar         132                    362
3  1949   Apr         129                    491
4  1949   May         121                    612


In [14]:
# cummax tracks the running maximum: the value only changes on rows where a
# new record passenger count is set.
passengers["cumulative_maximum"] = passengers["passengers"].cummax()
passengers.head()

Unnamed: 0,year,month,passengers,cumulative_passengers,cumulative_maximum
0,1949,Jan,112,112,112
1,1949,Feb,118,230,118
2,1949,Mar,132,362,132
3,1949,Apr,129,491,132
4,1949,May,121,612,132


## `.agg()` method

If you need to apply a custom function to a column (or to several columns at once), then
`.agg()` is a good bet: it accepts a single function or a list of functions.

In [16]:
def get_upper_fence(col, k=1.5):
    """Return the upper outlier fence of ``col``: Q3 + k * IQR.

    Parameters
    ----------
    col : pandas.Series
        Numeric values to summarise (any object exposing ``.quantile`` works).
    k : float, default 1.5
        Multiple of the interquartile range added above the third quartile.
        The default keeps the original 1.5 * IQR rule; a larger value gives
        a more permissive fence.

    Returns
    -------
    float
        Values above this threshold are treated as upper outliers.
    """
    q1, q3 = col.quantile(0.25), col.quantile(0.75)
    # IQR is the spread of the middle half of the data.
    return q3 + k * (q3 - q1)

# Compute the upper fence for the passenger counts, then keep only rows above it.
up_fence = passengers["passengers"].agg(get_upper_fence)
passengers[passengers["passengers"] > up_fence]
# The result is empty: no upper outliers exist.


Unnamed: 0,year,month,passengers,cumulative_passengers,cumulative_maximum


In [28]:
def get_lower_fence(col, k=1.5):
    """Return the lower outlier fence of ``col``: Q1 - k * IQR.

    Parameters
    ----------
    col : pandas.Series
        Numeric values to summarise (any object exposing ``.quantile`` works).
    k : float, default 1.5
        Multiple of the interquartile range subtracted below the first
        quartile. The default keeps the original 1.5 * IQR rule; a larger
        value gives a more permissive fence.

    Returns
    -------
    float
        Values below this threshold are treated as lower outliers.
    """
    q1, q3 = col.quantile(0.25), col.quantile(0.75)
    # IQR is the spread of the middle half of the data.
    return q1 - k * (q3 - q1)

passengers["passengers"].agg([get_upper_fence, get_lower_fence])
# agg allows multiple user-defined functions to be passed in a list;
# the result has one entry per function, labelled with the function's name.

get_upper_fence    631.25
get_lower_fence    -90.75
Name: passengers, dtype: float64

In [30]:
# You can also select several columns with a list to summarise them together;
# each function in the list is applied to every selected column.
passengers[["year", "passengers"]].agg([get_upper_fence, get_lower_fence])

Unnamed: 0,year,passengers
get_upper_fence,1965.5,631.25
get_lower_fence,1943.5,-90.75


## Sorting & Counting

In [34]:
passengers.sort_values("passengers").head(1)
# Sorted by passenger count, the first row is the lowest: Nov 1949 (104 passengers).

Unnamed: 0,year,month,passengers,cumulative_passengers,cumulative_maximum
10,1949,Nov,104,1402,148


In [35]:
passengers.sort_values("passengers", ascending=False).head(1)
# Sorted in descending order, the first row is the highest: Jul 1960 (622 passengers).

Unnamed: 0,year,month,passengers,cumulative_passengers,cumulative_maximum
138,1960,Jul,622,37966,622


In [39]:
passengers.drop_duplicates("month")
# Keeps only the first row seen for each month value, which here means the 1949 records.

Unnamed: 0,year,month,passengers,cumulative_passengers,cumulative_maximum
0,1949,Jan,112,112,112
1,1949,Feb,118,230,118
2,1949,Mar,132,362,132
3,1949,Apr,129,491,132
4,1949,May,121,612,132
5,1949,Jun,135,747,135
6,1949,Jul,148,895,148
7,1949,Aug,148,1043,148
8,1949,Sep,136,1179,148
9,1949,Oct,119,1298,148


In [40]:
passengers.drop_duplicates(subset=["year", "month"])
# Doesn't remove anything in this case (each year/month pair appears to occur only once),
# but useful when a duplicate is defined by a combination of columns.

Unnamed: 0,year,month,passengers,cumulative_passengers,cumulative_maximum
0,1949,Jan,112,112,112
1,1949,Feb,118,230,118
2,1949,Mar,132,362,132
3,1949,Apr,129,491,132
4,1949,May,121,612,132
...,...,...,...,...,...
139,1960,Aug,606,38572,622
140,1960,Sep,508,39080,622
141,1960,Oct,461,39541,622
142,1960,Nov,390,39931,622


In [45]:
# Count how many rows share each running maximum, i.e. how many months each
# record stood; the result is ordered from most to least frequent.
passengers["cumulative_maximum"].value_counts(sort=True)

cumulative_maximum
272    11
559    11
505    11
148    11
467    11
413    11
199    11
364    11
302    11
242    10
170     8
622     6
178     4
132     3
243     1
264     1
118     1
230     1
315     1
218     1
374     1
422     1
465     1
149     1
491     1
135     1
548     1
112     1
Name: count, dtype: int64

In [48]:
# normalize=True reports proportions instead of raw counts; every month
# accounts for an equal share of the rows, so all values are identical.
passengers["month"].value_counts(normalize=True, sort=True)

month
Jan    0.083333
Feb    0.083333
Mar    0.083333
Apr    0.083333
May    0.083333
Jun    0.083333
Jul    0.083333
Aug    0.083333
Sep    0.083333
Oct    0.083333
Nov    0.083333
Dec    0.083333
Name: proportion, dtype: float64