In [1]:
import pandas as pd

In [2]:
df = pd.read_csv('../data/gapminder.tsv', sep='\t')

In [3]:
# Calculate the avg life expectancy for each year
avg_life_exp_by_year = df.groupby('year')["lifeExp"].mean()
avg_life_exp_by_year

year
1952    49.057620
1957    51.507401
1962    53.609249
1967    55.678290
1972    57.647386
1977    59.570157
1982    61.533197
1987    63.212613
1992    64.160338
1997    65.014676
2002    65.694923
2007    67.007423
Name: lifeExp, dtype: float64

In [4]:
# Groupby statements can be thought of as creating a subset of each unique value of a column(or pairs from columns)
# get a list of unique years in the data
years = df.year.unique()
years

array([1952, 1957, 1962, 1967, 1972, 1977, 1982, 1987, 1992, 1997, 2002,
       2007], dtype=int64)

In [5]:
# we can go through each of the years and subset the data
# example: subset the data for year 
y1952 = df.loc[df.year == 1952, :]
y1952

Unnamed: 0,country,continent,year,lifeExp,pop,gdpPercap
0,Afghanistan,Asia,1952,28.801,8425333,779.445314
12,Albania,Europe,1952,55.230,1282697,1601.056136
24,Algeria,Africa,1952,43.077,9279525,2449.008185
36,Angola,Africa,1952,30.015,4232095,3520.610273
48,Argentina,Americas,1952,62.485,17876956,5911.315053
...,...,...,...,...,...,...
1644,Vietnam,Asia,1952,40.412,26246839,605.066492
1656,West Bank and Gaza,Asia,1952,43.160,1030585,1515.592329
1668,"Yemen, Rep.",Asia,1952,32.548,4963829,781.717576
1680,Zambia,Africa,1952,42.038,2672000,1147.388831


In [6]:
# Finally, we can take the mean of the life expectancy
y1952_mean = y1952["lifeExp"].mean() # the groupby 
y1952_mean

49.057619718309866

In [7]:
# Table 8.1 (page 178) of the book lists all the methods and functions that we can use with the groupby()
# group by continent and describe each group
continent_describe = df.groupby('continent')["lifeExp"].describe()
continent_describe

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
continent,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Africa,624.0,48.86533,9.15021,23.599,42.3725,47.792,54.4115,76.442
Americas,300.0,64.658737,9.345088,37.579,58.41,67.048,71.6995,80.653
Asia,396.0,60.064903,11.864532,28.801,51.42625,61.7915,69.50525,82.603
Europe,360.0,71.903686,5.433178,43.585,69.57,72.241,75.4505,81.757
Oceania,24.0,74.326208,3.795611,69.12,71.205,73.665,77.5525,81.235


In [8]:
# the agg(), which stands for aggregation, is used for other functions not listed in the groupby table
# We can use the mean() function from the numpy library by passing the function into the agg() method
import numpy as np
# calculate the avg life expectancy per continent
# but use the agg() method
cont_le_agg = df.groupby('continent')["lifeExp"].agg(np.mean)
cont_le_agg

  cont_le_agg = df.groupby('continent')["lifeExp"].agg(np.mean)


continent
Africa      48.865330
Americas    64.658737
Asia        60.064903
Europe      71.903686
Oceania     74.326208
Name: lifeExp, dtype: float64

In [9]:
# we can also create custom calculations/formulass and pass them with agg()
# create a custom function
def my_mean(values):
    """my version of calculating a mean"""
    n = len(values)
    sum = 0
    for value in values:
        # add each value to the running sum
        sum += value
    return sum/n


In [10]:
# now we use our created function inside agg()
agg_my_mean = df.groupby('year')["lifeExp"].agg(my_mean)
agg_my_mean

year
1952    49.057620
1957    51.507401
1962    53.609249
1967    55.678290
1972    57.647386
1977    59.570157
1982    61.533197
1987    63.212613
1992    64.160338
1997    65.014676
2002    65.694923
2007    67.007423
Name: lifeExp, dtype: float64

In [11]:
# in this example, we will calculate the global avg life expect, diff_value and substract it from each grouped value
def my_mean_diff(values, diff_value):
    """Difference between the mean and diff_vaue"""
    n = len(values)
    sum = 0
    for value in values:
        sum += value
    mean = sum/n
    return   (mean - diff_value)

In [12]:
# calculate the global avg life expect mean
global_mean = df["lifeExp"].mean()
global_mean

59.474439366197174

In [13]:
# custom aggregation function with multiple parameters
agg_mean_diff = (
    df
    .groupby("year")
    ["lifeExp"]
    .agg(my_mean_diff, diff_value=global_mean)
)
agg_mean_diff

year
1952   -10.416820
1957    -7.967038
1962    -5.865190
1967    -3.796150
1972    -1.827053
1977     0.095718
1982     2.058758
1987     3.738173
1992     4.685899
1997     5.540237
2002     6.220483
2007     7.532983
Name: lifeExp, dtype: float64

In [14]:
# 8.1.4 Multiple Functions Simultaneously
# calculate the count, mean, std of the lifeExp by continent
gdf = (
    df
    .groupby("year")
    ["lifeExp"]
    .agg([np.count_nonzero, np.mean, np.std])
)
gdf

  .agg([np.count_nonzero, np.mean, np.std])
  .agg([np.count_nonzero, np.mean, np.std])


Unnamed: 0_level_0,count_nonzero,mean,std
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1952,142,49.05762,12.225956
1957,142,51.507401,12.231286
1962,142,53.609249,12.097245
1967,142,55.67829,11.718858
1972,142,57.647386,11.381953
1977,142,59.570157,11.227229
1982,142,61.533197,10.770618
1987,142,63.212613,10.556285
1992,142,64.160338,11.22738
1997,142,65.014676,11.559439


In [16]:
# 8.1.5 Use a Dict in agg()
# you can pass a dict when using agg method but result depends if applying to a dataframe or a series
# On a dataframe: the keys are the columns and the values are the functions used in the aggregate
gdf_dict = df.groupby("year").agg(
    {
        "lifeExp": "mean",
        "pop": "median",
        "gdpPercap": "median"
    }
)
gdf_dict


Unnamed: 0_level_0,lifeExp,pop,gdpPercap
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1952,49.05762,3943953.0,1968.528344
1957,51.507401,4282942.0,2173.220291
1962,53.609249,4686039.5,2335.439533
1967,55.67829,5170175.5,2678.33474
1972,57.647386,5877996.5,3339.129407
1977,59.570157,6404036.5,3798.609244
1982,61.533197,7007320.0,4216.228428
1987,63.212613,7774861.5,4280.300366
1992,64.160338,8688686.5,4386.085502
1997,65.014676,9735063.5,4781.825478


In [17]:
# 8.1.5.2 On a Series
# To have user-defined column names, we need to renae those columns after the fact
gdf = (
    df
    .groupby("year")
    ["lifeExp"]
    .agg(
        [
            np.count_nonzero,
            np.mean,
            np.std,
        ]
    )
    .rename(
        columns={
            "count_nonzero": "count",
            "mean": "avg",
            "std": "std_dev",
        }
    )
    .reset_index() # return a flat dataframe
)
gdf

  .agg(
  .agg(


Unnamed: 0,year,count,avg,std_dev
0,1952,142,49.05762,12.225956
1,1957,142,51.507401,12.231286
2,1962,142,53.609249,12.097245
3,1967,142,55.67829,11.718858
4,1972,142,57.647386,11.381953
5,1977,142,59.570157,11.227229
6,1982,142,61.533197,10.770618
7,1987,142,63.212613,10.556285
8,1992,142,64.160338,11.22738
9,1997,142,65.014676,11.559439


In [18]:
# 8.2 Transform. takes multiple values and retruns a one-to-one tranformation of the values
def my_zscore(x):
    """calculates the z-score of provided data
       'x' is a series or a vector
    """
    return ((x - x.mean()) / x.std())

In [19]:
transform_z = df.groupby('year')["lifeExp"].transform(my_zscore)
transform_z

0      -1.656854
1      -1.731249
2      -1.786543
3      -1.848157
4      -1.894173
          ...   
1699   -0.081621
1700   -0.336974
1701   -1.574962
1702   -2.093346
1703   -1.948180
Name: lifeExp, Length: 1704, dtype: float64

In [22]:
# note the number of rows in our data
df.shape


(1704, 6)

In [23]:
transform_z.shape

(1704,)

In [None]:
# Now, we will compare when we use scipy's own function zscore instead of using groupby
from scipy.stats import zscore
