![alt text](pandas.png "Title")

In [2]:
import pandas as pd
import random

# Data aggregations and grouping

## Test data

In [3]:
# A simple demographic df
rows = 10
countries = ['USA', 'Switzerland', 'Belgium']
genders = ['Male', 'Female']
rawdata = {
    'usubjid': ['study_A_' + str(n) for n in range(1000, 1000 + rows)],
    'age'    : [random.randint(20, 50) for n in range(rows)], # randint() gives a random number within a range       
    'bmi'    : [random.randint(15, 35) for n in range(rows)],        
    'gender' : [random.choice(genders) for n in range(rows)], # choice() picks a value from an iterator
    'country': [random.choice(countries) for n in range(rows)] 
}

dm = pd.DataFrame(rawdata)

# look at the first 5 rows
dm.head()

Unnamed: 0,usubjid,age,bmi,gender,country
0,study_A_1000,30,29,Female,Belgium
1,study_A_1001,44,33,Female,Belgium
2,study_A_1002,44,20,Male,Belgium
3,study_A_1003,24,17,Female,USA
4,study_A_1004,45,21,Male,USA


## Data aggregation

A Dataframe object has many aggregation methods. They return a Series where the index holds the used df variable names

In [4]:
# Calculate the mean values of all numeric variables in the df:
dm.mean()

# many operations are available, e.g. count, sum, median, std, min, max, first, last

  dm.mean()


age    34.4
bmi    23.3
dtype: float64

In [8]:
# All combined in one shot:
df = dm.describe()
df


Unnamed: 0,age,bmi
count,10.0,10.0
mean,34.4,23.3
std,10.606078,6.129165
min,22.0,15.0
25%,24.25,20.0
50%,36.0,22.0
75%,44.0,27.5
max,46.0,33.0


In [5]:
# Some methods can work on strings too (lexicograph order)
dm.max()

usubjid    study_A_1009
age                  50
bmi                  32
gender             Male
country             USA
dtype: object

In [6]:
# You can also use agg() and pass a function name or a list of function names:
dm.agg(['median','min'])

Unnamed: 0,usubjid,age,bmi,gender,country
min,study_A_1000,25.0,17.0,Female,Belgium
median,,42.0,24.5,,


In [12]:
dm['age'].mean()

34.4

## Group by

In [7]:
# groupby() creates groups by variable values. No need to sort df beforehand.
# It returns a 'groupby' object, which also has many aggregation methods.

# let's see the mean values for all numeric variables, but this time by gender:
dm.groupby('gender').mean()

Unnamed: 0_level_0,age,bmi
gender,Unnamed: 1_level_1,Unnamed: 2_level_1
Female,44.333333,28.666667
Male,39.142857,23.857143


In [8]:
# The index was labelled after the group names, but maybe you don't want a labelled index:
dm.groupby(['gender'], as_index=False).mean()

Unnamed: 0,gender,age,bmi
0,Female,44.333333,28.666667
1,Male,39.142857,23.857143


In [9]:
# Multiple groups:
dm.groupby(['gender','country']).mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,age,bmi
gender,country,Unnamed: 2_level_1,Unnamed: 3_level_1
Female,Belgium,46.0,32.0
Female,USA,43.5,27.0
Male,Belgium,36.333333,24.333333
Male,Switzerland,41.666667,24.0
Male,USA,40.0,22.0


In [10]:
# We can apply groupby() on a Series:
grouped = dm['age'].groupby(dm['gender'])

# mean() still returns a Series
grouped.mean()

gender
Female    44.333333
Male      39.142857
Name: age, dtype: float64

In [13]:
# The 'group_by' object supports iteration, it returns a tuple containing the group name and the data:
for name, data in dm.groupby(['gender']):
    print('Group name=', name) # that's a string
    print(data, '\n')         # and that's a dataframe

Group name= Female
        usubjid  age  bmi  gender      country  mean  age_mean
0  study_A_1000   30   29  Female      Belgium  34.4      34.4
1  study_A_1001   44   33  Female      Belgium  34.4      34.4
3  study_A_1003   24   17  Female          USA  34.4      34.4
7  study_A_1007   25   23  Female  Switzerland  34.4      34.4
9  study_A_1009   42   20  Female          USA  34.4      34.4 

Group name= Male
        usubjid  age  bmi gender  country  mean  age_mean
2  study_A_1002   44   20   Male  Belgium  34.4      34.4
4  study_A_1004   45   21   Male      USA  34.4      34.4
5  study_A_1005   22   23   Male  Belgium  34.4      34.4
6  study_A_1006   22   15   Male  Belgium  34.4      34.4
8  study_A_1008   46   32   Male  Belgium  34.4      34.4 



  for name, data in dm.groupby(['gender']):


In [14]:
# We can also group data by applying a function on the index.

# let's set an index first:
dm1 = dm.set_index(['country'])
dm1


Unnamed: 0_level_0,usubjid,age,bmi,gender,mean,age_mean
country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Belgium,study_A_1000,30,29,Female,34.4,34.4
Belgium,study_A_1001,44,33,Female,34.4,34.4
Belgium,study_A_1002,44,20,Male,34.4,34.4
USA,study_A_1003,24,17,Female,34.4,34.4
USA,study_A_1004,45,21,Male,34.4,34.4
Belgium,study_A_1005,22,23,Male,34.4,34.4
Belgium,study_A_1006,22,15,Male,34.4,34.4
Switzerland,study_A_1007,25,23,Female,34.4,34.4
Belgium,study_A_1008,46,32,Male,34.4,34.4
USA,study_A_1009,42,20,Female,34.4,34.4


In [15]:
# Group by length of 'country' and show the max values in numeric variables:
dm1[['age','bmi']].groupby(len).max()

Unnamed: 0_level_0,age,bmi
country,Unnamed: 1_level_1,Unnamed: 2_level_1
3,45,21
7,46,33
11,25,23


## More sophistication

Let's create a Age mean by gender and add it as a new column. 

In [18]:
# Remember apply() ? It works on groupby objects too.
# We could use mean() directly instead but apply() gives more flexibility, especially combined with a lambda function.

# this creates a Series (because we subsetted dm to keep only the age column)
age_mean = dm.groupby('gender')['age'].apply(lambda group: group.mean())
age_mean

# Does the same: age_mean = dm.groupby('gender')['age'].mean()

gender
Female    33.0
Male      35.8
Name: age, dtype: float64

In [19]:
# Before we can add this on dm, the Series and the Dataframe must share the same index. 
dm2 = dm.set_index('gender')
dm2

Unnamed: 0_level_0,usubjid,age,bmi,country,mean,age_mean
gender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Female,study_A_1000,30,29,Belgium,34.4,34.4
Female,study_A_1001,44,33,Belgium,34.4,34.4
Male,study_A_1002,44,20,Belgium,34.4,34.4
Female,study_A_1003,24,17,USA,34.4,34.4
Male,study_A_1004,45,21,USA,34.4,34.4
Male,study_A_1005,22,23,Belgium,34.4,34.4
Male,study_A_1006,22,15,Belgium,34.4,34.4
Female,study_A_1007,25,23,Switzerland,34.4,34.4
Male,study_A_1008,46,32,Belgium,34.4,34.4
Female,study_A_1009,42,20,USA,34.4,34.4


In [20]:
# and now we can add the Series easily onto the Dataframe
dm2['age_mean'] = age_mean

dm2.sort_index()

Unnamed: 0_level_0,usubjid,age,bmi,country,mean,age_mean
gender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Female,study_A_1000,30,29,Belgium,34.4,33.0
Female,study_A_1001,44,33,Belgium,34.4,33.0
Female,study_A_1003,24,17,USA,34.4,33.0
Female,study_A_1007,25,23,Switzerland,34.4,33.0
Female,study_A_1009,42,20,USA,34.4,33.0
Male,study_A_1002,44,20,Belgium,34.4,35.8
Male,study_A_1004,45,21,USA,34.4,35.8
Male,study_A_1005,22,23,Belgium,34.4,35.8
Male,study_A_1006,22,15,Belgium,34.4,35.8
Male,study_A_1008,46,32,Belgium,34.4,35.8


Another example: let's select and sort the 3 highest bmi by gender.

In [21]:
dm

Unnamed: 0,usubjid,age,bmi,gender,country,mean,age_mean
0,study_A_1000,30,29,Female,Belgium,34.4,34.4
1,study_A_1001,44,33,Female,Belgium,34.4,34.4
2,study_A_1002,44,20,Male,Belgium,34.4,34.4
3,study_A_1003,24,17,Female,USA,34.4,34.4
4,study_A_1004,45,21,Male,USA,34.4,34.4
5,study_A_1005,22,23,Male,Belgium,34.4,34.4
6,study_A_1006,22,15,Male,Belgium,34.4,34.4
7,study_A_1007,25,23,Female,Switzerland,34.4,34.4
8,study_A_1008,46,32,Male,Belgium,34.4,34.4
9,study_A_1009,42,20,Female,USA,34.4,34.4


In [13]:
def highest_bmi(group, select=3):
    return group.sort_values(by='bmi', ascending=False)[-select:]
    
dm.groupby('gender').apply(highest_bmi)

# or alternatively, if you like lambdas:
dm.groupby('gender').apply(lambda group: group.sort_values(by='bmi', ascending=False)[-3:])

Unnamed: 0_level_0,Unnamed: 1_level_0,usubjid,age,bmi,gender,country
gender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Female,4,study_A_1004,24,18,Female,USA
Female,3,study_A_1003,36,17,Female,USA
Female,9,study_A_1009,40,16,Female,USA
Male,1,study_A_1001,28,35,Male,Belgium
Male,0,study_A_1000,41,25,Male,Switzerland
Male,8,study_A_1008,49,22,Male,USA


__________________________________________________
Nicolas Dupuis, Methodology and Innovation (IDAR C&SP), 2020+