In [42]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

In [43]:
df = pd.read_csv('data/large_countries_2015.csv', index_col=0)

In [44]:
df.head()

Unnamed: 0,population,fertility,continent
Bangladesh,160995600.0,2.12,Asia
Brazil,207847500.0,1.78,South America
China,1376049000.0,1.57,Asia
India,1311051000.0,2.43,Asia
Indonesia,257563800.0,2.28,Asia


In [61]:
round(df.groupby(by='continent')[['fertility', 'population']].aggregate(['mean', 'median', 'min', 'max']),1)

Unnamed: 0_level_0,fertility,fertility,fertility,fertility,population,population,population,population
Unnamed: 0_level_1,mean,median,min,max,mean,median,min,max
continent,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
Africa,5.9,5.9,5.9,5.9,182.2,182.2,182.2,182.2
Asia,2.3,2.3,1.4,3.0,503.1,188.9,100.7,1376.0
Europe,1.6,1.6,1.6,1.6,143.5,143.5,143.5,143.5
North America,2.0,2.0,2.0,2.1,224.4,224.4,127.0,321.8
South America,1.8,1.8,1.8,1.8,207.8,207.8,207.8,207.8


In [46]:
df['population'] = df['population'] / 1000000
df['population'] = round(df['population'], 1) 

In [66]:
df.shape

(12, 3)

In [11]:
# Calculate the average population size of the large countries
average_population = df['population'].mean()

In [13]:
# Calculate the average population size by continent
avg_pop_per_continent = round(df.groupby('continent')['population'].mean(), 2)

In [14]:
avg_pop_per_continent

continent
Africa           182.20
Asia             503.13
Europe           143.50
North America    224.40
South America    207.80
Name: population, dtype: float64

In [67]:
# 1. by column
g1 = df.groupby('continent')
g1.groups


{'Africa': ['Nigeria'], 'Asia': ['Bangladesh', 'China', 'India', 'Indonesia', 'Japan', 'Pakistan', 'Philippines'], 'Europe': ['Russia'], 'North America': ['Mexico', 'United States'], 'South America': ['Brazil']}

In [68]:
# 2. by an array of equal length
industrialized = np.array([False, True, True, True, False, True, True, False, False, False, True, True])
g2 = df.groupby(industrialized)
g2.groups

{False: ['Bangladesh', 'Indonesia', 'Nigeria', 'Pakistan', 'Philippines'], True: ['Brazil', 'China', 'India', 'Japan', 'Mexico', 'Russia', 'United States']}

In [19]:
# 3. by a Dictionary with keys on the Index
language = {'Bangladesh':'BN', 'Brazil':'PT', 'China':'CN',
            'India':'BN', 'Indonesia':'MS', 'Japan':'JP',
            'Mexico':'ES', 'Nigeria':'NG', 'Pakistan':'UR',
            'Philippines':'PP', 'Russia':'RU', 'United States':'EN'}
g3 = df.groupby(language)
g3.groups

{'BN': ['Bangladesh', 'India'], 'CN': ['China'], 'EN': ['United States'], 'ES': ['Mexico'], 'JP': ['Japan'], 'MS': ['Indonesia'], 'NG': ['Nigeria'], 'PP': ['Philippines'], 'PT': ['Brazil'], 'RU': ['Russia'], 'UR': ['Pakistan']}

In [69]:
# 4. by a function
g4 = df.groupby(len)
g4.groups


{5: ['China', 'India', 'Japan'], 6: ['Brazil', 'Mexico', 'Russia'], 7: ['Nigeria'], 8: ['Pakistan'], 9: ['Indonesia'], 10: ['Bangladesh'], 11: ['Philippines'], 13: ['United States']}

In [71]:

# 5. a list of the above
g5 = df.groupby(['continent', language, len])
g5.groups


{('Africa', 'NG', 7): ['Nigeria'], ('Asia', 'BN', 5): ['India'], ('Asia', 'BN', 10): ['Bangladesh'], ('Asia', 'CN', 5): ['China'], ('Asia', 'JP', 5): ['Japan'], ('Asia', 'MS', 9): ['Indonesia'], ('Asia', 'PP', 11): ['Philippines'], ('Asia', 'UR', 8): ['Pakistan'], ('Europe', 'RU', 6): ['Russia'], ('North America', 'EN', 13): ['United States'], ('North America', 'ES', 6): ['Mexico'], ('South America', 'PT', 6): ['Brazil']}

In [76]:
g5.agg(['mean', 'median'])

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,population,population,fertility,fertility
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,mean,median,mean,median
continent,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
Africa,NG,7,182.2,182.2,5.89,5.89
Asia,BN,5,1311.1,1311.1,2.43,2.43
Asia,BN,10,161.0,161.0,2.12,2.12
Asia,CN,5,1376.0,1376.0,1.57,1.57
Asia,JP,5,126.6,126.6,1.45,1.45
Asia,MS,9,257.6,257.6,2.28,2.28
Asia,PP,11,100.7,100.7,2.98,2.98
Asia,UR,8,188.9,188.9,3.04,3.04
Europe,RU,6,143.5,143.5,1.61,1.61
North America,EN,13,321.8,321.8,1.97,1.97


In [24]:

# 6. group along the x-axis
g6 = df[['population', 'fertility']].transpose().groupby(len, axis=1)
g6.groups

  g6 = df[['population', 'fertility']].transpose().groupby(len, axis=1)


{5: ['China', 'India', 'Japan'], 6: ['Brazil', 'Mexico', 'Russia'], 7: ['Nigeria'], 8: ['Pakistan'], 9: ['Indonesia'], 10: ['Bangladesh'], 11: ['Philippines'], 13: ['United States']}

In [25]:
for i, df_group in df.groupby('continent'):
    print(i, df_group, '\n')

Africa          population  fertility continent
Nigeria       182.2       5.89    Africa 

Asia              population  fertility continent
Bangladesh        161.0       2.12      Asia
China            1376.0       1.57      Asia
India            1311.1       2.43      Asia
Indonesia         257.6       2.28      Asia
Japan             126.6       1.45      Asia
Pakistan          188.9       3.04      Asia
Philippines       100.7       2.98      Asia 

Europe         population  fertility continent
Russia       143.5       1.61    Europe 

North America                population  fertility      continent
Mexico              127.0       2.13  North America
United States       321.8       1.97  North America 

South America         population  fertility      continent
Brazil       207.8       1.78  South America 



In [77]:
g = df.groupby('continent')

# standard aggregation functions
g.mean()
g.max()
g.min()
g.sum()
g.count()
g.std()
g.median()
g.quantile(0.9)
g.describe()



Unnamed: 0_level_0,population,population,population,population,population,population,population,population,fertility,fertility,fertility,fertility,fertility,fertility,fertility,fertility
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max
continent,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2
Africa,1.0,182.2,,182.2,182.2,182.2,182.2,182.2,1.0,5.89,,5.89,5.89,5.89,5.89,5.89
Asia,7.0,503.128571,576.55886,100.7,143.8,188.9,784.35,1376.0,7.0,2.267143,0.620154,1.45,1.845,2.28,2.705,3.04
Europe,1.0,143.5,,143.5,143.5,143.5,143.5,143.5,1.0,1.61,,1.61,1.61,1.61,1.61,1.61
North America,2.0,224.4,137.744401,127.0,175.7,224.4,273.1,321.8,2.0,2.05,0.113137,1.97,2.01,2.05,2.09,2.13
South America,1.0,207.8,,207.8,207.8,207.8,207.8,207.8,1.0,1.78,,1.78,1.78,1.78,1.78,1.78


In [83]:
# Aggregation with selecting columns
g['population'].describe()



Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
continent,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Africa,1.0,182.2,,182.2,182.2,182.2,182.2,182.2
Asia,7.0,503.128571,576.55886,100.7,143.8,188.9,784.35,1376.0
Europe,1.0,143.5,,143.5,143.5,143.5,143.5,143.5
North America,2.0,224.4,137.744401,127.0,175.7,224.4,273.1,321.8
South America,1.0,207.8,,207.8,207.8,207.8,207.8,207.8


In [84]:
# Aggregation with a list of function names
g.agg(['count', 'mean', 'std'])
g.agg([('Total', 'sum')])        # includes label

Unnamed: 0_level_0,population,fertility
Unnamed: 0_level_1,Total,Total
continent,Unnamed: 1_level_2,Unnamed: 2_level_2
Africa,182.2,5.89
Asia,3521.9,15.87
Europe,143.5,1.61
North America,448.8,4.1
South America,207.8,1.78


In [88]:
# custom aggregation function with parameter
def sum_greater(dataframe, threshold):
    for column in dataframe.columns:
        return dataframe[dataframe[column]>threshold].sum()
    
g.agg(sum_greater, threshold=200)


Unnamed: 0_level_0,population,fertility
continent,Unnamed: 1_level_1,Unnamed: 2_level_1
Africa,0.0,0.0
Asia,2944.7,6.28
Europe,0.0,0.0
North America,321.8,1.97
South America,207.8,1.78


In [89]:

# Transformation by function name
g.transform('mean')

Unnamed: 0,population,fertility
Bangladesh,503.128571,2.267143
Brazil,207.8,1.78
China,503.128571,2.267143
India,503.128571,2.267143
Indonesia,503.128571,2.267143
Japan,503.128571,2.267143
Mexico,224.4,2.05
Nigeria,182.2,5.89
Pakistan,503.128571,2.267143
Philippines,503.128571,2.267143


In [91]:

# Transformation by function reference
g.transform(len)



Unnamed: 0,population,fertility
Bangladesh,7,7
Brazil,1,1
China,7,7
India,7,7
Indonesia,7,7
Japan,7,7
Mexico,2,2
Nigeria,1,1
Pakistan,7,7
Philippines,7,7


In [93]:
# Transformation with your own function
def normalize(array):
    return array - array.mean()

g.transform(normalize)

Unnamed: 0,population,fertility
Bangladesh,-342.128571,-0.147143
Brazil,0.0,0.0
China,872.871429,-0.697143
India,807.971429,0.162857
Indonesia,-245.528571,0.012857
Japan,-376.528571,-0.817143
Mexico,-97.4,0.08
Nigeria,0.0,0.0
Pakistan,-314.228571,0.772857
Philippines,-402.428571,0.712857


In [94]:
# apply any function
def first_two(df):
    return df.head(2)

g.apply(first_two)

Unnamed: 0_level_0,Unnamed: 1_level_0,population,fertility,continent
continent,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Africa,Nigeria,182.2,5.89,Africa
Asia,Bangladesh,161.0,2.12,Asia
Asia,China,1376.0,1.57,Asia
Europe,Russia,143.5,1.61,Europe
North America,Mexico,127.0,2.13,North America
North America,United States,321.8,1.97,North America
South America,Brazil,207.8,1.78,South America
