In [17]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

In [6]:
df = pd.read_csv('data/large_countries_2015.csv', index_col=0)

In [7]:
df.head()

Unnamed: 0,population,fertility,continent
Bangladesh,160995600.0,2.12,Asia
Brazil,207847500.0,1.78,South America
China,1376049000.0,1.57,Asia
India,1311051000.0,2.43,Asia
Indonesia,257563800.0,2.28,Asia


In [8]:
df.groupby(by='continent').mean()

Unnamed: 0_level_0,population,fertility
continent,Unnamed: 1_level_1,Unnamed: 2_level_1
Africa,182202000.0,5.89
Asia,503122400.0,2.267143
Europe,143456900.0,1.61
North America,224395400.0,2.05
South America,207847500.0,1.78


In [9]:
df['population'] = df['population'] / 1000000
df['population'] = round(df['population'], 1) 

In [10]:
df.head()

Unnamed: 0,population,fertility,continent
Bangladesh,161.0,2.12,Asia
Brazil,207.8,1.78,South America
China,1376.0,1.57,Asia
India,1311.1,2.43,Asia
Indonesia,257.6,2.28,Asia


In [11]:
# Calculate the average population size of the large countries
average_population = df['population'].mean()

In [13]:
# Calculate the average population size by continent
avg_pop_per_continent = round(df.groupby('continent')['population'].mean(), 2)

In [14]:
avg_pop_per_continent

continent
Africa           182.20
Asia             503.13
Europe           143.50
North America    224.40
South America    207.80
Name: population, dtype: float64

In [15]:
# 1. by column
g1 = df.groupby('continent')
g1.groups


{'Africa': ['Nigeria'], 'Asia': ['Bangladesh', 'China', 'India', 'Indonesia', 'Japan', 'Pakistan', 'Philippines'], 'Europe': ['Russia'], 'North America': ['Mexico', 'United States'], 'South America': ['Brazil']}

In [18]:
# 2. by an array of equal length
industrialized = np.array([False, True, True, True, False, True, True, False, False, False, True, True])
g2 = df.groupby(industrialized)
g2.groups

{False: ['Bangladesh', 'Indonesia', 'Nigeria', 'Pakistan', 'Philippines'], True: ['Brazil', 'China', 'India', 'Japan', 'Mexico', 'Russia', 'United States']}

In [19]:
# 3. by a Dictionary with keys on the Index
language = {'Bangladesh':'BN', 'Brazil':'PT', 'China':'CN',
            'India':'BN', 'Indonesia':'MS', 'Japan':'JP',
            'Mexico':'ES', 'Nigeria':'NG', 'Pakistan':'UR',
            'Philippines':'PP', 'Russia':'RU', 'United States':'EN'}
g3 = df.groupby(language)
g3.groups



{'BN': ['Bangladesh', 'India'], 'CN': ['China'], 'EN': ['United States'], 'ES': ['Mexico'], 'JP': ['Japan'], 'MS': ['Indonesia'], 'NG': ['Nigeria'], 'PP': ['Philippines'], 'PT': ['Brazil'], 'RU': ['Russia'], 'UR': ['Pakistan']}

In [20]:
# 4. by a function
g4 = df.groupby(len)
g4.groups


{5: ['China', 'India', 'Japan'], 6: ['Brazil', 'Mexico', 'Russia'], 7: ['Nigeria'], 8: ['Pakistan'], 9: ['Indonesia'], 10: ['Bangladesh'], 11: ['Philippines'], 13: ['United States']}

In [21]:

# 5. a list of the above
g5 = df.groupby(['continent', language, len])
g5.groups


{('Africa', 'NG', 7): ['Nigeria'], ('Asia', 'BN', 5): ['India'], ('Asia', 'BN', 10): ['Bangladesh'], ('Asia', 'CN', 5): ['China'], ('Asia', 'JP', 5): ['Japan'], ('Asia', 'MS', 9): ['Indonesia'], ('Asia', 'PP', 11): ['Philippines'], ('Asia', 'UR', 8): ['Pakistan'], ('Europe', 'RU', 6): ['Russia'], ('North America', 'EN', 13): ['United States'], ('North America', 'ES', 6): ['Mexico'], ('South America', 'PT', 6): ['Brazil']}

In [24]:

# 6. group along the x-axis
g6 = df[['population', 'fertility']].transpose().groupby(len, axis=1)
g6.groups

  g6 = df[['population', 'fertility']].transpose().groupby(len, axis=1)


{5: ['China', 'India', 'Japan'], 6: ['Brazil', 'Mexico', 'Russia'], 7: ['Nigeria'], 8: ['Pakistan'], 9: ['Indonesia'], 10: ['Bangladesh'], 11: ['Philippines'], 13: ['United States']}

In [25]:
for i, df_group in df.groupby('continent'):
    print(i, df_group, '\n')

Africa          population  fertility continent
Nigeria       182.2       5.89    Africa 

Asia              population  fertility continent
Bangladesh        161.0       2.12      Asia
China            1376.0       1.57      Asia
India            1311.1       2.43      Asia
Indonesia         257.6       2.28      Asia
Japan             126.6       1.45      Asia
Pakistan          188.9       3.04      Asia
Philippines       100.7       2.98      Asia 

Europe         population  fertility continent
Russia       143.5       1.61    Europe 

North America                population  fertility      continent
Mexico              127.0       2.13  North America
United States       321.8       1.97  North America 

South America         population  fertility      continent
Brazil       207.8       1.78  South America 



In [27]:
g = df.groupby('continent')

# standard aggregation functions
g.mean()
g.max()
g.min()
g.sum()
g.count()
g.std()
g.median()
g.quantile(0.9)
g.describe()

# Aggregation with selecting columns
g['population'].describe()

# Aggregation with a list of function names
g.agg(['count', 'mean', 'std'])
g.agg([('Total', 'sum')])        # includes label

# custom aggregation function with parameter
def sum_greater(dataframe, threshold):
    for column in dataframe.columns:
        return dataframe[dataframe[column]>threshold].sum()
    
g.agg(sum_greater, threshold=200)


Unnamed: 0_level_0,population,fertility
continent,Unnamed: 1_level_1,Unnamed: 2_level_1
Africa,0.0,0.0
Asia,2944.7,6.28
Europe,0.0,0.0
North America,321.8,1.97
South America,207.8,1.78


In [26]:

# Transformation by function name
g.transform('mean')

NameError: name 'g' is not defined