In [1]:
%run common.ipynb

# World Oil consumption

In [2]:
import pandas as pd
import numpy as np

# paths to the most commonly used dataset repositories for this lab

# path to pandas_for_everyone datasets repository
pfe_rep_path = 'https://raw.githubusercontent.com/chendaniely/pandas_for_everyone/master/data/'

# path to data-wrangling-datasets repository
sv_rep_path = '../data/'

In [3]:
source = sv_rep_path + 'oil_consumption_total.csv'

oil_df = pd.read_csv(source)
oil_df.head()

Unnamed: 0,country,1965,1966,1967,1968,1969,1970,1971,1972,1973,...,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019
0,United Arab Emirates,69.6k,74.3k,79.8k,89.1k,99k,115k,136k,179k,280k,...,31.3M,33.8M,35.2M,39M,39.3M,42.2M,44.9M,44.2M,46M,44.6M
1,Argentina,22M,22.8M,23.3M,23.8M,24.7M,22.1M,23.6M,23.6M,23.7M,...,27.5M,27.4M,28.6M,30.7M,30.1M,30.9M,30.2M,29.4M,28.1M,27.6M
2,Australia,15.1M,18M,19.7M,21.5M,22.1M,24.4M,25.7M,26.1M,27.9M,...,41.6M,44.4M,46.1M,46.9M,47.4M,45.6M,46M,48.1M,49.5M,49.1M
3,Austria,5.56M,6.12M,6.51M,7.54M,8.28M,9.1M,10.2M,11M,11.9M,...,12.7M,11.9M,11.9M,12.2M,11.7M,11.7M,12.1M,12.3M,12.4M,12.7M
4,Azerbaijan,,,,,,,,,,...,3.26M,3.99M,4.17M,4.54M,4.45M,4.54M,4.52M,4.56M,4.78M,4.93M


In [4]:
# Tidy the dataset
replace_dict = {'[kK]': '*1e3',
             '[mM]': '*1e6',
             '[bB]': '*1e9', 
            }

# make sure not to make replacements in the `country` column
oil_df.iloc[:,1:] = oil_df.iloc[:,1:].replace(replace_dict, 
                                                    regex=True      # set regex to true
                                                    )

oil_df.head()

Unnamed: 0,country,1965,1966,1967,1968,1969,1970,1971,1972,1973,...,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019
0,United Arab Emirates,69.6*1e3,74.3*1e3,79.8*1e3,89.1*1e3,99*1e3,115*1e3,136*1e3,179*1e3,280*1e3,...,31.3*1e6,33.8*1e6,35.2*1e6,39*1e6,39.3*1e6,42.2*1e6,44.9*1e6,44.2*1e6,46*1e6,44.6*1e6
1,Argentina,22*1e6,22.8*1e6,23.3*1e6,23.8*1e6,24.7*1e6,22.1*1e6,23.6*1e6,23.6*1e6,23.7*1e6,...,27.5*1e6,27.4*1e6,28.6*1e6,30.7*1e6,30.1*1e6,30.9*1e6,30.2*1e6,29.4*1e6,28.1*1e6,27.6*1e6
2,Australia,15.1*1e6,18*1e6,19.7*1e6,21.5*1e6,22.1*1e6,24.4*1e6,25.7*1e6,26.1*1e6,27.9*1e6,...,41.6*1e6,44.4*1e6,46.1*1e6,46.9*1e6,47.4*1e6,45.6*1e6,46*1e6,48.1*1e6,49.5*1e6,49.1*1e6
3,Austria,5.56*1e6,6.12*1e6,6.51*1e6,7.54*1e6,8.28*1e6,9.1*1e6,10.2*1e6,11*1e6,11.9*1e6,...,12.7*1e6,11.9*1e6,11.9*1e6,12.2*1e6,11.7*1e6,11.7*1e6,12.1*1e6,12.3*1e6,12.4*1e6,12.7*1e6
4,Azerbaijan,,,,,,,,,,...,3.26*1e6,3.99*1e6,4.17*1e6,4.54*1e6,4.45*1e6,4.54*1e6,4.52*1e6,4.56*1e6,4.78*1e6,4.93*1e6


In [5]:
oil_df = pd.melt(oil_df, id_vars='country', var_name='year', value_name='oil')

oil_df

Unnamed: 0,country,year,oil
0,United Arab Emirates,1965,69.6*1e3
1,Argentina,1965,22*1e6
2,Australia,1965,15.1*1e6
3,Austria,1965,5.56*1e6
4,Azerbaijan,1965,
...,...,...,...
4395,USSR,2019,
4396,Uzbekistan,2019,2.09*1e6
4397,Venezuela,2019,16.2*1e6
4398,Vietnam,2019,24.6*1e6


In [6]:
oil_df_2 = oil_df.dropna(axis=0)
oil_df_2

Unnamed: 0,country,year,oil
0,United Arab Emirates,1965,69.6*1e3
1,Argentina,1965,22*1e6
2,Australia,1965,15.1*1e6
3,Austria,1965,5.56*1e6
5,Belgium,1965,15.9*1e6
...,...,...,...
4394,United States,2019,842*1e6
4396,Uzbekistan,2019,2.09*1e6
4397,Venezuela,2019,16.2*1e6
4398,Vietnam,2019,24.6*1e6


In [7]:
oil_df_2['oil'] = oil_df_2['oil'].map(pd.eval)

In [8]:
oil_df_2

Unnamed: 0,country,year,oil
0,United Arab Emirates,1965,69600.0
1,Argentina,1965,22000000.0
2,Australia,1965,15100000.0
3,Austria,1965,5560000.0
5,Belgium,1965,15900000.0
...,...,...,...
4394,United States,2019,842000000.0
4396,Uzbekistan,2019,2090000.0
4397,Venezuela,2019,16200000.0
4398,Vietnam,2019,24600000.0


In [9]:
oil_df_2 = oil_df_2.merge(oil_df, on=['country', 'year'], how='right', suffixes=('', '_raw'))
oil_df_2

Unnamed: 0,country,year,oil,oil_raw
0,United Arab Emirates,1965,69600.0,69.6*1e3
1,Argentina,1965,22000000.0,22*1e6
2,Australia,1965,15100000.0,15.1*1e6
3,Austria,1965,5560000.0,5.56*1e6
4,Azerbaijan,1965,,
...,...,...,...,...
4395,USSR,2019,,
4396,Uzbekistan,2019,2090000.0,2.09*1e6
4397,Venezuela,2019,16200000.0,16.2*1e6
4398,Vietnam,2019,24600000.0,24.6*1e6


In [10]:
oil_df = oil_df_2.drop('oil_raw', axis=1)
oil_df

Unnamed: 0,country,year,oil
0,United Arab Emirates,1965,69600.0
1,Argentina,1965,22000000.0
2,Australia,1965,15100000.0
3,Austria,1965,5560000.0
4,Azerbaijan,1965,
...,...,...,...
4395,USSR,2019,
4396,Uzbekistan,2019,2090000.0
4397,Venezuela,2019,16200000.0
4398,Vietnam,2019,24600000.0


In [11]:
oil_df

Unnamed: 0,country,year,oil
0,United Arab Emirates,1965,69600.0
1,Argentina,1965,22000000.0
2,Australia,1965,15100000.0
3,Austria,1965,5560000.0
4,Azerbaijan,1965,
...,...,...,...
4395,USSR,2019,
4396,Uzbekistan,2019,2090000.0
4397,Venezuela,2019,16200000.0
4398,Vietnam,2019,24600000.0


In [12]:
grouped = oil_df.groupby('country')

plt.figure()
for name, group in grouped:
    # print(group[group['oil'].isna()])
    group['oil'] = group['oil'].interpolate(method='linear', limit_direction='backward', axis=0)
    print(group[group['oil'].isna()])

Empty DataFrame
Columns: [country, year, oil]
Index: []
Empty DataFrame
Columns: [country, year, oil]
Index: []
Empty DataFrame
Columns: [country, year, oil]
Index: []
Empty DataFrame
Columns: [country, year, oil]
Index: []
Empty DataFrame
Columns: [country, year, oil]
Index: []
Empty DataFrame
Columns: [country, year, oil]
Index: []
Empty DataFrame
Columns: [country, year, oil]
Index: []
Empty DataFrame
Columns: [country, year, oil]
Index: []
Empty DataFrame
Columns: [country, year, oil]
Index: []
Empty DataFrame
Columns: [country, year, oil]
Index: []
Empty DataFrame
Columns: [country, year, oil]
Index: []
Empty DataFrame
Columns: [country, year, oil]
Index: []
Empty DataFrame
Columns: [country, year, oil]
Index: []
Empty DataFrame
Columns: [country, year, oil]
Index: []
Empty DataFrame
Columns: [country, year, oil]
Index: []
Empty DataFrame
Columns: [country, year, oil]
Index: []
Empty DataFrame
Columns: [country, year, oil]
Index: []
Empty DataFrame
Columns: [country, year, oil]
In

<Figure size 2880x1440 with 0 Axes>

In [13]:

def interpolate(column):
    column = column.interpolate(method='linear', limit_direction='backward', axis=0)
    return column


oil_df['oil'] = oil_df.groupby('country').oil.transform(interpolate)
oil_df

Unnamed: 0,country,year,oil
0,United Arab Emirates,1965,69600.0
1,Argentina,1965,22000000.0
2,Australia,1965,15100000.0
3,Austria,1965,5560000.0
4,Azerbaijan,1965,8250000.0
...,...,...,...
4395,USSR,2019,
4396,Uzbekistan,2019,2090000.0
4397,Venezuela,2019,16200000.0
4398,Vietnam,2019,24600000.0


In [14]:
oil_df = oil_df[(oil_df['country'] != 'USSR')]
oil_df

Unnamed: 0,country,year,oil
0,United Arab Emirates,1965,69600.0
1,Argentina,1965,22000000.0
2,Australia,1965,15100000.0
3,Austria,1965,5560000.0
4,Azerbaijan,1965,8250000.0
...,...,...,...
4394,United States,2019,842000000.0
4396,Uzbekistan,2019,2090000.0
4397,Venezuela,2019,16200000.0
4398,Vietnam,2019,24600000.0


In [15]:
oil_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4345 entries, 0 to 4399
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   country  4345 non-null   object 
 1   year     4345 non-null   object 
 2   oil      4345 non-null   float64
dtypes: float64(1), object(2)
memory usage: 135.8+ KB


In [16]:
oil_df.year = oil_df.year.astype('int')

In [17]:
source2 = sv_rep_path + 'gapminder_dataset.csv'

gapminder_df = pd.read_csv(source2)
gapminder_df.drop('Unnamed: 0', axis = 1, inplace = True)
gapminder_df

Unnamed: 0,country,year,pop,income,lifeExp,Continent
0,Afghanistan,1800,3280000.0,674.0,28.2,Asia
1,Afghanistan,1801,3280000.0,674.0,28.2,Asia
2,Afghanistan,1802,3280000.0,674.0,28.2,Asia
3,Afghanistan,1803,3280000.0,674.0,28.2,Asia
4,Afghanistan,1804,3280000.0,674.0,28.2,Asia
...,...,...,...,...,...,...
46681,Zimbabwe,2046,22800000.0,3880.0,66.9,Africa
46682,Zimbabwe,2047,23100000.0,3960.0,67.1,Africa
46683,Zimbabwe,2048,23400000.0,4050.0,67.3,Africa
46684,Zimbabwe,2049,23700000.0,4130.0,67.4,Africa


In [18]:
# Merge columns with population, and continents data from the previous notebook

# year, country, continent, oil_consumption, population

oil_df = oil_df.merge(gapminder_df, on=['country', 'year'], how='left')
oil_df

Unnamed: 0,country,year,oil,pop,income,lifeExp,Continent
0,United Arab Emirates,1965,69600.0,150000.0,74800.0,62.6,Asia
1,Argentina,1965,22000000.0,22200000.0,12100.0,66.1,South America
2,Australia,1965,15100000.0,11300000.0,18600.0,71.1,Oceania
3,Austria,1965,5560000.0,7310000.0,14700.0,70.1,Europe
4,Azerbaijan,1965,8250000.0,4590000.0,6280.0,55.6,Europe
...,...,...,...,...,...,...,...
4340,United States,2019,842000000.0,329000000.0,62500.0,78.9,North America
4341,Uzbekistan,2019,2090000.0,33000000.0,7000.0,68.4,Asia
4342,Venezuela,2019,16200000.0,28500000.0,8320.0,75.0,South America
4343,Vietnam,2019,24600000.0,96500000.0,8040.0,74.5,Asia


## Split apply combine

In [19]:
# Average oil consumption for each year

oil_df.groupby('year').oil.mean()

year
1965    2.174697e+07
1966    2.305442e+07
1967    2.429226e+07
1968    2.601223e+07
1969    2.784243e+07
1970    2.978652e+07
1971    3.110137e+07
1972    3.306489e+07
1973    3.527063e+07
1974    3.436024e+07
1975    3.373301e+07
1976    3.575753e+07
1977    3.676527e+07
1978    3.752582e+07
1979    3.794934e+07
1980    3.611467e+07
1981    3.466018e+07
1982    3.353262e+07
1983    3.333552e+07
1984    3.410857e+07
1985    3.406800e+07
1986    3.516529e+07
1987    3.581332e+07
1988    3.704390e+07
1989    3.765158e+07
1990    3.806408e+07
1991    3.811208e+07
1992    3.885862e+07
1993    3.858628e+07
1994    3.944039e+07
1995    3.990049e+07
1996    4.082392e+07
1997    4.180157e+07
1998    4.197162e+07
1999    4.273867e+07
2000    4.318453e+07
2001    4.352928e+07
2002    4.383689e+07
2003    4.479215e+07
2004    4.650904e+07
2005    4.702292e+07
2006    4.754270e+07
2007    4.809599e+07
2008    4.761061e+07
2009    4.660634e+07
2010    4.805476e+07
2011    4.857081e+07
2012    

In [20]:
# Average oil consumtion of each continent for a each year

oil_df.groupby(['year', 'Continent']).oil.mean()

year  Continent    
1965  Africa           3.698750e+06
      Asia             1.671788e+07
      Europe           1.581453e+07
      North America    1.553750e+08
      Oceania          8.910000e+06
                           ...     
2019  Asia             7.799821e+07
      Europe           1.912924e+07
      North America    2.554550e+08
      Oceania          2.864000e+07
      South America    3.001429e+07
Name: oil, Length: 330, dtype: float64

In [21]:
# Oil consumption per capita (tonnes per person per year)

oil_df2 = oil_df.groupby(['year', 'country'])['pop', 'oil'].sum()
oil_df2

Unnamed: 0_level_0,Unnamed: 1_level_0,pop,oil
year,country,Unnamed: 2_level_1,Unnamed: 3_level_1
1965,Algeria,12600000.0,1290000.0
1965,Argentina,22200000.0,22000000.0
1965,Australia,11300000.0,15100000.0
1965,Austria,7310000.0,5560000.0
1965,Azerbaijan,4590000.0,8250000.0
...,...,...,...
2019,United Kingdom,67500000.0,71200000.0
2019,United States,329000000.0,842000000.0
2019,Uzbekistan,33000000.0,2090000.0
2019,Venezuela,28500000.0,16200000.0


In [22]:
oil_df2['percap'] = oil_df2['oil']/oil_df2['pop']

oil_df2['percap']

year  country       
1965  Algeria           0.102381
      Argentina         0.990991
      Australia         1.336283
      Austria           0.760602
      Azerbaijan        1.797386
                          ...   
2019  United Kingdom    1.054815
      United States     2.559271
      Uzbekistan        0.063333
      Venezuela         0.568421
      Vietnam           0.254922
Name: percap, Length: 4345, dtype: float64

In [23]:
# Countries in the 95th percentile of oil consumption

q95 = oil_df.groupby(['year'])['oil'].transform(lambda x: x.quantile(0.95))
q95

q95_oil_df = oil_df[oil_df['oil'] >= q95]

In [24]:
groups = q95_oil_df.groupby('year')

for name, group in groups:
    print(f'country: {name}')
    print(group)

country: 1965
          country  year          oil          pop   income  lifeExp  \
17        Germany  1965   86300000.0   76300000.0  17600.0     70.7   
39          Japan  1965   87900000.0   98400000.0  10800.0     70.9   
62         Russia  1965  247000000.0  127000000.0   8820.0     70.0   
74  United States  1965  551000000.0  200000000.0  24400.0     70.4   

        Continent  
17         Europe  
39           Asia  
62           Asia  
74  North America  
country: 1966
           country  year          oil          pop   income  lifeExp  \
96         Germany  1966   96500000.0   76800000.0  18100.0     70.8   
118          Japan  1966  100000000.0   99600000.0  11800.0     71.5   
141         Russia  1966  247000000.0  127000000.0   9530.0     69.9   
153  United States  1966  579000000.0  202000000.0  25700.0     70.4   

         Continent  
96          Europe  
118           Asia  
141           Asia  
153  North America  
country: 1967
           country  year          oi

3313  North America  
country: 2007
            country  year          oil           pop   income  lifeExp  \
3331          China  2007  369000000.0  1.350000e+09   6800.0     74.2   
3350          India  2007  139000000.0  1.180000e+09   3660.0     65.8   
3357          Japan  2007  231000000.0  1.280000e+08  38700.0     82.9   
3392  United States  2007  908000000.0  3.010000e+08  55900.0     78.2   

          Continent  
3331           Asia  
3350           Asia  
3357           Asia  
3392  North America  
country: 2008
            country  year          oil           pop   income  lifeExp  \
3410          China  2008  376000000.0  1.350000e+09   7410.0     74.3   
3429          India  2008  146000000.0  1.200000e+09   3710.0     66.1   
3436          Japan  2008  225000000.0  1.290000e+08  38300.0     83.1   
3471  United States  2008  847000000.0  3.030000e+08  55300.0     78.4   

          Continent  
3410           Asia  
3429           Asia  
3436           Asia  
3471  Nort