# What will be done here

Because Gapminder's metadata does not make clear how the data for Russia was split, we will try to reverse-engineer the ratio used for `rus`.

> Jasper Heeffer [9:52 PM]

> so check if `ukr, rus, blr, arm, aze, est, geo, kaz, kgz, lva, ltu, mda, tjk, tkm, uzb` in 1992 gives the correct ratio for rus

Conclusion: the ratio is correct.

In [1]:
import pandas as pd
import os
import numpy as np
import re
import json

In [18]:
import ddf_utils.ddf_reader as dr
from ddf_utils.str import format_float_sigfig, format_float_digits, to_concept_id

In [3]:
# Base directory used by ddf_reader — presumably where it looks up the
# dataset folders named in the calls below (TODO confirm).
dr.SEARCH_PATH = '../../../'

In [4]:
# `total_carbon_emissions` datapoints keyed by nation and year from the
# ddf--cdiac-co2 dataset.
# NOTE(review): `co2` is not referenced again in the visible part of this notebook.
co2 = dr.ddf_datapoint('ddf--cdiac-co2', 
                       'total_carbon_emissions', 'nation,year')

In [34]:
# Gapminder World's yearly CO2 emissions indicator; this is the series the
# CDIAC-derived figures are compared against further down.
co2_2010 = dr.ddf_datapoint('ddf--gapminder--gapminder_world', 'yearly_co2_emissions_1000_tonnes')

In [5]:
# Population datapoints from the Gapminder population dataset.
# NOTE(review): `pop` is not referenced again in the visible part of this notebook.
pop = dr.ddf_datapoint('ddf--gapminder--population', 'population')

In [7]:
# The `country` entry of Gapminder's geo entity domain. It is used below as a
# DataFrame whose name/alias columns are matched against CDIAC nation names.
geo = dr.ddf_entities('ddf--gapminder--geo_entity_domain')['country']

In [None]:
# load 2010 version cdiac data

In [8]:
def read_source(f, skip=0, **kwargs):
    """Read a CDIAC source csv and normalise its header.

    Parameters
    ----------
    f : str, path or file-like
        Passed straight to ``pandas.read_csv``.
    skip : int, default 0
        Number of leading data rows (after the header) to drop.
    **kwargs
        Extra keyword arguments forwarded to ``pandas.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The data with lower-cased column names (embedded newlines removed)
        and the first ``skip`` rows removed. Remaining rows keep their
        original index labels.
    """
    df = pd.read_csv(f, **kwargs)
    # quick fix for malformed csv downloaded from data provider
    if len(df.columns) > 0 and df.columns[0] == 'Year"':
        df = df.rename(columns={'Year"': 'Year'})
    df.columns = [col.lower().replace('\n', '') for col in df.columns]
    # `.ix` no longer exists in pandas; `.iloc` drops the first `skip` rows
    # by position, which is what "skip first few rows" means here.
    return df.iloc[skip:]

In [9]:
# Per-nation CDIAC source file of the 2010 edition (covers 1751-2010 according
# to its file name), taken from the ddf--cdiac-co2 ETL source folder.
nation_file = '../../../ddf--cdiac-co2/etl/source/nation.1751_2010.csv'

In [24]:
# Drop the first two data rows and read '.' as a missing value.
# NOTE(review): the two skipped rows are presumably extra header/unit lines in
# the source file — confirm against the csv.
nation_2010 = read_source(nation_file, skip=2, na_values='.')

In [25]:
# Preview the raw table; row labels start at 2 because two rows were skipped.
nation_2010.head()

Unnamed: 0,nation,year,total co2 emissions from fossil-fuels and cement production (thousand metric tons of c),emissions from solid fuel consumption,emissions from liquid fuel consumption,emissions from gas fuel consumption,emissions from cement production,emissions from gas flaring,per capita co2 emissions (metric tons of carbon),emissions from bunker fuels (not included in the totals)
2,AFGHANISTAN,1949.0,4.0,4.0,0.0,0.0,0.0,,,0.0
3,AFGHANISTAN,1950.0,23.0,6.0,18.0,0.0,0.0,0.0,0.0,0.0
4,AFGHANISTAN,1951.0,25.0,7.0,18.0,0.0,0.0,0.0,0.0,0.0
5,AFGHANISTAN,1952.0,25.0,9.0,17.0,0.0,0.0,0.0,0.0,0.0
6,AFGHANISTAN,1953.0,29.0,10.0,18.0,0.0,0.0,0.0,0.0,0.0


In [26]:
# fix year to int
# years were parsed as floats (e.g. 1949.0); store them as integers
nation_2010['year'] = nation_2010['year'].astype('int64')

In [27]:
# fix nation name for hkg and mac
# fix nation name for hkg and mac: the source spells it 'ADMINSTRATIVE'.
# str.replace leaves names without the misspelling untouched.
nation_2010['nation'] = nation_2010['nation'].map(
    lambda nation_name: nation_name.replace('ADMINSTRATIVE', 'ADMINISTRATIVE'))

In [15]:
def get_concept_id(name):
    """Translate a source column title into its concept id.

    Titles containing ``'total '`` become ``total_carbon_emissions``; titles
    mentioning one of the known emission sub-types become
    ``carbon_emissions_`` followed by ``to_concept_id(<sub-type>)``; any other
    title is handed to ``to_concept_id`` as is. Matching is case-insensitive.
    """
    lowered = name.lower()
    if 'total ' in lowered:
        return 'total_carbon_emissions'

    # checked in this order; the first sub-type found in the title wins
    subtypes = (
        'gas fuel consumption', 'liquid fuel consumption', 'solid fuel consumption',
        'cement production', 'gas flaring', 'bunker fuels', 'per capita',
    )
    hit = next((sub for sub in subtypes if sub in lowered), None)
    if hit is not None:
        return 'carbon_emissions_' + to_concept_id(hit)

    # if nothing found, it should be a non measure concept.
    return to_concept_id(name)

In [28]:
# replace the long source column titles with concept ids
nation_2010.columns = [get_concept_id(title) for title in nation_2010.columns]

In [20]:
# nation_2010.nation = nation_2010.nation.map(to_concept_id)

In [29]:
# every distinct nation name that occurs in the CDIAC table
nation_cdiac = nation_2010['nation'].unique()

In [30]:
# Show the nation names as spelled (upper case) in the CDIAC file.
nation_cdiac

array(['AFGHANISTAN', 'ALBANIA', 'ALGERIA', 'ANDORRA', 'ANGOLA',
       'ANGUILLA', 'ANTARCTIC FISHERIES', 'ANTIGUA & BARBUDA', 'ARGENTINA',
       'ARMENIA', 'ARUBA', 'AUSTRALIA', 'AUSTRIA', 'AZERBAIJAN', 'BAHAMAS',
       'BAHRAIN', 'BANGLADESH', 'BARBADOS', 'BELARUS', 'BELGIUM', 'BELIZE',
       'BENIN', 'BERMUDA', 'BHUTAN', 'BOSNIA & HERZEGOVINA', 'BOTSWANA',
       'BRAZIL', 'BRITISH VIRGIN ISLANDS', 'BRUNEI (DARUSSALAM)',
       'BULGARIA', 'BURKINA FASO', 'BURUNDI', 'CAMBODIA', 'CANADA',
       'CAPE VERDE', 'CAYMAN ISLANDS', 'CENTRAL AFRICAN REPUBLIC', 'CHAD',
       'CHILE', 'CHINA (MAINLAND)', 'CHRISTMAS ISLAND', 'COLOMBIA',
       'COMOROS', 'CONGO', 'COOK ISLANDS', 'COSTA RICA', 'COTE D IVOIRE',
       'CROATIA', 'CUBA', 'CYPRUS', 'CZECH REPUBLIC', 'CZECHOSLOVAKIA',
       'DEMOCRATIC PEOPLE S REPUBLIC OF KOREA',
       'DEMOCRATIC REPUBLIC OF THE CONGO (FORMERLY ZAIRE)',
       'DEMOCRATIC REPUBLIC OF VIETNAM', 'DENMARK', 'DJIBOUTI', 'DOMINICA',
       'DOMINICAN REPUBLIC'

In [32]:
# Map every CDIAC nation name to a Gapminder country id by comparing it,
# case-insensitively, against each name/alias column of `geo`.
map_c = {}
search_cols = ['name', 'gapminder_list', 'alternative_1', 'alternative_2', 'alternative_3',
               'alternative_4_cdiac', 'pandg', 'god_id', 'alt_5', 'upper_case_name',
               'arb1', 'arb2', 'arb3', 'arb4', 'arb5', 'arb6'
              ]

# Lower-case each searchable column once, instead of once per nation.
lowered_cols = [geo[c].str.lower() for c in search_cols]

for g in nation_cdiac:
    target = g.lower()

    # a row matches when any of the searched columns equals the nation name
    matched = lowered_cols[0] == target
    for col in lowered_cols[1:]:
        matched = matched | (col == target)

    filtered = geo[matched]
    if len(filtered) > 1:
        # ambiguous match: report it and leave the nation unmapped
        print('multiple entities found for '+g)
        print(filtered['country'].values)
    elif len(filtered) > 0:
        map_c[g] = filtered['country'].values[0]
    else:
        print('not found: ', g)

not found:  ANTARCTIC FISHERIES
not found:  DEMOCRATIC REPUBLIC OF VIETNAM
not found:  EAST & WEST PAKISTAN
not found:  FEDERATION OF MALAYA-SINGAPORE
not found:  FORMER PANAMA CANAL ZONE
not found:  FRENCH EQUATORIAL AFRICA
not found:  FRENCH INDO-CHINA
not found:  FRENCH WEST AFRICA
not found:  KUWAITI OIL FIRES
not found:  LEEWARD ISLANDS
not found:  NETHERLAND ANTILLES AND ARUBA
not found:  PACIFIC ISLANDS (PALAU)
not found:  PENINSULAR MALAYSIA
not found:  REPUBLIC OF SOUTH VIETNAM
not found:  RHODESIA-NYASALAND
not found:  RWANDA-URUNDI
not found:  RYUKYU ISLANDS
not found:  SABAH
not found:  SARAWAK
not found:  ST. KITTS-NEVIS-ANGUILLA
not found:  TANGANYIKA
not found:  ZANZIBAR


In [33]:
# Preview again: the columns now carry concept ids instead of the long source titles.
nation_2010.head()

Unnamed: 0,nation,year,total_carbon_emissions,carbon_emissions_solid_fuel_consumption,carbon_emissions_liquid_fuel_consumption,carbon_emissions_gas_fuel_consumption,carbon_emissions_cement_production,carbon_emissions_gas_flaring,carbon_emissions_per_capita,carbon_emissions_bunker_fuels
2,AFGHANISTAN,1949,4.0,4.0,0.0,0.0,0.0,,,0.0
3,AFGHANISTAN,1950,23.0,6.0,18.0,0.0,0.0,0.0,0.0,0.0
4,AFGHANISTAN,1951,25.0,7.0,18.0,0.0,0.0,0.0,0.0,0.0
5,AFGHANISTAN,1952,25.0,9.0,17.0,0.0,0.0,0.0,0.0,0.0
6,AFGHANISTAN,1953,29.0,10.0,18.0,0.0,0.0,0.0,0.0,0.0


In [37]:
# attach the Gapminder country id; nations absent from map_c get None
nation_2010['geo'] = nation_2010['nation'].map(map_c.get)

In [40]:
# independent copy holding only the key columns and the total emissions
co2_2010_cdiac = nation_2010.loc[:, ['geo', 'year', 'total_carbon_emissions']].copy()

In [41]:
# Convert the source figures (thousand metric tons of carbon, per the source
# column title) to CO2. 3.6667 is presumably the CO2/C mass ratio (44/12) —
# TODO confirm.
# The value is derived from `nation_2010` rather than from this frame itself,
# so re-running the cell does not apply the factor a second time.
co2_2010_cdiac['total_carbon_emissions'] = nation_2010['total_carbon_emissions'] * 3.6667

In [42]:
# Converted CDIAC values, to be compared with Gapminder's figures.
co2_2010_cdiac.head()

Unnamed: 0,geo,year,total_carbon_emissions
2,afg,1949,14.6668
3,afg,1950,84.3341
4,afg,1951,91.6675
5,afg,1952,91.6675
6,afg,1953,106.3343


In [35]:
# Gapminder's published values; for afg they equal the converted CDIAC
# figures up to rounding.
co2_2010.head()

Unnamed: 0,yearly_co2_emissions_1000_tonnes,geo,time
0,14.67,afg,1949
1,84.33,afg,1950
2,91.67,afg,1951
3,91.67,afg,1952
4,106.3,afg,1953


In [43]:
# Give Gapminder's frame the same column names as the CDIAC one. Renaming by
# name (not by position) keeps the labels right whatever the column order is.
co2_2010 = co2_2010.rename(columns={
    'yearly_co2_emissions_1000_tonnes': 'total_carbon_emissions',
    'time': 'year',
})

In [108]:
# Gapminder's series for Russia only
rus_gw_2010 = co2_2010.loc[co2_2010['geo'] == 'rus']

In [109]:
# Preview Gapminder's series for rus.
rus_gw_2010.head()

Unnamed: 0,total_carbon_emissions,geo,year
11730,11.38,rus,1830
11731,13.66,rus,1831
11732,9.104,rus,1832
11733,11.38,rus,1833
11734,70.56,rus,1850


In [61]:
# country ids whose 1992 emissions are summed for the ratio check
# (note that 'geo' in this list is a country id, not the `geo` column)
spls = ['ukr', 'rus', 'blr', 'arm', 'aze', 'est', 'geo', 'kaz', 'kgz', 'lva', 'ltu',
        'mda', 'tjk', 'tkm', 'uzb']

# CDIAC rows of those countries for the year 1992
rus_1992 = co2_2010_cdiac.loc[
    co2_2010_cdiac['geo'].isin(spls) & (co2_2010_cdiac['year'] == 1992)
]

In [62]:
# Index by (geo, year) so a single value can be looked up with a
# (country id, 1992) key.
rus_1992 = rus_1992.set_index(['geo', 'year'])

In [75]:
# Share of each listed country in the 1992 total of all listed countries.
total = rus_1992.total_carbon_emissions.sum()

# `.ix` no longer exists in pandas; `.loc` does the same label lookup on the
# (geo, year) index.
ratio = {
    s: rus_1992.loc[(s, 1992), 'total_carbon_emissions'] / total
    for s in spls
}

In [76]:
# Inspect the computed shares; the value under 'rus' is the one applied to
# the USSR series below.
ratio

{'arm': 0.0011754906476687519,
 'aze': 0.016733455102108115,
 'blr': 0.025379960065231747,
 'est': 0.0066763613617819798,
 'geo': 0.0044487799896386609,
 'kaz': 0.075804785576676553,
 'kgz': 0.0031509532112170527,
 'ltu': 0.0064253063456282912,
 'lva': 0.0039275216933873595,
 'mda': 0.0060838289719616218,
 'rus': 0.62073033606267014,
 'tjk': 0.0020946073169771698,
 'tkm': 0.0081422673459335994,
 'ukr': 0.1861509117233118,
 'uzb': 0.033075434585807091}

In [60]:
# Gapminder's own 1992 value for rus, for reference.
rus_gw_2010[rus_gw_2010.year == 1992]

Unnamed: 0,total_carbon_emissions,geo,year
11870,2140000.0,rus,1992


In [122]:
# rows that CDIAC reports under the nation name 'USSR'
ussr = nation_2010.loc[nation_2010['nation'] == 'USSR']

In [123]:
# Keep only the columns needed for the comparison; the copy detaches the
# result from nation_2010 so later assignments do not touch it.
ussr = ussr[['geo', 'year', 'total_carbon_emissions']].copy()

In [124]:
# Convert the USSR figures from carbon to CO2 (3.6667 is presumably the CO2/C
# mass ratio 44/12 — TODO confirm). The source values are re-read from
# `nation_2010` via ussr's row labels, so re-running this cell does not apply
# the factor twice.
ussr['total_carbon_emissions'] = nation_2010.loc[ussr.index, 'total_carbon_emissions'] * 3.6667

In [126]:
# scale the USSR series by Russia's 1992 share and label the rows as `rus`
ussr['total_carbon_emissions'] = ussr['total_carbon_emissions'].mul(ratio['rus'])
ussr['geo'] = 'rus'

In [102]:
# for s in spls:
#     new_df = ussr.copy()
#     new_df['total_carbon_emissions'] = new_df['total_carbon_emissions'] * ratio[s]
#     new_df.geo = s
#     ussr = ussr.append(new_df, ignore_index=True)

In [127]:
# Work on a copy so that `ussr` itself is left as it is.
rus_calc = ussr.copy()

In [128]:
# Index by (geo, year) and name the single remaining value column after its
# source, the calculated CDIAC 2010 figures.
rus_calc = rus_calc.set_index(['geo', 'year'])
rus_calc.columns = ['cdiac_2010']

In [116]:
# Index Gapminder's series by (geo, year) and label its value column `gw`.
# NOTE(review): this cell is not re-runnable as is — after the first run
# geo/year are index levels, no longer columns.
rus_gw_2010 = rus_gw_2010.set_index(['geo', 'year'])
rus_gw_2010.columns = ['gw']

In [132]:
# Put the calculated series and Gapminder's series side by side, aligned on
# their (geo, year) index.
res = pd.concat([rus_calc, rus_gw_2010], axis=1)

In [133]:
# Format the calculated values with ddf_utils' format_float_digits.
# NOTE(review): the exact rounding rule of that helper is not visible here.
res['cdiac_2010'] = res['cdiac_2010'].map(format_float_digits)

In [136]:
# Turn geo/year back into ordinary columns so rows can be filtered by year.
res = res.reset_index()

In [140]:
# Spot check for 1931-1939: the calculated values agree with Gapminder's
# `gw` column once rounded to four significant figures.
res[(res.year > 1930) & (res.year < 1940)]

Unnamed: 0,geo,year,cdiac_2010,gw
79,rus,1931,116648.9121,116600.0
80,rus,1932,131424.91134,131400.0
81,rus,1933,146876.89207,146900.0
82,rus,1934,177664.7759,177700.0
83,rus,1935,201105.62867,201100.0
84,rus,1936,230605.27843,230600.0
85,rus,1937,235803.73534,235800.0
86,rus,1938,248879.53874,248900.0
87,rus,1939,251931.69755,251900.0


In [None]:
# lva and est