In [1]:
import pandas as pd
import joblib
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error

In [2]:
glbl = pd.read_csv('../datasets_all/global.1751_2021.csv')
print(glbl.head())

   Year  \
0  1751   
1  1752   
2  1753   
3  1754   
4  1755   

   Total carbon emissions from fossil fuel consumption and cement production (million metric tons of C)  \
0                                                  3                                                      
1                                                  3                                                      
2                                                  3                                                      
3                                                  3                                                      
4                                                  3                                                      

   Carbon emissions from solid fuel consumption  \
0                                             3   
1                                             3   
2                                             3   
3                                             3   
4                                  

In [3]:
glbl.shape

(271, 8)

In [4]:
wrld = pd.read_excel('../datasets_all/nation.1751_2021.xlsx', engine='openpyxl')

In [5]:
wrld.shape

(18991, 10)

In [6]:
wrld.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18991 entries, 0 to 18990
Data columns (total 10 columns):
 #   Column                                                                                   Non-Null Count  Dtype  
---  ------                                                                                   --------------  -----  
 0   Nation                                                                                   18991 non-null  object 
 1   Year                                                                                     18991 non-null  int64  
 2   Total CO2 emissions from fossil-fuels and cement production (thousand metric tons of C)  18991 non-null  int64  
 3   Emissions from solid fuel consumption                                                    13209 non-null  float64
 4   Emissions from liquid fuel consumption                                                   18366 non-null  float64
 5   Emissions from gas fuel consumption                         

In [7]:
wrld.Nation.unique()

array(['AFGHANISTAN', 'ALBANIA', 'ALGERIA', 'ANDORRA', 'ANGOLA',
       'ANGUILLA', 'ANTARCTIC FISHERIES', 'ANTIGUA & BARBUDA',
       'ARGENTINA', 'ARMENIA', 'ARUBA', 'AUSTRALIA', 'AUSTRIA',
       'AZERBAIJAN', 'BAHAMAS', 'BAHRAIN', 'BANGLADESH', 'BARBADOS',
       'BELARUS', 'BELGIUM', 'BELIZE', 'BENIN', 'BERMUDA', 'BHUTAN',
       'BONAIRE, SAINT EUSTATIUS, AND SABA', 'BOSNIA & HERZEGOVINA',
       'BOTSWANA', 'BRAZIL', 'BRITISH VIRGIN ISLANDS',
       'BRUNEI (DARUSSALAM)', 'BULGARIA', 'BURKINA FASO', 'BURUNDI',
       'CAMBODIA', 'CANADA', 'CAPE VERDE', 'CAYMAN ISLANDS',
       'CENTRAL AFRICAN REPUBLIC', 'CHAD', 'CHILE', 'CHINA (MAINLAND)',
       'CHRISTMAS ISLAND', 'COLOMBIA', 'COMOROS', 'CONGO', 'COOK ISLANDS',
       'COSTA RICA', 'COTE D IVOIRE', 'CROATIA', 'CUBA', 'CURACAO',
       'CYPRUS', 'CZECH REPUBLIC', 'CZECHOSLOVAKIA',
       'DEMOCRATIC PEOPLE S REPUBLIC OF KOREA',
       'DEMOCRATIC REPUBLIC OF THE CONGO (FORMERLY ZAIRE)',
       'DEMOCRATIC REPUBLIC OF VIETNAM',

In [8]:
wrld[wrld['Nation'] == 'CHINA']

Unnamed: 0,Nation,Year,Total CO2 emissions from fossil-fuels and cement production (thousand metric tons of C),Emissions from solid fuel consumption,Emissions from liquid fuel consumption,Emissions from gas fuel consumption,Emissions from cement production,Emissions from gas flaring,Per capita CO2 emissions (metric tons of carbon),Emissions from bunker fuels (not included in the totals)


In [9]:
wrld.head()

Unnamed: 0,Nation,Year,Total CO2 emissions from fossil-fuels and cement production (thousand metric tons of C),Emissions from solid fuel consumption,Emissions from liquid fuel consumption,Emissions from gas fuel consumption,Emissions from cement production,Emissions from gas flaring,Per capita CO2 emissions (metric tons of carbon),Emissions from bunker fuels (not included in the totals)
0,AFGHANISTAN,1949,4,4.0,0.0,0.0,0.0,,,0.0
1,AFGHANISTAN,1950,23,6.0,18.0,0.0,0.0,,0.003025,0.0
2,AFGHANISTAN,1951,25,7.0,18.0,0.0,0.0,,0.003172,0.0
3,AFGHANISTAN,1952,25,9.0,17.0,0.0,0.0,,0.003206,0.0
4,AFGHANISTAN,1953,29,10.0,18.0,0.0,0.0,,0.003551,0.0


In [10]:
wrld.isna().sum()

Nation                                                                                         0
Year                                                                                           0
Total CO2 emissions from fossil-fuels and cement production (thousand metric tons of C)        0
Emissions from solid fuel consumption                                                       5782
Emissions from liquid fuel consumption                                                       625
Emissions from gas fuel consumption                                                         8249
Emissions from cement production                                                             576
Emissions from gas flaring                                                                 16070
Per capita CO2 emissions (metric tons of carbon)                                            5525
Emissions from bunker fuels (not included in the totals)                                     293
dtype: int64

In [11]:
rare_nations = wrld['Nation'].value_counts()
rare_nations = rare_nations[rare_nations <= 40].index
rare_nations

Index(['ANTARCTIC FISHERIES', 'PACIFIC ISLANDS (PALAU)', 'UNITED KOREA',
       'ARUBA', 'ERITREA', 'TUVALU', 'LIECHTENSTEIN',
       'WALLIS AND FUTUNA ISLANDS', 'OCCUPIED PALESTINIAN TERRITORY',
       'ANGUILLA', 'LESOTHO', 'ANDORRA', 'NAMIBIA', 'YEMEN',
       'BOSNIA & HERZEGOVINA', 'ARMENIA', 'TAJIKISTAN', 'UZBEKISTAN',
       'BELARUS', 'TURKMENISTAN', 'AZERBAIJAN', 'UKRAINE', 'PALAU', 'SLOVENIA',
       'REPUBLIC OF MOLDOVA', 'GEORGIA', 'FORMER PANAMA CANAL ZONE',
       'FEDERATED STATES OF MICRONESIA', 'KAZAKHSTAN', 'KYRGYZSTAN',
       'LITHUANIA', 'MACEDONIA', 'MARSHALL ISLANDS', 'CROATIA',
       'CZECH REPUBLIC', 'SLOVAKIA', 'RUSSIAN FEDERATION', 'FRENCH INDO-CHINA',
       'EAST & WEST PAKISTAN', 'NETHERLAND ANTILLES',
       'ST. KITTS-NEVIS-ANGUILLA', 'RYUKYU ISLANDS', 'SARAWAK', 'MAYOTTE',
       'ZANZIBAR', 'ISLE OF MAN', 'SABAH', 'TIMOR-LESTE (FORMERLY EAST TIMOR)',
       'TANGANYIKA', 'MONTENEGRO', 'SERBIA', 'REPUBLIC OF SOUTH VIETNAM',
       'DEMOCRATIC REPUBLIC

In [12]:
op = wrld[wrld.Nation == 'UNITED KINGDOM']['Year'].value_counts()

In [13]:
wrld['Year'].min(), wrld['Year'].max()

(1751, 2021)

In [14]:
wrld[wrld['Year'] == 1751]

Unnamed: 0,Nation,Year,Total CO2 emissions from fossil-fuels and cement production (thousand metric tons of C),Emissions from solid fuel consumption,Emissions from liquid fuel consumption,Emissions from gas fuel consumption,Emissions from cement production,Emissions from gas flaring,Per capita CO2 emissions (metric tons of carbon),Emissions from bunker fuels (not included in the totals)
17504,UNITED KINGDOM,1751,2552,2552.0,0.0,0.0,0.0,,,0.0


In [15]:
col_list = ['Emissions from solid fuel consumption', 'Emissions from liquid fuel consumption', 'Emissions from gas fuel consumption', 'Emissions from cement production', 'Emissions from gas flaring', 'Emissions from bunker fuels (not included in the totals)']
wrld['sumation'] = wrld[col_list].sum(axis=1, numeric_only=True)
wrld['total_equals_sum'] = wrld['sumation'] == wrld['Total CO2 emissions from fossil-fuels and cement production (thousand metric tons of C)']

In [16]:
wrld.head()

Unnamed: 0,Nation,Year,Total CO2 emissions from fossil-fuels and cement production (thousand metric tons of C),Emissions from solid fuel consumption,Emissions from liquid fuel consumption,Emissions from gas fuel consumption,Emissions from cement production,Emissions from gas flaring,Per capita CO2 emissions (metric tons of carbon),Emissions from bunker fuels (not included in the totals),sumation,total_equals_sum
0,AFGHANISTAN,1949,4,4.0,0.0,0.0,0.0,,,0.0,4.0,True
1,AFGHANISTAN,1950,23,6.0,18.0,0.0,0.0,,0.003025,0.0,24.0,False
2,AFGHANISTAN,1951,25,7.0,18.0,0.0,0.0,,0.003172,0.0,25.0,True
3,AFGHANISTAN,1952,25,9.0,17.0,0.0,0.0,,0.003206,0.0,26.0,False
4,AFGHANISTAN,1953,29,10.0,18.0,0.0,0.0,,0.003551,0.0,28.0,False


In [17]:
wrld.total_equals_sum.value_counts()

total_equals_sum
False    12163
True      6828
Name: count, dtype: int64

In [18]:
nation_lst = []
for nation in wrld['Nation'].unique():
    subset = wrld[(wrld['Year'] == 2020) & (wrld['Nation'] == str(nation))]
    if not subset.empty:
        nation_lst.append(wrld.Nation)

In [19]:
nation_lst = wrld[wrld['Year'] == 2020]['Nation'].unique().tolist()

In [20]:
len(wrld['Nation'].unique())

259

In [21]:
len(nation_lst)

222

## ML

In [22]:
def preprocessing(wrld):
    '''pick those countries for training where count is more than 40'''
    mjr_nations = wrld['Nation'].value_counts()
    mjr_nations = mjr_nations[mjr_nations >= 40].index
    filtered_wrld = wrld[wrld['Nation'].isin(mjr_nations)].copy()

    # sort data
    filtered_wrld = filtered_wrld.sort_values(['Nation', 'Year'])

    # shift co2 emission to use as target
    filtered_wrld['target'] = filtered_wrld.groupby('Nation')['sumation'].shift(-1)

    # drop rows with missing targets
    filtered_wrld = filtered_wrld.dropna(subset=['target'])

    return filtered_wrld

op = preprocessing(wrld)

In [23]:
def feature_selection(wrld):
    features_lst = ['Emissions from solid fuel consumption', 'Emissions from liquid fuel consumption', 'Emissions from gas fuel consumption', 'Emissions from cement production', 'Emissions from gas flaring', 'Emissions from bunker fuels (not included in the totals)', 'Year', 'Nation']

    x = wrld[features_lst]
    y = wrld['target']

    return x, y


In [24]:
x, y = feature_selection(op)

In [25]:
x.head()

Unnamed: 0,Emissions from solid fuel consumption,Emissions from liquid fuel consumption,Emissions from gas fuel consumption,Emissions from cement production,Emissions from gas flaring,Emissions from bunker fuels (not included in the totals),Year,Nation
0,4.0,0.0,0.0,0.0,,0.0,1949,AFGHANISTAN
1,6.0,18.0,0.0,0.0,,0.0,1950,AFGHANISTAN
2,7.0,18.0,0.0,0.0,,0.0,1951,AFGHANISTAN
3,9.0,17.0,0.0,0.0,,0.0,1952,AFGHANISTAN
4,10.0,18.0,0.0,0.0,,0.0,1953,AFGHANISTAN


In [26]:
x.shape, y.shape

((17162, 8), (17162,))

In [27]:
# one-hot encoding
x_encoded = pd.get_dummies(x, columns=['Nation'])

# split
x_train = x_encoded[x['Year'] < 2021]
y_train = y[x['Year'] < 2021]
x_val = x_encoded[x['Year'] >= 2020]
y_val = y[x['Year'] >= 2020]

mdl = RandomForestRegressor(n_estimators=100, random_state=42)
mdl.fit(x_train, y_train)

preds = mdl.predict(x_val)
mae = mean_absolute_error(y_val, preds)
mape = mean_absolute_percentage_error(y_val, preds)
print(f'MAE on >= 2020: {mae:.2f}')
print(f'MAPE on >= 2020: {mape * 100:.2f}%')


MAE on >= 2020: 1836.16
MAPE on >= 2020: 4.48%


In [28]:
joblib.dump(mdl, 'rf_co2_mdl.pkl')

['rf_co2_mdl.pkl']