In [1]:
import pandas as pd
import joblib
import pickle
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error

In [2]:
glbl = pd.read_csv('../datasets_all/global.1751_2021.csv')
print(glbl.head())

   Year  \
0  1751   
1  1752   
2  1753   
3  1754   
4  1755   

   Total carbon emissions from fossil fuel consumption and cement production (million metric tons of C)  \
0                                                  3                                                      
1                                                  3                                                      
2                                                  3                                                      
3                                                  3                                                      
4                                                  3                                                      

   Carbon emissions from solid fuel consumption  \
0                                             3   
1                                             3   
2                                             3   
3                                             3   
4                                  

In [3]:
glbl.shape

(271, 8)

In [4]:
wrld = pd.read_excel('../datasets_all/nation.1751_2021.xlsx', engine='openpyxl')

In [5]:
wrld.shape

(18991, 10)

In [6]:
wrld.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18991 entries, 0 to 18990
Data columns (total 10 columns):
 #   Column                                                                                   Non-Null Count  Dtype  
---  ------                                                                                   --------------  -----  
 0   Nation                                                                                   18991 non-null  object 
 1   Year                                                                                     18991 non-null  int64  
 2   Total CO2 emissions from fossil-fuels and cement production (thousand metric tons of C)  18991 non-null  int64  
 3   Emissions from solid fuel consumption                                                    13209 non-null  float64
 4   Emissions from liquid fuel consumption                                                   18366 non-null  float64
 5   Emissions from gas fuel consumption                         

In [7]:
wrld.Nation.unique()

array(['AFGHANISTAN', 'ALBANIA', 'ALGERIA', 'ANDORRA', 'ANGOLA',
       'ANGUILLA', 'ANTARCTIC FISHERIES', 'ANTIGUA & BARBUDA',
       'ARGENTINA', 'ARMENIA', 'ARUBA', 'AUSTRALIA', 'AUSTRIA',
       'AZERBAIJAN', 'BAHAMAS', 'BAHRAIN', 'BANGLADESH', 'BARBADOS',
       'BELARUS', 'BELGIUM', 'BELIZE', 'BENIN', 'BERMUDA', 'BHUTAN',
       'BONAIRE, SAINT EUSTATIUS, AND SABA', 'BOSNIA & HERZEGOVINA',
       'BOTSWANA', 'BRAZIL', 'BRITISH VIRGIN ISLANDS',
       'BRUNEI (DARUSSALAM)', 'BULGARIA', 'BURKINA FASO', 'BURUNDI',
       'CAMBODIA', 'CANADA', 'CAPE VERDE', 'CAYMAN ISLANDS',
       'CENTRAL AFRICAN REPUBLIC', 'CHAD', 'CHILE', 'CHINA (MAINLAND)',
       'CHRISTMAS ISLAND', 'COLOMBIA', 'COMOROS', 'CONGO', 'COOK ISLANDS',
       'COSTA RICA', 'COTE D IVOIRE', 'CROATIA', 'CUBA', 'CURACAO',
       'CYPRUS', 'CZECH REPUBLIC', 'CZECHOSLOVAKIA',
       'DEMOCRATIC PEOPLE S REPUBLIC OF KOREA',
       'DEMOCRATIC REPUBLIC OF THE CONGO (FORMERLY ZAIRE)',
       'DEMOCRATIC REPUBLIC OF VIETNAM',

In [8]:
wrld[wrld['Nation'] == 'CHINA']

Unnamed: 0,Nation,Year,Total CO2 emissions from fossil-fuels and cement production (thousand metric tons of C),Emissions from solid fuel consumption,Emissions from liquid fuel consumption,Emissions from gas fuel consumption,Emissions from cement production,Emissions from gas flaring,Per capita CO2 emissions (metric tons of carbon),Emissions from bunker fuels (not included in the totals)


In [9]:
wrld.head()

Unnamed: 0,Nation,Year,Total CO2 emissions from fossil-fuels and cement production (thousand metric tons of C),Emissions from solid fuel consumption,Emissions from liquid fuel consumption,Emissions from gas fuel consumption,Emissions from cement production,Emissions from gas flaring,Per capita CO2 emissions (metric tons of carbon),Emissions from bunker fuels (not included in the totals)
0,AFGHANISTAN,1949,4,4.0,0.0,0.0,0.0,,,0.0
1,AFGHANISTAN,1950,23,6.0,18.0,0.0,0.0,,0.003025,0.0
2,AFGHANISTAN,1951,25,7.0,18.0,0.0,0.0,,0.003172,0.0
3,AFGHANISTAN,1952,25,9.0,17.0,0.0,0.0,,0.003206,0.0
4,AFGHANISTAN,1953,29,10.0,18.0,0.0,0.0,,0.003551,0.0


In [10]:
wrld.isna().sum()

Nation                                                                                         0
Year                                                                                           0
Total CO2 emissions from fossil-fuels and cement production (thousand metric tons of C)        0
Emissions from solid fuel consumption                                                       5782
Emissions from liquid fuel consumption                                                       625
Emissions from gas fuel consumption                                                         8249
Emissions from cement production                                                             576
Emissions from gas flaring                                                                 16070
Per capita CO2 emissions (metric tons of carbon)                                            5525
Emissions from bunker fuels (not included in the totals)                                     293
dtype: int64

In [11]:
rare_nations = wrld['Nation'].value_counts()
rare_nations = rare_nations[rare_nations <= 40].index
rare_nations

Index(['ANTARCTIC FISHERIES', 'PACIFIC ISLANDS (PALAU)', 'UNITED KOREA',
       'ARUBA', 'ERITREA', 'TUVALU', 'LIECHTENSTEIN',
       'WALLIS AND FUTUNA ISLANDS', 'OCCUPIED PALESTINIAN TERRITORY',
       'ANGUILLA', 'LESOTHO', 'ANDORRA', 'NAMIBIA', 'YEMEN',
       'BOSNIA & HERZEGOVINA', 'ARMENIA', 'TAJIKISTAN', 'UZBEKISTAN',
       'BELARUS', 'TURKMENISTAN', 'AZERBAIJAN', 'UKRAINE', 'PALAU', 'SLOVENIA',
       'REPUBLIC OF MOLDOVA', 'GEORGIA', 'FORMER PANAMA CANAL ZONE',
       'FEDERATED STATES OF MICRONESIA', 'KAZAKHSTAN', 'KYRGYZSTAN',
       'LITHUANIA', 'MACEDONIA', 'MARSHALL ISLANDS', 'CROATIA',
       'CZECH REPUBLIC', 'SLOVAKIA', 'RUSSIAN FEDERATION', 'FRENCH INDO-CHINA',
       'EAST & WEST PAKISTAN', 'NETHERLAND ANTILLES',
       'ST. KITTS-NEVIS-ANGUILLA', 'RYUKYU ISLANDS', 'SARAWAK', 'MAYOTTE',
       'ZANZIBAR', 'ISLE OF MAN', 'SABAH', 'TIMOR-LESTE (FORMERLY EAST TIMOR)',
       'TANGANYIKA', 'MONTENEGRO', 'SERBIA', 'REPUBLIC OF SOUTH VIETNAM',
       'DEMOCRATIC REPUBLIC

In [12]:
op = wrld[wrld.Nation == 'UNITED KINGDOM']['Year'].value_counts()

In [13]:
wrld[wrld['Nation'] == 'UNITED KINGDOM'].sort_values('Year').iloc[-1]

Nation                                                                                     UNITED KINGDOM
Year                                                                                                 2021
Total CO2 emissions from fossil-fuels and cement production (thousand metric tons of C)             88331
Emissions from solid fuel consumption                                                              5927.0
Emissions from liquid fuel consumption                                                            39708.0
Emissions from gas fuel consumption                                                               41097.0
Emissions from cement production                                                                   1046.0
Emissions from gas flaring                                                                          552.0
Per capita CO2 emissions (metric tons of carbon)                                                 1.308487
Emissions from bunker fuels (not included in t

In [14]:
wrld['Year'].min(), wrld['Year'].max()

(1751, 2021)

In [15]:
wrld[wrld['Year'] == 1751]

Unnamed: 0,Nation,Year,Total CO2 emissions from fossil-fuels and cement production (thousand metric tons of C),Emissions from solid fuel consumption,Emissions from liquid fuel consumption,Emissions from gas fuel consumption,Emissions from cement production,Emissions from gas flaring,Per capita CO2 emissions (metric tons of carbon),Emissions from bunker fuels (not included in the totals)
17504,UNITED KINGDOM,1751,2552,2552.0,0.0,0.0,0.0,,,0.0


In [16]:
col_list = ['Emissions from solid fuel consumption', 'Emissions from liquid fuel consumption', 'Emissions from gas fuel consumption', 'Emissions from cement production', 'Emissions from gas flaring', 'Emissions from bunker fuels (not included in the totals)']
wrld['sumation'] = wrld[col_list].sum(axis=1, numeric_only=True)
wrld['total_equals_sum'] = wrld['sumation'] == wrld['Total CO2 emissions from fossil-fuels and cement production (thousand metric tons of C)']

In [17]:
wrld.head()

Unnamed: 0,Nation,Year,Total CO2 emissions from fossil-fuels and cement production (thousand metric tons of C),Emissions from solid fuel consumption,Emissions from liquid fuel consumption,Emissions from gas fuel consumption,Emissions from cement production,Emissions from gas flaring,Per capita CO2 emissions (metric tons of carbon),Emissions from bunker fuels (not included in the totals),sumation,total_equals_sum
0,AFGHANISTAN,1949,4,4.0,0.0,0.0,0.0,,,0.0,4.0,True
1,AFGHANISTAN,1950,23,6.0,18.0,0.0,0.0,,0.003025,0.0,24.0,False
2,AFGHANISTAN,1951,25,7.0,18.0,0.0,0.0,,0.003172,0.0,25.0,True
3,AFGHANISTAN,1952,25,9.0,17.0,0.0,0.0,,0.003206,0.0,26.0,False
4,AFGHANISTAN,1953,29,10.0,18.0,0.0,0.0,,0.003551,0.0,28.0,False


In [18]:
wrld.total_equals_sum.value_counts()

total_equals_sum
False    12163
True      6828
Name: count, dtype: int64

In [19]:
nation_lst = []
for nation in wrld['Nation'].unique():
    subset = wrld[(wrld['Year'] == 2020) & (wrld['Nation'] == str(nation))]
    if not subset.empty:
        nation_lst.append(wrld.Nation)

In [20]:
nation_lst = wrld[wrld['Year'] == 2020]['Nation'].unique().tolist()

In [21]:
len(wrld['Nation'].unique())

259

In [22]:
len(nation_lst)

222

## ML

In [23]:
def preprocessing(wrld):
    '''pick those countries for training where count is more than 40'''
    mjr_nations = wrld['Nation'].value_counts()
    mjr_nations = mjr_nations[mjr_nations >= 40].index
    filtered_wrld = wrld[wrld['Nation'].isin(mjr_nations)].copy()

    # sort data
    filtered_wrld = filtered_wrld.sort_values(['Nation', 'Year'])

    # shift co2 emission to use as target
    filtered_wrld['target'] = filtered_wrld.groupby('Nation')['sumation'].shift(-1)

    # drop rows with missing targets
    filtered_wrld = filtered_wrld.dropna(subset=['target'])

    return filtered_wrld

op = preprocessing(wrld)

In [24]:
def feature_selection(wrld):
    features_lst = ['Emissions from solid fuel consumption', 'Emissions from liquid fuel consumption', 'Emissions from gas fuel consumption', 'Emissions from cement production', 'Emissions from gas flaring', 'Emissions from bunker fuels (not included in the totals)', 'Year', 'Nation']

    x = wrld[features_lst]
    y = wrld['target']

    return x, y


In [25]:
x, y = feature_selection(op)

In [26]:
x.head()

Unnamed: 0,Emissions from solid fuel consumption,Emissions from liquid fuel consumption,Emissions from gas fuel consumption,Emissions from cement production,Emissions from gas flaring,Emissions from bunker fuels (not included in the totals),Year,Nation
0,4.0,0.0,0.0,0.0,,0.0,1949,AFGHANISTAN
1,6.0,18.0,0.0,0.0,,0.0,1950,AFGHANISTAN
2,7.0,18.0,0.0,0.0,,0.0,1951,AFGHANISTAN
3,9.0,17.0,0.0,0.0,,0.0,1952,AFGHANISTAN
4,10.0,18.0,0.0,0.0,,0.0,1953,AFGHANISTAN


In [27]:
x.shape, y.shape

((17162, 8), (17162,))

In [28]:
# one-hot encoding
x_encoded = pd.get_dummies(x, columns=['Nation'])
training_cols = x_encoded.columns.tolist()
# split
x_train = x_encoded[x['Year'] < 2021]
y_train = y[x['Year'] < 2021]
x_val = x_encoded[x['Year'] >= 2020]
y_val = y[x['Year'] >= 2020]

mdl = RandomForestRegressor(n_estimators=100, random_state=42)
mdl.fit(x_train, y_train)

preds = mdl.predict(x_val)
mae = mean_absolute_error(y_val, preds)
mape = mean_absolute_percentage_error(y_val, preds)
print(f'MAE on >= 2020: {mae:.2f}')
print(f'MAPE on >= 2020: {mape * 100:.2f}%')


MAE on >= 2020: 1836.16
MAPE on >= 2020: 4.48%


In [29]:
with open('training_cols.txt', 'wb') as fp:
    pickle.dump(training_cols, fp)

In [30]:
with open('rf_co2_mdl.pkl', 'wb') as f:
    pickle.dump(mdl, f)

In [31]:
def predict_future_emissions(selected_country, base_df, mdl, years=[2022, 2023, 2024], training_cols=None):
    latest_row = base_df[base_df['Nation'] == selected_country].sort_values('Year').iloc[-1]
    # future rows
    future_rows = []
    for year in years:
        row = latest_row.copy()
        row['Year'] = year
        future_rows.append(row)
    
    future_df = pd.DataFrame(future_rows)

    # apply same preprocessing
    future_dummies = pd.get_dummies(future_df, columns=['Nation'])

    # ensure all training columns are present
    for col in training_cols:
        if col not in future_dummies.columns:
            future_dummies[col] = 0

    future_dummies = future_dummies[training_cols]

    # predictions
    preds = mdl.predict(future_dummies)
    return pd.DataFrame({'Nation': selected_country, 'Year': years, 'Predicted Emissions': preds})


In [32]:
op = []
for cntry in wrld.Nation.unique():
    pred_df = predict_future_emissions(selected_country=cntry, base_df=wrld, mdl=mdl,training_cols=training_cols)
    op.append(pred_df)
final_op = pd.concat(op, ignore_index=True)

  future_dummies[col] = 0
  future_dummies[col] = 0
  future_dummies[col] = 0
  future_dummies[col] = 0
  future_dummies[col] = 0
  future_dummies[col] = 0
  future_dummies[col] = 0
  future_dummies[col] = 0
  future_dummies[col] = 0
  future_dummies[col] = 0
  future_dummies[col] = 0
  future_dummies[col] = 0
  future_dummies[col] = 0
  future_dummies[col] = 0
  future_dummies[col] = 0
  future_dummies[col] = 0
  future_dummies[col] = 0
  future_dummies[col] = 0
  future_dummies[col] = 0
  future_dummies[col] = 0
  future_dummies[col] = 0
  future_dummies[col] = 0
  future_dummies[col] = 0
  future_dummies[col] = 0
  future_dummies[col] = 0
  future_dummies[col] = 0
  future_dummies[col] = 0
  future_dummies[col] = 0
  future_dummies[col] = 0
  future_dummies[col] = 0
  future_dummies[col] = 0
  future_dummies[col] = 0
  future_dummies[col] = 0
  future_dummies[col] = 0
  future_dummies[col] = 0
  future_dummies[col] = 0
  future_dummies[col] = 0
  future_dummies[col] = 0
  future_dum

In [33]:
final_op.to_csv('../datasets_all/cntry_year_preditction.csv', index=False)

In [34]:
final_op.head()

Unnamed: 0,Nation,Year,Predicted Emissions
0,AFGHANISTAN,2022,2908.79
1,AFGHANISTAN,2023,2908.79
2,AFGHANISTAN,2024,2908.79
3,ALBANIA,2022,1493.92
4,ALBANIA,2023,1493.92


In [35]:
def predict_future_emissions_v2(selected_country, base_df, mdl, training_cols, years_to_predict=[2022, 2023, 2024]):
    # Get the most recent row for the selected country
    last_row = base_df[base_df['Nation'] == selected_country].sort_values('Year').iloc[-1].copy()

    # Store results
    future_preds = []

    for year in years_to_predict:
        # Prepare input for the model
        future_row = last_row.copy()
        future_row['Year'] = year

        # Convert to DataFrame
        input_df = pd.DataFrame([future_row])

        # One-hot encode 'Nation' (and any other categorical features used in training)
        input_df_encoded = pd.get_dummies(input_df)

        # Align with training columns to ensure consistent input shape
        input_df_encoded = input_df_encoded.reindex(columns=training_cols, fill_value=0)

        # Predict
        pred_value = mdl.predict(input_df_encoded)[0]

        # Save prediction
        future_preds.append({
            'Nation': selected_country,
            'Year': year,
            'Predicted_CO2': pred_value
        })

    return pd.DataFrame(future_preds)


In [36]:
op_2 = []
for cntry in wrld.Nation.unique():
    pred_df = predict_future_emissions_v2(selected_country=cntry, base_df=wrld, mdl=mdl,training_cols=training_cols)
    op_2.append(pred_df)
final_op_2 = pd.concat(op_2, ignore_index=True)

In [37]:
final_op_2.head(10)

Unnamed: 0,Nation,Year,Predicted_CO2
0,AFGHANISTAN,2022,2908.79
1,AFGHANISTAN,2023,2908.79
2,AFGHANISTAN,2024,2908.79
3,ALBANIA,2022,1493.92
4,ALBANIA,2023,1493.92
5,ALBANIA,2024,1493.92
6,ALGERIA,2022,45666.97
7,ALGERIA,2023,45666.97
8,ALGERIA,2024,45666.97
9,ANDORRA,2022,137.07


In [38]:
def predict_future_emissions_v3(selected_country, base_df, mdl, training_cols, years_to_predict=[2022, 2023, 2024]):
    # Filter for the country and sort by year
    country_df = base_df[base_df['Nation'] == selected_country].sort_values('Year')

    if len(country_df) < 2:
        # Not enough data to compute trend
        return pd.DataFrame()

    last_known = country_df.iloc[-1].copy()
    prev_known = country_df.iloc[-2].copy()

    # Emission columns
    emission_cols = [
        'Emissions from solid fuel consumption',
        'Emissions from liquid fuel consumption',
        'Emissions from gas fuel consumption',
        'Emissions from cement production',
        'Emissions from gas flaring',
        'Emissions from bunker fuels (not included in the totals)'
    ]

    # Estimate safe percent changes
    pct_changes = {}
    for col in emission_cols:
        prev_val = prev_known[col]
        last_val = last_known[col]

        if prev_val == 0 or pd.isna(prev_val) or pd.isna(last_val):
            pct_changes[col] = 0.0
        else:
            pct_changes[col] = (last_val - prev_val) / abs(prev_val)

    future_preds = []

    for year in years_to_predict:
        future_row = last_known.copy()
        future_row['Year'] = year

        for col in emission_cols:
            future_val = future_row[col] * (1 + pct_changes[col])
            # Clamp to 0 if negative due to extrapolation
            future_row[col] = max(future_val, 0)

        # Create input DataFrame
        input_df = pd.DataFrame([future_row])

        # One-hot encode
        input_df_encoded = pd.get_dummies(input_df)
        input_df_encoded = input_df_encoded.reindex(columns=training_cols, fill_value=0)

        # Ensure no infinite or NaN
        input_df_encoded.replace([np.inf, -np.inf], np.nan, inplace=True)
        input_df_encoded.fillna(0, inplace=True)

        # Predict
        pred_value = mdl.predict(input_df_encoded)[0]

        future_preds.append({
            'Nation': selected_country,
            'Year': year,
            'Predicted_CO2': pred_value
        })

        # Use this row for next year prediction
        last_known = future_row.copy()

    return pd.DataFrame(future_preds)


In [39]:
op_3 = []
for cntry in wrld.Nation.unique():
    pred_df = predict_future_emissions_v3(selected_country=cntry, base_df=wrld, mdl=mdl,training_cols=training_cols)
    op_3.append(pred_df)
final_op_3 = pd.concat(op_3, ignore_index=True)

In [40]:
final_op_3[final_op_3['Nation'] == 'UNITED STATES OF AMERICA']

Unnamed: 0,Nation,Year,Predicted_CO2
729,UNITED STATES OF AMERICA,2022,1419767.77
730,UNITED STATES OF AMERICA,2023,1454446.24
731,UNITED STATES OF AMERICA,2024,1447436.68


In [41]:
final_op_3.head(10)

Unnamed: 0,Nation,Year,Predicted_CO2
0,AFGHANISTAN,2022,2788.48
1,AFGHANISTAN,2023,2820.86
2,AFGHANISTAN,2024,2518.38
3,ALBANIA,2022,1805.19
4,ALBANIA,2023,2081.05
5,ALBANIA,2024,2408.77
6,ALGERIA,2022,47880.93
7,ALGERIA,2023,56379.7
8,ALGERIA,2024,60672.91
9,ANDORRA,2022,144.9


In [42]:
final_op_3.to_csv('../datasets_all/cntry_year_preditction_v3.csv', index=False)

In [43]:
# combining predicted and historical data
historical = wrld[['Nation', 'Year', 'Total CO2 emissions from fossil-fuels and cement production (thousand metric tons of C)']]
historical.rename(columns={
    'Total CO2 emissions from fossil-fuels and cement production (thousand metric tons of C)': 'CO2',
}, inplace=True)
historical['Source'] = 'Actual'

# predicted
predicted = final_op_3.copy()
predicted.rename(columns={'Predicted_CO2': 'CO2'}, inplace=True)
predicted['Source'] = 'Predicted'

combined_df = pd.concat([historical, predicted], ignore_index=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  historical.rename(columns={
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  historical['Source'] = 'Actual'


In [44]:
combined_df.head()

Unnamed: 0,Nation,Year,CO2,Source
0,AFGHANISTAN,1949,4.0,Actual
1,AFGHANISTAN,1950,23.0,Actual
2,AFGHANISTAN,1951,25.0,Actual
3,AFGHANISTAN,1952,25.0,Actual
4,AFGHANISTAN,1953,29.0,Actual
