In [None]:
import pandas as pd
import numpy as np

In [None]:
# Read the per-sector CO2 emissions CSV into a DataFrame and show it as the
# cell's output.
df = pd.read_csv(filepath_or_buffer="co-emissions-by-sector.csv")
df

Unnamed: 0,Entity,Code,Year,Carbon dioxide emissions from buildings,Carbon dioxide emissions from industry,Carbon dioxide emissions from land use change and forestry,Carbon dioxide emissions from other fuel combustion,Carbon dioxide emissions from transport,Carbon dioxide emissions from manufacturing and construction,Fugitive emissions of carbon dioxide from energy production,Carbon dioxide emissions from electricity and heat,Carbon dioxide emissions from bunker fuels
0,Afghanistan,AFG,1990,129999.99,50000.0,-2390000.0,0.0,970000.0,570000.0,,320000.0,20000.00
1,Afghanistan,AFG,1991,140000.00,50000.0,-2390000.0,0.0,930000.0,530000.0,,300000.0,20000.00
2,Afghanistan,AFG,1992,150000.00,50000.0,-2390000.0,0.0,740000.0,390000.0,,200000.0,20000.00
3,Afghanistan,AFG,1993,160000.00,50000.0,-2390000.0,0.0,740000.0,380000.0,,200000.0,20000.00
4,Afghanistan,AFG,1994,160000.00,50000.0,-2390000.0,0.0,730000.0,360000.0,,190000.0,20000.00
...,...,...,...,...,...,...,...,...,...,...,...,...
6555,Zimbabwe,ZWE,2017,1070000.00,470000.0,87160000.0,250000.0,2000000.0,1240000.0,,5100000.0,129999.99
6556,Zimbabwe,ZWE,2018,1030000.00,560000.0,87160000.0,320000.0,2620000.0,1540000.0,,5400000.0,160000.00
6557,Zimbabwe,ZWE,2019,890000.00,470000.0,87160000.0,280000.0,2240000.0,1610000.0,,4990000.0,160000.00
6558,Zimbabwe,ZWE,2020,970000.00,500000.0,87160000.0,340000.0,1510000.0,1400000.0,,3840000.0,60000.00


In [None]:
# Step 1: Standardize column names
# Convert every header to snake_case: trim surrounding whitespace, lowercase,
# then turn spaces and hyphens into underscores.
df.columns = [
    header.strip().lower().replace(" ", "_").replace("-", "_")
    for header in df.columns
]
df.columns

Index(['entity', 'code', 'year', 'carbon_dioxide_emissions_from_buildings',
       'carbon_dioxide_emissions_from_industry',
       'carbon_dioxide_emissions_from_land_use_change_and_forestry',
       'carbon_dioxide_emissions_from_other_fuel_combustion',
       'carbon_dioxide_emissions_from_transport',
       'carbon_dioxide_emissions_from_manufacturing_and_construction',
       'fugitive_emissions_of_carbon_dioxide_from_energy_production',
       'carbon_dioxide_emissions_from_electricity_and_heat',
       'carbon_dioxide_emissions_from_bunker_fuels'],
      dtype='object')

In [None]:
# Step 2: Rename long sector names for simplicity
# Each verbose emissions header is mapped to a short sector label; columns not
# listed here keep their current names.
df = df.rename(
    {
        'carbon_dioxide_emissions_from_buildings': 'buildings',
        'carbon_dioxide_emissions_from_industry': 'industry',
        'carbon_dioxide_emissions_from_land_use_change_and_forestry': 'land_use',
        'carbon_dioxide_emissions_from_other_fuel_combustion': 'other_fuel',
        'carbon_dioxide_emissions_from_transport': 'transport',
        'carbon_dioxide_emissions_from_manufacturing_and_construction': 'manufacturing',
        'fugitive_emissions_of_carbon_dioxide_from_energy_production': 'fugitive_emissions',
        'carbon_dioxide_emissions_from_electricity_and_heat': 'electricity_heat',
        'carbon_dioxide_emissions_from_bunker_fuels': 'bunker_fuels',
    },
    axis="columns",
)

In [None]:
# Step 3: Drop high-null column (optional)
# errors='ignore' makes this a no-op when the column is already gone, so the
# cell can be re-run safely.
df = df.drop(labels=['fugitive_emissions'], axis=1, errors='ignore')

In [None]:
# Step 4: Fill remaining NaN values with 0
# Only numeric columns are filled. A blanket df.fillna(0) would also write the
# integer 0 into text columns such as `code`, turning a missing identifier
# into a mixed-type value that looks like real data.
numeric_cols = df.select_dtypes(include="number").columns
df = df.fillna(dict.fromkeys(numeric_cols, 0))

In [None]:
# Step 5: Create a total_emissions column
# The total is the row-wise sum over the sector columns listed below.
sector_cols = [
    'buildings',
    'industry',
    'transport',
    'land_use',
    'other_fuel',
    'manufacturing',
    'electricity_heat',
    'bunker_fuels',
]

df['total_emissions'] = df.loc[:, sector_cols].sum(axis='columns')

# Step 6: Preview the cleaned dataset
print(df.head(5))

        entity code  year  buildings  industry   land_use  other_fuel  \
0  Afghanistan  AFG  1990  129999.99   50000.0 -2390000.0         0.0   
1  Afghanistan  AFG  1991  140000.00   50000.0 -2390000.0         0.0   
2  Afghanistan  AFG  1992  150000.00   50000.0 -2390000.0         0.0   
3  Afghanistan  AFG  1993  160000.00   50000.0 -2390000.0         0.0   
4  Afghanistan  AFG  1994  160000.00   50000.0 -2390000.0         0.0   

   transport  manufacturing  electricity_heat  bunker_fuels  total_emissions  
0   970000.0       570000.0          320000.0       20000.0       -330000.01  
1   930000.0       530000.0          300000.0       20000.0       -420000.00  
2   740000.0       390000.0          200000.0       20000.0       -840000.00  
3   740000.0       380000.0          200000.0       20000.0       -840000.00  
4   730000.0       360000.0          190000.0       20000.0       -880000.00  


In [None]:
# Write the cleaned frame to disk without the row index column.
df.to_csv(path_or_buf="final_co2_emissions.csv", index=False)

In [None]:
# Offer the exported CSV as a browser download when running on Google Colab.
# The google.colab package only exists inside the Colab runtime, so the import
# is guarded: in any other environment the download step is skipped instead of
# aborting the notebook with an ImportError.
try:
    from google.colab import files
except ImportError:
    print("google.colab is not available; skipping browser download of final_co2_emissions.csv")
else:
    files.download("final_co2_emissions.csv")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>