In [26]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
pd.set_option('display.max_columns', None)

In [27]:
file_path = "data/Indicator_1_1_annual_6562429754166382300.csv"
df = pd.read_csv(file_path)

In [28]:
print("Shape:", df.shape)
print("\nColumns:", df.columns.tolist())
print("\nMissing values:\n", df.isnull().sum())
print("\nUnique countries:", df['Country'].nunique())
print("\nUnique industries:", df['Industry'].nunique())
print("\nUnique gas types:", df['Gas Type'].nunique())

Shape: (1194, 28)

Columns: ['ObjectId2', 'Country', 'ISO2', 'ISO3', 'Indicator', 'Unit', 'Source', 'CTS Code', 'CTS Name', 'CTS Full Descriptor', 'Industry', 'Gas Type', 'Seasonal Adjustment', 'Scale', '2010', '2011', '2012', '2013', '2014', '2015', '2016', '2017', '2018', '2019', '2020', '2021', '2022', '2023']

Missing values:
 ObjectId2                 0
Country                   0
ISO2                   1194
ISO3                      0
Indicator                 0
Unit                      0
Source                    0
CTS Code                  0
CTS Name                  0
CTS Full Descriptor       0
Industry                  0
Gas Type                  0
Seasonal Adjustment       0
Scale                     0
2010                      0
2011                      0
2012                      0
2013                      0
2014                      0
2015                      0
2016                      0
2017                      0
2018                      0
2019                   

In [29]:
df.head(3)

Unnamed: 0,ObjectId2,Country,ISO2,ISO3,Indicator,Unit,Source,CTS Code,CTS Name,CTS Full Descriptor,Industry,Gas Type,Seasonal Adjustment,Scale,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021,2022,2023
0,1,Advanced Economies,,AETMP,Annual greenhouse gas (GHG) air emissions acco...,Million metric tons of CO2 equivalent,Organisation for Economic Co-operation and Dev...,ECNGA,Greenhouse Gas Emissions (GHG); Air Emissions ...,"Environment, Climate Change, Greenhouse Gas Em...","Agriculture, Forestry and Fishing",Carbon dioxide,Seasonally Adjusted,Units,173.260853,170.220412,177.534807,175.325824,173.567963,177.091399,176.675451,178.385861,267.731429,264.950269,259.339703,264.885635,263.950294,261.488111
1,2,Advanced Economies,,AETMP,Annual greenhouse gas (GHG) air emissions acco...,Million metric tons of CO2 equivalent,Organisation for Economic Co-operation and Dev...,ECNGA,Greenhouse Gas Emissions (GHG); Air Emissions ...,"Environment, Climate Change, Greenhouse Gas Em...","Agriculture, Forestry and Fishing",Fluorinated gases,Seasonally Adjusted,Units,1.024105,1.089814,1.11303,1.193029,1.167994,1.216524,1.177738,1.202901,1.078726,1.057855,1.056469,1.017692,0.986116,0.951482
2,3,Advanced Economies,,AETMP,Annual greenhouse gas (GHG) air emissions acco...,Million metric tons of CO2 equivalent,Organisation for Economic Co-operation and Dev...,ECNGA,Greenhouse Gas Emissions (GHG); Air Emissions ...,"Environment, Climate Change, Greenhouse Gas Em...","Agriculture, Forestry and Fishing",Greenhouse gas,Seasonally Adjusted,Units,1323.978223,1315.166082,1313.348179,1356.224115,1366.936222,1376.562102,1364.670233,1371.16541,1508.188585,1473.062889,1442.419245,1453.734142,1433.944153,1426.65117


In [30]:
for col in df.columns:
    if df[col].nunique() == 1:

        print(col)
    #df[df['Country'] == 'Advanced Economies'].head(10)
        print(df[col].nunique())
    #print(df[col].value_counts())

Indicator
1
Unit
1
Source
1
CTS Code
1
CTS Name
1
CTS Full Descriptor
1
Seasonal Adjustment
1
Scale
1


In [31]:
df = df.drop(['ISO2','Indicator', 'Unit', 'Source', 'CTS Code', 'CTS Name','CTS Full Descriptor','Seasonal Adjustment', 'Scale'], axis = 1)

In [32]:
df['Gas Type'].value_counts()

Carbon dioxide       250
Greenhouse gas       250
Methane              250
Nitrous oxide        250
Fluorinated gases    194
Name: Gas Type, dtype: int64

In [33]:
df.columns

Index(['ObjectId2', 'Country', 'ISO3', 'Industry', 'Gas Type', '2010', '2011',
       '2012', '2013', '2014', '2015', '2016', '2017', '2018', '2019', '2020',
       '2021', '2022', '2023'],
      dtype='object')

In [34]:
countries = df["Country"].unique()
print("Number of countries:", len(countries))
print(countries)

Number of countries: 25
['Advanced Economies' 'Africa' 'Americas' 'Asia'
 'Australia and New Zealand' 'Central Asia' 'Eastern Asia'
 'Eastern Europe' 'Emerging and Developing Economies' 'Europe' 'G20' 'G7'
 'Latin America and the Caribbean' 'Northern Africa' 'Northern America'
 'Northern Europe' 'Oceania' 'Other Oceania sub-regions'
 'South-eastern Asia' 'Southern Asia' 'Southern Europe'
 'Sub-Saharan Africa' 'Western Asia' 'Western Europe' 'World']


In [35]:
counts = (
    df["Country"]
    .value_counts()
    .reset_index()
    .rename(columns={"index": "Country", "Country": "Row_Count"})
)

print("Total unique regions:", len(counts))
counts

Total unique regions: 25


Unnamed: 0,Country,Row_Count
0,Advanced Economies,50
1,Europe,50
2,Western Europe,50
3,Western Asia,50
4,Southern Europe,50
5,Oceania,50
6,Northern Europe,50
7,G7,50
8,G20,50
9,World,50


In [38]:
# Only want continents
df_continents = df[df['Country'].isin(['Asia', 'Africa', 'Europe', 'Oceania', 'Americas'])]
df_continents
# And then focus in on those with area of highest and break down even further

Unnamed: 0,ObjectId2,Country,ISO3,Industry,Gas Type,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021,2022,2023
50,51,Africa,NA605,"Agriculture, Forestry and Fishing",Carbon dioxide,13.764646,14.013880,15.113429,16.348389,16.397802,16.272526,16.599160,18.665183,18.274653,17.479398,17.476979,18.363328,18.627047,18.729477
51,52,Africa,NA605,"Agriculture, Forestry and Fishing",Greenhouse gas,746.666487,759.223800,783.954722,800.405420,815.142069,830.342939,857.029048,866.575458,881.852581,900.163472,927.896582,933.561471,953.968074,971.153944
52,53,Africa,NA605,"Agriculture, Forestry and Fishing",Methane,518.958969,527.655030,545.888680,555.348280,566.883360,578.143041,596.462642,600.885129,612.803188,628.271138,647.997966,650.169169,664.558934,678.069693
53,54,Africa,NA605,"Agriculture, Forestry and Fishing",Nitrous oxide,213.942872,217.554890,222.952613,228.708751,231.860907,235.927372,243.967246,247.025145,250.774741,254.412936,262.421637,265.028975,270.782093,274.354773
54,55,Africa,NA605,Construction,Carbon dioxide,14.739421,14.683788,14.612763,15.531768,15.816543,15.410865,15.617956,15.948885,17.119342,16.420184,14.998471,16.448411,16.097923,16.112988
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
816,936,Oceania,OCETMP,"Water supply; sewerage, waste management and r...",Carbon dioxide,1.078737,1.120538,1.165991,1.199452,1.277886,1.323179,1.366242,1.352139,1.363956,1.324029,1.284978,1.413799,1.380583,1.372198
817,937,Oceania,OCETMP,"Water supply; sewerage, waste management and r...",Fluorinated gases,0.007927,0.007768,0.007353,0.007200,0.006727,0.006216,0.006171,0.005634,0.005229,0.005342,0.005106,0.004618,0.003925,0.004017
818,938,Oceania,OCETMP,"Water supply; sewerage, waste management and r...",Greenhouse gas,17.119341,16.775185,15.623260,14.791663,14.880234,14.524249,15.076835,15.337364,15.258940,14.911750,14.767259,14.873634,14.900376,14.907682
819,939,Oceania,OCETMP,"Water supply; sewerage, waste management and r...",Methane,15.264279,14.859400,13.740488,12.929864,12.908807,12.472949,12.944900,13.239467,13.159212,12.839472,12.716227,12.686177,12.737657,12.742981


In [39]:
df_continents.columns

Index(['ObjectId2', 'Country', 'ISO3', 'Industry', 'Gas Type', '2010', '2011',
       '2012', '2013', '2014', '2015', '2016', '2017', '2018', '2019', '2020',
       '2021', '2022', '2023'],
      dtype='object')

In [36]:
df[df['Country'] == 'Asia']

Unnamed: 0,ObjectId2,Country,ISO3,Industry,Gas Type,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021,2022,2023
138,139,Asia,ASIATMP,"Agriculture, Forestry and Fishing",Carbon dioxide,214.056668,221.169936,212.65191,214.715215,212.09688,224.314506,223.768087,224.261602,219.992939,221.488501,223.87912,222.542854,224.9652,229.701869
139,140,Asia,ASIATMP,"Agriculture, Forestry and Fishing",Fluorinated gases,0.059641,0.080017,0.091665,0.096087,0.11144,0.119199,0.12571,0.157617,0.158843,0.181449,0.20961,0.213433,0.218619,0.222528
140,141,Asia,ASIATMP,"Agriculture, Forestry and Fishing",Greenhouse gas,3079.353009,3119.930523,3122.388257,3144.042878,3123.515817,3156.061579,3166.617272,3173.490867,3166.517308,3167.529595,3227.345226,3242.566583,3247.427058,3260.25818
141,142,Asia,ASIATMP,"Agriculture, Forestry and Fishing",Methane,2132.403708,2146.497967,2153.367065,2165.12396,2162.881568,2169.718376,2187.834087,2202.829924,2195.860406,2181.612171,2226.92703,2249.16394,2249.367919,2253.147571
142,143,Asia,ASIATMP,"Agriculture, Forestry and Fishing",Nitrous oxide,732.832993,752.182603,756.277618,764.107616,748.42593,761.909497,754.889388,746.241724,750.505119,764.247475,776.329465,770.646356,772.87532,777.186212
143,144,Asia,ASIATMP,Construction,Carbon dioxide,375.040905,393.921847,396.686196,398.18931,403.83829,397.155893,385.315699,378.262773,388.168866,395.087723,395.593878,400.597361,393.354773,412.92172
144,145,Asia,ASIATMP,Construction,Fluorinated gases,1.897709,2.128416,2.273805,2.411704,2.594592,2.713938,2.891279,3.069742,3.186891,3.251931,3.240302,3.292828,3.363274,3.424381
145,146,Asia,ASIATMP,Construction,Greenhouse gas,379.682036,398.892241,401.852858,403.47593,409.343983,402.763264,391.002329,384.077147,394.060217,401.013736,401.51114,406.614411,399.468913,419.178442
146,147,Asia,ASIATMP,Construction,Methane,1.641451,1.696804,1.724503,1.710466,1.741808,1.731762,1.67573,1.642628,1.621502,1.606975,1.611396,1.638759,1.657905,1.707066
147,148,Asia,ASIATMP,Construction,Nitrous oxide,1.10197,1.145175,1.168353,1.16445,1.169293,1.161671,1.119621,1.102004,1.082958,1.067107,1.065564,1.085463,1.092961,1.125275


In [None]:
#Compare industries 

In [10]:
counts = (
    df["ISO3"]
    .value_counts()
    .reset_index()
    .rename(columns={"index": "ISO3", "ISO3": "Row_Count"})
)

print("Total unique regions:", len(counts))
counts

Total unique regions: 25


Unnamed: 0,ISO3,Row_Count
0,AETMP,50
1,EURTMP,50
2,NAWE,50
3,NAWA,50
4,NASE,50
5,OCETMP,50
6,NANE,50
7,NA119,50
8,NA120,50
9,WLD,50


In [11]:
print("Indicators:", df["Indicator"].unique()[:3])
print("Gas types:", df["Gas Type"].unique())
print("Industries:", df["Industry"].unique()[:5])

Indicators: ['Annual greenhouse gas (GHG) air emissions accounts']
Gas types: ['Carbon dioxide' 'Fluorinated gases' 'Greenhouse gas' 'Methane'
 'Nitrous oxide']
Industries: ['Agriculture, Forestry and Fishing' 'Construction'
 'Electricity, Gas, Steam and Air Conditioning Supply' 'Manufacturing'
 'Mining']


In [12]:
counts = (
    df["Industry"]
    .value_counts()
    .reset_index()
    .rename(columns={"index": "Industry", "Industry": "Row_Count"})
)

print("Total unique regions:", len(counts))
counts

Total unique regions: 10


Unnamed: 0,Industry,Row_Count
0,Manufacturing,125
1,Other Services Industries,125
2,Total Industry and Households,125
3,Transportation and Storage,120
4,Total Households,119
5,"Agriculture, Forestry and Fishing",116
6,Construction,116
7,"Electricity, Gas, Steam and Air Conditioning S...",116
8,Mining,116
9,"Water supply; sewerage, waste management and r...",116


In [13]:
counts = (
    df["Gas Type"]
    .value_counts()
    .reset_index()
    .rename(columns={"index": "Gas Type", "Gas Type": "Row_Count"})
)

print("Total unique regions:", len(counts))
counts


Total unique regions: 5


Unnamed: 0,Gas Type,Row_Count
0,Carbon dioxide,250
1,Greenhouse gas,250
2,Methane,250
3,Nitrous oxide,250
4,Fluorinated gases,194
