<a href="https://colab.research.google.com/github/sonleh96/wb-gpbp-ldt/blob/dev-ghinwa/Total_Methane_Emissions_2022_District.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
#Libraries Needed
import geopandas as gpd
import pandas as pd
from google.cloud import storage
from io import StringIO

from shapely.geometry import Point

In [2]:
from google.colab import auth
auth.authenticate_user()

In [3]:
# Create a client
client = storage.Client()

# Access the Google Cloud Storage bucket
bucket_name = 'wb-ldt'
bucket = client.get_bucket(bucket_name)

# Define the folder path in the bucket (ensure it ends with '/')
folder_path = 'RS/raw-data/geospatial/climate-trace-emissions/filtered_emissions_data/'

**Download Emissions Data**

In [4]:
files = list()

blobs = bucket.list_blobs(prefix=folder_path)
for i, blob in enumerate(blobs):
  if i == 0:
    continue  # Skip iteration when i == 1

  else:
    files.append(blob.name)

In [5]:
# List and read files into DataFrames
blobs = bucket.list_blobs(prefix=folder_path)

# Initialize an empty list to hold DataFrames
df_list = []

# Iterate through the blobs
for i, blob in enumerate(blobs):
    # Skip the first file, if needed
    if i == 0:
        continue
    else:
        # Read the content of the blob (assuming it's a CSV file)
        blob_content = blob.download_as_text()  # Download content as a string

        # Convert the content into a DataFrame
        df = pd.read_csv(StringIO(blob_content))

        # Append the DataFrame to the list
        df_list.append(df)
        print(f"Read {blob.name} into a DataFrame")

# Concatenate all DataFrames into a single DataFrame
final_df = pd.concat(df_list, ignore_index=True)
final_df_cleaned = final_df.drop_duplicates()

Read RS/raw-data/geospatial/climate-trace-emissions/filtered_emissions_data/cement_emissions_sources.csv into a DataFrame
Read RS/raw-data/geospatial/climate-trace-emissions/filtered_emissions_data/coal-mining_emissions_sources.csv into a DataFrame
Read RS/raw-data/geospatial/climate-trace-emissions/filtered_emissions_data/copper-mining_emissions_sources.csv into a DataFrame
Read RS/raw-data/geospatial/climate-trace-emissions/filtered_emissions_data/cropland-fires_emissions_sources.csv into a DataFrame
Read RS/raw-data/geospatial/climate-trace-emissions/filtered_emissions_data/domestic-aviation_emissions_sources.csv into a DataFrame
Read RS/raw-data/geospatial/climate-trace-emissions/filtered_emissions_data/domestic-shipping_emissions_sources.csv into a DataFrame
Read RS/raw-data/geospatial/climate-trace-emissions/filtered_emissions_data/electricity-generation_emissions_sources.csv into a DataFrame
Read RS/raw-data/geospatial/climate-trace-emissions/filtered_emissions_data/enteric-ferm

In [6]:
final_df['gas'].unique()

array(['ch4', 'co2', 'co2e_100yr', 'co2e_20yr', 'n2o'], dtype=object)

In [7]:
# Assuming 'date_column' is the column you want to convert
final_df_cleaned['start_time'] = pd.to_datetime(final_df_cleaned['start_time'], errors='coerce')

# Now filter for rows where the year is 2022
final_df = final_df_cleaned[(final_df_cleaned['start_time'].dt.year == 2022) & (final_df_cleaned['gas'] == 'ch4')]
final_df = final_df[['source_id', 'source_name', 'source_type', 'sector', 'subsector', 'lat', 'lon', 'geometry_ref', 'gas', 'emissions_quantity', 'emissions_factor', 'emissions_factor_units']]

In [8]:
len(final_df_cleaned)

96530

In [9]:
# Display the filtered DataFrame
len(final_df)

2594

In [10]:
# Convert the DataFrame into a GeoDataFrame using lat and lon columns
final_df['geometry'] = final_df.apply(lambda row: Point((row['lon'], row['lat'])), axis=1)

# Define the GeoDataFrame with a coordinate reference system (CRS) such as WGS84 (EPSG:4326)
final_df = gpd.GeoDataFrame(final_df, geometry='geometry', crs="EPSG:4326")

**Serbia Shape File**

In [11]:
#Name of Shape File
file_path = "shapefiles/gadm41_SRB_2.json"
gcs_file_path = 'gs://' + bucket_name + '/' + file_path

#Read Shape File --> The shape file gives a MultiPolygon Geometry Column
gdf = gpd.read_file(gcs_file_path)

#Adjust for GeoSpatial Data
center = gpd.GeoDataFrame(gdf[['GID_1', 'NAME_1']])

#Change the MultiPolygon Geometry Column to make it more useful
center['geometry'] = gdf.centroid
center = center.to_crs(gdf.crs)
center['lat'] = center.geometry.y
center['lon'] = center.geometry.x


  center['geometry'] = gdf.centroid
  center['geometry'] = gdf.centroid


In [12]:
emissions = gpd.sjoin(final_df, gdf, predicate = 'within',
                      how = 'inner')

In [13]:
grouped_emissions = emissions.groupby(['GID_1', 'NAME_1', 'gas']).agg({
    'emissions_quantity': 'sum',        # Sum of the 'Sales' column
    'emissions_factor': 'mean'     # Average of the 'Quantity' column
})

In [14]:
grouped_emissions = grouped_emissions.reset_index()
#len(grouped_emissions)

In [15]:
grouped_emissions.head()

Unnamed: 0,GID_1,NAME_1,gas,emissions_quantity,emissions_factor
0,SRB.10_1,Nišavski,ch4,2748.110647,0.025564
1,SRB.11_1,Pčinjski,ch4,3854.151237,0.028772
2,SRB.12_1,Pirotski,ch4,1405.439611,0.028843
3,SRB.13_1,Podunavski,ch4,449.60284,0.029
4,SRB.14_1,Pomoravski,ch4,5873.560229,0.028878


In [16]:
df = grouped_emissions.pivot(index='GID_1', columns='gas', values='emissions_quantity').reset_index()

In [17]:
df = df.rename(columns= {"ch4": "ch4_emissions_quantity"})
df.head()

gas,GID_1,ch4_emissions_quantity
0,SRB.10_1,2748.110647
1,SRB.11_1,3854.151237
2,SRB.12_1,1405.439611
3,SRB.13_1,449.60284
4,SRB.14_1,5873.560229


In [18]:
df['year'] = 2022

In [19]:
df = df.merge(grouped_emissions[['GID_1', 'NAME_1']],
              how = 'inner')

In [21]:
len(df)

25

In [24]:
#Save the HealthCare Facilities in Serbia CSV file
from io import BytesIO

# Convert DataFrame to CSV in memory
csv_buffer = BytesIO()
df.to_csv(csv_buffer, index=False)

# Move the buffer's position to the beginning
csv_buffer.seek(0)

def upload_csv_to_gcs(bucket_name, destination_blob_name, file_buffer):
    """Uploads a file from a buffer to Google Cloud Storage."""
    storage_client = storage.Client()
    bucket = storage_client.bucket(bucket_name)
    blob = bucket.blob(destination_blob_name)

    # Upload the file-like object (buffer)
    blob.upload_from_file(file_buffer, content_type='text/csv')
    print(f"File uploaded to {destination_blob_name}.")

# Define your GCS bucket name and destination path
destination_blob_name = 'RS/processed-data-district/SRB_methane_emissions-2022.csv'

# Upload the CSV from the buffer directly
upload_csv_to_gcs(bucket_name, destination_blob_name, csv_buffer)

File uploaded to RS/processed-data-district/SRB_methane_emissions-2022.csv.


In [None]:
grouped_emissions = emissions.groupby(['GID_2', 'NAME_2', 'gas', 'sector']).agg({
    'emissions_quantity': 'sum',        # Sum of the 'Sales' column
    'emissions_factor': 'mean'     # Average of the 'Quantity' column
})

In [None]:
grouped_emissions = grouped_emissions.reset_index()

In [None]:
#Save the HealthCare Facilities in Serbia CSV file
from io import BytesIO

# Convert DataFrame to CSV in memory
csv_buffer = BytesIO()
grouped_emissions.to_csv(csv_buffer, index=False)

# Move the buffer's position to the beginning
csv_buffer.seek(0)

def upload_csv_to_gcs(bucket_name, destination_blob_name, file_buffer):
    """Uploads a file from a buffer to Google Cloud Storage."""
    storage_client = storage.Client()
    bucket = storage_client.bucket(bucket_name)
    blob = bucket.blob(destination_blob_name)

    # Upload the file-like object (buffer)
    blob.upload_from_file(file_buffer, content_type='text/csv')
    print(f"File uploaded to {destination_blob_name}.")

# Define your GCS bucket name and destination path
destination_blob_name = 'RS/processed-data/SRB_emissions-per-sector-all-sources-2022.csv'

# Upload the CSV from the buffer directly
upload_csv_to_gcs(bucket_name, destination_blob_name, csv_buffer)

File uploaded to RS/processed-data/SRB_emissions-per-sector-all-sources-2022.csv.
