<a href="https://colab.research.google.com/github/sonleh96/wb-gpbp-ldt/blob/dev-ghinwa/Emissions_Coal_PowerPlants_vf_2022.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import geopandas as gpd
import pandas as pd
from shapely.geometry import Point
from io import StringIO

In [2]:
!pip install google-cloud-storage

from google.colab import auth
auth.authenticate_user()

from google.cloud import storage

# Create a client
client = storage.Client()

#Access the Google Bucket
bucket_name = 'wb-ldt'
bucket = client.get_bucket(bucket_name)



In [3]:
#Read Emissions data

# Define the folder path in the bucket (ensure it ends with '/')
folder_path = 'RS/raw-data/geospatial/climate-trace-emissions/filtered_emissions_data/'

# List and read files into DataFrames
blobs = bucket.list_blobs(prefix=folder_path)

# Initialize an empty list to hold DataFrames
df_list = []

# Iterate through the blobs
for i, blob in enumerate(blobs):
    # Skip the first file, if needed
    if i == 0:
        continue
    else:
        # Read the content of the blob (assuming it's a CSV file)
        blob_content = blob.download_as_text()  # Download content as a string

        # Convert the content into a DataFrame
        df = pd.read_csv(StringIO(blob_content))

        # Append the DataFrame to the list
        df_list.append(df)
        print(f"Read {blob.name} into a DataFrame")

# Concatenate all DataFrames into a single DataFrame
final_df = pd.concat(df_list, ignore_index=True)
final_df = final_df.drop_duplicates()

Read RS/raw-data/geospatial/climate-trace-emissions/filtered_emissions_data/cement_emissions_sources.csv into a DataFrame
Read RS/raw-data/geospatial/climate-trace-emissions/filtered_emissions_data/coal-mining_emissions_sources.csv into a DataFrame
Read RS/raw-data/geospatial/climate-trace-emissions/filtered_emissions_data/copper-mining_emissions_sources.csv into a DataFrame
Read RS/raw-data/geospatial/climate-trace-emissions/filtered_emissions_data/cropland-fires_emissions_sources.csv into a DataFrame
Read RS/raw-data/geospatial/climate-trace-emissions/filtered_emissions_data/domestic-aviation_emissions_sources.csv into a DataFrame
Read RS/raw-data/geospatial/climate-trace-emissions/filtered_emissions_data/domestic-shipping_emissions_sources.csv into a DataFrame
Read RS/raw-data/geospatial/climate-trace-emissions/filtered_emissions_data/electricity-generation_emissions_sources.csv into a DataFrame
Read RS/raw-data/geospatial/climate-trace-emissions/filtered_emissions_data/enteric-ferm

In [4]:
final_df['geometry'] = final_df.apply(lambda row: Point((row['lon'], row['lat'])), axis=1)

# Define the GeoDataFrame with a coordinate reference system (CRS) such as WGS84 (EPSG:4326)
emissions_gdf = gpd.GeoDataFrame(final_df, geometry='geometry', crs="EPSG:4326")

In [5]:
power_plant_gdf = emissions_gdf[(emissions_gdf['sector'] == 'power') & (emissions_gdf['source_type'] == 'coal')]
power_plant_gdf.head()

Unnamed: 0,source_id,source_name,source_type,iso3_country,sector,subsector,start_time,end_time,lat,lon,...,other7_def,other8,other8_def,other9,other9_def,other10,other10_def,created_date,modified_date,geometry
11875,25450758,Kolubara A power station,coal,SRB,power,electricity-generation,2019-01-01 00:00:00,2019-12-31 00:00:00,44.4804,20.2936,...,,,,,,,,2023-10-31 00:00:00,2023-11-01 10:00:00,POINT (20.29360 44.48040)
11876,25450758,Kolubara A power station,coal,SRB,power,electricity-generation,2019-01-01 00:00:00,2019-12-31 00:00:00,44.4804,20.2936,...,,,,,,,,2023-10-31 00:00:00,2023-11-01 10:00:00,POINT (20.29360 44.48040)
11877,25450758,Kolubara A power station,coal,SRB,power,electricity-generation,2019-01-01 00:00:00,2019-12-31 00:00:00,44.4804,20.2936,...,,,,,,,,2023-10-31 00:00:00,2023-11-01 10:00:00,POINT (20.29360 44.48040)
11878,25450758,Kolubara A power station,coal,SRB,power,electricity-generation,2019-01-01 00:00:00,2019-12-31 00:00:00,44.4804,20.2936,...,,,,,,,,2023-10-31 00:00:00,2023-11-01 10:00:00,POINT (20.29360 44.48040)
11879,25450758,Kolubara A power station,coal,SRB,power,electricity-generation,2019-01-01 00:00:00,2019-12-31 00:00:00,44.4804,20.2936,...,,,,,,,,2023-10-31 00:00:00,2023-11-01 10:00:00,POINT (20.29360 44.48040)


In [8]:
# Assuming 'date_column' is the column you want to convert
power_plant_gdf['start_time'] = pd.to_datetime(power_plant_gdf['start_time'], errors='coerce')

# Now filter for rows where the year is 2022
filtered_df = power_plant_gdf[(power_plant_gdf['start_time'].dt.year == 2022) & (power_plant_gdf['gas'] == 'co2e_20yr')]

# Display the filtered DataFrame
len(filtered_df)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  super().__setitem__(key, value)


4

**Serbia Shape File**

In [9]:
#Name of Shape File
file_path = "shapefiles/gadm41_SRB_2.json"
gcs_file_path = 'gs://' + bucket_name + '/' + file_path

#Read Shape File --> The shape file gives a MultiPolygon Geometry Column
gdf = gpd.read_file(gcs_file_path)

#Adjust for GeoSpatial Data
center = gpd.GeoDataFrame(gdf[['GID_2', 'NAME_2']])

#Change the MultiPolygon Geometry Column to make it more useful
center['geometry'] = gdf.centroid
center = center.to_crs(gdf.crs)
center['lat'] = center.geometry.y
center['lon'] = center.geometry.x

emissions = gpd.sjoin(filtered_df, gdf, predicate = 'within',
                      how = 'inner')


  center['geometry'] = gdf.centroid
  center['geometry'] = gdf.centroid


In [10]:
100*len(emissions)/len(filtered_df)

100.0

In [11]:
grouped_emissions = emissions.groupby(['GID_2', 'NAME_2', 'gas', 'sector']).agg({
    'emissions_quantity': 'sum',        # Sum of the 'Sales' column
    'emissions_factor': 'mean'     # Average of the 'Quantity' column
})

In [12]:
grouped_emissions = grouped_emissions.reset_index()
grouped_emissions = grouped_emissions.rename(columns={"emissions_quantity": "total_powerplant_coal_emissions_quantity", "emissions_factor": "total_powerplant_coal_emissions_factor"})

In [14]:
grouped_emissions = grouped_emissions[['GID_2', 'NAME_2',	'gas',	'sector',	'total_powerplant_coal_emissions_quantity']]

In [15]:
#Save the HealthCare Facilities in Serbia CSV file
from io import BytesIO

# Convert DataFrame to CSV in memory
csv_buffer = BytesIO()
grouped_emissions.to_csv(csv_buffer, index=False)

# Move the buffer's position to the beginning
csv_buffer.seek(0)

def upload_csv_to_gcs(bucket_name, destination_blob_name, file_buffer):
    """Uploads a file from a buffer to Google Cloud Storage."""
    storage_client = storage.Client()
    bucket = storage_client.bucket(bucket_name)
    blob = bucket.blob(destination_blob_name)

    # Upload the file-like object (buffer)
    blob.upload_from_file(file_buffer, content_type='text/csv')
    print(f"File uploaded to {destination_blob_name}.")

# Define your GCS bucket name and destination path
destination_blob_name = 'RS/processed-data/SRB_emissions-powerplants-coal-2022.csv'

# Upload the CSV from the buffer directly
upload_csv_to_gcs(bucket_name, destination_blob_name, csv_buffer)

File uploaded to RS/processed-data/SRB_emissions-powerplants-coal-2022.csv.
