In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import rasterio
import geopandas as gpd
from rasterio.mask import mask
import matplotlib.pyplot as plt
from rasterio.plot import show

In [None]:
# City polygons: load the pickled table and renumber its rows from 0.
# NOTE(review): unpickling can execute arbitrary code - only load this file from a trusted source.
cities_poly_df = pd.read_pickle(
    '../input/cdp-cities-with-polygons/CDP/CDP_cities_with_polygons.pkl'
).reset_index(drop=True)

# Display the CRS attached to the geometry column.
cities_poly_df['geometry'].crs

In [None]:
# Open the GHI GeoTIFF; the path is split across adjacent literals only for readability.
data_file = (
    '../input/geospatial-environmental-and-socioeconomic-data/10_solar_potential/'
    'World_GHI_GISdata_LTAy_AvgDailyTotals_GlobalSolarAtlas-v2_GEOTIFF/'
    'World_GHI_GISdata_LTAy_DailySum_GlobalSolarAtlas_GEOTIFF/GHI.tif'
)
df_data = rasterio.open(data_file)

# Display the raster's CRS as WKT so it can be compared with the polygons' CRS.
df_data.crs.wkt

In [None]:
# Guard: the city polygons and the opened raster must use the same CRS, because
# the polygons are passed to mask() on this raster without any reprojection.
# In this assertion the operand order makes a difference: `a == b` calls the left
# operand's `__eq__` first, so swapping the sides changes which object's CRS
# comparison runs. Presumably the two CRS classes treat a foreign CRS object
# differently - TODO confirm against the installed library versions.
assert cities_poly_df.crs == df_data.crs, 'Different Coordinate Systems'

## Potential coordinates conversion

In this case no coordinate conversion is needed, because both datasets already use the same coordinate reference system (double-checked, including against the reprojected data produced by the commented-out cells below).

In [None]:
# !mkdir -p /kaggle/working/geospatial-environmental-and-socioeconomic-data/10_solar_potential/World_GHI_GISdata_LTAy_AvgDailyTotals_GlobalSolarAtlas-v2_GEOTIFF/World_GHI_GISdata_LTAy_DailySum_GlobalSolarAtlas_GEOTIFF/

In [None]:
# %%time


# # The coordinate reference system of this raster needs to match that of the other datasets before they can be spatially joined.
# from rasterio.warp import calculate_default_transform, reproject, Resampling

# # reprojection of the solar raster (`data_file`)
# dst_crs = 'EPSG:4326'

# with rasterio.open(data_file) as src:
#     transform, width, height = calculate_default_transform(
#         src.crs, dst_crs, src.width, src.height, *src.bounds)
#     kwargs = src.meta.copy()
#     kwargs.update({
#         'crs': dst_crs,
#         'transform': transform,
#         'width': width,
#         'height': height
#     })

#     with rasterio.open('/kaggle/working/geospatial-environmental-and-socioeconomic-data/10_solar_potential/World_GHI_GISdata_LTAy_AvgDailyTotals_GlobalSolarAtlas-v2_GEOTIFF/World_GHI_GISdata_LTAy_DailySum_GlobalSolarAtlas_GEOTIFF/GHI.EPSG4326.tif', 'w', **kwargs) as dst:
#         for i in range(1, src.count + 1):
#             reproject(
#                 source=rasterio.band(src, i),
#                 destination=rasterio.band(dst, i),
#                 src_transform=src.transform,
#                 src_crs=src.crs,
#                 dst_transform=transform,
#                 dst_crs=dst_crs,
#                 resampling=Resampling.nearest)

In [None]:
# df_data_4326 = rasterio.open('/kaggle/working/geospatial-environmental-and-socioeconomic-data/10_solar_potential/World_GHI_GISdata_LTAy_AvgDailyTotals_GlobalSolarAtlas-v2_GEOTIFF/World_GHI_GISdata_LTAy_DailySum_GlobalSolarAtlas_GEOTIFF/GHI.EPSG4326.tif')

## Sample plotting

In [None]:
# Visual sanity check for one city: clip the open raster to the city's polygon,
# then draw the polygon (left panel) next to the clipped raster (right panel).
city_name = 'Warsaw'
city_geometry = cities_poly_df.loc[cities_poly_df['name_conve'] == city_name, 'geometry']

# The `carbon_*` names are kept as they were; the values come from the raster in `df_data`.
carbon_data, carbon_transform = mask(df_data, city_geometry, crop=True)

fig, axs = plt.subplots(1, 2)
city_geometry.plot(ax=axs[0])
show(carbon_data, ax=axs[1])

# Calculate for the whole dataset

## GHI

In [None]:
# Zonal statistics for GHI: for every account that has a polygon, clip the open
# raster (`df_data`) to that polygon and store the mean and the sum of the clipped
# pixel values in `solar_ghi_mean` / `solar_ghi_sum`.
# `mask` is the rasterio.mask function imported at the top of the notebook.
accounts = cities_poly_df.loc[cities_poly_df['geometry'].notnull(), 'Account Number'].values

# Float columns pre-filled with NaN; rows whose polygon cannot be clipped stay NaN.
cities_poly_df['solar_ghi_mean'] = np.nan
cities_poly_df['solar_ghi_sum'] = np.nan

for i, acc in enumerate(accounts):
    if i % 100 == 0:
        print(i)  # coarse progress indicator

    # Row selector for this account, computed once and reused below.
    is_account = cities_poly_df['Account Number'] == acc

    try:
        # NOTE(review): np.nanmean / np.nansum skip only NaN, so this presumably relies on
        # the raster's nodata value being NaN - confirm via df_data.nodata.
        data, _ = mask(df_data, cities_poly_df.loc[is_account, 'geometry'], crop=True)
    except Exception as exc:
        # Best effort: report the city and the actual error, then move on.
        # `Exception` (not a bare except) lets KeyboardInterrupt / SystemExit propagate.
        print('No data polygon for the ',
              cities_poly_df.loc[is_account, ['City', 'Country']].values,
              f'({type(exc).__name__}: {exc})')
        continue

    cities_poly_df.loc[is_account, 'solar_ghi_mean'] = np.nanmean(data)
    cities_poly_df.loc[is_account, 'solar_ghi_sum'] = np.nansum(data)

print('Number of values assigned: ', cities_poly_df['solar_ghi_mean'].notnull().sum())

In [None]:
# Show only the rows for which a GHI mean was computed.
cities_poly_df[cities_poly_df['solar_ghi_mean'].notna()]

# GTI

In [None]:
# Point `df_data` at the GTI GeoTIFF; the path is split across adjacent literals for readability.
data_file = (
    '../input/geospatial-environmental-and-socioeconomic-data/10_solar_potential/'
    'World_GTI_GISdata_LTAy_AvgDailyTotals_GlobalSolarAtlas-v2_GEOTIFF/'
    'World_GTI_GISdata_LTAy_DailySum_GlobalSolarAtlas_GEOTIFF/GTI.tif'
)
df_data = rasterio.open(data_file)

# Display the raster's CRS as WKT.
df_data.crs.wkt

In [None]:
# Guard: the city polygons and the opened raster must use the same CRS, because
# the polygons are passed to mask() on this raster without any reprojection.
# In this assertion the operand order makes a difference: `a == b` calls the left
# operand's `__eq__` first, so swapping the sides changes which object's CRS
# comparison runs. Presumably the two CRS classes treat a foreign CRS object
# differently - TODO confirm against the installed library versions.
assert cities_poly_df.crs == df_data.crs, 'Different Coordinate Systems'

In [None]:
# Visual sanity check for one city: clip the open raster to the city's polygon,
# then draw the polygon (left panel) next to the clipped raster (right panel).
city_name = 'Warsaw'
city_geometry = cities_poly_df.loc[cities_poly_df['name_conve'] == city_name, 'geometry']

# The `carbon_*` names are kept as they were; the values come from the raster in `df_data`.
carbon_data, carbon_transform = mask(df_data, city_geometry, crop=True)

fig, axs = plt.subplots(1, 2)
city_geometry.plot(ax=axs[0])
show(carbon_data, ax=axs[1])

In [None]:
# Zonal statistics for GTI: for every account that has a polygon, clip the open
# raster (`df_data`) to that polygon and store the mean and the sum of the clipped
# pixel values in `solar_gti_mean` / `solar_gti_sum`.
# `mask` is the rasterio.mask function imported at the top of the notebook.
accounts = cities_poly_df.loc[cities_poly_df['geometry'].notnull(), 'Account Number'].values

# Float columns pre-filled with NaN; rows whose polygon cannot be clipped stay NaN.
cities_poly_df['solar_gti_mean'] = np.nan
cities_poly_df['solar_gti_sum'] = np.nan

for i, acc in enumerate(accounts):
    if i % 100 == 0:
        print(i)  # coarse progress indicator

    # Row selector for this account, computed once and reused below.
    is_account = cities_poly_df['Account Number'] == acc

    try:
        # NOTE(review): np.nanmean / np.nansum skip only NaN, so this presumably relies on
        # the raster's nodata value being NaN - confirm via df_data.nodata.
        data, _ = mask(df_data, cities_poly_df.loc[is_account, 'geometry'], crop=True)
    except Exception as exc:
        # Best effort: report the city and the actual error, then move on.
        # `Exception` (not a bare except) lets KeyboardInterrupt / SystemExit propagate.
        print('No data polygon for the ',
              cities_poly_df.loc[is_account, ['City', 'Country']].values,
              f'({type(exc).__name__}: {exc})')
        continue

    cities_poly_df.loc[is_account, 'solar_gti_mean'] = np.nanmean(data)
    cities_poly_df.loc[is_account, 'solar_gti_sum'] = np.nansum(data)

# Count the GTI column filled by this cell (the original counted the GHI column).
print('Number of values assigned: ', cities_poly_df['solar_gti_mean'].notnull().sum())

In [None]:
# Show only the rows for which a GTI mean was computed
# (this section is about GTI; the original filtered on the GHI column).
cities_poly_df.loc[cities_poly_df['solar_gti_mean'].notnull()]

# PVOUT

In [None]:
# Point `df_data` at the PVOUT GeoTIFF; the path is split across adjacent literals for readability.
data_file = (
    '../input/geospatial-environmental-and-socioeconomic-data/10_solar_potential/'
    'World_PVOUT_GISdata_LTAy_AvgDailyTotals_GlobalSolarAtlas-v2_GEOTIFF/'
    'World_PVOUT_GISdata_LTAy_DailySum_GlobalSolarAtlas_GEOTIFF/PVOUT.tif'
)
df_data = rasterio.open(data_file)

# Display the raster's CRS as WKT.
df_data.crs.wkt

In [None]:
# Guard: the city polygons and the opened raster must use the same CRS, because
# the polygons are passed to mask() on this raster without any reprojection.
# In this assertion the operand order makes a difference: `a == b` calls the left
# operand's `__eq__` first, so swapping the sides changes which object's CRS
# comparison runs. Presumably the two CRS classes treat a foreign CRS object
# differently - TODO confirm against the installed library versions.
assert cities_poly_df.crs == df_data.crs, 'Different Coordinate Systems'

In [None]:
# Visual sanity check for one city: clip the open raster to the city's polygon,
# then draw the polygon (left panel) next to the clipped raster (right panel).
city_name = 'Warsaw'
city_geometry = cities_poly_df.loc[cities_poly_df['name_conve'] == city_name, 'geometry']

# The `carbon_*` names are kept as they were; the values come from the raster in `df_data`.
carbon_data, carbon_transform = mask(df_data, city_geometry, crop=True)

fig, axs = plt.subplots(1, 2)
city_geometry.plot(ax=axs[0])
show(carbon_data, ax=axs[1])

In [None]:
# Zonal statistics for PVOUT: for every account that has a polygon, clip the open
# raster (`df_data`) to that polygon and store the mean and the sum of the clipped
# pixel values in `solar_pvout_mean` / `solar_pvout_sum`.
# `mask` is the rasterio.mask function imported at the top of the notebook.
accounts = cities_poly_df.loc[cities_poly_df['geometry'].notnull(), 'Account Number'].values

# Float columns pre-filled with NaN; rows whose polygon cannot be clipped stay NaN.
cities_poly_df['solar_pvout_mean'] = np.nan
cities_poly_df['solar_pvout_sum'] = np.nan

for i, acc in enumerate(accounts):
    if i % 100 == 0:
        print(i)  # coarse progress indicator

    # Row selector for this account, computed once and reused below.
    is_account = cities_poly_df['Account Number'] == acc

    try:
        # NOTE(review): np.nanmean / np.nansum skip only NaN, so this presumably relies on
        # the raster's nodata value being NaN - confirm via df_data.nodata.
        data, _ = mask(df_data, cities_poly_df.loc[is_account, 'geometry'], crop=True)
    except Exception as exc:
        # Best effort: report the city and the actual error, then move on.
        # `Exception` (not a bare except) lets KeyboardInterrupt / SystemExit propagate.
        print('No data polygon for the ',
              cities_poly_df.loc[is_account, ['City', 'Country']].values,
              f'({type(exc).__name__}: {exc})')
        continue

    cities_poly_df.loc[is_account, 'solar_pvout_mean'] = np.nanmean(data)
    cities_poly_df.loc[is_account, 'solar_pvout_sum'] = np.nansum(data)

# Count the PVOUT column filled by this cell (the original counted the GHI column).
print('Number of values assigned: ', cities_poly_df['solar_pvout_mean'].notnull().sum())

In [None]:
# Preview the first five rows of the table.
cities_poly_df.head(n=5)

## Save results

In [None]:
# Create the output directory. exist_ok=True keeps the cell re-runnable,
# whereas a plain `mkdir` reports an error once the directory exists.
os.makedirs('/kaggle/working/solar-potential', exist_ok=True)

In [None]:
# Write the table, including the solar columns added above, to the output directory as a pickle.
output_path = '/kaggle/working/solar-potential/CDP_cities_with_solar_potential.pkl'
cities_poly_df.to_pickle(output_path)