## This notebook contains refactored code for:
- creating ~1km square box shapefiles for cement and steel data for China
- over ~1km is chosen so that the image chips are big enough to then crop down to the standard 256 x 256 pixels once the chips are downloaded
- Landcover polygons are also created around cement plants

In [1]:
import geopandas as gpd
from geopandas import GeoDataFrame
import pandas as pd
from shapely.geometry import Point,Polygon, LineString
import os, sys
import matplotlib.pyplot as plt
%matplotlib inline
import fiona
import numpy as np
import time


In [44]:
cempath = r'C:\\Users\\Maral.Bayaraa\\OneDrive - Satellite Applications Catapult\\Projects-Maral-Z\\SustainableFinance\\ALD_Phase2\\Cement_dataset_v3_SmithSchool\\'

steelpath = r'C:\\Users\\Maral.Bayaraa\\OneDrive - Satellite Applications Catapult\\Projects-Maral-Z\\SustainableFinance\\ALD_Phase2\\steel_polys\\'

In [53]:
def create_polys(path,filename,planttype):
    

    #open the file
    df = pd.read_excel(os.path.join(path,filename))


    #make sure there's no NaN location
    df = df[df['latitude'].notnull()]

    #define the geometry 
    geometry = [Point(xy) for xy in zip(df.longitude, df.latitude)] #df.x, df.y
    #Create a geodataframe
    crs = {'init': 'epsg:4326'} #http://www.spatialreference.org/ref/epsg/2263/
    geo_df = gpd.GeoDataFrame(df, crs=crs, geometry=geometry)
    
    #Convert crs to UTM
    geo_df_UTM = geo_df.to_crs({'init':"EPSG:3395"})
    print(geo_df_UTM.crs)
    
    #create an AOI box
    #polygon_buffer= geo_df_UTM.buffer(500).envelope
    polygon_buffer= geo_df_UTM.buffer(700).envelope

    #Replace point data with polygon data
    geo_df_UTM['geometry'] = polygon_buffer
    
    geo_df_UTM['plantID']=np.arange(len(geo_df_UTM))
    
    '''Only include China
    '''
    
    df_china = geo_df_UTM[geo_df_UTM.iso3=='CHN']
    df_china = df_china[df_china.accuracy=='Exact']
    if planttype == 'cement':
        print('cement plant')
        df_china= df_china[df_china.plant_type == 'Integrated']
    else:
        print('steel plant')
    
    #for cement only: get integrated plants only
    
    return df_china

### STEEL

In [54]:
steel = create_polys(steelpath,'steel_dataset_v3.1.xlsx','steel')

  return _prepare_from_string(" ".join(pjargs))


{'init': 'EPSG:3395'}
steel plant


In [40]:
#a.to_file(os.path.join(path,'2_steeldatasetv3_UTM_CHINA_exactlocPOLYS.geojson'), driver='GeoJSON')
steel.to_file(os.path.join(steelpath,'2_steeldatasetv3_UTM_CHINA_exactlocPOLYS_700m.geojson'), driver='GeoJSON')

Try to make the shapefile so that it extracts 256 by 256 pixels

### cement

In [55]:
cem = create_polys(cempath,'cement_dataset_v3.xlsx','cement')

  return _prepare_from_string(" ".join(pjargs))


{'init': 'EPSG:3395'}
cement plant


In [51]:
cem.to_file(os.path.join(cempath,'4_cementdatasetv3_UTM_CHINA_exactlocPOLYS_1km.geojson'), driver='GeoJSON')

# landcover

In [112]:
def get_lc_centrepoints(path,filename):
    '''Creates landcover samples around cement China plants
    '''
    
    #open the file
    df = pd.read_excel(os.path.join(path,filename))

    #make sure there's no NaN location
    df = df[df['latitude'].notnull()]

    #define the geometry 
    geometry = [Point(xy) for xy in zip(df.longitude, df.latitude)] #df.x, df.y
    #Create a geodataframe
    crs = {'init': 'epsg:4326'} #http://www.spatialreference.org/ref/epsg/2263/
    geo_df = gpd.GeoDataFrame(df, crs=crs, geometry=geometry)
    
    #Convert crs to UTM
    geo_df_UTM = geo_df.to_crs({'init':"EPSG:3395"})
    print(geo_df_UTM.crs)
    
    df_china = geo_df_UTM[geo_df_UTM.iso3=='CHN']
    df_china = df_china[df_china.accuracy=='Exact']
    df_china= df_china[df_china.plant_type == 'Integrated']

    
    #create a HUGE buffer to sample landcover samples
    gdf_buffer=df_china.buffer(3500).envelope
    
    '''Intersect large squares so they do not interact with each other
    '''
    gdf_union = gpd.GeoDataFrame()
    #Union the polygons so any that overlap become one
    gdf_union['geometry'] = gdf_buffer.unary_union
    gdf_union.crs={'init': 'EPSG:3395'}
    
    '''Get the coordinates of the large polygons around which
    landcover samples will be generated
    '''
    
    #get the number of nodes in each polygon
    #coords_list = []
    point_list = []
    IDs = []
    for index, row in gdf_union.iterrows():
        #if row['geometry'].type == 'Polygon':
        coords = list(row['geometry'].exterior.coords)
        #onecoord = row['outlines'].exterior.coords
        
        #create a point for each coordinate
        for item in coords:
            points = Point(item)
            point_list.append(points)
            IDs.append(index)
    
    gdf_points = gpd.GeoDataFrame()
    gdf_points['geometry'] = point_list
    #define the coordinates for the points
    gdf_points.crs = "EPSG:3395"
    
    
    ''' Create the buffers around each landcover polygon
    '''
    polygon_buffer= gdf_points.buffer(700).envelope
    polygon_buffer.crs = "EPSG:3395"

    
    return polygon_buffer
    

In [109]:
lc = get_lc_centrepoints(cempath,'cement_dataset_v3.xlsx')

  return _prepare_from_string(" ".join(pjargs))


{'init': 'EPSG:3395'}


    Now need to intersect with steel plants so we remove any lc polygon that touches steel plant samples --- had a look at the current data and actually this seems okay so ignoring here

In [137]:
outpath = r'C:\\Users\\Maral.Bayaraa\\OneDrive - Satellite Applications Catapult\\Projects-Maral-Z\\SustainableFinance\\ALD_Phase2\\1_ALD_phase2_training_data\\'

In [138]:
lc.to_file(os.path.join(outpath,'Landcover_fromCementdatasetv3_1kmpolys.geojson'), driver='GeoJSON')