# Prepare CNN Data

## Setup

In [1]:
# USE ACCOUNT: robmarty3@gmail.com
import ee
ee.Authenticate()
ee.Initialize()

Enter verification code:  4/1AX4XfWhFp8h1LFROJ-jbIzCbhsabPaA9dtypqyieLv7fioEdg4nCAuclda4



Successfully saved authorization token.


In [11]:
#image = ee.ImageCollection('LANDSAT/LC08/C02/T1_L2')\
#    .filterDate("2019-01-01", "2019-02-01")\
#    .map(cloud_mask_landsatSR)\
#    .median()
#
#print(image.getInfo())

In [12]:
import numpy as np
import os, datetime
import pandas as pd
import itertools
import tensorflow as tf
import config as cf
import ee_utils as utils
import eeconvert
import time

#import geetools
#from geetools import ui, cloud_mask
#cloud_mask_landsatSR = cloud_mask.landsatSR()
#cloud_mask_sentinel2 = cloud_mask.sentinel2()

## Parameters

In [13]:
#224
SURVEY_NAME = 'DHS'
SATELLITE = 'l8'
SKIP_IF_SCRAPED = True
CHUNK_SIZE = 10 # Number of observtaions to scrape in GEE at any given time
DROPBOX_DIR = cf.DROPBOX_DIRECTORY
GOOGLEDRIVE_DIR = cf.GOOGLEDRIVE_DIRECTORY

if SATELLITE == 's2':
    KERNEL_SIZE = 500
elif SATELLITE == 'l8':
    KERNEL_SIZE = 167

print(KERNEL_SIZE)

167


In [14]:
# Directory to store tfrecords
out_path = os.path.join(GOOGLEDRIVE_DIR, 
            'Data', 
            SURVEY_NAME, 
            'FinalData',
            'Individual Datasets',
            'cnn_' + SATELLITE,
            'tfrecords')

## Load Data

In [15]:
# Relies on VIIRs Values
survey_df = pd.read_csv(os.path.join(DROPBOX_DIR, 'Data', SURVEY_NAME, 'FinalData', 'Individual Datasets', 'survey_socioeconomic.csv'))

viirs_2_5km_df = pd.read_csv(os.path.join(DROPBOX_DIR, 'Data', SURVEY_NAME, 'FinalData', 'Individual Datasets', 'survey_viirs_2_5km.csv'))
viirs_5km_df = pd.read_csv(os.path.join(DROPBOX_DIR, 'Data', SURVEY_NAME, 'FinalData', 'Individual Datasets', 'survey_viirs_5km.csv'))

viirs_2_5km_df.viirs_avg_rad = np.log(viirs_2_5km_df.viirs_avg_rad + 1)
viirs_5km_df.viirs_avg_rad = np.log(viirs_5km_df.viirs_avg_rad + 1)

viirs_2_5km_df = viirs_2_5km_df.rename(columns={"viirs_avg_rad": "viirs_avg_rad_2_5km"})
viirs_5km_df = viirs_5km_df.rename(columns={"viirs_avg_rad": "viirs_avg_rad_5km"})

survey_df = survey_df.merge(viirs_2_5km_df, on = 'uid')
survey_df = survey_df.merge(viirs_5km_df, on = 'uid')

survey_df.shape

(64249, 44)

In [16]:
# Sentinel doesn't capture Lakshadweep (island off coast of mainland India)
#survey_df = survey_df[survey_df['GID_2'] != "IND.18.1_1"]

# List of TF Records
tf_record_list = list(np.unique(survey_df.tfrecord_name))

In [17]:
# Remove if issues extracting
#survey_df = survey_df[survey_df['uid'] != "IA201400180079"]
#survey_df = survey_df[survey_df['uid'] != "IA201400180052"]
#survey_df = survey_df[survey_df['uid'] != "IA201400180112"]
#survey_df = survey_df[survey_df['uid'] != "IA201400180081"]
#survey_df = survey_df[survey_df['uid'] != "IA201400180011"]
#survey_df = survey_df[survey_df['uid'] != "IA201400180048"]
#survey_df = survey_df[survey_df['uid'] != "IA201400180058"]
#survey_df = survey_df[survey_df['uid'] != "IA201400180028"]
#survey_df = survey_df[survey_df['uid'] != "IA201400180072"]
#survey_df = survey_df[survey_df['uid'] != "IA201400180047"]
#survey_df = survey_df[survey_df['uid'] != "IA201400180012"]
#survey_df = survey_df[survey_df['uid'] != "IA201400180040"]
#survey_df = survey_df[survey_df['uid'] != "IA201400180055"]
#survey_df = survey_df[survey_df['uid'] != "IA201400180140"]
#survey_df = survey_df[survey_df['uid'] != "IA201400180030"]
#survey_df = survey_df[survey_df['uid'] != "IA201400180104"]
#survey_df = survey_df[survey_df['uid'] != "IA201400180123"]
#survey_df = survey_df[survey_df['uid'] != "IA201400180062"]
#survey_df = survey_df[survey_df['uid'] != "IA201400180080"]
#survey_df = survey_df[survey_df['uid'] != "IA201400180050"]
#survey_df = survey_df[survey_df['uid'] != "IA201400180116"]

In [18]:
# If skip already scraped, remove existing tfrecords from tf_record_list
if SKIP_IF_SCRAPED:
    tf_records_exist = os.listdir(out_path)
    tf_record_list = [x for x in tf_record_list if x not in tf_records_exist]

In [None]:
### Loop through all tfrecords
for tfr_i in tf_record_list:
    
    # Sometimes we get computational time out errors. If occurs, just skip and go to next.
    # We can then go back and rescrape missed ones.
    try:
    
        survey_df_yeari = survey_df[survey_df['tfrecord_name'] == tfr_i]
        year_i = survey_df_yeari['year'].iloc[0]

        ### Loop through chunks within tfrecord (can only pull so much data from GEE at a time)
        survey_df_yeari['chunk_id'] = utils.chunk_ids(survey_df_yeari.shape[0], CHUNK_SIZE)

        print("Putting " + str(survey_df_yeari.shape[0]) + " observations into " + tfr_i)

        proto_examples_all = []
        for chunk_i in list(np.unique(survey_df_yeari.chunk_id)):
            time.sleep(6)
            print("Observation: " + str(len(proto_examples_all)) + "/" + str(survey_df_yeari.shape[0]))

            survey_df_yeari_chunki = survey_df_yeari[survey_df_yeari['chunk_id'] == chunk_i]

            proto_examples_i = utils.prep_cnn_np(survey_df_yeari_chunki, SATELLITE, KERNEL_SIZE, year_i)
            proto_examples_all.extend(proto_examples_i)

        ### Save data as tf record
        out_path_i = os.path.join(out_path, tfr_i)
        print(out_path_i)
        with tf.io.TFRecordWriter(out_path_i) as writer:
            for tf_example in proto_examples_all:
                writer.write(tf_example.SerializeToString())
                
    except:
        pass

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if sys.path[0] == '':


Putting 128 observations into AO_1_1.tfrecord
Observation: 0/128
Observation: 10/128
Observation: 20/128
Observation: 30/128
Observation: 40/128
Observation: 50/128
Observation: 60/128
Observation: 70/128
Observation: 80/128
Observation: 90/128
Observation: 100/128
Observation: 110/128
Observation: 120/128
/Users/robmarty/Google Drive/World Bank/IEs/Pakistan Poverty Estimation/Data/DHS/FinalData/Individual Datasets/cnn_l8/tfrecords/AO_1_1.tfrecord
Putting 125 observations into AO_2_1.tfrecord
Observation: 0/125
Observation: 10/125
Observation: 20/125
Observation: 30/125
Observation: 40/125
Observation: 50/125
Observation: 60/125
Observation: 70/125
Observation: 80/125
Observation: 90/125
Observation: 100/125
Observation: 110/125
Observation: 120/125
/Users/robmarty/Google Drive/World Bank/IEs/Pakistan Poverty Estimation/Data/DHS/FinalData/Individual Datasets/cnn_l8/tfrecords/AO_2_1.tfrecord
Putting 146 observations into AO_3_1.tfrecord
Observation: 0/146


  return (values2 - values1)/(values2 + values1)


Observation: 10/146
Observation: 20/146
Observation: 30/146
Observation: 40/146
Observation: 50/146
Observation: 60/146
Observation: 70/146
Observation: 80/146
Observation: 90/146
Observation: 100/146
Observation: 110/146
Observation: 120/146
Observation: 130/146
Observation: 140/146
/Users/robmarty/Google Drive/World Bank/IEs/Pakistan Poverty Estimation/Data/DHS/FinalData/Individual Datasets/cnn_l8/tfrecords/AO_3_1.tfrecord
Putting 85 observations into AO_4_1.tfrecord
Observation: 0/85
Observation: 10/85
Observation: 20/85
Putting 141 observations into AO_5_1.tfrecord
Observation: 0/141
Observation: 10/141
Observation: 20/141
Observation: 30/141
Observation: 40/141
Observation: 50/141
Observation: 60/141
Observation: 70/141
Observation: 80/141
Observation: 90/141
Observation: 100/141
Observation: 110/141
Observation: 120/141
Observation: 130/141
Observation: 140/141
/Users/robmarty/Google Drive/World Bank/IEs/Pakistan Poverty Estimation/Data/DHS/FinalData/Individual Datasets/cnn_l8/tf

W0910 12:31:30.334298 4588170688 http.py:171] Sleeping 0.81 seconds before retry 1 of 5 for request: POST https://earthengine.googleapis.com/v1alpha/projects/earthengine-legacy/value:compute?prettyPrint=false&alt=json, after 502


Observation: 50/108
Observation: 60/108
Observation: 70/108
Observation: 80/108
Observation: 90/108
Observation: 100/108
/Users/robmarty/Google Drive/World Bank/IEs/Pakistan Poverty Estimation/Data/DHS/FinalData/Individual Datasets/cnn_l8/tfrecords/BF_3_1.tfrecord
Putting 78 observations into BF_4_1.tfrecord
Observation: 0/78
Observation: 10/78
Observation: 20/78
Observation: 30/78
Observation: 40/78
Observation: 50/78
Observation: 60/78
Observation: 70/78
/Users/robmarty/Google Drive/World Bank/IEs/Pakistan Poverty Estimation/Data/DHS/FinalData/Individual Datasets/cnn_l8/tfrecords/BF_4_1.tfrecord
Putting 99 observations into BF_5_1.tfrecord
Observation: 0/99
Observation: 10/99
Observation: 20/99
Observation: 30/99
Observation: 40/99
Observation: 50/99
Observation: 60/99
Observation: 70/99
Observation: 80/99
Observation: 90/99
Putting 127 observations into BJ_1_1.tfrecord
Observation: 0/127
Observation: 10/127
Observation: 20/127
Observation: 30/127
Observation: 40/127
Observation: 50/

W0910 13:21:31.201469 4588170688 http.py:171] Sleeping 0.17 seconds before retry 1 of 5 for request: POST https://earthengine.googleapis.com/v1alpha/projects/earthengine-legacy/value:compute?prettyPrint=false&alt=json, after 502


Observation: 60/200
Observation: 70/200
Observation: 80/200
Observation: 90/200
Observation: 100/200
Observation: 110/200
Observation: 120/200
Observation: 130/200
Observation: 140/200
Observation: 150/200
Observation: 160/200
Observation: 170/200
Observation: 180/200
Observation: 190/200
/Users/robmarty/Google Drive/World Bank/IEs/Pakistan Poverty Estimation/Data/DHS/FinalData/Individual Datasets/cnn_l8/tfrecords/BO_2_1.tfrecord
Putting 68 observations into BO_2_2.tfrecord
Observation: 0/68
Observation: 10/68
Observation: 20/68
Observation: 30/68
Observation: 40/68
Observation: 50/68
Observation: 60/68
/Users/robmarty/Google Drive/World Bank/IEs/Pakistan Poverty Estimation/Data/DHS/FinalData/Individual Datasets/cnn_l8/tfrecords/BO_2_2.tfrecord
Putting 118 observations into BO_3_1.tfrecord
Observation: 0/118
Observation: 10/118
Observation: 20/118
Observation: 30/118
Observation: 40/118


W0910 13:42:20.492254 4588170688 http.py:171] Sleeping 0.70 seconds before retry 1 of 5 for request: POST https://earthengine.googleapis.com/v1alpha/projects/earthengine-legacy/value:compute?prettyPrint=false&alt=json, after 503


Observation: 50/118
Observation: 60/118
Observation: 70/118
Observation: 80/118
Observation: 90/118
Observation: 100/118
Observation: 110/118
/Users/robmarty/Google Drive/World Bank/IEs/Pakistan Poverty Estimation/Data/DHS/FinalData/Individual Datasets/cnn_l8/tfrecords/BO_3_1.tfrecord
Putting 200 observations into BO_4_1.tfrecord
Observation: 0/200
Observation: 10/200
Observation: 20/200
Observation: 30/200
Observation: 40/200
Observation: 50/200
Observation: 60/200
Observation: 70/200
Observation: 80/200
Observation: 90/200
Observation: 100/200
Observation: 110/200
Observation: 120/200
Observation: 130/200
Observation: 140/200
Observation: 150/200
Observation: 160/200
Observation: 170/200
Observation: 180/200
Observation: 190/200
Putting 15 observations into BO_4_2.tfrecord
Observation: 0/15
Observation: 10/15
/Users/robmarty/Google Drive/World Bank/IEs/Pakistan Poverty Estimation/Data/DHS/FinalData/Individual Datasets/cnn_l8/tfrecords/BO_4_2.tfrecord
Putting 203 observations into BO_

  return (values2 - values1)/(values2 + values1)


Observation: 40/203
Observation: 50/203
Putting 63 observations into BO_5_2.tfrecord
Observation: 0/63
Observation: 10/63
Putting 126 observations into BU_1_1.tfrecord
Observation: 0/126
Observation: 10/126
Observation: 20/126
Observation: 30/126
Observation: 40/126
Observation: 50/126
Putting 117 observations into BU_2_1.tfrecord


In [None]:
for row_i in range(0,10):
    print(row_i)
    proto_examples_i = utils.prep_cnn_np(survey_df_yeari_chunki.iloc[[row_i]], SATELLITE, KERNEL_SIZE, year_i)

In [None]:
survey_df_yeari_chunki.iloc[[1]]