# Prepare CNN Data

## Setup

In [31]:
import ee
#ee.Authenticate()
ee.Initialize()

In [32]:
import numpy as np
import os, datetime
import pandas as pd
import itertools
import tensorflow as tf
import config as cf
import ee_utils as utils
import eeconvert
import time
from datetime import datetime
import glob

DROPBOX_DIR = cf.DROPBOX_DIRECTORY

In [33]:
def decode_fn_uid(record_bytes):
    return tf.io.parse_single_example(
        # Data
        record_bytes,

        # Schema
        {"uid": tf.io.FixedLenFeature([], dtype=tf.string)}
    )

def extract_uid(TF_FILES):
    actual_values = []
    for batch in tf.data.TFRecordDataset([TF_FILES]).map(decode_fn_uid):
        value = batch['uid'].numpy()
        actual_values.append(value)

    return actual_values

## Parameters

In [34]:
# Datasets -------------------------------------
SURVEY_NAME = 'DHS'

VERSION = 1
if VERSION == 1:

    SATELLITE         = 's2' 
    OUTCOME_VAR       = "viirs" 
    UNDERSAMPLE_INDIA = True
    
if VERSION == 2:

    SATELLITE         = 'landsat' 
    OUTCOME_VAR       = "ntlharmon" 
    UNDERSAMPLE_INDIA = True

# Processing data ------------------------------
SKIP_IF_SCRAPED = True ## Skip if filename has already been created
CHECK_IF_UID_SCRAPED = True ## Load data already scraped and skip if scraped; add date to filename
IGNORE_ERRORS = False ## Load dataset of errors and remove from ones to scrape

CHUNK_SIZE = 1 # Number of observtaions to scrape in GEE at any given time

# Parameters based on dataset ---------------------
if SATELLITE == 's2':
    KERNEL_SIZE = 224
elif SATELLITE == 'landsat':
    KERNEL_SIZE = 224 #167
elif SATELLITE == 'landsat_7':
    KERNEL_SIZE = 224 #167

print(KERNEL_SIZE)

224


In [35]:
# Directory to store tfrecords
out_path = os.path.join(DROPBOX_DIR, 
            'Data', 
            SURVEY_NAME, 
            'FinalData',
            'Individual Datasets',
            'cnn_' + SATELLITE + '_' + OUTCOME_VAR + '_underia' + str(UNDERSAMPLE_INDIA),
            'tfrecords')

out_path_errors = os.path.join(DROPBOX_DIR, 
            'Data', 
            SURVEY_NAME, 
            'FinalData',
            'Individual Datasets',
            'cnn_' + SATELLITE + '_' + OUTCOME_VAR + '_underia' + str(UNDERSAMPLE_INDIA))

## Load Data

In [36]:
### Load data
if UNDERSAMPLE_INDIA == True:
    UNDERSAMPLE_INDIA_str = "TRUE"
else:
    UNDERSAMPLE_INDIA_str = "FALSE"
    
survey_df = pd.read_csv(os.path.join(DROPBOX_DIR, 'Data', SURVEY_NAME, 'FinalData', 'Individual Datasets', 
                                     'data_for_cnn_' + OUTCOME_VAR + '_iaunder' + UNDERSAMPLE_INDIA_str + '_' + SATELLITE + '.csv'))

### If sentinel, only use most recent
if SATELLITE == 's2':
    survey_df = survey_df[survey_df.most_recent_survey == True]
        
### N Observations      
print(survey_df.shape)
print(survey_df.ntl_group.value_counts())

(63911, 9)
0    22545
2    11104
1    10690
4     9830
3     9742
Name: ntl_group, dtype: int64


### Check if UID Already Scraped

(1) Filter surveys to those that have been scraped, and (2) add date/time to filename (so process of checking if file has been scraped doesnt skip it)

In [37]:
if IGNORE_ERRORS:
    error_files = glob.glob(out_path_errors + '/*.csv')
    error_df = pd.concat([pd.read_csv(f) for f in error_files])

    survey_df = survey_df[~survey_df['uid'].isin(error_df['uid'])]

In [38]:
## Check if everything processed, ignoring errors
if False:
    tf_paths = glob.glob(out_path + '/*.tfrecord')
    processed_uids = extract_uid(tf_paths)

    ## List of IDs already processed
    processed_uids = [x.decode('utf-8') for x in processed_uids]

    ## Subset survey to uids not scraped
    survey_df = survey_df[~survey_df['uid'].isin(processed_uids)]

In [39]:
if CHECK_IF_UID_SCRAPED:

    tf_paths = glob.glob(out_path + '/*.tfrecord')
    processed_uids = extract_uid(tf_paths)
    
    ## List of IDs already processed
    processed_uids = [x.decode('utf-8') for x in processed_uids]
    
    ## Subset survey to uids not scraped
    survey_df = survey_df[~survey_df['uid'].isin(processed_uids)]
    
    ## Change name of tfrecords
    txt_to_add = datetime.now().strftime("%Y%m%d%H%M%S")
    
    survey_df['tfrecord_name'] = survey_df.apply(lambda row: row['tfrecord_name'].replace('.tfrecord', "_" + txt_to_add + '.tfrecord'), axis='columns')

### Skip tfrecords already processed

In [40]:
survey_df.head()

Unnamed: 0,uid,GID_2,year,most_recent_survey,ntl_group,longitude,latitude,tfrecord_name,use_for_cnn
5572,IA201400180047,IND.18.1_1,2015,True,2,72.71837,11.121363,forcnn_train_IA_4_1_all_20221212115634.tfrecord,yes
5580,IA201400180138,IND.18.1_1,2015,True,2,72.719906,11.125791,forcnn_train_IA_4_2_all_20221212115634.tfrecord,yes
10059,UG201800000327,UGA.19.1_1,2018,True,0,0.0,0.0,forcnn_train_UG_3_1_all_20221212115634.tfrecord,yes
10551,UG201800000328,UGA.19.1_1,2018,True,0,0.0,0.0,forcnn_train_UG_3_1_all_20221212115634.tfrecord,yes
10896,UG201800000334,UGA.19.1_1,2018,True,0,0.0,0.0,forcnn_train_UG_3_1_all_20221212115634.tfrecord,yes


In [41]:
survey_df.shape

(51, 9)

In [42]:
# List of TF Records
tf_record_list = list(np.unique(survey_df.tfrecord_name))

len(tf_record_list)

19

In [43]:
# If skip already scraped, remove existing tfrecords from tf_record_list
if SKIP_IF_SCRAPED:
    tf_records_exist = os.listdir(out_path)
    tf_record_list = [x for x in tf_record_list if x not in tf_records_exist]
    
print(len(tf_record_list))

19


In [44]:
# https://gist.github.com/erdemarslan/3ec02009f38f8df84c8e4807e7954af3
if False:
    import urllib3

    def check_internet_conn():
        http = urllib3.PoolManager(timeout=3.0)
        r = http.request('GET', 'google.com', preload_content=False)
        code = r.status
        r.release_conn()
        if code == 200:
            return True
        else:
            return False

## Query Data

In [None]:
## Blank error dataframe
errors_df = pd.DataFrame()

## Error file name
now = datetime.now()
current_time = now.strftime("%d_%m_%y_%H_%M_%S")
error_file_name = 'errors_' + current_time + '.csv'

if True:
    ### Loop through all tfrecords
    for tfr_i in tf_record_list:

        # Sometimes we get computational time out errors. If occurs, just skip and go to next.
        # We can then go back and rescrape missed ones.

        survey_df_yeari = survey_df[survey_df['tfrecord_name'] == tfr_i]
        year_i = survey_df_yeari['year'].iloc[0]

        ### Loop through chunks within tfrecord (can only pull so much data from GEE at a time)
        survey_df_yeari['chunk_id'] = utils.chunk_ids(survey_df_yeari.shape[0], CHUNK_SIZE)

        print("Putting " + str(survey_df_yeari.shape[0]) + " observations into " + tfr_i)

        proto_examples_all = []
        for chunk_i in list(np.unique(survey_df_yeari.chunk_id)):
            

            try:

                time.sleep(3)
                print("Observation: " + str(len(proto_examples_all)) + "/" + str(survey_df_yeari.shape[0]))

                survey_df_yeari_chunki = survey_df_yeari[survey_df_yeari['chunk_id'] == chunk_i]

                proto_examples_i = utils.prep_cnn_np(survey_df_yeari_chunki, SATELLITE, KERNEL_SIZE, year_i)
                proto_examples_all.extend(proto_examples_i)

            except:

                print("Error ---")
                print(survey_df_yeari_chunki['uid'])

                errors_df = errors_df.append(survey_df_yeari_chunki[['uid']], ignore_index = True)
                errors_df.to_csv(os.path.join(out_path_errors, error_file_name))

                time.sleep(15)
                pass

        ### Save data as tf record
        out_path_i = os.path.join(out_path, tfr_i)
        print(out_path_i)
        with tf.io.TFRecordWriter(out_path_i) as writer:
            for tf_example in proto_examples_all:
                writer.write(tf_example.SerializeToString())

        print("Success \o/")



Putting 1 observations into forcnn_train_IA_4_1_all_20221212115634.tfrecord


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  survey_df_yeari['chunk_id'] = utils.chunk_ids(survey_df_yeari.shape[0], CHUNK_SIZE)


Observation: 0/1
Error ---
5572    IA201400180047
Name: uid, dtype: object


  errors_df = errors_df.append(survey_df_yeari_chunki[['uid']], ignore_index = True)


/Users/robmarty/Dropbox/World Bank/IEs/Pakistan Poverty Estimation from Satellites/Data/DHS/FinalData/Individual Datasets/cnn_s2_viirs_underiaTrue/tfrecords/forcnn_train_IA_4_1_all_20221212115634.tfrecord
Success \o/
Putting 1 observations into forcnn_train_IA_4_2_all_20221212115634.tfrecord


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  survey_df_yeari['chunk_id'] = utils.chunk_ids(survey_df_yeari.shape[0], CHUNK_SIZE)


Observation: 0/1
Error ---
5580    IA201400180138
Name: uid, dtype: object


  errors_df = errors_df.append(survey_df_yeari_chunki[['uid']], ignore_index = True)


/Users/robmarty/Dropbox/World Bank/IEs/Pakistan Poverty Estimation from Satellites/Data/DHS/FinalData/Individual Datasets/cnn_s2_viirs_underiaTrue/tfrecords/forcnn_train_IA_4_2_all_20221212115634.tfrecord
Success \o/
Putting 4 observations into forcnn_train_UG_3_1_all_20221212115634.tfrecord


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  survey_df_yeari['chunk_id'] = utils.chunk_ids(survey_df_yeari.shape[0], CHUNK_SIZE)


Observation: 0/4
Error ---
10059    UG201800000327
Name: uid, dtype: object


  errors_df = errors_df.append(survey_df_yeari_chunki[['uid']], ignore_index = True)


Observation: 0/4
Error ---
10551    UG201800000328
Name: uid, dtype: object


  errors_df = errors_df.append(survey_df_yeari_chunki[['uid']], ignore_index = True)


Observation: 0/4
Error ---
10896    UG201800000334
Name: uid, dtype: object


  errors_df = errors_df.append(survey_df_yeari_chunki[['uid']], ignore_index = True)


Observation: 0/4
Error ---
11610    UG201800000323
Name: uid, dtype: object


  errors_df = errors_df.append(survey_df_yeari_chunki[['uid']], ignore_index = True)


/Users/robmarty/Dropbox/World Bank/IEs/Pakistan Poverty Estimation from Satellites/Data/DHS/FinalData/Individual Datasets/cnn_s2_viirs_underiaTrue/tfrecords/forcnn_train_UG_3_1_all_20221212115634.tfrecord
Success \o/
Putting 3 observations into nocnn_IA_4_10_all_20221212115634.tfrecord


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  survey_df_yeari['chunk_id'] = utils.chunk_ids(survey_df_yeari.shape[0], CHUNK_SIZE)


Observation: 0/3
Error ---
15365    IA201400180112
Name: uid, dtype: object


  errors_df = errors_df.append(survey_df_yeari_chunki[['uid']], ignore_index = True)


Observation: 0/3
Error ---
35246    IA201400180062
Name: uid, dtype: object


  errors_df = errors_df.append(survey_df_yeari_chunki[['uid']], ignore_index = True)


Observation: 0/3
Error ---
45548    IA201400180116
Name: uid, dtype: object


  errors_df = errors_df.append(survey_df_yeari_chunki[['uid']], ignore_index = True)


/Users/robmarty/Dropbox/World Bank/IEs/Pakistan Poverty Estimation from Satellites/Data/DHS/FinalData/Individual Datasets/cnn_s2_viirs_underiaTrue/tfrecords/nocnn_IA_4_10_all_20221212115634.tfrecord
Success \o/
Putting 1 observations into nocnn_IA_4_11_all_20221212115634.tfrecord


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  survey_df_yeari['chunk_id'] = utils.chunk_ids(survey_df_yeari.shape[0], CHUNK_SIZE)


Observation: 0/1
Error ---
21090    IA201400180091
Name: uid, dtype: object


  errors_df = errors_df.append(survey_df_yeari_chunki[['uid']], ignore_index = True)


/Users/robmarty/Dropbox/World Bank/IEs/Pakistan Poverty Estimation from Satellites/Data/DHS/FinalData/Individual Datasets/cnn_s2_viirs_underiaTrue/tfrecords/nocnn_IA_4_11_all_20221212115634.tfrecord
Success \o/
Putting 1 observations into nocnn_IA_4_12_all_20221212115634.tfrecord


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  survey_df_yeari['chunk_id'] = utils.chunk_ids(survey_df_yeari.shape[0], CHUNK_SIZE)


Observation: 0/1
Error ---
50767    IA201400180064
Name: uid, dtype: object


  errors_df = errors_df.append(survey_df_yeari_chunki[['uid']], ignore_index = True)


/Users/robmarty/Dropbox/World Bank/IEs/Pakistan Poverty Estimation from Satellites/Data/DHS/FinalData/Individual Datasets/cnn_s2_viirs_underiaTrue/tfrecords/nocnn_IA_4_12_all_20221212115634.tfrecord
Success \o/
Putting 2 observations into nocnn_IA_4_13_all_20221212115634.tfrecord


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  survey_df_yeari['chunk_id'] = utils.chunk_ids(survey_df_yeari.shape[0], CHUNK_SIZE)


Observation: 0/2
Error ---
28655    IA201400180072
Name: uid, dtype: object


  errors_df = errors_df.append(survey_df_yeari_chunki[['uid']], ignore_index = True)


Observation: 0/2
Error ---
37844    IA201400180079
Name: uid, dtype: object


  errors_df = errors_df.append(survey_df_yeari_chunki[['uid']], ignore_index = True)


/Users/robmarty/Dropbox/World Bank/IEs/Pakistan Poverty Estimation from Satellites/Data/DHS/FinalData/Individual Datasets/cnn_s2_viirs_underiaTrue/tfrecords/nocnn_IA_4_13_all_20221212115634.tfrecord
Success \o/
Putting 4 observations into nocnn_IA_4_16_all_20221212115634.tfrecord


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  survey_df_yeari['chunk_id'] = utils.chunk_ids(survey_df_yeari.shape[0], CHUNK_SIZE)


Observation: 0/4
Error ---
14814    IA201400180140
Name: uid, dtype: object


  errors_df = errors_df.append(survey_df_yeari_chunki[['uid']], ignore_index = True)


Observation: 0/4
Error ---
16064    IA201400180040
Name: uid, dtype: object


  errors_df = errors_df.append(survey_df_yeari_chunki[['uid']], ignore_index = True)


Observation: 0/4
Error ---
37094    IA201400180055
Name: uid, dtype: object


  errors_df = errors_df.append(survey_df_yeari_chunki[['uid']], ignore_index = True)


Observation: 0/4
Error ---
37118    IA201400180080
Name: uid, dtype: object


  errors_df = errors_df.append(survey_df_yeari_chunki[['uid']], ignore_index = True)


/Users/robmarty/Dropbox/World Bank/IEs/Pakistan Poverty Estimation from Satellites/Data/DHS/FinalData/Individual Datasets/cnn_s2_viirs_underiaTrue/tfrecords/nocnn_IA_4_16_all_20221212115634.tfrecord
Success \o/
Putting 1 observations into nocnn_IA_4_17_all_20221212115634.tfrecord


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  survey_df_yeari['chunk_id'] = utils.chunk_ids(survey_df_yeari.shape[0], CHUNK_SIZE)


Observation: 0/1
Error ---
58997    IA201400180058
Name: uid, dtype: object


  errors_df = errors_df.append(survey_df_yeari_chunki[['uid']], ignore_index = True)


/Users/robmarty/Dropbox/World Bank/IEs/Pakistan Poverty Estimation from Satellites/Data/DHS/FinalData/Individual Datasets/cnn_s2_viirs_underiaTrue/tfrecords/nocnn_IA_4_17_all_20221212115634.tfrecord
Success \o/
Putting 1 observations into nocnn_IA_4_18_all_20221212115634.tfrecord


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  survey_df_yeari['chunk_id'] = utils.chunk_ids(survey_df_yeari.shape[0], CHUNK_SIZE)


Observation: 0/1
Error ---
32901    IA201400180050
Name: uid, dtype: object


  errors_df = errors_df.append(survey_df_yeari_chunki[['uid']], ignore_index = True)


/Users/robmarty/Dropbox/World Bank/IEs/Pakistan Poverty Estimation from Satellites/Data/DHS/FinalData/Individual Datasets/cnn_s2_viirs_underiaTrue/tfrecords/nocnn_IA_4_18_all_20221212115634.tfrecord
Success \o/
Putting 1 observations into nocnn_IA_4_19_all_20221212115634.tfrecord


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  survey_df_yeari['chunk_id'] = utils.chunk_ids(survey_df_yeari.shape[0], CHUNK_SIZE)


Observation: 0/1
Error ---
56608    IA201400180012
Name: uid, dtype: object


  errors_df = errors_df.append(survey_df_yeari_chunki[['uid']], ignore_index = True)


/Users/robmarty/Dropbox/World Bank/IEs/Pakistan Poverty Estimation from Satellites/Data/DHS/FinalData/Individual Datasets/cnn_s2_viirs_underiaTrue/tfrecords/nocnn_IA_4_19_all_20221212115634.tfrecord
Success \o/
Putting 1 observations into nocnn_IA_4_1_all_20221212115634.tfrecord


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  survey_df_yeari['chunk_id'] = utils.chunk_ids(survey_df_yeari.shape[0], CHUNK_SIZE)


Observation: 0/1
Error ---
17462    IA201400180011
Name: uid, dtype: object


  errors_df = errors_df.append(survey_df_yeari_chunki[['uid']], ignore_index = True)


/Users/robmarty/Dropbox/World Bank/IEs/Pakistan Poverty Estimation from Satellites/Data/DHS/FinalData/Individual Datasets/cnn_s2_viirs_underiaTrue/tfrecords/nocnn_IA_4_1_all_20221212115634.tfrecord
Success \o/
Putting 2 observations into nocnn_IA_4_21_all_20221212115634.tfrecord


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  survey_df_yeari['chunk_id'] = utils.chunk_ids(survey_df_yeari.shape[0], CHUNK_SIZE)


Observation: 0/2
Error ---
51963    IA201400180048
Name: uid, dtype: object


  errors_df = errors_df.append(survey_df_yeari_chunki[['uid']], ignore_index = True)


Observation: 0/2
Error ---
52220    IA201400180086
Name: uid, dtype: object


  errors_df = errors_df.append(survey_df_yeari_chunki[['uid']], ignore_index = True)


/Users/robmarty/Dropbox/World Bank/IEs/Pakistan Poverty Estimation from Satellites/Data/DHS/FinalData/Individual Datasets/cnn_s2_viirs_underiaTrue/tfrecords/nocnn_IA_4_21_all_20221212115634.tfrecord
Success \o/
Putting 4 observations into nocnn_IA_4_22_all_20221212115634.tfrecord


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  survey_df_yeari['chunk_id'] = utils.chunk_ids(survey_df_yeari.shape[0], CHUNK_SIZE)


Observation: 0/4
Error ---
21079    IA201400180123
Name: uid, dtype: object


  errors_df = errors_df.append(survey_df_yeari_chunki[['uid']], ignore_index = True)


Observation: 0/4
Error ---
21686    IA201400180027
Name: uid, dtype: object


  errors_df = errors_df.append(survey_df_yeari_chunki[['uid']], ignore_index = True)


Observation: 0/4
Error ---
36090    IA201400180081
Name: uid, dtype: object


  errors_df = errors_df.append(survey_df_yeari_chunki[['uid']], ignore_index = True)


Observation: 0/4
Error ---
49594    IA201400180052
Name: uid, dtype: object


  errors_df = errors_df.append(survey_df_yeari_chunki[['uid']], ignore_index = True)


/Users/robmarty/Dropbox/World Bank/IEs/Pakistan Poverty Estimation from Satellites/Data/DHS/FinalData/Individual Datasets/cnn_s2_viirs_underiaTrue/tfrecords/nocnn_IA_4_22_all_20221212115634.tfrecord
Success \o/
Putting 1 observations into nocnn_IA_4_23_all_20221212115634.tfrecord


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  survey_df_yeari['chunk_id'] = utils.chunk_ids(survey_df_yeari.shape[0], CHUNK_SIZE)


Observation: 0/1
Error ---
37212    IA201400180030
Name: uid, dtype: object


  errors_df = errors_df.append(survey_df_yeari_chunki[['uid']], ignore_index = True)


/Users/robmarty/Dropbox/World Bank/IEs/Pakistan Poverty Estimation from Satellites/Data/DHS/FinalData/Individual Datasets/cnn_s2_viirs_underiaTrue/tfrecords/nocnn_IA_4_23_all_20221212115634.tfrecord
Success \o/
Putting 1 observations into nocnn_IA_4_3_all_20221212115634.tfrecord


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  survey_df_yeari['chunk_id'] = utils.chunk_ids(survey_df_yeari.shape[0], CHUNK_SIZE)


Observation: 0/1
Error ---
16905    IA201400180028
Name: uid, dtype: object


  errors_df = errors_df.append(survey_df_yeari_chunki[['uid']], ignore_index = True)


/Users/robmarty/Dropbox/World Bank/IEs/Pakistan Poverty Estimation from Satellites/Data/DHS/FinalData/Individual Datasets/cnn_s2_viirs_underiaTrue/tfrecords/nocnn_IA_4_3_all_20221212115634.tfrecord
Success \o/
Putting 3 observations into nocnn_IA_4_4_all_20221212115634.tfrecord


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  survey_df_yeari['chunk_id'] = utils.chunk_ids(survey_df_yeari.shape[0], CHUNK_SIZE)


Observation: 0/3
Error ---
26794    IA201400180083
Name: uid, dtype: object


  errors_df = errors_df.append(survey_df_yeari_chunki[['uid']], ignore_index = True)


Observation: 0/3
Error ---
34444    IA201400180100
Name: uid, dtype: object


  errors_df = errors_df.append(survey_df_yeari_chunki[['uid']], ignore_index = True)


Observation: 0/3
Error ---
62071    IA201400180133
Name: uid, dtype: object


  errors_df = errors_df.append(survey_df_yeari_chunki[['uid']], ignore_index = True)
