# 02 - Metadata generation
Check for missing s3 files, S1 images with missing pixels, S2 images with fully obscured pixels

In [47]:
from botocore.exceptions import ClientError
import boto3
import calendar
import io
from itertools import product
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import tifffile as tif
from tqdm import tqdm


In [45]:
DATA_DIR = 'data'
BUCKET_NAME = 'drivendata-competition-biomassters-public-us'

month_map = {month: index for index, month in enumerate(calendar.month_name) if month}


In [3]:
def load_tif_from_s3(bucket, key):
    # https://stackoverflow.com/questions/44043036/how-to-read-image-file-from-s3-bucket-directly-into-memory
    try:
        object = bucket.Object(key)
        img_data = object.get().get('Body').read()
        return tif.imread(io.BytesIO(img_data))
    except ClientError as ex:
        if ex.response['Error']['Code'] == 'NoSuchKey':
            print("Error: No such key")
            img = None

In [4]:
s3 = boto3.resource(
    's3',
    aws_access_key_id=os.environ['AWS_ACCESS_KEY_ID'],
    aws_secret_access_key=os.environ['AWS_SECRET_ACCESS_KEY']
)
bucket = s3.Bucket(BUCKET_NAME)

train_features = pd.read_csv(os.path.join(DATA_DIR, "features_metadata_FzP19JI.csv"))

In [11]:

s1_missing_pixel_value = -9999 
s2_fully_obscured_value = 256

train_features['file_not_found'] = None
train_features['num_s1_missing'] = None
train_features['num_s2_obscured'] = None

idx_start_0 = 200000
idx_start = idx_start_0
chunk_size = 5e3
for idx, row in tqdm(train_features[idx_start_0:].iterrows(), position=0, leave=True):
    key = f'{row.split}_features/{row.filename}'
    img = load_tif_from_s3(bucket, key)
    if type(img) == np.ndarray:
        if row['satellite'] == 'S1':
            train_features.loc[idx, 'num_s1_missing'] = np.count_nonzero(img == s1_missing_pixel_value)
        elif row['satellite'] == 'S2':
            train_features.loc[idx, 'num_s2_obscured'] = np.count_nonzero(img[:,:,-1] == s2_fully_obscured_value)
    else:
        train_features.file_not_found = True
    if (idx % chunk_size == 0) and (idx > idx_start):
        train_features[idx_start:idx].to_csv(os.path.join(DATA_DIR, f'features_meta_data_{int(idx//chunk_size):03}.csv'), index=True)
        idx_start=idx
        break
train_features[idx_start:].to_csv(os.path.join(DATA_DIR, f'features_meta_data_{int(idx//chunk_size)+1:03}.csv'), index=True)


5000it [18:52,  4.31it/s]

In [42]:
num_chunks = len(train_features)//int(chunk_size)+1
df_list = []
for i in range(1,num_chunks+1):
    df_list.append(pd.read_csv(f'data/metadata/features_meta_data_{i:03}.csv',index_col=0))
train_features_aug_raw = pd.concat(df_list)
assert train_features_aug_raw.iloc[:,:11].equals(train_features.iloc[:,:11])

In [41]:
# train_features_aug_raw.to_csv(f'data/metadata/features_metadata_aug_raw.csv', index=True)

In [50]:
# load raw metadata
df_metadata_raw = train_features_aug_raw.copy()

# remap month field
df_metadata_raw['month'] = df_metadata_raw.month.apply(lambda x: month_map[x])

all_chips = list(df_metadata_raw.chip_id.unique())
months = range(1,13)
satellites = ['S1', 'S2']

df_metadata_full = pd.DataFrame(product(all_chips,satellites,months), columns=['chip_id','satellite','month'])
df_metadata_full = df_metadata_full.merge(df_metadata_raw, on=['chip_id','month','satellite'], how='left')
df_metadata_full['corresponding_agbm'] = df_metadata_full.chip_id.apply(lambda x: f'{x}_agbm.tif')

df_metadata_full.to_csv('data/metadata/features_metadata.csv', index=True)

In [51]:
df_metadata_full.head()

Unnamed: 0,chip_id,satellite,month,filename,split,size,cksum,s3path_us,s3path_eu,s3path_as,corresponding_agbm,file_not_found,num_s1_missing,num_s2_obscured
0,0003d2eb,S1,1,0003d2eb_S1_04.tif,train,1049524.0,2467836000.0,s3://drivendata-competition-biomassters-public...,s3://drivendata-competition-biomassters-public...,s3://drivendata-competition-biomassters-public...,0003d2eb_agbm.tif,,0.0,
1,0003d2eb,S1,2,0003d2eb_S1_05.tif,train,1049524.0,2955838000.0,s3://drivendata-competition-biomassters-public...,s3://drivendata-competition-biomassters-public...,s3://drivendata-competition-biomassters-public...,0003d2eb_agbm.tif,,0.0,
2,0003d2eb,S1,3,0003d2eb_S1_06.tif,train,1049524.0,938913200.0,s3://drivendata-competition-biomassters-public...,s3://drivendata-competition-biomassters-public...,s3://drivendata-competition-biomassters-public...,0003d2eb_agbm.tif,,0.0,
3,0003d2eb,S1,4,0003d2eb_S1_07.tif,train,1049524.0,225964900.0,s3://drivendata-competition-biomassters-public...,s3://drivendata-competition-biomassters-public...,s3://drivendata-competition-biomassters-public...,0003d2eb_agbm.tif,,0.0,
4,0003d2eb,S1,5,0003d2eb_S1_08.tif,train,1049524.0,135182000.0,s3://drivendata-competition-biomassters-public...,s3://drivendata-competition-biomassters-public...,s3://drivendata-competition-biomassters-public...,0003d2eb_agbm.tif,,0.0,
