In [55]:
import os
import re
from glob import glob, iglob
import pandas as pd

In [56]:
# Root of the dataset. The cells below glob exactly one directory level beneath
# it (`*/<file>`) and write/read `files.csv` directly inside it.
data_directory = '/datasets/rpartsey/satellite/planet/SNP_Planet_Scenes_2017_Summer_Autumn_I'

In [57]:
def iter_files(glob_pattern, regex_patter=None):
    """Lazily yield paths matching ``glob_pattern``, optionally regex-filtered.

    Args:
        glob_pattern: Pattern handed to ``glob.iglob``.
        regex_patter: Optional compiled regular expression. When given, only
            paths for which ``regex_patter.match(path)`` is truthy are kept.

    Returns:
        An iterator over the selected paths.
    """
    candidates = iglob(glob_pattern)
    if regex_patter is None:
        return candidates
    return (path for path in candidates if regex_patter.match(path))
    

# Analytic rasters end in `_Analytic.tif` or `_AnalyticMS.tif`. The regex is
# needed because the `*/*.tif` glob also picks up the `_DN_udm.tif` masks.
image_name_pattern = re.compile(r'.*_Analytic(MS)?\.tif$')
# Fixed: the pattern used to read `_metadata?`, where the stray `?` made the
# final "a" optional and would have accepted `_metadat.xml` as well.
metadata_xml_name_pattern = re.compile(r'.*_Analytic(MS)?_metadata\.xml$')

# Collect every file kind from the subdirectories directly below `data_directory`.
image_files = list(iter_files(os.path.join(data_directory, '*/*.tif'), image_name_pattern))
udm_mask_files = list(iter_files(os.path.join(data_directory, '*/*_DN_udm.tif')))
shape_files = list(iter_files(os.path.join(data_directory, '*/*.shp')))
xml_files = list(iter_files(os.path.join(data_directory, '*/*_metadata.xml'), metadata_xml_name_pattern))
json_files = list(iter_files(os.path.join(data_directory, '*/*_metadata.json')))

# The counts should agree; a mismatch means some subdirectory is missing a file kind.
print('Number of images:', len(image_files))
print('Number of udm masks:', len(udm_mask_files))
print('Number of shapes:', len(shape_files))
print('Number of xmls:', len(xml_files))
print('Number of jsons:', len(json_files))

Number of images: 30
Number of udm masks: 30
Number of shapes: 29
Number of xmls: 30
Number of jsons: 30


In [58]:
# One single-column frame per file kind; the column name identifies the kind.
image_df, udm_mask_df, shape_df, xml_df, json_df = (
    pd.DataFrame({column: paths})
    for column, paths in (
        ('image', image_files),
        ('udm_mask', udm_mask_files),
        ('shape_file', shape_files),
        ('xml', xml_files),
        ('json', json_files),
    )
)


def transform(df):
    """Index a file table by its parent-directory name and keep only file names.

    The ``id`` of a row is the name of the directory that directly contains
    the file referenced in the first column.

    Args:
        df: DataFrame whose first column holds file paths.

    Returns:
        A new DataFrame with the same columns, indexed by ``id``, whose first
        column holds only the file names. ``df`` itself is left unmodified
        (the previous implementation added an ``id`` column to it).
    """
    paths = df.iloc[:, 0]
    # Work on a copy so the caller's frame is never mutated.
    result = df.copy()
    result.iloc[:, 0] = paths.map(os.path.basename)
    # os.path instead of split('/') so the id does not depend on the separator.
    result.index = pd.Index(
        paths.map(lambda path: os.path.basename(os.path.dirname(path))),
        name='id',
    )
    return result

# Re-index every per-kind frame by subdirectory id, in the same order as before.
image_df, udm_mask_df, shape_df, xml_df, json_df = map(
    transform,
    (image_df, udm_mask_df, shape_df, xml_df, json_df),
)

In [59]:
# Align the per-kind frames side by side on their shared `id` index; an id
# missing from one frame ends up as NaN in that frame's column.
files_df = pd.concat(
    [image_df, udm_mask_df, shape_df, xml_df, json_df],
    axis='columns',
    join='outer',
    sort=False,
)

In [60]:
# Sanity check: (rows, columns) of the combined table, then a preview of its first rows.
print(files_df.shape)
files_df.head()

(30, 5)


Unnamed: 0,image,udm_mask,shape_file,xml,json
20170806_075516_1033,20170806_075516_1033_3B_AnalyticMS.tif,20170806_075516_1033_3B_AnalyticMS_DN_udm.tif,20170806_075516_1033.shp,20170806_075516_1033_3B_AnalyticMS_metadata.xml,20170806_075516_1033_metadata.json
20170814_075356_102e,20170814_075356_102e_3B_AnalyticMS.tif,20170814_075356_102e_3B_AnalyticMS_DN_udm.tif,20170814_075356_102e.shp,20170814_075356_102e_3B_AnalyticMS_metadata.xml,20170814_075356_102e_metadata.json
20170809_075116_1041,20170809_075116_1041_3B_AnalyticMS.tif,20170809_075116_1041_3B_AnalyticMS_DN_udm.tif,20170809_075116_1041.shp,20170809_075116_1041_3B_AnalyticMS_metadata.xml,20170809_075116_1041_metadata.json
20171011_075449_1015,20171011_075449_1015_3B_AnalyticMS.tif,20171011_075449_1015_3B_AnalyticMS_DN_udm.tif,20171011_075449_1015.shp,20171011_075449_1015_3B_AnalyticMS_metadata.xml,20171011_075449_1015_metadata.json
20170629_075104_1044,20170629_075104_1044_3B_AnalyticMS.tif,20170629_075104_1044_3B_AnalyticMS_DN_udm.tif,20170629_075104_1044.shp,20170629_075104_1044_3B_AnalyticMS_metadata.xml,20170629_075104_1044_metadata.json


In [61]:
# A subdirectory is invalid when at least one of its expected files is missing.
invalid_subdirectories = files_df.loc[files_df.isna().any(axis='columns')]
print('Number of invalid subdirectories: ', len(invalid_subdirectories))
invalid_subdirectories.head()

Number of invalid subdirectories:  1


Unnamed: 0,image,udm_mask,shape_file,xml,json
20170812_075232_1002,20170812_075232_1002_3B_Analytic.tif,20170812_075232_1002_3B_Analytic_DN_udm.tif,,20170812_075232_1002_3B_Analytic_metadata.xml,20170812_075232_1002_metadata.json


In [62]:
# Keep only the rows in which every column has a value (rows containing NaN are dropped).
files_df = files_df.dropna(axis=0)
files_df.shape

(29, 5)

In [63]:
# Persist the table inside the data directory; index=True writes the row index as the first CSV column.
files_df.to_csv(os.path.join(data_directory, 'files.csv'), index=True)

In [64]:
# Round-trip check: reload the CSV with its first column as the index, then
# confirm the shape and preview the first rows.
files_df = pd.read_csv(os.path.join(data_directory, 'files.csv'), index_col=0)
print(files_df.shape)
files_df.head()

(29, 5)


Unnamed: 0,image,udm_mask,shape_file,xml,json
20170806_075516_1033,20170806_075516_1033_3B_AnalyticMS.tif,20170806_075516_1033_3B_AnalyticMS_DN_udm.tif,20170806_075516_1033.shp,20170806_075516_1033_3B_AnalyticMS_metadata.xml,20170806_075516_1033_metadata.json
20170814_075356_102e,20170814_075356_102e_3B_AnalyticMS.tif,20170814_075356_102e_3B_AnalyticMS_DN_udm.tif,20170814_075356_102e.shp,20170814_075356_102e_3B_AnalyticMS_metadata.xml,20170814_075356_102e_metadata.json
20170809_075116_1041,20170809_075116_1041_3B_AnalyticMS.tif,20170809_075116_1041_3B_AnalyticMS_DN_udm.tif,20170809_075116_1041.shp,20170809_075116_1041_3B_AnalyticMS_metadata.xml,20170809_075116_1041_metadata.json
20171011_075449_1015,20171011_075449_1015_3B_AnalyticMS.tif,20171011_075449_1015_3B_AnalyticMS_DN_udm.tif,20171011_075449_1015.shp,20171011_075449_1015_3B_AnalyticMS_metadata.xml,20171011_075449_1015_metadata.json
20170629_075104_1044,20170629_075104_1044_3B_AnalyticMS.tif,20170629_075104_1044_3B_AnalyticMS_DN_udm.tif,20170629_075104_1044.shp,20170629_075104_1044_3B_AnalyticMS_metadata.xml,20170629_075104_1044_metadata.json
