In [1]:
import pandas as pd
import geopandas as gpd
import fiona
import rasterio
from rasterio.mask import mask
from rasterio.plot import show
from tqdm import tqdm
import os
# Register tqdm's progress-aware pandas methods (e.g. `progress_apply`).
# NOTE(review): no `progress_*` call is visible in this notebook — confirm this is still needed.
tqdm.pandas()

In [2]:
# Directory tree that is searched for footprint shapefiles.
root_dir = '/gypsum/scratch1/jtaneja/DG/DG_new'
# Accumulator for the per-shapefile frames read further down.
df_lst = list()

In [3]:
# Walk every subdirectory of root_dir and read each shapefile (*.shp) found
# into a GeoDataFrame, collecting the frames in df_lst.
#
# The accumulator is (re)initialised here, in the same cell that fills it, so
# re-running this cell does not append a second copy of every shapefile.
df_lst = []
for subdir, dirs, files in tqdm(os.walk(root_dir)):
    for file in files:
        if file.endswith('.shp'):
            file_path = os.path.join(subdir, file)
            df = gpd.read_file(file_path)
            df_lst.append(df)

128it [02:15,  1.06s/it]


In [4]:
# Stack the per-shapefile frames row-wise into a single GeoDataFrame,
# discarding the original per-file indices in favour of a fresh 0..n-1 index.
metadata_df = gpd.GeoDataFrame(
    pd.concat(objs=df_lst, axis=0, ignore_index=True)
)

In [5]:
# (rows, columns) of the combined metadata table before de-duplication.
metadata_df.shape

(21537, 11)

In [6]:
# Preview the first rows to see which attribute columns the shapefiles carry.
metadata_df.head()

Unnamed: 0,FILENAME,CATALOG_ID,ACQ_DATE,ONA,CC,SUNEL,SENSOR,ACCURACY,TILE_TYPE,VERSION,geometry
0,AOP_AF17_Q317_V0_502_309_133_9/AOP_AF17_Q317_V...,1040010007774300,2015-02-02,24.552141,0.0,58.511749,WV03_VNIR,10.2,SVA,AOP-Processing_1.1.8.1,"MULTIPOLYGON (((37.88084 4.01177, 37.88084 4.0..."
1,AOP_AF17_Q317_V0_502_309_133_9/AOP_AF17_Q317_V...,1050410001225D00,2011-01-06,12.898581,0.0,54.994518,GE01,10.2,SVA,AOP-Processing_1.1.8.1,"POLYGON ((37.80711 4.01204, 37.80713 4.01204, ..."
2,AOP_AF17_Q317_V0_502_309_133_9/AOP_AF17_Q317_V...,1040010007397700,2015-02-02,25.988863,0.0,58.584145,WV03_VNIR,10.2,SVA,AOP-Processing_1.1.8.1,"POLYGON ((37.88085 4.04297, 37.88086 4.04297, ..."
3,AOP_AF17_Q317_V0_502_309_133_9/AOP_AF17_Q317_V...,1040010007397700,2015-02-02,25.988863,0.0,58.584145,WV03_VNIR,10.2,SVA,AOP-Processing_1.1.8.1,"MULTIPOLYGON (((37.88737 3.98100, 37.88737 3.9..."
4,AOP_AF17_Q317_V0_502_309_133_9/AOP_AF17_Q317_V...,1030010024ABAA00,2013-07-05,11.839989,0.0,63.663494,WV02,10.2,SVA,AOP-Processing_1.1.8.1,"MULTIPOLYGON (((37.88110 3.97349, 37.88113 3.9..."


In [7]:
# Count the rows that share all of the identifying attributes below with at
# least one other row (keep=False flags every member of a duplicate group).
metadata_df.loc[
    metadata_df.duplicated(
        subset=[
            'FILENAME', 'CATALOG_ID', 'ACQ_DATE', 'SENSOR',
            'ACCURACY', 'TILE_TYPE', 'VERSION',
        ],
        keep=False,
    )
].shape

(1280, 11)

In [8]:
# Remove rows that are duplicated on the identifying attributes below.
# NOTE(review): keep=False discards *every* member of a duplicate group, not
# just the extra copies — confirm that losing those records entirely is intended
# (keep='first' would retain one row per group).
metadata_df = metadata_df.drop_duplicates(
    subset=[
        'FILENAME', 'CATALOG_ID', 'ACQ_DATE', 'SENSOR',
        'ACCURACY', 'TILE_TYPE', 'VERSION',
    ],
    keep=False,
)

In [9]:
# (rows, columns) after the duplicate rows have been dropped.
metadata_df.shape

(20257, 11)

In [10]:
# Sanity check: after de-duplication no row should share the identifying
# attributes below with another row, so the row count here is expected to be 0.
metadata_df.loc[
    metadata_df.duplicated(
        subset=[
            'FILENAME', 'CATALOG_ID', 'ACQ_DATE', 'SENSOR',
            'ACCURACY', 'TILE_TYPE', 'VERSION',
        ],
        keep=False,
    )
].shape

(0, 11)

In [11]:
# Number of metadata rows per value of the SENSOR column.
metadata_df['SENSOR'].value_counts()

WV02         11687
GE01          5084
WV03_VNIR     2376
QB02          1110
Name: SENSOR, dtype: int64

In [13]:
# Show the coordinate reference system attached to the geometry column.
metadata_df.crs

<Geographic 2D CRS: EPSG:4326>
Name: WGS 84
Axis Info [ellipsoidal]:
- Lat[north]: Geodetic latitude (degree)
- Lon[east]: Geodetic longitude (degree)
Area of Use:
- name: World.
- bounds: (-180.0, -90.0, 180.0, 90.0)
Datum: World Geodetic System 1984 ensemble
- Ellipsoid: WGS 84
- Prime Meridian: Greenwich

In [12]:
# Dropping rows left gaps in the index; renumber it 0..n-1 and discard the old labels.
metadata_df = metadata_df.reset_index(drop=True)

In [15]:
# Persist the de-duplicated metadata as a GeoPackage. This takes a while.
# NOTE(review): the recorded output of this cell is a KeyboardInterrupt, so the
# file on disk may be missing or incomplete — confirm before relying on it.
metadata_df.to_file("/work/scorreacardo_umass_edu/DeepSatGSD/data/interim/dg_metadata.gpkg", driver="GPKG")


KeyboardInterrupt



In [14]:
# For every FILENAME, count how many distinct SENSOR values appear in its rows.
sensor_counts = metadata_df.groupby('FILENAME')['SENSOR'].nunique()
sensor_counts.head()

FILENAME
AOP_AF15_Q217_V0_494_304_133_9/AOP_AF15_Q217_V0_494_304_133_9_R1C1.tif    2
AOP_AF15_Q217_V0_494_304_133_9/AOP_AF15_Q217_V0_494_304_133_9_R1C2.tif    1
AOP_AF15_Q217_V0_494_304_133_9/AOP_AF15_Q217_V0_494_304_133_9_R1C3.tif    2
AOP_AF15_Q217_V0_494_304_133_9/AOP_AF15_Q217_V0_494_304_133_9_R1C4.tif    2
AOP_AF15_Q217_V0_494_304_133_9/AOP_AF15_Q217_V0_494_304_133_9_R1C5.tif    2
Name: SENSOR, dtype: int64

In [15]:
# Check what kind of object the groupby/nunique call returned.
type(sensor_counts)

pandas.core.series.Series

In [16]:
# Turn the FILENAME-indexed Series into a two-column frame (FILENAME, SENSOR).
# Guarded on the type so that re-running this cell is a no-op: resetting the
# index of the already-converted frame would add a spurious `index` column.
if isinstance(sensor_counts, pd.Series):
    sensor_counts = sensor_counts.reset_index()

In [17]:
# Preview the per-FILENAME distinct-sensor counts after the index reset.
sensor_counts.head()

Unnamed: 0,FILENAME,SENSOR
0,AOP_AF15_Q217_V0_494_304_133_9/AOP_AF15_Q217_V...,2
1,AOP_AF15_Q217_V0_494_304_133_9/AOP_AF15_Q217_V...,1
2,AOP_AF15_Q217_V0_494_304_133_9/AOP_AF15_Q217_V...,2
3,AOP_AF15_Q217_V0_494_304_133_9/AOP_AF15_Q217_V...,2
4,AOP_AF15_Q217_V0_494_304_133_9/AOP_AF15_Q217_V...,2


In [18]:
# How many FILENAMEs have 1, 2, 3, ... distinct sensors.
sensor_counts.SENSOR.value_counts()

1    3775
2    3349
3     678
4      24
Name: SENSOR, dtype: int64

In [19]:
# Look at the images that were captured by 4 different sensors:
# keep every metadata row whose FILENAME has a distinct-sensor count of 4,
# then renumber the rows 0..n-1.
meta_4_sensors = (
    metadata_df[
        metadata_df['FILENAME'].isin(
            sensor_counts.loc[sensor_counts['SENSOR'] == 4, 'FILENAME'].tolist()
        )
    ]
    .reset_index(drop=True)
)
meta_4_sensors.head()

Unnamed: 0,FILENAME,CATALOG_ID,ACQ_DATE,ONA,CC,SUNEL,SENSOR,ACCURACY,TILE_TYPE,VERSION,geometry
0,AOP_AF20_Q417_V0_508_306_129_9/AOP_AF20_Q417_V...,1050010007C10D00,2017-01-12,2.535889,0.0,57.409065,GE01,10.2,SVA,AOP-Processing_1.1.8.1,"POLYGON ((35.27057 1.05893, 35.27057 1.05894, ..."
1,AOP_AF20_Q417_V0_508_306_129_9/AOP_AF20_Q417_V...,1030010007832C00,2010-10-01,15.752147,72.842246,71.755409,WV02,10.2,SVA,AOP-Processing_1.1.8.1,"POLYGON ((35.24414 1.14258, 35.26766 1.14258, ..."
2,AOP_AF20_Q417_V0_508_306_129_9/AOP_AF20_Q417_V...,104001002D437900,2017-06-10,18.950382,3.148776,61.670254,WV03_VNIR,10.2,SVA,AOP-Processing_1.1.8.1,"POLYGON ((35.33203 1.14257, 35.33203 1.14257, ..."
3,AOP_AF20_Q417_V0_508_306_129_9/AOP_AF20_Q417_V...,101001000E9AA900,2011-12-13,11.700283,4.579706,52.611244,QB02,25.4,SVA,AOP-Processing_1.1.8.1,"MULTIPOLYGON (((35.30076 1.14227, 35.30076 1.1..."
4,AOP_AF20_Q417_V0_508_306_129_9/AOP_AF20_Q417_V...,101001000CEC7500,2011-01-30,17.573339,3.329182,54.15155,QB02,25.4,SVA,AOP-Processing_1.1.8.1,"POLYGON ((35.30067 1.14254, 35.30067 1.14251, ..."


In [29]:
# (rows, columns) of the 4-sensor subset.
meta_4_sensors.shape

(122, 11)

In [20]:
# Save the 4-sensor subset as its own GeoPackage.
meta_4_sensors.to_file("/work/scorreacardo_umass_edu/DeepSatGSD/data/interim/dg_metadata_sensorcount4.gpkg", driver="GPKG")

In [21]:
# Look at the images that were captured by 3 different sensors:
# keep every metadata row whose FILENAME has a distinct-sensor count of 3,
# then renumber the rows 0..n-1. The index reset is chained onto the selection
# (rather than done in place on the filtered slice) so the cell produces its
# result in a single assignment.
meta_3_sensors = (
    metadata_df[
        metadata_df['FILENAME'].isin(
            sensor_counts.loc[sensor_counts['SENSOR'] == 3, 'FILENAME'].tolist()
        )
    ]
    .reset_index(drop=True)
)
meta_3_sensors.head()

Unnamed: 0,FILENAME,CATALOG_ID,ACQ_DATE,ONA,CC,SUNEL,SENSOR,ACCURACY,TILE_TYPE,VERSION,geometry
0,AOP_AF17_Q317_V0_502_309_133_9/AOP_AF17_Q317_V...,1040010007397700,2015-02-02,25.988863,0.0,58.584145,WV03_VNIR,10.2,SVA,AOP-Processing_1.1.8.1,"MULTIPOLYGON (((37.88737 3.98100, 37.88737 3.9..."
1,AOP_AF17_Q317_V0_502_309_133_9/AOP_AF17_Q317_V...,1030010024ABAA00,2013-07-05,11.839989,0.0,63.663494,WV02,10.2,SVA,AOP-Processing_1.1.8.1,"MULTIPOLYGON (((37.88110 3.97349, 37.88113 3.9..."
2,AOP_AF17_Q317_V0_502_309_133_9/AOP_AF17_Q317_V...,1050410001225D00,2011-01-06,12.898581,0.0,54.994518,GE01,10.2,SVA,AOP-Processing_1.1.8.1,"POLYGON ((37.88353 3.97415, 37.88353 3.97414, ..."
3,AOP_AF17_Q317_V0_502_309_133_9/AOP_AF17_Q317_V...,1050410001225D00,2011-01-06,15.757914,0.0,54.778412,GE01,10.2,SVA,AOP-Processing_1.1.8.1,"POLYGON ((37.44956 3.95508, 37.44141 3.95508, ..."
4,AOP_AF17_Q317_V0_502_309_133_9/AOP_AF17_Q317_V...,104001002AD43900,2017-03-21,18.025932,0.0,70.07683,WV03_VNIR,10.2,SVA,AOP-Processing_1.1.8.1,"POLYGON ((37.44944 4.04297, 37.52929 4.04297, ..."


In [22]:
# (rows, columns) of the 3-sensor subset.
meta_3_sensors.shape

(2729, 11)

In [24]:
# Save the 3-sensor subset as its own GeoPackage.
meta_3_sensors.to_file("/work/scorreacardo_umass_edu/DeepSatGSD/data/interim/dg_metadata_sensorcount3.gpkg", driver="GPKG")

In [25]:
# Cut each footprint polygon of the 4-sensor subset out of its source TIFF and
# write the result to <output_dir>/<SENSOR>/<tile>_<ACQ_DATE>_<SENSOR>.tif.
output_dir = "/work/scorreacardo_umass_edu/DeepSatGSD/data/interim"
input_dir = "/gypsum/scratch1/jtaneja/DG/DG_new"

# Output paths written so far in this run. The default name is built from the
# tile, date and sensor only, whereas the metadata rows were de-duplicated on a
# key that also includes CATALOG_ID, so two rows can map to the same name.
# Without this bookkeeping the later row would silently overwrite the earlier one.
written_paths = set()

# Loop through the DataFrame rows; `total` lets tqdm render a real progress bar,
# since the iterrows() generator has no length of its own.
for index, row in tqdm(meta_4_sensors.iterrows(), total=len(meta_4_sensors)):
    # Extract the file name, geometry, sensor and acquisition date
    filename = row['FILENAME']
    geometry = row['geometry']
    sensor = row['SENSOR']
    date = row['ACQ_DATE']

    # Load the TIFF file
    with rasterio.open(os.path.join(input_dir, filename)) as src:
        # Mask the TIFF to the footprint and crop to its bounding window.
        # NOTE(review): assumes the footprint geometry and the raster share a
        # CRS — TODO confirm, no reprojection is done here.
        masked_data, masked_transform = mask(src, [geometry], crop=True)
        masked_meta = src.meta

    # Set the output file path (one sub-directory per sensor)
    output_subdir = os.path.join(output_dir, sensor)
    os.makedirs(output_subdir, exist_ok=True)
    # splitext strips the extension whatever its length (".tif", ".tiff", ...)
    tile_stem = os.path.splitext(os.path.basename(filename))[0]
    output_filename = os.path.join(output_subdir, f"{tile_stem}_{str(date)}_{sensor}.tif")
    if output_filename in written_paths:
        # Name already used by an earlier row: disambiguate with the catalog id
        # and the (unique) row index instead of overwriting that file.
        output_filename = os.path.join(
            output_subdir,
            f"{tile_stem}_{str(date)}_{sensor}_{row['CATALOG_ID']}_{index}.tif",
        )
    written_paths.add(output_filename)

    # Update the metadata for the masked TIFF: the crop changes the georeferencing
    # transform and the raster size (masked_data is indexed bands, rows, cols).
    masked_meta.update({
        'transform': masked_transform,
        'height': masked_data.shape[1],
        'width': masked_data.shape[2]
    })

    # Save the masked TIFF file
    with rasterio.open(output_filename, 'w', **masked_meta) as dst:
        dst.write(masked_data)

122it [11:23,  5.60s/it]
