# Import Libraries

In [53]:
import os
import sys
import cv2
import random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import multiprocessing as mp
from tqdm import tqdm
from netCDF4 import Dataset
from collections import defaultdict 
from datetime import timedelta, date
from s2cloudless import S2PixelCloudDetector

# Define Paths

In [54]:
cwd = os.getcwd()
data_path = open('path_to_data.txt').read()
print(cwd, '\n', data_path)

/home/sarthakj/code/AI/projects/summers/summer_2024/AFRL 
 /home/sarthakj/data/MultiEarth/multiearth2023-dataset-final/data


# Metadata

In [55]:
# Define the path to the data directory
data_directory = data_path
DIR = sorted(os.listdir(data_directory))
#DIR = [file for file in DIR if ('deforestation' in file)]
DIR = [file for file in DIR if (('fire' not in file))]

# Initialize an empty dictionary to store the file names and their data point counts
data_counts = {}

# Loop through each file in the data directory
for filename in DIR:
    if filename.endswith('.nc'):
        # Construct the full file path
        file_path = os.path.join(data_directory, filename)
        
        # Open the .nc file
        with Dataset(file_path, 'r') as nc_file:
            # Assuming the data variable is named 'data', adjust as necessary
            data_variable = nc_file.variables['index']
            
            # Count the number of data points
            num_data_points = data_variable.size
            
            # Store the count in the dictionary
            data_counts[filename] = num_data_points

# Create a DataFrame from the dictionary
df = pd.DataFrame(list(data_counts.items()), columns=['File Name', 'Number of Data Points'])

# Display the DataFrame
print(df)

                                 File Name  Number of Data Points
0    deforestation_segmentation_targets.nc                   1000
1                   deforestation_train.nc                  17215
2                   landsat5_prediction.nc                  34965
3                        landsat5_train.nc                 429080
4   landsat8_deforestation_segmentation.nc                  10494
5                   landsat8_prediction.nc                   4483
6                        landsat8_train.nc                 233225
7      sent1_deforestation_segmentation.nc                  24131
8                      sent1_prediction.nc                   1302
9                          sent1_sar2eo.nc                   5000
10                          sent1_train.nc                 390353
11                    sent2_b1-b4_train.nc                 403163
12                    sent2_b5-b8_train.nc                 403163
13                   sent2_b9-b12_train.nc                 403163
14     sen

# NP Array Metadata

In [63]:
def np_metadata(arr, name='Array', print_unique_values=False):
    print()
    print('Name:', name)
    print('\tShape:', arr.shape)
    print('\tSize:', arr.size)
    print('\tDimensions:', arr.ndim)
    print('\tDtype:', arr.dtype)
    print('\tMemory Usage:', arr.nbytes, 'bytes')
    if not str(arr.dtype).startswith('<U'):
        print('\tMIN:', np.min(arr))
        print('\tMAX:', np.max(arr))
    print('\tNum Unique Values:', np.unique(arr, axis=0).size)
    if print_unique_values:
        print('\tUnique Values:', np.unique(arr, axis=0))
    print('\tPreview:', arr[:10])
    print()

def np_metadata_image(image, name='Image', print_unique_values=False):
    print()
    print('Name:', name)
    print('\tShape:', image.shape)
    print('\tSize:', image.size)
    print('\tDimensions:', image.ndim)
    print('\tDtype:', image.dtype)
    print('\tMemory Usage:', image.nbytes, 'bytes')
    print('\tMIN:', image.min())
    print('\tMAX:', image.max())
    print('\tNum Unique Values:', np.unique(image).size)
    if print_unique_values:
        print('\tUnique Values:', np.unique(image))
    '''
    print('\tPreview:')
    plt.imshow(image)
    plt.axis('off')
    plt.show()
    '''
    print()

# netCDF date interpreter

In [57]:
def days_from_date(n, date='2021-05-01'):
    date = pd.to_datetime(date)
    return (date + timedelta(days=int(n))).strftime('%Y-%m-%d')

def date_from_days(date, original_date='2018-12-13'):
    # determine how many days have passed since the original date
    days_passed = pd.to_datetime(date) - pd.to_datetime(original_date)
    return days_passed.days


# Look at Specific Files

### Deforestation Train

In [58]:
deforestation_train = Dataset(os.path.join(data_directory, 'deforestation_train.nc'))
print('Deforestation Train:', nc_file.variables.keys())
COORDS = np.array(deforestation_train.variables['center_lat_lons'])
np_metadata(COORDS.T[0], 'Latitude')
np_metadata(COORDS.T[1], 'Longitude')
np_metadata(COORDS, 'Label Coordinates')

# Deforestation Test Set Collction Dates
dates = deforestation_train.variables['collection_dates']
dates = np.array(dates[:])
print('Collection dates:', dates, '\tLength:', len(dates))
value, counts = np.unique(dates, return_counts=True)
Gregorian_dates = [days_from_date(i) for i in value]
print('Unique Collection Dates:', Gregorian_dates)

Deforestation Train: dict_keys(['index', 'data_band', 'channel', 'row', 'col', 'images', 'collection_dates', 'geo_coord', 'center_lat_lons'])

Name: Latitude
	Shape: (17215,)
	Size: 17215
	Dimensions: 1
	Dtype: float64
	Memory Usage: 137720 bytes
	MIN: -4.39
	MAX: -3.33
	Num Unique Values: 54
	Preview: [-4.09 -3.61 -4.17 -3.39 -4.27 -3.69 -3.59 -3.91 -3.79 -3.63]


Name: Longitude
	Shape: (17215,)
	Size: 17215
	Dimensions: 1
	Dtype: float64
	Memory Usage: 137720 bytes
	MIN: -55.2
	MAX: -54.48
	Num Unique Values: 29
	Preview: [-55.06 -54.92 -54.68 -55.1  -55.12 -55.08 -54.94 -55.12 -54.58 -54.48]


Name: Label Coordinates
	Shape: (17215, 2)
	Size: 34430
	Dimensions: 2
	Dtype: float64
	Memory Usage: 275440 bytes
	MIN: -55.2
	MAX: -3.33
	Num Unique Values: 3130
	Preview: [[ -4.09 -55.06]
 [ -3.61 -54.92]
 [ -4.17 -54.68]
 [ -3.39 -55.1 ]
 [ -4.27 -55.12]
 [ -3.69 -55.08]
 [ -3.59 -54.94]
 [ -3.91 -55.12]
 [ -3.79 -54.58]
 [ -3.63 -54.48]]

Collection dates: [ 0  0  0 ... 92 92 92] 	Length

### Sentinel-2 Train

In [65]:
sent2_b1_b4_train = Dataset(os.path.join(data_directory, 'sent2_b1-b4_train.nc'))
print('SENTINEL-2 B1-B4+QA60 Deforestation Train:', nc_file.variables.keys())
print('Image Shape:', sent2_b1_b4_train.variables['images'].shape)
COORDS = np.array(sent2_b1_b4_train.variables['center_lat_lons']).T
np_metadata(COORDS[0], 'Latitude')
np_metadata(COORDS[1], 'Longitude')
DATA_BANDS_1_4 = np.array(sent2_b1_b4_train.variables['data_band']).astype(str)
np_metadata(DATA_BANDS_1_4, 'Data Bands')

# Deforestation Test Set Collection Dates
dates = sent2_b1_b4_train.variables['collection_dates']
dates = np.array(dates[:])
print('Collection dates:', dates, '\tLength:', len(dates))
value, counts = np.unique(dates, return_counts=True)
Gregorian_dates = [days_from_date(i, '2018-12-13') for i in value]
print('Unique Collection Dates:', Gregorian_dates)
np_metadata(counts, 'Unique Collection Dates Counts')

sent2_b5_b8_train = Dataset(os.path.join(data_directory, 'sent2_b5-b8_train.nc'))
print('SENTINEL-2 B5-B8A+QA60 Deforestation Train:', nc_file.variables.keys())
DATA_BANDS_5_8 = np.array(sent2_b5_b8_train.variables['data_band']).astype(str)
np_metadata(DATA_BANDS_5_8, 'Data Bands')

sent2_b9_b12_train = Dataset(os.path.join(data_directory, 'sent2_b9-b12_train.nc'))
print('SENTINEL-2 B9-B12+QA60 Deforestation Train:', nc_file.variables.keys())
DATA_BANDS_9_12 = np.array(sent2_b9_b12_train.variables['data_band']).astype(str)
np_metadata(DATA_BANDS_9_12, 'Data Bands')

DATA_BANDS = np.concatenate((DATA_BANDS_1_4, DATA_BANDS_5_8[:-1], DATA_BANDS_9_12[:-1]), axis=0)
DATA_BANDS_DICT = {DATA_BANDS[i] : i for i in range(DATA_BANDS.shape[0])}
print('Data Bands:', DATA_BANDS)

SENTINEL-2 B1-B4+QA60 Deforestation Train: dict_keys(['index', 'data_band', 'channel', 'row', 'col', 'images', 'collection_dates', 'geo_coord', 'center_lat_lons'])
Image Shape: (403163, 5, 1, 256, 256)

Name: Latitude
	Shape: (403163,)
	Size: 403163
	Dimensions: 1
	Dtype: float64
	Memory Usage: 3225304 bytes
	MIN: -4.39
	MAX: -3.33
	Num Unique Values: 54
	Preview: [-4.39 -4.39 -4.39 -4.39 -4.39 -4.39 -4.39 -4.39 -4.39 -4.39]


Name: Longitude
	Shape: (403163,)
	Size: 403163
	Dimensions: 1
	Dtype: float64
	Memory Usage: 3225304 bytes
	MIN: -55.2
	MAX: -54.48
	Num Unique Values: 29
	Preview: [-55.2 -55.2 -55.2 -55.2 -55.2 -55.2 -55.2 -55.2 -55.2 -55.2]


Name: Data Bands
	Shape: (5,)
	Size: 5
	Dimensions: 1
	Dtype: <U4
	Memory Usage: 80 bytes
	Num Unique Values: 5
	Preview: ['B1' 'B2' 'B3' 'B4' 'QA60']

Collection dates: [   0    5   10 ... 1100 1105 1110] 	Length: 403163
Unique Collection Dates: ['2018-12-13', '2018-12-16', '2018-12-18', '2018-12-21', '2018-12-23', '2018-12-26', '2018-1

# Coordinate Analysis

In [66]:
def sent2_coordinate_analysis():
    # open sent2_valid_image_index.npy
    valid_index = np.load('sent2_valid_image_index.npy')
    #valid_index = np.arange(len(sent2_b1_b4_train.variables['index']))
    print('Valid Index:', valid_index.shape)

    COORDS = np.array(sent2_b1_b4_train.variables['center_lat_lons'])
    valid_data_COORDS = COORDS[valid_index]
    np_metadata(valid_data_COORDS.T[0], 'Latitude')
    np_metadata(valid_data_COORDS.T[1], 'Longitude')
    np_metadata(valid_data_COORDS, 'Valid Data Coordinates')

    DATES = np.array(sent2_b1_b4_train.variables['collection_dates'])
    valid_data_dates = DATES[valid_index]
    np_metadata(valid_data_dates, 'Valid Data Dates')
    STRING_valid_data_dates = np.array([days_from_date(i, '2018-12-13') for i in valid_data_dates]).astype(str)
    np_metadata(STRING_valid_data_dates, 'String Valid Data Dates')

    TIME_FOR_COORDS = defaultdict(list)
    for i, idx in tqdm(enumerate(valid_index), total=len(valid_index)):
        TIME_FOR_COORDS[tuple(COORDS[i])].append(valid_data_dates[i])

    return TIME_FOR_COORDS

def labels_coordinate_analysis():
    COORDS = np.array(deforestation_train.variables['center_lat_lons'])
    np_metadata(COORDS.T[0], 'Latitude')
    np_metadata(COORDS.T[1], 'Longitude')
    np_metadata(COORDS, 'Label Coordinates')

    DATES = np.array(deforestation_train.variables['collection_dates'])
    np_metadata(DATES, 'Label Dates')
    STRING_DATES = np.array([days_from_date(i) for i in DATES]).astype(str)
    np_metadata(STRING_DATES, 'String Label Dates')

    TIME_FOR_COORDS = defaultdict(list)
    for i, date in tqdm(enumerate(DATES), total=len(DATES)):
        TIME_FOR_COORDS[tuple(COORDS[i])].append(date)

    return TIME_FOR_COORDS


sent2_TIME_FOR_COORDS = sent2_coordinate_analysis()
label_TIME_FOR_COORDS = labels_coordinate_analysis()

NOT_IN_SENT2 = 0
for key in label_TIME_FOR_COORDS.keys():
    if key not in sent2_TIME_FOR_COORDS.keys():
        NOT_IN_SENT2 += 1
print('Not in Sent2:', NOT_IN_SENT2)

for key in label_TIME_FOR_COORDS.keys():
    for i in range(len(label_TIME_FOR_COORDS[key])):
        label_TIME_FOR_COORDS[key][i] = date_from_days(label_TIME_FOR_COORDS[key][i], '2021-05-01')
    for i in range(len(sent2_TIME_FOR_COORDS[key])):
        sent2_TIME_FOR_COORDS[key][i] = date_from_days(sent2_TIME_FOR_COORDS[key][i], '2018-12-13')

print(len(label_TIME_FOR_COORDS.keys()))

closest_dates = []

EMPTY_SENT2 = []
for key in tqdm(label_TIME_FOR_COORDS.keys()):
    if len(sent2_TIME_FOR_COORDS[key]) == 0:
        EMPTY_SENT2.append(key)
        continue
    for label_date in label_TIME_FOR_COORDS[key]:
        closest_date = min(sent2_TIME_FOR_COORDS[key], key=lambda x:abs(x-label_date))
        closest_dates.append(closest_date)

print('Empty Sent2:', len(EMPTY_SENT2))
closest_dates = np.array(closest_dates)
np_metadata(closest_dates, 'Closest Dates')

Valid Index: (94790,)

Name: Latitude
	Shape: (94790,)
	Size: 94790
	Dimensions: 1
	Dtype: float64
	Memory Usage: 758320 bytes
	MIN: -4.39
	MAX: -3.33
	Num Unique Values: 54
	Preview: [-4.39 -4.39 -4.39 -4.39 -4.39 -4.39 -4.39 -4.39 -4.39 -4.39]


Name: Longitude
	Shape: (94790,)
	Size: 94790
	Dimensions: 1
	Dtype: float64
	Memory Usage: 758320 bytes
	MIN: -55.2
	MAX: -54.48
	Num Unique Values: 29
	Preview: [-55.2 -55.2 -55.2 -55.2 -55.2 -55.2 -55.2 -55.2 -55.2 -55.2]


Name: Valid Data Coordinates
	Shape: (94790, 2)
	Size: 189580
	Dimensions: 2
	Dtype: float64
	Memory Usage: 1516640 bytes
	MIN: -55.2
	MAX: -3.33
	Num Unique Values: 3132
	Preview: [[ -4.39 -55.2 ]
 [ -4.39 -55.2 ]
 [ -4.39 -55.2 ]
 [ -4.39 -55.2 ]
 [ -4.39 -55.2 ]
 [ -4.39 -55.2 ]
 [ -4.39 -55.2 ]
 [ -4.39 -55.2 ]
 [ -4.39 -55.2 ]
 [ -4.39 -55.2 ]]


Name: Valid Data Dates
	Shape: (94790,)
	Size: 94790
	Dimensions: 1
	Dtype: int64
	Memory Usage: 758320 bytes
	MIN: 0
	MAX: 1105
	Num Unique Values: 353
	Preview: [  5  15

100%|██████████| 94790/94790 [00:00<00:00, 707878.93it/s]



Name: Latitude
	Shape: (17215,)
	Size: 17215
	Dimensions: 1
	Dtype: float64
	Memory Usage: 137720 bytes
	MIN: -4.39
	MAX: -3.33
	Num Unique Values: 54
	Preview: [-4.09 -3.61 -4.17 -3.39 -4.27 -3.69 -3.59 -3.91 -3.79 -3.63]


Name: Longitude
	Shape: (17215,)
	Size: 17215
	Dimensions: 1
	Dtype: float64
	Memory Usage: 137720 bytes
	MIN: -55.2
	MAX: -54.48
	Num Unique Values: 29
	Preview: [-55.06 -54.92 -54.68 -55.1  -55.12 -55.08 -54.94 -55.12 -54.58 -54.48]


Name: Label Coordinates
	Shape: (17215, 2)
	Size: 34430
	Dimensions: 2
	Dtype: float64
	Memory Usage: 275440 bytes
	MIN: -55.2
	MAX: -3.33
	Num Unique Values: 3130
	Preview: [[ -4.09 -55.06]
 [ -3.61 -54.92]
 [ -4.17 -54.68]
 [ -3.39 -55.1 ]
 [ -4.27 -55.12]
 [ -3.69 -55.08]
 [ -3.59 -54.94]
 [ -3.91 -55.12]
 [ -3.79 -54.58]
 [ -3.63 -54.48]]


Name: Label Dates
	Shape: (17215,)
	Size: 17215
	Dimensions: 1
	Dtype: int64
	Memory Usage: 137720 bytes
	MIN: -1734
	MAX: 92
	Num Unique Values: 11
	Preview: [0 0 0 0 0 0 0 0 0 0]


Name: S

100%|██████████| 17215/17215 [00:00<00:00, 666318.55it/s]

Not in Sent2: 1149





1565


100%|██████████| 1565/1565 [00:00<00:00, 26416.85it/s]

Empty Sent2: 1149

Name: Closest Dates
	Shape: (4576,)
	Size: 4576
	Dimensions: 1
	Dtype: int64
	Memory Usage: 36608 bytes
	MIN: -17878
	MAX: -17878
	Num Unique Values: 1
	Preview: [-17878 -17878 -17878 -17878 -17878 -17878 -17878 -17878 -17878 -17878]






# Save some valid data

In [None]:
def save_image(index):
    # Load the image & normalize
    image = np.array(sent2_b1_b4_train.variables['images'][index]).squeeze()[1:4][::-1].transpose(1, 2, 0)
    percentage = np.percentile(image, [5, 95])
    image = np.clip(image, percentage[0], percentage[1])
    image = (image - percentage[0]) / (percentage[1] - percentage[0])

    # Load the QA60 band cloud mask
    qa60 = np.array(sent2_b1_b4_train.variables['images'][index]).squeeze()[4]
    qa60 = np.stack(3*[qa60/2048.], axis=-1)

    # Create cv2 cloud mask
    cloud_mask = cv2_cloud_mask(image)
    cv2_cloud_percentage = np.sum(cloud_mask/256) / cloud_mask.size * 100

    # Create fig for plotting image and QA60 and cv2 cloud mask
    fig = plt.figure(figsize=(18, 6))
    plt.subplot(1, 3, 1)
    plt.imshow(image)
    plt.axis('off')
    plt.title('RGB Image')
    plt.subplot(1, 3, 2)
    plt.imshow((qa60), cmap='gray')
    plt.axis('off')
    plt.title('QA60 Mask')
    plt.subplot(1, 3, 3)
    plt.imshow(cloud_mask, cmap='gray')
    plt.axis('off')
    plt.title('CV2 Mask')
    plt.close(fig)
    fig.savefig(f'pngs/filtered_sentinel2_pngs/{index}_{sent2_meta[0][index]:.2f}_{sent2_meta[1][index]:.3f}_{cv2_cloud_percentage:.3f}.png')

num_processes = mp.cpu_count()
print('Number of processes:', mp)
pool = mp.Pool(processes=num_processes)
args = [(i) for i in range(len(sent2_meta[0])) if (sent2_meta[0][i] <= 40 and sent2_meta[1][i] >= 0.1 and sent2_meta[2][i] <= 10)]
print('Number of images:', len(args))
args = random.sample(args, 500)
for _ in tqdm(pool.imap(save_image, args), total=len(args)):
    pass
pool.close()
pool.join()