### Satellite image preprocessing

In [1]:
import os
import pickle
from datetime import datetime

import numpy as np
import pandas as pd
import rasterio

In [2]:
dir_path = os.path.dirname(os.path.abspath(''))
proj_dir = dir_path + '/Planet/'
os.chdir(proj_dir)

# Initialize a list to hold processed data
processed_data = []

# Loop through each subdirectory (site)
for site_folder in os.listdir(proj_dir):
    if site_folder.startswith('013'):
        if 'json' not in site_folder:
            site_path = os.path.join(proj_dir, site_folder)
            
            # Check if the current path is a directory
            if os.path.isdir(site_path):
                # Load the target data
                targets_path = os.path.join(site_path, 'targets.csv')
                targets_df = pd.read_csv(targets_path, index_col=[0])
                targets_df.index = pd.to_datetime(targets_df.index).date
                
                # Loop through each .tif file in the subdirectory
                for file in os.listdir(site_path):
                    if file.endswith('.tif'):
                        # Check file size and only include images greater than 25 KB
                        size_threshold = 25 * 1024  # 25 KB in bytes
                        file_size = os.path.getsize(os.path.join(site_path, file))
                        if file_size >= size_threshold:
                            # Extract the date from the filename
                            date_str = file[:8]  # Assuming the date is the first 8 characters
                            date = datetime.strptime(date_str, '%Y%m%d').date()
                            
                            # Check if the date is in the targets DataFrame
                            if date in targets_df.index:
                                target_value = targets_df.loc[date, 'turbidity (FNU)']
                                
                                # Read the .tif file as a 4-dimensional array
                                tif_path = os.path.join(site_path, file)
                                with rasterio.open(tif_path) as src:
                                    raster_array = src.read()  # (bands, width, height)
                                
                                # Append the data to the processed_data list
                                processed_data.append({
                                    'site': site_folder,
                                    'date': date,
                                    'raster': raster_array,
                                    'target': target_value
                                })

                
    


In [3]:
import pickle

# Save processed_data as a pickle file
pickle_file_path = 'processed_data.pkl'
with open(pickle_file_path, 'wb') as f:
    pickle.dump(processed_data, f)

In [2]:
# processed_df_test = pd.read_csv('processed_data.csv')
processed_data = pd.read_pickle('processed_data.pkl')

In [39]:
num_elements = len(processed_data_pckl[0]['raster'])* len(processed_data_pckl[0]['raster'][1])* len(processed_data_pckl[0]['raster'][2])
print(np.count_nonzero(processed_data_pckl[0]['raster']) / num_elements)

0.6937533802055165


In [3]:
data = pd.DataFrame(processed_data)
data.set_index(['site', 'date'], inplace=True)