### Satellite image preprocessing

In [1]:
import os
import pandas as pd
import rasterio
from datetime import datetime

In [2]:
# NOTE(review): `visited` and `final_turb_dict` are not used in the cells
# visible here; they are kept in case other cells rely on them.
visited = []
final_turb_dict = {}

# The imagery lives in a 'Planet' folder next to the parent of the current
# working directory. The working directory is switched there, so relative
# paths used afterwards resolve inside it.
dir_path = os.path.dirname(os.path.abspath(''))
proj_dir = dir_path + '/Planet/'
os.chdir(proj_dir)

# One dict per matched image with the keys 'site', 'date', 'raster', 'target'.
processed_data = []

# Walk the site folders. os.listdir() returns entries in arbitrary order, so
# sort them to make the order of processed_data reproducible between runs.
for site_folder in sorted(os.listdir(proj_dir)):
    # Only entries whose name starts with '013' and does not contain 'json'
    # are treated as sites.
    if not site_folder.startswith('013') or 'json' in site_folder:
        continue

    site_path = os.path.join(proj_dir, site_folder)
    if not os.path.isdir(site_path):
        continue

    # A site without a targets file cannot be labelled; skip it (and say so)
    # instead of aborting the whole run inside pd.read_csv.
    targets_path = os.path.join(site_path, 'targets.csv')
    if not os.path.isfile(targets_path):
        print(f'Skipping {site_folder}: no targets.csv found')
        continue

    # Index the targets by plain datetime.date so they can be matched against
    # the date parsed from each image filename.
    targets_df = pd.read_csv(targets_path, index_col=[0])
    targets_df.index = pd.to_datetime(targets_df.index).date

    for file in sorted(os.listdir(site_path)):
        if not file.endswith('.tif'):
            continue

        # The first 8 characters of the filename are expected to be YYYYMMDD.
        date_str = file[:8]
        try:
            date = datetime.strptime(date_str, '%Y%m%d').date()
        except ValueError:
            # A stray .tif with a different naming scheme should not stop
            # the processing of every other image.
            print(f'Skipping {file} in {site_folder}: '
                  'name does not start with a YYYYMMDD date')
            continue

        # Images without a target measurement on the same day are ignored.
        if date not in targets_df.index:
            continue

        # NOTE(review): if targets.csv holds several rows for the same date,
        # .loc returns a Series here rather than a scalar - confirm the
        # dates are unique.
        target_value = targets_df.loc[date, 'turbidity (FNU)']

        # Read every band of the image; rasterio returns a 3-D array
        # shaped (bands, rows, cols).
        tif_path = os.path.join(site_path, file)
        with rasterio.open(tif_path) as src:
            raster_array = src.read()

        processed_data.append({
            'site': site_folder,
            'date': date,
            'raster': raster_array,
            'target': target_value,
        })

                
    


In [3]:
import pickle

# Serialize the collected samples to disk. The file name is relative, so it
# is written into the current working directory.
pickle_file_path = 'processed_data.pkl'
with open(pickle_file_path, 'wb') as pkl_file:
    pickle.Pickler(pkl_file).dump(processed_data)