In [24]:
pip install netcdf4 xarray

Note: you may need to restart the kernel to use updated packages.


In [3]:
# This script is to extract ducmass variable from the ncei dust nc4 file!
# created by : saima
# saves ducmass variable to a netcdf file from 2020-2024 May-August: total 615 days, time 12.30pm. 
# output: ducmass_combined_1230.nc

import xarray as xr
import numpy as np
import os

# Root folder holding one sub-folder per year of daily dust files.
# NOTE(review): absolute, machine-specific path - adjust when running elsewhere.
base_path = "/Volumes/Work/Thesis/Paul_miller/Thesis_Work/Dust/dust_data"
output_file = "ducmass_combined_1230.nc"

# One DUCMASS slice (the time step nearest 12:30) is collected per daily file
filtered_data = []

# Loop through years and files
for year in range(2020, 2025):
    year_path = os.path.join(base_path, str(year))
    print(f"Processing year: {year}")
    if not os.path.exists(year_path):
        print(f"Year folder not found: {year_path}")
        continue

    # os.listdir() returns entries in arbitrary order; sort them so every run
    # processes the files in the same, name-based order.
    for file in sorted(os.listdir(year_path)):
        # Skip hidden or metadata files
        if file.startswith("._") or not file.endswith(".nc4"):
            print(f"Skipping file: {file}")
            continue

        file_path = os.path.join(year_path, file)
        print(f"Processing file: {file_path}")

        try:
            # The context manager releases the file handle on every exit path,
            # including the `continue` below and any exception.
            with xr.open_dataset(file_path, engine="netcdf4") as ds:
                # Check if DUCMASS exists
                if "DUCMASS" not in ds:
                    print(f"'DUCMASS' variable not found in {file}")
                    continue

                # Target is 12:30 on the calendar day of the file's first time step
                time_values = ds['time'].values
                target_time = (
                    time_values[0].astype('datetime64[D]')
                    + np.timedelta64(12, 'h')
                    + np.timedelta64(30, 'm')
                )

                nearest_time = time_values[np.abs(time_values - target_time).argmin()]
                print(f"Nearest time found: {nearest_time}")

                # .load() pulls the values into memory before the file is closed
                ducmass_1230 = ds['DUCMASS'].sel(time=nearest_time).load()
                filtered_data.append(ducmass_1230)

        except Exception as e:
            # Best effort: report the failing file and keep going with the rest
            print(f"Error processing file {file_path}: {e}")
            continue

# Combine and save the filtered data
if filtered_data:
    # Sort along time so the saved series is chronological regardless of the
    # order in which the files were read.
    combined_ds = xr.concat(filtered_data, dim="time").sortby("time")
    combined_ds.to_netcdf(output_file)
    print(f"Filtered data saved to {output_file}")
else:
    print("No valid data found.")


Processing year: 2020
Processing file: /Volumes/Work/Thesis/Paul_miller/Thesis_Work/Dust/dust_data/2020/MERRA2_400.tavg1_2d_aer_Nx.20200501.nc4
Nearest time found: 2020-05-01T12:30:00.000000000
Processing file: /Volumes/Work/Thesis/Paul_miller/Thesis_Work/Dust/dust_data/2020/MERRA2_400.tavg1_2d_aer_Nx.20200502.nc4
Nearest time found: 2020-05-02T12:30:00.000000000
Processing file: /Volumes/Work/Thesis/Paul_miller/Thesis_Work/Dust/dust_data/2020/MERRA2_400.tavg1_2d_aer_Nx.20200503.nc4
Nearest time found: 2020-05-03T12:30:00.000000000
Processing file: /Volumes/Work/Thesis/Paul_miller/Thesis_Work/Dust/dust_data/2020/MERRA2_400.tavg1_2d_aer_Nx.20200504.nc4
Nearest time found: 2020-05-04T12:30:00.000000000
Processing file: /Volumes/Work/Thesis/Paul_miller/Thesis_Work/Dust/dust_data/2020/MERRA2_400.tavg1_2d_aer_Nx.20200505.nc4
Nearest time found: 2020-05-05T12:30:00.000000000
Processing file: /Volumes/Work/Thesis/Paul_miller/Thesis_Work/Dust/dust_data/2020/MERRA2_400.tavg1_2d_aer_Nx.20200506.

In [31]:
# This script converts the ducmass variable NetCDF file to CSV!
# created by : saima
# converts nc to csv from 2020-2024 May-August: total 615 days, time 12.30pm. 
# output : ducmass.csv

import xarray as xr
import pandas as pd

def netcdf_to_csv(nc_file, output_csv):
    """
    Write the DUCMASS variable of a NetCDF file to a CSV, keeping only non-NaN values.

    Parameters:
    nc_file (str): Path to the NetCDF file.
    output_csv (str): Path to save the CSV file.

    Returns:
    None. If the file has no DUCMASS variable a message is printed and no CSV
    is written.
    """
    # The context manager guarantees the file handle is released, including on
    # the early return below.
    with xr.open_dataset(nc_file) as ds:
        # Check if DUCMASS exists
        if 'DUCMASS' not in ds.variables:
            print("DUCMASS variable not found in the dataset.")
            return

        # Flatten to a table; reset_index() turns the coordinates into columns.
        # The DataFrame lives in memory, so it stays valid after the file closes.
        df = ds[['DUCMASS']].to_dataframe().reset_index()

    # Remove rows whose DUCMASS value is missing
    df_cleaned = df.dropna(subset=['DUCMASS'])

    # Save to CSV
    df_cleaned.to_csv(output_csv, index=False)
    print(f"CSV saved to {output_csv}")

# Source NetCDF file and destination CSV file
netcdf_file = "ducmass_combined_1230.nc"
output_csv_file = "ducmass.csv"

# Run the conversion
netcdf_to_csv(nc_file=netcdf_file, output_csv=output_csv_file)


CSV saved to ducmass.csv


In [8]:
# This script is to align ducmass lat and lon according to sst grid!
# created by : saima
# converts, time (615), 12.30pm, lat (140), lon (400), dust 
# output : ducmass_on_sst_grid.nc

import xarray as xr
import numpy as np

# Load the SST and DUCMASS datasets
print("Loading datasets...")
# Context managers release both file handles once the regridded file is written
with xr.open_dataset("./dust_dataset/new_cropped_oisst.nc") as sst_ds:   # Replace with actual SST file path
    with xr.open_dataset("./dust_dataset/ducmass_cropped.nc") as dust_raw:  # Replace with actual DUCMASS file path
        # Keep only DUCMASS (with its coordinates) *before* interpolating, so no
        # other variable in the file is regridded and then discarded.
        dust_ds = dust_raw[['DUCMASS']]

        # Target latitude and longitude grids come from the SST dataset
        new_lat = sst_ds['lat']
        new_lon = sst_ds['lon']

        # Convert longitudes in dust dataset from [-180,180] to [0,360] if necessary
        if dust_ds['lon'].min() < 0:
            print("Converting longitudes in DUCMASS dataset from [-180,180] to [0,360]...")
            dust_ds = dust_ds.assign_coords(lon=(dust_ds['lon'] + 360) % 360)
            dust_ds = dust_ds.sortby('lon')

        # Interpolate DUCMASS to SST grid
        print("Interpolating DUCMASS to SST resolution...")
        ducmass_interp = dust_ds.interp(lat=new_lat, lon=new_lon, method="linear")

        # Convert time to daily format (remove hours)
        ducmass_interp['time'] = ducmass_interp['time'].dt.floor("D")  # Keep only date (YYYY-MM-DD)

        # Save next to the other dust files: the CSV-export cell reads
        # "./dust_dataset/ducmass_on_sst_grid.nc", so write it there directly.
        output_nc = "./dust_dataset/ducmass_on_sst_grid.nc"
        print(f"Saving interpolated DUCMASS dataset to {output_nc}...")
        ducmass_interp.to_netcdf(output_nc, mode='w', format='NETCDF4')

print("Processing complete! DUCMASS data is now aligned with SST grid.")


Loading datasets...
Interpolating DUCMASS to SST resolution...
Saving interpolated DUCMASS dataset to ducmass_on_sst_grid.nc...
Processing complete! DUCMASS data is now aligned with SST grid.


In [10]:
# Converts new_cropped_oisst.nc and ducmass_on_sst_grid.nc file to CSV
# time, lat, lon, sst, time, lat, lon, ducmass


import xarray as xr
import pandas as pd

def _check_row_alignment(sst_df, dust_df):
    """Raise ValueError unless both flattened tables describe the same cells row by row."""
    if len(sst_df) != len(dust_df):
        raise ValueError(
            f"SST and dust tables differ in length: {len(sst_df)} vs {len(dust_df)}"
        )

    # Coordinates must agree position by position, otherwise the side-by-side
    # concatenation would pair values from different grid cells.
    for col in ("lat", "lon"):
        if col in sst_df.columns and col in dust_df.columns:
            if (sst_df[col].to_numpy() != dust_df[col].to_numpy()).any():
                raise ValueError(f"SST and dust rows are not aligned on '{col}'")

    # Compare calendar dates only; the two sources may carry different
    # times of day for the same date.
    if "time" in sst_df.columns and "time" in dust_df.columns:
        sst_time, dust_time = sst_df["time"], dust_df["time"]
        both_datetime = (
            pd.api.types.is_datetime64_any_dtype(sst_time)
            and pd.api.types.is_datetime64_any_dtype(dust_time)
        )
        if both_datetime:
            sst_days = sst_time.dt.normalize().to_numpy()
            dust_days = dust_time.dt.normalize().to_numpy()
            if (sst_days != dust_days).any():
                raise ValueError("SST and dust rows are not aligned on calendar date")


def save_sst_dust_to_csv(sst_file, dust_file, output_csv):
    """
    Flatten the SST and dust NetCDF files and write them side by side to one CSV.

    The SST columns come first, followed by the dust columns, so the coordinate
    columns appear twice in the output. Rows are paired purely by position,
    which is why the alignment is verified before anything is written.

    Parameters:
    sst_file (str): Path to the SST NetCDF file (must contain 'sst').
    dust_file (str): Path to the dust NetCDF file (must contain 'DUCMASS').
    output_csv (str): Path of the CSV file to write.

    Raises:
    ValueError: If the two tables differ in length or their lat / lon /
        calendar-date columns do not match row by row.
    """
    print("Loading SST dataset...")
    with xr.open_dataset(sst_file) as sst_raw:
        sst_ds = sst_raw

        # Check if 'zlev' exists in SST and drop it
        if 'zlev' in sst_ds.dims:
            print("Dropping 'zlev' dimension from SST dataset...")
            sst_ds = sst_ds.isel(zlev=0)  # Select the first level if it exists

        print("Loading Dust dataset...")
        with xr.open_dataset(dust_file) as dust_ds:
            # Extract SST variables (time, lat, lon, sst)
            print("Extracting SST variables...")
            sst_df = sst_ds[['sst']].to_dataframe().reset_index()

            # Extract DUCMASS variable
            print("Extracting DUCMASS variable...")
            dust_df = dust_ds[['DUCMASS']].to_dataframe().reset_index()

    # Print dataset shapes for debugging
    print(f"SST dataset shape: {sst_df.shape}")
    print(f"Dust dataset shape: {dust_df.shape}")

    # Fail loudly instead of silently pairing values from different cells
    _check_row_alignment(sst_df, dust_df)

    # Concatenate without merging
    print("Concatenating datasets without merging...")
    combined_df = pd.concat([sst_df, dust_df], axis=1)

    # Save to CSV
    print(f"Saving concatenated dataset to {output_csv}...")
    combined_df.to_csv(output_csv, index=False)
    print(f"✅ Dataset successfully saved to {output_csv}")

# Inputs: SST grid file and the dust file already regridded onto it
sst_file = "./dust_dataset/new_cropped_oisst.nc"
dust_file = "./dust_dataset/ducmass_on_sst_grid.nc"
# Output: side-by-side table of both datasets
output_csv = "./dust_dataset/sst_dust_combined.csv"

# Build the combined CSV
save_sst_dust_to_csv(sst_file=sst_file, dust_file=dust_file, output_csv=output_csv)


Loading SST dataset...
Dropping 'zlev' dimension from SST dataset...
Loading Dust dataset...
Extracting SST variables...
Extracting DUCMASS variable...
SST dataset shape: (34440000, 5)
Dust dataset shape: (34440000, 4)
Concatenating datasets without merging...
Saving concatenated dataset to ./dust_dataset/sst_dust_combined.csv...
✅ Dataset successfully saved to ./dust_dataset/sst_dust_combined.csv


In [11]:
# Converts CSV to CSV
# sst_dust_combined.csv to sst_dust_cleaned.csv
# dropped time, lat, lon, sst, time, lat, lon, ducmass to time, lat, lon, sst, ducmass

import pandas as pd

# Load the dataset
input_csv = "./dust_dataset/sst_dust_combined.csv"  # Update with your actual filename
output_csv = "./dust_dataset/sst_dust_cleaned.csv"

print("Loading dataset...")
df = pd.read_csv(input_csv)

# Columns duplicated by the side-by-side concatenation, plus the unused 'zlev'
columns_to_drop = ['zlev', 'time.1', 'lat.1', 'lon.1']  # Ensure these match your actual column names

# Check actual column names
print("Original Columns:", df.columns)

# Drop unnecessary columns; errors='ignore' already skips labels that are absent
df = df.drop(columns=columns_to_drop, errors='ignore')

# Save the cleaned CSV.
# na_rep writes missing values as the text "NaN" at write time, so the numeric
# columns stay numeric in memory instead of being filled with strings.
print(f"Saving cleaned dataset to {output_csv}...")
df.to_csv(output_csv, index=False, na_rep='NaN')

print("Dataset cleaned and saved successfully!")


Loading dataset...
Original Columns: Index(['time', 'lat', 'lon', 'sst', 'zlev', 'time.1', 'lat.1', 'lon.1',
       'DUCMASS'],
      dtype='object')
Saving cleaned dataset to ./dust_dataset/sst_dust_cleaned.csv...
Dataset cleaned and saved successfully!


In [12]:
# CSV to CSV
# sst_dust_cleaned.csv TO sst_dust_filtered.csv
# drops all rows with sst NaN values

import pandas as pd

# Source table and destination for the rows that carry an SST value
input_csv = "./dust_dataset/sst_dust_cleaned.csv"
output_csv = "./dust_dataset/sst_dust_filtered.csv"

print("Loading dataset...")
df = pd.read_csv(input_csv)

# Size before filtering
print(f"Original dataset shape: {df.shape}")

# Keep a row only when its 'sst' entry is present
df_filtered = df[df['sst'].notna()]

# Size after filtering
print(f"Filtered dataset shape: {df_filtered.shape}")

# Write the surviving rows
df_filtered.to_csv(output_csv, index=False)
print(f"Filtered dataset saved to {output_csv}")


Loading dataset...
Original dataset shape: (34440000, 5)
Filtered dataset shape: (25705770, 5)
Filtered dataset saved to ./dust_dataset/sst_dust_filtered.csv


In [13]:
# CSV to CSV
# sst_dust_filtered.csv TO sst_dust_final.csv
# add missing values of ducmass with NaN values. converts scientific notation to digital numbers

## NOTE: THIS DID NOT UPDATE THE NAN VALUES IN DUCMASS. ALSO, SOME VALUES ARE STILL WRITTEN IN SCIENTIFIC NOTATION.


import pandas as pd

# Define input and output file paths
input_csv = "./dust_dataset/sst_dust_filtered.csv"  # Change this if your file is in a different location
output_csv = "./dust_dataset/sst_dust_final.csv"

# Load dataset
print("Loading dataset...")
df = pd.read_csv(input_csv)

# Convert DUCMASS sentinel values to NaN.
# A float NaN is used as the replacement: pd.NA would turn the column into
# object dtype, and the float cast below can then raise on the <NA> entries.
print("Replacing missing DUCMASS values with NaN...")
df["DUCMASS"] = df["DUCMASS"].replace([1e15, -1e15], float("nan"))

# Make sure the column is float.
# NOTE(review): this cast does not change how numbers are written to the CSV;
# small values can still appear in scientific notation in the output file.
print("Converting DUCMASS scientific notation to normal values...")
df["DUCMASS"] = df["DUCMASS"].astype(float)

# Save cleaned dataset
print(f"Saving cleaned dataset to {output_csv}...")
df.to_csv(output_csv, index=False)

print("Processing complete.")


Loading dataset...
Replacing missing DUCMASS values with NaN...
Converting DUCMASS scientific notation to normal values...
Saving cleaned dataset to ./dust_dataset/sst_dust_final.csv...
Processing complete.


In [6]:
# CSV to CSV
# february 6, 2025
# sst_dust_final2.csv TO sst_dust_final3.csv
# replaces missing ducmass values (NaN or the +/-1e15 extreme values) with 0 and writes ducmass as fixed-point decimals instead of scientific notation

# This one worked well handling missing values with NaN and converted all scientific values but still some E numbers. 


import pandas as pd

# Input table and the output that receives the reformatted DUCMASS column
input_csv = "./dust_dataset/sst_dust_final2.csv"
output_csv = "./dust_dataset/sst_dust_final3.csv"

print("Loading dataset...")
df = pd.read_csv(input_csv)

# Extreme +/-1e15 entries and NaN entries both become 0
print("Replacing missing DUCMASS values with 0...")
df["DUCMASS"] = df["DUCMASS"].replace([1e15, -1e15], 0).fillna(0)

# Render every value as text with ten decimal places, so the CSV holds plain
# decimals; the column contains strings after this step
print("Converting DUCMASS from scientific notation to normal values...")
df["DUCMASS"] = df["DUCMASS"].map(lambda x: f"{x:.10f}")

# Write the result
print(f"Saving cleaned dataset to {output_csv}...")
df.to_csv(output_csv, index=False)

print("Processing complete. DUCMASS values converted and missing values filled with 0.")


Loading dataset...
Replacing missing DUCMASS values with 0...
Converting DUCMASS from scientific notation to normal values...
Saving cleaned dataset to ./dust_dataset/sst_dust_final3.csv...
Processing complete. DUCMASS values converted and missing values filled with 0.


In [19]:
# create train, validation, and test datasets while ensuring missing 
# DUCMASS values are replaced with NaN and prints the last 10 rows

# OUTPUT: train_dataset, test_dataset, validation_dataset

import pandas as pd

# Define the file path
csv_file = "./dust_dataset/sst_dust_final3.csv"

# Load the dataset
print("Loading dataset...")
df = pd.read_csv(csv_file)

# Convert 'time' column to datetime format; unparseable entries become NaT
df['time'] = pd.to_datetime(df['time'], errors='coerce')

# Ensure DUCMASS is a float column with NaN for missing entries.
# Empty strings are mapped to a float NaN (not pd.NA, which the float cast
# cannot convert); None entries are turned into NaN by the cast itself.
df['DUCMASS'] = df['DUCMASS'].replace({"": float("nan")}).astype(float)

# Function to split and save dataset
def save_dataset(df, start_year, end_year, filename):
    """
    Write the rows of ``df`` whose ``time`` year lies in [start_year, end_year] to a CSV.

    Parameters:
    df (pandas.DataFrame): Table with a datetime ``time`` column.
    start_year (int): First year to include.
    end_year (int): Last year to include.
    filename (str): Path of the CSV file to write.

    Rows whose ``time`` is NaT fall outside every year range and are left out.
    Prints the row count and the last 10 rows of the written subset.
    """
    # Extract the year once and reuse it for both bounds
    years = df['time'].dt.year
    subset_df = df[(years >= start_year) & (years <= end_year)].copy()

    # Missing values are already NaN and are written as empty fields, so no
    # per-column fill is needed; this also lets the helper handle frames that
    # have no DUCMASS column.
    subset_df.to_csv(filename, index=False)
    print(f"\n{filename} saved successfully!")
    print(f"Total rows: {subset_df.shape[0]}")
    print(f"Last 10 rows of {filename}:")
    print(subset_df.tail(10))

# Train split: 2020 through 2022
train_file = "./dust_dataset/train_dataset.csv"
save_dataset(df, start_year=2020, end_year=2022, filename=train_file)

# Validation split: 2023 only
validation_file = "./dust_dataset/validation_dataset.csv"
save_dataset(df, start_year=2023, end_year=2023, filename=validation_file)

# Test split: 2024 only
test_file = "./dust_dataset/test_dataset.csv"
save_dataset(df, start_year=2024, end_year=2024, filename=test_file)



Loading dataset...

./dust_dataset/train_dataset.csv saved successfully!
Total rows: 15423462
Last 10 rows of ./dust_dataset/train_dataset.csv:
                        time     lat      lon        sst   DUCMASS
15423452 2022-08-31 12:00:00  34.875  351.375  23.090000  0.000307
15423453 2022-08-31 12:00:00  34.875  351.625  23.500000  0.000310
15423454 2022-08-31 12:00:00  34.875  351.875  23.760000  0.000312
15423455 2022-08-31 12:00:00  34.875  352.125  23.779999  0.000303
15423456 2022-08-31 12:00:00  34.875  352.375  23.630000  0.000293
15423457 2022-08-31 12:00:00  34.875  352.625  23.460000  0.000285
15423458 2022-08-31 12:00:00  34.875  352.875  23.269999  0.000279
15423459 2022-08-31 12:00:00  34.875  353.125  22.840000  0.000272
15423460 2022-08-31 12:00:00  34.875  353.375  22.119999  0.000262
15423461 2022-08-31 12:00:00  34.875  353.625  21.510000  0.000252

./dust_dataset/validation_dataset.csv saved successfully!
Total rows: 5141154
Last 10 rows of ./dust_dataset/validatio

In [5]:
# checks sst_dust_combined lat and lon values if they match in both the dataset or not
# Load merged dataset
import pandas as pd
import pandas as pd

# Load the merged dataset
merged_file = './dust_dataset/sst_dust_combined.csv'
print("Loading merged dataset...")
# Only the four coordinate columns are compared below, so the remaining
# columns of this large file are not parsed at all.
df = pd.read_csv(merged_file, usecols=['lat', 'lon', 'lat.1', 'lon.1'])

# Display the first few rows of 'lat' and 'lon' from SST and Dust
print("\nFirst 5 rows of 'lat' from SST:")
print(df[['lat']].head())

print("\nFirst 5 rows of 'lat.1' from Dust:")
print(df[['lat.1']].head())

print("\nFirst 5 rows of 'lon' from SST:")
print(df[['lon']].head())

print("\nFirst 5 rows of 'lon.1' from Dust:")
print(df[['lon.1']].head())

# Rows where the SST latitude differs from the dust latitude
mismatched_lat = df[df['lat'] != df['lat.1']]
print(f"\nNumber of mismatched latitude rows: {mismatched_lat.shape[0]}")

# Rows where the SST longitude differs from the dust longitude
mismatched_lon = df[df['lon'] != df['lon.1']]
print(f"Number of mismatched longitude rows: {mismatched_lon.shape[0]}")

# Display some mismatched latitude rows if any
if not mismatched_lat.empty:
    print("\nFirst 5 mismatched latitude rows:")
    print(mismatched_lat[['lat', 'lat.1']].head())
else:
    print("\nAll latitude entries match between SST and Dust datasets.")

# Display some mismatched longitude rows if any
if not mismatched_lon.empty:
    print("\nFirst 5 mismatched longitude rows:")
    print(mismatched_lon[['lon', 'lon.1']].head())
else:
    print("\nAll longitude entries match between SST and Dust datasets.")

# Optionally, save mismatched lat/lon rows to CSV if needed
# mismatched_lat.to_csv('./dust_dataset/mismatched_lat.csv', index=False)
# mismatched_lon.to_csv('./dust_dataset/mismatched_lon.csv', index=False)


Loading merged dataset...

First 5 rows of 'lat' from SST:
     lat
0  0.125
1  0.125
2  0.125
3  0.125
4  0.125

First 5 rows of 'lat.1' from Dust:
   lat.1
0  0.125
1  0.125
2  0.125
3  0.125
4  0.125

First 5 rows of 'lon' from SST:
       lon
0  260.125
1  260.375
2  260.625
3  260.875
4  261.125

First 5 rows of 'lon.1' from Dust:
     lon.1
0  260.125
1  260.375
2  260.625
3  260.875
4  261.125

Number of mismatched latitude rows: 0
Number of mismatched longitude rows: 0

All latitude entries match between SST and Dust datasets.

All longitude entries match between SST and Dust datasets.


In [8]:
# THIS IS THE FINAL SCRIPT THAT RUNS AND MAKES 3 DATASETS OF TRAIN , TEST AND VALIDATION
# FEBRUARY 6, 2025
# FOR SST+ DUST
import pandas as pd

# Table holding SST and DUCMASS for every retained grid cell and day
final_csv = "./dust_dataset/sst_dust_final3.csv"

print("Loading final dataset...")
df = pd.read_csv(final_csv)

# Parse the timestamps so the year can be used for splitting;
# entries that cannot be parsed become NaT instead of raising
parsed_time = pd.to_datetime(df['time'], errors='coerce')
df['time'] = parsed_time

# Function to split and save dataset
def save_dataset(df, start_year, end_year, filename):
    """
    Save the rows of ``df`` dated between start_year and end_year (inclusive) as a CSV.

    ``df`` needs a datetime ``time`` column. After writing ``filename`` the
    function prints the number of rows and the last 10 rows of the subset.
    """
    # Year of every row; both bounds are inclusive
    years = df['time'].dt.year
    in_range = years.between(start_year, end_year)
    subset_df = df.loc[in_range].copy()

    # Write without the index column
    subset_df.to_csv(filename, index=False)

    # Short report of what was written
    print(f"\n{filename} saved successfully!")
    print(f"Total rows: {subset_df.shape[0]}")
    print(f"Last 10 rows of {filename}:")
    print(subset_df.tail(10))

# Train split: 2020 through 2022
train_file = "./dust_dataset/Dust_data/train_dataset.csv"
save_dataset(df, start_year=2020, end_year=2022, filename=train_file)

# Validation split: 2023 only
validation_file = "./dust_dataset/Dust_data/validation_dataset.csv"
save_dataset(df, start_year=2023, end_year=2023, filename=validation_file)

# Test split: 2024 only
test_file = "./dust_dataset/Dust_data/test_dataset.csv"
save_dataset(df, start_year=2024, end_year=2024, filename=test_file)

print("\nAll datasets created successfully!")


Loading final dataset...

./dust_dataset/Dust_data/train_dataset.csv saved successfully!
Total rows: 15423462
Last 10 rows of ./dust_dataset/Dust_data/train_dataset.csv:
                        time     lat      lon        sst   DUCMASS
15423452 2022-08-31 12:00:00  34.875  351.375  23.090000  0.000307
15423453 2022-08-31 12:00:00  34.875  351.625  23.500000  0.000310
15423454 2022-08-31 12:00:00  34.875  351.875  23.760000  0.000312
15423455 2022-08-31 12:00:00  34.875  352.125  23.779999  0.000303
15423456 2022-08-31 12:00:00  34.875  352.375  23.630000  0.000293
15423457 2022-08-31 12:00:00  34.875  352.625  23.460000  0.000285
15423458 2022-08-31 12:00:00  34.875  352.875  23.269999  0.000279
15423459 2022-08-31 12:00:00  34.875  353.125  22.840000  0.000272
15423460 2022-08-31 12:00:00  34.875  353.375  22.119999  0.000262
15423461 2022-08-31 12:00:00  34.875  353.625  21.510000  0.000252

./dust_dataset/Dust_data/validation_dataset.csv saved successfully!
Total rows: 5141154
Last

In [13]:
# THIS IS THE FINAL SCRIPT THAT RUNS AND MAKES 3 DATASETS OF TRAIN , TEST AND VALIDATION
# FEBRUARY 6, 2025
# FOR SST
import pandas as pd

# SST-only table used for the splits below
final_csv = "./sst_dataset.csv"

print("Loading final dataset...")
df = pd.read_csv(final_csv)

# Parse the timestamps so the year can be used for splitting;
# entries that cannot be parsed become NaT instead of raising
parsed_time = pd.to_datetime(df['time'], errors='coerce')
df['time'] = parsed_time

# Function to split and save dataset
def save_dataset(df, start_year, end_year, filename):
    """
    Write the part of ``df`` that falls in the years start_year..end_year to ``filename``.

    Both year bounds are inclusive and ``df`` must have a datetime ``time``
    column. Prints the row count and the last 10 rows of what was saved.
    """
    # Boolean mask built from the year of each timestamp
    row_year = df['time'].dt.year
    keep = row_year.ge(start_year) & row_year.le(end_year)
    subset_df = df[keep].copy()

    # Persist the selection without the index
    subset_df.to_csv(filename, index=False)

    # Summary for the notebook output
    print(f"\n{filename} saved successfully!")
    print(f"Total rows: {subset_df.shape[0]}")
    print(f"Last 10 rows of {filename}:")
    print(subset_df.tail(10))

# Train split: 2020 through 2022
train_file = "./dataset/train_dataset.csv"
save_dataset(df, start_year=2020, end_year=2022, filename=train_file)

# Validation split: 2023 only
validation_file = "./dataset/validation_dataset.csv"
save_dataset(df, start_year=2023, end_year=2023, filename=validation_file)

# Test split: 2024 only
test_file = "./dataset/test_dataset.csv"
save_dataset(df, start_year=2024, end_year=2024, filename=test_file)

print("\nAll datasets created successfully!")


Loading final dataset...

./dataset/train_dataset.csv saved successfully!
Total rows: 15423462
Last 10 rows of ./dataset/train_dataset.csv:
                        time     lat      lon        sst
15423452 2022-08-31 12:00:00  34.875  351.375  23.090000
15423453 2022-08-31 12:00:00  34.875  351.625  23.500000
15423454 2022-08-31 12:00:00  34.875  351.875  23.760000
15423455 2022-08-31 12:00:00  34.875  352.125  23.779999
15423456 2022-08-31 12:00:00  34.875  352.375  23.630000
15423457 2022-08-31 12:00:00  34.875  352.625  23.460000
15423458 2022-08-31 12:00:00  34.875  352.875  23.269999
15423459 2022-08-31 12:00:00  34.875  353.125  22.840000
15423460 2022-08-31 12:00:00  34.875  353.375  22.119999
15423461 2022-08-31 12:00:00  34.875  353.625  21.510000

./dataset/validation_dataset.csv saved successfully!
Total rows: 5141154
Last 10 rows of ./dataset/validation_dataset.csv:
                        time     lat      lon        sst
20564606 2023-08-31 12:00:00  34.875  351.375  23.76