<a href="https://colab.research.google.com/github/rodgpt/MAR_FUTURA/blob/main/NDSI/SiteAnalysis_R.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# SiteAnalysis (single site) - NDSI + bio/anthro energy

This notebook is structured to run on Google Colab using a Python runtime with R via rpy2.
To run it locally instead of in Colab, make three changes:
1. Remove (or comment out) the `from google.colab import drive` line
2. Remove (or comment out) the `drive.mount('/content/drive')` line
3. Change the paths to the data and output folders in the third cell

In [None]:
#This cell mounts Google Drive and installs the packages needed to run the rest of the notebook in R, because the runtime itself is Python

#Just for google colab
#from google.colab import drive
#drive.mount('/content/drive')

!pip install rpy2
%load_ext rpy2.ipython

In [None]:
%%R

# Packages this notebook depends on.
required_pkgs <- c(
  "tuneR", "seewave", "dplyr", "purrr", "ggplot2",
  "scales", "lubridate", "tidyr", "soundecology", "beepr"
)

# Install only the packages that are not available yet, so re-running this
# cell does not download and rebuild everything again.
missing_pkgs <- required_pkgs[
  !vapply(required_pkgs, requireNamespace, logical(1), quietly = TRUE)
]
if (length(missing_pkgs) > 0) {
  install.packages(missing_pkgs)
}

# Attach in the original order so function masking stays the same.
library(tuneR)
library(seewave)
library(dplyr)
library(purrr)
library(ggplot2)
library(scales)
library(lubridate)
library(tidyr)
library(soundecology)
library(beepr)


In [None]:
%%R

# ---- Site selection and paths ----
# Local run (Rod): site label, folder holding the site's WAV files, and the
# folder where the results CSV is written.
site_label <- "Zapallar 32"
site_dir <- "/Users/rod/Library/CloudStorage/GoogleDrive-royanedel@marfutura.org/Unidades compartidas/MAR FUTURA/Hydrophones/Zapallar/20-10-25/32"
output_dir <- "/Users/rod/Library/CloudStorage/GoogleDrive-royanedel@marfutura.org/Unidades compartidas/MAR FUTURA/Hydrophones/NDSIResults"

# Colab run: uncomment these three lines and comment out the local ones above.
#site_label <- "San Antonio 28"
#site_dir <- "/content/drive/Shareddrives/MAR FUTURA/Hydrophones/San Antonio/18-10-25/28"
#output_dir <- "/content/drive/Shareddrives/MAR FUTURA/Hydrophones/NDSIResults"

# ---- Analysis settings ----
# Length, in seconds, of each segment cut from a recording before NDSI is computed.
segment_sec <- 60
# Lower/upper limits passed to soundecology::ndsi() as anthro_min/anthro_max
# and bio_min/bio_max (presumably Hz; confirm against the soundecology docs).
anthro_band <- c(1000, 2000)
bio_band    <- c(2000, 3000)

# Time zone used when parsing the timestamps embedded in the file names.
tz <- "UTC"
# NOTE(review): not referenced anywhere in the visible code; confirm whether
# this setting is still needed.
analysis_duration <- NA
# Optional cap: keep only the first N files (ordered by timestamp, after the
# date filter). NA keeps all files.
files_per_folder <- NA
# Optional subsampling: keep at most N roughly evenly spaced files per clock
# hour. NA keeps all files; zero or negative values are rejected.
files_per_hour <- NA

# Optional date window; NA disables the corresponding bound.
# NOTE(review): these are compared against POSIXct timestamps, so presumably
# they should be POSIXct values in `tz`; confirm before setting them.
start_date <- NA
end_date   <- NA

# Parse the recording start time encoded in a WAV file name.
#
# Expected names look like "YYYYMMDD_HHMMSS.WAV", optionally prefixed with
# "ST_<digits>_"; matching is case-insensitive and only the base name is used.
# Names that do not match the pattern are handed to as.POSIXct() unchanged.
# The time zone comes from the script-level `tz` setting.
#
# Args:
#   filename: character vector of file names or paths.
# Returns:
#   A POSIXct vector with one element per input.
extract_datetime <- function(filename) {
  stamp_pattern <- "^(?:ST_\\d+_)?(\\d{8}_\\d{6})\\.WAV$"
  file_name <- basename(filename)
  stamp <- sub(stamp_pattern, "\\1", file_name, ignore.case = TRUE)
  as.POSIXct(stamp, format = "%Y%m%d_%H%M%S", tz = tz)
}

# Compute the NDSI of one audio segment with soundecology::ndsi().
#
# The anthrophony/biophony limits come from the script-level `anthro_band`
# and `bio_band` settings.
#
# Args:
#   wave_obj: the Wave object (one analysis segment) passed on to ndsi().
# Returns:
#   A list with three elements:
#     ndsi          - left-channel NDSI (`ndsi_left`).
#     anthro_energy - left-channel anthrophony value reported by ndsi()
#                     (`anthrophony_left`), or NA if ndsi() does not report it.
#     bio_energy    - left-channel biophony value reported by ndsi()
#                     (`biophony_left`), or NA if ndsi() does not report it.
calculate_ndsi <- function(wave_obj) {
  # Fall back to NA when a field is absent from the ndsi() result, so the
  # output columns always exist.
  value_or_na <- function(x) {
    if (is.null(x)) NA_real_ else x
  }

  nd <- NULL
  # ndsi() prints to the console; capture that output and silence warnings so
  # the progress messages of the caller stay readable.
  suppressWarnings(invisible(capture.output({
    nd <- soundecology::ndsi(
      wave_obj,
      anthro_min = anthro_band[1], anthro_max = anthro_band[2],
      bio_min    = bio_band[1],    bio_max    = bio_band[2]
    )
  })))

  list(
    ndsi = nd$ndsi_left,
    anthro_energy = value_or_na(nd$anthrophony_left),
    bio_energy = value_or_na(nd$biophony_left)
  )
}

# Pick up to `k` positions spread evenly across 1..n.
#
# Args:
#   n: number of available items.
#   k: number of items wanted.
# Returns:
#   An empty integer vector when k <= 0, all positions when n <= k, otherwise
#   the sorted, de-duplicated positions nearest to k evenly spaced interior
#   points of 1..n.
.evenly_spaced_indices <- function(n, k) {
  if (k <= 0) {
    return(integer(0))
  }
  if (n <= k) {
    return(seq_len(n))
  }

  # Split the range into k + 1 equal gaps and take the k interior cut points.
  scaled <- seq_len(k) * (n + 1)
  picks <- round(scaled / (k + 1))

  # Clamp into the valid range before removing duplicates.
  picks <- pmin(n, picks)
  picks <- pmax(1L, picks)
  sort(unique(picks))
}

# Format a duration in seconds as a compact human-readable string.
#
# Args:
#   secs: a single number of seconds.
# Returns:
#   "NA" for missing, non-finite or negative input; two decimals
#   (e.g. "0.42s") for durations strictly between 0 and 1 second; otherwise
#   "<s>s", "<m>m<ss>s" or "<h>h<mm>m<ss>s" after rounding to whole seconds.
.fmt_secs <- function(secs) {
  if (is.na(secs) || !is.finite(secs) || secs < 0) {
    return("NA")
  }

  # Sub-second values would otherwise round to "0s", which makes a per-file
  # average meaningless when files are processed quickly.
  if (secs > 0 && secs < 1) {
    return(sprintf("%.2fs", secs))
  }

  secs <- as.integer(round(secs))
  h <- secs %/% 3600
  m <- (secs %% 3600) %/% 60
  s <- secs %% 60
  if (h > 0) {
    return(sprintf("%dh%02dm%02ds", h, m, s))
  }
  if (m > 0) {
    return(sprintf("%dm%02ds", m, s))
  }
  sprintf("%ds", s)
}

# Compute NDSI for every segment of every WAV file found under a site folder.
#
# Besides its arguments, the function reads the script-level settings
# `segment_sec`, `start_date`, `end_date`, `files_per_folder` and
# `files_per_hour`, and calls `extract_datetime()`, `calculate_ndsi()`,
# `.evenly_spaced_indices()` and `.fmt_secs()`.
#
# Args:
#   directory: folder searched recursively for .wav files (case-insensitive).
#   label: site name; stored in the `Site` column and shown in progress
#     messages.
# Returns:
#   A tibble with one row per analysed segment and the columns Site, Time,
#   NDSI, Anthro_Energy and Bio_Energy. It has zero rows when no file or
#   segment could be analysed.
# Raises:
#   An error when `files_per_hour` is set to zero or a negative number.
process_site <- function(directory, label) {
  # Single definition of the zero-row result shared by all early exits.
  empty_result <- function() {
    tibble(
      Site = character(),
      Time = as.POSIXct(character()),
      NDSI = numeric(),
      Anthro_Energy = numeric(),
      Bio_Energy = numeric()
    )
  }

  # Validate the setting before any file-system work is done.
  if (!is.na(files_per_hour) && files_per_hour <= 0) {
    stop("`files_per_hour` must be NA or a positive number.")
  }

  files <- list.files(directory, pattern = "\\.wav$", full.names = TRUE, recursive = TRUE, ignore.case = TRUE)
  message("Found ", length(files), " files in ", label, " (searching recursively, case-insensitive)")

  if (length(files) == 0) {
    return(empty_result())
  }

  files <- sort(files)

  # extract_datetime() is vectorised, so all names are parsed in one call
  # instead of building one tibble per file.
  file_dt <- tibble(
    filepath = files,
    start_dt = extract_datetime(files)
  )

  n_unparsed <- sum(is.na(file_dt$start_dt))
  if (n_unparsed > 0) {
    warning(
      n_unparsed, " file(s) in ", label,
      " have no parseable timestamp in their name.",
      call. = FALSE
    )
  }

  # Keep files that start up to one segment before `start_date`; segments that
  # fall before the window are skipped individually further down.
  if (!is.na(start_date)) {
    file_dt <- dplyr::filter(file_dt, start_dt >= (start_date - segment_sec))
  }
  if (!is.na(end_date)) {
    file_dt <- dplyr::filter(file_dt, start_dt <= end_date)
  }

  file_dt <- dplyr::arrange(file_dt, start_dt)

  if (!is.na(files_per_folder)) {
    file_dt <- dplyr::slice_head(file_dt, n = files_per_folder)
  }

  # Optional subsampling: at most `files_per_hour` evenly spaced files per
  # clock hour.
  if (!is.na(files_per_hour)) {
    k <- as.integer(files_per_hour)
    file_dt <- file_dt %>%
      mutate(.hour = lubridate::floor_date(start_dt, unit = "hour")) %>%
      group_by(.hour) %>%
      arrange(start_dt, .by_group = TRUE) %>%
      group_modify(function(.x, .g) {
        idx <- .evenly_spaced_indices(nrow(.x), k)
        .x[idx, , drop = FALSE]
      }) %>%
      ungroup() %>%
      select(-.hour)
  }

  files <- file_dt$filepath
  file_starts <- file_dt$start_dt
  n_files <- length(files)

  if (n_files == 0) {
    return(empty_result())
  }

  t0 <- Sys.time()
  last_print <- t0

  # One slot per file to start with; the list grows when a file yields more
  # than one segment.
  results <- vector("list", n_files)
  out_k <- 0L

  for (i in seq_len(n_files)) {
    fp <- files[[i]]

    # Progress is reported before file i is processed, so only i - 1 files are
    # complete at this point; the average and the ETA are based on that count.
    now <- Sys.time()
    if (i == 1L || as.numeric(difftime(now, last_print, units = "secs")) >= 2) {
      done <- i - 1L
      elapsed <- as.numeric(difftime(now, t0, units = "secs"))
      avg_sec <- if (done > 0L) elapsed / done else NA_real_
      remaining <- avg_sec * (n_files - done)
      eta_txt <- if (done > 0L) format(now + remaining, "%H:%M:%S") else "NA"

      message(sprintf(
        "%s | %5.1f%% | %d/%d | avg %s/file | ETA %s | ~%s remaining",
        label,
        100 * done / n_files,
        done,
        n_files,
        .fmt_secs(avg_sec),
        eta_txt,
        .fmt_secs(remaining)
      ))

      last_print <- now
    }

    start_dt <- file_starts[i]

    wav <- tryCatch(readWave(fp), error = function(e) {
      warning(
        "Skipping unreadable file: ", fp, " (", conditionMessage(e), ")",
        call. = FALSE
      )
      NULL
    })
    if (is.null(wav)) next

    dur_sec <- length(wav@left) / wav@samp.rate
    seg_len <- min(segment_sec, dur_sec)
    if (!is.finite(seg_len) || seg_len <= 0) next

    # Start offsets of the full segments that fit in the file; a trailing
    # remainder shorter than `seg_len` is not analysed.
    max_start <- max(0, dur_sec - seg_len)
    starts <- seq(0, max_start, by = seg_len)

    for (st in starts) {
      segment_time <- start_dt + st

      if (!is.na(start_date) && segment_time < start_date) {
        next
      }
      # Segments are in chronological order, so the rest of the file is also
      # past the end of the window.
      if (!is.na(end_date) && segment_time > end_date) {
        break
      }

      seg <- tryCatch(
        extractWave(wav, from = st, to = st + seg_len, xunit = "time"),
        error = function(e) {
          warning(
            "Skipping segment at ", st, " s of ", fp,
            " (", conditionMessage(e), ")",
            call. = FALSE
          )
          NULL
        }
      )
      if (is.null(seg)) next

      ndsi_res <- calculate_ndsi(seg)

      out_k <- out_k + 1L
      results[[out_k]] <- tibble(
        Site = label,
        Time = segment_time,
        NDSI = ndsi_res$ndsi,
        Anthro_Energy = ndsi_res$anthro_energy,
        Bio_Energy = ndsi_res$bio_energy
      )
    }
  }

  total_sec <- as.numeric(difftime(Sys.time(), t0, units = "secs"))
  message(sprintf(
    "%s | 100.0%% | %d/%d | finished in %s",
    label, n_files, n_files, .fmt_secs(total_sec)
  ))

  if (out_k == 0L) {
    return(empty_result())
  }

  bind_rows(results[seq_len(out_k)])
}

# Run the analysis for the configured site.
all_results <- process_site(site_dir, site_label)

# process_site() always returns a tibble, so an empty table is the only
# failure signal to check for.
if (nrow(all_results) == 0) {
  stop("No WAV files found in the provided directory and date range. Please verify `site_dir` and the date filters.")
}

# Tags that record the analysis settings in the output file name.
anthro_tag <- paste0("anthro_", anthro_band[1], "-", anthro_band[2])
bio_tag <- paste0("bio_", bio_band[1], "-", bio_band[2])

# "alltime" only when no date filter is active. With a single bound set, the
# open side is written as "start" or "end" so a filtered export is never
# labelled as unfiltered.
date_tag <- if (is.na(start_date) && is.na(end_date)) {
  "alltime"
} else {
  paste0(
    if (is.na(start_date)) "start" else format(start_date, "%Y%m%d"),
    "-",
    if (is.na(end_date)) "end" else format(end_date, "%Y%m%d")
  )
}

dir.create(output_dir, recursive = TRUE, showWarnings = FALSE)

# The site label is lower-cased and every run of non-alphanumeric characters
# becomes "_" to get a file-system-friendly name.
output_csv <- file.path(output_dir, paste0(
  "ndsi_", gsub("[^A-Za-z0-9]+", "_", tolower(site_label)), "_",
  date_tag, "_", anthro_tag, "_", bio_tag, ".csv"
))

write.csv(all_results, output_csv, row.names = FALSE)
message("Saved to: ", output_csv)

# Per-site overview: number of analysed segments plus mean and standard
# deviation of NDSI (missing values ignored).
results_by_site <- group_by(all_results, Site)
summary_stats <- summarise(
  results_by_site,
  Segments  = n(),
  Mean_NDSI = mean(NDSI, na.rm = TRUE),
  SD_NDSI   = sd(NDSI, na.rm = TRUE),
  .groups = "drop"
)
print(summary_stats)

# Data shared by the plot and the quadrant table below.
plot_data <- all_results

# NDSI time series for the site, with an x-axis tick every two hours.
p_ndsi <- ggplot(plot_data, aes(x = Time, y = NDSI)) +
  # `linewidth` replaces `size` for line geoms (ggplot2 >= 3.4.0), which
  # avoids the deprecation warning.
  geom_line(linewidth = 0.8, color = "#2C7FB8") +
  scale_x_datetime(
    date_labels = "%d-%b %H:%M",
    date_breaks = "2 hour",
    expand = expansion(mult = c(0.01, 0.01))
  ) +
  labs(
    title = paste0("NDSI Over Time - ", site_label),
    x = "Date-Time",
    y = "NDSI"
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(face = "bold", size = 14),
    # Small, rotated tick labels so the two-hourly ticks stay legible.
    axis.text.x = element_text(
      angle = 45,
      hjust = 1,
      vjust = 1,
      size = 6,
      margin = margin(t = 5)
    )
  )
print(p_ndsi)

# Share of segments falling into each of four NDSI ranges, per site.
# Values outside [-1, 1] and missing values are left out of the table.
quadrant_labelled <- mutate(
  plot_data,
  NDSI_Quadrant = case_when(
    NDSI >= -1   & NDSI < -0.5 ~ "[-1, -0.5)",
    NDSI >= -0.5 & NDSI <= 0   ~ "[-0.5, 0]",
    NDSI >  0    & NDSI <  0.5 ~ "(0, 0.5)",
    NDSI >= 0.5  & NDSI <= 1   ~ "[0.5, 1]",
    TRUE ~ NA_character_
  )
)
quadrant_labelled <- filter(quadrant_labelled, !is.na(NDSI_Quadrant))

# Count segments per site and range.
quadrant_counts <- quadrant_labelled %>%
  group_by(Site, NDSI_Quadrant) %>%
  summarise(Segments = n(), .groups = "drop")

# Express each count as a percentage of the site's total.
ndsi_quadrant_table <- quadrant_counts %>%
  group_by(Site) %>%
  mutate(
    Total_Segments = sum(Segments),
    Percent_Time = 100 * Segments / Total_Segments
  ) %>%
  ungroup()

print(ndsi_quadrant_table)

# Audible notification that the run has finished.
beepr::beep(3)


  |=                                                                     |   1%\r  0.0% | 1/17588 | avg 0s/file | ETA 12:54:48 | ~1s remaining\r  0.0% | 7/17588 | avg 0s/file | ETA 14:28:19 | ~1h33m31s remaining\r  0.1% | 12/17588 | avg 0s/file | ETA 14:43:57 | ~1h49m06s remaining\r  0.1% | 17/17588 | avg 0s/file | ETA 14:50:01 | ~1h55m08s remaining\r  0.1% | 22/17588 | avg 0s/file | ETA 14:54:03 | ~1h59m08s remaining\r  0.2% | 27/17588 | avg 0s/file | ETA 14:55:29 | ~2h00m32s remaining\r  0.2% | 32/17588 | avg 0s/file | ETA 14:57:12 | ~2h02m12s remaining\r  0.2% | 37/17588 | avg 0s/file | ETA 14:57:23 | ~2h02m21s remaining\r  0.2% | 42/17588 | avg 0s/file | ETA 14:57:33 | ~2h02m30s remaining\r  0.3% | 47/17588 | avg 0s/file | ETA 14:58:23 | ~2h03m17s remaining\r  0.3% | 52/17588 | avg 0s/file | ETA 14:59:21 | ~2h04m13s remaining\r  0.3% | 57/17588 | avg 0s/file | ETA 14:59:50 | ~2h04m40s remaining\r  0.4% | 62/17588 | avg 0s/file | ETA 14:59:43 | ~2h04m30s remaining\r  0.4% | 67/17588