**Generate waveforms** 

Read the extracted CSV folders, then clean and process the resistivity data into 60-second waveform windows.

In [0]:
from pyspark.sql.functions import col
import pandas as pd

# Read the header-less CSV export; without a header Spark names the
# columns positionally (_c0, _c1, ...), and inferSchema picks the dtypes.
csv_path = "/mnt/lab/unrestricted/rachel.lennon@defra.gov.uk/extracted_csv/test_KMThu16_2021_07_16_15_36_25Z"
df = spark.read.csv(csv_path, header=False, inferSchema=True)

# Give the first three positional columns meaningful names
for old_name, new_name in (
    ("_c0", "timestamp"),
    ("_c1", "upstream"),
    ("_c2", "downstream"),
):
    df = df.withColumnRenamed(old_name, new_name)

# Collect only the columns we need onto the driver as a pandas DataFrame
pdf = df.select("timestamp", "upstream", "downstream").toPandas()

# Swap the raw timestamp (parsed as Unix epoch milliseconds) for a
# timezone-aware UTC datetime column
epoch_ms = pdf.pop("timestamp")
pdf["datetime"] = pd.to_datetime(epoch_ms, unit='ms', utc=True)

# Differential conductance: half the upstream-minus-downstream difference
half_difference = (pdf["upstream"] - pdf["downstream"]) / 2
pdf["differential_conductance"] = half_difference

# Quick visual check of the first rows
display(pdf.head(10))


In [0]:
# Generate 60 second bins
#
# Estimates the sampling rate from the first and last timestamps and
# derives how many rows make up one 60-second window. Produces the
# notebook-level names `sps`, `window_duration_seconds` and
# `rows_per_window` used by the plotting cells.

# Sort chronologically so positional slices map to contiguous time spans
pdf = pdf.sort_values("datetime").reset_index(drop=True)

# Total number of samples
num_samples = len(pdf)
if num_samples < 2:
    raise ValueError(
        f"Need at least 2 samples to estimate the sampling rate, got {num_samples}"
    )

# Elapsed time between first and last sample, in seconds
time_delta = (pdf["datetime"].iloc[-1] - pdf["datetime"].iloc[0]).total_seconds()

# `not (x > 0)` also catches NaN, which appears when a timestamp is NaT
if not time_delta > 0:
    raise ValueError(
        f"Cannot estimate the sampling rate from a time span of {time_delta} s"
    )

# Samples per second: n samples are separated by n - 1 intervals, so
# dividing n by the span would overestimate the rate.
sps = (num_samples - 1) / time_delta

print(f"Samples per second: {sps:.2f}")

# Make a window frame
window_duration_seconds = 60

# Round to the nearest row; plain int() truncation would drop a row whenever
# the float estimate lands just below a whole number (e.g. 599.999 -> 599).
rows_per_window = int(round(sps * window_duration_seconds))
if rows_per_window < 1:
    raise ValueError(
        f"Sampling rate {sps:.4f} Hz is too low for a "
        f"{window_duration_seconds} s window"
    )

print(f"Rows per 60 sec window: {rows_per_window}")


In [0]:
# Double check runs on 1st 60 seconds
#
# Plots the first `rows_per_window` rows as a sanity check of the windowing.

# Imported in this cell so it runs on a fresh kernel: the notebook's other
# matplotlib import lives in a later cell, which left `plt` undefined here.
import matplotlib.pyplot as plt

window = pdf.iloc[0:rows_per_window]
plt.plot(window["datetime"], window["differential_conductance"])
plt.xlabel("Time")
plt.ylabel("Differential Conductance (μS/2)")
plt.show()

In [0]:
import os

import matplotlib.pyplot as plt

# Folder that receives one PNG per complete 60-second window
plot_dir = "/mnt/lab/unrestricted/rachel.lennon@defra.gov.uk/2021-07-17-waveform_plots/"
os.makedirs(plot_dir, exist_ok=True)

# range() rejects a zero step with an unhelpful message; fail early instead
if rows_per_window < 1:
    raise ValueError(f"rows_per_window must be >= 1, got {rows_per_window}")

# Stop at the last start index that still yields a full window, so a
# trailing partial window is never plotted.
last_full_start = len(pdf) - rows_per_window

for i in range(0, last_full_start + 1, rows_per_window):
    window = pdf.iloc[i:i + rows_per_window]

    # Plot this window's waveform on its own figure
    fig, ax = plt.subplots(figsize=(10, 4))
    ax.plot(window["datetime"], window["differential_conductance"], color="steelblue")
    ax.set_xlabel("Time")
    ax.set_ylabel("Differential Conductance (μS/2)")
    fig.tight_layout()

    # Save BEFORE showing: with inline / non-interactive backends
    # plt.show() flushes the figure, so saving afterwards can write a
    # blank image.
    filename = f"{plot_dir}waveform_{i//rows_per_window + 1}.png"
    fig.savefig(filename)

    # Show plot (if running interactively)
    plt.show()

    # Release the figure; otherwise every window stays in memory
    plt.close(fig)
