**Generate waveforms** 

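Load the extracted CSV files, then clean and process the resistivity data into 60 s waveform windows. (Note: the windowing cell below currently sets `window_duration_seconds = 120`, i.e. 2-minute windows; reconcile the two before relying on either.)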

In [0]:
from pyspark.sql.functions import col
import pandas as pd

# Source CSV (no header row) and destination for the processed table
raw_path = "/FileStore/rachlenn/Thr 20 process/test_KMThu16_2021_07_19_15_36_28Z"
out_path = "/dbfs/FileStore/rachlenn/Thr 20 process/pdf_KMThu16_2021_07_19_15_36_28Z"

# Read the file through Spark and let it infer the column types
df = spark.read.csv(raw_path, header=False, inferSchema=True)

# Map Spark's default positional column names onto meaningful ones
column_names = {"_c0": "timestamp", "_c1": "upstream", "_c2": "downstream"}
for spark_name, new_name in column_names.items():
    df = df.withColumnRenamed(spark_name, new_name)

# Collect only the three named columns into a pandas DataFrame
pdf = df.select(*column_names.values()).toPandas()

# Swap the Unix-millisecond "timestamp" column for a UTC datetime "Time" column
# (pop removes the old column and hands its values to to_datetime)
pdf["Time"] = pd.to_datetime(pdf.pop("timestamp"), unit="ms", utc=True)

# Differential conductance: half the downstream-minus-upstream difference
pdf["differential_conductance"] = pdf["downstream"].sub(pdf["upstream"]).div(2)

# Quick visual check of the first rows
display(pdf.head(10))

# Persist the processed table back to DBFS as CSV (no index column)
pdf.to_csv(out_path, index=False)

In [0]:
# Generate 2 minute bins
#
# Estimates the sampling rate from the recording itself and converts the
# window duration into a row count (`rows_per_window`) used by later cells.

# Make sure dataframe is sorted by time
pdf = pdf.sort_values("Time").reset_index(drop=True)

# Total number of samples
num_samples = len(pdf)

# A rate cannot be estimated from fewer than two samples; fail with a clear
# message instead of an IndexError / ZeroDivisionError further down.
if num_samples < 2:
    raise ValueError(
        f"Need at least 2 samples to estimate the sampling rate, got {num_samples}"
    )

# Time difference in seconds between the first and last sample
time_delta = (pdf["Time"].iloc[-1] - pdf["Time"].iloc[0]).total_seconds()

# `not (x > 0)` also catches NaN, which appears if a timestamp is missing (NaT)
if not time_delta > 0:
    raise ValueError(
        f"Recording spans {time_delta} s; cannot estimate the sampling rate"
    )

# Samples per second: N timestamps span N - 1 sampling intervals, so the rate
# is (N - 1) / duration rather than N / duration.
sps = (num_samples - 1) / time_delta

print(f"Samples per second: {sps:.2f}")

# Make a window frame
window_duration_seconds = 120

rows_per_window = int(sps * window_duration_seconds)

print(f"Rows per 2 min window: {rows_per_window}")


In [0]:
# Double check runs on the first window of the recording
# NOTE(review): `plt` is not imported in any cell visible here — presumably
# `import matplotlib.pyplot as plt` belongs in the import cell; confirm.
window = pdf.iloc[0:rows_per_window]

# The datetime column is named "Time" (the original "timestamp" column was
# dropped after conversion); there is no "datetime" column in `pdf`.
fig, ax = plt.subplots()
ax.plot(window["Time"], window["differential_conductance"])
ax.set_xlabel("Time")
ax.set_ylabel("Differential Conductance (μS/2)")
ax.set_title("First window (sanity check)")
plt.show()

In [0]:
# Plot one window of data beginning at a chosen start time.
# NOTE(review): `plt` is not imported in any cell visible here — presumably
# `import matplotlib.pyplot as plt` belongs in the import cell; confirm.

# Define start time
start_time = pd.Timestamp("2021-07-17 05:53:00", tz="UTC")   # adjust date to match your data

# Locate the first sample at or after start_time. The datetime column is named
# "Time"; there is no "datetime" column in `pdf`.
at_or_after = (pdf["Time"] >= start_time).to_numpy()
if not at_or_after.any():
    raise ValueError(
        f"No samples at or after {start_time}; data ends at {pdf['Time'].max()}"
    )

# argmax on a boolean array gives the *position* of the first True, which is
# what iloc needs (an index label would only match after a reset_index).
window_start = int(at_or_after.argmax())

# Take one window of `rows_per_window` rows from that position
window = pdf.iloc[window_start : window_start + rows_per_window]

fig, ax = plt.subplots()
ax.plot(window["Time"], window["differential_conductance"])
ax.set_xlabel("Time")
ax.set_ylabel("Differential Conductance (μS/2)")
ax.set_title(f"Window starting at {window['Time'].iloc[0]}")
plt.show()
