In [None]:
# =========================================================
# Preprocessing Notebook for Solar Panel Monitor Dataset
# =========================================================

# 1. Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime
import pvlib  # library for solar position calculations (pip install pvlib)

# ---------------------------------------------------------
# 2. Load Dataset
# ---------------------------------------------------------
# The path is relative to the working directory: one level up, then "data/".
# NOTE(review): confirm the CSV really lives in ../data/ for your checkout.
df = pd.read_csv("../data/solar_panel_monitor_dataset.csv")

# Inspect
print(df.head())
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() would emit a stray "None" line.
df.info()

# ---------------------------------------------------------
# 3. Handle Missing / Invalid Values
# ---------------------------------------------------------
# Rows containing any NaN are dropped (no imputation is performed).
df = df.dropna()

# Keep only rows in which every sensor reading is non-negative.
sensor_cols = ["ldr_tl", "ldr_tr", "ldr_bl", "ldr_br", "voltage", "current"]
valid_rows = (df[sensor_cols] >= 0).all(axis=1)
# .copy() yields an independent frame, so the column assignments made further
# down the script do not trigger pandas' SettingWithCopyWarning.
df = df.loc[valid_rows].copy()

# ---------------------------------------------------------
# 4. Feature Engineering
# ---------------------------------------------------------

# A) Time-based features
# Parse the timestamp column (assumed to exist) and extract calendar parts.
df["timestamp"] = pd.to_datetime(df["timestamp"])
stamp = df["timestamp"].dt
df["hour"] = stamp.hour
df["minute"] = stamp.minute
df["day_of_year"] = stamp.dayofyear

# Cyclic (sin/cos) encodings: hour on a 24-step circle, minute on a 60-step one.
for unit, suffix, period in (("hour", "hour", 24), ("minute", "min", 60)):
    phase = 2 * np.pi * df[unit] / period
    df[f"sin_{suffix}"] = np.sin(phase)
    df[f"cos_{suffix}"] = np.cos(phase)

# B) LDR aggregation
quadrants = ["ldr_tl", "ldr_tr", "ldr_bl", "ldr_br"]
df["ldr_sum"] = df[quadrants].sum(axis=1)

# Pairwise sums of the sensors sharing a side of the panel.
side_pairs = {
    "ldr_left": ("ldr_tl", "ldr_bl"),
    "ldr_right": ("ldr_tr", "ldr_br"),
    "ldr_top": ("ldr_tl", "ldr_tr"),
    "ldr_bottom": ("ldr_bl", "ldr_br"),
}
for side, (first, second) in side_pairs.items():
    df[side] = df[first] + df[second]

# Each LDR as a fraction of total light; the small epsilon keeps the
# denominator non-zero when all four sensors read 0.
total_light = df["ldr_sum"] + 1e-6
for col in quadrants:
    df[f"{col}_norm"] = df[col] / total_light

# C) Solar position (using pvlib)
latitude, longitude = 22.57, 88.36   # Example: Kolkata
# NOTE(review): pvlib assumes UTC for tz-naive timestamps. If the logger
# recorded local time, localize the column first
# (e.g. df["timestamp"].dt.tz_localize("Asia/Kolkata")) — confirm.
# One vectorized call over all timestamps replaces two pvlib calls per row.
solar_pos = pvlib.solarposition.get_solarposition(
    pd.DatetimeIndex(df["timestamp"]), latitude, longitude
)
# The result is indexed by time rather than by df's index, so the values are
# assigned positionally.
df["sun_azimuth"] = solar_pos["azimuth"].to_numpy()
df["sun_elevation"] = solar_pos["apparent_elevation"].to_numpy()

# ---------------------------------------------------------
# 5. Targets
# ---------------------------------------------------------
# Two possible formulations:
#   Option A (disabled): use the absolute positions directly
#       df["target_horiz"] = df["horiz_pos"]
#       df["target_vert"] = df["vert_pos"]
#   Option B (active): row-to-row change in position. The first row has no
#   predecessor, so its delta is filled with 0.
for axis in ("horiz", "vert"):
    df[f"delta_{axis}"] = df[f"{axis}_pos"].diff().fillna(0)

# ---------------------------------------------------------
# 6. Save Clean Dataset
# ---------------------------------------------------------
# Write the processed frame to the working directory, omitting the index.
df.to_csv("solar_data_processed.csv", index=False)

print("Preprocessing complete. Saved as solar_data_processed.csv")

# ---------------------------------------------------------
# 7. Visualization
# ---------------------------------------------------------
# Draw the three series against time on a single set of axes.
fig, ax = plt.subplots(figsize=(8, 4))
for column, label in (
    ("ldr_sum", "Total LDR"),
    ("voltage", "Voltage"),
    ("current", "Current"),
):
    ax.plot(df["timestamp"], df[column], label=label)
ax.legend()
ax.set_title("LDR vs Voltage/Current over Time")
plt.show()
