In [None]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

# Set visual style
plt.style.use('seaborn')
%matplotlib inline

In [None]:
# Read the raw CSV into `df`; the "Timestamp" column is parsed as datetimes.
df = pd.read_csv(
    "../data/benin-malanville.csv",
    parse_dates=["Timestamp"],
)
print(df.shape)  # (row count, column count)
df.head()        # last expression -> rendered as the cell output

In [None]:
# Summary statistics.
# A notebook cell only renders its *last* expression, so a bare
# `df.describe()` in the middle of the cell would be computed and thrown
# away. `display` (injected into the namespace by the IPython kernel)
# renders it explicitly.
display(df.describe())

# Missing values report: percentage of NaN entries per column.
missing_report = df.isna().sum() / len(df) * 100
print("Columns with >5% missing values:")
# Last expression of the cell: only the columns above the 5% threshold.
missing_report[missing_report > 5]

In [None]:
# Columns screened for outliers using the absolute Z-score.
outlier_cols = ["GHI", "DNI", "DHI", "ModA", "ModB", "WS", "WSgust"]

for col in outlier_cols:
    z_name = f"{col}_zscore"
    # |z| per row; nan_policy='omit' keeps NaNs out of the mean/std computation.
    df[z_name] = np.abs(stats.zscore(df[col], nan_policy='omit'))
    # Flag rows whose |z| exceeds 3 (NaN scores compare as False).
    df[f"{col}_outlier"] = df[z_name].gt(3)

# Count flagged rows per screened column (True sums as 1).
flag_cols = [f"{col}_outlier" for col in outlier_cols]
df[flag_cols].sum()

In [None]:
# Drop rows with missing GHI/DNI/DHI.
# .copy() makes df_clean an independent frame, so the .loc assignment below
# unambiguously modifies df_clean and cannot raise SettingWithCopyWarning
# (which pandas may emit when assigning into the un-copied result of a
# row-filtering operation).
df_clean = df.dropna(subset=["GHI", "DNI", "DHI"]).copy()

# Impute outliers with median (e.g., ModA).
# The median is taken over the remaining rows before any value is replaced.
moda_median = df_clean["ModA"].median()
df_clean.loc[df_clean["ModA_outlier"], "ModA"] = moda_median

# Remove temporary columns.
# Match on the suffix rather than a substring so that only the helper
# columns named "<col>_zscore" / "<col>_outlier" are dropped.
helper_suffixes = ("_zscore", "_outlier")
df_clean = df_clean.drop(
    columns=[col for col in df_clean.columns if col.endswith(helper_suffixes)]
)

# Export cleaned data (index omitted from the CSV).
df_clean.to_csv("../data/benin_clean.csv", index=False)
print("Cleaned data saved to ../data/benin_clean.csv")

In [None]:
# Line plot of the three irradiance columns against Timestamp.
irradiance = df_clean.set_index("Timestamp")[["GHI", "DNI", "DHI"]]
ax = irradiance.plot(figsize=(12, 6))  # pandas returns the Axes it drew on
ax.set_title("Solar Irradiance Trends")
ax.set_ylabel("W/m²")
plt.show()

In [None]:
# Annotated heatmap of pairwise correlations between the selected columns.
corr_cols = ["GHI", "DNI", "DHI", "TModA", "TModB", "WS", "RH"]
corr_matrix = df_clean[corr_cols].corr()

plt.figure(figsize=(10, 8))
ax = sns.heatmap(corr_matrix, annot=True)  # draws on the figure created above
ax.set_title("Correlation Matrix")
plt.show()

In [None]:
# Scatter of GHI against wind speed, with point colour taken from RH.
ax = sns.scatterplot(
    x="WS",
    y="GHI",
    hue="RH",
    palette="viridis",
    data=df_clean,
)
ax.set_title("GHI vs. Wind Speed (Colored by Humidity)")
plt.show()

In [None]:
# Bubble chart of GHI against Tamb: marker area scales with RH, colour with BP.
plt.figure(figsize=(10, 6))
bubbles = plt.scatter(
    df_clean["Tamb"],
    df_clean["GHI"],
    s=df_clean["RH"] * 2,  # marker size
    c=df_clean["BP"],      # mapped through the colormap
    cmap="magma",
    alpha=0.5,
)
# Pass the scatter explicitly as the mappable the colorbar describes.
plt.colorbar(bubbles, label="BP (hPa)")

ax = plt.gca()  # the scatter's Axes (colorbar creation restores it as current)
ax.set_xlabel("Temperature (°C)")
ax.set_ylabel("GHI (W/m²)")
ax.set_title("GHI vs. Ambient Temperature")
plt.show()