In [None]:
# Add the parent directory to sys.path so the project's helper functions can be imported
import sys
import os
sys.path.append(os.path.abspath("../")) 
from src.utils.utils import *

# Basic libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

# Read the raw CSV; the 'Timestamp' column is parsed to datetime while loading
source_csv = '../data/benin-malanville.csv'
df = pd.read_csv(source_csv, parse_dates=['Timestamp'])

# Report (rows, columns), then preview the first rows as the cell output
print(f"Dataset shape: {df.shape}")
df.head()

In [None]:
# Summary statistics
# A notebook cell only renders its LAST expression automatically, so the
# describe() table has to be displayed explicitly or it is silently dropped.
# (`display` is available as a builtin inside an IPython/Jupyter kernel.)
display(df.describe())

# Missing-values report: NaN count per column and its share of all rows (in %)
missing_counts = df.isna().sum()  # computed once and reused for both columns
missing = pd.DataFrame({
    'Missing Values': missing_counts,
    'Percent': (missing_counts / df.shape[0]) * 100
})

# Cell output: only the columns with more than 5% missing values, worst first
missing[missing['Percent'] > 5].sort_values('Percent', ascending=False)

In [None]:
# Outlier detection for key columns: count the values whose |z-score| exceeds 3
key_cols = ['GHI', 'DNI', 'DHI', 'ModA', 'ModB', 'WS', 'WSgust']
outliers = {}

for col in key_cols:
    # Skip columns that are absent from this dataset instead of raising KeyError
    if col in df.columns:
        # NaNs are dropped first so they do not propagate into the z-scores
        z = np.abs(stats.zscore(df[col].dropna()))
        outliers[col] = (z > 3).sum()

print("Outlier counts (|Z| > 3):")
# This expression is not the last statement of its cell, so it would never be
# rendered on its own; display it explicitly so the counts announced above are
# actually shown. (`display` is a builtin inside an IPython/Jupyter kernel.)
# The explicit dtype keeps the Series well-defined even if no key column exists.
display(pd.Series(outliers, dtype='int64'))

# Build the cleaned frame on a copy so the raw `df` is left untouched;
# NaNs in each key column are replaced by that column's median
df_clean = df.copy()
for col in key_cols:
    if col not in df_clean.columns:
        continue  # column absent from this dataset: nothing to impute
    col_median = df_clean[col].median()
    df_clean[col] = df_clean[col].fillna(col_median)

# Persist the cleaned frame next to the raw data (row index is not written)
clean_csv = '../data/benin_clean.csv'
df_clean.to_csv(clean_csv, index=False)
print("Cleaned data saved to ../data/benin_clean.csv")

In [None]:
# Time series plot of GHI against Timestamp (only GHI is drawn in this figure)
fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(df_clean['Timestamp'], df_clean['GHI'], label='GHI')
ax.set(
    title='Global Horizontal Irradiance Time Series',
    xlabel='Timestamp',
    ylabel='GHI (W/m²)',
)
ax.grid(True)
ax.legend()
fig.tight_layout()
plt.show()

# Monthly patterns: mean of the irradiance columns and Tamb per calendar month
df_clean['Month'] = df_clean['Timestamp'].dt.month
monthly_avg = df_clean.groupby('Month')[['GHI', 'DNI', 'DHI', 'Tamb']].mean()

# DataFrame.plot() opens a figure of its own unless it is handed an Axes, so a
# preceding plt.figure(figsize=...) would be left empty and its size ignored.
# Create the Axes explicitly and pass it in so the bars land on the sized figure.
fig, ax = plt.subplots(figsize=(10, 5))
monthly_avg[['GHI', 'DNI', 'DHI']].plot(kind='bar', ax=ax)
ax.set_title('Monthly Average Solar Irradiance')
ax.set_xlabel('Month')
ax.set_ylabel('W/m²')
ax.grid(axis='y')
fig.tight_layout()
plt.show()

In [None]:
# Compare the mean ModA / ModB readings between the groups of the 'Cleaning' flag.
# NOTE(review): the title and y-label describe ModA/ModB as temperatures in °C;
# presumably these could be irradiance-like sensor readings instead — TODO confirm
# the units against the dataset's data dictionary before trusting the labels.
if 'Cleaning' not in df_clean.columns:
    print("Cleaning flag not found in dataset")
else:
    # One row per distinct value of the Cleaning flag
    cleaning_impact = df_clean.groupby('Cleaning')[['ModA', 'ModB']].mean()

    # Bar chart of the group means; pandas creates the figure and returns its Axes
    ax = cleaning_impact.plot(kind='bar', figsize=(8, 5))
    ax.set_title('Impact of Cleaning on Module Temperatures')
    ax.set_xlabel('Cleaning Status')
    ax.set_ylabel('Average Temperature (°C)')
    ax.grid(axis='y')
    ax.get_figure().tight_layout()
    plt.show()

In [None]:
# Pairwise correlation of the selected measurement columns, drawn as an
# annotated heatmap with the colour scale pinned to the full [-1, 1] range
corr_cols = ['GHI', 'DNI', 'DHI', 'ModA', 'ModB', 'Tamb', 'WS', 'WSgust', 'RH']
corr_matrix = df_clean[corr_cols].corr()

fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', vmin=-1, vmax=1, ax=ax)
ax.set_title('Correlation Heatmap')
fig.tight_layout()
plt.show()

# Two simple scatter plots sharing the same layout: WS vs GHI, then RH vs Tamb.
# Each entry is (x column, y column, title, x-axis label, y-axis label).
scatter_specs = [
    ('WS', 'GHI', 'Wind Speed vs GHI',
     'Wind Speed (m/s)', 'GHI (W/m²)'),
    ('RH', 'Tamb', 'Relative Humidity vs Ambient Temperature',
     'Relative Humidity (%)', 'Ambient Temperature (°C)'),
]

for x_col, y_col, title, x_label, y_label in scatter_specs:
    # One figure per pair, shown before the next one is created
    fig, ax = plt.subplots(figsize=(8, 6))
    ax.scatter(df_clean[x_col], df_clean[y_col], alpha=0.5)
    ax.set(title=title, xlabel=x_label, ylabel=y_label)
    ax.grid(True)
    fig.tight_layout()
    plt.show()

In [None]:
# Tamb (x) against GHI (y), with each point coloured by its RH value
fig, ax = plt.subplots(figsize=(10, 6))
points = ax.scatter(
    df_clean['Tamb'],
    df_clean['GHI'],
    c=df_clean['RH'],
    alpha=0.5,
    cmap='viridis',
)
# The colour bar is tied to the scatter just drawn and maps colour back to RH
fig.colorbar(points, ax=ax, label='Relative Humidity (%)')
ax.set(
    title='Temperature vs GHI colored by Relative Humidity',
    xlabel='Ambient Temperature (°C)',
    ylabel='GHI (W/m²)',
)
ax.grid(True)
fig.tight_layout()
plt.show()

# Bubble chart - GHI vs Tamb with bubble size = RH
# Plot at most 3000 rows so the figure stays readable on a large dataset.
# A fixed random_state makes the sample (and therefore the figure) identical
# on every Restart & Run All instead of changing from run to run.
sample = df_clean.sample(min(3000, len(df_clean)), random_state=42)

# Min-max normalise RH into marker areas between 10 and 110 points^2
rh = sample['RH']
rh_span = rh.max() - rh.min()
if rh_span > 0:
    size = (rh - rh.min()) / rh_span * 100 + 10
else:
    # RH is constant (or entirely missing) in the sample: dividing by the zero
    # span would turn every size into NaN, so use one mid-range size instead
    size = pd.Series(60.0, index=sample.index)

fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(sample['Tamb'], sample['GHI'], s=size, alpha=0.5)
ax.set_title('GHI vs. Ambient Temperature (Bubble size = Relative Humidity)')
ax.set_xlabel('Ambient Temperature (°C)')
ax.set_ylabel('GHI (W/m²)')
ax.grid(True)
fig.tight_layout()
plt.show()