In [None]:
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy import stats
from scipy.stats import zscore


# Load raw_data/benin.csv into `df`; the path is relative to the notebook's working directory.
df = pd.read_csv("raw_data/benin.csv")


# Summary Statistics & Missing Values

In [None]:
# `df` is the frame loaded in the first cell; it is not re-read here so the
# cell stays cheap to re-run and never silently resets later changes to `df`.

# Summary statistics. display() is required because a notebook cell only
# auto-renders its final expression; a bare `df.describe()` in the middle of
# the cell would be computed and then discarded.
display(df.describe())

# Missing values: per-column null counts, limited to columns that have any.
missing = df.isna().sum()
display(missing[missing > 0])

# Columns with >5% nulls (last expression of the cell, so it renders on its own).
missing[missing > 0.05 * len(df)]



# Outlier Detection & Cleaning

In [None]:
# Columns that are median-imputed and screened for outliers.
cols = ['GHI', 'DNI', 'DHI', 'ModA', 'ModB', 'WS', 'WSgust']

# Impute missing values with each column's median BEFORE computing z-scores.
# scipy.stats.zscore propagates NaN by default, so a single missing value
# turns every z-score of that column into NaN; `NaN > 3` is always False,
# which would silently switch off outlier detection for that column.
df[cols] = df[cols].fillna(df[cols].median())

# Flag a row when any of the columns lies more than 3 standard deviations
# from that column's mean.
z_scores = df[cols].apply(zscore)
df['Outlier'] = (z_scores.abs() > 3).any(axis=1)

# Export Cleaned Data

In [None]:
# Keep only the rows that were not flagged as outliers.
df_clean = df[~df['Outlier']]

# DataFrame.to_csv does not create missing parent directories, so make sure
# the output folder exists before writing (no-op when it is already there).
clean_path = Path("data/benin_clean.csv")
clean_path.parent.mkdir(parents=True, exist_ok=True)
df_clean.to_csv(clean_path, index=False)

# Time Series Analysis

In [None]:
# Parse the Timestamp column into datetimes so it can be used as the x-axis.
df['Timestamp'] = pd.to_datetime(df['Timestamp'])

# Line plot of GHI, DNI, DHI and Tamb against Timestamp.
series_cols = ['GHI', 'DNI', 'DHI', 'Tamb']
by_time = df.set_index('Timestamp')
ts_ax = by_time[series_cols].plot()
ts_ax.set_title("Solar Irradiance and Temperature Over Time")
plt.show()

# Cleaning Impact

In [None]:
# Mean ModA / ModB for each value of the 'Cleaning' column, drawn as a bar chart.
mod_means = df.groupby('Cleaning')[['ModA', 'ModB']].mean()
bar_ax = mod_means.plot(kind='bar')
bar_ax.set_title("ModA & ModB Before/After Cleaning")
plt.show()

# Correlation & Relationships

In [None]:
# Each chart gets its own figure. seaborn draws on the currently active axes
# when `ax` is not supplied, so three back-to-back calls would paint the
# heatmap and both scatter plots on top of one another.

# Pairwise correlation of the irradiance and module-temperature columns.
fig, ax = plt.subplots()
sns.heatmap(df[['GHI', 'DNI', 'DHI', 'TModA', 'TModB']].corr(), annot=True, ax=ax)
ax.set_title("Correlation: GHI, DNI, DHI, TModA, TModB")

# WS against GHI.
fig, ax = plt.subplots()
sns.scatterplot(x='WS', y='GHI', data=df, ax=ax)
ax.set_title("WS vs GHI")

# RH against Tamb.
fig, ax = plt.subplots()
sns.scatterplot(x='RH', y='Tamb', data=df, ax=ax)
ax.set_title("RH vs Tamb")

plt.show()

# Wind & Distribution

In [None]:
# Histograms on separate axes: Series.hist() draws on the current axes when
# `ax` is omitted, so two consecutive calls would overlap in a single chart
# with incompatible x-scales.
fig, (ghi_ax, ws_ax) = plt.subplots(1, 2, figsize=(12, 4))
df['GHI'].hist(ax=ghi_ax)
ghi_ax.set_title("GHI distribution")
ghi_ax.set_xlabel("GHI")
df['WS'].hist(ax=ws_ax)
ws_ax.set_title("WS distribution")
ws_ax.set_xlabel("WS")
plt.show()

# Optional wind rose
# The import stays in this cell so the earlier cells run even when the
# third-party `windrose` package is not installed.
from windrose import WindroseAxes
ax = WindroseAxes.from_ax()
ax.bar(df['WD'], df['WS'], normed=True, opening=0.8, edgecolor='white')
plt.show()

# Temperature & Bubble Chart

In [None]:
# RH vs Tamb on its own figure. Previously this scatter shared one axes with
# the bubble chart below, mixing RH and GHI values on an x-axis labelled "GHI".
fig, ax = plt.subplots()
sns.scatterplot(x='RH', y='Tamb', data=df, ax=ax)
ax.set_title('RH vs Tamb')

# Bubble chart: GHI on x, Tamb on y, marker area taken from RH.
fig, ax = plt.subplots()
ax.scatter(df['GHI'], df['Tamb'], s=df['RH'], alpha=0.5)
ax.set_xlabel('GHI')
ax.set_ylabel('Tamb')
ax.set_title('GHI vs Tamb (Bubble = RH)')
plt.show()

# Cleaning logic using the reusable function in scripts/cleaning_utils

In [None]:
# Project-local helper; its implementation is not visible in this notebook.
from scripts.cleaning_utils import clean_and_flag_outliers
# Rebinds `df_clean` (previously assigned in the export cell) and reuses the
# `cols` list defined in the outlier-detection cell.
# NOTE(review): when the cells run in order, `df` already carries the
# 'Outlier' column and median-imputed values at this point — confirm the
# helper expects that input rather than the raw frame.
df_clean = clean_and_flag_outliers(df, cols)