In [None]:
# Cell 1
import os
import sys
sys.path.append(os.path.abspath(".."))

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scripts.clean_data import load_and_basic_clean, flag_zscore_outliers, impute_median, export_clean

# Notebook-wide plotting defaults: seaborn "whitegrid" style and a 12x6 inch
# default figure size (individual cells override the size where needed).
sns.set_style("whitegrid")
plt.rcParams.update({'figure.figsize': (12, 6)})

In [None]:
# Cell 2 - load the raw CSV and profile it before any cleaning
RAW_PATH = "../data/togo-dapaong_qc.csv"
df_raw = pd.read_csv(RAW_PATH)

print("Raw shape:", df_raw.shape)

print("\nData types and non-null counts:")
# DataFrame.info() prints its report itself and returns None, so it is called
# bare; wrapping it in display() would render a stray "None" under the report.
df_raw.info()

print("\nSummary statistics for numeric columns:")
display(df_raw.describe().T)

print("\nMissing value counts:")
# Computed once and reused for the percentage table below.
null_counts = df_raw.isna().sum()
display(null_counts)

print("\nColumns with >5% nulls:")
total = len(df_raw)
null_pct = (null_counts / total) * 100
print(null_pct[null_pct > 5].sort_values(ascending=False))

In [None]:
# Cell 3
# Run the project's basic-cleaning helper (scripts.clean_data) on the raw CSV
# at RAW_PATH. "Togo" is passed as the helper's second positional argument
# (presumably a country label - confirm against scripts/clean_data.py).
df_clean = load_and_basic_clean(RAW_PATH, "Togo")
# Report the post-cleaning shape, then preview the first rows.
print("After basic cleaning:", df_clean.shape)
display(df_clean.head())

In [None]:
# Cell 4 - flag Z-score outliers on the irradiance, module and wind columns
cols_to_check = ['GHI','DNI','DHI','ModA','ModB','WS','WSgust']
# Pass a copy (as Cell 5 does for its helper) so df_clean stays untouched if
# flag_zscore_outliers adds its flag columns in place.
df_flagged = flag_zscore_outliers(df_clean.copy(), cols_to_check)

# Count rows flagged for any of the zscore outliers.
# The combined column created below also ends with '_z_outlier', so it is
# excluded explicitly: only the per-metric flags feed the "any" reduction,
# even when this cell is re-run on a frame that already carries it.
zcols = [
    c for c in df_flagged.columns
    if c.endswith('_z_outlier') and c != 'any_z_outlier'
]
df_flagged['any_z_outlier'] = df_flagged[zcols].any(axis=1)
print("Total rows flagged as outlier (|Z|>3) for any metric:", df_flagged['any_z_outlier'].sum())

In [None]:
# Cell 5 - impute the key sensor and irradiance columns
impute_cols = ['GHI','DNI','DHI','ModA','ModB','Tamb','RH']
# The helper receives a copy, so df_flagged stays exactly as Cell 4 left it.
# (impute_median presumably fills gaps with each column's median - see
# scripts/clean_data.py for the exact rule.)
df_imputed = impute_median(df_flagged.copy(), impute_cols)

# Per-column count of nulls still present after imputation (expected: all 0).
remaining_nulls = df_imputed[impute_cols].isna().sum()
display(remaining_nulls)

In [None]:
# Cell 6 - export the cleaned frame to a local CSV
# `os` is already imported in the notebook's first cell, so it is not
# re-imported here. The output path is defined once and reused for the
# directory creation, the export call and the confirmation message.
CLEAN_PATH = "../data/togo_clean.csv"
os.makedirs(os.path.dirname(CLEAN_PATH), exist_ok=True)
export_clean(df_imputed, CLEAN_PATH)
print(f"Exported {CLEAN_PATH} (do NOT commit this file)")

In [None]:
# Cell 7 - daily avg GHI line
# resample('D') is called without `on=`, so it groups by the frame's index,
# which therefore has to be datetime-like at this point.
df_daily = df_imputed.resample('D').mean(numeric_only=True)

fig, ax = plt.subplots(figsize=(14, 5))
ax.plot(df_daily.index, df_daily['GHI'], marker='.', linewidth=0.8)
ax.set_title('Daily average GHI - Togo')
ax.set_ylabel('GHI (W/m^2)')
ax.set_xlabel('Date')
plt.show()

In [None]:
# Cell 8 - mean ModA/ModB for rows with Cleaning == 0 versus Cleaning == 1
if 'Cleaning' not in df_imputed.columns:
    print("No Cleaning column found.")
else:
    module_cols = ['ModA', 'ModB']
    not_cleaned_mask = df_imputed['Cleaning'] == 0
    cleaned_mask = df_imputed['Cleaning'] == 1
    before = df_imputed.loc[not_cleaned_mask, module_cols].mean()
    after = df_imputed.loc[cleaned_mask, module_cols].mean()
    print("Average before cleaning:\n", before)
    print("Average after cleaning:\n", after)

In [None]:
# Cell 9 - correlation heatmap of irradiance and module-temperature columns
corr_cols = ['GHI','DNI','DHI','TModA','TModB']
# Rows with a null in any of these columns are dropped before correlating.
corr_df = df_imputed[corr_cols].dropna()

fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(corr_df.corr(), annot=True, fmt=".2f", cmap='coolwarm', ax=ax)
ax.set_title("Correlation heatmap")
plt.show()

In [None]:
# Cell 10 - histogram for WS
fig, ax = plt.subplots(figsize=(8, 4))
# WS is not among the imputed columns, so nulls are dropped before binning.
df_imputed['WS'].dropna().hist(bins=30, ax=ax)
ax.set_title('Wind speed histogram (WS)')
ax.set_xlabel('WS (m/s)')
plt.show()

In [None]:
# Cell 11 — Additional time series plots
# Daily means of every numeric column (resample groups by the frame's index).
df_daily_avg = df_imputed.resample('D').mean(numeric_only=True)

# Irradiance and temperature are different quantities, so they no longer share
# a single y-axis labelled "Values": DNI/DHI stay on the left axis and Tamb
# gets its own right-hand axis.
fig, ax_irr = plt.subplots(figsize=(14, 5))
ax_irr.plot(df_daily_avg.index, df_daily_avg['DNI'], label='DNI')
ax_irr.plot(df_daily_avg.index, df_daily_avg['DHI'], label='DHI')
ax_irr.set_xlabel('Date')
# NOTE(review): assumes DNI/DHI use the same W/m² unit the notebook labels GHI with - confirm.
ax_irr.set_ylabel('Irradiance (W/m²)')
ax_irr.set_title('Daily Averages: DNI, DHI, and Tamb - Togo')

ax_temp = ax_irr.twinx()
# A twin axis restarts the colour cycle; 'C2' keeps Tamb on the third cycle
# colour it had when all three lines shared one axis.
ax_temp.plot(df_daily_avg.index, df_daily_avg['Tamb'], label='Tamb', linestyle='--', color='C2')
ax_temp.set_ylabel('Temperature (Tamb)')
ax_temp.grid(False)  # avoid a second, misaligned grid over the first one

# One combined legend for the lines of both axes, drawn on the top-most axis.
irr_handles, irr_labels = ax_irr.get_legend_handles_labels()
temp_handles, temp_labels = ax_temp.get_legend_handles_labels()
ax_temp.legend(irr_handles + temp_handles, irr_labels + temp_labels)
plt.show()

In [None]:
# Cell 12 — Scatter plots
# Wind speed against GHI
fig_ws, ax_ws = plt.subplots(figsize=(12, 5))
sns.scatterplot(data=df_imputed, x='WS', y='GHI', alpha=0.5, ax=ax_ws)
ax_ws.set_title('Wind Speed vs GHI')
plt.show()

# Relative humidity against ambient temperature
fig_rh, ax_rh = plt.subplots(figsize=(12, 5))
sns.scatterplot(data=df_imputed, x='RH', y='Tamb', alpha=0.5, color='green', ax=ax_rh)
ax_rh.set_title('Relative Humidity vs Temperature')
plt.show()

In [None]:
# Cell 13 — Bubble chart
# RH drives both the marker area (s=) and the marker colour (c=).
fig, ax = plt.subplots(figsize=(10, 6))
bubbles = ax.scatter(
    df_imputed['Tamb'],
    df_imputed['GHI'],
    s=df_imputed['RH'],
    c=df_imputed['RH'],
    cmap='coolwarm',
    alpha=0.4,
)
ax.set_title('Bubble Chart: GHI vs Tamb (bubble size = RH)')
ax.set_xlabel('Temperature (Tamb)')
ax.set_ylabel('GHI (W/m²)')
fig.colorbar(bubbles, ax=ax, label='Relative Humidity (%)')
plt.show()