# Fix: ensure kernel metadata set to eda-benin


# Togo EDA Notebook

This notebook performs exploratory data analysis for Togo. It loads `data/togo.csv`, prints summary statistics and a missing-value report, flags outliers in the key columns using Z-scores (|z| > 3), fills missing values in those columns with the column median, saves the cleaned data to `data/togo_clean.csv`, and produces a few diagnostic plots saved under `outputs/togo/`.


In [None]:
# Imports and settings
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from pathlib import Path
%matplotlib inline

# Make sure the figure output directory exists before any plot is saved
output_dir = Path('outputs/togo')
output_dir.mkdir(parents=True, exist_ok=True)


In [None]:
# Load data
# Reads the raw Togo CSV into `df`, parsing the Timestamp column as datetimes.
data_path = Path('data/togo.csv')
# Raise explicitly rather than `assert`: assertions are removed when Python
# runs with -O, and FileNotFoundError describes the failure more precisely.
if not data_path.exists():
    raise FileNotFoundError(f'{data_path} not found')
df = pd.read_csv(data_path, parse_dates=['Timestamp'], low_memory=False)
print('Loaded', len(df), 'rows and', len(df.columns), 'columns')
# Last expression of the cell: the notebook renders the first rows as a preview
df.head()


In [None]:
# Descriptive statistics for the numeric columns, one row per column
numeric_part = df.select_dtypes(include=[np.number])
display(numeric_part.describe().T)

# Missing-value report: absolute count and percentage of rows, per column
missing = df.isna().sum()
missing_pct = 100 * missing / len(df)
mv = pd.concat(
    [missing.rename('missing_count'), missing_pct.rename('missing_pct')],
    axis=1,
)
# Show the 20 columns with the highest share of missing values
display(mv.sort_values(by='missing_pct', ascending=False).head(20))


In [None]:
# Flag Z-score outliers and fill missing values with the median for key columns
key_cols = ['GHI','DNI','DHI','ModA','ModB','WS','WSgust']
df2 = df.copy()
outlier_counts = {}
medians = {}

# Only key columns that are present and numeric are processed
usable_cols = [
    name for name in key_cols
    if name in df2.columns and pd.api.types.is_numeric_dtype(df2[name])
]

for name in usable_cols:
    column_median = df2[name].median()
    # Z-scores are computed on the median-filled series so NaNs do not propagate
    filled = df2[name].fillna(column_median)
    is_outlier = np.abs(stats.zscore(filled)) > 3
    df2[name + '_zflag'] = is_outlier
    outlier_counts[name] = int(is_outlier.sum())
    # Median imputation of the column itself
    medians[name] = column_median
    df2[name] = filled

# Row-level flag: True when any per-column Z-score flag is set
zflags = [col for col in df2.columns if col.endswith('_zflag')]
df2['any_z_outlier'] = df2[zflags].any(axis=1) if zflags else False

print('Outlier counts:', outlier_counts)
print('Medians used for imputation:', medians)


In [None]:
# Persist the cleaned frame next to the raw data (data/ is gitignored)
clean_dir = Path('data')
clean_dir.mkdir(exist_ok=True)
df2.to_csv('data/togo_clean.csv', index=False)
print('Wrote data/togo_clean.csv')


In [None]:
# Diagnostic figures: irradiance/temperature time series, correlation heatmap,
# and a GHI histogram. Each figure is written to outputs/togo/ and then shown.
import matplotlib.dates as mdates

# Time series of whichever of GHI/DNI/DHI/Tamb exist, plotted against Timestamp
ts_cols = [name for name in ['GHI','DNI','DHI','Tamb'] if name in df2.columns]
if ts_cols and 'Timestamp' in df2.columns:
    fig, ax = plt.subplots(figsize=(12, 5))
    for series_name in ts_cols:
        ax.plot(df2['Timestamp'], df2[series_name], label=series_name, alpha=0.6)
    ax.legend()
    ax.set_title('Time series (sample)')
    fig.tight_layout()
    fig.savefig('outputs/togo/timeseries_ghi_dni_dhi_tamb.png')
    plt.show()

# Pairwise correlations between the irradiance and module-temperature columns
corr_cols = [name for name in ['GHI','DNI','DHI','TModA','TModB'] if name in df2.columns]
if corr_cols:
    fig, ax = plt.subplots(figsize=(6, 5))
    corr_matrix = df2[corr_cols].corr()
    sns.heatmap(corr_matrix, annot=True, fmt='.2f', cmap='coolwarm', ax=ax)
    ax.set_title('Correlation heatmap')
    fig.tight_layout()
    fig.savefig('outputs/togo/correlation_heatmap.png')
    plt.show()

# Distribution of GHI values
if 'GHI' in df2.columns:
    fig, ax = plt.subplots(figsize=(6, 4))
    ghi_values = df2['GHI'].dropna()
    sns.histplot(ghi_values, bins=50, ax=ax)
    ax.set_title('Histogram GHI')
    fig.tight_layout()
    fig.savefig('outputs/togo/hist_GHI.png')
    plt.show()
