In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import numpy as np

# Load the raw CSV; the 'Timestamp' column is parsed into datetimes on read.
df = pd.read_csv('../data/togo.csv', parse_dates=['Timestamp'])
df.head()  # result is neither stored nor printed

# Summary statistics, then the number of missing entries per column.
summary = df.describe()
print(summary)
na_counts = df.isna().sum()
print("\nMissing Values:\n", na_counts)

# Percentage of missing entries per column; report only columns above 5%.
missing_pct = df.isna().mean() * 100
high_missing = missing_pct[missing_pct > 5]
print("\nColumns with >5% missing values:\n", high_missing)

# Columns screened for outliers and imputed below.
cols_to_check = ['GHI', 'DNI', 'DHI', 'ModA', 'ModB', 'WS', 'WSgust']

# Z-scores are computed over ALL rows of df so that the resulting mask has
# exactly one entry per row of df. Dropping NaN rows first would shorten the
# z-score array and make it unusable as a boolean index into df as soon as
# any of these columns contains a missing value. nan_policy='omit' leaves
# NaNs out of the mean/std; the NaN entries themselves stay NaN in the output.
z_scores = np.abs(
    stats.zscore(df[cols_to_check].to_numpy(dtype=float), nan_policy='omit')
)

# A row is an outlier if any screened column has |z| > 3. NaN > 3 evaluates
# to False, so rows with missing readings are kept rather than flagged.
outliers = (z_scores > 3).any(axis=1)
df_clean = df[~outliers].copy()

# Fill the remaining gaps with each column's median, computed after the
# outlier rows have been removed.
col_medians = df_clean[cols_to_check].median()
df_clean[cols_to_check] = df_clean[cols_to_check].fillna(col_medians)

# Plot GHI, DNI, DHI and Tamb against Timestamp on a single set of axes.
fig, ax = plt.subplots(figsize=(12, 6))
timestamps = df_clean['Timestamp']
for column in ('GHI', 'DNI', 'DHI', 'Tamb'):
    ax.plot(timestamps, df_clean[column], label=column)
ax.set_xlabel('Timestamp')
ax.set_ylabel('Value')
ax.set_title('Time Series Plot')
ax.legend()
plt.show()

# Mean of ModA and ModB for each distinct value of 'Cleaning', drawn as bars.
module_means = df_clean.groupby('Cleaning')[['ModA', 'ModB']].mean()
module_means.plot.bar()
plt.title('Effect of Cleaning on Module Output')
plt.show()

# Pairwise correlations between the selected columns, annotated per cell.
heatmap_cols = ['GHI', 'DNI', 'DHI', 'TModA', 'TModB']
corr_matrix = df_clean[heatmap_cols].corr()
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm')
plt.title('Correlation Heatmap')
plt.show()

# One scatter plot per (x column, y column, title) entry, shown in order.
scatter_specs = (
    ('WS', 'GHI', 'Wind Speed vs GHI'),
    ('RH', 'Tamb', 'Relative Humidity vs Ambient Temperature'),
)
for x_col, y_col, plot_title in scatter_specs:
    sns.scatterplot(data=df_clean, x=x_col, y=y_col)
    plt.title(plot_title)
    plt.show()

# Histogram of the 'WS' column using 30 bins.
wind_speed = df_clean['WS']
wind_speed.hist(bins=30)
plt.title('Wind Speed Distribution')
plt.show()

# Histogram of the 'WD' column using 36 bins, without a KDE overlay.
wind_direction = df_clean['WD']
sns.histplot(wind_direction, bins=36, kde=False)
plt.title('Wind Direction Distribution')
plt.show()

# Write the cleaned frame to CSV; the DataFrame index is not written.
clean_csv_path = '../data/togo_clean.csv'
df_clean.to_csv(clean_csv_path, index=False)
