In [1]:
# --- Setup: Imports and Path Adjustments ---
import os
import sys

import pandas as pd

# Make the project root importable so the `scripts` package can be found.
# Assumes the kernel's working directory is a direct child of the project
# root (e.g. notebooks/) — TODO confirm if the notebook is launched elsewhere.
current_notebook_dir = os.getcwd()
# Normalise the path so it does not carry a trailing '..' into data paths
# and log messages built from it.
project_root_dir = os.path.abspath(os.path.join(current_notebook_dir, '..'))
# Guard the insert so re-running this cell does not pile up duplicate entries.
if project_root_dir not in sys.path:
    sys.path.insert(0, project_root_dir)

# Project helper modules (located in the `scripts` package of the project root)
import scripts.data_cleaning as dc
import scripts.data_profiling as dp
import scripts.eda_plots as ep

# Country label and raw data file used consistently by every cell below
COUNTRY = "Togo"
DATA_FILE = "togo-dapaong_qc.csv"

# **1. Load Raw Data**

We begin by loading the raw data for Togo. The dataset is expected to be in the `data/` directory. If the file is missing, an error will be raised.

In [2]:
# --- 1. Load Raw Data ---
# Reads the raw CSV for COUNTRY from <project root>/data into `df_raw`.
print(f"--- Loading Raw Data for {COUNTRY} ---")
data_path = os.path.join(project_root_dir, 'data', DATA_FILE)

try:
    df_raw = pd.read_csv(data_path)
except FileNotFoundError:
    print(f"Error: Raw data file not found at {data_path}. Please check the path and file existence.")
    # Re-raise the original exception rather than calling sys.exit(): the
    # failure is then reported as a FileNotFoundError with its traceback
    # instead of being replaced by a SystemExit.
    raise
else:
    # Only reached when the read succeeded.
    print(f"Successfully loaded {DATA_FILE}.")

# Quick look at the first rows and the overall size of the raw frame.
print("\n**Raw Data Head:**")
display(df_raw.head())
print(f"\n**Raw Data Shape:** {df_raw.shape}")

--- Loading Raw Data for Togo ---
Successfully loaded togo-dapaong_qc.csv.

**Raw Data Head:**


Unnamed: 0,Timestamp,GHI,DNI,DHI,ModA,ModB,Tamb,RH,WS,WSgust,WSstdev,WD,WDstdev,BP,Cleaning,Precipitation,TModA,TModB,Comments
0,2021-10-25 00:01,-1.3,0.0,0.0,0.0,0.0,24.8,94.5,0.9,1.1,0.4,227.6,1.1,977,0,0.0,24.7,24.4,
1,2021-10-25 00:02,-1.3,0.0,0.0,0.0,0.0,24.8,94.4,1.1,1.6,0.4,229.3,0.7,977,0,0.0,24.7,24.4,
2,2021-10-25 00:03,-1.3,0.0,0.0,0.0,0.0,24.8,94.4,1.2,1.4,0.3,228.5,2.9,977,0,0.0,24.7,24.4,
3,2021-10-25 00:04,-1.2,0.0,0.0,0.0,0.0,24.8,94.3,1.2,1.6,0.3,229.1,4.6,977,0,0.0,24.7,24.4,
4,2021-10-25 00:05,-1.2,0.0,0.0,0.0,0.0,24.8,94.0,1.3,1.6,0.4,227.5,1.6,977,0,0.0,24.7,24.4,



**Raw Data Shape:** (525600, 19)


# **2. Initial Data Profiling**

Before cleaning, we perform an initial data profiling to understand the raw state of the dataset. This includes summary statistics and a missing value report.

In [3]:
# --- 2. Initial Data Profiling (Raw Data) ---
print("\n--- Initial Data Profiling (Raw Data) ---")

# Summary statistics are computed on the numeric columns only.
raw_summary = dp.get_summary_statistics(df_raw.select_dtypes(include=['number']))
display(raw_summary)

# Missing-value report over all columns; threshold=5 highlights columns
# with more than 5% nulls.
dp.print_missing_value_report(df_raw, threshold=5)


--- Initial Data Profiling (Raw Data) ---

--- Summary Statistics ---


Unnamed: 0,GHI,DNI,DHI,ModA,ModB,Tamb,RH,WS,WSgust,WSstdev,WD,WDstdev,BP,Cleaning,Precipitation,TModA,TModB,Comments
count,525600.0,525600.0,525600.0,525600.0,525600.0,525600.0,525600.0,525600.0,525600.0,525600.0,525600.0,525600.0,525600.0,525600.0,525600.0,525600.0,525600.0,0.0
mean,230.55504,151.258469,116.444352,226.144375,219.568588,27.751788,55.01316,2.368093,3.22949,0.55774,161.741845,10.559568,975.915242,0.000535,0.001382,32.444403,33.54333,
std,322.532347,250.956962,156.520714,317.346938,307.93251,4.758023,28.778732,1.462668,1.882565,0.268923,91.877217,5.91549,2.153977,0.023116,0.02635,10.998334,12.769277,
min,-12.7,0.0,0.0,0.0,0.0,14.9,3.3,0.0,0.0,0.0,0.0,0.0,968.0,0.0,0.0,13.1,13.1,
25%,-2.2,0.0,0.0,0.0,0.0,24.2,26.5,1.4,1.9,0.4,74.8,6.9,975.0,0.0,0.0,23.9,23.6,
50%,2.1,0.0,2.5,4.4,4.3,27.2,59.3,2.2,2.9,0.5,199.1,10.8,976.0,0.0,0.0,28.4,28.4,
75%,442.4,246.4,215.7,422.525,411.0,31.1,80.8,3.2,4.4,0.7,233.5,14.1,977.0,0.0,0.0,40.6,43.0,
max,1424.0,1004.5,805.7,1380.0,1367.0,41.4,99.8,16.1,23.1,4.7,360.0,86.9,983.0,1.0,2.3,70.4,94.6,



--- Missing Value Report ---
Missing values by column:
          Missing Count  Missing Percentage
Comments         525600               100.0

Columns with more than 5% missing values:
          Missing Count  Missing Percentage
Comments         525600               100.0


# **3. Clean Data**

The raw data is cleaned using the `clean_data` function from `scripts/data_cleaning.py`. This process enforces physical constraints, handles missing values, and flags statistical outliers for review (they are reported, not removed) to ensure data integrity.

In [4]:
# --- 3. Clean Data ---
# `clean_data` comes from scripts/data_cleaning.py (imported as `dc`), so the
# banner below names that module rather than a non-existent src/ path.
print(f"\n--- Cleaning Data for {COUNTRY} using scripts/data_cleaning.py ---")
# A copy is passed so `df_raw` keeps the untouched raw data for later comparison.
df_cleaned = dc.clean_data(df_raw.copy(), country_name=COUNTRY)

# Preview the cleaned frame and report its size.
print("\n**Cleaned Data Head:**")
display(df_cleaned.head())
print(f"\n**Cleaned Data Shape:** {df_cleaned.shape}")


--- Cleaning Data for Togo using src/data_cleaning.py ---

--- Cleaning Data for Togo ---

  Enforcing Physical Constraints (Setting impossible values to NaN)...
    Corrected 257385 negative values in 'GHI'.
  Total physically impossible values corrected: 257385

  Dropped 257385 rows due to missing values in critical columns: GHI, DNI, DHI, ModA, ModB.

  Imputing remaining numerical missing values with median...


  return np.nanmean(a, axis, out=out, keepdims=keepdims)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_cleaned['Comments'].fillna('No Comment', inplace=True)
  df_cleaned['Comments'].fillna('No Comment', inplace=True)


  No further numerical missing values required imputation.
  Filled 268215 missing 'Comments' with 'No Comment'.

  Detecting potential statistical outliers (Z-score > 3) for reporting...
  Potential statistical outliers detected (Z-score > 3):
    - GHI: 5 outliers
    - DHI: 182 outliers
    - ModB: 2 outliers
    - Tamb: 69 outliers
    - WS: 1033 outliers
    - WSgust: 1177 outliers
    - BP: 150 outliers
    - Precipitation: 1698 outliers
    - TModB: 260 outliers
  Note: These values are flagged for awareness but not automatically removed or capped by this function.

--- Cleaning Summary for Togo ---
  Initial rows: 525600
  Final rows after cleaning: 268215
  Total rows removed/adjusted: 257385

  Missing values after cleaning:
Series([], dtype: int64)

Cleaned data saved to: c:\Users\Perserverence\Documents\Python_Scripts\solar-challenge-week1\scripts\..\data\togo_clean.csv

**Cleaned Data Head:**


Unnamed: 0_level_0,GHI,DNI,DHI,ModA,ModB,Tamb,RH,WS,WSgust,WSstdev,WD,WDstdev,BP,Cleaning,Precipitation,TModA,TModB,Comments
Timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
2021-10-25 05:43:00,0.0,0.0,0.6,1.2,1.2,25.2,92.7,1.8,2.1,0.3,225.4,4.8,977,0,0.0,25.0,24.8,No Comment
2021-10-25 05:44:00,0.2,0.0,0.7,1.4,1.4,25.2,92.7,1.4,1.9,0.4,231.2,5.8,977,0,0.0,25.0,24.7,No Comment
2021-10-25 05:45:00,0.5,0.0,0.8,1.6,1.6,25.2,92.8,1.5,1.9,0.4,229.9,7.9,977,0,0.0,25.0,24.7,No Comment
2021-10-25 05:46:00,0.8,0.0,0.9,1.9,1.9,25.2,92.6,1.6,2.1,0.4,230.1,7.0,977,0,0.0,25.0,24.7,No Comment
2021-10-25 05:47:00,1.0,0.0,1.0,2.1,2.1,25.1,92.5,1.6,1.9,0.4,230.0,7.0,977,0,0.0,25.0,24.7,No Comment



**Cleaned Data Shape:** (268215, 18)


# **4. Post-Cleaning Data Profiling**

After cleaning, we re-profile the data to verify the effectiveness of the cleaning process. This includes updated summary statistics and a missing value report.

In [5]:
# --- 4. Post-Cleaning Data Profiling ---
print(f"\n--- Post-Cleaning Data Profiling for {COUNTRY} ---")

# Same numeric-only summary as for the raw data, now on the cleaned frame.
print("\n**Summary Statistics (Cleaned Data):**")
cleaned_summary = dp.get_summary_statistics(df_cleaned.select_dtypes(include=['number']))
display(cleaned_summary)

# threshold=0 so that any column still containing nulls is reported.
print("\n**Missing Value Report (Cleaned Data):**")
dp.print_missing_value_report(df_cleaned, threshold=0)


--- Post-Cleaning Data Profiling for Togo ---

**Summary Statistics (Cleaned Data):**

--- Summary Statistics ---


Unnamed: 0,GHI,DNI,DHI,ModA,ModB,Tamb,RH,WS,WSgust,WSstdev,WD,WDstdev,BP,Cleaning,Precipitation,TModA,TModB
count,268215.0,268215.0,268215.0,268215.0,268215.0,268215.0,268215.0,268215.0,268215.0,268215.0,268215.0,268215.0,268215.0,268215.0,268215.0,268215.0,268215.0
mean,454.081218,296.409415,228.16618,443.109538,430.2249,29.868625,50.954453,2.89756,3.951753,0.650577,171.400294,13.115981,975.956483,0.001048,0.0016,39.984648,42.444731
std,319.09601,283.534417,150.064955,318.15581,308.539523,4.825459,27.871583,1.474324,1.850927,0.265139,89.270858,5.635309,2.341965,0.032351,0.029865,10.42551,12.060518
min,0.0,0.0,0.0,0.0,0.0,14.9,3.3,0.0,0.0,0.0,0.0,0.0,968.0,0.0,0.0,13.7,13.4
25%,156.5,7.0,112.4,144.2,140.7,26.4,22.8,1.9,2.6,0.5,83.2,10.0,975.0,0.0,0.0,31.9,33.0
50%,430.3,233.8,211.4,410.8,399.8,29.9,55.2,2.8,3.9,0.6,202.8,12.9,976.0,0.0,0.0,40.3,42.6
75%,743.9,548.0,330.3,733.4,710.4,33.5,74.6,3.8,5.2,0.8,240.7,16.0,978.0,0.0,0.0,47.8,51.4
max,1424.0,1004.5,805.7,1380.0,1367.0,41.4,99.8,16.1,22.9,4.4,360.0,86.9,983.0,1.0,2.3,70.4,94.6



**Missing Value Report (Cleaned Data):**

--- Missing Value Report ---
No missing values found in the DataFrame.


# **5. Time Series Analysis**

We analyze the cleaned data over time, focusing on key variables such as GHI, DNI, DHI, and Tamb. This includes plotting time series and average daily/monthly profiles.

In [6]:
# --- 5. Time Series Analysis ---
print(f"\n--- Time Series Analysis for {COUNTRY} ---")

# Time series plots: first the three irradiance columns, then ambient temperature.
time_series_jobs = (
    (['GHI', 'DNI', 'DHI'], "Irradiance"),
    ('Tamb', "Ambient Temperature"),
)
for series_columns, series_suffix in time_series_jobs:
    ep.plot_time_series(df_cleaned, series_columns, COUNTRY, title_suffix=series_suffix)

# Average profiles for GHI and Tamb: daily (hourly) first, then monthly.
for plot_profile in (ep.plot_daily_average_profile, ep.plot_monthly_average_profile):
    # A fresh list per call, exactly as in two separate literal calls.
    plot_profile(df_cleaned, ['GHI', 'Tamb'], COUNTRY)


--- Time Series Analysis for Togo ---
  Saved plotly plot: c:\Users\Perserverence\Documents\Python_Scripts\reports\figures\togo_ghi_time_series.png
  Saved plotly plot: c:\Users\Perserverence\Documents\Python_Scripts\reports\figures\togo_dni_time_series.png
  Saved plotly plot: c:\Users\Perserverence\Documents\Python_Scripts\reports\figures\togo_dhi_time_series.png
  Saved plotly plot: c:\Users\Perserverence\Documents\Python_Scripts\reports\figures\togo_tamb_time_series.png
  Saved plotly plot: c:\Users\Perserverence\Documents\Python_Scripts\reports\figures\togo_ghi_daily_profile.png
  Saved plotly plot: c:\Users\Perserverence\Documents\Python_Scripts\reports\figures\togo_tamb_daily_profile.png
  Saved plotly plot: c:\Users\Perserverence\Documents\Python_Scripts\reports\figures\togo_ghi_monthly_profile.png
  Saved plotly plot: c:\Users\Perserverence\Documents\Python_Scripts\reports\figures\togo_tamb_monthly_profile.png


### **6. Cleaning Impact Analysis**

In [7]:
# --- 6. Cleaning Impact Analysis ---
print(f"\n--- Cleaning Impact Analysis for {COUNTRY} ---")

# The two module columns handed to the plot helper.
# NOTE(review): presumably compared against the 'Cleaning' flag — confirm in eda_plots.
module_columns = ('ModA', 'ModB')
ep.plot_cleaning_impact(df_cleaned, *module_columns, COUNTRY)


--- Cleaning Impact Analysis for Togo ---
  Saved plotly plot: c:\Users\Perserverence\Documents\Python_Scripts\reports\figures\togo_cleaning_impact.png


### **7. Correlation & Relationship Analysis**

In [8]:
# --- 7. Correlation & Relationship Analysis ---
print(f"\n--- Correlation & Relationship Analysis for {COUNTRY} ---")

# Heatmap over the irradiance, temperature, humidity, wind and rain columns
heatmap_columns = ['GHI', 'DNI', 'DHI', 'TModA', 'TModB', 'Tamb', 'RH', 'WS', 'Precipitation']
ep.plot_correlation_heatmap(df_cleaned, heatmap_columns, COUNTRY)

# Scatter plots: (first column, second column, title suffix), drawn in this order
scatter_specs = (
    ('WS', 'GHI', "(Wind Speed vs GHI)"),
    ('WSgust', 'GHI', "(Wind Gust vs GHI)"),
    ('RH', 'Tamb', "(Relative Humidity vs Ambient Temperature)"),
    ('RH', 'GHI', "(Relative Humidity vs GHI)"),
)
for first_col, second_col, scatter_suffix in scatter_specs:
    ep.plot_scatter(df_cleaned, first_col, second_col, COUNTRY, title_suffix=scatter_suffix)


--- Correlation & Relationship Analysis for Togo ---
  Saved matplotlib plot: c:\Users\Perserverence\Documents\Python_Scripts\reports\figures\togo_correlation_heatmap.png
  Saved plotly plot: c:\Users\Perserverence\Documents\Python_Scripts\reports\figures\togo_ghi_vs_ws_scatter.png
  Saved plotly plot: c:\Users\Perserverence\Documents\Python_Scripts\reports\figures\togo_ghi_vs_wsgust_scatter.png
  Saved plotly plot: c:\Users\Perserverence\Documents\Python_Scripts\reports\figures\togo_tamb_vs_rh_scatter.png
  Saved plotly plot: c:\Users\Perserverence\Documents\Python_Scripts\reports\figures\togo_ghi_vs_rh_scatter.png


### **8. Wind & Distribution Analysis**

In [9]:
# --- 8. Wind & Distribution Analysis ---
print(f"\n--- Wind & Distribution Analysis for {COUNTRY} ---")

# Wind rose built from the wind speed (WS) and wind direction (WD) columns
ep.plot_wind_rose(df_cleaned, 'WS', 'WD', COUNTRY)

# One histogram per variable, in the same order as before
for hist_column in ('GHI', 'WS', 'RH'):
    ep.plot_distribution(df_cleaned, hist_column, COUNTRY, hist_type='histogram')


--- Wind & Distribution Analysis for Togo ---






  Saved plotly plot: c:\Users\Perserverence\Documents\Python_Scripts\reports\figures\togo_wind_rose.png
  Saved plotly plot: c:\Users\Perserverence\Documents\Python_Scripts\reports\figures\togo_ghi_distribution.png
  Saved plotly plot: c:\Users\Perserverence\Documents\Python_Scripts\reports\figures\togo_ws_distribution.png
  Saved plotly plot: c:\Users\Perserverence\Documents\Python_Scripts\reports\figures\togo_rh_distribution.png


### **9. Bubble Chart**

In [10]:
# --- 9. Bubble Chart ---
print(f"\n--- Bubble Chart for {COUNTRY} ---")

# GHI vs. Tamb, drawn twice: bubble size taken from RH first, then from BP
# (BP is included in case it is relevant as a bubble size).
for size_column in ('RH', 'BP'):
    ep.plot_bubble_chart(df_cleaned, 'Tamb', 'GHI', size_column, COUNTRY)


--- Bubble Chart for Togo ---
  Saved plotly plot: c:\Users\Perserverence\Documents\Python_Scripts\reports\figures\togo_ghi_vs_tamb_rh_bubble.png
  Saved plotly plot: c:\Users\Perserverence\Documents\Python_Scripts\reports\figures\togo_ghi_vs_tamb_bp_bubble.png
