In [1]:
# --- Setup: Imports and Path Adjustments ---
import os
import sys

import pandas as pd

# Make the project root importable so that the `scripts` package resolves.
# The root is taken to be one level above the notebook's working directory
# (i.e. 'solar-challenge-week1/').
current_notebook_dir = os.getcwd()  # Jupyter has no __file__, so rely on the working directory
# abspath() collapses the trailing '..' so downstream paths and messages are clean.
project_root_dir = os.path.abspath(os.path.join(current_notebook_dir, '..'))
# Guard the insert so re-running this cell does not stack duplicate sys.path entries.
if project_root_dir not in sys.path:
    sys.path.insert(0, project_root_dir)

# Project helper modules (imported from the `scripts` package at the project root)
import scripts.data_cleaning as dc
import scripts.data_profiling as dp
import scripts.eda_plots as ep

# Country label and raw data file name used consistently by the cells below
COUNTRY = "Sierraleone"
DATA_FILE = "sierraleone-bumbuna.csv"

# **1. Load Raw Data**

We begin by loading the raw data for Sierra Leone (Bumbuna). The dataset is expected to be in the `data/` directory at the project root. If the file is missing, the cell stops with an error.

In [2]:
# --- 1. Load Raw Data ---
# Reads the raw CSV from <project root>/data into df_raw and previews it.
print(f"--- Loading Raw Data for {COUNTRY} ---")
data_path = os.path.join(project_root_dir, 'data', DATA_FILE)

try:
    df_raw = pd.read_csv(data_path)
except FileNotFoundError as exc:
    print(f"Error: Raw data file not found at {data_path}. Please check the path and file existence.")
    # Raise a real exception rather than calling sys.exit(): sys.exit() raises
    # SystemExit, which is intended for shutting down an interpreter, whereas a
    # chained FileNotFoundError reports the actual cause with a traceback.
    raise FileNotFoundError(f"Data file not found: {data_path}") from exc
else:
    # Only reached when read_csv completed without raising.
    print(f"Successfully loaded {DATA_FILE}.")

print("\n**Raw Data Head:**")
display(df_raw.head())
print(f"\n**Raw Data Shape:** {df_raw.shape}")

--- Loading Raw Data for Sierraleone ---
Successfully loaded sierraleone-bumbuna.csv.

**Raw Data Head:**


Unnamed: 0,Timestamp,GHI,DNI,DHI,ModA,ModB,Tamb,RH,WS,WSgust,WSstdev,WD,WDstdev,BP,Cleaning,Precipitation,TModA,TModB,Comments
0,2021-10-30 00:01,-0.7,-0.1,-0.8,0.0,0.0,21.9,99.1,0.0,0.0,0.0,0.0,0.0,1002,0,0.0,22.3,22.6,
1,2021-10-30 00:02,-0.7,-0.1,-0.8,0.0,0.0,21.9,99.2,0.0,0.0,0.0,0.0,0.0,1002,0,0.0,22.3,22.6,
2,2021-10-30 00:03,-0.7,-0.1,-0.8,0.0,0.0,21.9,99.2,0.0,0.0,0.0,0.0,0.0,1002,0,0.0,22.3,22.6,
3,2021-10-30 00:04,-0.7,0.0,-0.8,0.0,0.0,21.9,99.3,0.0,0.0,0.0,0.0,0.0,1002,0,0.1,22.3,22.6,
4,2021-10-30 00:05,-0.7,-0.1,-0.8,0.0,0.0,21.9,99.3,0.0,0.0,0.0,0.0,0.0,1002,0,0.0,22.3,22.6,



**Raw Data Shape:** (525600, 19)


# **2. Initial Data Profiling**

Before cleaning, we perform an initial data profiling to understand the raw state of the dataset. This includes summary statistics and a missing value report.

In [3]:
# --- 2. Initial Data Profiling (Raw Data) ---
# Profile the untouched frame first so there is a baseline for the cleaned data.
print("\n--- Initial Data Profiling (Raw Data) ---")

# Summary statistics are requested for the numeric columns only.
raw_numeric = df_raw.select_dtypes(include=['number'])
raw_summary = dp.get_summary_statistics(raw_numeric)
display(raw_summary)

# Missing-value report; the threshold of 5 lists columns with >5% nulls.
dp.print_missing_value_report(df_raw, threshold=5)


--- Initial Data Profiling (Raw Data) ---

--- Summary Statistics ---


Unnamed: 0,GHI,DNI,DHI,ModA,ModB,Tamb,RH,WS,WSgust,WSstdev,WD,WDstdev,BP,Cleaning,Precipitation,TModA,TModB,Comments
count,525600.0,525600.0,525600.0,525600.0,525600.0,525600.0,525600.0,525600.0,525600.0,525600.0,525600.0,525600.0,525600.0,525600.0,525600.0,525600.0,525600.0,0.0
mean,201.957515,116.376337,113.720571,206.643095,198.114691,26.319394,79.448857,1.146113,1.691606,0.363823,133.044668,7.17222,999.876469,0.000967,0.004806,32.504263,32.593091,
std,298.49515,218.652659,158.946032,300.896893,288.889073,4.398605,20.520775,1.239248,1.617053,0.295,114.284792,7.535093,2.104419,0.031074,0.047556,12.434899,12.009161,
min,-19.5,-7.8,-17.9,0.0,0.0,12.3,9.9,0.0,0.0,0.0,0.0,0.0,993.0,0.0,0.0,10.7,11.1,
25%,-2.8,-0.3,-3.8,0.0,0.0,23.1,68.7,0.0,0.0,0.0,0.0,0.0,999.0,0.0,0.0,23.5,23.8,
50%,0.3,-0.1,-0.1,3.6,3.4,25.3,85.4,0.8,1.6,0.4,161.5,6.2,1000.0,0.0,0.0,26.6,26.9,
75%,362.4,107.0,224.7,359.5,345.4,29.4,96.7,2.0,2.6,0.6,234.1,12.0,1001.0,0.0,0.0,40.9,41.3,
max,1499.0,946.0,892.0,1507.0,1473.0,39.9,100.0,19.2,23.9,4.1,360.0,98.4,1006.0,1.0,2.4,72.8,70.4,



--- Missing Value Report ---
Missing values by column:
          Missing Count  Missing Percentage
Comments         525600               100.0

Columns with more than 5% missing values:
          Missing Count  Missing Percentage
Comments         525600               100.0


# **3. Clean Data**

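The raw data is cleaned using the `clean_data` function from `scripts/data_cleaning.py`. This process enforces physical constraints (negative irradiance values are set to NaN), drops rows with missing values in the critical columns, imputes any remaining missing values, and flags statistical outliers (Z-score > 3) for awareness without removing them.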

In [4]:
# --- 3. Clean Data ---
# NOTE(review): the banner text below says src/, but the cleaning module is
# imported from the `scripts` package; the text is kept unchanged here.
cleaning_banner = f"\n--- Cleaning Data for {COUNTRY} using src/data_cleaning.py ---"
print(cleaning_banner)

# clean_data receives a copy, so df_raw itself stays available unmodified.
df_cleaned = dc.clean_data(
    df_raw.copy(),
    country_name=COUNTRY,
)

# Preview the result and report its dimensions.
print("\n**Cleaned Data Head:**")
cleaned_preview = df_cleaned.head()
display(cleaned_preview)
print(f"\n**Cleaned Data Shape:** {df_cleaned.shape}")


--- Cleaning Data for Sierraleone using src/data_cleaning.py ---

--- Cleaning Data for Sierraleone ---

  Enforcing Physical Constraints (Setting impossible values to NaN)...
    Corrected 261135 negative values in 'GHI'.
    Corrected 266352 negative values in 'DNI'.
    Corrected 263128 negative values in 'DHI'.
  Total physically impossible values corrected: 790615

  Dropped 286728 rows due to missing values in critical columns: GHI, DNI, DHI, ModA, ModB.

  Imputing remaining numerical missing values with median...


  return np.nanmean(a, axis, out=out, keepdims=keepdims)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_cleaned['Comments'].fillna('No Comment', inplace=True)
  df_cleaned['Comments'].fillna('No Comment', inplace=True)


  No further numerical missing values required imputation.
  Filled 238872 missing 'Comments' with 'No Comment'.

  Detecting potential statistical outliers (Z-score > 3) for reporting...
  Potential statistical outliers detected (Z-score > 3):
    - GHI: 81 outliers
    - DHI: 415 outliers
    - ModA: 33 outliers
    - ModB: 51 outliers
    - Tamb: 339 outliers
    - WS: 915 outliers
    - WSgust: 1140 outliers
    - Precipitation: 1638 outliers
  Note: These values are flagged for awareness but not automatically removed or capped by this function.

--- Cleaning Summary for Sierraleone ---
  Initial rows: 525600
  Final rows after cleaning: 238872
  Total rows removed/adjusted: 286728

  Missing values after cleaning:
Series([], dtype: int64)

Cleaned data saved to: c:\Users\Perserverence\Documents\Python_Scripts\solar-challenge-week1\scripts\..\data\sierraleone_clean.csv

**Cleaned Data Head:**


Unnamed: 0_level_0,GHI,DNI,DHI,ModA,ModB,Tamb,RH,WS,WSgust,WSstdev,WD,WDstdev,BP,Cleaning,Precipitation,TModA,TModB,Comments
Timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
2021-10-30 07:13:00,17.7,0.0,17.7,20.1,19.2,22.0,98.6,0.2,0.7,0.4,212.8,2.6,1003,0,0.0,22.5,22.9,No Comment
2021-10-30 07:14:00,18.3,0.0,18.3,20.7,19.9,22.0,98.5,0.1,0.7,0.3,221.4,0.4,1003,0,0.0,22.5,22.9,No Comment
2021-10-30 07:15:00,18.9,0.0,18.9,21.4,20.5,22.0,98.6,0.2,0.7,0.5,189.1,6.1,1003,0,0.0,22.6,22.9,No Comment
2021-10-30 07:16:00,19.5,0.0,19.5,22.0,21.1,22.0,98.6,0.4,1.4,0.6,199.4,5.8,1003,0,0.0,22.6,22.9,No Comment
2021-10-30 07:17:00,20.1,0.0,20.1,22.7,21.8,22.0,98.5,0.1,1.1,0.3,192.0,1.0,1003,0,0.0,22.6,22.9,No Comment



**Cleaned Data Shape:** (238872, 18)


# **4. Post-Cleaning Data Profiling**

After cleaning, we re-profile the data to verify the effectiveness of the cleaning process. This includes updated summary statistics and a missing value report.

In [5]:
# --- 4. Post-Cleaning Data Profiling ---
# Same profiling steps as for the raw data, now applied to df_cleaned.
print(f"\n--- Post-Cleaning Data Profiling for {COUNTRY} ---")

# Summary statistics over the numeric columns of the cleaned frame.
print("\n**Summary Statistics (Cleaned Data):**")
cleaned_numeric = df_cleaned.select_dtypes(include=['number'])
cleaned_summary = dp.get_summary_statistics(cleaned_numeric)
display(cleaned_summary)

# A threshold of 0 is used to check whether any nulls remain at all.
print("\n**Missing Value Report (Cleaned Data):**")
dp.print_missing_value_report(df_cleaned, threshold=0)


--- Post-Cleaning Data Profiling for Sierraleone ---

**Summary Statistics (Cleaned Data):**

--- Summary Statistics ---


Unnamed: 0,GHI,DNI,DHI,ModA,ModB,Tamb,RH,WS,WSgust,WSstdev,WD,WDstdev,BP,Cleaning,Precipitation,TModA,TModB
count,238872.0,238872.0,238872.0,238872.0,238872.0,238872.0,238872.0,238872.0,238872.0,238872.0,238872.0,238872.0,238872.0,238872.0,238872.0,238872.0,238872.0
mean,445.471047,256.390219,251.600694,449.875422,431.288071,29.144408,68.701027,1.56861,2.297283,0.473658,177.697873,10.755592,999.467556,0.00206,0.001615,42.787084,42.607726
std,294.381895,263.173538,141.632581,300.153437,288.6706,4.202268,22.136594,1.225995,1.531143,0.265979,105.684921,8.054122,2.354675,0.045337,0.030674,11.545187,10.983106
min,0.0,0.0,0.0,0.0,0.0,12.4,9.9,0.0,0.0,0.0,0.0,0.0,993.0,0.0,0.0,11.7,12.0
25%,192.2,3.1,143.9,188.3,180.1,25.9,52.9,0.4,1.4,0.4,76.9,5.2,998.0,0.0,0.0,33.8,34.0
50%,406.1,166.3,242.4,403.9,387.8,29.3,72.9,1.5,2.4,0.5,216.1,10.6,1000.0,0.0,0.0,42.7,43.1
75%,687.4,485.6,348.9,703.8,671.8,32.3,85.6,2.4,3.4,0.6,254.7,15.2,1001.0,0.0,0.0,52.2,51.8
max,1499.0,946.0,892.0,1507.0,1473.0,39.9,100.0,19.2,23.9,4.1,360.0,98.4,1006.0,1.0,2.4,72.8,70.4



**Missing Value Report (Cleaned Data):**

--- Missing Value Report ---
No missing values found in the DataFrame.


# **5. Time Series Analysis**

We analyze the cleaned data over time, focusing on key variables such as GHI, DNI, DHI, and Tamb. This includes plotting time series and average daily/monthly profiles.

In [6]:
# --- 5. Time Series Analysis ---
print(f"\n--- Time Series Analysis for {COUNTRY} ---")

# (columns, title suffix) for each time-series figure, in plotting order:
# the irradiance components first, then ambient temperature.
time_series_specs = [
    (['GHI', 'DNI', 'DHI'], "Irradiance"),
    ('Tamb', "Ambient Temperature"),
]
for series_columns, series_suffix in time_series_specs:
    ep.plot_time_series(df_cleaned, series_columns, COUNTRY, title_suffix=series_suffix)

# Average daily (hourly) profile first, then the average monthly profile,
# both for GHI and Tamb. Each call gets its own fresh list of column names.
profile_columns = ('GHI', 'Tamb')
for plot_profile in (ep.plot_daily_average_profile, ep.plot_monthly_average_profile):
    plot_profile(df_cleaned, list(profile_columns), COUNTRY)


--- Time Series Analysis for Sierraleone ---
  Saved plotly plot: c:\Users\Perserverence\Documents\Python_Scripts\reports\figures\sierraleone_ghi_time_series.png
  Saved plotly plot: c:\Users\Perserverence\Documents\Python_Scripts\reports\figures\sierraleone_dni_time_series.png
  Saved plotly plot: c:\Users\Perserverence\Documents\Python_Scripts\reports\figures\sierraleone_dhi_time_series.png
  Saved plotly plot: c:\Users\Perserverence\Documents\Python_Scripts\reports\figures\sierraleone_tamb_time_series.png
  Saved plotly plot: c:\Users\Perserverence\Documents\Python_Scripts\reports\figures\sierraleone_ghi_daily_profile.png
  Saved plotly plot: c:\Users\Perserverence\Documents\Python_Scripts\reports\figures\sierraleone_tamb_daily_profile.png
  Saved plotly plot: c:\Users\Perserverence\Documents\Python_Scripts\reports\figures\sierraleone_ghi_monthly_profile.png
  Saved plotly plot: c:\Users\Perserverence\Documents\Python_Scripts\reports\figures\sierraleone_tamb_monthly_profile.png


### **6. Cleaning Impact Analysis**

In [7]:
# --- 6. Cleaning Impact Analysis ---
impact_header = f"\n--- Cleaning Impact Analysis for {COUNTRY} ---"
print(impact_header)

# The two module-reading columns handed to the cleaning-impact plot.
# NOTE(review): presumably compared against the 'Cleaning' flag — confirm in scripts/eda_plots.py.
module_a_col, module_b_col = 'ModA', 'ModB'
ep.plot_cleaning_impact(df_cleaned, module_a_col, module_b_col, COUNTRY)


--- Cleaning Impact Analysis for Sierraleone ---
  Saved plotly plot: c:\Users\Perserverence\Documents\Python_Scripts\reports\figures\sierraleone_cleaning_impact.png


### **7. Correlation & Relationship Analysis**

In [8]:
# --- 7. Correlation & Relationship Analysis ---
print(f"\n--- Correlation & Relationship Analysis for {COUNTRY} ---")

# Correlation heatmap across irradiance, temperature, humidity, wind and rain columns.
heatmap_columns = [
    'GHI', 'DNI', 'DHI',
    'TModA', 'TModB', 'Tamb',
    'RH', 'WS', 'Precipitation',
]
ep.plot_correlation_heatmap(df_cleaned, heatmap_columns, COUNTRY)

# Scatter plots, one per (first column, second column, title suffix) entry, in order.
scatter_specs = (
    ('WS', 'GHI', "(Wind Speed vs GHI)"),
    ('WSgust', 'GHI', "(Wind Gust vs GHI)"),
    ('RH', 'Tamb', "(Relative Humidity vs Ambient Temperature)"),
    ('RH', 'GHI', "(Relative Humidity vs GHI)"),
)
for first_col, second_col, scatter_suffix in scatter_specs:
    ep.plot_scatter(df_cleaned, first_col, second_col, COUNTRY, title_suffix=scatter_suffix)


--- Correlation & Relationship Analysis for Sierraleone ---
  Saved matplotlib plot: c:\Users\Perserverence\Documents\Python_Scripts\reports\figures\sierraleone_correlation_heatmap.png
  Saved plotly plot: c:\Users\Perserverence\Documents\Python_Scripts\reports\figures\sierraleone_ghi_vs_ws_scatter.png
  Saved plotly plot: c:\Users\Perserverence\Documents\Python_Scripts\reports\figures\sierraleone_ghi_vs_wsgust_scatter.png
  Saved plotly plot: c:\Users\Perserverence\Documents\Python_Scripts\reports\figures\sierraleone_tamb_vs_rh_scatter.png
  Saved plotly plot: c:\Users\Perserverence\Documents\Python_Scripts\reports\figures\sierraleone_ghi_vs_rh_scatter.png


### **8. Wind & Distribution Analysis**

In [9]:
# --- 8. Wind & Distribution Analysis ---
print(f"\n--- Wind & Distribution Analysis for {COUNTRY} ---")

# Wind rose built from the wind speed (WS) and wind direction (WD) columns.
speed_col, direction_col = 'WS', 'WD'
ep.plot_wind_rose(df_cleaned, speed_col, direction_col, COUNTRY)

# One histogram per variable, in this order.
for histogram_col in ('GHI', 'WS', 'RH'):
    ep.plot_distribution(df_cleaned, histogram_col, COUNTRY, hist_type='histogram')


--- Wind & Distribution Analysis for Sierraleone ---






  Saved plotly plot: c:\Users\Perserverence\Documents\Python_Scripts\reports\figures\sierraleone_wind_rose.png
  Saved plotly plot: c:\Users\Perserverence\Documents\Python_Scripts\reports\figures\sierraleone_ghi_distribution.png
  Saved plotly plot: c:\Users\Perserverence\Documents\Python_Scripts\reports\figures\sierraleone_ws_distribution.png
  Saved plotly plot: c:\Users\Perserverence\Documents\Python_Scripts\reports\figures\sierraleone_rh_distribution.png


### **9. Bubble Chart**

In [10]:
# --- 9. Bubble Chart ---
print(f"\n--- Bubble Chart for {COUNTRY} ---")

# GHI vs. Tamb, drawn twice: bubble size = RH first, then bubble size = BP
# (the BP variant is only useful if BP is relevant as a bubble size).
for bubble_size_col in ('RH', 'BP'):
    ep.plot_bubble_chart(df_cleaned, 'Tamb', 'GHI', bubble_size_col, COUNTRY)


--- Bubble Chart for Sierraleone ---
  Saved plotly plot: c:\Users\Perserverence\Documents\Python_Scripts\reports\figures\sierraleone_ghi_vs_tamb_rh_bubble.png
  Saved plotly plot: c:\Users\Perserverence\Documents\Python_Scripts\reports\figures\sierraleone_ghi_vs_tamb_bp_bubble.png
