# 0. Imports and inputs

In [6]:
# Import the required libraries
import pandas as pd
import pickle
import matplotlib.pyplot as plt
import calendar
import numpy as np
import matplotlib.dates as mdates
import sys
import os
from os import listdir
from os.path import isfile, join
from dateutil import parser
import re
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import (GradientBoostingRegressor, 
                              RandomForestRegressor, 
                              AdaBoostRegressor, 
                              BaggingRegressor, 
                              ExtraTreesRegressor,
                              HistGradientBoostingRegressor,
                              StackingRegressor,
                              VotingRegressor)
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import seaborn as sns
import matplotlib.colors as mcolors
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score, precision_score, recall_score
from datetime import datetime, timedelta
from matplotlib import animation
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor
from sklearn.impute import SimpleImputer
# from func_defs import *

# %matplotlib ipympl
%matplotlib inline


# Inverter to analyse. The per-inverter sub-folder of both input paths below
# is derived from this value, so changing it here keeps the SunSolve and
# PVsyst inputs pointing at the same inverter.
inverter = '2-1'


# define the plot template (font sizes)
axis_label_size = 15
axis_num_size = 12
text_size = 12
title_size = 22

# define the figure sizes: single plot, 1x2 subplot, long horizontal figure
figure_size = (6, 6)
subplot_size_1_2 = (6, 6)
long_hoz_figsize = (12, 6)

# Root folder of the local repository checkout that holds the input data.
# NOTE(review): machine-specific absolute Windows path - adjust it when the
# notebook is run on another machine.
project_dir = r"C:\Users\z5183876\OneDrive - UNSW\Documents\GitHub\25_09_02_Bomen_bifacial_gain_2021"

# Path of the SunSolve Yield CSV file for the selected inverter
# (despite the "_dir" suffix this is a file path, not a directory).
# Raw f-string: backslashes stay literal, only {...} fields are substituted.
sunsolve_dir = rf"{project_dir}\Data\SunSolve Yield\Per inverter\{inverter}\0_mimic_PVsyst\25_09_09_Bomen_2021_1.csv"
# Path of the PVsyst CSV file for the selected inverter (also a file path).
pvsyst_dir = rf"{project_dir}\Data\PVsyst\per_inv\{inverter}\Optimise\Bomen solar farm 2021 18 0.0 Perez model.csv"

# 1. Compare SunSolve Yield and PVsyst

## 1.1. SunSolve Yield data loading

In [15]:
# Load the SunSolve yield data from the CSV file at `sunsolve_dir`
sunsolve_df = pd.read_csv(sunsolve_dir)

# Print the columns of the SunSolve df
print(sunsolve_df.columns)

# The file stores time as 'Day of year' / 'Hour' / 'Minute' without a year,
# so the year is supplied here (assumed to be 2021 from the file naming).
year = 2021

# Build the timestamp column-wise rather than row by row with
# DataFrame.apply(axis=1): same values, but no Python-level loop per row.
# 'Day of year' is treated as 1-based, hence the "- 1" offset from January 1st.
# astype(int) mirrors the int(...) truncation of the row-wise version.
base_date = pd.Timestamp(f"{year}-01-01")  # January 1st of the specified year
sunsolve_df['Timestamp'] = (
    base_date
    + pd.to_timedelta(sunsolve_df['Day of year'].astype(int) - 1, unit='D')
    + pd.to_timedelta(sunsolve_df['Hour'].astype(int), unit='h')
    + pd.to_timedelta(sunsolve_df['Minute'].astype(int), unit='min')
)

# Scaling factor that converts the SunSolve power from W to MW
sunsolve_scaling = 1e-6
sunsolve_df['Inverter power (MW)'] = sunsolve_df['Power [unit-system] (W)'] * sunsolve_scaling

# Apply a 100 MW clipping to the SunSolve data.
# NOTE(review): confirm that 100 MW is the intended cap for per-inverter data;
# the PVsyst statistics printed later in this notebook peak near 2.3, so this
# clip may never be active.
sunsolve_df['Inverter power clipped (MW)'] = sunsolve_df['Inverter power (MW)'].clip(upper=100)

# Index by timestamp so the frame can be resampled by calendar day
sunsolve_df = sunsolve_df.set_index('Timestamp')

# Energy per record = clipped power x record duration; the 'Period (h)' column
# contains the duration of each data point in hours.
sunsolve_df['Hourly energy (MWh)'] = sunsolve_df['Inverter power clipped (MW)'] * sunsolve_df['Period (h)']

# Columns whose name contains "(W)" are power values; add a matching "(Wh)"
# energy column for each of them (power x record duration).
power_columns = [col for col in sunsolve_df.columns if "(W)" in col]
for power_col in power_columns:
    sunsolve_df[power_col.replace("(W)", "(Wh)")] = sunsolve_df[power_col] * sunsolve_df['Period (h)']

# Daily totals of every "(Wh)" column, converted from Wh to MWh and renamed
# to "(MWh/day)" in a single pass (no renaming inside a loop).
energy_wh_columns = [col for col in sunsolve_df.columns if "(Wh)" in col]
daily_energy_df = (
    sunsolve_df[energy_wh_columns]
    .resample('D')
    .sum()
    .mul(1e-6)  # Convert Wh to MWh
    .rename(columns=lambda name: name.replace("(Wh)", "(MWh/day)"))
)

# Display the results
print("Daily energy values from power measurements:")
print(daily_energy_df.head())

# Daily inverter energy from the clipped power, as a one-column frame
sunsolve_daily = sunsolve_df['Hourly energy (MWh)'].resample('D').sum().to_frame(name='Inverter energy (MWh/day)')

# Put the other daily energy columns next to it (aligned on the daily index)
sunsolve_daily = pd.concat([sunsolve_daily, daily_energy_df], axis=1)

# Print the columns of sunsolve_daily
print("Columns in daily resampled SunSolve data:")
print(sunsolve_daily.columns)

Index(['Day of year', 'Hour', 'Minute', 'Period (h)', 'Flag', 'Message',
       'Solar zenith (degrees)', 'Solar elevation (degrees)',
       'Solar azimuth (degrees)', 'GHI (W/m2)', 'DHI (W/m2)', 'DNI (W/m2)',
       'Diffuse fraction', 'Ambient temperature (°C)', 'Wind velocity (m/s)',
       'Wind direction', 'Opaque cloud fraction', 'Air mass',
       'Precipitable water vapour (cm)', 'Ozone (atm.cm)',
       'Aerosol optical density', 'Relative humidity (%)',
       'Far field albedo (%)', 'Surface pressure (mb)',
       'Module tilt (degrees)', 'Incident angle (degrees)',
       'VF power for Tmod [average] (W/m2)',
       'VF power for Tmod front [average] (W/m2)',
       'VF power for Tmod rear [average] (W/m2)',
       'Module temperature [average] (C)',
       'Power - no mismatch @25C [unit-system] (W)',
       'Power - no mismatch [unit-system] (W)', 'Power [unit-system] (W)',
       'Vmp [average] (V)', 'Imp [average] (A)', 'Imp [minimum] (A)',
       'Voc [average] (V)', 

## 1.2. PVsyst data loading

In [20]:
# Rows 0-9 are skipped as metadata and row 11 as the units row, which leaves
# row 10 as the first remaining line and therefore the header (header=0).
pvsyst_rows_to_skip = [*range(10), 11]

# Read the semicolon-separated PVsyst CSV file at `pvsyst_dir`
PVsyst_results_df = pd.read_csv(
    pvsyst_dir,
    sep=';',
    skiprows=pvsyst_rows_to_skip,
    header=0,
    encoding='latin-1',  # Keep the encoding that worked
    low_memory=False,
    na_values=['', ' ', 'nan', 'NaN'],
)

# Cap the EArray column at 100 and store the result in its own column
earray_clipped = PVsyst_results_df['EArray'].clip(upper=100)
PVsyst_results_df['EArray clipped (MW)'] = earray_clipped

# Summary statistics of the clipped column
print("Statistics of PVsyst EArray clipped (MW):")
pvsyst_clipped_stats = PVsyst_results_df['EArray clipped (MW)'].describe()
print(pvsyst_clipped_stats)

Statistics of PVsyst EArray clipped (MW):
count    8736.000000
mean        0.468229
std         0.695831
min         0.000000
25%         0.000000
50%         0.000000
75%         0.882375
max         2.312800
Name: EArray clipped (MW), dtype: float64
