In [1]:
from sys import path
if '../..' not in path:
    path.insert(0, '../..')

In [4]:
from _library.utils import SYSTEM_NAMES_FULL, load_datasets, weighted_knn, find_outliers
from os import path, makedirs
from datetime import datetime
import numpy as np
import pandas as pd

In [5]:
# Select the main folder: `%cd` is an IPython magic that changes the kernel's working
# directory for every cell that follows.
# NOTE(review): absolute, machine-specific path -- adjust it when running elsewhere.
%cd /mnt/data/vieri/projects/SAMPLE/

# Visualize names of PV systems
print(SYSTEM_NAMES_FULL)
# Index ruler: position of each name in the printed list
# --- 0 ---------- 1 --------- 2 ------ 3 ------ 4 --------- 5 --------- 6 -------- 7 ---

/mnt/data/vieri/projects/SAMPLE
['Binetto 1', 'Binetto 2', 'Cantore', 'Emi', 'Soleto 1', 'Soleto 2', 'Galatina', 'Verone']


# Selecting the PV system

In [None]:
# NOTE(review): only `SYSTEM_NAMES_FULL` is imported at the top of this notebook, yet this
# cell and several later ones refer to `SYSTEM_NAMES`; on a fresh kernel that is a NameError.
# Binding the alias here assumes the two lists are index-aligned -- confirm against
# `_library.utils`.
SYSTEM_NAMES = SYSTEM_NAMES_FULL

# Pick the PV system by its position in the list printed above (1 --> 'Binetto 2')
system_name = SYSTEM_NAMES[1]
print(f"PV SYSTEM --> {system_name}")

# Loading dataset

In [None]:
# Loading the datasets of the selected PV system. The call returns, in order: the system
# path, the inverter data and names, the raw irradiance data, and the string-inverter
# data and names.
# NOTE(review): later cells index `inv_data` / `string_inv_data` with the entries of
# `inv_names` / `string_inv_names` and treat the values as DataFrames -- presumably
# dicts of DataFrames; confirm against `load_datasets`.
system_path, inv_data, inv_names, raw_irr_data, string_inv_data, string_inv_names = load_datasets(system_name, verbose=True)

# Quick data exploration

In [None]:
pd.set_option('display.max_rows', 1000)

# Column names reused throughout this exploratory cell
dt_col = "Date/Time [Europe/Rome]"
irr_col = "Irradiance [Average(watts-per-meter-sq)]"

# TASK: Descriptive stats (every column except the first one, i.e. the timestamp)
df = inv_data[inv_names[0]]
for col in df.columns[1:]:
    stats = df[col].describe()
    stats["median"] = df[col].median()
    display(stats)

# TASK: problematic values
# Select the column first, so the filters below act on the generated energy and not on
# the loop variable left over from the statistics loop above.
col = "Generated Energy [Interval Sum(kilowatt-hours)]"
prob = df[df[col] < 0]
print(f"NEGATIVE VALUES ({round((len(prob)/len(df))*100, 2)}%):{len(prob)} --> COL: {col.split('[')[0]}")
prob = df[df[col] >= 10000]
print(f"VALUES >= 10000 ({round((len(prob)/len(df))*100, 2)}%):{len(prob)} --> COL: {col.split('[')[0]}")

# TASK: Check whether the columns are equal (mean/std of the absolute difference)
pair_to_check = [
    ("Generated Energy [Interval Sum(kilowatt-hours)]", "PV Energy [Interval Sum(kilowatt-hours)]"),
    ("Generated Power [Average(watts)]", "PV Power [Average(watts)]"),
    ("Generated Power [Average(watts)]", "DC Gen. Power [Average(watts)]"),
    ("DC Gen. Power [Average(watts)]", "PV Power [Average(watts)]")
]
for pair in pair_to_check:
    diff = np.abs(df[pair[0]] - df[pair[1]])
    diff = diff.dropna()
    print("\nCOMPARISON:", "- ".join([item.split('[')[0] for item in pair]))
    print("MEAN:", np.mean(diff))
    print("STD:", np.std(diff))

# TASK: Check availability of insolation (the column is only inspected when it exists)
col = "Insolation [Interval Sum(kilowatt-hours-per-meter-sq)]"
if col in df.columns:
    prob = df[col].dropna()
    print(f"\nVALUES AVAILABLE ({round((len(prob)/len(df))*100, 2)}%): {len(prob)} --> COL: {col.split('[')[0]}")

# ANALYSE outliers of voltage: phase AN below 210 V, observed after 07:00
cols = ["Voltage AN [Average(volts)]", "Voltage BN [Average(volts)]", "Voltage CN [Average(volts)]", "Voltage [Average(volts)]"]
cond = df[cols[0]] < 210
cond2 = df[dt_col].dt.time > pd.to_datetime("07:00").time()
filtered_df = df.loc[cond & cond2, [dt_col] + cols]
print(f"\nVOLTAGE (AN/BN/CN) OUTLIERS ({round((len(filtered_df)/len(df))*100,2)}%):{len(filtered_df)}\n")

# Check out the cut-off point for binetto 1 (due to corrupt values of the irradiance):
# daily average irradiance from 4 days before to 3 days after the cut-off date.
cutoff_date = pd.to_datetime("2020-01-14").date()
print("-"*20, "Checking out cutoff date (due to corrupt values of the irradiance)", "-"*20)
observation_dates = df[dt_col].dt.date  # computed once, reused for every day below
for delta in range(-4, 4):
    date = cutoff_date + pd.Timedelta(delta, unit="day")
    if delta < 0:
        print(f"DATE: {date} (- {-delta} days)")
    elif delta == 0:
        print(f"DATE: {date} --> CUT-OFF DAY")
    else:
        print(f"DATE: {date} (+ {delta} day(s))")

    # Series.mean skips NaN and yields NaN (without a warning) for a day with no data
    avg = np.round(df.loc[observation_dates == date, irr_col].mean(), 2)
    print(f"Irradiance (AVG): {avg} w/mq\n")

# Check integer columns: a column qualifies when none of its non-NaN values has a
# fractional part. `integer_columns` is reused by the transformation cell further down.
integer_columns = set()
print(30*"-", "Checking the type of the columns", 30*"-")
for k, col in enumerate(df.columns[1:]):
    raw_values = df[col]
    available_values = raw_values.dropna()
    # Vectorised test (x % 1 != 0) instead of calling `is_integer()` value by value
    n_non_integer = int((available_values % 1 != 0).sum())
    print(f"Checking the column ({k+1}): {col.upper()}")
    if n_non_integer > 0:
        print(f"--> Number of non integer values (that are not NaN values): {n_non_integer} "\
              f"({round((n_non_integer/len(raw_values))*100,2)} %)\n")
    else:
        print("--> All integer values in this column!\n")
        integer_columns.add(col)
print("INTEGER COLUMNS TO CAST:\n")
# Sorted so the listing is identical from one run to the next (sets have no order)
for idk, col in enumerate(sorted(integer_columns)):
    print(f"{(idk + 1)}) {col}\n")

# ------ FINDINGS --------
# 1) DISCARD
# ---> CELL TEMP (SYS & AMB) --> Mean -40 --> STD: 0.0
# ---> Insolation --> data available only for the 3.5 % of the data (8K out of 228K)
# ---> Discard PV Energy (as 'Generated Energy' = 'PV Energy')
# ---> Discard PV Power (as 'Generated Power' = 'PV power')
# 3) OUTLIERS TO CLEAN
# ---> (Generated Energy) --> around 0.02% negative values + other outliers (> 50000)
# 4) OUTLIERS TO INVESTIGATE BETTER:


# Merge the two columns of ambient temperature

In [None]:
# TASK: Merge the two columns of the ambient temperatures into a single column
dt_col = "Date/Time [Europe/Rome]"
merged_col = "Ambiental Temp. (celsius)"
cols = ["Ambient Temp. [Average(celsius)]", "Ambient Temp. [Average(celsius)]_SYS"]

# Values: one temperature series from the raw irradiance data, one from the main inverter
amb_source = raw_irr_data[[dt_col, cols[0]]].dropna().round(decimals=3)
sys_source = inv_data[inv_names[0]][[dt_col, cols[1]]].dropna().round(decimals=3)

# Merge the two columns (outer join: keep the timestamps available in only one source)
merged_temp = amb_source.merge(sys_source, how="outer", on=dt_col, indicator=True)
display(merged_temp.groupby(by="_merge").count()[[dt_col]])

# Timestamps where both sources are available but disagree
diff_obs = (merged_temp[cols[1]] - merged_temp[cols[0]]).abs().dropna()
mismatch_timestamp = merged_temp.loc[diff_obs[diff_obs > 0].index, dt_col]

# Fill the missing values with the value available in the other column.
# Assigning the result (instead of an in-place `fillna` on a column selection) is the
# form that still updates the frame under pandas Copy-on-Write.
merged_temp[merged_col] = merged_temp[cols[0]].fillna(merged_temp[cols[1]])

# Fix the mismatches (differences noted by the author as below 1°C): use the row average
is_mismatch = merged_temp[dt_col].isin(mismatch_timestamp)
print("MISMATCHES: ", int(is_mismatch.sum()))
merged_temp.loc[is_mismatch, merged_col] = merged_temp.loc[is_mismatch, cols].mean(axis=1)

# Discard fully duplicated rows, then keep only the useful columns
merged_temp = merged_temp.drop_duplicates()[[dt_col, merged_col]]

# Timestamps that still appear more than once carry different temperatures: replace each
# group with one observation holding the averaged value (rounded to 2 decimals).
# A single groupby replaces the row-by-row `DataFrame.append` (removed in pandas 2.0) and
# avoids dropping rows by labels that the append had already renumbered.
is_duplicated = merged_temp[dt_col].duplicated(keep=False)
if is_duplicated.any():
    averaged = (
        merged_temp[is_duplicated]
        .groupby(dt_col, as_index=False)[merged_col]
        .mean()
        .round({merged_col: 2})
    )
    for timestamp, averaged_value in zip(averaged[dt_col], averaged[merged_col]):
        print("\nTIMESTAMP: ", timestamp)
        print(f"--> An averaged observation has been created: {averaged_value}")

    merged_temp = (
        pd.concat([merged_temp[~is_duplicated], averaged], ignore_index=True)
        .sort_values(by=dt_col)
        .reset_index(drop=True)
    )

# Merge the merged column with the main dataframe and drop the superseded source column
# NOTE(review): the original drop listed the '_SYS' column twice; if a second, different
# column was meant to be removed as well, add it here.
inv_data[inv_names[0]] = inv_data[inv_names[0]].merge(merged_temp, how="inner", on=dt_col)
inv_data[inv_names[0]].drop(columns=[cols[1]], inplace=True)
inv_data[inv_names[0]].info()

# Check uniqueness of timestamps

In [None]:
def check_datetime_uniqueness(df, name=None):
    """Find the observations to drop so that every timestamp appears only once.

    For each timestamp of the "Date/Time [Europe/Rome]" column that occurs more than once:

    * STRAT 1 -- observations whose remaining columns are all equal (NaN is treated
      as 0 for the comparison) are grouped, and all but the lowest index of each
      group are marked for removal;
    * STRAT 2 -- if more than one observation is still left for that timestamp
      (same timestamp, different values), all but the first remaining one are
      marked for removal as well.

    Progress and decisions are printed along the way.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding a "Date/Time [Europe/Rome]" column; the index labels are
        expected to be unique.
    name : str, optional
        Label of the analysed inverter, only used in the "no duplicates" message.

    Returns
    -------
    list
        Sorted index labels of the observations to discard; an empty list when no
        timestamp is duplicated (so the result can always be passed to
        ``DataFrame.drop(index=...)`` and to ``len``).
    """
    dt_col = "Date/Time [Europe/Rome]"

    # Check duplicates
    duplicated_datetime = df[df[dt_col].duplicated(keep=False)]

    if len(duplicated_datetime) == 0:
        target = f" for {name}" if name is not None else ""
        print(f"Oh, that's good. No duplicated observations have been found{target}!")
        return []

    # Extract unique datetimes
    unique_duplicated_dt = [pd.to_datetime(value) for value in duplicated_datetime[dt_col].unique()]
    print(f"DUPLICATE DATES: {len(unique_duplicated_dt)}:"\
          f"\n{[dt.strftime('%Y-%m-%d, %H:%M') for dt in unique_duplicated_dt]}"\
          f"\nTotal observations: {len(duplicated_datetime)}")

    to_discard = set()

    for timestamp in unique_duplicated_dt:
        daily_duplicated_indexes = df[df[dt_col] == timestamp].index
        # NaN is replaced by 0 once, so that missing values compare as equal
        duplicated_obs = df.loc[daily_duplicated_indexes, :].drop(columns=dt_col).fillna(0)
        print(f"\nAnalysing {timestamp} - (observations: {len(daily_duplicated_indexes)})")
        print("-"*120)

        # --------------------------------------- STRAT 1 --------------------------------------------
        equal_observations = set()
        for index_obs in daily_duplicated_indexes:
            other_duplicated_obs = duplicated_obs.drop(index_obs, axis=0)

            # Compare this observation with every other one, column by column
            check = other_duplicated_obs.eq(duplicated_obs.loc[index_obs, :], axis="columns")
            rows_equal = check.all(axis=1)
            find_equal_obs = rows_equal[rows_equal].index.tolist()

            print(f"\n--> Analysing idk: {index_obs}...")
            if find_equal_obs:
                print(f"    Equal observation(s) have been found: {find_equal_obs}")
                # The group includes the observation itself and is sorted, so the same
                # tuple is produced whichever member of the group is being analysed.
                equal_observations.add(tuple(sorted([index_obs] + find_equal_obs)))
            else:
                print("    No equal observation(s) have been found for the timestamp ")

        # Keep the first equal and discard the other equal observations
        duplicated_diff_obs = sorted({idx for group in equal_observations for idx in group[1:]})

        if len(duplicated_diff_obs) != 0:
            to_discard.update(duplicated_diff_obs)
            print(F"\n--> OK, discarding the identical observations ({len(duplicated_diff_obs)}/{len(daily_duplicated_indexes)}): {duplicated_diff_obs}")

        # --------------------------------------- STRAT 2 --------------------------------------------

        # Highlight potential issues: same timestamp not equal values
        remaining_duplicated_obs = sorted(set(daily_duplicated_indexes) - set(duplicated_diff_obs))

        if len(remaining_duplicated_obs) > 1:
            print(f"\n\n--> Issue, duplicated observations with different values have been found! --> {remaining_duplicated_obs}")
            # Keep the first remaining observation, discard the others
            obs_to_discard = set(remaining_duplicated_obs[1:])
            print("    STRAT 2 (TO DISCARD):", obs_to_discard)
            to_discard.update(obs_to_discard)

    item_to_delate = sorted(to_discard)
    print("\n", 80*"-")
    print(f"TO DELATE ({len(item_to_delate)} out of {len(duplicated_datetime)}):")
    print(item_to_delate, "\n", 80*"-")

    return item_to_delate
#-----------------------------------
# Collect, for the main inverter frame, the index labels of the duplicated-timestamp
# observations; the transformation cell below drops them.
item_to_delate = check_datetime_uniqueness(inv_data[inv_names[0]])

# CARRY OUT SOME TRANSFORMATIONS
1. Discard duplicated observations (i.e., time changes: 2 AM - 3 AM)
2. Discard *unnecessary columns*
3. [only for Binetto 1] Discard observations before a cut-off date due to corrupted irradiance values.
4. Cast columns (from *'float64'* to *'int64'*)
5. *Value transformation* (from 'watt' to 'kilowatt')
6. Rename columns to remove the unnecessary text (i.e., "Average")
7. Reorder columns to *improve readability*

In [None]:
main_name = inv_names[0]

# TASK A: Drop duplicated indexes
# `check_datetime_uniqueness` may hand back None when no timestamp is duplicated;
# treat that as "nothing to drop" instead of failing in `drop` / `len`.
rows_to_drop = item_to_delate if item_to_delate is not None else []
print(f"MAIN INVERTER: {system_name.upper()}:System")
inv_data[main_name] = inv_data[main_name].drop(index=rows_to_drop).reset_index(drop=True)
print(f"A) Some ({len(rows_to_drop)}) duplicated rows (i.e., due to the time changes) have been DISCARTED\n")

# TASK B: DISCARD COLUMN
columns_to_discard = [
    "Cell Temp. [Average(celsius)]_SYS",
    "Cell Temp. [Average(celsius)]_AMB",
    "PV Energy [Interval Sum(kilowatt-hours)]",
    "PV Power [Average(watts)]"
]
if system_name == SYSTEM_NAMES[0]:
    columns_to_discard.append("Insolation [Interval Sum(kilowatt-hours-per-meter-sq)]")
inv_data[main_name] = inv_data[main_name].drop(columns=columns_to_discard)
print(f"B) Some columns ({len(columns_to_discard)}) have been DISCARTED. \n   {[item.split('[')[0].rstrip() for item in columns_to_discard]}\n")

# TASK C: Keep only the observations AFTER the cutoff date (the earlier ones carry
# corrupted irradiance values)
if system_name == SYSTEM_NAMES[0]:
    n_before = len(inv_data[main_name])
    cond = inv_data[main_name]["Date/Time [Europe/Rome]"].dt.date > cutoff_date
    # `reset_index` returns a new frame, so the column assignments below do not write
    # into a boolean-filtered slice of the previous one.
    inv_data[main_name] = inv_data[main_name].loc[cond].reset_index(drop=True)
    ratio_discarted = round(((n_before - len(inv_data[main_name])) / n_before) *100,2)
    print(f"C) [only for Binetto 1] A period covered by the dataset ({ratio_discarted} %) has been DISCARTED due to currupted irradiance values. \n   "\
          f"(observations after {cutoff_date} have been kept)\n")

# TASK D: Value transformation (from Watt to kilowatt) --> for improving the coherence and readability
watt_cols = ["DC Gen. Power [Average(watts)]", "Generated Power [Average(watts)]"]
for w_col in watt_cols:
    inv_data[main_name][w_col] = inv_data[main_name][w_col] / 1000
inv_data[main_name] = inv_data[main_name].rename(
    columns={w_col: w_col.replace("watt", "kilowatt") for w_col in watt_cols}
)
print(f"D) The watts ({len(watt_cols)} columns) have been TRANSFORMED into kilowatts\n   "\
      f"{[item.split('[')[0].rstrip() for item in watt_cols]}\n")

# TASK E: Cast variables (i.e., from 'float64' to 'int64')
# `integer_columns` was computed during the exploration, before columns were dropped or
# renamed; keep only the names that still exist so the cast cannot hit a missing column.
columns_to_cast = sorted(
    (integer_columns - set(columns_to_discard) - set(watt_cols)) & set(inv_data[main_name].columns)
)
if len(columns_to_cast) > 0:
    for int_col in columns_to_cast:
        # Nullable 'Int64' keeps the missing values that plain 'int64' cannot hold
        inv_data[main_name][int_col] = inv_data[main_name][int_col].astype("Int64")
    print(f"E) Some columns ({len(columns_to_cast)}) have been CAST (from 'float64' to 'int64').\n   "\
          f"{[item.split('[')[0].rstrip() for item in columns_to_cast]}\n")

# TASK F: RENAME COLUMNS (e.g. 'Voltage [Average(volts)]' --> 'Voltage (volts)')
col_names = inv_data[main_name].columns
col_names = [col.replace("[Average", "").replace("[Interval Sum", "").rstrip(']') for col in col_names]
col_names[0] = "Date/Time (Europe/Rome)"
inv_data[main_name].columns = col_names
print("F) All the columns have been RENAMED to improve the readability\n")

# TASK G: REORDER COLUMNS
# `reindex` keeps only the listed columns; a listed column that is missing from the
# frame is created and filled with NaN.
new_col_order = [
    "Date/Time (Europe/Rome)",
    "Voltage AN (volts)",
    "Voltage BN (volts)",
    "Voltage CN (volts)",
    "Voltage (volts)",
    "Current (amps)",
    "Generated Power (kilowatts)",
    "Generated Energy (kilowatt-hours)",
    "DC Voltage (volts)",
    "DC Gen. Power (kilowatts)",
    "Ambiental Temp. (celsius)",
    "Irradiance (watts-per-meter-sq)"
]
inv_data[main_name] = inv_data[main_name].reindex(columns = new_col_order)
print("G) The columns have been SORTED to improve the readability.\n")

# VISUALIZE OUTCOMES
print("-"*40, "THE OUTCOME","-"*40)
inv_data[main_name].info()

# Outliers

## VARIABLE: *Generated Energy (kWh)*

In [None]:
def outlier_correction_gen_energy(df):
    """Replace the negative 'Generated Energy (kilowatt-hours)' values of `df`.

    Only one detection strategy is active: negative values (STRAT 1). The checks
    against a theoretical maximum (kWp * h) and the z-score based detection that the
    author sketched as STRAT 2 / STRAT 3 are currently disabled.

    Every detected outlier is shown together with its neighbourhood (3 rows before
    and 3 rows after, clipped at the edges of the frame) and then overwritten with
    the value returned by `weighted_knn` (described by the author as a weighted
    average of its neighbours).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with the columns "Date/Time (Europe/Rome)" and
        "Generated Energy (kilowatt-hours)". It is modified in place.
        NOTE(review): the neighbourhood is selected by label arithmetic, so an
        integer, sorted index (e.g. after `reset_index`) is assumed.

    Returns
    -------
    pandas.DataFrame
        The same `df` object, with the outliers corrected.
    """
    energy_col = "Generated Energy (kilowatt-hours)"
    columns = ["Date/Time (Europe/Rome)", energy_col]

    # STRAT 1: Find negative values
    negative_values = df[df[energy_col] < 0]
    print("-"*20, f"NEGATIVE kWh VALUES ({len(negative_values)})", "-"*20)

    idk_outliers = negative_values.index.tolist()

    if len(idk_outliers) == 0:
        print("Oh, no extreme outliers found. That's good.\n")
    else:
        display(negative_values)

        # Full list of outlier indexes handed to `weighted_knn`
        # (presumably so that other outliers are not used as neighbours -- confirm)
        list_outliers = idk_outliers.copy()

        # Compute and assign the estimated value (weighted average value from its neighbours)
        number_neighbours = 6
        half_window = number_neighbours // 2
        for idk, idk_outlier in enumerate(idk_outliers):

            # Visualize the outlier and its neighbours
            print(f"\nOutlier {idk + 1}/{len(idk_outliers)} (idk: {idk_outlier}) "\
                  f"and its neighborhood [{half_window} || {half_window}]")
            # A label slice is clipped at the frame boundaries, whereas selecting an
            # explicit range of labels raises KeyError for an outlier near either edge.
            display(df.loc[idk_outlier - half_window: idk_outlier + half_window, columns])

            # Correct the outlier
            computed_value = weighted_knn(df[columns], idk_outlier, list_outliers, energy_col,
                                          K = number_neighbours, verbose=True)
            df.loc[idk_outlier, energy_col] = computed_value
    print(20*"-","FINISHED",20*"-")
    return df
# ----------------------------------------
# Disabled input of the (currently unused) theoretical-limit strategy:
#nominal_max_power = 997.92 # SOURCE: Binetto X - single-line diagram ("Schema unifilare")
# Correct the generated-energy outliers of the main inverter frame and store the result back
inv_data[inv_names[0]] = outlier_correction_gen_energy(inv_data[inv_names[0]])

# FINDINGS
# STRAT 1: It is perfectly legitimate (negative values are not valid)
# STRAT 2: Theoretical limit --> the variable is an "interval sum" of the energy generated.
# ---> Possible issue --> does it depend on the sampling interval? Is it really kWh?

## VARIABLE: **Voltage  & Voltage(AN/BN/CN)**

In [None]:
pd.set_option('display.max_rows', 500)

# Frame under analysis and the AC-voltage columns to inspect
df = inv_data[inv_names[0]]
dt_label = "Date/Time (Europe/Rome)"
vac_label = "Voltage (volts)"
columns = [dt_label, vac_label, "Voltage AN (volts)", "Voltage BN (volts)", "Voltage CN (volts)"]

# 1A) Detect extreme outliers (threshold of 5 passed to the project helper)
extreme_vac_outlier = find_outliers(df[columns], threshold=5, verbose=True)

if len(extreme_vac_outlier) > 0:
    # a) Hour of the day (two-digit string) in which each outlier was observed
    extreme_vac_outlier["Hour"] = extreme_vac_outlier[dt_label].dt.strftime("%H")

    # b) Flag the outliers whose voltage is exactly zero
    extreme_vac_outlier["Zero Vac values"] = np.where(extreme_vac_outlier[vac_label] == 0, "Yes", "No")

    # Number of outliers per hour, split into zero / non-zero values
    grouped_vac_outlier = extreme_vac_outlier.groupby(by=["Hour", "Zero Vac values"])[dt_label].count()
    display(grouped_vac_outlier.to_frame())

    zero_values = extreme_vac_outlier[extreme_vac_outlier[vac_label] == 0]
    zero_share = round((len(zero_values) / len(extreme_vac_outlier)) * 100)
    print(f"Extreme outliers ({len(extreme_vac_outlier)}) of Vac have found "\
          f"(zero values: {zero_share} %)")

    # Identify the index of the outliers
    idk_outliers = extreme_vac_outlier.index.tolist()
else:
    print("Oh, no extreme outliers found. That's good.\n")

# ---- FINDINGS -----
# The outliers are many (i.e., around 1K) and they are scattered throughout the day
# (even though most of the outliers fall in the morning hours).
# The zero values are around 33 % of these outliers.
# Considering this behaviour and the importance of these variables (AC voltages) for the
# future ML model, the values are left untouched.

# STRATEGY --> Do nothing

## VARIABLE: **Irradiance (watts-per-meter-sq)**

In [None]:
df = inv_data[inv_names[0]]
irr = df["Irradiance (watts-per-meter-sq)"]

# Count zero values and the available non-zero values (NaN excluded)
irr_zero_values = irr[irr == 0]
available_pos_values = irr[irr != 0].dropna()
print(f"\nZERO VALUES: {len(irr_zero_values)} ({ round((len(irr_zero_values)/len(irr))*100, 2) } %)")
print(f"AVAILABLE POSITIVE VALUES: {len(available_pos_values)} ({ round((len(available_pos_values)/len(irr))*100, 2) } %)\n")
print("-" * 20, "DESCRIPTIVE STATISTICS", "-" * 20)
display(irr.describe())

# Investigate high values
emp_threshold = 1300
likely_outliers = df[irr > emp_threshold]
print("\n","-" * 20, f"PROBABLE OUTLIERS (> {emp_threshold} w/mq)", "-" * 20)
display(likely_outliers)

# Visualize the neighbours of a possible outlier (only when at least one exists)
if likely_outliers.empty:
    print(f"No values beyond {emp_threshold} w/mq have been found.")
else:
    idk = likely_outliers.index[0]
    window = 2
    # Translate the index label into a row position before slicing with `iloc`, and
    # clip the start at 0 so a negative bound cannot wrap around to the end of the frame.
    position = df.index.get_loc(idk)
    print("-" * 20, "VISUALIZE AN EXAMPLE OF OUTLIERS" , "-" * 20 )
    print(f"IDK: {idk} ({df.iloc[position, 0]}) and its neighbourhood ({window} | idk | {window})")
    display(df.iloc[max(position - window, 0):position + window + 1, :])

# ---- FINDINGS -----
# The values beyond the empirical threshold seem to be valid
# (NOTE(review): this note originally quoted 1400 w/mq, while the cell uses 1300 -- confirm).
# The neighbours are often different, but other parameters (Current, Power) seem to be
# coherent with this high irradiance value

# STRATEGY --> Do nothing

# STRING INVERTER

## Exploration

In [None]:
# --------------- STRING INVERTER: Special -----------------------------
# The "special" string inverter sits at position 52 for the first system, 0 otherwise
idk_special = 52 if system_name == SYSTEM_NAMES[0] else 0
special_df = string_inv_data[string_inv_names[idk_special]]

# Visualize info columns
print(30 * "-", f"Special STRING INVERTER ({string_inv_names[idk_special]})", 30 * "-", "\n")
special_df.info()

# Check out the difference between the variables (Generated Energy and PV Energy)
cols = ["Generated Energy [Interval Sum(kilowatt-hours)]", "PV Energy [Interval Sum(kilowatt-hours)]"]
energy = special_df[["Date/Time [Europe/Rome]"] + cols].dropna(subset = cols)
diff = np.abs(energy[cols[0]] - energy[cols[1]])
print("\n", 20 * "-", f"COMPARISON between '{cols[0].split('[')[0].rstrip().upper()}' and '{cols[1].split('[')[0].rstrip().upper()}'", 20 * "-",)
print(f"MEAN (abs diff): {np.mean(diff)}\nSTD (abs diff): {np.std(diff)}")

# Check out the column of DC power: share of non-null observations and period covered
dc_power_col = "DC Gen. Power [Average(watts)]"
if dc_power_col in special_df.columns:
    dc_power = special_df[special_df[dc_power_col].notnull()]
    print(20 * "-", f"Checking out the variable {dc_power_col.split('[')[0]}", 20 * "-")
    print(f"'{dc_power_col.split('[')[0].rstrip()}' column is available only for the {round((len(dc_power) / len(special_df))*100, 2)} % "\
          f"of the entire dataset ({int(round(len(special_df)/1000, 0))} K obs.)")
    # The period can only be reported when at least one observation is available
    if len(dc_power) > 0:
        first_ts, last_ts = dc_power.iloc[0, 0], dc_power.iloc[-1, 0]  # first column: timestamp
        print(f"PERIOD AVAILABLE: FROM '{first_ts.date()}' TO '{last_ts.date()}' ({(last_ts - first_ts).days} days)\n", 90 * "-")

# ------ FINDINGS ------
# 1) CHECK OUT --> Generated energy = PV energy
# 2) OUTLIERS
# ---> Generated Energy
# ---> DC VOLTAGE --> 0 values
# ---> Voltage (+ AN/BN/CN) --> 0 Values

# --------------- STRING INVERTER: Standard -----------------------------
idk = 0
example_df = string_inv_data[string_inv_names[idk]]
print("\n\n", 40 * "-", "Standard STRING INVERTER", 40 * "-", "\n")
example_df.info()

# Count PV POWER observation: share of non-null observations and period covered
col = "PV Power [Average(watts)]"
if col in example_df.columns:
    pv_power = example_df[example_df[col].notnull()]
    print("\n", 20 * "-", f"Checking out the variable {col.split('[')[0]}", 20 * "-")
    print(f"'{col.split('[')[0].rstrip()}' column is available only for the {round((len(pv_power) / len(example_df))*100, 2)} % "\
          f"of the entire dataset ({int(round(len(example_df)/1000, 0))} K obs.)")
    # The period can only be reported when at least one observation is available
    if len(pv_power) > 0:
        first_ts, last_ts = pv_power.iloc[0, 0], pv_power.iloc[-1, 0]  # first column: timestamp
        print(f"PERIOD:FROM '{first_ts.date()}' TO '{last_ts.date()}'"\
              f"({(last_ts - first_ts).days} days)\n", 90 * "-")

# Descriptive statistics (plus median) of every column except the first one
for col in example_df.columns[1:]:
    stats = example_df[col].describe()
    stats["median"] = example_df[col].median()
    display(stats)

# ------ FINDINGS ------
# 1) DISCARD
# --> PV Power --> it's available only for the 8% of the entire dataset ()
# --> [Special inverter] DC Gen. Power: it's available only for the 5% of the entire dataset (2021/05 - 2021/06)
# --> [Special inverter] PV Energy --> it's a duplicated column (= Generated Energy)
# 2) OUTLIERS
# --> Generated energy (negative + out of range: > 500, maybe 100)
# --> DC voltage --> 0?

# Check uniqueness

In [None]:
# Index labels of the duplicated observations to drop, keyed by string-inverter name.
# A missing result (None, i.e. no duplicated timestamps) is stored as an empty list so
# that the transformation cell can always pass the entry to `drop(index=...)` and `len`.
item_to_delate = {
    inv_str_name: check_datetime_uniqueness(string_inv_data[inv_str_name]) or []
    for inv_str_name in string_inv_names
}

## Carry out some transformations
1. **Discard** duplicated observations (due to the time changes).
2. **Discard** some (3) *columns*:
    1. 'PV Power' due to its limited availability *(8%: 2021/05 - 2021/07)* of the entire dataset.
    2. *[Special String Inverter]* 'DC Gen. Power' due to its limited availability *(5%: 2021/05 - 2021/06)* of the entire dataset.
    3. *[Special String Inverter]* 'PV Energy' is a duplicated column. (= Generated Energy)
3. **Rename** the *columns* to improve the readability and coherence
4. **Convert** the *watts (W)* in *kilowatts (kW)* for the variable 'Generated Power' to improve the coherence
5. **Merge** the string inverter data with the *ambient conditions (Temp. & Irradiance)*.
6. **Reorder** the *columns* to improve the readability.

In [None]:
# Dataset of the ambient conditions (i.e., Temperature & Irradiance)
raw_dt_col = "Date/Time [Europe/Rome]"
irr_values = raw_irr_data[[raw_dt_col, "Irradiance [Average(watts-per-meter-sq)]"]]
if system_name == SYSTEM_NAMES[0]:
    # Same cut-off applied to the main inverter: keep the observations after that date
    irr_values = irr_values[irr_values[raw_dt_col].dt.date > cutoff_date]
temp_values = merged_temp
# The timestamp is the only column shared by the two frames; name it explicitly
amb_cond_df = irr_values.merge(temp_values, how="inner", on=raw_dt_col).rename(
    columns = {
        raw_dt_col: "Date/Time (Europe/Rome)",
        "Irradiance [Average(watts-per-meter-sq)]": "Irradiance (watts-per-meter-sq)"}
)

# TASK: Carry out some transformation for each string inverter
for name in string_inv_names:
    print("\n", 30*"-", "STRING INVERTER:", name,  30*"-", "\n")

    # TASK A: Drop duplicated indexes
    # An entry may be None when no duplicated timestamp was found for this inverter
    rows_to_drop = item_to_delate[name] if item_to_delate[name] is not None else []
    string_inv_data[name].drop(index = rows_to_drop, inplace=True)
    string_inv_data[name].reset_index(inplace = True, drop=True)
    print(f"A) Some ({len(rows_to_drop)}) duplicated rows (i.e., due to the time changes) have been DISCARTED")

    # TASK A.2: Drop all empty observations for the special inverter (timestamps always available, but data starts in 2021)
    if (system_name == SYSTEM_NAMES[0] and name == "INV53"):
        string_inv_data[name].dropna(subset = string_inv_data[name].columns[1:], how="all", inplace=True)
        string_inv_data[name].reset_index(inplace = True, drop=True)
        display(string_inv_data[name])

    # TASK B: Discard some columns
    col_to_discard = []
    if system_name == SYSTEM_NAMES[0]:
        col_to_discard.append("PV Power [Average(watts)]")
        if name == "INV53":
            col_to_discard.append("PV Energy [Interval Sum(kilowatt-hours)]")
            col_to_discard.append("DC Gen. Power [Average(watts)]")
    else:
        if name == "INV01":
            col_to_discard.append("PV Energy [Interval Sum(kilowatt-hours)]")
    if len(col_to_discard) > 0:
        string_inv_data[name].drop(columns = col_to_discard, inplace=True)
        print(f"\nB) {len(col_to_discard)} columns has been discarted (due to their limitated availability or because their were duplicated).\n   " \
              f"{[item.split('[')[0].rstrip() for item in col_to_discard]}")
    else:
        print(f"\nB) All the columns have been selected. There's no need to discard any columns")

    # TASK C: Rename columns
    # 1) Remove unnecessary words (e.g. 'DC Voltage [Average(volts)]' --> 'DC Voltage (volts)')
    col_names = string_inv_data[name].columns
    col_names = [col.replace("[Average", "").replace("[Interval Sum", "").rstrip(']') for col in col_names]

    # 2) Use round brackets instead of square brackets (to improve visual coherence)
    col_names[0] =  "Date/Time (Europe/Rome)"

    # 3) Rename the DC current columns with a letter suffix (A, B, ...)
    # Positions come from `enumerate`: looking each name up with `list.index` would
    # return the first match twice when two columns end up with the same name.
    dc_current_idk_cols = [position for position, col in enumerate(col_names) if col.startswith('DC Current')]
    for idk, col_idk in enumerate(dc_current_idk_cols):
        col_parts = col_names[col_idk].split("(")
        col_name = col_parts[0]
        measure_unit = " (" + col_parts[1].split(")")[0] + ")"
        col_names[col_idk] = col_name + chr(ord("A") + idk) + measure_unit

    # 4) Set the new column names to the dataframe
    string_inv_data[name].columns = col_names
    print("\nC) All the columns have been RENAMED to improve the readability.")

    # TASK D: from watt to kilowatt
    watt_cols = [col for col in col_names if "watts" in col]
    for col in watt_cols:
        string_inv_data[name][col] = string_inv_data[name][col] / 1000
    string_inv_data[name].rename(columns = {col: col.replace("watt", "kilowatt") for col in watt_cols}, inplace=True)
    print(f"\nD) The watts(W) values included in {len(watt_cols)} column(s) have been TRANSFORMED into kilowatts(kW).\n   "\
          f"{[item.split('[')[0].rstrip() for item in watt_cols]}")

    # TASK E: Merge the string inverter data with the ambient conditions
    # No explicit key: pandas joins on the column(s) the two frames share
    # (presumably only the timestamp -- confirm for every inverter).
    string_inv_data[name] = string_inv_data[name].merge(amb_cond_df, how="inner")
    print(f"\nE) The string inverter data ({name}) has been merged with the ambiental conditions (Amb. Temp & irradiance).")

    # TASK F: Reorder columns
    # `reindex` keeps only the listed columns; a listed column that is missing from the
    # frame is created and filled with NaN.
    # Columns of the special string inverter
    if (system_name == SYSTEM_NAMES[0] and name == "INV53") or (system_name == SYSTEM_NAMES[1] and name == "INV01"):
        new_col_order = [
            "Date/Time (Europe/Rome)",
            "Voltage AN (volts)",
            "Voltage BN (volts)",
            "Voltage CN (volts)",
            "Voltage (volts)",
            "Current (amps)",
            "Generated Power (kilowatts)",
            "Generated Energy (kilowatt-hours)",
            "DC Voltage (volts)",
            "DC Current A (amps)",
            "DC Current B (amps)",
            "Device Temp. (celsius)",
            "Ambiental Temp. (celsius)",
            "Irradiance (watts-per-meter-sq)"
        ]
    else:
        # Columns of the standard string inverter
        new_col_order =  [
            "Date/Time (Europe/Rome)",
            "DC Voltage (volts)",
            "DC Current A (amps)",
            "DC Current B (amps)",
            "Generated Power (kilowatts)",
            "Generated Energy (kilowatt-hours)",
            "Ambiental Temp. (celsius)",
            "Irradiance (watts-per-meter-sq)"
        ]
    string_inv_data[name] = string_inv_data[name].reindex(columns = new_col_order)
    print(f"\nF) The ({len(string_inv_data[name].columns)}) columns have been SORTED to improve the readability.\n")
    
    # Final version of the dataframe
    print(40 * "-", "OUTCOME", 40 * "-")
    string_inv_data[name].info()

## OUTLIER CORRECTION

### VARIABLE: *Generated Energy*

In [None]:
# Apply the generated-energy outlier correction to every string inverter.
for str_inv_name in string_inv_names:
    print("\n", 50*"-", f"STRING INVERTER: {str_inv_name}", 50*"-")
    
    # Correct negative energy values.
    # The corrected frame is stored back under the key of the inverter that is
    # being processed. Writing to `string_inv_data[name]` (a variable left over
    # from the loop of a previous cell) would overwrite one single inverter at
    # every iteration and leave all the other entries without their result.
    string_inv_data[str_inv_name] = outlier_correction_gen_energy(string_inv_data[str_inv_name])

### VARIABLE: *DC Voltage*
ISSUE: Zero values

In [None]:
# Inspect the neighbourhood of every observation whose DC voltage is zero.
# The variable is addressed by NAME: the special string inverters use a different
# column layout, so a fixed positional index (e.g., column 1) would silently read
# another variable for them.
dc_voltage_col = "DC Voltage (volts)"

for str_inv_name in string_inv_names:
    print(50*"-", f"STRING INVERTER: {str_inv_name}", 50*"-")
    
    dc_voltage = string_inv_data[str_inv_name][dc_voltage_col]
    
    # Find the observations with the variable equal to zero (i.e., the problematic behaviour).
    # Integer POSITIONS are collected (not index labels), so the positional look-ups below
    # stay valid whatever index the dataframe carries.
    zero_positions = np.flatnonzero((dc_voltage == 0).to_numpy())
    print(f"Occurrences with the DC voltage equal to zero: {len(zero_positions)}")
    
    for pos in zero_positions:
        # Guard the boundaries: the first observation has no previous value and the
        # last one has no next value (NaN is shown in those cases).
        previous_value = dc_voltage.iloc[pos - 1] if pos > 0 else np.nan
        next_value = dc_voltage.iloc[pos + 1] if pos + 1 < len(dc_voltage) else np.nan
        print("PREVIOUS VALUE: ", previous_value)
        print("NEXT VALUE:     ", round(next_value, 2), "\n")
        

# ----- FINDINGS -------
# The zero values of the variable DC Voltage occur during the start-up phase 
# BEFORE: NaN values || OBS: DC Voltage = 0 || AFTER: Positive values 
# ---> STRATEGY ---> Do nothing!

### VARIABLE: *AC Voltages (AN/BN/CN & avg)* 
#### [ONLY FOR THE SPECIAL STRING INVERTER]

In [None]:
# Detect the extreme outliers of the AC voltages of the special string inverter and
# replace each of them with the value estimated by `weighted_knn`.
columns = ["Date/Time (Europe/Rome)", "Voltage AN (volts)", "Voltage BN (volts)", "Voltage CN (volts)", "Voltage (volts)"]

# Voltage variables to re-estimate (the processing order is intentionally kept: average first, then the phases)
voltage_cols = ["Voltage (volts)", "Voltage AN (volts)", "Voltage BN (volts)", "Voltage CN (volts)"]

# Set the dataset (same object stored in `string_inv_data`, so the assignments below update it directly)
df = string_inv_data[string_inv_names[idk_special]]

# 1A) Detect extreme outliers 
extreme_vac_outlier = find_outliers(df[columns], verbose=True, threshold = 5)

if len(extreme_vac_outlier) == 0:
    print("Oh, no extreme outliers found. That's good.\n")
else:
    print(f"The extreme outliers ({len(extreme_vac_outlier)}) of Vac.")
    display(extreme_vac_outlier)
    
    # Identify the index of the outliers 
    idk_outliers = extreme_vac_outlier.index.tolist()
    list_outliers = idk_outliers.copy()

    # Compute and assign the estimated value (weighted average value from its neighbours)
    for idk_outlier in idk_outliers:

        # Visualize the outlier and its neighbours.
        # A label slice (inclusive on both ends) is used instead of an explicit list of
        # labels: `.loc` raises a KeyError for a list containing missing labels, which
        # happens when the outlier lies within 3 rows of the start/end of the dataframe.
        # NOTE: assumes a sorted integer index - TODO confirm.
        print(f"Outlier (idk: {idk_outlier}) and its neighborhood")
        display(df.loc[idk_outlier - 3 : idk_outlier + 3, :])

        # Compute the estimated value and assign it to the original dataframe
        for voltage_col in voltage_cols:
            df.loc[idk_outlier, voltage_col] = weighted_knn(df[columns], idk_outlier, list_outliers, voltage_col, Fill_nan = False)

        # Visualize the outcome
        print("\n", 20 * "-", "New data (with the filled value(s))",  20 * "-")
        display(df.loc[idk_outlier, columns])
        print("-" * 80)

# Save outcomes as CSV files

In [None]:
# FINAL TASK: Save the cleaned datasets as CSV files
#   - system-level inverter data    --> <system_path>/Cleaned/
#   - string inverter data          --> <system_path>/Cleaned/String Inverters/
saving_folder_name = "Cleaned"
saving_string_inv_name = "String Inverters"
saving_folder_path = path.join(system_path, saving_folder_name)
saving_stringInv_folder_path = path.join(system_path, saving_folder_name, saving_string_inv_name)

print(f"TIME: {datetime.now().strftime('%H:%M')}",f"\nPV System --> {system_name.upper()}")

# Create the main saving folder and the saving folder for the cleaned string inverter data
for folder_path in (saving_folder_path, saving_stringInv_folder_path):
    if not path.exists(folder_path):
        # `exist_ok` avoids a failure if the folder appears between the check and the creation
        makedirs(folder_path, exist_ok=True)
        print(f"A new saving folder has been created: {folder_path}\n")
    
# Save the files as CSV files 
# DF: System
for inv_name in inv_names:
    file_name = f"cleaned_{inv_name.upper()}_data.csv"
    inv_data[inv_name].to_csv(path.join(saving_folder_path, file_name), index=False)
    print(f"--> The cleaned data for '{inv_name}' has been saved.\n")
    
# DFs: String inverter 
# This loop is NOT nested in the previous one: each string inverter file must be written
# exactly once, independently of how many system-level inverters are available.
for string_inv_name in string_inv_names:
    file_name = f"cleaned_INV_{string_inv_name.replace('INV', '')}_data.csv"
    string_inv_data[string_inv_name].to_csv(path.join(saving_stringInv_folder_path, file_name), index=False)
    print(f"--> The cleaned data for '{string_inv_name}' has been saved.")
        
print("\n","-"*40, "FINISHED", "-"*40)