In [24]:
from sys import path
if '../..' not in path:
    path.insert(0, '../..')

In [25]:
from os import path, makedirs
from _library.utils import SYSTEM_NAMES_FULL, load_datasets
from scipy.stats import zscore
from IPython.display import clear_output
import numpy as np
import pandas as pd

In [26]:
# Select the main folder 
%cd /mnt/data/vieri/projects/SAMPLE/

# Visualize names of PV systems
print(SYSTEM_NAMES_FULL)
# --- 0 ---------- 1 --------- 2 ------ 3 ------ 4 --------- 5 --------- 6 -------- 7 ---

/mnt/data/vieri/projects/SAMPLE
['Binetto 1', 'Binetto 2', 'Cantore', 'Emi', 'Soleto 1', 'Soleto 2', 'Galatina', 'Verone']


# Selecting the PV system

In [27]:
system_name = SYSTEM_NAMES_FULL[3]
print("PV SYSTEM --> ", system_name)

PV SYSTEM -->  Emi


# Loading dataset

In [28]:
# Loading the datasets
path_file, inv_data, inv_names, raw_irr_data, string_inv_data, string_inv_names = load_datasets(system_name, verbose=True)

-------------------------------------------------------------------------------- 
				PV SYSTEM --> EMI 
--------------------------------------------------------------------------------

Loading inverter data...
EMI: OK, component data loaded (4) --> INV1, INV2, INV3, INV4

Loading irradiance values...
EMI: OK, raw irradiance data (238822 observations) have been loaded

-------------------------------------------------------------------------------- 
FINISHED!: All datasets have been loaded. (SYS: 4 - IRR FILE: 1)
--------------------------------------------------------------------------------
-------------------------------------------------------------------------------- 
EXAMPLE --> Emi: INV1 (FROM '2018-07-27' TO '2021-06-30': 1069 days).
--------------------------------------------------------------------------------
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 149899 entries, 0 to 149898
Data columns (total 18 columns):
 #   Column               Non-Null Count   Dtype       

# Exploration

In [29]:
# TASK (EXPLORE): Carry out some descriptive statistics on problematic columns (i.e., those identified through numerical and temporal distribution graphs)
to_load = inv_names[0]

# Check numerical distribution via some descriptive statistics (mean, IQR, etc)
check_emptiness = ["Pac S (kW)", "Pac T (kW)", "Stato", "Fac (Hz)"] 
varibles_to_analyse = ["Irr. medio (W/mq)", "Inverter temp. (°C)"]
for var in [check_emptiness + varibles_to_analyse]:
    print(inv_data[to_load][var].describe(), "\n")

# Quick check at the invalid temperature values (above the likely physical limit)
valid_temperature = 200 # °C
valid_tempValue = inv_data[to_load].loc[inv_data[to_load]["Inverter temp. (°C)"] <= valid_temperature, :]
print(f"VALID TEMP. VALUE: {len(valid_tempValue)} ({round((len(valid_tempValue)/len(inv_data[to_load]['Inverter temp. (°C)']))*100, 2)}%)")

# FINDINGS: 
# 1) "Pac S (kW)", "Pac T (kW)", "Stato", "Fac (Hz)" --> OK: 0 valid values --> They can be discarded
# 2) Irr. medio --> it seems okay (mean, std, IQR, min, max)
# 3) Inverter temp [°C] --> Strange 
#   --->  mean: 1064, std: 8223, IQR: 11-21, max = 65535
#   ---> value above 1000 should be analyse 

       Pac S (kW)  Pac T (kW)  Stato  Fac (Hz)  Irr. medio (W/mq)  \
count         0.0         0.0    0.0       0.0      128781.000000   
mean          NaN         NaN    NaN       NaN         293.305224   
std           NaN         NaN    NaN       NaN         295.555958   
min           NaN         NaN    NaN       NaN           3.000000   
25%           NaN         NaN    NaN       NaN          25.000000   
50%           NaN         NaN    NaN       NaN         177.000000   
75%           NaN         NaN    NaN       NaN         536.000000   
max           NaN         NaN    NaN       NaN        1301.000000   

       Inverter temp. (°C)  
count        149899.000000  
mean           1659.574427  
std           10242.864404  
min               1.000000  
25%              11.000000  
50%              17.000000  
75%              23.000000  
max           65535.000000   

VALID TEMP. VALUE: 146141 (97.49%)


In [30]:
# TASK (EXPLORE): Check whather a type converstion is necessary?
# VARIABLE: Irr. medio (W/mq) --> From 'float64' to int64
irr_values = inv_data[to_load]["Irr. medio (W/mq)"]
non_integer = [value for value in irr_values if not value.is_integer()]
non_nanInteger = [value for value in non_integer if not np.isnan(value)]
print("Number of non integer values (that are not NaN values):", len(non_nanInteger))

# Check its validity 
integer_irr_values = pd.array(irr_values, dtype="Int64")
diff = np.abs(irr_values - integer_irr_values)
diff.dropna(inplace = True)
mean = np.mean(diff)
std = np.std(diff)
print(f"\nCASTING to Int64: Compute difference to check validity.\nMEAN (diff): {mean}, STD (diff):{std}")

# FINDIGS: Irradiance values are all integer values, but the empty values (numpy NaN) was forcing the column to be a float number

Number of non integer values (that are not NaN values): 0

CASTING to Int64: Compute difference to check validity.
MEAN (diff): 0.0, STD (diff):0.0


# fix uniqueness of the datetimes

In [31]:
# Check uniqueness --> Date/time
def check_datetime_uniqueness():
    index_to_delate = dict()
    
    for inv_name in inv_names:
        df = inv_data[inv_name]
        print(f"\nAnalysing {inv_name}...")
        
        if set(check_emptiness).issubset(df.columns):
            df = df.drop(columns = ["Irr. medio (W/mq)"] + check_emptiness)

        # Check duplicates
        condition = df["Date/Time"].duplicated(keep=False)
        duplicated_datetime = df[condition]
        
        # Continue to analyse other inverter in case of no duplicated observations
        if len(duplicated_datetime) == 0:
            print(f"Oh, that's good. No duplicated observations have been found for {inv_name}!")
            continue
        
        # Extraxct unique datetimes
        unique_duplicated_dt = [pd.to_datetime(datetime) for datetime in duplicated_datetime["Date/Time"].unique()]
        print(f"DUPLICATE DATES: {len(unique_duplicated_dt)}: {[dt.strftime('%Y-%m-%d, %H:%M') for dt in unique_duplicated_dt]} "\
              f"(Total observations: {len(duplicated_datetime)})")
        #display(duplicated_datetime)

        # Investigate to this behaviour for a duplicate observations
        datetime_to_investigate = unique_duplicated_dt[0]
        delta = pd.Timedelta(15, unit="minutes") #datetime.timedelta(minutes=60)
        period = (datetime_to_investigate - delta, datetime_to_investigate + delta)
        #display(df.loc[df["Date/Time"].between(period[0], period[1]), :])

        # Compute difference 
        to_discard = set()
        for datetime in unique_duplicated_dt:
            daily_duplicated_indexes = df[df["Date/Time"] == datetime].index
            duplicated_obs = df.loc[daily_duplicated_indexes, :].drop(columns = "Date/Time")
            obs = pd.Series()
            
            daily_duplicated_indexes
            
            if len(unique_duplicated_dt) > 300:
                clear_output(wait=True)
            print(f"\nAnalysing {datetime} - (observations: {len(daily_duplicated_indexes)})")
            print("-"*120)
            
            equal_observations = set()
            for idk_check, index_obs in enumerate(daily_duplicated_indexes):
                
                other_duplicated_obs = duplicated_obs.drop(index_obs, axis=0)

                check = duplicated_obs.loc[index_obs,:].eq(other_duplicated_obs)
                find_equal_obs = check.all(axis=1)[check.all(axis=1) == True].index.tolist()

                print(f"--> Analysing idk: {index_obs}...")                
                if find_equal_obs:
                    print(f"    Equal observation(s) have been found: {find_equal_obs}")
                    idk_equal_obs = [index_obs] + [idk for idk in find_equal_obs]
                    idk_equal_obs.sort()
                    
                    #display(duplicated_obs.loc[idk_equal_obs, :])
                    #idk_equal_obs = [idk for idk in find_equal_obs]

                    # Save pairs of equal duplicated observations
                    equal_observations.add(tuple(idk_equal_obs))
   
            # Keep the first equal and discard the other equal observations
            duplicated_diff_obs = sorted([obs_to_discard for pairs in equal_observations for obs_to_discard in pairs[1:]])
            
            if len(duplicated_diff_obs) != 0:
                to_discard.update(duplicated_diff_obs)
                print("\n--> OK, discarding the identical observations: ", duplicated_diff_obs)
            else:
                print("No equal observations have been found")
        
            # Highlight potential issues: same timestamp not equal values
            remaining_duplicated_obs = sorted(set(daily_duplicated_indexes) - set(duplicated_diff_obs))
            
            if len(remaining_duplicated_obs) > 1:
                print(f"\n--> Issue, duplicated observations with different values have been found! --> {remaining_duplicated_obs}")
                
                mean_deltas = []
                to_discard_strat_2 = []
                for idk_observation in remaining_duplicated_obs:
                    print(f"    FOCUSING ON IDK: {idk_observation}")

                    # Compute the idk of previous and upcoming observations
                    prev_idk = idk_observation - 1
                    if prev_idk in daily_duplicated_indexes:
                        prev_idk = idk_observation - len(duplicated_diff_obs) - 1 
      
                    upcom_idk = idk_observation + 1
                    if upcom_idk in daily_duplicated_indexes:
                        upcom_idk = idk_observation + len(duplicated_diff_obs) + 1

                    # Compute previous and upcoming observations
                    obs_kwh = df.loc[idk_observation, :]["E. totale (kWh)"]
                    prev = df.loc[prev_idk, :]["E. totale (kWh)"]
                    upcom = df.loc[upcom_idk, :]["E. totale (kWh)"]

                    # Compute delta
                    mean_deltas.append((idk_observation,np.mean([np.abs(obs_kwh - prev),np.abs(upcom - obs_kwh)]) ))

                    # Check validity 
                    if prev <= obs_kwh <= upcom:
                        print("    Valid observation.")
                    else:
                        print("    Invalid values of E. total (kWH) the observation will be discarded. ")
                        to_discard_strat_2.append(idk_observation)
                        
                if len(to_discard_strat_2) == len(remaining_duplicated_obs):
                    print(prev_idk, "- ",idk_observation, "-", upcom_idk)
                    neighbours = df.loc[prev_idk:upcom_idk, :]
                    display(neighbours)
                    to_discard_strat_2 = to_discard_strat_2[1:]
                    print(to_discard_strat_2)
                    
                # IN CASE: Multiple valid observations (according to the )
                if len(remaining_duplicated_obs) - len(to_discard_strat_2) > 1:
                    print("\n    Multiple valid duplicated observations have been found. "\
                          f"{sorted(set(remaining_duplicated_obs) - set(to_discard_strat_2))}")

                    # Delete deltas of already selected discarted observations
                    if to_discard_strat_2:
                        idx_to_delate = [item for item in to_discard_strat_2]
                        mean_deltas = [delta for delta in mean_deltas if delta[0] not in idx_to_delate]

                    # Sort according to the deltas
                    mean_deltas.sort(key = lambda item: item[1])

                    if len(mean_deltas) >= 3: # Multiple items to discard
                        to_discard_strat_2.extend([item[0] for item in mean_deltas[1:]])
                    else:
                        to_discard_strat_2.append(*[item[0] for item in mean_deltas[1:]])
                    print(f"    Keeping the one (idk: {mean_deltas[0][0]}) with the less delta with the previous and next observations")                        
            
                # Add the discarted index found with this stategy
                to_discard.update(to_discard_strat_2)
                    
                # Visualize final outcome for the day
                idx_to_discard = to_discard_strat_2 + duplicated_diff_obs
                print(f"\n--> TO KEEP: {set(daily_duplicated_indexes) - set(idx_to_discard)} \---/ "\
                      f"TO DISCARD ({len(idx_to_discard)} out of {len(daily_duplicated_indexes)}): "\
                      f"{sorted(to_discard_strat_2)} + {sorted(duplicated_diff_obs)}")
                print("-"*120)
        
        # Select the index to remove (keep the first one, as they are the equal)
        idx_to_delate = set(obs_to_discard for pairs in equal_observations for obs_to_discard in pairs[1:])
        item_to_delate = sorted(idx_to_delate.union(to_discard))
        print(f"TO DELATE ({len(item_to_delate)} out of {len(duplicated_datetime)}):", item_to_delate)
        
        # Delate the equal duplicate
        index_to_delate[inv_name] = item_to_delate
        
    return index_to_delate
# -------------------------------------------------------------------------------
index_to_delate = check_datetime_uniqueness()


Analysing INV1...
DUPLICATE DATES: 10: ['2019-01-22, 15:10', '2019-03-05, 16:20', '2019-03-15, 10:25', '2019-03-15, 10:30', '2019-03-15, 11:40', '2019-05-02, 17:55', '2019-05-31, 11:40', '2020-02-12, 11:15', '2020-07-20, 13:10', '2021-02-10, 12:35'] (Total observations: 48)

Analysing 2019-01-22 15:10:00 - (observations: 4)
------------------------------------------------------------------------------------------------------------------------
--> Analysing idk: 14265...
    Equal observation(s) have been found: [14266]
--> Analysing idk: 14266...
    Equal observation(s) have been found: [14265]
--> Analysing idk: 14267...
    Equal observation(s) have been found: [14268]
--> Analysing idk: 14268...
    Equal observation(s) have been found: [14267]

--> OK, discarding the identical observations:  [14266, 14268]

--> Issue, duplicated observations with different values have been found! --> [14265, 14267]
    FOCUSING ON IDK: 14265
    Valid observation.
    FOCUSING ON IDK: 14267
    V

  obs = pd.Series()
  obs = pd.Series()
  obs = pd.Series()
  obs = pd.Series()
  obs = pd.Series()
  obs = pd.Series()
  obs = pd.Series()
  obs = pd.Series()
  obs = pd.Series()



Analysing 2020-07-20 13:10:00 - (observations: 9)
------------------------------------------------------------------------------------------------------------------------
--> Analysing idk: 97966...
    Equal observation(s) have been found: [97967, 97968]
--> Analysing idk: 97967...
    Equal observation(s) have been found: [97966, 97968]
--> Analysing idk: 97968...
    Equal observation(s) have been found: [97966, 97967]
--> Analysing idk: 97969...
    Equal observation(s) have been found: [97971, 97974]
--> Analysing idk: 97970...
    Equal observation(s) have been found: [97972, 97973]
--> Analysing idk: 97971...
    Equal observation(s) have been found: [97969, 97974]
--> Analysing idk: 97972...
    Equal observation(s) have been found: [97970, 97973]
--> Analysing idk: 97973...
    Equal observation(s) have been found: [97970, 97972]
--> Analysing idk: 97974...
    Equal observation(s) have been found: [97969, 97971]

--> OK, discarding the identical observations:  [97967, 97968,

  obs = pd.Series()
  obs = pd.Series()
  obs = pd.Series()
  obs = pd.Series()
  obs = pd.Series()
  obs = pd.Series()
  obs = pd.Series()
  obs = pd.Series()
  obs = pd.Series()



--> TO KEEP: {28581} \---/ TO DISCARD (8 out of 9): [28576, 28578] + [28577, 28579, 28580, 28582, 28583, 28584]
------------------------------------------------------------------------------------------------------------------------

Analysing 2019-05-31 11:40:00 - (observations: 2)
------------------------------------------------------------------------------------------------------------------------
--> Analysing idk: 33360...
--> Analysing idk: 33361...
No equal observations have been found

--> Issue, duplicated observations with different values have been found! --> [33360, 33361]
    FOCUSING ON IDK: 33360
    Valid observation.
    FOCUSING ON IDK: 33361
    Valid observation.

    Multiple valid duplicated observations have been found. [33360, 33361]
    Keeping the one (idk: 33360) with the less delta with the previous and next observations

--> TO KEEP: {33360} \---/ TO DISCARD (1 out of 2): [33361] + []
-----------------------------------------------------------------------

  obs = pd.Series()
  obs = pd.Series()
  obs = pd.Series()
  obs = pd.Series()
  obs = pd.Series()
  obs = pd.Series()
  obs = pd.Series()
  obs = pd.Series()



Analysing 2019-03-15 10:25:00 - (observations: 6)
------------------------------------------------------------------------------------------------------------------------
--> Analysing idk: 21300...
    Equal observation(s) have been found: [21301]
--> Analysing idk: 21301...
    Equal observation(s) have been found: [21300]
--> Analysing idk: 21302...
    Equal observation(s) have been found: [21303]
--> Analysing idk: 21303...
    Equal observation(s) have been found: [21302]
--> Analysing idk: 21304...
    Equal observation(s) have been found: [21305]
--> Analysing idk: 21305...
    Equal observation(s) have been found: [21304]

--> OK, discarding the identical observations:  [21301, 21303, 21305]

--> Issue, duplicated observations with different values have been found! --> [21300, 21302, 21304]
    FOCUSING ON IDK: 21300
    Valid observation.
    FOCUSING ON IDK: 21302
    Valid observation.
    FOCUSING ON IDK: 21304
    Valid observation.

    Multiple valid duplicated observa

  obs = pd.Series()
  obs = pd.Series()
  obs = pd.Series()
  obs = pd.Series()
  obs = pd.Series()
  obs = pd.Series()
  obs = pd.Series()
  obs = pd.Series()



--> OK, discarding the identical observations:  [127032, 127034]

--> Issue, duplicated observations with different values have been found! --> [127031, 127033]
    FOCUSING ON IDK: 127031
    Valid observation.
    FOCUSING ON IDK: 127033
    Valid observation.

    Multiple valid duplicated observations have been found. [127031, 127033]
    Keeping the one (idk: 127031) with the less delta with the previous and next observations

--> TO KEEP: {127031} \---/ TO DISCARD (3 out of 4): [127033] + [127032, 127034]
------------------------------------------------------------------------------------------------------------------------
TO DELATE (38 out of 48): [14366, 14367, 14369, 19931, 19932, 19934, 21301, 21302, 21303, 21304, 21305, 21307, 21320, 21321, 21322, 28998, 28999, 29000, 29001, 29002, 29004, 29005, 29006, 33786, 72824, 72825, 72827, 98475, 98476, 98477, 98478, 98479, 98481, 98482, 98483, 127032, 127033, 127034]

Analysing INV4...
DUPLICATE DATES: 10: ['2019-01-22, 15:10', '20

  obs = pd.Series()
  obs = pd.Series()
  obs = pd.Series()


Unnamed: 0,Date/Time,Iac R (A),Iac S (A),Iac T (A),Vac R (V),Vac S (V),Vac T (V),Pac R (kW),E. totale (kWh),Cc 1 (A),Vcc 1 (V),Allarme,Inverter temp. (°C)
33791,2019-05-31 11:40:00,533,531,541,121,121,120,187,1359427.4,455,409,553701696,25
33792,2019-05-31 11:40:00,627,621,633,120,119,119,219,1359420.9,527,413,553701696,25
33793,2019-05-31 11:50:00,277,267,279,119,119,119,94,1359438.3,233,400,553701696,23


[33792]

--> TO KEEP: {33791} \---/ TO DISCARD (1 out of 2): [33792] + []
------------------------------------------------------------------------------------------------------------------------

Analysing 2020-02-12 11:15:00 - (observations: 4)
------------------------------------------------------------------------------------------------------------------------
--> Analysing idk: 72829...
    Equal observation(s) have been found: [72830]
--> Analysing idk: 72830...
    Equal observation(s) have been found: [72829]
--> Analysing idk: 72831...
    Equal observation(s) have been found: [72832]
--> Analysing idk: 72832...
    Equal observation(s) have been found: [72831]

--> OK, discarding the identical observations:  [72830, 72832]

--> Issue, duplicated observations with different values have been found! --> [72829, 72831]
    FOCUSING ON IDK: 72829
    Valid observation.
    FOCUSING ON IDK: 72831
    Valid observation.

    Multiple valid duplicated observations have been found. [7

  obs = pd.Series()
  obs = pd.Series()
  obs = pd.Series()


# Carry out some transformations

In [32]:
# TASK: Carry out the necessary stategies to tackle the issues discovered in the above analyses
for inv_name in inv_names:
    print(f"\n{inv_name.upper()}...")
    
    # TASK: Delate duplicate observations (same timestamps)
    inv_data[inv_name].drop(index = index_to_delate[inv_name], inplace=True)
    inv_data[inv_name].reset_index(drop=True, inplace = True)
    print(f"--> Discared some duplicated observations (i.e., same timestamps) ({len(index_to_delate[inv_name])})")
    
    # TASK: Delate the columns (i.e., variables) that have been confirmed empty
    inv_data[inv_name].drop(columns = check_emptiness, inplace=True)
    print(f"--> Discared some empty columns (i.e., {check_emptiness})")
    
     # TASK: Remove the irradiance column that was joined previously --> it'll be re-added after fixing the sampling of both side
    inv_data[inv_name].drop(columns = ["Irr. medio (W/mq)"], inplace=True)
    print("--> Removed the column of irradiance values, since it will be re-added after tackling the sampling issue.")
# ---------------------------------------------------------------------------------
# TASK: Carry out some minor changes on the "raw irradiance file"
print("\nRAW IRRADIANCE FILE ...")
# Rename the column of the data
raw_irr_data.rename(columns = {"data":"Date/Time", "irr. medio 1 W/mq":"Irradiance (W/mq)"}, inplace=True)
print("--> Renamed the columns to improve readability and compatibility with other datasets")


INV1...
--> Discared some duplicated observations (i.e., same timestamps) (38)
--> Discared some empty columns (i.e., ['Pac S (kW)', 'Pac T (kW)', 'Stato', 'Fac (Hz)'])
--> Removed the column of irradiance values, since it will be re-added after tackling the sampling issue.

INV2...
--> Discared some duplicated observations (i.e., same timestamps) (38)
--> Discared some empty columns (i.e., ['Pac S (kW)', 'Pac T (kW)', 'Stato', 'Fac (Hz)'])
--> Removed the column of irradiance values, since it will be re-added after tackling the sampling issue.

INV3...
--> Discared some duplicated observations (i.e., same timestamps) (38)
--> Discared some empty columns (i.e., ['Pac S (kW)', 'Pac T (kW)', 'Stato', 'Fac (Hz)'])
--> Removed the column of irradiance values, since it will be re-added after tackling the sampling issue.

INV4...
--> Discared some duplicated observations (i.e., same timestamps) (38)
--> Discared some empty columns (i.e., ['Pac S (kW)', 'Pac T (kW)', 'Stato', 'Fac (Hz)'])
--

#### check

In [33]:
# Check validity of the transformation
inv_data[inv_names[0]].info()
print("-"*80)
index_to_delate = check_datetime_uniqueness()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 149861 entries, 0 to 149860
Data columns (total 13 columns):
 #   Column               Non-Null Count   Dtype         
---  ------               --------------   -----         
 0   Date/Time            149861 non-null  datetime64[ns]
 1   Iac R (A)            149861 non-null  int64         
 2   Iac S (A)            149861 non-null  int64         
 3   Iac T (A)            149861 non-null  int64         
 4   Vac R (V)            149861 non-null  int64         
 5   Vac S (V)            149861 non-null  int64         
 6   Vac T (V)            149861 non-null  int64         
 7   Pac R (kW)           149861 non-null  int64         
 8   E. totale (kWh)      149861 non-null  float64       
 9   Cc 1 (A)             149861 non-null  int64         
 10  Vcc 1 (V)            149861 non-null  int64         
 11  Allarme              149861 non-null  string        
 12  Inverter temp. (°C)  149861 non-null  int64         
dtypes: datetime64[

# Detect and correct outliers

In [34]:
# -----------------------------------------------------
# FUNCTION: Find the outliers by adopting the z-score
# -----------------------------------------------------
def find_outliers(df, threshold = 3, verbose = False): 
    numerical_df = df.select_dtypes(include = np.number)
    
    # THE MEASURAMENT: Z-score
    # It describes a value's relationship to the mean of a group of values 
    # It's the number of standard deviations by which the observed value is above/below the mean value of what is being measured
    z_score = np.abs(zscore(numerical_df)) 

    # Filter the observations that are above the threshold (from literature: 3 is a typical cut-off point to detect the outliers)
    if len(z_score.columns) == 3:
        cond_outlier = (z_score.iloc[:, 0] > threshold) & (z_score.iloc[:, 1] > threshold) & (z_score.iloc[:, 2] > threshold)
    elif len(z_score.columns) == 2:
         cond_outlier = (z_score.iloc[:, 0] > threshold) & (z_score.iloc[:, 1] > threshold)
    else:
        cond_outlier = z_score.iloc[:, 0] > threshold 

    # Find the outliers according to the threshold of the z-scores
    outliers_idk = z_score[cond_outlier].index
    outliers = df.loc[outliers_idk, :]
    
    if verbose & len(outliers) != 0:
        print(f"Z-score values (threshold: {threshold})")
        print("MIN:", round(np.min(min(z_score[cond_outlier].values.tolist())), 2), \
              "\nMAX:",round(np.max(max(z_score[cond_outlier].values.tolist())), 2))
              
        display(z_score[cond_outlier])
        
    return outliers
# ------------------------------------------------------------------------------------------
# FUNCTION: Compute a weighted average value according to the observation's neighbours (KNN)
# ------------------------------------------------------------------------------------------
def weighted_knn(df, idk_position, list_outliers, column,  K = 6, verbose=True):
    # Define the interval before and after the position
    k_before = K//2 
    k_after = K//2
    
    # Define the range of neighbours 
    idk_range = np.arange(idk_outlier - k_before, idk_outlier + k_after + 1)
    
    # Find the indexes of other outliers (for excluding them in the weighted average)
    if idk_position in list_outliers:
        list_outliers.remove(idk_position)
    
    # Detect potential outliers in its neighbours
    problematic_idk_neighbours = [idk_outlier for idk_outlier in list_outliers if idk_outlier in idk_range]
        
    if len(problematic_idk_neighbours) != 0:
        if verbose:
            print(f"\nOPS: Problematic IDK neighbours discovered {len(problematic_idk_neighbours)} "\
                  f"({round(((len(problematic_idk_neighbours)/len(idk_range))*100), 2)} %) --> {problematic_idk_neighbours}")
        
        # Try to increase the number of neighbours 
        MAX_K = 30
        if len(problematic_idk_neighbours) >= K//2:
            new_k = K*2 
            if new_k <= MAX_K:
                if verbose: 
                    print(f"Trying with more neighbours ({new_k})...")
                weighted_knn(df, idk_position, list_outliers, column,  K = new_k, verbose=False)
            else:
                if verbose:
                    print("Reach maximum of K neighbours")
        
        # Remove problematic neighbours
        idk_range = idk_range[~ np.isin(idk_range, problematic_idk_neighbours)]
        
        if verbose:
            print("\nEnd process of increasing number of neighbours (i.e., number of problematic neigbours is acceptable (fewer than K/2)")
            print(f"The problematic neighbours ({len(problematic_idk_neighbours)}) have been removed from the neighbour list")
    
    # Find the neigbourhood 
    idk_outlier_pos = np.argwhere(idk_range == idk_position)[0][0]
    neighbours_idk = np.delete(idk_range, idk_outlier_pos)
    
    neighbours = np.array(df.loc[neighbours_idk, :][column]) # iloc
    outlier_value = df.loc[idk_position, :][column]
    
    if verbose:
        print(f"\nVARIABLE: {column}")
        print(f"NEIGHBOURES ({len(neighbours)} out of {K}):{neighbours[:idk_outlier_pos]} \--/ {neighbours[idk_outlier_pos:]}")
    
    # Define the weights
    dist_idk = [np.abs(idk - idk_position) for idk in neighbours_idk]
    penalize = lambda dist: min(dist_idk)/dist if dist != min(dist_idk) else 1 #min(dist_idk)/(1 + (dist/10))
    weights = [round(penalize(dist), 2) for dist in dist_idk]
    
    if verbose:
        print("WEIGHTS:", weights)

    # Compute the weighted average 
    weighted_average_value = np.average(neighbours, weights = weights)
    
    # Round and cast the value to an integer value
    candidate_value = int(round(weighted_average_value, 0))
    
    if verbose:
        print(f"COMPUTED VALUE: {candidate_value}")
        print("-" * 80)
    return candidate_value

### Variable: *AC voltage values* (Vac R/S/T)

In [13]:
# TASK: A) Analyse and correct the outliers
# VARIABLES: AC voltage values (Vac R/S/T)
# MOTIVATION: Some invalid values (i.e., zero values) have been detected visually (i.e., numerical distribution graphs)
columns = ["Date/Time", "Vac R (V)", "Vac S (V)", "Vac T (V)"]

# Carry out the analysis (and correction) on all the inverters
for inv_name in inv_names:
    print("INVERTER NAME:", inv_name, "\n", "-"*80)
    
    # 1A) Detect extreme outliers 
    # Setting threshold: a very high threshold to detect (and correct) only extreme outliers (e.g. zero values)
    # Threshold has been set according to emperical tests (default outlier threshold is equal to 3)
    extreme_vac_outlier = find_outliers(inv_data[inv_name][columns], verbose=True, threshold = 5)
    
    if len(extreme_vac_outlier) == 0:
        print("Oh, no extreme outliers found. That's good.\n")
        continue
        
    print(f"The extreme outliers ({len(extreme_vac_outlier)}) of Vac")
    display(extreme_vac_outlier)

    # 1B) Replacing criterion --> Weighted KNN (k = 6)
    # Identfy the index of the outliers 
    idk_outliers = extreme_vac_outlier.index.tolist()
    list_outliers = idk_outliers.copy()
    
    # Compute and assign the estimated value (weighted average value from its neigbours)
    for idk_outlier in idk_outliers:
        
        # Visualize the outlier and its neighours
        print(f"Outlier (idk: {idk_outlier}) and its neighborhood")
        display(inv_data[inv_name][columns].loc[range(idk_outlier - 3, idk_outlier + 4), :])
        
        # Compute the estimated value and assign the the original dataframe
        inv_data[inv_name].loc[idk_outlier, "Vac R (V)"] = weighted_knn(inv_data[inv_name][columns], 
                                                                        idk_outlier, list_outliers, "Vac R (V)")
        inv_data[inv_name].loc[idk_outlier, "Vac S (V)"] = weighted_knn(inv_data[inv_name][columns], 
                                                                        idk_outlier, list_outliers, "Vac S (V)")
        inv_data[inv_name].loc[idk_outlier, "Vac T (V)"] = weighted_knn(inv_data[inv_name][columns],
                                                                        idk_outlier, list_outliers, "Vac T (V)")

        # Visualize the outcome
        print("\nNew data (with the filled value(s))")
        display(inv_data[inv_name].loc[idk_outlier][columns])
        print("-" * 80)

INVERTER NAME: INV1 
 --------------------------------------------------------------------------------
Z-score values (threshold: 5)
MIN: 42.42 
MAX: 46.52


Unnamed: 0,Vac R (V),Vac S (V),Vac T (V)
59112,42.424222,46.522341,45.31855


The extreme outliers (1) of Vac


Unnamed: 0,Date/Time,Vac R (V),Vac S (V),Vac T (V)
59112,2019-11-12 13:00:00,0,0,0


Outlier (idk: 59112) and its neighborhood


Unnamed: 0,Date/Time,Vac R (V),Vac S (V),Vac T (V)
59109,2019-11-12 12:40:00,111,112,111
59110,2019-11-12 12:45:00,111,111,111
59111,2019-11-12 12:55:00,113,113,113
59112,2019-11-12 13:00:00,0,0,0
59113,2019-11-12 13:05:00,115,115,115
59114,2019-11-12 13:10:00,118,118,118
59115,2019-11-12 13:15:00,117,117,117



VARIABLE: Vac R (V)
NEIGHBOURES (6 out of 6):[111 111 113] \--/ [115 118 117]
WEIGHTS: [0.33, 0.5, 1, 1, 0.5, 0.33]
COMPUTED VALUE: 114
--------------------------------------------------------------------------------

VARIABLE: Vac S (V)
NEIGHBOURES (6 out of 6):[112 111 113] \--/ [115 118 117]
WEIGHTS: [0.33, 0.5, 1, 1, 0.5, 0.33]
COMPUTED VALUE: 114
--------------------------------------------------------------------------------

VARIABLE: Vac T (V)
NEIGHBOURES (6 out of 6):[111 111 113] \--/ [115 118 117]
WEIGHTS: [0.33, 0.5, 1, 1, 0.5, 0.33]
COMPUTED VALUE: 114
--------------------------------------------------------------------------------

New data (with the filled value(s))


Date/Time    2019-11-12 13:00:00
Vac R (V)                    114
Vac S (V)                    114
Vac T (V)                    114
Name: 59112, dtype: object

--------------------------------------------------------------------------------
INVERTER NAME: INV2 
 --------------------------------------------------------------------------------
Z-score values (threshold: 5)
MIN: 41.43 
MAX: 46.28


Unnamed: 0,Vac R (V),Vac S (V),Vac T (V)
70876,41.429549,46.277999,43.9042


The extreme outliers (1) of Vac


Unnamed: 0,Date/Time,Vac R (V),Vac S (V),Vac T (V)
70876,2020-02-05 13:20:00,0,0,0


Outlier (idk: 70876) and its neighborhood


Unnamed: 0,Date/Time,Vac R (V),Vac S (V),Vac T (V)
70873,2020-02-05 13:05:00,121,121,122
70874,2020-02-05 13:10:00,119,119,119
70875,2020-02-05 13:15:00,121,120,121
70876,2020-02-05 13:20:00,0,0,0
70877,2020-02-05 13:25:00,120,120,121
70878,2020-02-05 13:35:00,121,120,121
70879,2020-02-05 13:40:00,121,121,121



VARIABLE: Vac R (V)
NEIGHBOURES (6 out of 6):[121 119 121] \--/ [120 121 121]
WEIGHTS: [0.33, 0.5, 1, 1, 0.5, 0.33]
COMPUTED VALUE: 120
--------------------------------------------------------------------------------

VARIABLE: Vac S (V)
NEIGHBOURES (6 out of 6):[121 119 120] \--/ [120 120 121]
WEIGHTS: [0.33, 0.5, 1, 1, 0.5, 0.33]
COMPUTED VALUE: 120
--------------------------------------------------------------------------------

VARIABLE: Vac T (V)
NEIGHBOURES (6 out of 6):[122 119 121] \--/ [121 121 121]
WEIGHTS: [0.33, 0.5, 1, 1, 0.5, 0.33]
COMPUTED VALUE: 121
--------------------------------------------------------------------------------

New data (with the filled value(s))


Date/Time    2020-02-05 13:20:00
Vac R (V)                    120
Vac S (V)                    120
Vac T (V)                    121
Name: 70876, dtype: object

--------------------------------------------------------------------------------
INVERTER NAME: INV3 
 --------------------------------------------------------------------------------
Z-score values (threshold: 5)
MIN: 41.28 
MAX: 45.7


Unnamed: 0,Vac R (V),Vac S (V),Vac T (V)
71807,41.280791,45.695119,43.809058


The extreme outliers (1) of Vac


Unnamed: 0,Date/Time,Vac R (V),Vac S (V),Vac T (V)
71807,2020-02-05 13:20:00,0,0,0


Outlier (idk: 71807) and its neighborhood


Unnamed: 0,Date/Time,Vac R (V),Vac S (V),Vac T (V)
71804,2020-02-05 13:05:00,121,122,122
71805,2020-02-05 13:10:00,119,120,120
71806,2020-02-05 13:15:00,121,122,121
71807,2020-02-05 13:20:00,0,0,0
71808,2020-02-05 13:25:00,120,122,121
71809,2020-02-05 13:35:00,121,121,121
71810,2020-02-05 13:40:00,121,122,121



VARIABLE: Vac R (V)
NEIGHBOURES (6 out of 6):[121 119 121] \--/ [120 121 121]
WEIGHTS: [0.33, 0.5, 1, 1, 0.5, 0.33]
COMPUTED VALUE: 120
--------------------------------------------------------------------------------

VARIABLE: Vac S (V)
NEIGHBOURES (6 out of 6):[122 120 122] \--/ [122 121 122]
WEIGHTS: [0.33, 0.5, 1, 1, 0.5, 0.33]
COMPUTED VALUE: 122
--------------------------------------------------------------------------------

VARIABLE: Vac T (V)
NEIGHBOURES (6 out of 6):[122 120 121] \--/ [121 121 121]
WEIGHTS: [0.33, 0.5, 1, 1, 0.5, 0.33]
COMPUTED VALUE: 121
--------------------------------------------------------------------------------

New data (with the filled value(s))


Date/Time    2020-02-05 13:20:00
Vac R (V)                    120
Vac S (V)                    122
Vac T (V)                    121
Name: 71807, dtype: object

--------------------------------------------------------------------------------
INVERTER NAME: INV4 
 --------------------------------------------------------------------------------
The extreme outliers (2) of Vac


Unnamed: 0,Date/Time,Vac R (V),Vac S (V),Vac T (V)
67397,2020-01-05 19:00:00,51,48,15
95538,2020-07-02 17:10:00,9,6,9


Outlier (idk: 67397) and its neighborhood


Unnamed: 0,Date/Time,Vac R (V),Vac S (V),Vac T (V)
67394,2020-01-05 18:45:00,113,114,113
67395,2020-01-05 18:50:00,113,114,113
67396,2020-01-05 18:55:00,114,114,113
67397,2020-01-05 19:00:00,51,48,15
67398,2020-01-05 19:10:00,114,114,113
67399,2020-01-05 19:15:00,114,115,114
67400,2020-01-05 19:20:00,114,114,113



VARIABLE: Vac R (V)
NEIGHBOURES (6 out of 6):[113 113 114] \--/ [114 114 114]
WEIGHTS: [0.33, 0.5, 1, 1, 0.5, 0.33]
COMPUTED VALUE: 114
--------------------------------------------------------------------------------

VARIABLE: Vac S (V)
NEIGHBOURES (6 out of 6):[114 114 114] \--/ [114 115 114]
WEIGHTS: [0.33, 0.5, 1, 1, 0.5, 0.33]
COMPUTED VALUE: 114
--------------------------------------------------------------------------------

VARIABLE: Vac T (V)
NEIGHBOURES (6 out of 6):[113 113 113] \--/ [113 114 113]
WEIGHTS: [0.33, 0.5, 1, 1, 0.5, 0.33]
COMPUTED VALUE: 113
--------------------------------------------------------------------------------

New data (with the filled value(s))


Date/Time    2020-01-05 19:00:00
Vac R (V)                    114
Vac S (V)                    114
Vac T (V)                    113
Name: 67397, dtype: object

--------------------------------------------------------------------------------
Outlier (idk: 95538) and its neighborhood


Unnamed: 0,Date/Time,Vac R (V),Vac S (V),Vac T (V)
95535,2020-07-02 16:50:00,114,114,113
95536,2020-07-02 16:55:00,116,115,115
95537,2020-07-02 17:00:00,114,114,113
95538,2020-07-02 17:10:00,9,6,9
95539,2020-07-02 17:15:00,114,115,114
95540,2020-07-02 17:20:00,114,114,113
95541,2020-07-02 17:25:00,113,114,113



VARIABLE: Vac R (V)
NEIGHBOURES (6 out of 6):[114 116 114] \--/ [114 114 113]
WEIGHTS: [0.33, 0.5, 1, 1, 0.5, 0.33]
COMPUTED VALUE: 114
--------------------------------------------------------------------------------

VARIABLE: Vac S (V)
NEIGHBOURES (6 out of 6):[114 115 114] \--/ [115 114 114]
WEIGHTS: [0.33, 0.5, 1, 1, 0.5, 0.33]
COMPUTED VALUE: 114
--------------------------------------------------------------------------------

VARIABLE: Vac T (V)
NEIGHBOURES (6 out of 6):[113 115 113] \--/ [114 113 113]
WEIGHTS: [0.33, 0.5, 1, 1, 0.5, 0.33]
COMPUTED VALUE: 114
--------------------------------------------------------------------------------

New data (with the filled value(s))


Date/Time    2020-07-02 17:10:00
Vac R (V)                    114
Vac S (V)                    114
Vac T (V)                    114
Name: 95538, dtype: object

--------------------------------------------------------------------------------


In [14]:
# Irradiance values
columns = ["Date/Time", "Irradiance (W/mq)"]
irr = raw_irr_data[columns].dropna().copy()
irr.reset_index(inplace=True, drop=True)

extreme_outliers = find_outliers(irr, verbose=True, threshold = 5)

if len(extreme_outliers) == 0:
    print("Oh, no outliers found. That's good.\n")
else:
    print(f"The extreme outliers (N = {len(extreme_outliers)}) of the variable 'inverter Temp'")
    display(extreme_outliers)
    
    idk_outliers = extreme_outliers.index.tolist()
    list_outliers = idk_outliers.copy()

    for idk, idk_outlier in enumerate(idk_outliers):
        print(f"Outlier {idk +1}/{len(idk_outliers)}: ({irr.loc[idk_outlier, 'Irradiance (W/mq)']} W/mq)")

        # Visualize the outlier and its neighours
        print(f"\nOutlier (idk: {idk_outlier}) and its neighborhood")
        display(irr.iloc[range(idk_outlier - 3, idk_outlier + 4), :])

        # Compute the estimated value and assign the the original dataframe
        computed_value = weighted_knn(irr, idk_outlier, list_outliers, "Irradiance (W/mq)", verbose=True)
        irr.loc[idk_outlier, "Irradiance (W/mq)"] = computed_value
        print(f"COMPUTED VALUE: {computed_value}\n","-" * 80)

        # Visualize the outcome
        print("\nNew data (with the filled value(s))")
        display(irr.iloc[range(idk_outlier - 3, idk_outlier + 4)][columns])

    print("-" * 40, f"END correction","-" * 40)
irr["Irradiance (W/mq)"] = irr["Irradiance (W/mq)"].astype("Int64")
raw_irr_data = irr


Oh, no outliers found. That's good.



In [15]:
display(raw_irr_data.describe())

Unnamed: 0,Irradiance (W/mq)
count,238796.0
mean,186.836517
std,270.687225
min,3.0
25%,8.0
50%,22.0
75%,286.0
max,1301.0


### Variable: *Inverter Temp* 

In [16]:
# TASK: B) Analyse and correct the outliers
# VARIABLES: Inverter Temp
# MOTIVATION: Some invalid values have been detected visually (i.e., numerical distribution graphs)
# e.g., Detect around 2K observations with a value in the range [65.529 - 65.535 °C]
columns = ["Date/Time", "Inverter temp. (°C)"]

# Carry out the analysis (and correction) on all the inverters
for inv_name in inv_names:#[3:4]:
    print("INVERTER NAME:", inv_name, "\n", "-"*80)
    
    # 1A) Detect extreme outliers 
    # Setting threshold: a very high threshold to detect (and correct) only extreme outliers (e.g. above 1000 °C)
    # Threshold has been set according to emperical tests (default outlier threshold is equal to 3)
    extreme_outliers = find_outliers(inv_data[inv_name][columns], verbose=True, threshold = 4)
    if len(extreme_outliers) == 0:
        print("Oh, no outliers found. That's good.\n")
        continue
        
    print(f"The extreme outliers (N = {len(extreme_outliers)}) of the variable 'inverter Temp'")
    display(extreme_outliers)
    
    # 1B) Replacing criterion --> Weighted KNN (k = 6)
    # Identfy the index of the outliers 
    idk_outliers = extreme_outliers.index.tolist()
    list_outliers = idk_outliers.copy()
    
    # Compute and assign the estimated value (weighted average value from its neigbours)
    for idk, idk_outlier in enumerate(idk_outliers):
        if len(idk_outliers) > 300:
            clear_output(wait=True)
            
        print(f"Outlier {idk +1}/{len(idk_outliers)}: ({inv_data[inv_name].loc[idk_outlier, 'Inverter temp. (°C)']} °C)")
        
        # Visualize the outlier and its neighours
        # print(f"\nOutlier (idk: {idk_outlier}) and its neighborhood")
        # display(inv_data[inv_name][columns].iloc[range(idk_outlier - 3, idk_outlier + 4), :])
        
        # Compute the estimated value and assign the the original dataframe
        computed_value = weighted_knn(inv_data[inv_name][columns], idk_outlier, list_outliers, "Inverter temp. (°C)", 
                                      verbose=False)
        inv_data[inv_name].loc[idk_outlier, "Inverter temp. (°C)"] = computed_value
        print(f"COMPUTED VALUE: {computed_value}\n","-" * 80)
                                                                      
        # Visualize the outcome
        # print("\nNew data (with the filled value(s))")
        # display(inv_data[inv_name].iloc[range(idk_outlier - 3, idk_outlier + 4)][columns])
        
    print("-" * 40, f"END {inv_name}","-" * 40)

Outlier 389/389: (65535 °C)
COMPUTED VALUE: 12
 --------------------------------------------------------------------------------
---------------------------------------- END INV3 ----------------------------------------
INVERTER NAME: INV4 
 --------------------------------------------------------------------------------
Z-score values (threshold: 4)
MIN: 51.16 
MAX: 51.16


Unnamed: 0,Inverter temp. (°C)
12917,51.159046
12923,51.159046
12925,51.159046
12926,51.159046
12927,51.159046
12930,51.159046
12943,51.159046
12969,51.159046
12971,51.159046
12973,51.159046


The extreme outliers (N = 57) of the variable 'inverter Temp'


Unnamed: 0,Date/Time,Inverter temp. (°C)
12917,2019-01-04 06:55:00,65535
12923,2019-01-04 08:00:00,65535
12925,2019-01-04 08:20:00,65535
12926,2019-01-04 08:30:00,65535
12927,2019-01-04 08:40:00,65535
12930,2019-01-04 09:15:00,65535
12943,2019-01-04 11:35:00,65535
12969,2019-01-04 16:35:00,65535
12971,2019-01-04 16:55:00,65535
12973,2019-01-04 17:15:00,65535


Outlier 1/57: (65535 °C)
COMPUTED VALUE: 1
 --------------------------------------------------------------------------------
Outlier 2/57: (65535 °C)
COMPUTED VALUE: 2
 --------------------------------------------------------------------------------
Outlier 3/57: (65535 °C)
COMPUTED VALUE: 1
 --------------------------------------------------------------------------------
Outlier 4/57: (65535 °C)
COMPUTED VALUE: 1
 --------------------------------------------------------------------------------
Outlier 5/57: (65535 °C)
COMPUTED VALUE: 1
 --------------------------------------------------------------------------------
Outlier 6/57: (65535 °C)
COMPUTED VALUE: 1
 --------------------------------------------------------------------------------
Outlier 7/57: (65535 °C)
COMPUTED VALUE: 2
 --------------------------------------------------------------------------------
Outlier 8/57: (65535 °C)
COMPUTED VALUE: 1
 --------------------------------------------------------------------------------


In [17]:
# TASK: B2) Check validity of the outlier correction for the inverter temperature
validity = []
inv_temp_values = []
for inv_name in inv_names:
    inv_temp = inv_data[inv_name]["Inverter temp. (°C)"]
    inv_temp_values.append(inv_temp.rename(index = inv_name))
    
    # Check invalid values
    threshold_inv_temp = 100
    invalid_values = inv_temp.loc[inv_temp >= threshold_inv_temp]
    
    if len(invalid_values) == 0:
        validity.append(True)
    else: 
        validity.append(False)

display(pd.concat(inv_temp_values, axis=1, names =inv_names).describe().round(decimals=2))
if all(validity):
    print(f"The outlier correction is valid for all the {len(inv_names)} inverters!")
else:
    print("There is some error in the outlier conversion")

Unnamed: 0,INV1,INV2,INV3,INV4
count,149861.0,149636.0,148146.0,149243.0
mean,16.7,23.29,19.39,21.61
std,8.17,7.31,8.46,6.76
min,1.0,1.0,1.0,1.0
25%,11.0,19.0,13.0,17.0
50%,17.0,25.0,19.0,23.0
75%,23.0,29.0,25.0,27.0
max,41.0,49.0,45.0,45.0


The outlier correction is valid for all the 4 inverters!


# Final task: Save outcomes 

In [18]:
# FINAL TASK: Save the cleared datasets 
saving_folder_name = "Cleaned"
saving_folder_path = path.join(path_file, saving_folder_name)

print("PV System --> ", system_name.upper())

# Create the saving folder
if not path.exists(saving_folder_path):
    makedirs(saving_folder_path)
    print(f"A new saving folder has been created: {saving_folder_path}\n")
    
# Save the files as CSV files 
for inv_name in inv_names:
    file_name = f"cleaned_{inv_name.upper()}_data.csv"
    inv_data[inv_name].to_csv(path.join(saving_folder_path, file_name), index=False)
    print(f"The cleaned data for '{inv_name}' has been saved.")

raw_irr_data.to_csv(path.join(saving_folder_path, "raw_irr_data.csv"), index=False) 
print("\nThe cleaned data for the 'irradiance values' has been saved.")

PV System -->  EMI
The cleaned data for 'INV1' has been saved.
The cleaned data for 'INV2' has been saved.
The cleaned data for 'INV3' has been saved.
The cleaned data for 'INV4' has been saved.

The cleaned data for the 'irradiance values' has been saved.
