In [13]:
from sys import path
if '../..' not in path:
    path.insert(0, '../..')

In [14]:
from os import path, makedirs
from _library.utils import SYSTEM_NAMES_FULL, load_datasets
from scipy.stats import zscore
from IPython.display import clear_output
import numpy as np
import pandas as pd

In [15]:
# Select the main folder 
%cd /mnt/data/vieri/projects/SAMPLE/

# Visualize names of PV systems
print(SYSTEM_NAMES_FULL)
# --- 0 ---------- 1 --------- 2 ------ 3 ------ 4 --------- 5 --------- 6 -------- 7 ---

/mnt/data/vieri/projects/SAMPLE
['Binetto 1', 'Binetto 2', 'Cantore', 'Emi', 'Soleto 1', 'Soleto 2', 'Galatina', 'Verone']


# Selecting the PV system

In [16]:
system_name = SYSTEM_NAMES_FULL[4]
print("PV SYSTEM --> ", system_name)

PV SYSTEM -->  Soleto 1


# Loading dataset

In [17]:
# Loading the datasets
path_file, inv_data, inv_names, raw_irr_data, string_inv_data, string_inv_names = load_datasets(system_name, verbose=True)

-------------------------------------------------------------------------------- 
				PV SYSTEM --> SOLETO 1 
--------------------------------------------------------------------------------

Loading inverter data...
SOLETO 1: OK, component data loaded (4) --> INV1, INV2, INV3, INV4

Loading irradiance values...
SOLETO 1: OK, raw irradiance data (234304 observations) have been loaded

-------------------------------------------------------------------------------- 
FINISHED!: All datasets have been loaded. (SYS: 4 - IRR FILE: 1)
--------------------------------------------------------------------------------
-------------------------------------------------------------------------------- 
EXAMPLE --> Soleto 1: INV1 (FROM '2018-08-08' TO '2021-06-30': 1057 days).
--------------------------------------------------------------------------------
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 146605 entries, 0 to 146604
Data columns (total 18 columns):
 #   Column               Non-Null 

# Exploration

In [6]:
# TASK (EXPLORE): Carry out some descriptive statistics on problematic columns (i.e., those identified through numerical and temporal distribution graphs)
to_load = inv_names[0]

# Check numerical distribution via some descriptive statistics (mean, IQR, etc)
check_emptiness = ["Pac S (kW)", "Pac T (kW)", "Stato", "Fac (Hz)"] 
varibles_to_analyse = ["Irr. medio (W/mq)", "Inverter temp. (°C)"]
for var in [check_emptiness + varibles_to_analyse]:
    print(inv_data[to_load][var].describe(), "\n")

# Quick check at the invalid temperature values (above the likely physical limit)
valid_temperature = 200 # °C
valid_tempValue = inv_data[to_load].loc[inv_data[to_load]["Inverter temp. (°C)"] <= valid_temperature, :]
print(f"VALID TEMP. VALUE: {len(valid_tempValue)} ({round((len(valid_tempValue)/len(inv_data[to_load]['Inverter temp. (°C)']))*100, 2)}%)")

# FINDINGS: 
# 1) "Pac S (kW)", "Pac T (kW)", "Stato", "Fac (Hz)" --> OK: 0 valid values --> They can be discarded
# 2) Irr. medio --> it seems okay (mean, std, IQR, min, max)
# 3) Inverter temp [°C] --> Strange 
#   --->  mean: 1064, std: 8223, IQR: 11-21, max = 65535
#   ---> value above 1000 should be analyse

       Pac S (kW)  Pac T (kW)  Stato  Fac (Hz)  Irr. medio (W/mq)  \
count         0.0         0.0    0.0       0.0      125184.000000   
mean          NaN         NaN    NaN       NaN         306.464061   
std           NaN         NaN    NaN       NaN         371.583198   
min           NaN         NaN    NaN       NaN           4.000000   
25%           NaN         NaN    NaN       NaN          33.000000   
50%           NaN         NaN    NaN       NaN         121.000000   
75%           NaN         NaN    NaN       NaN         578.000000   
max           NaN         NaN    NaN       NaN        5040.000000   

       Inverter temp. (°C)  
count        146605.000000  
mean             41.598738  
std            1121.816403  
min               1.000000  
25%              17.000000  
50%              21.000000  
75%              29.000000  
max           65535.000000   

VALID TEMP. VALUE: 146562 (99.97%)


In [7]:
# TASK (EXPLORE): Check whather a type converstion is necessary?
# VARIABLE: Irr. medio (W/mq) --> From 'float64' to int64
irr_values = inv_data[to_load]["Irr. medio (W/mq)"]
non_integer = [value for value in irr_values if not value.is_integer()]
non_nanInteger = [value for value in non_integer if not np.isnan(value)]
print("Number of non integer values (that are not NaN values):", len(non_nanInteger))

# Check its validity 
integer_irr_values = pd.array(irr_values, dtype="Int64")
diff = np.abs(irr_values - integer_irr_values)
diff.dropna(inplace = True)
mean = np.mean(diff)
std = np.std(diff)
print(f"\nCASTING to Int64: Compute difference to check validity.\nMEAN (diff): {mean}, STD (diff):{std}")

# FINDIGS: Irradiance values are all integer values, but the empty values (numpy NaN) was forcing the column to be a float number

Number of non integer values (that are not NaN values): 0

CASTING to Int64: Compute difference to check validity.
MEAN (diff): 0.0, STD (diff):0.0


# fix uniqueness of the datetimes

In [None]:
# Check uniqueness --> Date/time
def check_datetime_uniqueness():
    index_to_delate = dict()
    
    for inv_name in inv_names:
        df = inv_data[inv_name]
        print(f"\nAnalysing {inv_name}...")
        
        if set(check_emptiness).issubset(df.columns):
            df = df.drop(columns = ["Irr. medio (W/mq)"] + check_emptiness)

        # Check duplicates
        condition = df["Date/Time"].duplicated(keep=False)
        duplicated_datetime = df[condition]
        
        # Continue to analyse other inverter in case of no duplicated observations
        if len(duplicated_datetime) == 0:
            print(f"Oh, that's good. No duplicated observations have been found for {inv_name}!")
            continue
        
        # Extraxct unique datetimes
        unique_duplicated_dt = [pd.to_datetime(datetime) for datetime in duplicated_datetime["Date/Time"].unique()]
        print(f"DUPLICATE DATES: {len(unique_duplicated_dt)}: {[dt.strftime('%Y-%m-%d, %H:%M') for dt in unique_duplicated_dt]} "\
              f"(Total observations: {len(duplicated_datetime)})")
        #display(duplicated_datetime)

        # Investigate to this behaviour for a duplicate observations
        datetime_to_investigate = unique_duplicated_dt[0]
        delta = pd.Timedelta(15, unit="minutes") #datetime.timedelta(minutes=60)
        period = (datetime_to_investigate - delta, datetime_to_investigate + delta)
        #display(df.loc[df["Date/Time"].between(period[0], period[1]), :])

        # Compute difference 
        to_discard = set()
        for datetime in unique_duplicated_dt:
            daily_duplicated_indexes = df[df["Date/Time"] == datetime].index
            duplicated_obs = df.loc[daily_duplicated_indexes, :].drop(columns = "Date/Time")
            obs = pd.Series()
            
            daily_duplicated_indexes
            
            if len(unique_duplicated_dt) > 300:
                clear_output(wait=True)
            print(f"\nAnalysing {datetime} - (observations: {len(daily_duplicated_indexes)})")
            print("-"*120)
            
            equal_observations = set()
            for idk_check, index_obs in enumerate(daily_duplicated_indexes):
                
                other_duplicated_obs = duplicated_obs.drop(index_obs, axis=0)

                check = duplicated_obs.loc[index_obs,:].eq(other_duplicated_obs)
                find_equal_obs = check.all(axis=1)[check.all(axis=1) == True].index.tolist()

                print(f"--> Analysing idk: {index_obs}...")                
                if find_equal_obs:
                    print(f"    Equal observation(s) have been found: {find_equal_obs}")
                    idk_equal_obs = [index_obs] + [idk for idk in find_equal_obs]
                    idk_equal_obs.sort()
                    
                    #display(duplicated_obs.loc[idk_equal_obs, :])
                    #idk_equal_obs = [idk for idk in find_equal_obs]

                    # Save pairs of equal duplicated observations
                    equal_observations.add(tuple(idk_equal_obs))
   
            # Keep the first equal and discard the other equal observations
            duplicated_diff_obs = sorted([obs_to_discard for pairs in equal_observations for obs_to_discard in pairs[1:]])
            
            if len(duplicated_diff_obs) != 0:
                to_discard.update(duplicated_diff_obs)
                print("\n--> OK, discarding the identical observations: ", duplicated_diff_obs)
            else:
                print("No equal observations have been found")
        
            # Highlight potential issues: same timestamp not equal values
            remaining_duplicated_obs = sorted(set(daily_duplicated_indexes) - set(duplicated_diff_obs))
            
            if len(remaining_duplicated_obs) > 1:
                print(f"\n--> Issue, duplicated observations with different values have been found! --> {remaining_duplicated_obs}")
                
                mean_deltas = []
                to_discard_strat_2 = []
                for idk_observation in remaining_duplicated_obs:
                    print(f"    FOCUSING ON IDK: {idk_observation}")

                    # Compute the idk of previous and upcoming observations
                    prev_idk = idk_observation - 1
                    if prev_idk in daily_duplicated_indexes:
                        prev_idk = idk_observation - len(duplicated_diff_obs) - 1 
      
                    upcom_idk = idk_observation + 1
                    if upcom_idk in daily_duplicated_indexes:
                        upcom_idk = idk_observation + len(duplicated_diff_obs) + 1

                    # Compute previous and upcoming observations
                    obs_kwh = df.loc[idk_observation, :]["E. totale (kWh)"]
                    prev = df.loc[prev_idk, :]["E. totale (kWh)"]
                    upcom = df.loc[upcom_idk, :]["E. totale (kWh)"]

                    # Compute delta
                    mean_deltas.append((idk_observation,np.mean([np.abs(obs_kwh - prev),np.abs(upcom - obs_kwh)]) ))

                    # Check validity 
                    if prev <= obs_kwh <= upcom:
                        print("    Valid observation.")
                    else:
                        print("    Invalid values of E. total (kWH) the observation will be discarded. ")
                        to_discard_strat_2.append(idk_observation)
                        
                if len(to_discard_strat_2) == len(remaining_duplicated_obs):
                    print(prev_idk, "- ",idk_observation, "-", upcom_idk)
                    neighbours = df.loc[prev_idk:upcom_idk, :]
                    display(neighbours)
                    to_discard_strat_2 = to_discard_strat_2[1:]
                    print(to_discard_strat_2)
                    
                # IN CASE: Multiple valid observations (according to the )
                if len(remaining_duplicated_obs) - len(to_discard_strat_2) > 1:
                    print("\n    Multiple valid duplicated observations have been found. "\
                          f"{sorted(set(remaining_duplicated_obs) - set(to_discard_strat_2))}")

                    # Delete deltas of already selected discarted observations
                    if to_discard_strat_2:
                        idx_to_delate = [item for item in to_discard_strat_2]
                        mean_deltas = [delta for delta in mean_deltas if delta[0] not in idx_to_delate]

                    # Sort according to the deltas
                    mean_deltas.sort(key = lambda item: item[1])

                    if len(mean_deltas) >= 3: # Multiple items to discard
                        to_discard_strat_2.extend([item[0] for item in mean_deltas[1:]])
                    else:
                        to_discard_strat_2.append(*[item[0] for item in mean_deltas[1:]])
                    print(f"    Keeping the one (idk: {mean_deltas[0][0]}) with the less delta with the previous and next observations")                        
            
                # Add the discarted index found with this stategy
                to_discard.update(to_discard_strat_2)
                    
                # Visualize final outcome for the day
                idx_to_discard = to_discard_strat_2 + duplicated_diff_obs
                print(f"\n--> TO KEEP: {set(daily_duplicated_indexes) - set(idx_to_discard)} \---/ "\
                      f"TO DISCARD ({len(idx_to_discard)} out of {len(daily_duplicated_indexes)}): "\
                      f"{sorted(to_discard_strat_2)} + {sorted(duplicated_diff_obs)}")
                print("-"*120)
        
        # Select the index to remove (keep the first one, as they are the equal)
        idx_to_delate = set(obs_to_discard for pairs in equal_observations for obs_to_discard in pairs[1:])
        item_to_delate = sorted(idx_to_delate.union(to_discard))
        print(f"TO DELATE ({len(item_to_delate)} out of {len(duplicated_datetime)}):", item_to_delate)
        
        # Delate the equal duplicate
        index_to_delate[inv_name] = item_to_delate
        
    return index_to_delate
# -------------------------------------------------------------------------------
index_to_delate = check_datetime_uniqueness()

# Carry out some transformations

In [None]:
# TASK: Carry out the necessary stategies to tackle the issues discovered in the above analyses
for inv_name in inv_names:
    print(f"\n{inv_name.upper()}...")
    
    # TASK: Delate duplicate observations (same timestamps)
    inv_data[inv_name].drop(index = index_to_delate[inv_name], inplace=True)
    inv_data[inv_name].reset_index(drop=True, inplace = True)
    print(f"--> Discared some duplicated observations (i.e., same timestamps) ({len(index_to_delate[inv_name])})")
    
    # TASK: Delate the columns (i.e., variables) that have been confirmed empty
    inv_data[inv_name].drop(columns = check_emptiness, inplace=True)
    print(f"--> Discared some empty columns (i.e., {check_emptiness})")
    
    # TASK: Remove the irradiance column that was joined previously --> it'll be re-added after fixing the sampling of both side
    inv_data[inv_name].drop(columns = ["Irr. medio (W/mq)"], inplace=True)
    print("--> Removed the column of irradiance values, since it will be re-added after tackling the sampling issue.")
# ---------------------------------------------------------------------------------
# TASK: Carry out some minor changes on the "raw irradiance file"
print("\nRAW IRRADIANCE FILE ...")
# Rename the column of the data
raw_irr_data.rename(columns = {"data":"Date/Time", "irr. medio 1 W/mq":"Irradiance (W/mq)"}, inplace=True)
print("--> Renamed the columns to improve readability and compatibility with other datasets")

#### check

# Detect and correct outliers

In [18]:
# -----------------------------------------------------
# FUNCTION: Find the outliers by adopting the z-score
# -----------------------------------------------------
def find_outliers(df, threshold = 3, verbose = False): 
    numerical_df = df.select_dtypes(include = np.number)
    
    # THE MEASURAMENT: Z-score
    # It describes a value's relationship to the mean of a group of values 
    # It's the number of standard deviations by which the observed value is above/below the mean value of what is being measured
    z_score = np.abs(zscore(numerical_df)) 

    # Filter the observations that are above the threshold (from literature: 3 is a typical cut-off point to detect the outliers)
    if len(z_score.columns) == 3:
        cond_outlier = (z_score.iloc[:, 0] > threshold) & (z_score.iloc[:, 1] > threshold) & (z_score.iloc[:, 2] > threshold)
    elif len(z_score.columns) == 2:
         cond_outlier = (z_score.iloc[:, 0] > threshold) & (z_score.iloc[:, 1] > threshold)
    else:
        cond_outlier = z_score.iloc[:, 0] > threshold 

    # Find the outliers according to the threshold of the z-scores
    outliers_idk = z_score[cond_outlier].index
    outliers = df.loc[outliers_idk, :]
    
    if verbose & len(outliers) != 0:
        print(f"Z-score values (threshold: {threshold})")
        print("MIN:", round(np.min(min(z_score[cond_outlier].values.tolist())), 2), \
              "\nMAX:",round(np.max(max(z_score[cond_outlier].values.tolist())), 2))
              
        display(z_score[cond_outlier])
        
    return outliers
# ------------------------------------------------------------------------------------------
# FUNCTION: Compute a weighted average value according to the observation's neighbours (KNN)
# ------------------------------------------------------------------------------------------
def weighted_knn(df, idk_position, list_outliers, column,  K = 6, verbose=True):
    # Define the interval before and after the position
    k_before = K//2 
    k_after = K//2
    
    # Define the range of neighbours 
    idk_range = np.arange(idk_outlier - k_before, idk_outlier + k_after + 1)
    
    # Find the indexes of other outliers (for excluding them in the weighted average)
    if idk_position in list_outliers:
        list_outliers.remove(idk_position)
    
    # Detect potential outliers in its neighbours
    problematic_idk_neighbours = [idk_outlier for idk_outlier in list_outliers if idk_outlier in idk_range]
        
    if len(problematic_idk_neighbours) != 0:
        if verbose:
            print(f"\nOPS: Problematic IDK neighbours discovered {len(problematic_idk_neighbours)} "\
                  f"({round(((len(problematic_idk_neighbours)/len(idk_range))*100), 2)} %) --> {problematic_idk_neighbours}")
        
        # Try to increase the number of neighbours 
        MAX_K = 30
        if len(problematic_idk_neighbours) >= K//2:
            new_k = K*2 
            if new_k <= MAX_K:
                if verbose: 
                    print(f"Trying with more neighbours ({new_k})...")
                weighted_knn(df, idk_position, list_outliers, column,  K = new_k, verbose=False)
            else:
                if verbose:
                    print("Reach maximum of K neighbours")
        
        # Remove problematic neighbours
        idk_range = idk_range[~ np.isin(idk_range, problematic_idk_neighbours)]
        
        if verbose:
            print("\nEnd process of increasing number of neighbours (i.e., number of problematic neigbours is acceptable (fewer than K/2)")
            print(f"The problematic neighbours ({len(problematic_idk_neighbours)}) have been removed from the neighbour list")
    
    # Find the neigbourhood 
    idk_outlier_pos = np.argwhere(idk_range == idk_position)[0][0]
    neighbours_idk = np.delete(idk_range, idk_outlier_pos)
    
    neighbours = np.array(df.loc[neighbours_idk, :][column]) # iloc
    outlier_value = df.loc[idk_position, :][column]
    
    if verbose:
        print(f"\nVARIABLE: {column}")
        print(f"NEIGHBOURES ({len(neighbours)} out of {K}):{neighbours[:idk_outlier_pos]} \--/ {neighbours[idk_outlier_pos:]}")
    
    # Define the weights
    dist_idk = [np.abs(idk - idk_position) for idk in neighbours_idk]
    penalize = lambda dist: min(dist_idk)/dist if dist != min(dist_idk) else 1 #min(dist_idk)/(1 + (dist/10))
    weights = [round(penalize(dist), 2) for dist in dist_idk]
    
    if verbose:
        print("WEIGHTS:", weights)

    # Compute the weighted average 
    weighted_average_value = np.average(neighbours, weights = weights)
    
    # Round and cast the value to an integer value
    candidate_value = int(round(weighted_average_value, 0))
    
    if verbose:
        print(f"COMPUTED VALUE: {candidate_value}")
        print("-" * 80)
    return candidate_value

### Variable: *AC voltage values* (Vac R/S/T)

In [19]:
# TASK: A) Analyse and correct the outliers
# VARIABLES: AC voltage values (Vac R/S/T)
# MOTIVATION: Some invalid values (i.e., zero values) have been detected visually (i.e., numerical distribution graphs)
columns = ["Date/Time", "Vac R (V)", "Vac S (V)", "Vac T (V)"]

# Carry out the analysis (and correction) on all the inverters
for inv_name in inv_names:
    print("INVERTER NAME:", inv_name, "\n", "-"*80)
    
    # 1A) Detect extreme outliers 
    # Setting threshold: a very high threshold to detect (and correct) only extreme outliers (e.g. zero values)
    # Threshold has been set according to emperical tests (default outlier threshold is equal to 3)
    extreme_vac_outlier = find_outliers(inv_data[inv_name][columns], verbose=True, threshold = 5)
    
    if len(extreme_vac_outlier) == 0:
        print("Oh, no extreme outliers found. That's good.\n")
        continue
        
    print(f"The extreme outliers ({len(extreme_vac_outlier)}) of Vac")
    display(extreme_vac_outlier)

    # 1B) Replacing criterion --> Weighted KNN (k = 6)
    # Identfy the index of the outliers 
    idk_outliers = extreme_vac_outlier.index.tolist()
    list_outliers = idk_outliers.copy()
    
    # Compute and assign the estimated value (weighted average value from its neigbours)
    for idk_outlier in idk_outliers:
        
        # Visualize the outlier and its neighours
        print(f"Outlier (idk: {idk_outlier}) and its neighborhood")
        display(inv_data[inv_name][columns].loc[range(idk_outlier - 3, idk_outlier + 4), :])
        
        # Compute the estimated value and assign the the original dataframe
        inv_data[inv_name].loc[idk_outlier, "Vac R (V)"] = weighted_knn(inv_data[inv_name][columns], 
                                                                        idk_outlier, list_outliers, "Vac R (V)")
        inv_data[inv_name].loc[idk_outlier, "Vac S (V)"] = weighted_knn(inv_data[inv_name][columns], 
                                                                        idk_outlier, list_outliers, "Vac S (V)")
        inv_data[inv_name].loc[idk_outlier, "Vac T (V)"] = weighted_knn(inv_data[inv_name][columns],
                                                                        idk_outlier, list_outliers, "Vac T (V)")

        # Visualize the outcome
        print("\nNew data (with the filled value(s))")
        display(inv_data[inv_name].loc[idk_outlier][columns])
        print("-" * 80)

INVERTER NAME: INV1 
 --------------------------------------------------------------------------------
Z-score values (threshold: 5)
MIN: 61.58 
MAX: 65.35


Unnamed: 0,Vac R (V),Vac S (V),Vac T (V)
34906,61.582879,64.854861,65.354862
36749,61.582879,64.854861,65.354862
37400,61.582879,64.854861,65.354862


The extreme outliers (3) of Vac


Unnamed: 0,Date/Time,Vac R (V),Vac S (V),Vac T (V)
34906,2019-06-24 14:30:00,0,0,0
36749,2019-07-06 08:25:00,0,0,0
37400,2019-07-10 07:40:00,0,0,0


Outlier (idk: 34906) and its neighborhood


Unnamed: 0,Date/Time,Vac R (V),Vac S (V),Vac T (V)
34903,2019-06-24 14:05:00,118,118,118
34904,2019-06-24 14:20:00,118,118,118
34905,2019-06-24 14:25:00,118,118,118
34906,2019-06-24 14:30:00,0,0,0
34907,2019-06-24 14:40:00,118,118,118
34908,2019-06-24 14:55:00,118,118,118
34909,2019-06-24 15:00:00,118,118,118



VARIABLE: Vac R (V)
NEIGHBOURES (6 out of 6):[118 118 118] \--/ [118 118 118]
WEIGHTS: [0.33, 0.5, 1, 1, 0.5, 0.33]
COMPUTED VALUE: 118
--------------------------------------------------------------------------------

VARIABLE: Vac S (V)
NEIGHBOURES (6 out of 6):[118 118 118] \--/ [118 118 118]
WEIGHTS: [0.33, 0.5, 1, 1, 0.5, 0.33]
COMPUTED VALUE: 118
--------------------------------------------------------------------------------

VARIABLE: Vac T (V)
NEIGHBOURES (6 out of 6):[118 118 118] \--/ [118 118 118]
WEIGHTS: [0.33, 0.5, 1, 1, 0.5, 0.33]
COMPUTED VALUE: 118
--------------------------------------------------------------------------------

New data (with the filled value(s))


Date/Time    2019-06-24 14:30:00
Vac R (V)                    118
Vac S (V)                    118
Vac T (V)                    118
Name: 34906, dtype: object

--------------------------------------------------------------------------------
Outlier (idk: 36749) and its neighborhood


Unnamed: 0,Date/Time,Vac R (V),Vac S (V),Vac T (V)
36746,2019-07-06 06:15:00,116,116,116
36747,2019-07-06 06:20:00,115,115,116
36748,2019-07-06 06:25:00,115,116,116
36749,2019-07-06 08:25:00,0,0,0
36750,2019-07-06 08:40:00,116,115,116
36751,2019-07-06 09:05:00,115,115,116
36752,2019-07-06 09:10:00,115,115,115



VARIABLE: Vac R (V)
NEIGHBOURES (6 out of 6):[116 115 115] \--/ [116 115 115]
WEIGHTS: [0.33, 0.5, 1, 1, 0.5, 0.33]
COMPUTED VALUE: 115
--------------------------------------------------------------------------------

VARIABLE: Vac S (V)
NEIGHBOURES (6 out of 6):[116 115 116] \--/ [115 115 115]
WEIGHTS: [0.33, 0.5, 1, 1, 0.5, 0.33]
COMPUTED VALUE: 115
--------------------------------------------------------------------------------

VARIABLE: Vac T (V)
NEIGHBOURES (6 out of 6):[116 116 116] \--/ [116 116 115]
WEIGHTS: [0.33, 0.5, 1, 1, 0.5, 0.33]
COMPUTED VALUE: 116
--------------------------------------------------------------------------------

New data (with the filled value(s))


Date/Time    2019-07-06 08:25:00
Vac R (V)                    115
Vac S (V)                    115
Vac T (V)                    116
Name: 36749, dtype: object

--------------------------------------------------------------------------------
Outlier (idk: 37400) and its neighborhood


Unnamed: 0,Date/Time,Vac R (V),Vac S (V),Vac T (V)
37397,2019-07-10 07:10:00,116,115,116
37398,2019-07-10 07:15:00,115,115,116
37399,2019-07-10 07:20:00,115,115,115
37400,2019-07-10 07:40:00,0,0,0
37401,2019-07-10 08:40:00,116,115,116
37402,2019-07-10 08:45:00,116,115,116
37403,2019-07-10 08:50:00,116,115,116



VARIABLE: Vac R (V)
NEIGHBOURES (6 out of 6):[116 115 115] \--/ [116 116 116]
WEIGHTS: [0.33, 0.5, 1, 1, 0.5, 0.33]
COMPUTED VALUE: 116
--------------------------------------------------------------------------------

VARIABLE: Vac S (V)
NEIGHBOURES (6 out of 6):[115 115 115] \--/ [115 115 115]
WEIGHTS: [0.33, 0.5, 1, 1, 0.5, 0.33]
COMPUTED VALUE: 115
--------------------------------------------------------------------------------

VARIABLE: Vac T (V)
NEIGHBOURES (6 out of 6):[116 116 115] \--/ [116 116 116]
WEIGHTS: [0.33, 0.5, 1, 1, 0.5, 0.33]
COMPUTED VALUE: 116
--------------------------------------------------------------------------------

New data (with the filled value(s))


Date/Time    2019-07-10 07:40:00
Vac R (V)                    116
Vac S (V)                    115
Vac T (V)                    116
Name: 37400, dtype: object

--------------------------------------------------------------------------------
INVERTER NAME: INV2 
 --------------------------------------------------------------------------------
The extreme outliers (6) of Vac


Unnamed: 0,Date/Time,Vac R (V),Vac S (V),Vac T (V)
33152,2019-06-14 06:20:00,103,103,104
33154,2019-06-14 06:35:00,100,101,102
34849,2019-06-24 08:10:00,0,0,0
34979,2019-06-25 08:20:00,0,0,0
35949,2019-07-01 07:10:00,0,0,0
35950,2019-07-01 08:10:00,0,0,0


Outlier (idk: 33152) and its neighborhood


Unnamed: 0,Date/Time,Vac R (V),Vac S (V),Vac T (V)
33149,2019-06-14 06:05:00,112,113,114
33150,2019-06-14 06:10:00,112,112,114
33151,2019-06-14 06:15:00,113,113,114
33152,2019-06-14 06:20:00,103,103,104
33153,2019-06-14 06:30:00,113,113,114
33154,2019-06-14 06:35:00,100,101,102
33155,2019-06-14 06:40:00,105,106,107



OPS: Problematic IDK neighbours discovered 1 (14.29 %) --> [33154]

End process of increasing number of neighbours (i.e., number of problematic neigbours is acceptable (fewer than K/2)
The problematic neighbours (1) have been removed from the neighbour list

VARIABLE: Vac R (V)
NEIGHBOURES (5 out of 6):[112 112 113] \--/ [113 105]
WEIGHTS: [0.33, 0.5, 1, 1, 0.33]
COMPUTED VALUE: 112
--------------------------------------------------------------------------------

OPS: Problematic IDK neighbours discovered 1 (14.29 %) --> [33154]

End process of increasing number of neighbours (i.e., number of problematic neigbours is acceptable (fewer than K/2)
The problematic neighbours (1) have been removed from the neighbour list

VARIABLE: Vac S (V)
NEIGHBOURES (5 out of 6):[113 112 113] \--/ [113 106]
WEIGHTS: [0.33, 0.5, 1, 1, 0.33]
COMPUTED VALUE: 112
--------------------------------------------------------------------------------

OPS: Problematic IDK neighbours discovered 1 (14.29 %) --> [331

Date/Time    2019-06-14 06:20:00
Vac R (V)                    112
Vac S (V)                    112
Vac T (V)                    113
Name: 33152, dtype: object

--------------------------------------------------------------------------------
Outlier (idk: 33154) and its neighborhood


Unnamed: 0,Date/Time,Vac R (V),Vac S (V),Vac T (V)
33151,2019-06-14 06:15:00,113,113,114
33152,2019-06-14 06:20:00,112,112,113
33153,2019-06-14 06:30:00,113,113,114
33154,2019-06-14 06:35:00,100,101,102
33155,2019-06-14 06:40:00,105,106,107
33156,2019-06-14 06:45:00,112,113,114
33157,2019-06-14 06:50:00,114,114,115



VARIABLE: Vac R (V)
NEIGHBOURES (6 out of 6):[113 112 113] \--/ [105 112 114]
WEIGHTS: [0.33, 0.5, 1, 1, 0.5, 0.33]
COMPUTED VALUE: 111
--------------------------------------------------------------------------------

VARIABLE: Vac S (V)
NEIGHBOURES (6 out of 6):[113 112 113] \--/ [106 113 114]
WEIGHTS: [0.33, 0.5, 1, 1, 0.5, 0.33]
COMPUTED VALUE: 111
--------------------------------------------------------------------------------

VARIABLE: Vac T (V)
NEIGHBOURES (6 out of 6):[114 113 114] \--/ [107 114 115]
WEIGHTS: [0.33, 0.5, 1, 1, 0.5, 0.33]
COMPUTED VALUE: 112
--------------------------------------------------------------------------------

New data (with the filled value(s))


Date/Time    2019-06-14 06:35:00
Vac R (V)                    111
Vac S (V)                    111
Vac T (V)                    112
Name: 33154, dtype: object

--------------------------------------------------------------------------------
Outlier (idk: 34849) and its neighborhood


Unnamed: 0,Date/Time,Vac R (V),Vac S (V),Vac T (V)
34846,2019-06-24 06:05:00,116,117,117
34847,2019-06-24 06:10:00,116,117,117
34848,2019-06-24 08:00:00,117,118,119
34849,2019-06-24 08:10:00,0,0,0
34850,2019-06-24 08:20:00,116,116,118
34851,2019-06-24 08:45:00,116,116,118
34852,2019-06-24 08:50:00,116,117,118



VARIABLE: Vac R (V)
NEIGHBOURES (6 out of 6):[116 116 117] \--/ [116 116 116]
WEIGHTS: [0.33, 0.5, 1, 1, 0.5, 0.33]
COMPUTED VALUE: 116
--------------------------------------------------------------------------------

VARIABLE: Vac S (V)
NEIGHBOURES (6 out of 6):[117 117 118] \--/ [116 116 117]
WEIGHTS: [0.33, 0.5, 1, 1, 0.5, 0.33]
COMPUTED VALUE: 117
--------------------------------------------------------------------------------

VARIABLE: Vac T (V)
NEIGHBOURES (6 out of 6):[117 117 119] \--/ [118 118 118]
WEIGHTS: [0.33, 0.5, 1, 1, 0.5, 0.33]
COMPUTED VALUE: 118
--------------------------------------------------------------------------------

New data (with the filled value(s))


Date/Time    2019-06-24 08:10:00
Vac R (V)                    116
Vac S (V)                    117
Vac T (V)                    118
Name: 34849, dtype: object

--------------------------------------------------------------------------------
Outlier (idk: 34979) and its neighborhood


Unnamed: 0,Date/Time,Vac R (V),Vac S (V),Vac T (V)
34976,2019-06-25 06:30:00,116,117,118
34977,2019-06-25 06:50:00,116,117,118
34978,2019-06-25 07:55:00,116,117,118
34979,2019-06-25 08:20:00,0,0,0
34980,2019-06-25 08:35:00,117,117,118
34981,2019-06-25 08:45:00,117,118,119
34982,2019-06-25 09:05:00,117,118,118



VARIABLE: Vac R (V)
NEIGHBOURES (6 out of 6):[116 116 116] \--/ [117 117 117]
WEIGHTS: [0.33, 0.5, 1, 1, 0.5, 0.33]
COMPUTED VALUE: 116
--------------------------------------------------------------------------------

VARIABLE: Vac S (V)
NEIGHBOURES (6 out of 6):[117 117 117] \--/ [117 118 118]
WEIGHTS: [0.33, 0.5, 1, 1, 0.5, 0.33]
COMPUTED VALUE: 117
--------------------------------------------------------------------------------

VARIABLE: Vac T (V)
NEIGHBOURES (6 out of 6):[118 118 118] \--/ [118 119 118]
WEIGHTS: [0.33, 0.5, 1, 1, 0.5, 0.33]
COMPUTED VALUE: 118
--------------------------------------------------------------------------------

New data (with the filled value(s))


Date/Time    2019-06-25 08:20:00
Vac R (V)                    116
Vac S (V)                    117
Vac T (V)                    118
Name: 34979, dtype: object

--------------------------------------------------------------------------------
Outlier (idk: 35949) and its neighborhood


Unnamed: 0,Date/Time,Vac R (V),Vac S (V),Vac T (V)
35946,2019-07-01 05:25:00,117,118,118
35947,2019-07-01 05:30:00,117,118,118
35948,2019-07-01 05:35:00,117,118,118
35949,2019-07-01 07:10:00,0,0,0
35950,2019-07-01 08:10:00,0,0,0
35951,2019-07-01 08:20:00,118,120,119
35952,2019-07-01 08:30:00,117,119,119



OPS: Problematic IDK neighbours discovered 1 (14.29 %) --> [35950]

End process of increasing number of neighbours (i.e., number of problematic neigbours is acceptable (fewer than K/2)
The problematic neighbours (1) have been removed from the neighbour list

VARIABLE: Vac R (V)
NEIGHBOURES (5 out of 6):[117 117 117] \--/ [118 117]
WEIGHTS: [0.33, 0.5, 1, 0.5, 0.33]
COMPUTED VALUE: 117
--------------------------------------------------------------------------------

OPS: Problematic IDK neighbours discovered 1 (14.29 %) --> [35950]

End process of increasing number of neighbours (i.e., number of problematic neigbours is acceptable (fewer than K/2)
The problematic neighbours (1) have been removed from the neighbour list

VARIABLE: Vac S (V)
NEIGHBOURES (5 out of 6):[118 118 118] \--/ [120 119]
WEIGHTS: [0.33, 0.5, 1, 0.5, 0.33]
COMPUTED VALUE: 118
--------------------------------------------------------------------------------

OPS: Problematic IDK neighbours discovered 1 (14.29 %) --> 

Date/Time    2019-07-01 07:10:00
Vac R (V)                    117
Vac S (V)                    118
Vac T (V)                    118
Name: 35949, dtype: object

--------------------------------------------------------------------------------
Outlier (idk: 35950) and its neighborhood


Unnamed: 0,Date/Time,Vac R (V),Vac S (V),Vac T (V)
35947,2019-07-01 05:30:00,117,118,118
35948,2019-07-01 05:35:00,117,118,118
35949,2019-07-01 07:10:00,117,118,118
35950,2019-07-01 08:10:00,0,0,0
35951,2019-07-01 08:20:00,118,120,119
35952,2019-07-01 08:30:00,117,119,119
35953,2019-07-01 08:45:00,117,118,119



VARIABLE: Vac R (V)
NEIGHBOURES (6 out of 6):[117 117 117] \--/ [118 117 117]
WEIGHTS: [0.33, 0.5, 1, 1, 0.5, 0.33]
COMPUTED VALUE: 117
--------------------------------------------------------------------------------

VARIABLE: Vac S (V)
NEIGHBOURES (6 out of 6):[118 118 118] \--/ [120 119 118]
WEIGHTS: [0.33, 0.5, 1, 1, 0.5, 0.33]
COMPUTED VALUE: 119
--------------------------------------------------------------------------------

VARIABLE: Vac T (V)
NEIGHBOURES (6 out of 6):[118 118 118] \--/ [119 119 119]
WEIGHTS: [0.33, 0.5, 1, 1, 0.5, 0.33]
COMPUTED VALUE: 118
--------------------------------------------------------------------------------

New data (with the filled value(s))


Date/Time    2019-07-01 08:10:00
Vac R (V)                    117
Vac S (V)                    119
Vac T (V)                    118
Name: 35950, dtype: object

--------------------------------------------------------------------------------
INVERTER NAME: INV3 
 --------------------------------------------------------------------------------
Oh, no extreme outliers found. That's good.

INVERTER NAME: INV4 
 --------------------------------------------------------------------------------
The extreme outliers (4) of Vac


Unnamed: 0,Date/Time,Vac R (V),Vac S (V),Vac T (V)
34976,2019-06-25 06:50:00,0,0,46
36762,2019-07-06 09:20:00,96,25,126
36911,2019-07-07 08:10:00,0,0,0
37109,2019-07-08 11:30:00,0,0,0


Outlier (idk: 34976) and its neighborhood


Unnamed: 0,Date/Time,Vac R (V),Vac S (V),Vac T (V)
34973,2019-06-25 06:20:00,119,117,117
34974,2019-06-25 06:25:00,119,117,117
34975,2019-06-25 06:30:00,119,117,117
34976,2019-06-25 06:50:00,0,0,46
34977,2019-06-25 07:50:00,117,115,115
34978,2019-06-25 07:55:00,118,116,116
34979,2019-06-25 08:30:00,118,117,117



VARIABLE: Vac R (V)
NEIGHBOURES (6 out of 6):[119 119 119] \--/ [117 118 118]
WEIGHTS: [0.33, 0.5, 1, 1, 0.5, 0.33]
COMPUTED VALUE: 118
--------------------------------------------------------------------------------

VARIABLE: Vac S (V)
NEIGHBOURES (6 out of 6):[117 117 117] \--/ [115 116 117]
WEIGHTS: [0.33, 0.5, 1, 1, 0.5, 0.33]
COMPUTED VALUE: 116
--------------------------------------------------------------------------------

VARIABLE: Vac T (V)
NEIGHBOURES (6 out of 6):[117 117 117] \--/ [115 116 117]
WEIGHTS: [0.33, 0.5, 1, 1, 0.5, 0.33]
COMPUTED VALUE: 116
--------------------------------------------------------------------------------

New data (with the filled value(s))


Date/Time    2019-06-25 06:50:00
Vac R (V)                    118
Vac S (V)                    116
Vac T (V)                    116
Name: 34976, dtype: object

--------------------------------------------------------------------------------
Outlier (idk: 36762) and its neighborhood


Unnamed: 0,Date/Time,Vac R (V),Vac S (V),Vac T (V)
36759,2019-07-06 09:05:00,118,117,117
36760,2019-07-06 09:10:00,118,116,116
36761,2019-07-06 09:15:00,119,117,118
36762,2019-07-06 09:20:00,96,25,126
36763,2019-07-06 09:30:00,119,117,117
36764,2019-07-06 09:35:00,119,118,118
36765,2019-07-06 09:45:00,119,117,117



VARIABLE: Vac R (V)
NEIGHBOURES (6 out of 6):[118 118 119] \--/ [119 119 119]
WEIGHTS: [0.33, 0.5, 1, 1, 0.5, 0.33]
COMPUTED VALUE: 119
--------------------------------------------------------------------------------

VARIABLE: Vac S (V)
NEIGHBOURES (6 out of 6):[117 116 117] \--/ [117 118 117]
WEIGHTS: [0.33, 0.5, 1, 1, 0.5, 0.33]
COMPUTED VALUE: 117
--------------------------------------------------------------------------------

VARIABLE: Vac T (V)
NEIGHBOURES (6 out of 6):[117 116 118] \--/ [117 118 117]
WEIGHTS: [0.33, 0.5, 1, 1, 0.5, 0.33]
COMPUTED VALUE: 117
--------------------------------------------------------------------------------

New data (with the filled value(s))


Date/Time    2019-07-06 09:20:00
Vac R (V)                    119
Vac S (V)                    117
Vac T (V)                    117
Name: 36762, dtype: object

--------------------------------------------------------------------------------
Outlier (idk: 36911) and its neighborhood


Unnamed: 0,Date/Time,Vac R (V),Vac S (V),Vac T (V)
36908,2019-07-07 07:05:00,119,118,118
36909,2019-07-07 07:10:00,119,118,118
36910,2019-07-07 07:15:00,120,118,118
36911,2019-07-07 08:10:00,0,0,0
36912,2019-07-07 08:30:00,118,117,117
36913,2019-07-07 09:30:00,118,116,117
36914,2019-07-07 09:35:00,119,118,118



VARIABLE: Vac R (V)
NEIGHBOURES (6 out of 6):[119 119 120] \--/ [118 118 119]
WEIGHTS: [0.33, 0.5, 1, 1, 0.5, 0.33]
COMPUTED VALUE: 119
--------------------------------------------------------------------------------

VARIABLE: Vac S (V)
NEIGHBOURES (6 out of 6):[118 118 118] \--/ [117 116 118]
WEIGHTS: [0.33, 0.5, 1, 1, 0.5, 0.33]
COMPUTED VALUE: 117
--------------------------------------------------------------------------------

VARIABLE: Vac T (V)
NEIGHBOURES (6 out of 6):[118 118 118] \--/ [117 117 118]
WEIGHTS: [0.33, 0.5, 1, 1, 0.5, 0.33]
COMPUTED VALUE: 118
--------------------------------------------------------------------------------

New data (with the filled value(s))


Date/Time    2019-07-07 08:10:00
Vac R (V)                    119
Vac S (V)                    117
Vac T (V)                    118
Name: 36911, dtype: object

--------------------------------------------------------------------------------
Outlier (idk: 37109) and its neighborhood


Unnamed: 0,Date/Time,Vac R (V),Vac S (V),Vac T (V)
37106,2019-07-08 11:10:00,119,117,117
37107,2019-07-08 11:20:00,120,117,117
37108,2019-07-08 11:25:00,119,117,117
37109,2019-07-08 11:30:00,0,0,0
37110,2019-07-08 11:40:00,119,118,117
37111,2019-07-08 11:45:00,119,117,117
37112,2019-07-08 11:50:00,120,118,117



VARIABLE: Vac R (V)
NEIGHBOURES (6 out of 6):[119 120 119] \--/ [119 119 120]
WEIGHTS: [0.33, 0.5, 1, 1, 0.5, 0.33]
COMPUTED VALUE: 119
--------------------------------------------------------------------------------

VARIABLE: Vac S (V)
NEIGHBOURES (6 out of 6):[117 117 117] \--/ [118 117 118]
WEIGHTS: [0.33, 0.5, 1, 1, 0.5, 0.33]
COMPUTED VALUE: 117
--------------------------------------------------------------------------------

VARIABLE: Vac T (V)
NEIGHBOURES (6 out of 6):[117 117 117] \--/ [117 117 117]
WEIGHTS: [0.33, 0.5, 1, 1, 0.5, 0.33]
COMPUTED VALUE: 117
--------------------------------------------------------------------------------

New data (with the filled value(s))


Date/Time    2019-07-08 11:30:00
Vac R (V)                    119
Vac S (V)                    117
Vac T (V)                    117
Name: 37109, dtype: object

--------------------------------------------------------------------------------


In [20]:
# Irradiance values
columns = ["Date/Time", "Irradiance (W/mq)"]
irr = raw_irr_data[columns].dropna().copy()
irr.reset_index(inplace=True, drop=True)

extreme_outliers = find_outliers(irr, verbose=True, threshold = 5)

if len(extreme_outliers) == 0:
    print("Oh, no outliers found. That's good.\n")
else:
    print(f"The extreme outliers (N = {len(extreme_outliers)}) of the variable 'inverter Temp'")
    display(extreme_outliers)
    
    idk_outliers = extreme_outliers.index.tolist()
    list_outliers = idk_outliers.copy()

    for idk, idk_outlier in enumerate(idk_outliers):
        print(f"Outlier {idk +1}/{len(idk_outliers)}: ({irr.loc[idk_outlier, 'Irradiance (W/mq)']} W/mq)")

        # Visualize the outlier and its neighours
        print(f"\nOutlier (idk: {idk_outlier}) and its neighborhood")
        display(irr.iloc[range(idk_outlier - 3, idk_outlier + 4), :])

        # Compute the estimated value and assign the the original dataframe
        computed_value = weighted_knn(irr, idk_outlier, list_outliers, "Irradiance (W/mq)", verbose=True)
        irr.loc[idk_outlier, "Irradiance (W/mq)"] = computed_value
        print(f"COMPUTED VALUE: {computed_value}\n","-" * 80)

        # Visualize the outcome
        print("\nNew data (with the filled value(s))")
        display(irr.iloc[range(idk_outlier - 3, idk_outlier + 4)][columns])

    print("-" * 40, f"END correction","-" * 40)
irr["Irradiance (W/mq)"] = irr["Irradiance (W/mq)"].astype("Int64")
raw_irr_data = irr


KeyError: "None of [Index(['Date/Time', 'Irradiance (W/mq)'], dtype='object')] are in the [columns]"

In [None]:
display(raw_irr_data.describe())

### Variable: *Inverter Temp* 

In [21]:
# TASK: B) Analyse and correct the outliers
# VARIABLES: Inverter Temp
# MOTIVATION: Some invalid values have been detected visually (i.e., numerical distribution graphs)
# e.g., Detect around 2K observations with a value in the range [65.529 - 65.535 °C]
columns = ["Date/Time", "Inverter temp. (°C)"]

# Carry out the analysis (and correction) on all the inverters
for inv_name in inv_names:#[3:4]:
    print("INVERTER NAME:", inv_name, "\n", "-"*80)
    
    # 1A) Detect extreme outliers 
    # Setting threshold: a very high threshold to detect (and correct) only extreme outliers (e.g. above 1000 °C)
    # Threshold has been set according to emperical tests (default outlier threshold is equal to 3)
    extreme_outliers = find_outliers(inv_data[inv_name][columns], verbose=True, threshold = 4)
    if len(extreme_outliers) == 0:
        print("Oh, no outliers found. That's good.\n")
        continue
        
    print(f"The extreme outliers (N = {len(extreme_outliers)}) of the variable 'inverter Temp'")
    display(extreme_outliers)
    
    # 1B) Replacing criterion --> Weighted KNN (k = 6)
    # Identfy the index of the outliers 
    idk_outliers = extreme_outliers.index.tolist()
    list_outliers = idk_outliers.copy()
    
    # Compute and assign the estimated value (weighted average value from its neigbours)
    for idk, idk_outlier in enumerate(idk_outliers):
        if len(idk_outliers) > 300:
            clear_output(wait=True)
            
        print(f"Outlier {idk +1}/{len(idk_outliers)}: ({inv_data[inv_name].loc[idk_outlier, 'Inverter temp. (°C)']} °C)")
        
        # Visualize the outlier and its neighours
        # print(f"\nOutlier (idk: {idk_outlier}) and its neighborhood")
        # display(inv_data[inv_name][columns].iloc[range(idk_outlier - 3, idk_outlier + 4), :])
        
        # Compute the estimated value and assign the the original dataframe
        computed_value = weighted_knn(inv_data[inv_name][columns], idk_outlier, list_outliers, "Inverter temp. (°C)", 
                                      verbose=True)
        inv_data[inv_name].loc[idk_outlier, "Inverter temp. (°C)"] = computed_value
        print(f"COMPUTED VALUE: {computed_value}\n","-" * 80)
                                                                      
        # Visualize the outcome
        # print("\nNew data (with the filled value(s))")
        # display(inv_data[inv_name].iloc[range(idk_outlier - 3, idk_outlier + 4)][columns])
        
    print("-" * 40, f"END {inv_name}","-" * 40)

INVERTER NAME: INV1 
 --------------------------------------------------------------------------------
Z-score values (threshold: 4)
MIN: 58.38 
MAX: 58.38


Unnamed: 0,Inverter temp. (°C)
9457,58.379985
9460,58.381768
9510,58.379985
9515,58.378202
9526,58.381768
9537,58.379985
9551,58.381768
9573,58.381768
9584,58.378202
9586,58.379985


The extreme outliers (N = 43) of the variable 'inverter Temp'


Unnamed: 0,Date/Time,Inverter temp. (°C)
9457,2019-01-03 08:00:00,65533
9460,2019-01-03 08:30:00,65535
9510,2019-01-03 17:20:00,65533
9515,2019-01-03 18:15:00,65531
9526,2019-01-04 06:20:00,65535
9537,2019-01-04 08:15:00,65533
9551,2019-01-04 10:45:00,65535
9573,2019-01-04 14:35:00,65535
9584,2019-01-04 16:35:00,65531
9586,2019-01-04 16:55:00,65533


Outlier 1/43: (65533 °C)

OPS: Problematic IDK neighbours discovered 1 (14.29 %) --> [9460]

End process of increasing number of neighbours (i.e., number of problematic neigbours is acceptable (fewer than K/2)
The problematic neighbours (1) have been removed from the neighbour list

VARIABLE: Inverter temp. (°C)
NEIGHBOURES (5 out of 6):[7 5 5] \--/ [3 5]
WEIGHTS: [0.33, 0.5, 1, 1, 0.5]
COMPUTED VALUE: 5
--------------------------------------------------------------------------------
COMPUTED VALUE: 5
 --------------------------------------------------------------------------------
Outlier 2/43: (65535 °C)

VARIABLE: Inverter temp. (°C)
NEIGHBOURES (6 out of 6):[5 3 5] \--/ [13  7  5]
WEIGHTS: [0.33, 0.5, 1, 1, 0.5, 0.33]
COMPUTED VALUE: 7
--------------------------------------------------------------------------------
COMPUTED VALUE: 7
 --------------------------------------------------------------------------------
Outlier 3/43: (65533 °C)

VARIABLE: Inverter temp. (°C)
NEIGHBOURES (

Unnamed: 0,Date/Time,Inverter temp. (°C)
7333,2018-12-07 07:45:00,65535
7725,2018-12-12 07:10:00,65535
8443,2018-12-21 08:00:00,65535
9153,2018-12-30 07:55:00,65533
9476,2019-01-03 08:40:00,65535
...,...,...
123268,2021-02-16 08:25:00,65535
123701,2021-02-19 07:10:00,65531
127748,2021-03-18 07:00:00,65531
127895,2021-03-19 06:45:00,65535


Outlier 1/142: (65535 °C)

VARIABLE: Inverter temp. (°C)
NEIGHBOURES (6 out of 6):[ 9  9 15] \--/ [5 1 7]
WEIGHTS: [0.33, 0.5, 1, 1, 0.5, 0.33]
COMPUTED VALUE: 8
--------------------------------------------------------------------------------
COMPUTED VALUE: 8
 --------------------------------------------------------------------------------
Outlier 2/142: (65535 °C)

VARIABLE: Inverter temp. (°C)
NEIGHBOURES (6 out of 6):[9 9 7] \--/ [13  7  5]
WEIGHTS: [0.33, 0.5, 1, 1, 0.5, 0.33]
COMPUTED VALUE: 9
--------------------------------------------------------------------------------
COMPUTED VALUE: 9
 --------------------------------------------------------------------------------
Outlier 3/142: (65535 °C)

VARIABLE: Inverter temp. (°C)
NEIGHBOURES (6 out of 6):[11  9 13] \--/ [11  7  9]
WEIGHTS: [0.33, 0.5, 1, 1, 0.5, 0.33]
COMPUTED VALUE: 11
--------------------------------------------------------------------------------
COMPUTED VALUE: 11
 -----------------------------------------------

Unnamed: 0,Date/Time,Inverter temp. (°C)
8079,2018-12-17 08:20:00,65535
8149,2018-12-18 06:50:00,65535
8466,2018-12-22 07:00:00,65535
8940,2018-12-28 07:05:00,65535
9252,2019-01-01 06:10:00,65535
...,...,...
128676,2021-03-17 06:40:00,65535
128824,2021-03-18 06:35:00,65535
128825,2021-03-18 06:40:00,65535
128829,2021-03-18 07:00:00,65533


Outlier 1/106: (65535 °C)

VARIABLE: Inverter temp. (°C)
NEIGHBOURES (6 out of 6):[7 7 9] \--/ [9 7 9]
WEIGHTS: [0.33, 0.5, 1, 1, 0.5, 0.33]
COMPUTED VALUE: 8
--------------------------------------------------------------------------------
COMPUTED VALUE: 8
 --------------------------------------------------------------------------------
Outlier 2/106: (65535 °C)

VARIABLE: Inverter temp. (°C)
NEIGHBOURES (6 out of 6):[9 9 9] \--/ [9 9 9]
WEIGHTS: [0.33, 0.5, 1, 1, 0.5, 0.33]
COMPUTED VALUE: 9
--------------------------------------------------------------------------------
COMPUTED VALUE: 9
 --------------------------------------------------------------------------------
Outlier 3/106: (65535 °C)

VARIABLE: Inverter temp. (°C)
NEIGHBOURES (6 out of 6):[ 3 11 13] \--/ [7 9 7]
WEIGHTS: [0.33, 0.5, 1, 1, 0.5, 0.33]
COMPUTED VALUE: 9
--------------------------------------------------------------------------------
COMPUTED VALUE: 9
 ----------------------------------------------------------

Unnamed: 0,Inverter temp. (°C)
9529,35.374934
9530,35.374934
9531,35.374934
9532,35.374934
9533,35.374934
...,...
118907,35.374934
119049,35.374934
119051,35.374934
119052,35.374934


The extreme outliers (N = 113) of the variable 'inverter Temp'


Unnamed: 0,Date/Time,Inverter temp. (°C)
9529,2019-01-04 06:05:00,65535
9530,2019-01-04 06:20:00,65535
9531,2019-01-04 06:30:00,65535
9532,2019-01-04 06:40:00,65535
9533,2019-01-04 06:50:00,65535
...,...,...
118907,2021-02-16 07:50:00,65535
119049,2021-02-17 07:05:00,65535
119051,2021-02-17 07:20:00,65535
119052,2021-02-17 07:25:00,65535


Outlier 1/113: (65535 °C)

OPS: Problematic IDK neighbours discovered 3 (42.86 %) --> [9530, 9531, 9532]
Trying with more neighbours (12)...

End process of increasing number of neighbours (i.e., number of problematic neigbours is acceptable (fewer than K/2)
The problematic neighbours (3) have been removed from the neighbour list

VARIABLE: Inverter temp. (°C)
NEIGHBOURES (3 out of 6):[3 3 3] \--/ []
WEIGHTS: [0.33, 0.5, 1]
COMPUTED VALUE: 3
--------------------------------------------------------------------------------
COMPUTED VALUE: 3
 --------------------------------------------------------------------------------
Outlier 2/113: (65535 °C)

OPS: Problematic IDK neighbours discovered 3 (42.86 %) --> [9531, 9532, 9533]
Trying with more neighbours (12)...

End process of increasing number of neighbours (i.e., number of problematic neigbours is acceptable (fewer than K/2)
The problematic neighbours (3) have been removed from the neighbour list

VARIABLE: Inverter temp. (°C)
NEIGHBOURE

In [None]:
# TASK: B2) Check validity of the outlier correction for the inverter temperature
validity = []
inv_temp_values = []
for inv_name in inv_names:
    inv_temp = inv_data[inv_name]["Inverter temp. (°C)"]
    inv_temp_values.append(inv_temp.rename(index = inv_name))
    
    # Check invalid values
    threshold_inv_temp = 100
    invalid_values = inv_temp.loc[inv_temp >= threshold_inv_temp]
    
    if len(invalid_values) == 0:
        validity.append(True)
    else: 
        validity.append(False)

display(pd.concat(inv_temp_values, axis=1, names =inv_names).describe().round(decimals=2))
if all(validity):
    print(f"The outlier correction is valid for all the {len(inv_names)} inverters!")
else:
    print("There is some error in the outlier conversion")

# Final task: Save outcomes 

In [None]:
# FINAL TASK: Save the cleared datasets 
saving_folder_name = "Cleaned"
saving_folder_path = path.join(path_file, saving_folder_name)

print("PV System --> ", system_name.upper())

# Create the saving folder
if not path.exists(saving_folder_path):
    makedirs(saving_folder_path)
    print(f"A new saving folder has been created: {saving_folder_path}\n")
    
# Save the files as CSV files 
for inv_name in inv_names:
    file_name = f"cleaned_{inv_name.upper()}_data.csv"
    inv_data[inv_name].to_csv(path.join(saving_folder_path, file_name), index=False)
    print(f"The cleaned data for '{inv_name}' has been saved.")

raw_irr_data.to_csv(path.join(saving_folder_path, "raw_irr_data.csv"), index=False) 
print("\nThe cleaned data for the 'irradiance values' has been saved.")