In [1]:
# Make the parent directory importable (presumably where the project-local `_library` package lives).
# `sys.path` is reached through the module instead of `from sys import path`, so that the name `path`
# is NOT bound here: the imports cell binds `path` to `os.path`, and re-running this cell afterwards
# would otherwise silently replace it with the `sys.path` list.
import sys

if '..' not in sys.path:
    sys.path.insert(0, '..')

In [2]:
import csv
from collections import defaultdict
from os import path, makedirs

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.svm import OneClassSVM

import _library.fault_utils as fault_utils
import _library.uc2_interpolation as interpolation_utils
import _library.utils as utils
from _library.som_utils import compute_metrics
from _library.utils import SYSTEM_NAMES, SUBFOLDERS, load_datasets

In [3]:
# NOTE(review): hard-coded, machine-specific absolute path -- this cell only works where this directory exists.
%cd /mnt/data/vieri/projects/SAMPLE/

/mnt/data/vieri/projects/SAMPLE


# The photovoltaic systems

In [4]:
# Overview of the available PV systems and of the dataset sub-folders.
# Position of each system inside SYSTEM_NAMES:
# --- 0 ---------- 1 ---------- 2 --------- 3 ---------- 4 -------
print(f"{SYSTEM_NAMES} \nSUBFOLDERS: --> {SUBFOLDERS}")

['Binetto 1', 'Binetto 2', 'Soleto 1', 'Soleto 2', 'Galatina'] 
SUBFOLDERS: --> ['Cleaned', '1-hour sampling', '1-hour averaged sampling', 'Residuals', 'Residuals_analytical', None]


## a) Selecting the PV system

In [5]:
# Pick the PV system to analyse by its position inside SYSTEM_NAMES
system_name = SYSTEM_NAMES[2]
print("PV SYSTEM -->", system_name)

PV SYSTEM --> Soleto 1


## b) Selecting the dataset type

In [6]:
# Dataset variant: passed as `subfolder` to `load_datasets` in the loading cell (append "_analytical" to switch variant).
# NOTE: later cells overwrite this name with 'train'/'test' when building the CSV file names.
dataset_name = "Residuals" #+ "_analytical"

## c) Loading the dataset

In [7]:
# Load the data of the selected PV system from the chosen sub-folder.
# Only the first three returned values are kept; any further ones are discarded through `*_`.
system_path, inv_data, inv_names, *_ = load_datasets(system_name, subfolder = dataset_name)

-------------------------------------------------------------------------------- 
				PV SYSTEM --> SOLETO 1 
--------------------------------------------------------------------------------

Loading inverter data...
SOLETO 1: OK, component data loaded (4) --> INV1, INV2, INV3, INV4
-------------------------------------------------------------------------------- 
FINISHED!: All datasets have been loaded. (SYS: 4 - IRR FILE: 0)
--------------------------------------------------------------------------------
-------------------------------------------------------------------------------- 
EXAMPLE --> Soleto 1: INV1 (FROM '2018-08-08' TO '2021-06-30': 1057 days).
--------------------------------------------------------------------------------
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14277 entries, 0 to 14276
Data columns (total 25 columns):
 #   Column                      Non-Null Count  Dtype         
---  ------                      --------------  -----         
 0   Date/Tim

## d) Statistical description

In [8]:
# Columns shown in the statistical summary of each inverter.
# (The 'Maxiumum ...' spelling is intentional: these are the literal column names of the dataset.)
relevant_columns = [
    # Current
    'Cc 1 (A)',
    'Maxiumum Current (A)',
    'Residuals (A)',
    # Voltage
    'Vcc 1 (V)',
    'Maxiumum Voltage (V)',
    'Residuals (V)',
    # Environmental conditions
    'Amb. Temp (°C)',
    'Irradiance (W/mq)',
]

In [9]:
# Statistical summary of the relevant columns, one table per inverter.
for inv_name in inv_names:
    full_rule, half_rule = 120 * "-", 57 * "-"
    print(full_rule + "\n" + half_rule, inv_name, half_rule + "\n" + full_rule)
    df = inv_data[inv_name]

    # Promote the timestamps to index. This is done in place on purpose: the frames stored in
    # `inv_data` are the ones used by the following cells. The guard makes the cell re-runnable.
    if 'Date/Time' in df.columns:
        df.set_index('Date/Time', inplace=True)
        print("The timetamps are now used as index")

    # Show some statistics (e.g., means, minimums, maximums)
    summary = df[relevant_columns].describe()
    display(summary.round(decimals = 2))

------------------------------------------------------------------------------------------------------------------------
--------------------------------------------------------- INV1 ---------------------------------------------------------
------------------------------------------------------------------------------------------------------------------------
The timetamps are now used as index


Unnamed: 0,Cc 1 (A),Maxiumum Current (A),Residuals (A),Vcc 1 (V),Maxiumum Voltage (V),Residuals (V),Amb. Temp (°C),Irradiance (W/mq)
count,14277.0,14277.0,14277.0,14277.0,14277.0,14277.0,14277.0,14277.0
mean,136.45,178.47,-42.02,321.61,463.27,-141.66,19.48,325.17
std,156.95,171.24,69.25,146.74,20.64,153.59,8.1,333.82
min,0.0,0.0,-448.52,12.0,315.94,-671.88,-0.13,4.0
25%,0.0,25.09,-48.63,302.0,451.35,-174.34,13.25,33.0
50%,62.0,94.03,-23.49,387.0,474.08,-67.91,18.6,173.0
75%,259.0,347.55,-17.86,414.0,478.24,-41.19,25.87,614.0
max,545.0,504.07,313.3,580.0,1072.88,116.01,41.35,1336.0


------------------------------------------------------------------------------------------------------------------------
--------------------------------------------------------- INV2 ---------------------------------------------------------
------------------------------------------------------------------------------------------------------------------------
The timetamps are now used as index


Unnamed: 0,Cc 1 (A),Maxiumum Current (A),Residuals (A),Vcc 1 (V),Maxiumum Voltage (V),Residuals (V),Amb. Temp (°C),Irradiance (W/mq)
count,14175.0,14175.0,14175.0,14175.0,14175.0,14175.0,14175.0,14175.0
mean,140.94,179.27,-38.33,331.96,463.16,-131.2,19.56,326.51
std,160.3,171.45,68.94,149.76,20.7,156.46,8.08,334.13
min,0.0,0.0,-452.52,12.0,315.94,-664.88,-0.13,4.0
25%,0.0,25.16,-41.75,322.0,451.23,-154.03,13.34,33.0
50%,65.0,95.21,-22.7,404.0,473.89,-52.28,18.71,175.0
75%,268.0,348.28,-14.03,422.0,478.24,-32.98,25.94,618.0
max,556.0,504.07,320.3,566.0,1072.88,105.49,41.35,1336.0


------------------------------------------------------------------------------------------------------------------------
--------------------------------------------------------- INV3 ---------------------------------------------------------
------------------------------------------------------------------------------------------------------------------------
The timetamps are now used as index


Unnamed: 0,Cc 1 (A),Maxiumum Current (A),Residuals (A),Vcc 1 (V),Maxiumum Voltage (V),Residuals (V),Amb. Temp (°C),Irradiance (W/mq)
count,14257.0,14257.0,14257.0,14257.0,14257.0,14257.0,14257.0,14257.0
mean,146.93,179.01,-32.08,358.16,463.22,-105.05,19.5,326.01
std,166.79,171.27,70.7,159.06,20.65,164.62,8.08,333.67
min,0.0,0.0,-485.62,20.0,315.94,-614.88,-0.13,4.0
25%,0.0,25.15,-31.04,366.0,451.32,-110.29,13.29,33.0
50%,69.0,94.98,-21.84,432.0,473.99,-21.13,18.62,175.0
75%,280.0,347.99,-7.33,456.0,478.24,-8.38,25.89,616.0
max,564.0,504.07,340.3,552.0,1072.88,129.06,41.35,1336.0


------------------------------------------------------------------------------------------------------------------------
--------------------------------------------------------- INV4 ---------------------------------------------------------
------------------------------------------------------------------------------------------------------------------------
The timetamps are now used as index


Unnamed: 0,Cc 1 (A),Maxiumum Current (A),Residuals (A),Vcc 1 (V),Maxiumum Voltage (V),Residuals (V),Amb. Temp (°C),Irradiance (W/mq)
count,14212.0,14212.0,14212.0,14212.0,14212.0,14212.0,14212.0,14212.0
mean,147.35,179.82,-32.48,368.15,463.12,-94.97,19.52,327.62
std,166.5,171.69,70.12,163.09,20.67,168.21,8.09,334.58
min,0.0,0.0,-488.38,18.0,315.94,-585.88,-0.13,4.0
25%,0.0,25.09,-30.37,382.0,451.18,-89.95,13.3,33.0
50%,70.0,95.75,-21.79,440.0,473.85,-10.81,18.65,176.0
75%,284.0,350.04,-7.57,472.0,478.24,3.22,25.93,621.0
max,547.0,504.07,340.3,561.0,1072.88,127.43,41.35,1336.0


In [10]:
#inv_names
#inv_data['INV3'].loc["2019-11-08", :]

# Train/test split (i.e., nominal and abnormal observations)

## 0) Retrieve *failure events* 
Possible priority values for the *alarm logs*: 
- "High"
- "Medium"
- "Low"

NOTE: The *(general) faults* will always be included.

In [11]:
# Priority levels of the alarm logs to consider; passed to `fault_utils.load_faults` when loading the failure events.
fault_priorities = ["High"] #"Medium"

### 0.1) Select which data to load
- **Alarm** logs (True/False)
- **String Box alarm** logs (True/False)
- **Include anonymous faults**: (True/False)

In [12]:
# Switches selecting which failure sources are forwarded to `fault_utils.load_faults`
to_load = dict(
    faults = True,
    inv_alarms = False,
    stringBox_alarms = True,
)

# Also forwarded to `fault_utils.load_faults`
include_faults_notRelatedToInverters = True

### 0.2) Identify failure events

In [13]:
# Load the fault dataset: Storico guasti.xlsx (a.k.a., 'General faults') & PV SYSTEM - Storico Allarme.xlsx (a.k.a., Log - X)
fault_df = fault_utils.load_faults(
    system_name,
    include_faults_notRelatedToInverters,
    to_load['inv_alarms'],
    fault_priorities,
    to_load['stringBox_alarms'],
    to_load['faults'],
    verbose = False,
)

# Keep only the events started on/after the earliest first timestamp among the inverters.
# (This relies on the timestamps being the index of each inverter frame.)
start_dates = [inv_data[inv_name].index[0] for inv_name in inv_names]
first_start_date = sorted(start_dates)[0]
fault_df = fault_df[fault_df["Inizio"] >= first_start_date]

wide_rule = "-" * 120
priority_labels = ', '.join(fault_df['Tipo'].unique())
print("\n" + wide_rule + f"\n\t\t\t\t\t\tFAUL EVENTS (period >= {first_start_date.strftime('%Y-%m-%d')})")
print("\t\t\t\t\tPRIORITIES:", priority_labels + "\n" + wide_rule)
display(fault_df)
print(f"TOTAL: {len(fault_df)} failure events")

# --------------- Show the (general) fault events on their own -----------------------------------
fault_to_vis = "General Fault"
fault_type_cond = fault_df["Tipo"] == fault_to_vis
narrow_rule = "-" * 105
print("\n" + narrow_rule + f"\n\t\t\t\t\tFAULT EVENTS ('{fault_to_vis}')\n" + narrow_rule)
only_fault_df = fault_df[fault_type_cond]
if len(only_fault_df) == 0:
    print(f"\n[{system_name}] No faults available for this PV system.\n")
else:
    display(only_fault_df)

----------------------------------------------------------------------------------------------------
					FAULTS: Soleto 1
				PRIORITIES: High, Medium & faults
----------------------------------------------------------------------------------------------------
0) 1 fault(s) called 'scheda di comunicazione' have/has been discarted. As it's not related to inverter operation.

0) SELECTING only the string box-related faults

--> A) General faults loaded (1)
--> [B) Inverter logs have been skipped (0)]
--> C) String-box logs loaded (INV1: 31230, INV2: 3055, INV3: 5075, INV4: 1615)

Loading completed!

FAUL CAUSES (4):
--------------------
1) Allarme string-box
2) Ritardo comunicazione dispositivo
3) String-box con produzione anomala
4) Unknown

------------------------------------------------------------------------------------------------------------------------
						FAUL EVENTS (period >= 2018-08-08)
					PRIORITIES: Log_stringBox - Medium, Log_stringBox - High, General Fault
---------

Unnamed: 0,Inverter,Componente Guasto,Causa Guasto,Inizio,Fine,Tipo
3,3,CSP3.6 V130086: String-box con produzione anomala,String-box con produzione anomala,2018-08-08 11:51:00,2018-10-18 14:50:00,Log_stringBox - Medium
4,1,CSP1.6 V180544 s1: [3] Corrente di stringa fuo...,Allarme string-box,2018-08-08 13:26:00,2018-08-08 14:52:00,Log_stringBox - Medium
5,2,CSP2.6 V180556 s9: [3] Corrente di stringa fuo...,Allarme string-box,2018-08-08 14:13:00,2018-08-08 14:21:00,Log_stringBox - Medium
6,2,CSP2.6 V180556 s9: [3] Corrente di stringa fuo...,Allarme string-box,2018-08-08 14:33:00,2018-08-08 15:26:00,Log_stringBox - Medium
7,1,CSP1.6 V180544 s10: [3] Corrente di stringa fu...,Allarme string-box,2018-08-08 14:41:00,2018-08-08 14:52:00,Log_stringBox - Medium
...,...,...,...,...,...,...
39915,3,CSP3.6 V130086 s6: [3] Corrente di stringa fuo...,Allarme string-box,2021-09-16 14:32:00,2021-09-16 14:44:00,Log_stringBox - Medium
39916,3,CSP3.6 V130086 s6: [3] Corrente di stringa fuo...,Allarme string-box,2021-09-16 15:34:00,2021-09-16 15:40:00,Log_stringBox - Medium
39917,3,CSP3.6 V130086 s6: [3] Corrente di stringa fuo...,Allarme string-box,2021-09-16 15:45:00,2021-09-16 15:51:00,Log_stringBox - Medium
39918,4,CSP4.6 V180543 s11: [3] Corrente di stringa fu...,Allarme string-box,2021-09-17 08:46:00,2021-09-17 08:52:00,Log_stringBox - Medium


TOTAL: 39917 failure events

---------------------------------------------------------------------------------------------------------
					FAULT EVENTS ('General Fault')
---------------------------------------------------------------------------------------------------------


Unnamed: 0,Inverter,Componente Guasto,Causa Guasto,Inizio,Fine,Tipo
11291,,datexel - scheda PV isolation di un quadro di ...,Unknown,2019-11-08 08:39:00,2019-11-08 12:30:00,General Fault


## 1) Train and test split
1) **'random'**: randomly splitting (by a percentage)

2) **'cutting_date'**: splitting according to a cutting date (computed by a percentage)

3) **'failure_events'**: splitting according to periods concerning failure events

In [14]:
# Splitting strategy selector: the cells below compare it against 'random', 'cutting_date' and 'failure_events'.
train_test_split_stategy = 'random' 

In [15]:
# Per-inverter containers (inverter name -> DataFrame), filled in by the selected splitting strategy
train_data, test_data = {}, {}

## STRAT 1: Split train/test randomly

In [16]:
# Share of the observations of each inverter assigned to the test set by the random split below
test_size =  0.4

In [17]:
# STRATEGY 1: random hold-out split, carried out independently for each inverter.
if train_test_split_stategy == 'random':
    # Percentages shown in the report (they do not depend on the inverter)
    train_pct = int((1- test_size)*100)
    test_pct = int(test_size * 100)

    for inv_name in inv_names:
        print("\n"+14*"-", inv_name, 14*"-")
        df = inv_data[inv_name]

        # A fixed `random_state` keeps the split reproducible across runs
        train_df, test_df = train_test_split(df, test_size = test_size, random_state = 101)
        train_data[inv_name], test_data[inv_name] = train_df, test_df

        print(f"TRAIN SET ({train_pct} %): {len(train_df)} obs. \n "
              f"TEST SET ({test_pct} %): {len(test_df)} obs.")


-------------- INV1 --------------
TRAIN SET (60 %): 8566 obs. 
 TEST SET (40 %): 5711 obs.

-------------- INV2 --------------
TRAIN SET (60 %): 8505 obs. 
 TEST SET (40 %): 5670 obs.

-------------- INV3 --------------
TRAIN SET (60 %): 8554 obs. 
 TEST SET (40 %): 5703 obs.

-------------- INV4 --------------
TRAIN SET (60 %): 8527 obs. 
 TEST SET (40 %): 5685 obs.


## STRAT 2: Simple train/test split (based on the number of days available) 

**Selected when**: *train_test_split_stategy* = **'cutting_date'**

- **TRAIN**: [starting day : cutoff_date]

- **TEST**: [cutoff_date + 1 : ending date]

In [18]:
# Share of the available days assigned to the test set by the cut-off date split below
test_size =  0.4

In [19]:
# STRATEGY 2: chronological split -- the last `test_size` share of the covered days becomes the test set.
if train_test_split_stategy != 'cutting_date':
    print("This splitting stategy has not been selected.")
else:
    for inv_name in inv_names:
        print("\n"+30*"-", inv_name, 30*"-")
        df = inv_data[inv_name]

        # Period covered by this inverter (first and last timestamp of the index)
        starting_date = pd.to_datetime(df.index[0])
        ending_date = pd.to_datetime(df.index[-1])
        days_available = (ending_date - starting_date).days

        # The cut-off date is obtained by going back `num_test_days` days from the last timestamp
        num_test_days = int(round(days_available * test_size, 0))
        cutoff_date = ending_date - pd.Timedelta(num_test_days, unit="days")

        # TRAIN: up to the cut-off date (included) || TEST: after the cut-off date
        train_data[inv_name] = df[df.index <= cutoff_date]
        test_data[inv_name] = df[df.index > cutoff_date]

        covered_period = (starting_date.strftime('%Y-%m'), ending_date.strftime('%Y-%m'))
        num_train_days = days_available - num_test_days
        print(f"DAYS AVAILABLE: {days_available} {covered_period}"
              f"\n--> TRAIN: {num_train_days} days || TEST ({int(test_size*100)} %): {num_test_days} days "
              f"\n--> Cut-off date: {cutoff_date.strftime('%Y-%m-%d')}")

This splitting stategy has not been selected.


## STRAT 3: Split the data according to the faults/alarm logs
**Selected when**: *train_test_split_stategy* = **'failure_events'**
- **TEST**: [Faults/alarm logs]
- **TRAIN**: [not test observations]

In [20]:
# Parameters of the failure-event split below, forwarded to `fault_utils.train_test_split`:
# - priorities to consider (`priorities_to_consider`)
# - value passed as `time_window`
test_fault_priorities = ["High"] #"Medium"
days_to_include = 0

In [21]:
# Verbosity flag (note: the same name is re-assigned by several later cells)
verbose = False

In [22]:
# When True, the failure-event split below further divides the test observations into validation and test sets
generate_validation = False

In [23]:
# STRATEGY 3: the observations concerning failure events form the test set (optionally divided into
# validation/test sets), whereas all the remaining observations form the train set.
# NOTE: `utils` is the `_library.utils` module (see the imports cell); it is needed only when
# `generate_validation` is True. NOTE(review): confirm that `split_test_validation_sets` lives there.
valid_data = dict()
if train_test_split_stategy == 'failure_events':
    # One tuple per inverter: (all obs., train obs., test obs.[, validation obs.])
    dimensions = []
    
    for inv_name in inv_names:
        print("\n" + 25 * "-", inv_name, 25*"-")
        df = inv_data[inv_name]

        # Carry out the split according to the faults available
        # (the `verbose` flag of the configuration cell is forwarded instead of a hard-coded False)
        train_idk, test_idk = fault_utils.train_test_split(fault_df, df, inv_name,
                                                           priorities_to_consider = test_fault_priorities,
                                                           time_window = days_to_include, verbose = verbose)

        # Save the train/test dataframe for each inverter
        train_data[inv_name] = df.loc[train_idk, :]
        test_data[inv_name] = df.loc[test_idk, :]
        
        print(f"TRAIN SET SIZE: {len(train_data[inv_name])} hourly observations "\
              f"({round((len(train_data[inv_name])/len(df))*100, 2)} %)")
        
        # Generate the test/validation split 
        if generate_validation:
            valid_dimension = 0.5
            val_set, test_set, tot_periods = utils.split_test_validation_sets(test_data[inv_name], valid_dimension, 
                                                                              verbose = True)
            # Set the validation/test sets
            valid_data[inv_name] = val_set
            test_data[inv_name] = test_set
            # The optional leading blank only aligns the figures with the (longer) train-set line
            print(f"VALID SET SIZE: {' ' if len(train_idk) >= 10000 else ''}"\
                  f"{len(valid_data[inv_name])} hourly observations ({round((len(valid_data[inv_name])/len(df))*100 , 2)} %)")
            
            # Save the dimensions of the split for each inverter
            dimensions.append((len(df), len(train_data[inv_name]), len(test_data[inv_name]), len(valid_data[inv_name])))
        else:
            # Save the dimensions of the split for each inverter
            dimensions.append((len(df), len(train_data[inv_name]), len(test_data[inv_name])))
            
        print(f" TEST SET SIZE: {' 'if len(train_idk) >= 10000 else ''}{len(test_data[inv_name])} hourly observations "\
              f"({round((len(test_data[inv_name])/len(df))*100, 2)} %)")
        
    # Visualize the average set dimensions (as a share of all the observations of each inverter)
    avg_train_dim = np.mean([dim[1]/dim[0] for dim in dimensions])
    avg_test_dim = np.mean([dim[2]/dim[0] for dim in dimensions])
    if generate_validation:
        avg_val_dim = np.mean([dim[3]/dim[0] for dim in dimensions])
        
    print("\n\t\t" + "-" * 26 + f"\n\t\t TRAIN SET (AVG): {round(avg_train_dim * 100, 1)} % \n\t\t "
          + (f"VALID SET (AVG): {round(avg_val_dim * 100, 1)} % \n\t\t  " if generate_validation else ' ')
          + f"TEST SET (AVG): {round(avg_test_dim * 100, 1)} %" + "\n\t\t"+ "-" * 26)
else:
    print("This cutting-edge splitting stategy has not been selected. Sorry :/")

This cutting-edge splitting stategy has not been selected. Sorry :/


### a) Retrieve the observations concerning fault events

#### a.0) Load the pre-computed failure events for each inverter

In [24]:
# True  --> the failure events are read from previously saved CSV files
# False --> the failure events are computed from scratch (and then saved as CSV files)
load_failure_events = False

In [25]:
# Folder holding the failure-event CSV files: it is located two levels above `system_path`
saving_folder_name = "UC2 - oneClassSVM"
saving_folder_path = path.join(system_path, "..", "..", saving_folder_name)

In [26]:
# Suffixes of the CSV file names: [0] is used for the `fault_events` item, [1] for the `unique_faults` item
file_names = ['fault_events', 'unique_faults']

In [27]:
# Failure events of the TRAIN data, keyed by inverter name.
# A `list` factory is required: without a factory, a `defaultdict` behaves like a plain dict and the
# `train_inv_failure_events[inv_name].append(...)` of the loading cell raises a KeyError.
train_inv_failure_events = defaultdict(list)

In [28]:
# Optionally re-load the failure events of the TRAIN data from the CSV files saved by a previous run
# (one file per inverter and per item of `file_names`).
if load_failure_events:
    dataset_name = 'train'
    
    for file_name in file_names:
        for inv_name in inv_names:
            print(f'[{dataset_name}] {inv_name}: {file_name}')
                
            # File path
            loading_path = path.join(saving_folder_path, f'{inv_name}_{dataset_name}_{file_name}.csv')
            
            # Load data
            # NOTE(review): `dtype = pd.array` is kept from the original code, but `pd.array` is a function
            # rather than a dtype -- confirm that `read_csv` accepts it (otherwise use `dtype = object`).
            loaded_data = pd.read_csv(loading_path, header=None, index_col=0, dtype = pd.array).squeeze().to_dict()
            # `to_dict()` returns a plain dict, which has no `.info()` method: report its size instead
            print(f"--> {len(loaded_data)} entries loaded")
            
            # Save the data: as `file_names` is iterated in the outer loop, each inverter ends up with
            # the list [fault_events, unique_faults]. `setdefault` creates the list on first use.
            train_inv_failure_events.setdefault(inv_name, []).append(loaded_data)
            
        print('-' * 20)
else:
    print("Pre-loading: False")

Pre-loading: False


#### a.1) Retrieve the failure events

In [29]:
# When True, the next cell also prints the details of every failure event found in the train data
verbose = False

In [30]:
# Timestamps of the train observations concerning failure events, keyed by inverter name.
# The dict is created ONCE, before the loop: re-creating it at every iteration keeps only the last
# inverter (KeyError in the following cells) and leaves it undefined when the events are pre-loaded.
timestamp_faults = dict()

for inv_name in inv_names:
    if not load_failure_events:
        # Find the train observations recorded during failure events
        fault_events, unique_faults = fault_utils.find_fault_observation(fault_df, train_data[inv_name], inv_name, 
                                                                        include_faults_notRelatedToInverters = False, 
                                                                        verbose = False)

        ts_to_remove = sorted(fault_events.keys())
        # The type is the first field of the first event recorded at each timestamp
        fault_types = set(events[0][0] for events in fault_events.values())
        print(f"{inv_name.upper()} (TRAIN DATA): found {len(unique_faults)} unique fault events ({len(ts_to_remove)} obs.)",
              f"\n\t--> Faults/allarms types ({str(len(fault_types))}): {(', ').join(fault_types)}\n" 
              if not verbose and len(fault_types) > 0 else "--> Nice, that's perfect!")
        timestamp_faults[inv_name] = ts_to_remove

        if verbose:
            print("\t"+ 80 * "-")
            # A dedicated loop variable is used: re-using the name `fault_events` here would rebind it,
            # so that the wrong object would be stored in `train_inv_failure_events` below.
            for ts, events in fault_events.items():
                print("\t", pd.to_datetime(ts).strftime('%Y-%m-%d (%H:%M)'))
                for fault_event in events:
                    print("\t --> " + fault_event[0].upper() + " ("+fault_event[1] +") "+ fault_event[2][:80] +  
                          "\n\t\t\t [" +  fault_event[3][0].strftime('%Y-%m-%d (%H:%M)') + " -  " + 
                          fault_event[3][1].strftime('%Y-%m-%d (%H:%M)') + "]\n")

        train_inv_failure_events[inv_name] = (fault_events, unique_faults)
    else:
        # Pre-loaded events: the keys of the first item are the timestamps of the failure events
        failure_events_ts = list(train_inv_failure_events[inv_name][0].keys())
        timestamp_faults[inv_name] = failure_events_ts
        
        print(f"{inv_name} {len(failure_events_ts)} Failure events have been loaded")

KeyboardInterrupt: 

#### a.2) Drop faulty observations from the train data 

In [None]:
# When True, the observations concerning failure events are removed from the train data by the next cell
drop_obs = False

In [None]:
# Optionally remove from the TRAIN data the observations concerning failure events.
if drop_obs:
    for inv_name in inv_names:
        idk_to_drop = timestamp_faults[inv_name]

        if len(idk_to_drop) > 0:
            # Size BEFORE dropping: the share reported below must refer to the original train set,
            # not to the already-reduced one.
            num_train_obs = len(train_data[inv_name])
            train_data[inv_name] = train_data[inv_name].drop(index = idk_to_drop)
            print(f"{inv_name}: {len(idk_to_drop)} timestamps ({round(len(idk_to_drop)/num_train_obs*100, 2)} %) "\
                  f"concerning fault events have been removed from the TRAIN DATA.")
        else:
            print(f"{inv_name}: Nice, everything is okay :) --> No fault events have been found in the TRAIN data.")
else:
    print("This step will be skipped")

#### a.3) Save the failure events (for the train set) as CSV

In [None]:
# Make sure that the destination folder of the CSV files exists (the message is shown only upon creation)
saving_folder_missing = not path.exists(saving_folder_path)
if saving_folder_missing:
    makedirs(saving_folder_path)
    print(f"PV System --> {system_name.upper()}\nA new saving folder has been created: {saving_folder_path}\n")

In [None]:
# Save the failure events of the TRAIN data (two CSV files per inverter), unless they have been pre-loaded.
if not load_failure_events:
    for inv_name in inv_names: 
        print('-' * 20, inv_name, '-' * 20)
        
        dataset_name = 'train'
        
        # Paths 
        file_path_a = path.join(saving_folder_path, f'{inv_name}_{dataset_name}_{file_names[0]}.csv')
        file_path_b = path.join(saving_folder_path, f'{inv_name}_{dataset_name}_{file_names[1]}.csv')

        # Failure items (retrieved BEFORE opening the files, so that nothing is truncated on a missing key)
        fault_events, unique_faults = train_inv_failure_events[inv_name]
        
        # Item 1: one row per timestamp. The `with` block closes (and flushes) the file, whereas
        # `newline=""` is the mode recommended by the `csv` module for the files given to a writer.
        with open(file_path_a, "w", newline="") as fault_events_file:
            fault_events_writer = csv.writer(fault_events_file)
            for key, val in fault_events.items():
                fault_events_writer.writerow([key, val])
        print(f"a) [{dataset_name.upper()} DATA] Fault events have been saved")
            
        # Item 2: one row per unique failure event
        with open(file_path_b, "w", newline="") as unique_faults_file:
            unique_faults_writer = csv.writer(unique_faults_file)
            for failure_event in unique_faults:
                unique_faults_writer.writerow(failure_event)
        print(f"b) [{dataset_name.upper()} DATA] All the unique fault events have been saved\n")
else:
    # This branch runs when the events have been pre-loaded (the former message said the opposite)
    print("Pre-loading: True --> the failure events come from file, nothing to save")

### b) [Test data] Check *whether* and *how many* fault events are included in the data

#### b.0) Load the pre-computed failure events for each inverter

In [None]:
# Failure events of the TEST data, keyed by inverter name.
# A `list` factory is required: without a factory, a `defaultdict` behaves like a plain dict and
# appending to the entry of a new inverter raises a KeyError.
test_inv_failure_events = defaultdict(list)

In [None]:
# Optionally re-load the failure events of the TEST data from the CSV files saved by a previous run
# (one file per inverter and per item of `file_names`).
if load_failure_events:
    dataset_name = 'test'
    
    for file_name in file_names:
        for inv_name in inv_names:
            print(f'[{dataset_name}] {inv_name}: {file_name}')
                
            # File path
            loading_path = path.join(saving_folder_path, f'{inv_name}_{dataset_name}_{file_name}.csv')
            
            # Load data
            # NOTE(review): `dtype = pd.array` is kept from the original code, but `pd.array` is a function
            # rather than a dtype -- confirm that `read_csv` accepts it (otherwise use `dtype = object`).
            loaded_data = pd.read_csv(loading_path, header=None, index_col=0, dtype = pd.array).squeeze().to_dict()
            # `to_dict()` returns a plain dict, which has no `.info()` method: report its size instead
            print(f"--> {len(loaded_data)} entries loaded")
            
            # Save the data into the TEST container (the former code appended to the TRAIN one).
            # `setdefault` creates the per-inverter list on first use.
            test_inv_failure_events.setdefault(inv_name, []).append(loaded_data)
            
        print('-' * 20)
else:
    print("Pre-loading: False")

#### b.1) Compute the failure events (for the test set)

In [None]:
# When True, the next cell also prints the details of every unique failure event found in the test data
verbose = False

In [None]:
# Failure events found in the TEST data, keyed by inverter name.
# Defined before the branch, so that it exists also when the events are pre-loaded.
test_fault_events = dict()

if not load_failure_events:
    print("-"*40, "TEST DATA", "-"*40)
    test_inv_failure_events = dict()
    for inv_name in inv_names:
        # NOTE(review): unlike the train cell, `include_faults_notRelatedToInverters` is not passed here,
        # hence the default of `find_fault_observation` applies -- confirm that this is intended.
        fault_events, unique_faults = fault_utils.find_fault_observation(fault_df, test_data[inv_name], inv_name, verbose=False)
        test_fault_events[inv_name] = fault_events
        test_inv_failure_events[inv_name] = (fault_events, unique_faults)

        # At least one timestamp is enough (with the former `> 1`, a single faulty timestamp was
        # reported as "No fault events have been found")
        if len(test_fault_events[inv_name]) > 0:
            print(f"{inv_name} --> OK: {len(test_fault_events[inv_name])} timestamps "\
                  f"({round(len(test_fault_events[inv_name])/len(test_data[inv_name])*100, 2)} %) concerning "\
                  f"{len(unique_faults)} fault events have been found in the test set.")

            if verbose:
                print("\t"+ 85 * "-")
                for idk, fault_event in enumerate(unique_faults):
                    print(f"\t--> ({idk +1 }/{len(unique_faults)}) " + fault_event[0].upper() + " ("+fault_event[1] +") "+ 
                          fault_event[2][:65] +  "...\n\t\t\t [" +  pd.to_datetime(fault_event[3][0]).strftime('%Y-%m-%d (%H:%M)') 
                          + " -  " + pd.to_datetime(fault_event[3][1]).strftime('%Y-%m-%d (%H:%M)') + "]\n")            
        else:
            print(f"{inv_name} (ISSUE): No fault events have been found in the test data "\
                  f"(TOTAL: {len(test_data[inv_name])} obs.)")
else:
    # Pre-loaded events: process EVERY inverter (the former code relied on the `inv_name` value
    # left over by a previous loop, i.e., it handled a single inverter only)
    for inv_name in inv_names:
        failure_events_ts = test_inv_failure_events[inv_name][0].keys()
        test_fault_events[inv_name] = failure_events_ts

#### b.2) Save the failure events (for the test set)

In [None]:
# Save the failure events of the TEST data (two CSV files per inverter), unless they have been pre-loaded.
if not load_failure_events:
    for inv_name in inv_names: 
        print('-' * 20, inv_name, '-' * 20)
        
        dataset_name = 'test'
        
        # Paths 
        file_path_a = path.join(saving_folder_path, f'{inv_name}_{dataset_name}_{file_names[0]}.csv')
        file_path_b = path.join(saving_folder_path, f'{inv_name}_{dataset_name}_{file_names[1]}.csv')

        # Failure items (retrieved BEFORE opening the files, so that nothing is truncated on a missing key)
        fault_events, unique_faults = test_inv_failure_events[inv_name]
        
        # Item 1: one row per timestamp. The `with` block closes (and flushes) the file, whereas
        # `newline=""` is the mode recommended by the `csv` module for the files given to a writer.
        with open(file_path_a, "w", newline="") as fault_events_file:
            fault_events_writer = csv.writer(fault_events_file)
            for key, val in fault_events.items():
                fault_events_writer.writerow([key, val])
        print(f"a) [{dataset_name.upper()} DATA] Fault events have been saved")
            
        # Item 2: one row per unique failure event
        with open(file_path_b, "w", newline="") as unique_faults_file:
            unique_faults_writer = csv.writer(unique_faults_file)
            for failure_event in unique_faults:
                unique_faults_writer.writerow(failure_event)
        print(f"b) [{dataset_name.upper()} DATA] All the unique fault events have been saved\n")
else:
    # This branch runs when the events have been pre-loaded (the former message said the opposite)
    print("Pre-loading: True --> the failure events come from file, nothing to save")

# One-Class SVM

## Input features

In [None]:
# Candidate feature sets for the one-class SVM (the position in the list identifies the case)
input_cases = [
    # Case 0: both residuals
    ['Residuals (A)', 'Residuals (V)'],
    # Case 1: current residuals only
    ['Residuals (A)'],
    # Case 2: measurements, residuals and environmental conditions
    [
        'Cc 1 (A)',
        'Residuals (A)',
        'Vcc 1 (V)',
        'Residuals (V)',
        'Amb. Temp (°C)',
        'Cell Temp (°C)',
        'Irradiance (W/mq)',
    ],
]

# Feature set actually used
input_columns = input_cases[0]

In [None]:
#inv_name = 'INV3'
#test_data[inv_name]

In [None]:
# When True, the One-Class SVM cell also displays the days (test set) assigned to each predicted class
verbose = False

In [None]:
# ---------------------------------------------------------------------------------------
# SVM configuration: identical for every inverter, hence defined once (outside the loop)
# ---------------------------------------------------------------------------------------
# -------------------- 0 ------ 1 ----- 2 ------ 3 ---
possible_kernels = ['linear', 'poly', 'rbf', 'sigmoid']
svm_kernel = possible_kernels[3]
poly_degree = 3


def _predict_and_report(model, features, title, pad, obs_label):
    """Predict the one-class labels of `features` and print a per-class summary.

    Parameters
    ----------
    model : fitted estimator whose `predict` returns -1 / +1 labels
    features : input observations (only the feature columns)
    title : text shown in the banner that opens the report
    pad : number of dashes printed on each side of the banner title
    obs_label : wording used in the progress message (e.g. "train observations")

    Returns
    -------
    (predicted, idk_class_a, idk_class_b) : the predicted labels, followed by the
    positions (lists of integers) of the observations labelled -1 and +1 respectively.
    """
    print("-" * pad, title, "-" * pad)
    print(f"--> Predicting the class on the residuals of {len(features)} {obs_label}...")
    predicted = model.predict(features)
    idk_class_a = [value[0] for value in np.argwhere(predicted == -1)]
    idk_class_b = [value[0] for value in np.argwhere(predicted == 1)]
    print(f"--> PREDICTED CLASSES for {len(predicted)} observations.")
    print(f"\t--> CLASS '-1': {len(idk_class_a)} obs. "\
          f"({round((len(idk_class_a)/len(predicted))*100, 2)} %)")
    print(f"\t--> CLASS '+1': {len(idk_class_b)} obs. "\
          f"({round((len(idk_class_b)/len(predicted))*100, 2)} %)\n")
    return predicted, idk_class_a, idk_class_b


for inv_name in inv_names:
    print(120 * "-" +"\n" + 57 * "-", inv_name, 57 * "-" + "\n" + 120 * "-")

    # Dataset
    df = inv_data[inv_name]
    train_obs = train_data[inv_name]
    test_obs = test_data[inv_name]

    # 0) Input for the SVM
    input_train_data = train_obs[input_columns]
    input_test_data = test_obs[input_columns]
    print("\n" + "-" * 40, "FITTING THE (ONE-CLASS) SVM", "-" * 40)
    print(f"FEATURES ({len(input_columns)}):", ' || '.join(input_columns), "\n")
    print(f"TRAIN DATA: {len(input_train_data)} observations "\
          f"({round((len(input_train_data)/len(df))*100, 2)} %)")
    print(f"TEST  DATA: {len(input_test_data)} observations "\
          f"({round((len(input_test_data)/len(df))*100, 2)} %)\n")

    # 1) One-class Support Vector Machine (SVM)
    print(f"--> Fitting the 'one-class SVM' on the residuals of {len(input_train_data)} train observations...")

    # 1.1) Input Data
    input_svm_data = input_train_data

    # 1.2) Kernel
    print("--> KERNEL:", svm_kernel)
    if svm_kernel == possible_kernels[1]:
        print("--> Polynomial degree:", poly_degree)

    # 1.3) Choose the NU parameter: it controls the sensitivity of the support vectors
    # --> it should be tuned to the approximate ratio of outliers in the data (e.g. 0.01%)
    num_failure_events = len(timestamp_faults[inv_name])
    support_vectors_sensitivity = num_failure_events / len(input_svm_data)
    if support_vectors_sensitivity == 0:
        # 'nu' must be strictly positive: fall back to a tiny value when no failure is known
        support_vectors_sensitivity = 1 * (10 ** -5)
    # 'nu' must not exceed 1: guard against more failure observations than train observations
    support_vectors_sensitivity = min(support_vectors_sensitivity, 1.0)
    print(f"--> Support vector sensitivity: {np.round(support_vectors_sensitivity, 5)} "\
          f"({num_failure_events} obs concerning failure events)\n")

    # 1.4) Define and train the One-Class SVM
    svm = OneClassSVM(kernel = svm_kernel, degree = poly_degree, nu = support_vectors_sensitivity)
    svm.fit(input_svm_data)

    # 2.a) [TRAIN] Predict the classes
    predicted_train_classes, idk_train_class_a, idk_train_class_b = _predict_and_report(
        svm, input_train_data, "[TRAIN] PREDICT CLASSES", 40, "train observations")

    # 2.b) [TEST] Predict the class
    predicted_test_classes, idk_test_class_a, idk_test_class_b = _predict_and_report(
        svm, input_test_data, "[TEST] PREDICT CLASSES", 40, "test observations")

    if verbose:
        # Days (test set) on which at least one observation fell in each class
        print("-" * 30, "CLASS '-1'", "-" * 30)
        display(sorted(set(test_obs.loc[test_obs.index[idk_test_class_a], :].index.strftime('%Y-%m-%d')))) # (%H:%M)
        print("\n"+ "-" * 30, "CLASS '1'", "-" * 30)
        display(sorted(set(test_obs.loc[test_obs.index[idk_test_class_b], :].index.strftime('%Y-%m-%d'))))

    # 2.c) [ALL DATA] Predict the class
    input_data = df[input_columns]
    predicted_classes, idk_class_a, idk_class_b = _predict_and_report(
        svm, input_data, "[ALL DATA] PREDICT CLASSES", 38, "all the observations")

    # 3) Create the new columns (99 is the placeholder for rows that are not assigned below)
    df.loc[:, 'Predicted class'] = 99
    df.loc[:, "Dataset_type"] = 99

    # 3.1) Filling the data with the predicted classes
    df.loc[df.index[idk_class_a], 'Predicted class'] = -1
    df.loc[df.index[idk_class_b], 'Predicted class'] = 1

    # 3.2) Label the dataset with the dataset type (-10: train, 10: test)
    df.loc[train_obs.index, "Dataset_type"] = -10
    df.loc[test_obs.index, "Dataset_type"] = 10

# Label the dataset

## a) Retrieve the failure events

In [None]:
# Failure events found in the whole dataset of each inverter:
# inv_name --> (fault_events, unique_faults)
inv_fault_events = dict()
for inv_name in inv_names:
    df = inv_data[inv_name]
    fault_events, unique_faults = fault_utils.find_fault_observation(fault_df, df, inv_name, verbose=False)

    inv_fault_events[inv_name] = (fault_events, unique_faults)
    # The search runs on all the observations of the inverter (not only on the test split)
    print(inv_name, f"[ALL DATA]: {len(unique_faults)} unique fault events ({len(fault_events)} obs.)")

## b) Assign a class depending on the failure timestamps

In [None]:
# Name of the column that stores whether an observation belongs to a failure event
failure_col_name = 'Failure event'

In [None]:
# Labels written in the failure-event column, keyed by the kind of observation
class_names = {
    'Nominal obs': 'No',
    'Failure events': 'Yes',
}

In [None]:
# Flag every observation of each inverter as nominal or as part of a failure event
for inv_name in inv_names:
    print("-" * 30, inv_name, "-" * 30)
    df = inv_data[inv_name]
    fault_events, unique_faults = inv_fault_events[inv_name]

    # Materialise the failure timestamps: a list is an unambiguous `.loc` indexer
    failure_ts = list(fault_events.keys())

    print(f"TOTAL (All data): {len(df)} obs.")
    print(f"FAILURE EVENTS: {len(failure_ts)} obs.")

    # Default class
    df.loc[:, failure_col_name] = class_names['Nominal obs']

    # Class failure events
    df.loc[failure_ts, failure_col_name] = class_names['Failure events']

    # Retrieve the classes
    classes = df.groupby(by = failure_col_name).count()['Predicted class'].to_dict()
    # An inverter without failure events has no 'Yes' group at all: count it as zero
    assert classes.get(class_names['Failure events'], 0) == len(failure_ts)

    print("\n" + "-" * 10, "CLASSES", "-" * 10)
    print(classes, "\n")

# Visualize the outcomes

In [None]:
# Temperature column paired with the irradiance to build the ambient conditions of the plots
temp_col = 'Cell Temp (°C)' #'Amb. Temp (°C)'

In [None]:
# Subsets that can be visualized/evaluated: the train split, the test split, or all the observations
dataset_types = ['train', 'test', 'All']

In [None]:
# Subset shown in the plots (position in `dataset_types`)
data_to_visualize = dataset_types[2]

In [None]:
# For each inverter, draw one panel (voltage + current 3D scatter plots) per colouring:
# predicted class, actual failure events, residuals and (only for 'All') dataset type.
for inv_name in inv_names:
    print(120 * "-" +"\n" + 57 * "-", inv_name, 57 * "-" + "\n" + 120 * "-")

    # Retrieve the main dataset
    data = inv_data[inv_name]

    # Keep only the observations of the selected subset
    if data_to_visualize == 'test':
        test_ts = test_data[inv_name].index
        test_df = data.loc[test_ts, :]
        df = test_df
    elif data_to_visualize == 'train':
        train_ts = train_data[inv_name].index
        # Slice with the TRAIN timestamps (the test timestamps were used here by mistake)
        train_df = data.loc[train_ts, :]
        df = train_df
    else:
        df = data

    # 0) Create the inputs data
    amb_conditions = np.array(list(zip(df[temp_col].values, df['Irradiance (W/mq)'].values)))
    voltage_values = np.array(df['Vcc 1 (V)'].values)
    current_values = np.array(df['Cc 1 (A)'].values)
    max_voltage = np.array(df['Maxiumum Voltage (V)'].values)
    max_current = np.array(df['Maxiumum Current (A)'].values)

    # Visualize both the predicted classes and the residuals
    title_types = [
        f"({data_to_visualize.upper()}) Classification (A)",
        f'({data_to_visualize.upper()}) Actual failure events (B)',
        f"({data_to_visualize.upper()}) Residuals (C)",
        f"({data_to_visualize.upper()}) Dataset type (D)"
    ]

    # One entry per panel; a tuple carries distinct colourings for (voltage, current)
    hue_types = [
        df['Predicted class'].values,
        df[failure_col_name].map({'No': 99, 'Yes': 101}).values,
        (df['Residuals (V)'].values, df['Residuals (A)'].values),
        df['Dataset_type'].values
    ]
    for idk, hue_values in enumerate(hue_types):
        # The dataset-type panel is only meaningful when all the observations are shown
        if data_to_visualize != 'All' and idk == 3:
            continue

        # 1) Create the visual panel
        fig = plt.figure(figsize=(20, 20))
        fig.suptitle(f"[{inv_name}] {title_types[idk]}", size = 40, y = 0.8)

        # 1.1) [VOLTAGE] Generate the 3-dimensional subplot
        voltage_hue_values = hue_values[0] if isinstance(hue_values, tuple) else hue_values
        interpolation_utils.generate_sub_graph(fig, 0, amb_conditions, voltage_values, max_voltage, 'Voltage (V)',
                                               pov_elev = 20, pov_angle = -140, visualize_actual_points = True,
                                               visualize_surface = False,  hue_values = voltage_hue_values)

        # 1.2) [CURRENT] Generate the 3-dimensional subplot
        current_hue_values = hue_values[1] if isinstance(hue_values, tuple) else hue_values
        interpolation_utils.generate_sub_graph(fig, 1, amb_conditions, current_values, max_current, 'Current (A)',
                                               pov_elev = 20, pov_angle = -140, visualize_actual_points = True,
                                               visualize_surface = False, hue_values = current_hue_values)

        # Visualize the graphical panel
        fig.tight_layout()
        plt.show()

# Compute metrics

## 1) Compute TP, FP, TN & FN

In [None]:
# Confusion counts (TP, FP, TN, FN) of the One-Class SVM, for every subset and inverter.
# A predicted class of -1 is treated as a predicted failure, +1 as a predicted nominal observation.
# Result: metrics_types[subset][inv_name] --> {'TP': .., 'FP': .., 'TN': .., 'FN': .., ...}

# Names shared by every subset/inverter (loop-invariant)
failure_col_name = 'Failure event'
col_name = 'metrics'
metrics_names = ['TP', 'FP', 'TN', 'FN']

metrics_types = dict()
for selected_dataset in dataset_types:
    print("\n" + "-" * 104, "\n" + "-" * 50 + selected_dataset.upper() + "-" * 50 + "\n" + "-" * 104)

    inv_metrics = dict()
    for inv_name in inv_names:
        print("-" * 30, inv_name, "-" * 30)

        # Retrieve the main dataset
        # The train/test subsets are explicit copies: they are written to below, and writing
        # to a `.loc` slice triggers pandas' SettingWithCopy warning.
        data = inv_data[inv_name]
        if selected_dataset == 'test':
            test_ts = test_data[inv_name].index
            test_df = data.loc[test_ts, :].copy()
            df = test_df
        elif selected_dataset == 'train':
            train_ts = train_data[inv_name].index
            train_df = data.loc[train_ts, :].copy()
            df = train_df
        else:
            # All the observations: the main dataset itself is labelled (in place)
            df = data
        print(f"TOTAL ({selected_dataset}): {len(df)} obs.")

        # Retrieve the failure events
        fault_events, unique_faults = inv_fault_events[inv_name]

        # Label the dataset (only the failure timestamps that belong to the selected subset)
        df.loc[:, failure_col_name] = "No"

        failure_ts = [ts for ts in fault_events.keys() if ts in df.index]
        print(f"FAILURE EVENTS: {len(failure_ts)} obs.")
        df.loc[failure_ts, failure_col_name] = "Yes"

        # Compute the metrics ("Unknown" is kept for rows whose predicted class is neither -1 nor 1)
        df.loc[:, col_name] = "Unknown"

        is_failure = df[failure_col_name] == "Yes"
        is_nominal = df[failure_col_name] == "No"
        predicted_failure = df['Predicted class'] == -1
        predicted_nominal = df['Predicted class'] == 1

        # True/False POSITIVE
        df.loc[is_failure & predicted_failure, col_name] = "TP"
        df.loc[is_nominal & predicted_failure, col_name] = "FP"

        # True/False NEGATIVE
        df.loc[is_nominal & predicted_nominal, col_name] = "TN"
        df.loc[is_failure & predicted_nominal, col_name] = "FN"

        # Retrieve the values for the metrics
        grouped_df = df.groupby(by = col_name).count()['Predicted class']
        raw_metrics = grouped_df.to_dict()

        # Add potential missing metrics (due to missing cases) --> e.g., TP may be missing
        for metrics_name in metrics_names:
            raw_metrics.setdefault(metrics_name, 0)
        # Every observation must fall in exactly one group
        assert sum(raw_metrics.values()) == len(df)

        # Save the metrics
        inv_metrics[inv_name] = raw_metrics
        print(f"\nTP: {raw_metrics['TP']}\nFP: {raw_metrics['FP']}\n{'-' * 10}\n"\
              f"TN: {raw_metrics['TN']}\nFN: {raw_metrics['FN']}\n")
    metrics_types[selected_dataset] = inv_metrics

## 2) Compute the final metrics (F1-score, recall, precision, fall-out)

In [None]:
for selected_dataset in dataset_types:
    print("\n" + "-" * 104, "\n" + "-" * 50 + selected_dataset.upper() + "-" * 50 + "\n" + "-" * 104)

    for inv_name in inv_names:
        print("\n" + "-" * 70, "\n" + "-" * 40 + inv_name + "-" * 40 + "\n" + "-" * 70)
        metrics = metrics_types[selected_dataset][inv_name]

        # Compute the metrics (i.e., recall, ...)
        recall, miss_rate, fall_out, precision, f1_score = compute_metrics(metrics['TP'], metrics['FP'], metrics['FN'], 
                                                                           metrics['TN'], verbose = False)