In [1]:
# Make the parent directory importable (presumably where the project-local
# `_library` package imported by the next cell lives).
# `sys` is imported as a module instead of `from sys import path`: later cells
# bind the same name with `from os import path`, so re-running this cell after
# them would silently turn every `path.join(...)` into a call on the
# `sys.path` list.
import sys

if '..' not in sys.path:
    sys.path.insert(0, '..')

In [2]:
import pandas as pd
from os import path, makedirs
from _library.utils import SYSTEM_NAMES_FULL

In [3]:
# Folder path
# Move the kernel's working directory to the project root: the relative
# "./data/..." paths built in the following cells are resolved against it.
# NOTE(review): machine-specific absolute path — adjust before running elsewhere.
%cd /mnt/data/vieri/projects/SAMPLE/
# Show the system names exposed by `_library.utils`
print(SYSTEM_NAMES_FULL)

/mnt/data/vieri/projects/SAMPLE
['Binetto 1', 'Binetto 2', 'Cantore', 'Emi', 'Soleto 1', 'Soleto 2', 'Galatina', 'Verone']


In [4]:
# System whose alarm workbook ("<system_name> - Storico Allarme.xlsx") is processed below.
# NOTE(review): presumably one of the names listed in SYSTEM_NAMES_FULL — confirm.
system_name = "Soleto 1"

In [5]:
# STATIC VARIABLES
# The data of a system live in ./data/<SYSTEM>/<SYSTEM>/ (upper-cased name,
# repeated twice) and its alarm workbook is "<system> - Storico Allarme.xlsx".
folder_path = path.join(*(["./data"] + [system_name.upper()] * 2))
fault_file_name = "{0} - Storico Allarme.xlsx".format(system_name)

In [6]:
# IMPORT
# sheet_name=None loads every sheet of the workbook into a dict keyed by the
# sheet name; header=[1] takes the column labels from the second row.
fault_file_path = path.join(folder_path, fault_file_name)
fault_data = pd.read_excel(fault_file_path, sheet_name = None, header = [1])
sheet_names = [*fault_data]

In [7]:
# Merge data from all components
# Every sheet is stacked into one frame; rows keep their per-sheet index
# labels, so the resulting index is not unique.
components_to_merge = [fault_data[name] for name in sheet_names]
full_fault_data = pd.concat(components_to_merge)

# DataFrame.info() prints its summary itself and returns None, so it must not
# be wrapped in display(): that only adds a stray "None" to the cell output.
full_fault_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 143545 entries, 0 to 1614
Data columns (total 4 columns):
 #   Column            Non-Null Count   Dtype         
---  ------            --------------   -----         
 0   Tipologia Evento  143545 non-null  object        
 1   Messaggio         143545 non-null  object        
 2   Ricevuto il       143545 non-null  datetime64[ns]
 3   Rientrato il      143545 non-null  object        
dtypes: datetime64[ns](1), object(3)
memory usage: 5.5+ MB


None

In [39]:
def compute_frequencies(dataframe, splitSecondaryError = False):
    """Count how often each fault appears in an alarm log.

    The "Messaggio" column is parsed as
    "<component>:<message> (<secondary error code>)".

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Alarm log with (at least) the columns "Tipologia Evento", "Messaggio"
        and "Ricevuto il". It is copied, never modified.
    splitSecondaryError : bool, default False
        * False: events are grouped by event type and by the bare message
          (the text following the last "]" of the message part, stripped).
        * True: events are grouped by event type, component, secondary error
          code and the message part preceding the first "(". Events without a
          secondary error code are counted under the empty code "".

    Returns
    -------
    df : pandas.DataFrame
        One "Frequenza" column indexed by the grouping keys, sorted by the
        keys (ascending) and then by decreasing frequency.
    df_total : pandas.DataFrame
        Indexed by "Tipologia Evento", with the total "Frequenza" and its share
        "Total faults (%)" (two decimals), sorted by decreasing frequency.
    first_ts : pandas.Series
        One-element series named "First timestamp" holding the earliest
        "Ricevuto il" value formatted as '%Y-%m-%d (%H:%M)'.

    Raises
    ------
    ValueError
        If "Ricevuto il" holds no valid timestamp (e.g. the log is empty).
    """
    df = dataframe.copy()

    # Earliest reception time: min() skips missing values and does not need
    # the whole column to be sorted.
    earliest = df['Ricevuto il'].min()
    if pd.isna(earliest):
        raise ValueError("'Ricevuto il' does not contain any valid timestamp")
    first_ts = pd.Series(pd.to_datetime(earliest).strftime('%Y-%m-%d (%H:%M)'), name = 'First timestamp')

    # Extract the meaningful parts of the message (each split is done once)
    message_parts = df["Messaggio"].str.split(":")
    component_name = message_parts.str[0]
    full_error_message = message_parts.str[1]

    code_parts = full_error_message.str.split("(")
    message_no_sec_code = code_parts.str[0]
    simple_error_message = message_no_sec_code.str.split(']').str[-1].str.strip()
    secondary_error_code = code_parts.str[1].str[:-1]  # drop the closing ")"

    df["Component"] = component_name

    if splitSecondaryError:
        # groupby() silently discards rows whose key is missing: give the
        # events without a "(code)" suffix an empty code so they are counted.
        df["Secondary error code"] = secondary_error_code.where(secondary_error_code.notna(), "")
        df["Messaggio"] = message_no_sec_code
        group_keys = ["Tipologia Evento", "Component", "Secondary error code", "Messaggio"]
        sort_keys = ['Tipologia Evento', "Component", "Secondary error code", 'Frequenza']
        sort_ascending = [True, True, True, False]
    else:
        df["Secondary error code"] = secondary_error_code
        df["Messaggio"] = simple_error_message
        group_keys = ["Tipologia Evento", "Messaggio"]
        sort_keys = ['Tipologia Evento', 'Frequenza']
        sort_ascending = [True, False]

    # NOTE(review): a message without ":" has no message part, hence a missing
    # "Messaggio" key, and is still left out by the grouping (unchanged).
    df = (df.groupby(group_keys)[["Ricevuto il"]]
            .count()
            .rename(columns={"Ricevuto il": "Frequenza"}))

    # Compute total faults for each event type
    df_total = df.groupby(["Tipologia Evento"]).sum()
    df_total["Total faults (%)"] = (df_total["Frequenza"] / df_total["Frequenza"].sum() * 100).round(2)

    # Order the dataframes according to their frequency
    df_total = df_total.sort_values(by=['Frequenza'], ascending=False)
    df = df.sort_values(by=sort_keys, ascending=sort_ascending)
    return df, df_total, first_ts


In [40]:
# Frequencies per bare message, totals per event type and first timestamp
freq_faults, freq_type_faults, first_ts = compute_frequencies(full_fault_data)
# Detailed view (component / secondary code): only the first table is needed
freq_faults_SecondErr = compute_frequencies(full_fault_data, splitSecondaryError = True)[0]

display(freq_type_faults, freq_faults_SecondErr, freq_faults)

First timestamp    2018-08-08 (10:15)
Name: First timestamp, dtype: object

First timestamp    2018-08-08 (10:15)
Name: First timestamp, dtype: object

Unnamed: 0_level_0,Frequenza,Total faults (%)
Tipologia Evento,Unnamed: 1_level_1,Unnamed: 2_level_1
Allarme string-box,128057,89.21
Bassa performance stringhe,6970,4.86
String-box con produzione anomala,6503,4.53
Bassa power ratio inverter,1482,1.03
Allarme inverter,319,0.22
Ritardo comunicazione dispositivo,153,0.11
Inverter con produzione a 0,61,0.04


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Frequenza
Tipologia Evento,Component,Secondary error code,Messaggio,Unnamed: 4_level_1
Allarme inverter,INV1 U090227 250kWp,0x200AD400,[0x20000] Desaturazione IGBT inverter,2
Allarme inverter,INV1 U090227 250kWp,0x200AD400,[0x80000] Inverter fault,2
Allarme inverter,INV1 U090227 250kWp,0x2101D148,[0x10000] Limitazione potenza AC per sovratemperatura,3
Allarme inverter,INV1 U090227 250kWp,0x3001D248,[0x10000] Limitazione potenza AC per sovratemperatura,1
Allarme inverter,INV1 U090227 250kWp,0x3020D240,[0x200000] Tensione rete fuori dai limiti,20
...,...,...,...,...
Allarme string-box,CSP4.6 V180543 s9,0x16DB,[3] Corrente di stringa fuori range,11
Allarme string-box,CSP4.6 V180543 s9,0x365B,[3] Corrente di stringa fuori range,1
Allarme string-box,CSP4.6 V180543 s9,0x36C9,[3] Corrente di stringa fuori range,7
Allarme string-box,CSP4.6 V180543 s9,0x36CB,[3] Corrente di stringa fuori range,1


Unnamed: 0_level_0,Unnamed: 1_level_0,Frequenza
Tipologia Evento,Messaggio,Unnamed: 2_level_1
Allarme inverter,Tensione rete fuori dai limiti,158
Allarme inverter,Limitazione potenza AC per sovratemperatura,104
Allarme inverter,Inverter fault,26
Allarme inverter,Sovratemperatura interno macchina,19
Allarme inverter,Sovracorrente inverter,4
Allarme inverter,Desaturazione IGBT inverter,3
Allarme inverter,Intervento protezione esterna,2
Allarme inverter,Desaturazione IGBT DC/DC inverter,1
Allarme inverter,Emergency power OFF,1
Allarme inverter,Sovratensione DC ingresso convertitore,1


In [41]:
# TASK: Save frequencies
save_folder_name = "Fault analyses"
save_folder_path = path.join(folder_path, save_folder_name)

# 0: Create the folders
if not path.exists(save_folder_path):
    makedirs(save_folder_path)
    print("A new saving folder has been created")

# Save as excel
file_name = system_name + " - Fault_frequencies_NEW.xlsx"

# The context manager writes and closes the workbook when the block is left,
# also when one of the to_excel() calls fails. It replaces the explicit
# writer.save(), which was deprecated in pandas 1.5 and removed in pandas 2.0.
with pd.ExcelWriter(path.join(save_folder_path, file_name)) as writer:
    # Sheets (first row frozen so the header stays visible while scrolling)
    freq_faults.to_excel(writer, sheet_name = "Faults", freeze_panes = (1, 0))
    freq_faults_SecondErr.to_excel(writer, sheet_name = "Raw fault data", freeze_panes = (1, 0))
    freq_type_faults.to_excel(writer, sheet_name = "Types", freeze_panes = (1, 0))
    first_ts.to_excel(writer, sheet_name = 'Info', index = False)

print("Frequencies have been saved into '{0}'".format(save_folder_path))

Frequencies have been saved into './data/SOLETO 1/SOLETO 1/Fault analyses'


In [None]:
#   --------------------------------------------------------------------------
#  ------------- ABOVE CODE: Compute frequencies (as Excel files) --------------
# -------------------------------------------------------------------------------
#  ------------- BELOW CODE: Compute distributions (as Graphs)------------------
#   --------------------------------------------------------------------------

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from matplotlib.dates import MonthLocator, WeekdayLocator, DateFormatter
from matplotlib.ticker import MaxNLocator
from pandas.plotting import register_matplotlib_converters
from os import path, makedirs

In [None]:
# Folder path
# Same project root as in the first part of the notebook: the relative
# "./data/..." paths used below are resolved against it.
# NOTE(review): machine-specific absolute path — adjust before running elsewhere.
%cd /mnt/data/vieri/projects/SAMPLE/

In [None]:
# ----------- TASK 2 -------------
# NOTE: this rebinds system_name / folder_path / fault_data / full_fault_data
# used by the first part of the notebook.
system_name = "Emi"
folder_path = path.join("./data", system_name.upper(), system_name.upper())
fault_file_name = system_name + " - Storico Allarme.xlsx"

# Load sheets (sheet_name=None returns a dict: sheet name -> DataFrame)
fault_data = pd.read_excel(path.join(folder_path, fault_file_name), header = [1], sheet_name = None)
sheet_names = list(fault_data.keys())

# Merge sheets
components_to_merge = [fault_data[name] for name in sheet_names]
full_fault_data = pd.concat(components_to_merge)

# DataFrame.info() prints its summary itself and returns None, so it must not
# be wrapped in display(): that only adds a stray "None" to the cell output.
full_fault_data.info()

In [None]:
# Level of detail of the distributions: one per message ("Messaggio") or one
# per event type ("Tipologia Evento")
granularity = "Messaggio"
#granularity = "Tipologia Evento"

# Split "<component>:<message> (<secondary error code>)" into its parts
message_parts = full_fault_data["Messaggio"].str.split(":")
FullErrorMessage = message_parts.str[1]
message_and_code = FullErrorMessage.str.split("(")
SecondaryErrorCode = message_and_code.str[1].str[:-1]

full_fault_data["Component"] = message_parts.str[0]
full_fault_data["Secondary error code"] = SecondaryErrorCode
full_fault_data["Messaggio"] = message_and_code.str[0]
full_fault_data["Day"] = full_fault_data["Ricevuto il"].dt.date # OR "Rientrato il"

# Number of events per component, per granularity value and per day.
# NOTE: full_fault_data is rebound to the grouped table, so this cell cannot be
# re-run without re-running the loading cell first.
group_keys = ["Component", granularity, "Day"]
full_fault_data = (
    full_fault_data[group_keys + ["Ricevuto il"]] #"Secondary error code"
    .groupby(group_keys)
    .count()
    .rename(columns = {"Ricevuto il": "Frequenza"})
    .sort_values(by = group_keys + ["Frequenza"], ascending = [False, True, True, False])
)
#display(full_fault_data)

In [None]:
# TASK: Create subfolders for saving graphs
# Graphs are stored in <folder_path>/Fault analyses/Fault distributions
main_saving_folder_name, sub_saving_folder_name = "Fault analyses", "Fault distributions"
relative_saving_path = path.join(main_saving_folder_name, sub_saving_folder_name)
saving_folder_path = path.join(folder_path, relative_saving_path)

# 0: Create the (nested) folder the first time and report it
folder_is_missing = not path.exists(saving_folder_path)
if folder_is_missing:
    makedirs(saving_folder_path)
    print("A new saving folder has been created: ", relative_saving_path)

In [None]:
def generate_component_fault_distribution(component_name):
    """Plot and save the daily fault distribution of one component.

    Reads the notebook-level names `full_fault_data` (indexed by
    "Component" / `granularity` / "Day", with a "Frequenza" column),
    `granularity`, `system_name` and `saving_folder_path`.

    One bar chart per fault type is drawn (all charts share the same dense
    range of days, days without faults are shown as 0) and the figure is saved
    as "<simplified component name>_faultDist.png" in `saving_folder_path`.
    The figure is left open so that it can be shown afterwards.

    Parameters
    ----------
    component_name : str
        Name of the component. A component with exactly this name is used when
        it exists; otherwise the name is treated as a literal fragment and all
        the components containing it are selected (their daily counts are
        summed).

    Raises
    ------
    ValueError
        If no component matches `component_name`.
    """
    # A: Select and get data from a specific component.
    # An exact match is tried first: a plain substring/regex match would make
    # e.g. "... s1" also select "... s10", and would interpret "." or "(" in
    # the name as regular-expression syntax.
    component_level = full_fault_data.index.get_level_values('Component')
    componentNameIndex = np.asarray(component_level == component_name, dtype=bool)
    if not componentNameIndex.any():
        componentNameIndex = np.asarray(
            component_level.str.contains(component_name, regex=False, na=False), dtype=bool)
    if not componentNameIndex.any():
        raise ValueError("No component matches '{0}'".format(component_name))

    selected_component_data = full_fault_data.loc[componentNameIndex,:]
    componentFullName = selected_component_data.index.get_level_values("Component")[0]

    # B0: Get fault types from the selected component (most frequent first)
    faultTypes = selected_component_data.sort_values(by="Frequenza", ascending=False).index.unique(granularity)

    # B0: Get a dense period --> Get all the dates covered by all the faults
    days = selected_component_data.index.get_level_values('Day')
    dateMin = min(days)
    dateMax = max(days)
    maxFreq = selected_component_data["Frequenza"].max()
    filled_dates = pd.date_range(dateMin, dateMax, freq="D").date

    # B1: Create main plot
    fig, axes = plt.subplots(nrows = len(faultTypes), figsize=(20, 5 * len(faultTypes)))

    # With a single fault type (i.e., only one graph/row) a bare Axes is returned
    if not isinstance(axes, np.ndarray):
        axes = np.array([axes])

    fig.suptitle("[{0}]: ".format(system_name.upper()) + componentFullName,
                 fontsize=50, color='r')

    # B2: Create subplots for each fault type
    for idk, f_type in enumerate(faultTypes):
        ax = axes[idk]

        # Get frequencies of the fault type
        fault_indexes = selected_component_data.index.get_level_values(granularity) == f_type
        faults = selected_component_data.loc[fault_indexes, "Frequenza"]

        # Generate filled frequencies (i.e., zero values for empty dates).
        # The counts are aligned on the day label itself, so the result does
        # not depend on the order of the rows.
        daily_faults = faults.groupby(level="Day").sum()
        filled_frequencies = daily_faults.reindex(filled_dates, fill_value=0).to_numpy()

        # First day with the highest number of faults
        idk_worst_day = np.argmax(filled_frequencies)
        worst_day = filled_dates[idk_worst_day]
        worst_day_freq = filled_frequencies[idk_worst_day]

        # SUBPLOTS (for each type): FAULT TYPE
        ax.grid(linestyle = '-', linewidth = 0.8, alpha = 0.3)
        ax.bar(x=filled_dates, height=filled_frequencies, edgecolor="r")

        # Highlight the worst day (the label starts 11 weeks before it)
        ax.axhline(y = worst_day_freq, color='r', linestyle='-.', alpha = 0.3)
        ax.text(x = worst_day - pd.Timedelta(weeks = 11), y = worst_day_freq * 1.05,
                fontsize = "xx-large", ha= "left", color = "r",
                s = "Max faults: {0} [{1}]".format(worst_day_freq, worst_day))

        # Graphical settings
        ax.set_title('FAULT: "{0}"'.format(f_type), fontsize=24, fontweight="semibold")
        ax.set_ylabel('Daily faults', fontsize=20)

        # Tickers
        ax.tick_params(axis='x', which='minor', length=5)
        ax.tick_params(axis='x', which='major', labelsize=14, labelrotation = 20,
                       width=1.5, length=20)
        ax.tick_params(axis='y', which='major', labelsize=16,
                       width=2, length=5, direction = "in",
                       grid_alpha= 0.5, grid_linestyle = "-.")

        ax.xaxis.set_minor_locator(WeekdayLocator(interval=2))
        ax.xaxis.set_major_locator(MonthLocator(interval=2))
        ax.xaxis.set_major_formatter(DateFormatter("%Y-%m"))

        ax.yaxis.set_major_locator(MaxNLocator(integer=True))

        # Set y limits (i.e., frequencies): rare fault types (peak <= 30% of
        # the component's peak) get their own scale, the others share one.
        if worst_day_freq <= maxFreq * 0.3:
            ax.set_ylim([0, worst_day_freq * 1.7])
        else:
            ax.set_ylim([0, maxFreq * 1.2])

    # The layout adjustment is best-effort: report the failure and go on
    try:
        fig.tight_layout(rect=[0, 0.03, 1, 0.96], pad = 2.3)
    except Exception as error:
        print("Error during set 'tight_layout': {0}".format(error))

    # Saving the graphs as PNG
    splitted_component_name = component_name.split(" ")
    if "INV" in splitted_component_name[0].upper() or len(splitted_component_name) <= 2:
        simplifed_comp_name = splitted_component_name[0]
    else: # COMPONENT: CSP
        simplifed_comp_name = splitted_component_name[0] + "_" + splitted_component_name[2]

    fig.savefig(path.join(saving_folder_path, '{0}_faultDist.png'.format(simplifed_comp_name)))
    print("The fault distribution graphs (Component: {0}) have been saved ".format(simplifed_comp_name))

# Register the pandas date converters with matplotlib before plotting
register_matplotlib_converters()

# Generate fault distribution for each component
# (start from a clean slate: close every figure that is still open)
plt.close("all")
components = [*full_fault_data.index.unique("Component")]
n_components = len(components)

print("Genereting fault distribution graphs for {0} components...".format(n_components))
for name in components:
    generate_component_fault_distribution(name)
print("\nFinished. {0} files have been generated".format(n_components))

In [None]:
# Visualize a fault distribution as an example
component_to_visualize = "INV1"

# Discard all of unrelated graphs and keep only the graph of the selected component
for fig_idk in plt.get_fignums():
    figure = plt.figure(fig_idk)

    # Figure titles look like "[<SYSTEM NAME>]: <component full name>".
    # The component is what follows "]: "; splitting the whole title on spaces
    # would return a piece of the system name whenever that name contains a
    # space (e.g. "[SOLETO 1]: INV1 ..."). Figures without a title never match.
    suptitle = figure._suptitle
    plotted_component = suptitle.get_text() if suptitle is not None else ""
    simplified_component_name = plotted_component.split("]: ", 1)[-1].split(" ")[0]

    if simplified_component_name != component_to_visualize:
        plt.close(fig_idk)
    else:
        fig_to_vis = figure

if len(plt.get_fignums()) != 0:
    print("Rendering fault distribution graphs of the component '{0}'".format(component_to_visualize))
    plt.show()
else:
    print("The component '{0}' has not been found in the memory".format(component_to_visualize))