In [None]:
import pandas as pd
import numpy as np
#import matplotlib.pyplot as plt
import datetime
from datetime import timedelta

import warnings
warnings.filterwarnings("ignore")

import glob

In [None]:
# Obtain all the models in the relevant folder recursively

def determine_number_of_models(folder_to_search):
    """Find every prediction file below ``folder_to_search``.

    Each folder matched by ``folder_to_search + "**/"`` is scanned for entries
    whose name contains ``tst_pred``. Matches from *all* folders are collected,
    in discovery order, without duplicates.

    Parameters
    ----------
    folder_to_search : str
        Prefix of the glob pattern. It is concatenated directly with ``"**/"``.
        NOTE(review): without a trailing path separator the ``**`` is fused to
        the last path component instead of standing alone -- confirm that this
        is the intended search scope.

    Returns
    -------
    tuple
        ``(number_of_models, model_location, model_names)`` where
        ``model_location`` is the list of matched paths and ``model_names`` the
        corresponding file names without directory and extension.
    """
    import os  # function-scope import so the cell does not depend on the import cell

    folders = glob.glob(folder_to_search + "**/", recursive = True)

    model_location = []
    for folder in folders:
        # Accumulate across folders; rebinding the list here would keep only
        # the matches of the last folder visited.
        for found in glob.glob(folder + "/*tst_pred*", recursive = True):
            if found not in model_location:
                model_location.append(found)

    # os.path copes with both separator styles and with any extension length.
    model_names = [os.path.splitext(os.path.basename(found))[0] for found in model_location]

    return len(model_location), model_location, model_names

In [None]:
# Glob prefix handed to determine_number_of_models, which looks for files whose
# names contain "tst_pred" and returns their count, paths and names.
# NOTE(review): machine-specific absolute path -- adjust before running elsewhere.
folder_to_search = r'C:/Users/Ranja.Sarkar/PAMre/KPIs/BSP_GT5/QualitativeKPI/fold'
no_of_models, model_location, model_names = determine_number_of_models(folder_to_search)


In [None]:
# Load the failure file. It must provide a 'Failure Date' column (parsed as dates); the 'RCA_Tags' column is read later for the root-cause check.

def load_excel(path):
    """Read the workbook at ``path`` and return it as a DataFrame.

    The 'Failure Date' column is handed to pandas as a date column.
    """
    date_columns = ['Failure Date']
    return pd.read_excel(path, parse_dates = date_columns)


In [None]:
# load the predicted output files

def load_csv(path):
    """Read a CSV file and index it by its parsed 'timestamp' column.

    Parameters
    ----------
    path : str
        Location of a CSV file with a 'timestamp' column whose values are
        formatted as ``%Y-%m-%d %H:%M:%S``.

    Returns
    -------
    pandas.DataFrame
        The file contents with 'timestamp' converted to datetimes and used as
        the index.

    Raises
    ------
    ValueError
        If a timestamp does not match the expected format.
    """
    dfoutput = pd.read_csv(path)
    # Parse explicitly instead of via `date_parser` + `pd.datetime`: the latter
    # alias no longer exists in pandas 2.x and `date_parser` is deprecated there.
    dfoutput['timestamp'] = pd.to_datetime(dfoutput['timestamp'], format = '%Y-%m-%d %H:%M:%S')
    return dfoutput.set_index('timestamp')


In [None]:
# Failure workbook, read with load_excel (which parses the 'Failure Date' column as dates).
# NOTE(review): machine-specific absolute path -- adjust before running elsewhere.
location_of_failure_file = r'C:/Users/Ranja.Sarkar/PAMre/KPIs/BSP_GT5/QualitativeKPI/fold/GT5-downtime.xlsx'
df_failure = load_excel(location_of_failure_file)


In [None]:
# Evaluate number of alerts before failure

def evaluate_number_of_alerts_before_failure(df_failure, alert_results, model_name, output):
    """Count, per failure, the alerts raised in the 14 days leading up to it.

    For every row label ``0 .. len(df_failure) - 1`` the rows of
    ``alert_results`` whose index lies in ``[failure - 14 days, failure)`` are
    selected. If any exist, only those with ``system_status == 'online'``,
    ``prediction == -1`` and ``alert == 1`` are kept. The number of remaining
    rows is written to ``output.loc[row, model_name]``.

    ``output`` is modified in place and also returned.
    """
    lookback = timedelta(hours = 14*24)

    for row in range(len(df_failure)):
        failure_time = df_failure.loc[row, 'Failure Date']
        window_start = failure_time - lookback

        in_window = (alert_results.index >= window_start) & (alert_results.index < failure_time)
        window = alert_results[in_window]

        if len(window) > 0:
            is_alert = (
                (window['system_status'] == 'online')
                & (window['prediction'] == -1)
                & (window['alert'] == 1)
            )
            window = window[is_alert]

        output.loc[row, model_name] = len(window)

    return output


In [None]:
# Generate Alert Ids

def create_model_ids(model):
    """Return the model ids ``'M0', 'M1', ...`` for ``model`` models.

    Parameters
    ----------
    model : int
        Number of ids to generate. The count is taken from this argument
        rather than from the notebook-level ``no_of_models`` variable, so the
        function no longer depends on hidden state.

    Returns
    -------
    list of str
        ``['M0', ..., 'M<model-1>']``; empty when ``model`` is 0.
    """
    return ['M' + str(i) for i in range(int(model))]

def create_list(a, b, n):
    """Build the string ``'a.b.1;a.b.2;...;a.b.n'``.

    ``n`` is truncated with ``int``; a count of zero (or less) gives ``''``.
    """
    prefix = str(a) + '.' + str(b) + '.'
    return ';'.join(prefix + str(serial) for serial in range(1, int(n) + 1))

def generate_alert_ids(output, model_name, m):
    """Add the ``<model_name>_AlertIds`` column to ``output``.

    For each row label ``0 .. len(output) - 1`` the alert count stored in
    column ``model_name`` is expanded by ``create_list`` into ids of the form
    ``<m>.<row + 1>.<serial>``. ``output`` is modified in place and returned.
    """
    ids_column = model_name + '_AlertIds'
    for row in range(len(output)):
        alert_count = output.loc[row, model_name]
        output.loc[row, ids_column] = create_list(m, row + 1, alert_count)
    return output


In [None]:
# RCA tag-rank CSV, read with load_csv (which indexes the frame by its 'timestamp' column).
# isintop10 later reads the columns 'top1' .. 'top10' from this frame.
# NOTE(review): machine-specific absolute path -- adjust before running elsewhere.
location_of_rca_file = r'C:/Users/Ranja.Sarkar/PAMre/KPIs/BSP_GT5/QualitativeKPI/fold/Brunei_Champion_GT5_w1d_rca_by_rolling_z_score_01Aug2018_01Jul2019_rca_tag_rank.csv'
df_rca = load_csv(location_of_rca_file)


In [None]:
# Root Cause Tags

def isintop10(df_rca, sampdf, listofrcatags):
    """Flag, per row of ``sampdf``, whether a listed tag is among the top 10.

    ``listofrcatags`` is a ';'-separated string of tag names. For each index
    label of ``sampdf`` the values of ``df_rca`` columns 'top1' .. 'top10' at
    that label are compared with those names: 'y' if at least one matches,
    otherwise 'n'.

    Returns the flags joined with ';'. The result is ``''`` when ``sampdf``
    has no rows or when ``listofrcatags`` is not a string.
    """
    if not isinstance(listofrcatags, str):
        return ''

    wanted_tags = listofrcatags.split(';')
    rank_columns = ['top' + str(rank) for rank in range(1, 11)]

    flags = []
    for label in sampdf.index:
        ranked_tags = [df_rca.loc[label, column] for column in rank_columns]
        matched = any(tag in wanted_tags for tag in ranked_tags)
        flags.append('y' if matched else 'n')

    return ';'.join(flags)
    
    
def evaluate_presence_of_root_cause_tags(df_failure, alert_results, model_name, df_rca, output):
    """Record, per failure, whether its alerts point at the known root-cause tags.

    For every row label ``0 .. len(df_failure) - 1`` the rows of
    ``alert_results`` indexed within ``[failure - 14 days, failure)`` are
    selected. When that window is non-empty, the rows with
    ``system_status == 'online'``, ``prediction == -1`` and ``alert == 1`` are
    passed to ``isintop10`` together with the failure's 'RCA_Tags' value, and
    the resulting flag string is stored in ``<model_name>_relevanttags``.
    Rows whose window is empty are not written.

    ``output`` is modified in place and also returned.
    """
    lookback = timedelta(hours = 14*24)
    tags_column = model_name + '_relevanttags'

    for row in range(len(df_failure)):
        failure_time = df_failure.loc[row, 'Failure Date']
        window_start = failure_time - lookback

        in_window = (alert_results.index >= window_start) & (alert_results.index < failure_time)
        window = alert_results[in_window]
        if len(window) == 0:
            continue

        is_alert = (
            (window['system_status'] == 'online')
            & (window['prediction'] == -1)
            & (window['alert'] == 1)
        )
        alerts = window[is_alert]
        # Apply .count('y') to the isintop10 result to store a match count instead.
        output.loc[row, tags_column] = isintop10(df_rca, alerts, df_failure.loc[row, 'RCA_Tags'])

    return output


In [None]:
#for cnt in model_location:
#    print(cnt)

In [None]:
# Run each discovered model's prediction file through the three evaluation
# steps; every step adds that model's columns to the failure table.
k = 0  # position of the current model in model_names / mid
# NOTE: plain assignment, not a copy -- analysis_result_df and df_failure are
# the same object, so the columns written below also appear on df_failure.
analysis_result_df = df_failure
mid = create_model_ids(no_of_models)
for cnt in model_location:
    print('processing', model_names[k])
    alert_results = load_csv(cnt)
    # 1) alert count per failure -> column <model name>
    analysis_result_df = evaluate_number_of_alerts_before_failure(df_failure, alert_results, model_names[k], analysis_result_df)
    # 2) ids for those alerts -> column <model name>_AlertIds
    analysis_result_df = generate_alert_ids(analysis_result_df, model_names[k], mid[k])
    # 3) y/n flags against the RCA tag ranking -> column <model name>_relevanttags
    analysis_result_df = evaluate_presence_of_root_cause_tags(df_failure, alert_results, model_names[k], df_rca, analysis_result_df)
#    print(alert_results)
    k = k+1

In [None]:
#analysis_result_df.to_csv('Alert_Analysis_A.csv')