In [1]:
import pandas as pd

# Load the csv into a df
df = pd.read_csv('Network-heal/predictedLogs.csv', sep=',', engine='python')

# Create a noise-db
noise_events = ['audispd node manag type', 'comp manag subcomp sha usernam root']

In [11]:
# Remove rows with col-newText is Nan
df = df[df['newText'].notna()]

# Assign a unique value to each error-type
error_dict = {}
index = 0

for error in set(df['newText']):
    for noise in noise_events:
        if noise in error:
            continue
    error_dict[error] = index
    index += 1

# Check error_dict
print(len(error_dict.keys()))
print(error_dict.keys())

15154
dict_keys(['bd test export comp esx subcomp mpa client tid level info hostnodestatusvert mpaclienthostservic receiv publish msg type com manag agg messag pulloverwritestatsrequestmsg corelationid e bcc b aph c ee e daa cfb', 'test vsanmgmtsvc error vsand opid mainthread vsanhosthelp iswitnesshost vsi get vmkmodul vsanutil iswit except becasu bad paramet', 'b test export comp esx subcomp mpa client tid level info aggsvc switch mpaclienthostservic receiv publish msg type com manag agg messag clientdatarequestmsg corelationid b f eac aph c ee e daa cfb', 'test export comp esx subcomp mpa client tid level info hostnodestatusvert mpaclienthostservic receiv publish msg type com manag agg messag pulloverwritestatsrequestmsg corelationid cc ae aph c ee e daa cfb', 'test export comp esx subcomp mpa client tid level info hostnodestatusvert mpaclienthostservic receiv publish msg type com manag agg messag clientdatarequestmsg corelationid fd bb f aph c ee e daa cfb', 'test export comp esx su

In [3]:
TX_PER_ROW = 50
df_new = pd.DataFrame(columns = ['Transaction', 'Error IDs'])
outer_row = 0
inner_row = 1
total_rows = df.shape[0]

while outer_row < 1000:
    itemsets = []
    while inner_row % TX_PER_ROW != 0:
        error = df['newText'][inner_row]
        for i in noise:
            if i in error:
                continue
        # Make itemsets unique
        error_code = str(error_dict[error])
        if error_code not in itemsets:
            itemsets.append(error_code)
        inner_row += 1
    df_new.at[outer_row, 'Transaction'] = 't-%s' %inner_row 
    df_new.at[outer_row, 'Error IDs'] = itemsets
    outer_row += 1
    # Incr. inner_row for the case we will meet condition
    inner_row += 1

df_new.head()

Unnamed: 0,Transaction,Error IDs
0,t-50,"[12675, 5715, 3533, 13321, 10346, 7144, 264, 1..."
1,t-100,"[10346, 7144, 264, 5715, 3533, 13321, 4120, 11..."
2,t-150,"[264, 5715, 3533, 13321, 11694, 14278, 10346, ..."
3,t-200,"[5552, 5844, 6355, 4120, 13321, 1831, 3709]"
4,t-250,"[4120, 13321, 1795, 7935, 7865, 2662, 8865, 98..."


In [4]:
from apyori import apriori
# records will be placeholder for all labels
# type - list(lists)
records = []
rows = 1000
cols = 2

# Walk the data-frame and populate records
for row in range(0, rows):
    records.extend([df_new.values[row, col] for col in range(1, cols)])

print(records[:5])

[['12675', '5715', '3533', '13321', '10346', '7144', '264', '14565', '14913', '4755', '4557', '4120'], ['10346', '7144', '264', '5715', '3533', '13321', '4120', '11683', '10018', '5734'], ['264', '5715', '3533', '13321', '11694', '14278', '10346', '7144', '4120', '11683', '2991', '9888', '9441', '1772', '9140', '7826', '13212', '6355', '5844'], ['5552', '5844', '6355', '4120', '13321', '1831', '3709'], ['4120', '13321', '1795', '7935', '7865', '2662', '8865', '9868', '14350', '7373', '12922', '8325', '5923', '5623', '2008', '10413', '13982', '4329', '11694', '4266', '9598', '1320', '11026', '8368']]


In [5]:
# Run the apriori with the following support, confidence and lift parameters:
#  - Support: Popularity index
#    Support(A) = Transactions with (A) / Total transactions 
#. - Min-confidence: Likelihood that B also occurs when A occurs
#.   Confidence(A->B) = Transactions with (A+B) / Total transactions with A
#  - Min-lift: Increase in occurence B when A occurs
#    Lift(A->B) = Confidence(A->B) / Support(A)
association_rules = apriori(records, min_support=0.1, min_confidence=0.5, min_lift=2, min_length=3)
association_results = list(association_rules)

In [6]:
print(association_results[:5])

[RelationRecord(items=frozenset({'1', '10964'}), support=0.101, ordered_statistics=[OrderedStatistic(items_base=frozenset({'1'}), items_add=frozenset({'10964'}), confidence=0.9805825242718448, lift=9.61355415952789), OrderedStatistic(items_base=frozenset({'10964'}), items_add=frozenset({'1'}), confidence=0.9901960784313727, lift=9.61355415952789)]), RelationRecord(items=frozenset({'1', '13749'}), support=0.101, ordered_statistics=[OrderedStatistic(items_base=frozenset({'1'}), items_add=frozenset({'13749'}), confidence=0.9805825242718448, lift=9.33888118354138), OrderedStatistic(items_base=frozenset({'13749'}), items_add=frozenset({'1'}), confidence=0.961904761904762, lift=9.33888118354138)]), RelationRecord(items=frozenset({'14766', '1'}), support=0.102, ordered_statistics=[OrderedStatistic(items_base=frozenset({'1'}), items_add=frozenset({'14766'}), confidence=0.9902912621359223, lift=9.255058524634789), OrderedStatistic(items_base=frozenset({'14766'}), items_add=frozenset({'1'}), con

In [7]:
# Make a dataframe for all associations
df_arm = pd.DataFrame(columns = ['Association', 'Support', 'Confidence', 'Lift'])

o_index = 0
for res in association_results:
    items = list(res[0])
    items_link = ''
    for i_index in range(len(items)):
        # error_index = list(error_dict.values()).index(int(items[i_index]))
        if i_index == len(items) - 1:
            # items_link += list(error_dict.keys())[error_index]
            items_link += items[i_index]
        else:
            # items_link += list(error_dict.keys())[error_index] + ' -> '  
            items_link += items[i_index] + ' -> '
    support = res[1]
    confidence = res[2][0][2]
    lift = res[2][0][3]
    # print(items_link)
    df_arm.loc[o_index] = [items_link, support, confidence, lift]
    o_index += 1

In [19]:
# Sort the dataframe by Confidence
df_arm_sort = df_arm.sort_values('Confidence', ascending=False)

# Examine the df_arm dataframe
df_arm_sort.head(50)

Unnamed: 0,Association,Support,Confidence,Lift
149,8325 -> 7144 -> 10346,0.131,1.0,2.03252
2474,11694 -> 5715 -> 10346 -> 264 -> 7144,0.162,1.0,2.03666
483,5715 -> 3533 -> 9868,0.133,1.0,2.0
836,7865 -> 3533 -> 264 -> 10346,0.147,1.0,2.024291
837,3533 -> 7935 -> 264 -> 10346,0.145,1.0,2.024291
159,14766 -> 13749 -> 10964,0.102,1.0,9.52381
852,5623 -> 264 -> 10346 -> 5715,0.122,1.0,2.024291
860,5715 -> 5923 -> 264 -> 10346,0.12,1.0,2.024291
863,5715 -> 7865 -> 264 -> 10346,0.146,1.0,2.024291
864,5715 -> 7935 -> 264 -> 10346,0.144,1.0,2.024291


In [28]:
# Examine a few examples by hand

def convert_err_codes(error_codes):
    for i in range(len(error_codes)):
        error_code = error_codes[i]
        error_index = list(error_dict.values()).index(error_code)
        error_event = list(error_dict.keys())[error_index]
        if i == len(error_codes) - 1:
            print(error_event)
        else:
            print(error_event + '->')
    print('-----')
        
        
# Row-4 (old - before removing noise):        
# 4 	 747 -> 13138 -> 11274	0.481	0.977642	2.019922
# convert_err_codes([747, 13138, 11274])

# Row-48 (old - before removing noise):
# 48	5693 -> 747 -> 8997 -> 2526	0.458	0.927126	2.019881
# convert_err_codes([747, 8997, 2526])

# Shows good co-relation between the events. In this case, the separate events are related to some root account settings

# Row-299	 15125 -> 8997 -> 11274	0.127	1.0	2.000000
# print(convert_err_codes([15125, 8997, 11274]))
# print(convert_err_codes([747, 722, 8997, 13138, 11274]))
# print(convert_err_codes([15125, 747, 11274, 2526]))

existing_index = []
for row in df_arm_sort:
    associations = df_arm_sort['Association'][1]
    for i in str(associations).split('->'):
        print(i)

# 483	5715 -> 3533 -> 9868	0.133	1.0	2.000000
print(convert_err_codes([5715, 3533, 9868]))

# 159	14766 -> 13749 -> 10964	0.102	1.0	9.523810
print(convert_err_codes([14766, 13749, 10964]))

1 
 13749
1 
 13749
1 
 13749
1 
 13749
audispd node manag type cred disp msg audit pid uid auid ses subj unconfin msg op pam setcr acct root exe usr bin sudo hostnam addr termin res success->
audispd node manag type user end msg audit pid uid auid ses subj unconfin msg op pam session close acct root exe usr bin sudo hostnam addr termin res success->
comp manag subcomp sha usernam root level info comp data whiteboard tree metric match prefix sha plugin id cd f f bae action result rmdi return direct
-----
None
test vsansystem info vsansystem vsan sub lib vsaninfoimpl load dit subclust config store normal node->
test vsansystem info vsansystem vsan sub lib vsanconfigstor get datastorenam->
test vsansystem info vsansystem vsan sub lib vsaninfoimpl assign default datastor
-----
None


In [35]:
# TO-DO
# Build fault, remediation database (in memory)

149                                  8325 -> 7144 -> 10346
2474                 11694 -> 5715 -> 10346 -> 264 -> 7144
483                                   5715 -> 3533 -> 9868
836                           7865 -> 3533 -> 264 -> 10346
837                           3533 -> 7935 -> 264 -> 10346
                               ...                        
39179    5715 -> 5923 -> 7373 -> 8865 -> 8325 -> 3533 -...
39178    5715 -> 5923 -> 8865 -> 3533 -> 9868 -> 264 ->...
39176    5715 -> 5923 -> 8865 -> 8325 -> 3533 -> 264 ->...
39174    5715 -> 5923 -> 7373 -> 8865 -> 3533 -> 264 ->...
37633    13321 -> 8865 -> 5923 -> 4120 -> 9868 -> 264 -...
Name: Association, Length: 65897, dtype: object

In [None]:
# TO-DO - Machine learning models 
#  | - Decision tree 
#  | - Esemble methods - Random Forest (flavor of DT)