# Prepare synthetic data for simulating health information in normal and abnormal behavior

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
normal_cpu_utilization = np.random.normal(0.6,0.1, 100)
abnormal_cpu_utilization = np.random.normal(0.9,0.1, 10)
normal_free_heap_percent = np.random.normal(0.4,0.1, 100)
normal_jvm_old_generation_used = np.random.normal(0.5,0.2, 100)
# normal_cpu_utilization

## Features Identified

feature 1: avgbackground queue, (1500) [tasknode] <br>
feature 2: avg workflow queue, (55) [tasknode]<br>
feature 3: avg node manager thread pool size, (5.57) [tasknode] [feature 1,2,3 are related]<br>
 <br>
feature 4 : node id (need to revisit) [can we put node id as feature so that we can backtrack it later ???]<br>
feature 5: index=mail, failed post output=ssl ctx set options [source : splunk search] ( 0 normal, 5000+ abnormal)<br>
feature 6: product release (value 0,1)<br>
feature 7: ui node thread count (normal 250, abnormal 500+) (ims990)<br>
feature 8: ariba.ui.aribaweb.util.AWGenericException: java.lang.IllegalStateException: splunk search linked with feature 7<br>
feature 9: scheduletaskstatustab query arches batch publishinselectedrealm job exception <br>
<br>
feature 10: catalogue search time.(38) linked with 1,2,3 feature<br>
feature 11: no network connectivity between SNV and US1 - ims-983<br>
feature 12: 4310024, "Failed to get JDBC connection permit ims-980 (normal 0-100, abnormal 100-1000)<br>
feature 13: node status fair to critical its-980 (linked with feature 12)<br>
feature 14: cloud health index down more than N time in X minutes<br>
<br>
Feature 15: JVM Heap size getting filled & garbage collector not able to clean the data & frequently garbage collector is running<br>
Feature 16: log size getting filled drastically 10%->100% in an hour<br>
Feature 17: GT node restarting [Need more data]<br>
</p>

RecordDate  - Record date and time <br>

In [3]:
columns = ['RecordDate','AvgBackgroundQ', 'AvgThreadPoolSize', 'AvgWorkflowQ', 'CatalogSearchTime', 
           'Exception', 'LogSizeVolumePercent', 'NetworkConnectivitySNV-US1',  'NodeId', 'IsProductReleased',
          'UiNodeThreadsCount','CloudHealthIndex', 'Label'  
          ]

print('Dataset columns \n' , columns)

Dataset columns 
 ['RecordDate', 'AvgBackgroundQ', 'AvgThreadPoolSize', 'AvgWorkflowQ', 'CatalogSearchTime', 'Exception', 'LogSizeVolumePercent', 'NetworkConnectivitySNV-US1', 'NodeId', 'IsProductReleased', 'UiNodeThreadsCount', 'CloudHealthIndex', 'Label']


In [4]:
recordsCount = 10000

In [5]:
# Average BackgroudQ value is around 100-300 for normal & 1000+ abnormally
avgBackgroundQ = np.round(np.random.normal(200, 100, recordsCount-100)).astype(int)

# add abnormal values
avgBackgroundQ = np.append(avgBackgroundQ, np.round(np.random.normal(1000, 100, 100)).astype(int))


#Average WorkflowQ value is around 10-40
avgWorkflowQ = np.round(np.random.normal(20, 10, recordsCount)).astype(int)

#Average ThreadPoolSize would be around 8-20
avgThreadPoolSize = np.round(np.random.normal(8, 4, recordsCount)).astype(int)

# Most of the records don't have ID -1 represents no nodeid
nodeId = np.random.choice(np.append(np.arange(60,80), [-999]),recordsCount)

# Indicates if product is released
isProductReleased = np.random.choice([0,1], recordsCount)

# UI health index data 
uiNodeThreadCount = np.round(np.random.normal(250, 200, recordsCount)).astype(int)

# CloudHealthIndex
cloudHealthValues = ['FAIR', 'GOOD', 'CRITICAL']
cloudHealthIndex = np.random.choice(cloudHealthValues, recordsCount)

# Exception occurred
exception = np.random.choice(['AWGenericException: java.lang.IllegalStateException','java.lang.OutOfMemoryException',
                              'NA', 'javax.net.ssl.SSLHandshakeException', 'spanning tree event',
                              'JDBC-connection-permit-failure', '[OutOfMemoryException, GT Nodes restarting]'], recordsCount)

# Catelog search time
catalogSearchTime = np.round(np.random.normal(20, 10, recordsCount)).astype(int)

# Network Connectivity broken ?
nwConnectivitySNV_US1 = np.random.choice([0,1], recordsCount)

# LogSize Volume Percent 0-1 . 
logSizeVolumePercent = np.round(np.random.random(recordsCount),decimals=3)
#np.random.randint(50,100,recordsCount)

# Record dates for every 30min from 1 Jan 2017, taking only given number of records
record_dates = pd.date_range(pd.to_datetime('01-01-2017', format='%d-%m-%Y'), 
                                  pd.to_datetime('31-12-2017', format='%d-%m-%Y'), freq='30min')[:recordsCount]


print('avgBackgroundQ = ', avgBackgroundQ)
print('avgWorkflowQ = ', avgWorkflowQ)
print('avgThreadPoolSize = ', avgThreadPoolSize)
print('nodeId = ', nodeId)
print('isProductReleased = ', isProductReleased)
print('uiNodeThreadCount = ', uiNodeThreadCount)
print('cloudHealthIndex = ', cloudHealthIndex)
print('exception = ', exception)
print('catalogSearchTime = ', catalogSearchTime)
print('nwConnectivitySNV_US1 = ', nwConnectivitySNV_US1)
print('logSizeVolumePercent = ', logSizeVolumePercent)
print('record_dates =\n', record_dates)


avgBackgroundQ =  [186 234 179 ... 934 932 901]
avgWorkflowQ =  [43 31 23 ...  8 25 24]
avgThreadPoolSize =  [11 14  6 ... 10 14 12]
nodeId =  [  78   67   70 ...   66 -999   79]
isProductReleased =  [0 1 1 ... 1 1 1]
uiNodeThreadCount =  [499  64   5 ...  57 516 308]
cloudHealthIndex =  ['FAIR' 'FAIR' 'CRITICAL' ... 'GOOD' 'CRITICAL' 'CRITICAL']
exception =  ['spanning tree event' 'spanning tree event'
 'java.lang.OutOfMemoryException' ... 'NA'
 'JDBC-connection-permit-failure' 'JDBC-connection-permit-failure']
catalogSearchTime =  [18 19 10 ...  8 24 30]
nwConnectivitySNV_US1 =  [0 0 1 ... 0 1 0]
logSizeVolumePercent =  [0.438 0.693 0.804 ... 0.414 0.294 0.802]
record_dates =
 DatetimeIndex(['2017-01-01 00:00:00', '2017-01-01 00:30:00',
               '2017-01-01 01:00:00', '2017-01-01 01:30:00',
               '2017-01-01 02:00:00', '2017-01-01 02:30:00',
               '2017-01-01 03:00:00', '2017-01-01 03:30:00',
               '2017-01-01 04:00:00', '2017-01-01 04:30:00',
       

In [6]:
# There are fixed Issue category

incidentTypes = ['NoIssue', 'DBConnectionIssue', 'InvoiceIssue', 'OrderIssue', 'CommunityIssue',
                 'WorkspaceIssue' , 'NetworkIssue', 'CommunityHealthIssue' ]

labels = np.random.choice(incidentTypes, recordsCount)
print('labels = ', +labels)


labels =  ['DBConnectionIssue' 'NoIssue' 'InvoiceIssue' ... 'CommunityIssue'
 'DBConnectionIssue' 'CommunityHealthIssue']


In [7]:
recordsDict= {
    'AvgBackgroundQ' : avgBackgroundQ, 'AvgWorkflowQ' : avgWorkflowQ, 'AvgThreadPoolSize' : avgThreadPoolSize,
    'NodeId' : nodeId, 'IsProductReleased' : isProductReleased, 'UiNodeThreadsCount':uiNodeThreadCount , 
     'CatalogSearchTime' : catalogSearchTime, 
    'NetworkConnectivitySNV-US1': nwConnectivitySNV_US1, 'LogSizeVolumePercent' : logSizeVolumePercent,
    'Label' : labels, 'Date' : record_dates, 'CloudHealthIndex' : cloudHealthIndex
}
# 'Exception' : exception, is removed for now

sample1 = pd.DataFrame(data=recordsDict)
sample1.head()

Unnamed: 0,AvgBackgroundQ,AvgThreadPoolSize,AvgWorkflowQ,CatalogSearchTime,CloudHealthIndex,Date,IsProductReleased,Label,LogSizeVolumePercent,NetworkConnectivitySNV-US1,NodeId,UiNodeThreadsCount
0,186,11,43,18,FAIR,2017-01-01 00:00:00,0,DBConnectionIssue,0.438,0,78,499
1,234,14,31,19,FAIR,2017-01-01 00:30:00,1,NoIssue,0.693,0,67,64
2,179,6,23,10,CRITICAL,2017-01-01 01:00:00,1,InvoiceIssue,0.804,1,70,5
3,266,11,6,10,FAIR,2017-01-01 01:30:00,0,OrderIssue,0.329,1,67,267
4,249,11,39,29,GOOD,2017-01-01 02:00:00,1,NoIssue,0.683,1,63,746


In [8]:
sample1.to_excel('SystemLogsDataset-temp.xlsx',index=False)

## SystemLogsDataset is created
#### Now we will train the model

### Feature Generation & Normalization

In [9]:
# Consider all negative values for AvgBackgroundQ, AvgThreadPoolSize, AvgWorkflowQ, CatalogSearchTime, UiNodeThreadsCount
# as Missing values and replace with -999

df = sample1.copy()

df['AvgBackgroundQ'][df['AvgBackgroundQ'] < 1] = np.NAN
df['AvgThreadPoolSize'][df['AvgThreadPoolSize'] < 1] = np.NAN
df['AvgWorkflowQ'][df['AvgWorkflowQ'] < 1] = np.NAN
df['CatalogSearchTime'][df['CatalogSearchTime'] < 1] = np.NAN
df['UiNodeThreadsCount'][df['UiNodeThreadsCount'] < 1] = np.NAN

# df['AvgBackgroundQ'].hist()
# df['AvgThreadPoolSize'].hist()
# df['AvgWorkflowQ'].hist()
# df['CatalogSearchTime'].hist()
# df['UiNodeThreadsCount'].hist()

# Fill NaN values with -999
df = df.fillna(-999)


#Assign number to label
df['Label'] = df['Label'].apply(incidentTypes.index)
df['CloudHealthIndex'] = df['CloudHealthIndex'].apply(cloudHealthValues.index)

# Create new features from date
df['Day'] = df['Date'].dt.day
df['Month'] = df['Date'].dt.month
df['Year'] = df['Date'].dt.year
df['Hour'] = df['Date'].dt.hour
df['Minute'] = df['Date'].dt.minute
df['Second'] = df['Date'].dt.second

df['DayOfYear'] = df['Date'].dt.dayofyear
df['DayOfWeek'] = df['Date'].dt.dayofweek
df['WeekOfYear'] = df['Date'].dt.weekofyear

df['WeekOfYear'] = df['Date'].dt.weekofyear
df['Quarter'] = df['Date'].dt.quarter

df['IsWeekend'] = (df['Date'].dt.dayofweek > 4).astype(int)
df['IsMonthStart'] = (df['Date'].dt.is_month_start).astype(int)
df['IsMonthEnd'] = (df['Date'].dt.is_month_end).astype(int)

df.head()



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  import sys
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  # Rem

Unnamed: 0,AvgBackgroundQ,AvgThreadPoolSize,AvgWorkflowQ,CatalogSearchTime,CloudHealthIndex,Date,IsProductReleased,Label,LogSizeVolumePercent,NetworkConnectivitySNV-US1,...,Hour,Minute,Second,DayOfYear,DayOfWeek,WeekOfYear,Quarter,IsWeekend,IsMonthStart,IsMonthEnd
0,186.0,11.0,43.0,18.0,0,2017-01-01 00:00:00,0,1,0.438,0,...,0,0,0,1,6,52,1,1,1,0
1,234.0,14.0,31.0,19.0,0,2017-01-01 00:30:00,1,0,0.693,0,...,0,30,0,1,6,52,1,1,1,0
2,179.0,6.0,23.0,10.0,2,2017-01-01 01:00:00,1,2,0.804,1,...,1,0,0,1,6,52,1,1,1,0
3,266.0,11.0,6.0,10.0,0,2017-01-01 01:30:00,0,3,0.329,1,...,1,30,0,1,6,52,1,1,1,0
4,249.0,11.0,39.0,29.0,1,2017-01-01 02:00:00,1,0,0.683,1,...,2,0,0,1,6,52,1,1,1,0


In [16]:
features = df.columns.drop(['Label', 'Date'])

X = df[features]
y = df['Label']


### Training a model 

In [33]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X,y)

# K-NN 
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, y_train)
knn.score(X_test, y_test)

0.112

## Saving a model

In [34]:
import pickle
import dill as pickle

filename = 'model_v1.pk'

with open('models/'+filename, 'wb') as file:
    pickle.dump(knn, file)

## Loading a model

In [35]:
with open('models/'+filename ,'rb') as f:
    loaded_model = pickle.load(f)
    
loaded_model.score(X_test, y_test)

0.112

In [44]:
predicitons = loaded_model.predict(X_test)
predNames = pd.Series(predicitons).apply(lambda x : incidentTypes[x])
predNames.head()

0    CommunityHealthIssue
1            InvoiceIssue
2       DBConnectionIssue
3            InvoiceIssue
4          WorkspaceIssue
dtype: object