In [7]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings("ignore")

In [89]:
incident_df = pd.read_excel('incident_dataset.xlsx', sheet_name='Sheet2')

incident_df['Exception']  = sample_df['Exception'].fillna('NA')
incident_df['CloudHealthIndex']  = sample_df['CloudHealthIndex'].fillna('NA')


incidentTypesDict = dict(zip(incident_df['Label'].unique(), np.arange(incident_df['Label'].unique().size)))
exceptionTypes = dict(zip(incident_df['Exception'].unique(), np.arange(incident_df['Exception'].unique().size)))
cloudHealthValues = dict(zip(incident_df['CloudHealthIndex'].unique(), np.arange(incident_df['CloudHealthIndex'].unique().size)))
features_vars = incident_df.columns.drop(['Label'])


print('incidentTypesDict  = ', incidentTypesDict, '\n')
print('exceptionTypes  = ', exceptionTypes, '\n')
print('cloudHealthValues  = ', cloudHealthValues, '\n')
print('features_vars  = ', features_vars, '\n')

sample_df.head()


incidentTypesDict  =  {'NetworkIssue': 0, 'NoIssue': 1, 'DatabaseConnection': 2, 'CommunityHealthIssue': 3, 'Community-FromtDoorNotaccessible': 4, 'InvoiceIssue': 5, 'UnsearchableWorkspaces': 6, 'DataLoad Failure': 7} 

exceptionTypes  =  {'spanning tree event': 0, 'NA': 1, 'JDBC-connection-permit-failure': 2, '[AWGenericException: java.lang.IllegalStateException, Export-webservice-ConnectionTimeout]': 3, '[ScheduledTsak-ArchesBatchPublishInThisRealm-Failure, Arches Schema version mismatch]': 4, '[OutOfMemoryException, GT Nodes restarting]': 5} 

cloudHealthValues  =  {'FAIR': 0, 'GOOD': 1, 'POOR': 2, 'NA': 3, 'CRITICAL': 4} 

features_vars  =  Index(['Date', 'AvgBackgroundQ', 'AvgThreadPoolSize', 'AvgWorkflowQ',
       'CatalogSearchTime', 'IncreasingBGQueueTrend', 'IncreasingThreadTrend',
       'IncreasingWFQueueTrend', 'Exception', 'LogSizeVolumePercent',
       'NetworkConnectivitySNV-US1', 'IsProductReleased', 'UiNodeThreadsCount',
       'CloudHealthIndex'],
      dtype='object'

Unnamed: 0,Date,AvgBackgroundQ,AvgThreadPoolSize,AvgWorkflowQ,CatalogSearchTime,IncreasingBGQueueTrend,IncreasingThreadTrend,IncreasingWFQueueTrend,Exception,LogSizeVolumePercent,NetworkConnectivitySNV-US1,IsProductReleased,UiNodeThreadsCount,CloudHealthIndex,Label
0,2017-01-01 00:00:00,296.0,9.0,14.0,16.0,0,0,0,spanning tree event,39.0,1.0,1.0,13.0,FAIR,NetworkIssue
1,2017-01-01 00:30:00,284.0,9.0,8.0,15.0,0,0,0,spanning tree event,40.0,1.0,1.0,13.0,GOOD,NetworkIssue
2,2017-01-01 01:00:00,,9.0,14.0,13.0,0,0,0,spanning tree event,26.0,1.0,0.0,19.0,POOR,NetworkIssue
3,2017-01-01 01:30:00,289.0,8.0,14.0,11.0,0,0,0,spanning tree event,39.0,1.0,0.0,12.0,,NetworkIssue
4,2017-01-01 02:00:00,304.0,10.0,10.0,10.0,0,0,0,spanning tree event,29.0,1.0,1.0,11.0,GOOD,NetworkIssue


### Pipeline and Estimator to PreProcess incident Dataset

In [79]:
from sklearn.base import BaseEstimator, TransformerMixin


class Preprocessor(BaseEstimator, TransformerMixin):
    """Custom Preprocessing Estimator for the custom case
    
    """
    
    def __init___(self):
        pass
    
    def transform(self, df):
        
        df = df[features_vars]
        
        # Make all negative values to NaN so that it can be replaced with single value
        df['AvgBackgroundQ'][df['AvgBackgroundQ'] < 1] = np.NAN
        df['AvgThreadPoolSize'][df['AvgThreadPoolSize'] < 1] = np.NAN
        df['AvgWorkflowQ'][df['AvgWorkflowQ'] < 1] = np.NAN
        df['CatalogSearchTime'][df['CatalogSearchTime'] < 1] = np.NAN
        df['UiNodeThreadsCount'][df['UiNodeThreadsCount'] < 1] = np.NAN
        
        df = df.fillna(-999)
        
        df1 = df.apply(lambda rec : pd.Series({'Day' : rec['Date'].day, 
                      'Month' : rec['Date'].month, 
                      'Year' :  rec['Date'].year,
                      'Hour' : rec['Date'].hour, 
                      'Minute' : rec['Date'].minute, 
                      'Second' :  rec['Date'].second,
                      'DayOfYear' : rec['Date'].dayofyear, 
                      'DayOfWeek' : rec['Date'].dayofweek, 
                      'WeekOfYear' :  rec['Date'].weekofyear,
                      'WeekOfYear' : rec['Date'].weekofyear, 
                      'Quarter' : rec['Date'].quarter,
                      'IsWeekend' : int(rec['Date'].dayofweek > 4), 
                      'IsMonthStart' : int(rec['Date'].is_month_start),
                      'IsMonthEnd' :  int(rec['Date'].is_month_end)
                     }), axis = 1)
        
        df = pd.concat([df, df1], axis=1)

        df.replace({'CloudHealthIndex': cloudHealthValues}, inplace=True)
        df.replace({'Exception': exceptionTypes}, inplace=True)
        
        df = df[df.columns.drop(['Date'])]
        return df  #.as_matrix()
    
    def fit(self, df, y=None, **fit_params):
        
        return self;
        
        


In [97]:
# Get the y values from DataFram
y = incident_df['Label'].replace(incidentTypesDict)
y.head()

0    0
1    0
2    0
3    0
4    0
Name: Label, dtype: int64

In [98]:
preprocess = Preprocessor()

X = preprocess.transform(sample_df)
X.head()

Unnamed: 0,AvgBackgroundQ,AvgThreadPoolSize,AvgWorkflowQ,CatalogSearchTime,IncreasingBGQueueTrend,IncreasingThreadTrend,IncreasingWFQueueTrend,Exception,LogSizeVolumePercent,NetworkConnectivitySNV-US1,...,Hour,Minute,Second,DayOfYear,DayOfWeek,WeekOfYear,Quarter,IsWeekend,IsMonthStart,IsMonthEnd
0,296.0,9.0,14.0,16.0,0,0,0,0,39.0,1.0,...,0,0,0,1,6,52,1,1,1,0
1,284.0,9.0,8.0,15.0,0,0,0,0,40.0,1.0,...,0,30,0,1,6,52,1,1,1,0
2,-999.0,9.0,14.0,13.0,0,0,0,0,26.0,1.0,...,1,0,0,1,6,52,1,1,1,0
3,289.0,8.0,14.0,11.0,0,0,0,0,39.0,1.0,...,1,30,0,1,6,52,1,1,1,0
4,304.0,10.0,10.0,10.0,0,0,0,0,29.0,1.0,...,2,0,0,1,6,52,1,1,1,0


### Split the dataset in Training and Test data

In [94]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X,y)

### Train the algorithm using Classifier

In [95]:
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, y_train)

knn.score(X_test, y_test)

0.9169960474308301

### Save the Model in file

In [96]:
import pickle
import dill as pickle

filename = 'model_v2.pk'

with open('models/'+filename, 'wb') as file:
    pickle.dump(knn, file)

In [101]:
knn.predict(X_test)

array([1, 1, 1, 1, 1, 1, 1, 7, 4, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 4,
       1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1,
       1, 1, 1, 6, 1, 4, 6, 2, 1, 1, 7, 1, 1, 1, 5, 1, 5, 1, 1, 1, 1, 1,
       1, 4, 1, 1, 1, 1, 1, 1, 6, 1, 1, 1, 3, 1, 4, 1, 1, 7, 4, 1, 1, 1,
       1, 1, 5, 1, 4, 1, 1, 1, 6, 1, 1, 6, 7, 6, 1, 5, 1, 1, 1, 1, 5, 1,
       1, 1, 4, 1, 1, 6, 4, 1, 1, 1, 1, 1, 1, 1, 1, 1, 5, 1, 1, 1, 6, 1,
       1, 1, 1, 1, 1, 1, 5, 1, 1, 1, 1, 1, 1, 4, 1, 1, 4, 1, 1, 1, 1, 1,
       6, 0, 0, 1, 1, 1, 1, 5, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 7, 1, 1, 5, 1, 1, 1, 1, 1, 1, 1, 5, 0, 4, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 4, 1, 1, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0,
       1, 1, 1, 1, 1, 4, 1, 1, 1, 1, 1], dtype=int64)