In [1]:
import pandas as pd
import numpy as np

In [3]:
# Load the sample system-logs workbook into a DataFrame. The path is relative,
# so it is resolved against the notebook's current working directory.
# NOTE(review): the preview below shows an 'Unnamed: 0' column, presumably a row
# index that was saved with the sheet - confirm before relying on it.
sampleDf = pd.read_excel('SystemLogsDataset-temp.xlsx')
# Last expression of the cell: renders the first rows as the cell output.
sampleDf.head()

Unnamed: 0,AvgBackgroundQ,AvgThreadPoolSize,AvgWorkflowQ,CatalogSearchTime,CloudHealthIndex,Date,IsProductReleased,Label,LogSizeVolumePercent,NetworkConnectivitySNV-US1,NodeId,UiNodeThreadsCount
0,302,7,32,10,CRITICAL,2017-01-01 00:00:00,0,NetworkIssue,0.087,1,63,403
1,203,3,19,8,GOOD,2017-01-01 00:30:00,0,CommunityIssue,0.956,1,62,387
2,270,10,20,17,CRITICAL,2017-01-01 01:00:00,0,CommunityIssue,0.569,0,73,131
3,135,1,34,34,FAIR,2017-01-01 01:30:00,1,CommunityHealthIssue,0.048,1,65,22
4,118,14,12,14,FAIR,2017-01-01 02:00:00,1,OrderIssue,0.692,1,74,28


### Pipeline and Estimator

In [4]:
from sklearn.base import BaseEstimator, TransformerMixin



class Preprocessor(BaseEstimator, TransformerMixin):
    """Stateless preprocessing step for the system-logs dataset.

    ``transform`` keeps the expected input columns, treats readings below 1
    in the queue/count columns as missing, replaces every missing value with
    the sentinel ``-999`` and appends calendar features derived from the
    ``Date`` column.
    """

    # Columns kept from the input frame, in output order.
    # NOTE(review): 'Label' looks like the prediction target, and 'Date' /
    # 'CloudHealthIndex' are non-numeric - confirm they are handled downstream.
    _PRED_VARS = [
        'AvgBackgroundQ', 'AvgThreadPoolSize', 'AvgWorkflowQ',
        'CatalogSearchTime', 'CloudHealthIndex', 'Date', 'IsProductReleased',
        'Label', 'LogSizeVolumePercent', 'NetworkConnectivitySNV-US1', 'NodeId',
        'UiNodeThreadsCount',
    ]

    # Columns in which any value below 1 (zero or negative) counts as missing.
    _POSITIVE_ONLY = [
        'AvgBackgroundQ', 'AvgThreadPoolSize', 'AvgWorkflowQ',
        'CatalogSearchTime', 'UiNodeThreadsCount',
    ]

    # Sentinel written in place of missing values.
    _MISSING = -999

    def __init__(self):
        # No hyper-parameters; the explicit, argument-free constructor keeps
        # the class compatible with BaseEstimator's parameter introspection.
        pass

    def fit(self, df, y=None, **fit_params):
        """Do nothing and return ``self``; the transformer learns no state.

        Present so that ``fit_transform`` and pipelines can call it.
        """
        return self

    def transform(self, df):
        """Return a cleaned copy of ``df`` with calendar features appended.

        Parameters
        ----------
        df : pandas.DataFrame
            Must contain every column listed in ``_PRED_VARS``. ``Date`` must
            be datetime-like (or parseable by ``pandas.to_datetime``) and
            must not contain missing values.

        Returns
        -------
        pandas.DataFrame
            The selected input columns followed by ``Day``, ``Month``,
            ``Year``, ``Hour``, ``Minute``, ``Second``, ``DayOfYear``,
            ``DayOfWeek``, ``WeekOfYear``, ``Quarter``, ``IsWeekend``,
            ``IsMonthStart`` and ``IsMonthEnd``. The caller's frame is not
            modified.

        Raises
        ------
        KeyError
            If an expected column is absent from ``df``.
        ValueError
            If ``Date`` contains missing values.
        """
        # Work on an explicit copy so the assignments below never write
        # through to (or silently miss) the caller's frame.
        frame = df[self._PRED_VARS].copy()

        # Readings below 1 are invalid; blank them so they share the single
        # missing-value sentinel applied below.
        readings = frame[self._POSITIVE_ONLY]
        frame[self._POSITIVE_ONLY] = readings.mask(readings < 1)

        # Parse dates before filling so a missing date is reported clearly
        # instead of turning into the integer sentinel.
        dates = pd.to_datetime(frame['Date'])
        if dates.isna().any():
            raise ValueError(
                "'Date' contains missing values; cannot derive calendar features"
            )

        frame = frame.fillna(self._MISSING)

        weekday = dates.dt.dayofweek
        calendar = pd.DataFrame({
            'Day': dates.dt.day,
            'Month': dates.dt.month,
            'Year': dates.dt.year,
            'Hour': dates.dt.hour,
            'Minute': dates.dt.minute,
            'Second': dates.dt.second,
            'DayOfYear': dates.dt.dayofyear,
            'DayOfWeek': weekday,
            # ISO week number; replaces the removed ``weekofyear`` attribute.
            'WeekOfYear': dates.dt.isocalendar().week.astype('int64'),
            'Quarter': dates.dt.quarter,
            # Monday is 0, so 5 and 6 are Saturday and Sunday.
            'IsWeekend': (weekday > 4).astype(int),
            'IsMonthStart': dates.dt.is_month_start.astype(int),
            'IsMonthEnd': dates.dt.is_month_end.astype(int),
        }, index=frame.index)

        return pd.concat([frame, calendar], axis=1)
        
        
