In [1]:
import pandas as pd
import numpy as np

In [2]:
# Number of training files to load, entered interactively.
# Reject 0 / negative counts here: otherwise nothing is loaded and the later
# concatenation step fails with an unrelated-looking error.
num_train_files = int(input("Enter the number of files for training? "))
if num_train_files < 1:
    raise ValueError(
        "At least one training file is required, got {}".format(num_train_files)
    )

Enter the number of files for training? 3


## Data Preparation

- Load the file and Add the headers
- Convert the timestamp column from object dtype to datetime64
- Extracting the hour from timestamp
- Based on statistical analysis, split each day's data into two periods: Day (3am to 2pm) and Night (2pm to 3am)

In [3]:
def data_prep(filename):
    """Read a header-less CSV file and name its two columns.

    Parameters
    ----------
    filename : str or file-like
        Passed unchanged to ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The file contents with columns ``TimeStamp`` and ``Values``.
        Assigning the two names fails inside pandas if the file does not
        have exactly two columns.
    """
    column_names = ["TimeStamp", "Values"]

    # header=None: the first line of the file is data, not column names.
    frame = pd.read_csv(filename, header=None)
    frame.columns = column_names

    return frame

In [4]:
def timeperiod(data, day_start=3, day_end=14):
    """Parse the timestamps and label every row as day (0) or night (1).

    The frame is modified in place (``TimeStamp`` is converted to datetime and
    an ``Hour`` column holding the label is added) and the same object is
    returned.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a ``TimeStamp`` column in ``%Y-%m-%d %H:%M:%S`` format.
    day_start : int, default 3
        First hour (inclusive) that counts as day.
    day_end : int, default 14
        First hour (exclusive end of day) that counts as night again.

    Returns
    -------
    pandas.DataFrame
        ``data`` itself, with ``Hour`` set to 0 for ``day_start <= hour < day_end``
        and 1 for every other hour. Rows whose timestamp is missing keep a
        missing ``Hour``.

    Raises
    ------
    ValueError
        If the boundaries do not satisfy ``0 <= day_start < day_end <= 24``.
    """
    if not 0 <= day_start < day_end <= 24:
        raise ValueError(
            "Expected 0 <= day_start < day_end <= 24, got day_start={}, day_end={}".format(
                day_start, day_end
            )
        )

    # Converting object into Timestamp
    data['TimeStamp'] = pd.to_datetime(data['TimeStamp'], format='%Y-%m-%d %H:%M:%S')

    # Extracting the hour from timestamp
    hour_of_day = data['TimeStamp'].dt.hour

    ### Based on Statistical Distribution ####
    # Timestamp between 3am and 2pm is Day (default boundaries)
    # Timestamp between 2pm and 3am is Night
    # Day is 0 and Night is 1
    #
    # Both masks are computed from the raw hours *before* any label is written,
    # so the labels 0/1 can never be mistaken for hours 0/1.
    is_day = (hour_of_day >= day_start) & (hour_of_day < day_end)
    is_night = (hour_of_day < day_start) | (hour_of_day >= day_end)

    data['Hour'] = hour_of_day
    data.loc[is_day, 'Hour'] = 0
    data.loc[is_night, 'Hour'] = 1

    return data

## Training

- Concatenating the June 13, 14 and 16 data
- Find the Interquartile range (IQR)
- Define outliers based on 
    - Values less than (25th percentile - 2.5* IQR)
    - Values greater than (75th percentile + 2.5*IQR)
- Chose 2.5 times the IQR instead of the standard 1.5 times to be more conservative and avoid false positives

In [5]:
# Ask for each training file in turn, load it, and attach the day/night label.
trainingData_list = []
for file_number in range(1, num_train_files + 1):
    prompt = "Enter the name of {} training file? ".format(file_number)
    labelled = timeperiod(data_prep(input(prompt)))
    trainingData_list.append(labelled)

Enter the name of 1 training file? June13_data.csv
Enter the name of 2 training file? June14_data.csv
Enter the name of 3 training file? June16_data.csv


In [6]:
# Concatenating the data for training.
# ignore_index=True gives the combined frame a fresh 0..n-1 index instead of
# repeating every file's own row labels, which would make label-based lookups
# on train_data ambiguous.
train_data = pd.concat(trainingData_list, ignore_index=True)

In [7]:
def min_max(data, timeperiod, iqr_multiplier=2.5):
    """Compute IQR-based outlier thresholds for one period label.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns ``Hour`` (period label) and ``Values``.
    timeperiod : int
        Period label to select, compared against ``data['Hour']``.
    iqr_multiplier : float, default 2.5
        How many IQRs beyond the quartiles a value must lie to be an outlier.

    Returns
    -------
    tuple
        ``(lower, upper)`` where ``lower = q25 - iqr_multiplier * IQR`` and
        ``upper = q75 + iqr_multiplier * IQR``.

    Raises
    ------
    ValueError
        If no non-null ``Values`` exist for the requested period.
    """
    values = data.loc[data['Hour'] == timeperiod, 'Values'].dropna()
    if values.empty:
        # Percentiles of an empty selection are meaningless; fail loudly rather
        # than hand back unusable thresholds.
        raise ValueError(
            "No non-null 'Values' found for Hour == {!r}; cannot compute thresholds".format(
                timeperiod
            )
        )

    q75, q25 = np.percentile(values, [75, 25])
    iqr = q75 - q25

    # Used 2.5 times instead of 1.5 to be more conservative. Also, we had assumed no outliers in training data.
    lower = q25 - (iqr * iqr_multiplier)
    upper = q75 + (iqr * iqr_multiplier)

    return (lower, upper)

In [8]:
# Learn the outlier thresholds separately for each period label (0 = day, 1 = night).
(min_day, max_day), (min_night, max_night) = (
    min_max(train_data, period) for period in (0, 1)
)

## Validating

- Using the outlier thresholds for both day and night, validate the June 17 data
- Write the timestamps and values of the anomalous points to a CSV file

In [9]:
def inference(data, min_day, max_day, min_night, max_night):
    """Select the rows whose value lies outside the thresholds of their period.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain ``TimeStamp``, ``Values`` and ``Hour`` (0 = day, 1 = night).
    min_day, max_day : number
        Lower / upper threshold applied to rows with ``Hour == 0``.
    min_night, max_night : number
        Lower / upper threshold applied to rows with ``Hour == 1``.

    Returns
    -------
    pandas.DataFrame
        The ``TimeStamp`` and ``Values`` columns of the flagged rows, ordered as
        day-below, day-above, night-below, night-above.
    """
    thresholds = (
        (0, min_day, max_day),
        (1, min_night, max_night),
    )

    flagged = []
    for period, lower, upper in thresholds:
        subset = data.loc[data['Hour'] == period]
        # Strict comparisons: values equal to a threshold are not flagged.
        flagged.append(subset.loc[subset['Values'] < lower])
        flagged.append(subset.loc[subset['Values'] > upper])

    return pd.concat(flagged)[['TimeStamp', 'Values']]
    

In [10]:
# Load the validation file named by the user and attach the day/night label.
val_filename = input("Enter the name of validation file?")
val_data = timeperiod(data_prep(val_filename))

Enter the name of validation file?June17_data.csv


In [11]:
# Flag validation rows that fall outside the learned day / night thresholds.
anomaly = inference(
    data=val_data,
    min_day=min_day,
    max_day=max_day,
    min_night=min_night,
    max_night=max_night,
)

In [12]:
# Save the flagged rows to disk. With to_csv's default index=True the row index
# is written as an extra, unnamed first column next to TimeStamp and Values.
anomaly.to_csv("AnomalousData.csv")

In [13]:
# Show the last five flagged rows as a quick sanity check.
anomaly.tail(n=5)

Unnamed: 0,TimeStamp,Values
1195,2018-06-18 02:55:00,13640
1196,2018-06-18 02:56:00,14781
1197,2018-06-18 02:57:00,14124
1198,2018-06-18 02:58:00,14129
1199,2018-06-18 02:59:00,13865


In [14]:
# Marker that every cell above ran to completion.
print("Successful Run")

Successful Run
