In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
pip install pmdarima

**Setting up environment**

In [None]:
#importing required packages
from sklearn.preprocessing import StandardScaler
import statsmodels.api as sm

import matplotlib.pyplot as plt
import plotly
import plotly.graph_objects as go
from sklearn.model_selection import TimeSeriesSplit
from pmdarima.arima import ADFTest
from pmdarima.arima import auto_arima

import warnings
warnings.filterwarnings("ignore")

# Random seed shared by the stochastic steps so that results are repeatable.
rs = 42

# Load the dataset by its full path. The working directory is deliberately left
# untouched: the Kaggle input directory is read-only, so chdir-ing into it would
# make any later relative write fail and would change process-global state.
DATA_DIR = '/kaggle/input/anomliot'
iotq = pd.read_csv(os.path.join(DATA_DIR, 'dataset_final.csv'))

**Exploring Dataset**

In [None]:
# DataFrame.info() writes its report itself and returns None, so it is called
# directly; wrapping it in print() would emit a stray "None" line.
iotq.info()
print(iotq.shape)
print(iotq.describe())
print(iotq.head())

**Setting Time Format**

In [None]:
# Convert 'Time' with unit='s', i.e. the values are interpreted as seconds since
# the Unix epoch. `infer_datetime_format` is dropped: it is deprecated in
# pandas 2.x and has no effect when an explicit unit is given.
iotq['Time'] = pd.to_datetime(iotq['Time'], unit='s')
# Rename so the column can be reached as an attribute (iotq.Air_Quality);
# a new frame is bound instead of mutating in place.
iotq = iotq.rename(columns={'Air Quality': 'Air_Quality'})
iotq.head()

**Visualizing Time Series for all features**

In [None]:
from itertools import cycle

# Scatter plot of Temperature, Humidity and Air Quality against time.
layout = dict(xaxis=dict(title='Timestamp'), yaxis=dict(title='Sensor Data'))
fig = go.Figure(layout=layout)

# One marker trace per sensor column, each with its own colour.
for column, colour in (('Temperature', 'red'),
                       ('Humidity', 'blue'),
                       ('Air_Quality', 'green')):
    fig.add_trace(
        go.Scatter(
            x=iotq['Time'],
            y=iotq[column],
            mode='markers',
            marker=dict(color=colour),
        )
    )

# Label the traces in the order they were added; the returned figure is the
# cell's last expression and is therefore displayed.
names = cycle(['Temperature', 'Humidity','Air Quality'])
fig.for_each_trace(lambda trace: trace.update(name=next(names)))

In [None]:
# Scatter plot of Light and Loudness against time.
layout1 = dict(xaxis=dict(title='Timestamp'), yaxis=dict(title='Sensor Data'))

fig1 = go.Figure(layout=layout1)

# 'lightblue' is the CSS named colour; the previous 'light blue' (with a space)
# is not a valid colour name.
fig1.add_trace(go.Scatter(x=iotq.Time, y=iotq.Light,
                          mode='markers',
                          marker=dict(color='lightblue')))
fig1.add_trace(go.Scatter(x=iotq.Time, y=iotq.Loudness,
                          mode='markers',
                          marker=dict(color='purple')))

# Label the traces in the order they were added; the returned figure is displayed.
nam = cycle(['Light','Loudness'])
fig1.for_each_trace(lambda t: t.update(name=next(nam)))

**Isolating Temperature Time Series**

In [None]:
# Keep only the timestamp and the temperature reading for the univariate analysis.
temp = iotq.loc[:, ['Time', 'Temperature']]

**Splitting into Train and Test Data Sets**

In [None]:
# Chronological split: the first 67% of the rows are reserved for training and
# the remaining 33% for testing. Positional slicing replaces np.split(), which
# on a DataFrame goes through the deprecated DataFrame.swapaxes; .copy() makes
# each part an independent frame so columns can be added to it safely later.
split_at = int(.67 * len(temp))
train_set = temp.iloc[:split_at].copy()
test_set = temp.iloc[split_at:].copy()
print(train_set.head())
print(train_set.tail())
print(test_set.head())
print(test_set.tail())

**Standardising data**

In [None]:
# Fit the scaler on the training temperatures only and store the standardised
# values alongside the raw ones.
scaler = StandardScaler()
train_set['stand_value'] = scaler.fit_transform(
    train_set['Temperature'].values.reshape(-1, 1)
).ravel()

# Take the mean and scale from the fitted scaler itself so that
# `value * std_of_array + mean_of_array` is the exact inverse of the transform.
# (pandas' Series.std() defaults to ddof=1, whereas StandardScaler divides by
# the population standard deviation, ddof=0.)
mean_of_array = float(scaler.mean_[0])
std_of_array = float(scaler.scale_[0])
train_set.head()

In [None]:
# Standardise the test temperatures with the scaler fitted on the training data.
test_temperature = test_set['Temperature'].to_numpy().reshape(-1, 1)
test_set['stand_value'] = scaler.transform(test_temperature)

**Making the dataframes leaner**

In [None]:
# Drop the raw temperature column and index both frames by timestamp, leaving
# only the standardised series.
train_set = train_set.drop(columns=['Temperature']).set_index('Time')
test_set = test_set.drop(columns=['Temperature']).set_index('Time')

**Dickey-Fuller test to check stationarity of the time series**

In [None]:
# Augmented Dickey-Fuller test configured with alpha=0.01, i.e. a 1% significance
# level (equivalently, 99% confidence).
adftest = ADFTest(alpha=0.01)
# The notebook author reports that the output of this call indicates a stationary
# series, which is why the integration order d is fixed to 0 in the model below.
# NOTE(review): the cell output is not visible here -- confirm before relying on it.
adftest.should_diff(train_set)

**Model Training**

In [None]:
# Stepwise, non-seasonal search over (p, q) starting from (0, 0) with the
# differencing order fixed to d=0. The seed is passed as `random_state`
# (previously misspelled `ramdom_state`, so it never reached that parameter).
model = auto_arima(
    train_set,
    start_p=0,
    d=0,
    start_q=0,
    max_d=0,
    seasonal=False,
    trace=True,
    stepwise=True,
    random_state=rs,
    n_fits=50,
)

In [None]:
# Display the summary of the model selected by auto_arima.
model.summary()

**Checking ARIMA model predictions on the training data**

In [None]:
# In-sample (fitted) values for the training window. model.predict() forecasts
# the periods *after* the training data, so it cannot be used to check how the
# model fits the data it was trained on; predict_in_sample() is the method for
# that. The values are aligned to the training timestamps explicitly.
prediction_train = pd.DataFrame(
    {'predict': np.asarray(model.predict_in_sample())},
    index=train_set.index,
)
train_set['predict'] = prediction_train['predict']
train_set

In [None]:
# Actual vs. predicted standardised temperature on the training data.
layout = dict(xaxis=dict(title='Timestamp'), yaxis=dict(title='Temperature'))
fig = go.Figure(layout=layout)

# One marker trace per series: actual values in blue, predictions in orange.
for column, colour in (('stand_value', 'blue'), ('predict', 'orange')):
    fig.add_trace(
        go.Scatter(
            x=train_set.index,
            y=train_set[column],
            mode='markers',
            marker=dict(color=colour),
        )
    )

# Label the traces in the order they were added; the returned figure is displayed.
nam = cycle(['Actual','Prediction'])
fig.for_each_trace(lambda trace: trace.update(name=next(nam)))

**Using Model Predictions to detect anomalies**

In [None]:
def calculate_prediction_errors(input_data):
    """Return the absolute prediction error of every row as a NumPy array.

    Parameters
    ----------
    input_data : frame-like with 'stand_value' and 'predict' columns.

    Returns
    -------
    numpy.ndarray
        |stand_value - predict| for each row, in row order.
    """
    residuals = input_data['stand_value'] - input_data['predict']
    return residuals.abs().to_numpy()

# Absolute prediction errors on the training data.
train_pred_errors = calculate_prediction_errors(train_set)

# Sensitivity knob: how many standard deviations above the mean prediction error
# an error must be to count as an anomaly. The larger the value, the lower the
# sensitivity of the anomaly detection.
anomaly_config = 3
pred_error_threshold = train_pred_errors.mean() + anomaly_config * train_pred_errors.std()

In [None]:
# Detect anomalies by comparing prediction errors against a fixed (static) threshold.
def detect_anomalies(pred_error_threshold, df):
    """Flag rows of `df` whose prediction error exceeds the threshold.

    Side effect: writes a 0/1 'anomaly_predicted' column into `df`.

    Parameters
    ----------
    pred_error_threshold : error value above which a row counts as an anomaly.
    df : frame with 'stand_value' and 'predict' columns (modified in place).

    Returns
    -------
    list of int
        Zero-based row positions of the rows flagged as anomalies.
    """
    errors = calculate_prediction_errors(df)
    # 1 where the error is strictly above the threshold, otherwise 0.
    flags = [int(err > pred_error_threshold) for err in errors]
    df['anomaly_predicted'] = flags
    return [position for position, flag in enumerate(flags) if flag == 1]

# Row positions of the training observations flagged as anomalies
# (also adds the 'anomaly_predicted' column to train_set).
train_anomalies_idxs = detect_anomalies(pred_error_threshold, train_set)

**Visualizing Anomaly Detection on Training Data**

In [None]:
# Actual values, predictions and detected anomalies on the training data.
layout = dict(xaxis=dict(title='Timestamp'), yaxis=dict(title='Temperature'))

fig = go.Figure(layout=layout)

fig.add_trace(go.Scatter(x=train_set.index, y=train_set['stand_value'],
                         mode='markers',
                         marker=dict(color='blue')))

fig.add_trace(go.Scatter(x=train_set.index, y=train_set['predict'],
                         mode='markers',
                         marker=dict(color='orange')))

# Select the anomalous rows once so that x (timestamps) and y (values) describe
# the same observations. Pairing the full index with only the anomalous values
# would draw the markers at the first timestamps instead of where the
# anomalies actually occurred.
train_anomalies = train_set.iloc[train_anomalies_idxs]
fig.add_trace(go.Scatter(x=train_anomalies.index, y=train_anomalies['stand_value'],
                         mode='markers',
                         marker=dict(color='red')))

# Label the traces in the order they were added; the returned figure is displayed.
nam = cycle(['Actual','Prediction','Detected Anomaly'])
fig.for_each_trace(lambda t: t.update(name=next(nam)))

**Using the model to detect anomalies on Test Data**

In [None]:
# Forecast one value per test observation, continuing from the end of the
# training window. `index` is not a parameter of the model's predict(), so it
# is no longer passed; the forecasts are aligned to the test timestamps
# explicitly instead.
prediction_test = pd.DataFrame(
    {'predict': np.asarray(model.predict(n_periods=len(test_set)))},
    index=test_set.index,
)
test_set['predict'] = prediction_test['predict']
test_set

In [None]:
# Absolute prediction errors on the test data.
test_pred_errors = calculate_prediction_errors(test_set)

# NOTE(review): the threshold is re-derived here from the *test* errors, which
# rebinds pred_error_threshold and discards the value computed from the
# training errors -- confirm this is intended.
pred_error_threshold = (
    np.mean(test_pred_errors)
    + anomaly_config * np.std(test_pred_errors)
)

# Row positions of the flagged test observations
# (also adds the 'anomaly_predicted' column to test_set).
test_anomalies_idxs = detect_anomalies(pred_error_threshold, test_set)

In [None]:
# Display the test frame (by this point it has had 'predict' and 'anomaly_predicted' columns assigned).
test_set

In [None]:
# Row positions of the flagged test observations as a flat integer array, used
# for BOTH the x and the y of the anomaly trace so the two cannot disagree.
ano_ind = np.flatnonzero(test_set["anomaly_predicted"] == 1)
test_anomalies = test_set.iloc[ano_ind]

# Actual values, predictions and detected anomalies on the test data.
layout = dict(xaxis=dict(title='Timestamp'), yaxis=dict(title='Temperature'))

fig = go.Figure(layout=layout)

fig.add_trace(go.Scatter(x=test_set.index, y=test_set['stand_value'],
                         mode='markers',
                         marker=dict(color='blue')))

fig.add_trace(go.Scatter(x=test_set.index, y=test_set['predict'],
                         mode='markers',
                         marker=dict(color='orange')))

fig.add_trace(go.Scatter(x=test_anomalies.index, y=test_anomalies['stand_value'],
                         mode='markers',
                         marker=dict(color='red')))

# Label the traces in the order they were added; the returned figure is displayed.
nam = cycle(['Actual','Prediction','Detected Anomaly'])
fig.for_each_trace(lambda t: t.update(name=next(nam)))

**Detected Anomalies in Test Data with Timestamps**

In [None]:
# Standardised values of the test rows flagged as anomalies (indexed by timestamp).
detected_anomalies = test_set.iloc[test_anomalies_idxs]['stand_value']
# Undo the standardisation to express the anomalies on the original scale.
orig_det_anom = detected_anomalies.mul(std_of_array).add(mean_of_array)
orig_det_anom