In [7]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np  # linear algebra
import pandas as pd  # data processing, CSV file I/O (e.g. pd.read_csv)
import optuna
import time
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os

# for dirname, _, filenames in os.walk('/kaggle/input'):
#     for filename in filenames:
#         print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [9]:
train_df = pd.read_csv("./energy-anomaly-detection/train.csv")

train_df

Unnamed: 0,building_id,timestamp,meter_reading,anomaly
0,1,2016-01-01 00:00:00,,0
1,32,2016-01-01 00:00:00,,0
2,41,2016-01-01 00:00:00,,0
3,55,2016-01-01 00:00:00,,0
4,69,2016-01-01 00:00:00,,0
...,...,...,...,...
1749489,1316,2016-12-31 23:00:00,38.844,0
1749490,1318,2016-12-31 23:00:00,202.893,0
1749491,1319,2016-12-31 23:00:00,,0
1749492,1323,2016-12-31 23:00:00,172.000,0


In [10]:
unique_ids = train_df.building_id.unique()

In [11]:
df_list = []

for b_id in unique_ids:
    building_df = train_df.groupby('building_id').get_group(b_id).copy(deep=True)
    building_df['meter_reading'] = building_df['meter_reading'].replace(float('nan'),
                                                                        building_df['meter_reading'].median())
    building_df.reset_index(drop=True, inplace=True)
    df_list.append(building_df)

print(len(df_list))
anomaly = df_list

200


In [12]:
imputed_train = pd.concat(anomaly)

imputed_train

Unnamed: 0,building_id,timestamp,meter_reading,anomaly
0,1,2016-01-01 00:00:00,38.651,0
1,1,2016-01-01 01:00:00,38.651,0
2,1,2016-01-01 02:00:00,38.651,0
3,1,2016-01-01 03:00:00,38.651,0
4,1,2016-01-01 04:00:00,38.651,0
...,...,...,...,...
7466,1353,2016-12-31 19:00:00,2.425,0
7467,1353,2016-12-31 20:00:00,2.450,0
7468,1353,2016-12-31 21:00:00,2.425,0
7469,1353,2016-12-31 22:00:00,2.450,0


In [13]:
imputed_train.anomaly.value_counts()

anomaly
0    1712198
1      37296
Name: count, dtype: int64

In [14]:
from sklearn.metrics import f1_score, precision_score, recall_score
from sklearn.preprocessing import StandardScaler

seed = 42
import joblib

def calculate_quartiles_and_iqr(values):
    q1 = np.percentile(values, 25)
    q3 = np.percentile(values, 75)

    # Calculate IQR
    iqr = q3 - q1

    return q1, q3, iqr

In [15]:
f1_scores = []
prec_scores = []
recall_scores = []
all_ids = imputed_train.building_id.unique()
for _id in all_ids:
    building = imputed_train[imputed_train['building_id'] == _id].copy(deep=True)
    X_train = building['meter_reading']
    y_train = building['anomaly'].values
    scaler = StandardScaler()
    X_train = scaler.fit_transform(pd.DataFrame(X_train))
    ind_best_f1 = 0
    q1, q3, iqr = calculate_quartiles_and_iqr(X_train)
    for K in range(5, 45, 5):
        K /= 10  # 0.5,1,1.5,2,...
        low = q1 - K * iqr
        high = q3 + K * iqr
        y_pred = np.zeros(len(y_train))
        bool_array = (X_train > high) | (X_train < low)
        bool_array = bool_array.reshape(-1)
        y_pred[bool_array] = 1

        score_f1 = f1_score(y_train, y_pred)

        if score_f1 > ind_best_f1:
            score_prec = precision_score(y_train, y_pred)
            score_recall = recall_score(y_train, y_pred)
            ind_best_f1 = score_f1
            ind_best_prec = score_prec
            ind_best_recall = score_recall
    f1_scores.append(ind_best_f1)
    prec_scores.append(ind_best_prec)
    recall_scores.append(ind_best_recall)

In [16]:
print(f1_scores)

print(prec_scores)

print(recall_scores)

[0.04865556978233035, 0.03642384105960265, 0.07718120805369127, 0.1593625498007968, 0.10827693909422176, 0.11976047904191617, 0.24, 0.06666666666666667, 0.28319882611885544, 0.7361963190184049, 0.7105263157894737, 0.08055944055944056, 0.7600849256900213, 0.7032967032967034, 0.5991189427312775, 0.6791277258566978, 0.7380952380952381, 0.4261682242990654, 0.64, 0.39219934994582883, 0.13676633444075303, 0.7875457875457875, 0.2958199356913183, 0.7685589519650655, 0.39643652561247217, 0.002014098690835851, 0.0979020979020979, 0.3008849557522124, 0.11299435028248588, 0.16666666666666666, 0.7953216374269005, 0.17164179104477612, 0.3108108108108108, 0.021119324181626188, 0.5428571428571428, 0.015655577299412915, 0.019769357495881382, 0.3287671232876712, 0.6228070175438597, 0.008379888268156424, 0.0683371298405467, 0.09691629955947137, 0.1044776119402985, 0.03476946334089191, 0, 0.026420079260237782, 0.004698512137823023, 0.0056657223796034, 0, 0, 0, 0, 0, 0.900398406374502, 0.044444444444444446

In [17]:
print(np.mean(f1_scores))

print(np.mean(prec_scores))

print(np.mean(recall_scores))

0.4143468356463433
0.5204857927715788
0.49293203620675086
