In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np  # linear algebra
import pandas as pd  # data processing, CSV file I/O (e.g. pd.read_csv)
import optuna
import time
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os

# for dirname, _, filenames in os.walk('/kaggle/input'):
#     for filename in filenames:
#         print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
train_df = pd.read_csv("./energy-anomaly-detection/train.csv")

train_df

unique_ids = train_df.building_id.unique()

df_list = []

for b_id in unique_ids:
    building_df = train_df.groupby('building_id').get_group(b_id).copy(deep=True)
    building_df['meter_reading'] = building_df['meter_reading'].replace(float('nan'),
                                                                        building_df['meter_reading'].median())
    building_df.reset_index(drop=True, inplace=True)
    df_list.append(building_df)

print(len(df_list))

200


In [3]:
imputed_train = pd.concat(df_list)

imputed_train

imputed_train.anomaly.value_counts()

anomaly
0    1712198
1      37296
Name: count, dtype: int64

In [4]:
from sklearn.ensemble import IsolationForest
from sklearn.metrics import f1_score, precision_score, recall_score
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler


seed = 42
import joblib


best_params = {'n_estimators': 114,
               'max_samples': 0.11944670503770004,
               'contamination': 0.03103123295817309,
               'max_features': 0.11829004013466254,
               'bootstrap': True,
               'n_jobs': -1,  # Use all available CPU cores
               'random_state': seed,  # For reproducibility
               }


In [5]:
f1_scores = []
prec_scores = []
recall_scores = []
all_ids = imputed_train.building_id.unique()
for _id in all_ids:
    building = imputed_train[imputed_train['building_id'] == _id].copy(deep=True)
    X_train = building['meter_reading']
    y_train = building['anomaly']
    scaler = StandardScaler()
    X_train = scaler.fit_transform(pd.DataFrame(X_train))
    ind_best_f1 = 0
    for cont in range(1, 87, 5):
        cont /= 1000
        best_params['contamination'] = cont

        model = IsolationForest(
            **best_params
        )

        model.fit(X_train)

        y_pred = model.predict(X_train)

        y_pred = np.where(y_pred == -1, 1, 0)

        score_f1 = f1_score(y_train, y_pred)

        if score_f1 > ind_best_f1:
            score_prec = precision_score(y_train, y_pred)
            score_recall = recall_score(y_train, y_pred)
            ind_best_f1 = score_f1
            ind_best_prec = score_prec
            ind_best_recall = score_recall
    f1_scores.append(ind_best_f1)
    prec_scores.append(ind_best_prec)
    recall_scores.append(ind_best_recall)

In [6]:
print(f1_scores)
print(prec_scores)
print(recall_scores)

[0.2037037037037037, 0.03464566929133858, 0.07692307692307693, 0.18571428571428572, 0.06888888888888889, 0.16666666666666666, 0.21686746987951808, 0.17307692307692307, 0.6033690658499234, 0.6956521739130435, 0.6372093023255814, 0.7598784194528876, 0.7061143984220908, 0.6710097719869706, 0.5793650793650794, 0.7262872628726287, 0.6422764227642277, 0.7290322580645161, 0.5601750547045952, 0.5596330275229358, 0.6261682242990654, 0.7635726795096323, 0.6153846153846154, 0.7426160337552743, 0.4469525959367946, 0.1411764705882353, 0.5072463768115942, 0.29015544041450775, 0.17721518987341772, 0.43243243243243246, 0.7440633245382586, 0.2057142857142857, 0.3700440528634361, 0.09022556390977443, 0.5466666666666666, 0.03125, 0.14545454545454545, 0.41706161137440756, 0.5993265993265994, 0.1885593220338983, 0.16666666666666666, 0.2100456621004566, 0.22727272727272727, 0.16853932584269662, 0.12206572769953052, 0.05752212389380531, 0.03278688524590164, 0.013114754098360656, 0.10967741935483871, 0, 0.125

In [None]:
print(np.mean(f1_scores))
print(np.mean(prec_scores))
print(np.mean(recall_scores))

0.49297639746419514
0.5489940417094722
0.5097119785013742
