# Sidel – Anomaly Labelling Notebook
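Computes energy per bottle as `Efficiency` / `Produce`, builds a 30‑day rolling median of that value for each machine–recipe pair, flags rows more than 25 % above that baseline as anomalies, and then checks which anomalies coincide (same machine, same minute) with a logged alarm.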

In [1]:

import pandas as pd, numpy as np
import matplotlib.pyplot as plt
from zoneinfo import ZoneInfo

# ------------ parameters ------------
# Everything a reader may want to tune lives in this cell.

# Machines covered by the analysis; both path tables below are derived from this
# list so that adding a machine is a one-line change.
MACHINES = ('machine1', 'machine2', 'machine3')

# machine name -> CSV with per-recipe performance rows
recipe_files = {name: f'Data/df_recipe_performance_{name}.csv' for name in MACHINES}

# machine name -> CSV with alarm events
alarm_files = {name: f'Data/alarm_{name}.csv' for name in MACHINES}

# Plant-local time zone
tz = ZoneInfo('Europe/Rome')

# Column used as the energy metric (kWh, per the original author's note)
energy_col = 'Efficiency'
# Column holding the number of bottles produced
output_col = 'Produce'
# Relative excess over the rolling baseline that marks a row as anomalous (25 %)
threshold = 0.25


In [3]:

# ------------ load & combine recipe data ------------
# For every machine: read its recipe-performance CSV, locate the timestamp column
# (first column whose name contains "time"), parse it as UTC and convert it to
# Europe/Rome, tag the rows with the machine name, then stack all machines.
#
# `ts_col` is fixed by the first file and every later file is aligned to that
# name, so the combined frame has exactly one timestamp column and the cells
# below can rely on `ts_col`.
recipe_frames = []
ts_col = None
for m, fp in recipe_files.items():
    df = pd.read_csv(fp)

    if ts_col is not None and ts_col in df.columns:
        file_ts_col = ts_col
    else:
        file_ts_col = next((c for c in df.columns if 'time' in c.lower()), None)
    if file_ts_col is None:
        # Fail with a readable message instead of a bare StopIteration.
        raise KeyError(
            f"No timestamp column (name containing 'time') found in {fp}; "
            f"available columns: {list(df.columns)}"
        )

    if ts_col is None:
        ts_col = file_ts_col
    elif file_ts_col != ts_col:
        # Same information under a different header: align it so concat does not
        # produce two half-empty timestamp columns.
        df = df.rename(columns={file_ts_col: ts_col})

    # utc=True treats timezone-naive values as UTC — TODO confirm this matches the export.
    df[ts_col] = pd.to_datetime(df[ts_col], utc=True).dt.tz_convert('Europe/Rome')
    df['machine'] = m
    recipe_frames.append(df)

recipes = pd.concat(recipe_frames, ignore_index=True)


In [4]:

# ------------ feature engineering ------------
# 1. energy_per_bottle = energy_col / output_col (zero output -> NaN, not inf)
# 2. roll_median       = 30-day time-based rolling median of energy_per_bottle,
#                        computed separately per machine (and per recipe when a
#                        recipe column can be identified)
# 3. anomaly           = 1 where energy_per_bottle exceeds (1 + threshold) * roll_median

# Fail early with the list of available columns if the configured names are wrong.
missing_cols = [c for c in (energy_col, output_col) if c not in recipes.columns]
if missing_cols:
    raise KeyError(
        f"Column(s) {missing_cols} not found in recipe data; "
        f"available columns: {list(recipes.columns)}"
    )

recipes['energy_per_bottle'] = recipes[energy_col] / recipes[output_col].replace(0, np.nan)

recipe_id_col = next((c for c in recipes.columns if 'recipe' in c.lower()), None)
if not recipe_id_col:
    print('⚠️  No recipe identifier found automatically.')
# Without a recipe column the baseline falls back to one series per machine.
group_keys = ['machine'] + ([recipe_id_col] if recipe_id_col else [])

# Compute rolling 30‑day median within each recipe & machine.
# A time-based window needs the timestamps as a sorted DatetimeIndex without NaT;
# `Series.rolling(..., on=...)` is only valid on DataFrames, so the timestamp is
# moved into the index instead. Rows without a timestamp keep roll_median = NaN.
recipes = recipes.sort_values(ts_col)
has_ts = recipes[ts_col].notna()
rolled = (
    recipes.loc[has_ts]
    .set_index(ts_col)
    .groupby(group_keys)['energy_per_bottle']
    .transform(lambda s: s.rolling('30D').median())
)
recipes['roll_median'] = np.nan
# transform() returns rows in their original order, so positional assignment is
# safe even when several rows share the same timestamp.
recipes.loc[has_ts, 'roll_median'] = rolled.to_numpy()

# Note: the window includes the current row, and comparisons against NaN are
# False, so rows with no baseline are never flagged.
recipes['anomaly'] = (recipes['energy_per_bottle'] > (1 + threshold) * recipes['roll_median']).astype(int)
print('Total anomalies:', recipes['anomaly'].sum())


KeyError: 'Efficiency'

In [None]:

# ------------ load & align alarms ------------
# Load each machine's alarm log, normalise it to the two columns
# ('alarm_time', 'machine'), then mark every recipe row that shares a
# machine and a calendar minute with at least one alarm.


def _floor_to_minute(ts):
    """Floor a tz-aware datetime Series to the minute.

    The flooring is done in UTC and converted back to Europe/Rome, because
    flooring local wall-clock times can raise on the ambiguous hour when
    daylight saving time ends.
    """
    return ts.dt.tz_convert('UTC').dt.floor('min').dt.tz_convert('Europe/Rome')


alarm_frames = []
for m, fp in alarm_files.items():
    df = pd.read_csv(fp)
    dt_col = next((c for c in df.columns if 'time' in c.lower() or 'created' in c.lower()), None)
    if dt_col is None:
        raise KeyError(
            f"No alarm timestamp column (name containing 'time' or 'created') in {fp}; "
            f"available columns: {list(df.columns)}"
        )
    # Rename per file: the timestamp header may differ between machines, and a
    # single rename after the concat would only cover the last file's name.
    # utc=True treats timezone-naive values as UTC — TODO confirm against the export.
    df['alarm_time'] = pd.to_datetime(df[dt_col], utc=True).dt.tz_convert('Europe/Rome')
    df['machine'] = m
    alarm_frames.append(df[['alarm_time', 'machine']])
alarms = pd.concat(alarm_frames, ignore_index=True)
alarms['alarm_time_min'] = _floor_to_minute(alarms['alarm_time'])

# One key per (minute, machine): several alarms in the same minute must not
# duplicate recipe rows in the left merge, and NaT keys must not match each other.
alarm_keys = alarms[['alarm_time_min', 'machine']].dropna().drop_duplicates()

# Drop the columns this cell creates so that it can be re-run safely.
recipes = recipes.drop(columns=['timestamp_min', 'alarm_time_min', 'alarm_merge', 'alarm_overlap'],
                       errors='ignore')
recipes['timestamp_min'] = _floor_to_minute(recipes[ts_col])
recipes = recipes.merge(alarm_keys,
                        left_on=['timestamp_min', 'machine'],
                        right_on=['alarm_time_min', 'machine'],
                        how='left', indicator='alarm_merge')

recipes['alarm_overlap'] = (recipes['alarm_merge'] == 'both').astype(int)
# The mean is NaN when there are no anomalies; built-in round() handles that
# and does not depend on the scalar being a numpy type.
overlap_pct = recipes.loc[recipes['anomaly'] == 1, 'alarm_overlap'].mean() * 100
print('% of anomalies with an overlapping alarm:',
      round(float(overlap_pct), 2), '%')


In [None]:

# ------------ quick plot ------------
# Energy per bottle over time for one machine, with the rows flagged as
# anomalies overlaid as red markers.
m = 'machine1'
sub = recipes.loc[recipes['machine'] == m].set_index(ts_col)
flagged = sub.loc[sub['anomaly'] == 1, 'energy_per_bottle']

fig, ax = plt.subplots(figsize=(12, 4))
ax.plot(sub['energy_per_bottle'], label='Energy per bottle', alpha=0.4)
ax.scatter(flagged.index, flagged, s=10, label='Anomaly', color='red')
ax.set_title(f'Energy per bottle & anomalies – {m}')
ax.legend()
plt.show()
