In [3]:
import os
parent_dir = os.path.dirname(os.path.abspath('./'))
filename = f"{parent_dir}/input_data/Electric_Production.csv"
print(f"Filename: {filename}")

Filename: /Users/sristiraj/Downloads/sristiraj/ML/anomaly_detection/timeseries/input_data/Electric_Production.csv


### Read data using duckdb

In [4]:
import duckdb

elec_df = duckdb.read_csv(filename)

### Show data

In [5]:
duckdb.sql("select * from elec_df")

┌────────────┬────────────┐
│    DATE    │ IPG2211A2N │
│    date    │   double   │
├────────────┼────────────┤
│ 1985-01-01 │    72.5052 │
│ 1985-01-02 │     70.672 │
│ 1985-01-03 │    62.4502 │
│ 1985-01-04 │    57.4714 │
│ 1985-01-05 │    55.3151 │
│ 1985-01-06 │    58.0904 │
│ 1985-01-07 │    62.6202 │
│ 1985-01-08 │    63.2485 │
│ 1985-01-09 │    60.5846 │
│ 1985-01-10 │    56.3154 │
│     ·      │        ·   │
│     ·      │        ·   │
│     ·      │        ·   │
│ 2017-01-04 │     88.353 │
│ 2017-01-05 │    92.0805 │
│ 2017-01-06 │   102.1532 │
│ 2017-01-07 │   112.1538 │
│ 2017-01-08 │   108.9312 │
│ 2017-01-09 │    98.6154 │
│ 2017-01-10 │    93.6137 │
│ 2017-01-11 │    97.3359 │
│ 2017-01-12 │   114.7212 │
│ 2018-01-01 │   129.4048 │
├────────────┴────────────┤
│   397 rows (20 shown)   │
└─────────────────────────┘

# Time series data feature engineering

### Add rolling avg and rolling std

In [6]:
query_str = '''
    select * 
    from (
        select *, 
            avg(IPG2211A2N) over(order by period ROWS between 11 PRECEDING AND CURRENT ROW) roll_avg, 
            stddev(IPG2211A2N) over(order by period ROWS between 11 PRECEDING AND CURRENT ROW) roll_std,
            row_number() over (order by period) rn
            from 
                (select 
                    year(date)*100+month(date) period,
                    sum(IPG2211A2N) IPG2211A2N
                    from
                    elec_df
                    group by year(date)*100+month(date)
                ) k
        )
            where rn >= 12
'''

In [7]:
duckdb.sql(query_str)


┌────────┬────────────────────┬────────────────────┬────────────────────┬───────┐
│ period │     IPG2211A2N     │      roll_avg      │      roll_std      │  rn   │
│ int64  │       double       │       double       │       double       │ int64 │
├────────┼────────────────────┼────────────────────┼────────────────────┼───────┤
│ 199601 │           1012.589 │  879.1508833333334 │  86.29707282619029 │    12 │
│ 199701 │ 1010.8342999999999 │  901.2214083333333 │  82.94680902630492 │    13 │
│ 199801 │          1038.5289 │  925.0555916666667 │  77.22807818652794 │    14 │
│ 199901 │          1070.2907 │  948.5062083333333 │  74.80708721560178 │    15 │
│ 200001 │ 1101.4814999999999 │         970.579975 │  77.80377500398758 │    16 │
│ 200101 │          1097.5295 │           990.1456 │  77.74146566403758 │    17 │
│ 200201 │          1129.2849 │ 1010.9392416666668 │  78.89441330348318 │    18 │
│ 200301 │          1148.4013 │          1031.5275 │  79.92348745704456 │    19 │
│ 200401 │      

### Tag anomaly records using 3 sigma rule

In [8]:
query_str = '''
    select * 
    from (
        select *, 
            avg(IPG2211A2N) over(order by period ROWS between 11 PRECEDING AND CURRENT ROW) roll_avg, 
            stddev(IPG2211A2N) over(order by period ROWS between 11 PRECEDING AND CURRENT ROW) roll_std,
            row_number() over (order by period) rn
            from 
                (select 
                    year(date)*100+month(date) period,
                    sum(IPG2211A2N) IPG2211A2N
                    from
                    elec_df
                    group by year(date)*100+month(date)
                ) k
        )
    where rn >= 12
    and abs(IPG2211A2N-roll_avg) >= 3*roll_std
'''

In [9]:
duckdb.sql(query_str)

┌────────┬────────────┬────────────────────┬────────────────────┬───────┐
│ period │ IPG2211A2N │      roll_avg      │      roll_std      │  rn   │
│ int64  │   double   │       double       │       double       │ int64 │
├────────┼────────────┼────────────────────┼────────────────────┼───────┤
│ 201801 │   129.4048 │ 1131.9812833333333 │ 316.13076686143324 │    34 │
└────────┴────────────┴────────────────────┴────────────────────┴───────┘