#### Analyze the original time series data (e.g., estimate the dominant period of each series)

In [2]:
from scipy.fftpack import fft, fftfreq
from statsmodels.tsa.stattools import acf
import numpy as np
import matplotlib.pyplot as plt
import os
import pandas as pd

In [3]:
path = '../data-sets/KDD-Cup/data/'


def _file_id_key(file_name):
    """Return a sort key that orders file names by their leading id.

    The id is the text before the first underscore. Decimal ids are compared
    numerically, so '2_...' sorts before '10_...' even without zero padding;
    a plain string comparison would put '10' first. Names whose prefix is
    not a decimal number sort after all numeric ids, in lexicographic order.
    """
    prefix = file_name.split('_')[0]
    if prefix.isdecimal():
        return (0, int(prefix), prefix)
    return (1, 0, prefix)


# Keep only entries whose name contains 'Anomaly' (drops unrelated files in
# the directory), ordered by id.
files_name = sorted(
    (entry for entry in os.listdir(path) if 'Anomaly' in entry),
    key=_file_id_key,
)

In [12]:
top_k_seasons = 5  # number of strongest spectral peaks kept as period candidates


def _candidate_periods(series, top_k, max_period):
    """Return candidate periods taken from the strongest FFT peaks of *series*.

    Parameters
    ----------
    series : 1-D array of samples.
    top_k : maximum number of spectral peaks to consider.
    max_period : candidates greater than or equal to this value are dropped.

    Returns
    -------
    list of integer periods (in samples), without duplicates, in the order
    the peaks were selected. May be empty.
    """
    spectrum = fft(series)
    sample_freq = fftfreq(spectrum.size)

    # Only strictly positive frequencies are kept: this excludes the
    # zero-frequency term and the negative half of the spectrum.
    positive = sample_freq > 0
    freqs = sample_freq[positive]
    powers = np.abs(spectrum)[positive]

    # argpartition raises if asked for more elements than exist, so clamp k
    # for very short series.
    k = min(top_k, powers.size)
    if k <= 0:
        return []

    # Indices of the k largest magnitudes (unordered among themselves).
    top_idxs = np.argpartition(powers, -k)[-k:]
    # Period = 1 / frequency, truncated to a whole number of samples.
    raw_periods = (1 / freqs[top_idxs]).astype(int)

    # Truncation can map neighbouring peaks to the same period; keep the
    # first occurrence only so each lag is scored once.
    return list(dict.fromkeys(p for p in raw_periods if p < max_period))


periods = []
for name in files_name:
    # The 4th underscore-separated field of the file stem is used as the
    # last index of the portion of the series that is analysed.
    split_index = int(name.split('.')[0].split('_')[3])
    data = np.loadtxt(path + name)[0:split_index + 1]

    # Periods of at least half the analysed length are discarded
    # (the original note says this removes outliers for files 102 and 151).
    fft_periods = _candidate_periods(data, top_k_seasons, 0.5 * split_index)
    if not fft_periods:
        raise ValueError(
            f"no candidate period below 0.5*split_index found for file {name!r}"
        )

    # Score each candidate by the autocorrelation at that lag: acf returns
    # lags 0..nlags, so the last element is the value at `lag`.
    scores = [acf(data, nlags=lag)[-1] for lag in fft_periods]

    # The candidate with the highest autocorrelation is taken as the period.
    period = fft_periods[scores.index(max(scores))]
    periods.append(period)


In [13]:
# Write one row per input file (columns: File_name, Period) to the period CSV.
period_table = pd.DataFrame({'File_name': files_name, 'Period': periods})
# NOTE(review): index=None is kept exactly as in the original call; presumably
# it is treated as falsy so no row-index column is written — confirm.
period_table.to_csv('../data-sets/KDD-Cup/period/period.csv', index=None)