In [1]:
import os
import random
import numpy as np
import pandas as pd
import sklearn
import scipy
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.mixture import GaussianMixture
from sklearn.neighbors import KNeighborsClassifier


In [2]:
%cd ..

c:\Users\maher\RLNAS-for-Anomaly-Detection-in-Time-Series-using-Autoencoders


In [3]:
load_path = 'dataset/weather_data/' # path to the dataset
save_path = 'dataset/gen_anomalies/' # path to save the generated anomalies

dataset = "IHAMPS1"
feat_to_keep = ["Temperature_C", "Humidity_%", "Pressure_hPa"]

In [4]:
seed = 19
random.seed(seed)
np.random.seed(seed)


In [5]:
def load_data(path, filename):
    csv_load_path = os.path.join(path, filename)
    return pd.read_csv(csv_load_path, index_col=0)

def save_data(df, path, filename):
    csv_save_path = os.path.join(path, filename)
    df.to_csv(csv_save_path)

In [6]:
def triOut_remove(reg_data, model='GMM', rate=10): 
    X_trn = reg_data.iloc[:, :]
    if model == 'GMM':
        fit_model = GaussianMixture(n_components=3, n_init=10, reg_covar=1e-3, random_state=seed)
    scaler = StandardScaler(random_state=seed)
    X_trn = scaler.fit_transform(X_trn)
    fit_model.fit(X_trn)
    densities = fit_model.score_samples(X_trn)
    density_threshold = np.percentile(densities, rate)
    purified_data = reg_data.loc[densities >= density_threshold].reset_index(drop=True)
    
    return purified_data

In [7]:
def local_generation(reg_data, n_insts=10000):
    alpha = 5
    reg_inst = reg_data.iloc[:,:].values
    data_trn, data_tst = train_test_split(reg_inst, test_size=0.3, random_state=0)
    lowest_bic = np.infty
    bic = []
    n_components_range = range(1, 3)
    cv_types = ["spherical", "tied", "diag", "full"]
    for cv_type in cv_types:
        for n_components in n_components_range:
            # Fit a Gaussian mixture with EM
            gmm = GaussianMixture(
                n_components=n_components, covariance_type=cv_type, random_state=seed
            )
            gmm.fit(data_trn)
            bic.append(gmm.bic(data_tst))
            if bic[-1] < lowest_bic:
                lowest_bic = bic[-1]
                best_gmm = gmm
    # determine local outlier distribution model
    local_gmm = sklearn.base.clone(best_gmm)
    local_gmm.weights_ = best_gmm.weights_
    local_gmm.means_ = best_gmm.means_
    local_gmm.covariances_ = alpha*best_gmm.covariances_ # stretching distribution space
    
    # Sample local outliers from outlier distribution
    local_insts = local_gmm.sample(n_insts) # generated instances
    local_insts = pd.DataFrame(local_insts[0], columns=reg_data.columns)
    local_insts['label'] = np.tile(1, local_insts.shape[0]) # Add label of outliers as 1

    
    # Return local outliers in form of data frame
    return local_insts


In [11]:
# FILTER INTERSTING INSTANCES

def inst_filter(reg_data, out_data, classifier='KNN', n_insts=10000, maxRem=50):
    removed = np.inf
    reg_inst = reg_data.copy()
    reg_inst["label"] = np.tile(0, reg_inst.shape[0])
    out_source = out_data.copy()
    idx = random.sample(out_source.index.to_list(), n_insts)
    out_inst = out_source.loc[idx]
    out_source.drop(idx, axis=0)
    if classifier=='KNN':
        clf = KNeighborsClassifier()
    while removed > maxRem:  
        data_trn = pd.concat((reg_inst, out_inst), axis=0).reset_index(drop=True)
        X, y = data_trn.iloc[:, :-1].values.astype('float32'), data_trn.iloc[:, -1].values.astype('int32')
        clf.fit(X, y)
        y_out = clf.predict(out_inst.iloc[:, :-1].values)
        out_inst_old = out_inst.loc[y_out==1]
        removed = np.sum(np.where(y_out==0, 1, 0))
        print(removed)
        idx = random.sample(out_source.index.to_list(), removed)
        out_inst_new = out_source.loc[idx]
        out_source.drop(idx, axis=0)
        out_inst = pd.concat((out_inst_old, out_inst_new), axis=0).reset_index(drop=True)
        
    return out_inst


In [12]:
## LOCAL OUTLIER GENERATION

def global_generation(reg_data, n_insts=10000):
    # reg_data = reg_data.drop('label', axis=1)
    info = reg_data.describe()
    attri_mins = 0.9*info.loc['min'].values
    attri_maxs = 1.1*info.loc['max'].values
    attri_scales = attri_maxs - attri_mins
    rv = scipy.stats.uniform(loc=[attri_mins], scale=[attri_scales])
    global_insts = []

    # create 1000 global outliers
    for i in range(n_insts):
        sample = rv.rvs(size=attri_mins.shape)
        global_insts.append(sample)

    global_insts = pd.DataFrame(global_insts, columns=reg_data.columns)
    global_insts['label'] = np.tile(1, global_insts.shape[0])
    
    return global_insts

In [13]:
reg_data = load_data(load_path, f'{dataset}.csv')
reg_data = reg_data[feat_to_keep]
reg_data.dropna(inplace=True)
reg_data = reg_data.reset_index(drop=True)

# local outliers
local_insts = local_generation(reg_data, n_insts=50000)
local_outs = inst_filter(reg_data, local_insts, n_insts=15000)

# global outliers
global_insts = global_generation(reg_data, n_insts=50000)
global_outs = inst_filter(reg_data, global_insts, n_insts=15000)

# save files
save_data(local_outs, save_path, f'locOuts_{dataset}.csv')
save_data(global_outs, save_path, f'gloOuts_{dataset}.csv')

4410
1660
633
244
91
29
1166
189
50


In [14]:
reg_data.describe()

Unnamed: 0,Temperature_C,Humidity_%,Pressure_hPa
count,138996.0,138996.0,138996.0
mean,12.724892,75.860672,1008.130986
std,10.78016,16.666094,7.628113
min,-17.78,16.0,967.83
25%,3.39,66.0,1003.39
50%,16.78,78.0,1007.11
75%,20.17,89.0,1013.21
max,36.56,99.0,1035.22


In [15]:
local_outs.describe()

Unnamed: 0,Temperature_C,Humidity_%,Pressure_hPa,label
count,15000.0,15000.0,15000.0,15000.0
mean,14.411976,78.240782,1007.493054,1.0
std,27.714954,32.080798,18.775821,0.0
min,-87.381783,-46.777477,944.745885,1.0
25%,-7.89256,59.078589,993.400271,1.0
50%,17.15988,81.976676,1006.890324,1.0
75%,35.358283,102.445514,1021.180191,1.0
max,110.352709,174.967706,1070.569261,1.0


In [16]:
global_outs.describe()

Unnamed: 0,Temperature_C,Humidity_%,Pressure_hPa,label
count,15000.0,15000.0,15000.0,15000.0
mean,12.416315,61.632842,1004.285856,1.0
std,16.487459,27.912549,80.792211,0.0
min,-15.995934,14.40473,871.063748,1.0
25%,-2.197589,37.066012,931.967805,1.0
50%,12.78959,61.224846,999.30897,1.0
75%,26.893086,86.359804,1076.699766,1.0
max,40.214058,108.898029,1138.723149,1.0
