# Autoencoder for anomaly detection


## Libraries import

In [41]:
import numpy as np
import tensorflow as tf
import os
import pandas as pd
from tqdm.notebook import tqdm 
from typing import Callable, Dict, List, Set, Tuple
import matplotlib.pyplot as plt
from pathlib import Path
import seaborn
import random
import category_encoders as ce
import sklearn.preprocessing
tfk = tf.keras
tfkl = tfk.layers
from sklearn.metrics import classification_report
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MinMaxScaler, StandardScaler

In [43]:
from utils.preprocessing import labelEncodeCats, remove_categories_not_in_both, remove_outliers, CATEGORICAL_TO_DROP, NUMERICAL_NON_COUNTERS, NUMERICAL_TO_DROP
from utils.preprocessing import (
    encode_counters,
    remove_categories_not_in_both,
    remove_outliers,
    trigonometric_date_encoding,
)

## Random seed for reproducibility

In [44]:
seed = 1234

np.random.seed(seed)
random.seed(seed)
tf.random.set_seed(seed)

## Dataset loading and preprocessing

In [46]:
TRAIN_VAL_DATA_PATH: Path = os.path.join('.', 'train_val_Enc_Counters.parquet')
TEST_DATA_PATH: Path = os.path.join('.', 'test_val_Enc_Counters.parquet')

In [47]:
df: pd.DataFrame = pd.read_parquet(TRAIN_VAL_DATA_PATH) 
df = df.astype({f"f_{i}": "category" for i in range(2, 33)})

test: pd.DataFrame = pd.read_parquet(TEST_DATA_PATH) 
test = test.astype({f"f_{i}": "category" for i in range(2, 33)})

df = df.astype({'f_1': 'int'})
test = test.astype({'f_1': 'int'})

df = df.astype({'is_clicked': 'int'})
df = df.astype({'is_installed': 'int'})

booleans = list(df.select_dtypes(['boolean']).columns)
for i in booleans:
    df[i] = df[i].astype('bool')
    test[i] = test[i].astype('bool')

### Preprocessing

In [48]:
from utils.notebook_utils import collapse_binary

# if do you want to collapse the binary columns set collapse_binary to True
activate_collapse_binary = False

if activate_collapse_binary:
    categorical_columns: List[str] = [f"f_{i}" for i in range(2, 32 + 1)] + ["f_394041", "f_33457"]
else:
    categorical_columns: List[str] = [f"f_{i}" for i in range(2, 32 + 1)]

numerical_columns: List[str] = [f"f_{i}" for i in range(42, 79 + 1)]
categorical_columns = [col for col in categorical_columns if col not in CATEGORICAL_TO_DROP]
numerical_columns = [
    col
    for col in numerical_columns
    if col not in NUMERICAL_TO_DROP and col in NUMERICAL_NON_COUNTERS
]
counter_columns: List[str] = [f"f_{i}" for i in range(42, 79 + 1)]
counter_columns = [
    col
    for col in counter_columns
    if col not in NUMERICAL_TO_DROP and col not in NUMERICAL_NON_COUNTERS
]

def preprocess_data_nn(
    df_train: pd.DataFrame, df_val: pd.DataFrame, Y_train: pd.DataFrame, Y_val: pd.DataFrame
) -> Tuple[pd.DataFrame, pd.DataFrame]:
    
    categorical_columns: List[str] = [f"f_{i}" for i in range(2, 32 + 1)]
    numerical_columns: List[str] = [f"f_{i}" for i in range(42, 79 + 1)]
    boolean_columns: List[str] = [f"f_{i}" for i in range(33, 42)]
    categorical_columns = [col for col in categorical_columns if col not in CATEGORICAL_TO_DROP]
    numerical_columns = [col for col in numerical_columns if col not in NUMERICAL_TO_DROP and col in NUMERICAL_NON_COUNTERS]
    counter_columns: List[str] = [f"f_{i}" for i in range(42, 79 + 1)]
    counter_columns = [col for col in counter_columns if col not in NUMERICAL_TO_DROP and col not in NUMERICAL_NON_COUNTERS]
    
    print("Drop bad columns...")
    df_train = df_train.drop(columns=CATEGORICAL_TO_DROP + NUMERICAL_TO_DROP)
    df_val = df_val.drop(columns=CATEGORICAL_TO_DROP + NUMERICAL_TO_DROP)

    print("Collapsing binary columns...")
    df_train = collapse_binary(df_train, dropOriginal=True)
    df_val = collapse_binary(df_val, dropOriginal=True)

    print("Removes categories not in both...")
    df_train, df_val = remove_categories_not_in_both(df_train, df_val, categorical_columns)
    
    cb_encoder = ce.CatBoostEncoder()
    cb_encoder.fit(df_train[categorical_columns], Y_train)
    df_train[categorical_columns] = cb_encoder.transform(df_train[categorical_columns])
    df_val[categorical_columns] = cb_encoder.transform(df_val[categorical_columns])

    print("Normalizing counter columns...")
    df_train, mins_train, steps_train = encode_counters(
        df=df_train,
        columns=counter_columns,
        mins=None,
        steps=None,
    )
    df_val, _, _ = encode_counters(
        df=df_val,
        columns=counter_columns,
        mins=mins_train,
        steps=steps_train,
    )
    counter_modes: pd.Series = df_train[counter_columns].mode()
    df_train = df_train.fillna(counter_modes)
    df_val = df_val.fillna(counter_modes)
    for col in counter_columns:
        n_zeros: int = (df_train[col] == 0).sum()
        if n_zeros > df_train.shape[0] * 0.95:
            df_train[col] = np.where(df_train[col].values, 1, 0)
            df_train = df_train.astype({col: "bool"})
            boolean_columns.append(col)
            df_val[col] = np.where(df_val[col].values, 1, 0)
            df_val = df_val.astype({col: "bool"})
        else:
            df_train[col] = np.log(df_train[col] + 0.5)
            df_val[col] = np.log(df_val[col] + 0.5)

    print("Removing outliers from numerical columns...")
    means: pd.Series = df_train[numerical_columns].mean()
    stds: pd.Series = df_train[numerical_columns].std()
    df_train = remove_outliers(
        df=df_train,
        columns=numerical_columns,
        coefficient=4,
        means=means,
        stds=stds,
    )
    df_val = remove_outliers(
        df=df_val,
        columns=numerical_columns,
        coefficient=4,
        means=means,
        stds=stds,
    )

    print("Standardizing numerical columns...")
    means_no_outliers: pd.Series = df_train[numerical_columns].mean()
    stds_no_outliers: pd.Series = df_train[numerical_columns].std()
    df_train.loc[:, numerical_columns] = (
        df_train.loc[:, numerical_columns] - means_no_outliers
    ) / stds_no_outliers
    df_val.loc[:, numerical_columns] = (
        df_val.loc[:, numerical_columns] - means_no_outliers
    ) / stds_no_outliers
    df_train = df_train.fillna(means_no_outliers)
    df_val = df_val.fillna(means_no_outliers)

    scaler = MinMaxScaler()
    df_train = scaler.fit_transform(df_train)
    df_val = scaler.transform(df_val)
    
    return df_train, df_val

### Splitting and applying preprocessing

In [49]:
val_day = 65
train_df = df[(df["f_1"] < val_day)]
val_df = df[df["f_1"] >= val_day]

X_train = train_df.drop(columns=["is_clicked", "is_installed"])
y_train = train_df[["is_installed"]]
X_val = val_df.drop(columns=["is_clicked", "is_installed"])
y_val = val_df[["is_installed"]]

X_train, X_val = preprocess_data_nn(X_train, X_val, y_train, y_val)

Drop bad columns...
Collapsing binary columns...
Removes categories not in both...
Normalizing counter columns...
Removing outliers from numerical columns...
Standardizing numerical columns...


In [50]:
def build_model():
    # Define the autoencoder architecture
    n_features = X_train.shape[1]

    input_layer = tfkl.Input(shape=(n_features,))
    x = tfkl.Dense(32, activation='relu', kernel_regularizer=tfk.regularizers.l2(1.1852537578175572e-05))(input_layer)
    x = tfkl.Dense(24, activation='relu', kernel_regularizer=tfk.regularizers.l2(1.1852537578175572e-05))(x)

    x = tfkl.Dense(16, activation='relu', kernel_regularizer=tfk.regularizers.l2(1.1852537578175572e-05))(x)

    x = tfkl.Dense(24, activation='relu', kernel_regularizer=tfk.regularizers.l2(1.1852537578175572e-05))(x)
    x = tfkl.Dense(32, activation='relu', kernel_regularizer=tfk.regularizers.l2(1.1852537578175572e-05))(x)
    output_layer = tfkl.Dense(n_features, activation="sigmoid")(x)

    autoencoder = tfk.models.Model(inputs=input_layer, outputs=output_layer)
    optimizer = tfk.optimizers.Adam(learning_rate=0.0010917112049605858)
    autoencoder.compile(optimizer=optimizer, loss="cosine_similarity")
        
    return autoencoder

In [51]:
autoencoder = build_model()
autoencoder.fit(X_train, 
                X_train, 
                epochs=1, 
                batch_size=64,
                validation_data=(X_val, X_val),
                shuffle=True
                ) 



<keras.callbacks.History at 0x7e65ca511000>

## Testing the experiment
We start by computing the similarity/distance over training's normal samples and we compare it with the same metric over training's abnormal samples. Ideally, the last one should be larger

### Cosine similarity

Over unseen normal samples

In [52]:
def analyze_similarities(df_original, df_reconstructed):
    similarities = []
    for row in tqdm(range(len(df_reconstructed))):
        original_row = df_original[row].reshape(1, -1)
        reconstructed_row = df_reconstructed[row].reshape(1, -1)
        similarities.append(cosine_similarity(original_row, reconstructed_row))
    return np.mean(similarities), np.std(similarities)

In [58]:
analyze_similarities(X_val[y_val["is_installed"] == 0], autoencoder.predict(X_val[y_val["is_installed"] == 0]))



  0%|          | 0/205022 [00:00<?, ?it/s]

(0.9836996324052211, 0.014908229002202956)

Over unseen abnormal samples

In [59]:
analyze_similarities(X_val[y_val["is_installed"] == 1], autoencoder.predict(X_val[y_val["is_installed"] == 1]))



  0%|          | 0/39961 [00:00<?, ?it/s]

(0.9853235961976019, 0.01263236184981765)