# Tutorial 1 - Autoencoder

We will build an ANOMALY detector. We will use the airbnb data set

**The unit of analysis is a single housing district**

**We will train an autoencoder on the `inland` districts and consider them the "normal" data. Then, we will reconstruct the `near ocean` districts to see if we can identify them as anomalies.**

I already created two files:<br>
`inland.csv`: includes only the inland districts<br>
`near ocean.csv`: includes only the near ocean districts

# Setup

In [None]:
# Common imports
import numpy as np
import pandas as pd

# Seed value for reproducibility; pass it to any step that accepts a random_state
random_state=42

# Get the data

In [None]:
# inland.csv holds the districts treated as "normal";
# near ocean.csv holds the districts we will try to flag as anomalies.
inland, ocean = (
    pd.read_csv(csv_name) for csv_name in ("inland.csv", "near ocean.csv")
)


# Data Prep

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder

from sklearn.preprocessing import FunctionTransformer

##  Identify the numerical and categorical columns

In [None]:
# Inspect each column's dtype to see which are numeric and which are text
inland.dtypes

In [None]:
# Split the inland columns by dtype: numeric columns feed the scaling
# pipeline, object (text) columns are listed separately.
numeric_columns = list(inland.select_dtypes(include=[np.number]).columns)
categorical_columns = list(inland.select_dtypes(include='object').columns)

In [None]:
# Display the names of the columns detected as numeric
numeric_columns

In [None]:
# Display the names of the columns detected as object (text) dtype
categorical_columns

# Pipeline

In [None]:
# Numeric preprocessing, applied in order:
#   1. 'imputer' - replace missing values with the column median
#   2. 'scaler'  - standardize each column
numeric_transformer = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler()),
    ]
)

In [None]:
# Apply the numeric pipeline to the numeric columns.
# remainder='passthrough' forwards every column not listed above unchanged;
# it is optional and can be omitted.
preprocessor = ColumnTransformer(
    transformers=[('num', numeric_transformer, numeric_columns)],
    remainder='passthrough',
)

# Transform: fit_transform() for NORMAL data

In [None]:
# Fit the preprocessor on the inland ("normal") districts and transform them.
# Only this call fits; the near-ocean data is later passed through transform().
normal_x = preprocessor.fit_transform(inland)

# Display the transformed result
normal_x

In [None]:
# (rows, columns) of the transformed normal data
normal_x.shape

# Transform: transform() for ANOMALOUS DATA

In [None]:
# Transform the near-ocean ("anomalous") districts with the already-fitted
# preprocessor: transform() only, so nothing is re-fitted on this data.
anomaly_x = preprocessor.transform(ocean)

# Display the transformed result
anomaly_x

In [None]:
# (rows, columns) of the transformed anomalous data
anomaly_x.shape

# Autoencoder

In [None]:
import tensorflow as tf
from tensorflow import keras

In [None]:
# Take the input/output width from the preprocessed data instead of
# hard-coding it, so the network still matches if the column count changes.
n_features = normal_x.shape[1]

model = keras.models.Sequential()

# Encoder: narrow the input down to a 7-unit bottleneck
model.add(keras.layers.InputLayer(input_shape=(n_features,)))
model.add(keras.layers.Dense(8, activation='relu'))
model.add(keras.layers.Dense(7, activation='relu'))

# Decoder: widen back to the input width. The output layer is linear
# (activation=None) because standardized values can be negative.
model.add(keras.layers.Dense(8, activation='relu'))
model.add(keras.layers.Dense(n_features, activation=None))

model.summary()

In [None]:
# Optimizer with an explicit learning rate
adam = keras.optimizers.Adam(learning_rate=0.001)

# Pass the configured optimizer instance: a string name would build a fresh
# default optimizer and silently ignore the learning rate set above.
# The metric is kept so that evaluate() returns [loss, mse]; later cells
# index the result with [0].
model.compile(loss='mse', optimizer=adam, metrics=['mean_squared_error'])

In [None]:
from tensorflow.keras.callbacks import EarlyStopping

# Stop once val_loss has not improved for 5 consecutive epochs.
# restore_best_weights=True rolls the model back to the best epoch; without
# it the model keeps the weights from 5 epochs *after* the best one.
earlystop = EarlyStopping(
    monitor='val_loss',
    patience=5,
    verbose=1,
    mode='auto',
    restore_best_weights=True,
)

# Keras expects a list of callbacks
callback = [earlystop]

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the normal data for validation. Validating on the same rows
# the model trains on would make val_loss track the training loss, so early
# stopping could never detect overfitting.
train_x, val_x = train_test_split(
    normal_x, test_size=0.2, random_state=random_state
)

# Be careful: an autoencoder learns to reproduce its input, so the same array
# is passed as both input and target.
model.fit(train_x, train_x,
          validation_data=(val_x, val_x),
          epochs=100, batch_size=100, callbacks=callback)

### Check the average MSE on the "normal" data

In [None]:
# Reconstruction error on the normal data (the input doubles as the target)
model.evaluate(normal_x, normal_x)

In [None]:
# The first value returned by evaluate() is the loss; scale it by 100 so the
# small error term is easier to read.
normal_scores = model.evaluate(normal_x, normal_x)
normal_scores[0] * 100

### Check the average MSE on the "anomalous" data

In [None]:
# Reconstruction error on the anomalous data (the input doubles as the target)
model.evaluate(anomaly_x, anomaly_x)

In [None]:
# The first value returned by evaluate() is the loss; scale it by 100 so the
# small error term is easier to read.
anomaly_scores = model.evaluate(anomaly_x, anomaly_x)
anomaly_scores[0] * 100

## Predict first 20 in normal data

In [None]:
from sklearn.metrics import mean_squared_error

# Reconstruct the first 20 normal rows with a single batched predict() call
# instead of 20 one-row calls. Slicing also copes with fewer than 20 rows.
normal_sample = normal_x[:20]
normal_reconstruction = model.predict(normal_sample)

# Print one reconstruction MSE per row, multiplied by 100 to make the small
# numbers easier to read.
for actual_row, rebuilt_row in zip(normal_sample, normal_reconstruction):
    print(mean_squared_error(actual_row, rebuilt_row) * 100)

## Predict first 20 in anomalous data


In [None]:
# Reconstruct the first 20 anomalous rows with a single batched predict() call
# instead of 20 one-row calls. Slicing also copes with fewer than 20 rows.
anomaly_sample = anomaly_x[:20]
anomaly_reconstruction = model.predict(anomaly_sample)

# Print one reconstruction MSE per row, multiplied by 100 to make the small
# numbers easier to read.
for actual_row, rebuilt_row in zip(anomaly_sample, anomaly_reconstruction):
    print(mean_squared_error(actual_row, rebuilt_row) * 100)