![](https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcRRI9EVsRlf_k35o-qloKvsvMWpdhMJ4aFTjA&usqp=CAU)pt.slideshare.net

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
input_root = '/kaggle/input'
for folder, _, names in os.walk(input_root):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

#Code by Sheik Mohamed Imran https://www.kaggle.com/imrandude/h2o-autoencoders-and-anomaly-detection-python/notebook

Anomaly detection with H2O in Python

In [None]:
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
import h2o
from sklearn.preprocessing import StandardScaler,MinMaxScaler,RobustScaler,Normalizer
from h2o.estimators.deeplearning import H2OAutoEncoderEstimator
from pylab import rcParams
# Default size (width, height) applied to every matplotlib figure below.
rcParams['figure.figsize'] = (15, 10)

In [None]:
# Read at most `nRowsRead` rows of the semicolon-delimited german.csv file.
nRowsRead = 1000  # specify 'None' if want to read whole file
df = pd.read_csv(
    '../input/cusersmarildownloadsgermancsv/german.csv',
    delimiter=';',
    encoding="ISO-8859-2",
    nrows=nRowsRead,
)

# Ad-hoc label stored as a plain attribute on the frame object.
df.dataframeName = 'german.csv'

# Report the size of what was loaded, then preview the first rows.
nRow = df.shape[0]
nCol = df.shape[1]
print(f'There are {nRow} rows and {nCol} columns')
df.head()

#List column

H2O cannot use columns with a character datatype, so such columns would normally be converted to dummy variables. In this dataset all columns are already numerical, so no encoding is needed here.

In [None]:
#cols_to_transform = [ 'continue_drop','gender','caste','guardian','internet' ]
#df = pd.get_dummies( df,columns = cols_to_transform )
#df.head()

In [None]:
# Positional indices (0 through 14) of the columns passed to the model as inputs.
predictors = [column_index for column_index in range(15)]

#Standardize input data.

In [None]:
# Names of the columns to rescale, listed explicitly.
col_names = [
    'Creditability', 'Account_Balance', 'Duration_of_Credit_monthly',
    'Payment_Status_of_Previous_Credit', 'Purpose', 'Credit_Amount',
    'Value_Savings_Stocks', 'Length_of_current_employment',
    'Instalment_per_cent', 'Sex_Marital_Status', 'Guarantors',
    'Duration_in_Current_address', 'Most_valuable_available_asset',
    'Age_years', 'Concurrent_Credits', 'Type_of_apartment',
    'No_of_Credits_at_this_Bank', 'Occupation', 'No_of_dependents',
    'Telephone', 'Foreign_Worker',
]

# Work on a copy so that `df` keeps the raw, unscaled values.
scaled_features = df.copy()

# Run the selected columns through RobustScaler. Note that `scaler` ends up
# holding the transformed values returned by fit_transform, not the fitted
# scaler object itself.
features = scaled_features.loc[:, col_names]
scaler = RobustScaler().fit_transform(features.to_numpy())

# Wrap the transformed values in a frame aligned with df's index and
# re-assign them to the corresponding columns of the copy.
features = pd.DataFrame(scaler, index=df.index, columns=col_names)
scaled_features[col_names] = features
scaled_features.head()

#Split dataset - rows where Type of apartment equals 1 become 'train' and rows where Duration in Current address equals 1 become 'test'

In [None]:
#df = df.astype(object)

# Select the train/test rows using the raw category codes stored in `df`.
# The values in `scaled_features` have been transformed by RobustScaler, so
# they are no longer the original codes; comparing them with 1 would pick the
# wrong rows (or none at all). `scaled_features` was built from a copy of `df`
# and shares its index, so the boolean masks line up row for row.
train = scaled_features.loc[df['Type_of_apartment'] == 1]
test = scaled_features.loc[df['Duration_in_Current_address'] == 1]

#H2O Autoencoding and Anomaly detection

Starting H2O cluster

In [None]:
# Start (or connect to) the H2O cluster with the options below.
h2o.init(
    nthreads=-1,
    enable_assertions=False,
)

#Convert pandas dataframe to H2O dataframe

In [None]:
# Convert each pandas frame to an H2OFrame and keep it as a `.hex` attribute
# on the pandas object; later cells read `train.hex` / `test.hex`.
train_h2o = h2o.H2OFrame(train)
train.hex = train_h2o
test_h2o = h2o.H2OFrame(test)
test.hex = test_h2o

#Create AutoEncoder Model

In [None]:
# Fixed seed so that a Restart & Run All gives the same model and the same
# reconstruction errors every time.
SEED = 42

# Autoencoder with one Tanh hidden layer of 120 units, trained for 100 epochs.
# `seed` pins the random initialisation; `reproducible=True` asks H2O to train
# deterministically (slower, but the dataset here is small).
model = H2OAutoEncoderEstimator(
    activation="Tanh",
    hidden=[120],
    ignore_const_cols=False,
    epochs=100,
    seed=SEED,
    reproducible=True,
)

#Train the model with training dataset

In [None]:
# Fit the autoencoder on the training H2OFrame, using the columns whose
# positional indices are listed in `predictors` as inputs.
model.train(
    x=predictors,
    training_frame=train.hex,
)

#Print the output in JSON format

In [None]:
# Pull the 'output' section out of the model's JSON description and display it.
model_output = model._model_json['output']
model_output

#Get anomalous values

In [None]:
#test_rec_error=model.anomaly(test.hex)
# Score the training frame with the trained autoencoder; the result is later
# converted to pandas and read through its 'Reconstruction.MSE' column.
training_frame = train.hex
train_rec_error = model.anomaly(training_frame)

#Convert output to dataframe

In [None]:
#test_rec_error_df=test_rec_error.as_data_frame()
# Bring the training reconstruction errors into pandas.
train_rec_error_df = train_rec_error.as_data_frame()

# Only the training errors are available (the test branch above is disabled),
# so use them once. Concatenating the frame with itself doubled every row,
# duplicated the index labels and shifted the interpolated quantiles computed
# from `final`. The copy keeps `final` independent of later in-place edits to
# `train_rec_error_df`.
final = train_rec_error_df.copy()

#Calculate top whisker value

In [None]:
# Upper box-plot whisker of the reconstruction errors: Q3 + 1.5 * IQR.
boxplotEdges = final.quantile(.75)  # Series of 75th percentiles, one entry per column

# Inter-quartile range: 75th percentile minus 25th percentile.
iqr = np.subtract(*np.percentile(final, [75, 25]))

# `boxplotEdges` is indexed by column label, so take the first entry by
# position with .iloc; plain `[0]` relies on integer-as-position fallback,
# which is deprecated in recent pandas.
top_whisker = boxplotEdges.iloc[0] + (1.5 * iqr)
top_whisker

#Add id column to dataframe

In [None]:
# Store each row's index label in a 'Telephone' column; the scatter plot uses
# this column as its x coordinate.
row_ids = train_rec_error_df.index
train_rec_error_df['Telephone'] = row_ids
#test_rec_error_df['']=test_rec_error_df.index + 18200 #Count of train data

#Scatter plot with top whisker

In [None]:
# Reconstruction error of every training row, plotted against its row id.
plt.scatter(
    train_rec_error_df['Telephone'],
    train_rec_error_df['Reconstruction.MSE'],
    label='Continued df',
    s=1,
)

# Vertical line marking the end of the training rows. The previous hard-coded
# x=18200 was the training-row count of a different dataset; with the rows
# loaded here it stretched the x-axis and squashed the points to the left.
plt.axvline(x=len(train_rec_error_df), linewidth=1)
#plt.scatter(test_rec_error_df['Telephone'],test_rec_error_df['Reconstruction.MSE'],label='Dropped df',s=1)

# Horizontal red line at the top whisker: points above it are the outliers.
plt.axhline(y=top_whisker, linewidth=1, color='r')

# Axis labels so the figure can be read on its own.
plt.xlabel('Row id')
plt.ylabel('Reconstruction.MSE')
plt.legend()

#Output:

We trained the model. From the graph, can we see all the records that have been classified as outliers (the points above the red top-whisker line)?
Can we?

In [None]:
# Shut down the H2O cluster now that the analysis is finished.
cluster = h2o.cluster()
cluster.shutdown()