## Anomaly Detection in Financial Accounting

We will use financial accounting data extracted from an SAP ERP system.

In [1]:
import pandas as pd 
import numpy as np
import pandas_bokeh
from sklearn.preprocessing import StandardScaler,MinMaxScaler
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Read the fraud dataset CSV directly from the GitiHubi/deepAI GitHub repository.
DATA_URL = 'https://raw.githubusercontent.com/GitiHubi/deepAI/master/data/fraud_dataset_v2.csv'
df = pd.read_csv(DATA_URL)

In [3]:
# Preview the first five rows of the data as loaded.
df.head(n=5)

Unnamed: 0,BELNR,WAERS,BUKRS,KTOSL,PRCTR,BSCHL,HKONT,DMBTR,WRBTR,label
0,288203,C3,C31,C9,C92,A3,B1,280979.6,0.0,regular
1,324441,C1,C18,C7,C76,A1,B2,129856.53,243343.0,regular
2,133537,C1,C19,C2,C20,A1,B3,957463.97,3183838.41,regular
3,331521,C4,C48,C9,C95,A2,B1,2681709.51,28778.0,regular
4,375333,C5,C58,C1,C19,A3,B1,910514.49,346.0,regular


In [4]:
# Configure Pandas-Bokeh to render its plots inline in the notebook output.
pandas_bokeh.output_notebook()


In [5]:
# Overlaid ("top-on-top") histogram of df as loaded, using 30 bins.
# The previous title ("Normal distributions ...") was left over from example
# code and did not describe this data, so it now states what is plotted.
df.plot_bokeh.hist(
    bins=30,
    vertical_xlabel=True,
    hovertool=False,
    title="Raw value distributions (Top-on-Top)",
    line_color="black")

# Other layouts: pass histogram_type="sidebyside" or histogram_type="stacked"
# (the same plots are also reachable via df.plot_bokeh(kind="hist", ...)).

In [6]:
# Standardize the two amount columns and write the result back into df.
# NOTE(review): the next cell min-max scales the same columns; because min-max
# scaling is unaffected by a prior positive affine rescaling, this step does
# not change the final values (up to floating-point error) - confirm it is needed.
amount_cols = ['DMBTR', 'WRBTR']
sc = StandardScaler()
sc.fit(df[amount_cols])
df[amount_cols] = sc.transform(df[amount_cols])

In [7]:
# Rescale the two amount columns with MinMaxScaler's default settings and
# overwrite them in df.
amount_cols = ['DMBTR', 'WRBTR']
minmax = MinMaxScaler()
minmax.fit(df[amount_cols])
df[amount_cols] = minmax.transform(df[amount_cols])

In [8]:
# Preview the first five rows again to check the rescaled DMBTR / WRBTR values.
df.head(n=5)

Unnamed: 0,BELNR,WAERS,BUKRS,KTOSL,PRCTR,BSCHL,HKONT,DMBTR,WRBTR,label
0,288203,C3,C31,C9,C92,A3,B1,0.003039,0.0,regular
1,324441,C1,C18,C7,C76,A1,B2,0.001405,0.004084,regular
2,133537,C1,C19,C2,C20,A1,B3,0.010357,0.053433,regular
3,331521,C4,C48,C9,C95,A2,B1,0.029009,0.000483,regular
4,375333,C5,C58,C1,C19,A3,B1,0.009849,6e-06,regular


In [9]:
# Overlaid ("top-on-top") histogram of the DMBTR and WRBTR columns as they
# currently stand in df (rescaled by the scaling cells when run in order).
# The previous title ("Normal distributions ...") was left over from example
# code and did not describe this data.
df[['DMBTR','WRBTR']].plot_bokeh.hist(
    bins=30,
    vertical_xlabel=True,
    hovertool=False,
    title="Scaled DMBTR and WRBTR distributions (Top-on-Top)",
    line_color="black")

In [10]:
# g = sns.pairplot(data=df[['DMBTR','WRBTR','label']], vars=['DMBTR','WRBTR'], hue='label')
# g.fig.suptitle('Distribution of DMBTR vs. WRBTR amount values')
# g.fig.set_size_inches(15, 5)

In [11]:
# plt.show()

In [12]:
# One-hot encode the categorical columns; every column not listed here
# (BELNR, DMBTR, WRBTR, label) is carried over unchanged.
categorical_cols = ['WAERS', 'BUKRS', 'KTOSL', 'PRCTR', 'HKONT', 'BSCHL']
categorical_and_numericals = pd.get_dummies(data=df, columns=categorical_cols)

In [13]:
# Preview the first five rows of the one-hot encoded frame.
categorical_and_numericals.head(n=5)

Unnamed: 0,BELNR,DMBTR,WRBTR,label,WAERS_B00,WAERS_B31,WAERS_B39,WAERS_C1,WAERS_C2,WAERS_C3,...,BSCHL_W57,BSCHL_W61,BSCHL_W62,BSCHL_W84,BSCHL_Y19,BSCHL_Y42,BSCHL_Y59,BSCHL_Y78,BSCHL_Z68,BSCHL_Z74
0,288203,0.003039,0.0,regular,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
1,324441,0.001405,0.004084,regular,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,133537,0.010357,0.053433,regular,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3,331521,0.029009,0.000483,regular,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,375333,0.009849,6e-06,regular,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [14]:
# Remove the BELNR and label columns so that only the two amount columns and
# the one-hot columns remain. drop(..., errors='ignore') is used instead of
# pop(): pop() raises KeyError when this cell is re-run, because the columns
# are already gone; drop() makes the cell safe to run repeatedly.
# NOTE(review): the label values are not kept in this frame; they remain in df.
categorical_and_numericals = categorical_and_numericals.drop(
    columns=['BELNR', 'label'], errors='ignore')
categorical_and_numericals.shape

(533009, 618)

In [15]:
# Preview the first five rows after BELNR and label have been removed.
categorical_and_numericals.head(n=5)

Unnamed: 0,DMBTR,WRBTR,WAERS_B00,WAERS_B31,WAERS_B39,WAERS_C1,WAERS_C2,WAERS_C3,WAERS_C4,WAERS_C5,...,BSCHL_W57,BSCHL_W61,BSCHL_W62,BSCHL_W84,BSCHL_Y19,BSCHL_Y42,BSCHL_Y59,BSCHL_Y78,BSCHL_Z68,BSCHL_Z74
0,0.003039,0.0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0.001405,0.004084,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0.010357,0.053433,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0.029009,0.000483,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
4,0.009849,6e-06,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0


In [20]:
# Write the frame to CSV without the index column. The path is relative, so it
# is resolved against the notebook's current working directory.
features_csv_path = '../Data/categorical_and_numericals.csv'
categorical_and_numericals.to_csv(features_csv_path, index=False)

In [17]:
# Show the current working directory, i.e. the directory that relative paths
# such as '../Data/...' are resolved against.
import os
os.getcwd()

'c:\\Users\\somya\\Documents\\DSprojects\\Deep_Learning_with_PyTorch\\Anomaly_Detection_Financial_Accounting\\Notebooks'