In [None]:
# Environment setup
# %pip install openpyxl
# %pip install pyod
# %pip install -U kaleido

In [16]:
# Find specific version
# %pip freeze | grep openpyxl

Note: you may need to restart the kernel to use updated packages.


In [None]:
class Config:
    """Fixed settings for one analysis run: input/output locations, company and display flag."""

    def __init__(self):
        # The input workbook is read from <input_folder>/<company_code>/<input_file>.
        self.input_folder, self.input_file = "./Companies", "Records.xlsx"
        # Results are written below <output_folder>/<company_code>.
        self.output_folder = "./analysis_results"
        self.company_code = "CompanyA"
        # When True the figures are displayed in addition to being saved.
        self.showgraph = True


# Single settings object read by the cells below.
args = Config()

In [None]:
import pandas as pd
import numpy as np
from os import path

In [None]:
# Workbook location: <input_folder>/<company_code>/<input_file>
excel_file_path = path.join(
    args.input_folder,
    args.company_code,
    args.input_file,
)
print(excel_file_path)  # echo the resolved path before reading it

# Load the records into a DataFrame using the openpyxl reader.
df = pd.read_excel(io=excel_file_path, engine="openpyxl")

In [None]:
# Preview the first five rows of the freshly loaded records.
df.head(n=5)

In [None]:
# Feature matrix for the detectors: every column except "Date".
# drop() returns a new frame, so `df` itself keeps its "Date" column.
df_no_date = df.drop(columns=["Date"])

In [None]:
from pyod.models.pca import PCA
from pyod.models.lof import LOF
from pyod.models.abod import ABOD

In [None]:
# Detector registry: result-column name -> unfitted PyOD detector.
# All three detectors are configured with contamination=0.1.
models = dict(
    pca=PCA(n_components=3, contamination=0.1),
    lof=LOF(contamination=0.1),
    abod=ABOD(contamination=0.1),
)

In [None]:
# List the configured detectors, numbered from 1.
for position, detector_name in enumerate(models, start=1):
    print("Model", position, detector_name)

In [None]:
# Fit every detector on the feature matrix and store its per-row label in a
# new column of `df` named after the detector (PyOD: 0 = inlier, 1 = outlier).
for i, (clf_name, clf) in enumerate(models.items(), start=1):
    print(i, "fitting", clf_name)
    clf.fit(df_no_date)
    # Use the labels PyOD computed for the training rows during fit().
    # predict() is intended for unseen rows; neighbour-based detectors such as
    # LOF and ABOD may score the training rows differently when they are passed
    # back in, because each row can then be matched with itself.
    outliers = clf.labels_
    df[clf_name] = outliers

In [None]:
# Preview the first five rows again, now including one label column per detector.
df.head(n=5)

In [None]:
from pathlib import Path

# Results directory for this company: <output_folder>/<company_code>.
outputs_folder = path.join(
    args.output_folder,
    args.company_code,
)
# Create it together with any missing parents; an existing directory is fine.
Path(outputs_folder).mkdir(exist_ok=True, parents=True)

In [None]:
# Destination workbook for the records plus the detector label columns.
excel_outputfile_path = path.join(outputs_folder, "outlier_records.xlsx")
print(excel_outputfile_path)  # echo the destination path

# Write the labelled frame (the index is written as well, as by default).
df.to_excel(excel_writer=excel_outputfile_path)

In [None]:
# Create feature correlations plot
import plotly
import plotly.figure_factory as ff

# Pairwise correlations of the feature columns, rounded for the cell annotations.
corr_matrix = df_no_date.corr().round(2)
# Take the axis labels from the matrix itself so they always line up with its
# rows and columns, even if corr() leaves a column out.
features = list(corr_matrix.columns)

fig = ff.create_annotated_heatmap(
    np.array(corr_matrix),
    colorscale="Viridis",
    x=features,
    y=features,
    hoverongaps=True,
)

fig.update_layout(
    paper_bgcolor="white",
    width=1200,
    height=1200,
    # `titlefont` is a deprecated layout attribute; the title font is set
    # through `title.font` instead.
    title=dict(text="Features correlation plot", font=dict(size=25)),
)
fig.update_xaxes(tickangle=90, side="bottom")

if args.showgraph:
    fig.show()

# Static PNG export (needs the kaleido package listed in the setup cell).
fig.write_image(path.join(outputs_folder, "correlation_plot.png"))
# Interactive HTML export. plotly.offline.plot opens the file in a browser by
# default, so tie that to the same flag that controls on-screen display.
plotly.offline.plot(
    fig,
    filename=path.join(outputs_folder, "correlation_plot.html"),
    auto_open=args.showgraph,
)

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Pair plot of all feature columns.
# https://doobzncoobz.com/seaborn-pairplot/
# sns.pairplot() builds its own figure, so no plt.figure() call is made first
# (that would only leave an extra, empty figure behind).
sns_plot = sns.pairplot(df_no_date)
sns_plot.fig.set_size_inches(15, 15)
sns_plot.fig.suptitle("Pair plot", y=1.01, size=30)

# Write the PNG first so the file never depends on the figure's state after show().
sns_plot.savefig(path.join(outputs_folder, "sns_pairplot.png"))

if args.showgraph:
    plt.show()
else:
    # Not displaying: close the figure so it is neither kept open in memory
    # nor rendered automatically at the end of the cell.
    plt.close(sns_plot.fig)

In [None]:
import plotly.express as px

# Plot the labels of the first configured detector.
outlier_column = next(iter(models))

# One trace is produced per distinct value of the label column.
fig = px.scatter_3d(data_frame=df, x="ft01", y="ft02", z="ft03", symbol=outlier_column)

fig.update_layout(
    margin=dict(l=30, r=30, b=30, t=30),
    autosize=False,
    width=1000,
    height=1000,
    showlegend=False,
    title={
        "text": f"Outlier Plot ({outlier_column})",
        "y": 0.91,
        "x": 0.5,
        "xanchor": "center",
        "yanchor": "top",
    },
)

# Draw outliers as red X marks and normal entries as green circles.
# Plotly Express hands out its default symbols in the order the label values
# first appear in the data, so the default symbol says nothing reliable about
# which trace holds the outliers. Each trace is named after the label value it
# contains, and the detectors label outliers as 1, so select by name instead.
for trace in fig.data:
    if trace.name == "1":
        trace.marker.symbol = "x"
        trace.marker.color = "red"
    else:
        trace.marker.symbol = "circle"
        trace.marker.color = "green"

if args.showgraph:
    fig.show()

# Static PNG export (needs the kaleido package listed in the setup cell).
fig.write_image(path.join(outputs_folder, "outlier_plot.png"))
# Interactive HTML export. plotly.offline.plot opens the file in a browser by
# default, so tie that to the same flag that controls on-screen display.
plotly.offline.plot(
    fig,
    filename=path.join(outputs_folder, "outlier_plot.html"),
    auto_open=args.showgraph,
)