# **Anomalies**

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats
from sklearn.decomposition import PCA
import numpy as np

In [None]:
# Dataset: read the Online Retail workbook into a DataFrame

workbook_path = 'Online Retail.xlsx'
df = pd.read_excel(workbook_path, engine='openpyxl')

In [None]:
# Slicing: first 500 rows, restricted to the three columns analysed below

selected_columns = ['Quantity', 'UnitPrice', 'CustomerID']
data = df.head(500).loc[:, selected_columns]

In [None]:
# Pairplot: pairwise relationships between every column of the sliced data

sns.pairplot(data=data)

In [None]:
# Boxplot: distribution of the two transaction-level columns

quantity_and_price = data[['Quantity', 'UnitPrice']]

plt.figure(figsize=(8, 6))
sns.boxplot(data=quantity_and_price)
plt.title('Boxplot')
plt.show()

In [None]:
# Decomposition: project the sliced data onto its first two principal components
# NOTE(review): the columns are passed to PCA unscaled and include CustomerID;
# PCA is scale-sensitive, so confirm this is intended.

pca = PCA(n_components=2)
components = pca.fit_transform(data)
df_pca = pd.DataFrame(components, columns=['C1', 'C2'])

In [None]:
# Scatterplot: first principal component against the second.
# Passing the frame alone (wide-form) makes seaborn plot every column against
# the row index; naming x and y explicitly gives the intended C1-vs-C2 view.

plt.figure(figsize=(8, 6))
sns.scatterplot(data=df_pca, x='C1', y='C2')
plt.title('Scatter plot after PCA')
plt.show()

In [None]:
# Boxplot: distribution of each principal component

plt.figure(figsize=(8, 6))
sns.boxplot(data=df_pca)
# Title corrected: this figure is a boxplot, not the scatter plot from the previous cell.
plt.title('Boxplot after PCA')
plt.show()

In [None]:
# Anomalies: standardise each component and keep the points with |z| > 3

z_score1, z_score2 = (stats.zscore(df_pca[component]) for component in ('C1', 'C2'))

anomalies1 = df_pca.loc[np.abs(z_score1) > 3, 'C1']
anomalies2 = df_pca.loc[np.abs(z_score2) > 3, 'C2']

In [None]:
# Plot the z scores: one panel per principal component, side by side

fig, ax = plt.subplots(1, 2, figsize=(8, 6))

panels = zip(ax, (z_score1, z_score2), ('Z-score of C1', 'Z-score of C2'))
for panel, scores, panel_title in panels:
    panel.plot(scores)
    panel.set_title(panel_title)

plt.show()


In [None]:
# Plot anomalies of C1: all points on a single axis, flagged points in red,
# and dashed lines at mean ± threshold standard deviations.

threshold = 3
c1_values = df_pca["C1"]
c1_mean = c1_values.mean()
# scipy.stats.zscore defaults to ddof=0 while pandas .std() defaults to ddof=1;
# use ddof=0 so the drawn lines sit exactly at the cut-off used to flag anomalies.
c1_std = c1_values.std(ddof=0)

plt.figure(figsize=(8, 6))
plt.scatter(
    c1_values,
    np.zeros_like(c1_values),
    color="blue",
    label="Normal Data",
    alpha=0.5,
)
plt.scatter(
    anomalies1,
    np.zeros_like(anomalies1),
    color="red",
    label="Anomalies",
    s=100,
)
plt.axvline(
    x=c1_mean + threshold * c1_std,
    color="orange",
    linestyle="--",
    label="Threshold",
)
# Lower bound is left unlabelled so the legend shows a single "Threshold" entry.
plt.axvline(
    x=c1_mean - threshold * c1_std,
    color="orange",
    linestyle="--",
)
plt.title("Anomaly Detection using Z-Scores")
plt.xlabel("Feature 1")
plt.yticks([])  # the y position carries no information in this 1-D view
plt.legend()
plt.show()

In [None]:
# Plot anomalies of C2: all points on a single axis, flagged points in red,
# and dashed lines at mean ± threshold standard deviations.

threshold = 3
c2_values = df_pca["C2"]
c2_mean = c2_values.mean()
# scipy.stats.zscore defaults to ddof=0 while pandas .std() defaults to ddof=1;
# use ddof=0 so the drawn lines sit exactly at the cut-off used to flag anomalies.
c2_std = c2_values.std(ddof=0)

plt.figure(figsize=(8, 6))
plt.scatter(
    c2_values,
    np.zeros_like(c2_values),
    color="blue",
    label="Normal Data",
    alpha=0.5,
)
plt.scatter(
    anomalies2,
    np.zeros_like(anomalies2),
    color="red",
    label="Anomalies",
    s=100,
)
plt.axvline(
    x=c2_mean + threshold * c2_std,
    color="orange",
    linestyle="--",
    label="Threshold",
)
# Lower bound is left unlabelled so the legend shows a single "Threshold" entry.
plt.axvline(
    x=c2_mean - threshold * c2_std,
    color="orange",
    linestyle="--",
)
plt.title("Anomaly Detection using Z-Scores")
# Label corrected: this axis shows the second component, not the first.
plt.xlabel("Feature 2")
plt.yticks([])  # the y position carries no information in this 1-D view
plt.legend()
plt.show()