In [None]:
import pandas as pd
from dotenv import load_dotenv
from pathlib import Path
import os

In [None]:
# Load the project's environment variables from the ../.env file.
dotenv_path = Path('../.env')
load_dotenv(dotenv_path=dotenv_path)

# OUTPUT_DIRECTORY is the root folder that the feature directory is built from.
# Fail fast with a clear message when it is missing; otherwise the None returned
# by os.getenv only surfaces later as an opaque TypeError inside os.path.join.
data_directory = os.getenv("OUTPUT_DIRECTORY")
if not data_directory:
    raise RuntimeError(
        f"OUTPUT_DIRECTORY is not set; define it in {dotenv_path} or in the environment"
    )

In [None]:
# Feature files are read from the "features" subfolder of the output directory.
feature_directory = os.path.join(data_directory, "features")

In [None]:
# Show which files are available in the feature directory.
os.listdir(feature_directory)

In [None]:
# Read the feature parquet file and rename its "month" column to "year_month".
feature_path = os.path.join(feature_directory, "ssi_lidl_spacy_nl_md_features.parquet")
feature_df = (
    pd.read_parquet(feature_path, engine="pyarrow")
    .rename(columns={"month": "year_month"})
)
feature_df.head()

In [None]:
# Derive the year from the first four characters of "year_month".
feature_df["year"] = feature_df["year_month"].str.slice(stop=4)
feature_df.head()

In [None]:
# Distinct years present in the data, in order of first appearance.
unique_years = feature_df["year"].unique()
unique_years

In [None]:
# Training set: the feature vectors of all 2019 rows, as a plain Python list.
is_2019 = feature_df["year"] == "2019"
X_train = feature_df.loc[is_2019, "features_spacy_nl_md"].tolist()
# Peek at the first vector only.
X_train[:1]

In [None]:
from sklearn.neighbors import LocalOutlierFactor

# Local Outlier Factor with novelty=True, fitted on the 2019 vectors only;
# the fitted model is used further down to predict on and score the 2020 vectors.
lof_params = {"n_neighbors": 20, "novelty": True, "contamination": 0.1}
clf = LocalOutlierFactor(**lof_params)
clf.fit(X_train)

In [None]:
# Test set: the feature vectors of all 2020 rows, as a plain Python list.
is_2020 = feature_df["year"] == "2020"
X_test = feature_df.loc[is_2020, "features_spacy_nl_md"].tolist()
# Peek at the first vector only.
X_test[:1]

In [None]:
# Label every 2020 sample with the fitted LOF model, then count how many
# received the label -1.
y_pred_test = clf.predict(X_test)
n_error_test = int((y_pred_test == -1).sum())

In [None]:
# Display the shape of the prediction array next to the number of -1 labels.
y_pred_test.shape, n_error_test

In [None]:
import numpy as np

# Stack the 2019 and 2020 vectors into one array, training rows first.
X_train_test = np.array([*X_train, *X_test])

In [None]:
from sklearn.manifold import TSNE

# Fix the seed so the stochastic t-SNE embedding is reproducible across runs.
# No fit() call here: the embedding is produced by fit_transform further down,
# which refits from scratch, so a separate fit would only be wasted work.
tsne = TSNE(random_state=42)

In [None]:
# t-SNE has no transform(): every fit_transform call builds its own, unrelated
# coordinate system. Embed train and test together so both sets share one map
# and can be drawn on the same axes, then split by position (X_train_test holds
# the training rows first, followed by the test rows).
X_train_test_embedded = tsne.fit_transform(X_train_test)
n_train = len(X_train)
X_train_embedded = X_train_test_embedded[:n_train]
X_test_embedded = X_train_test_embedded[n_train:]

In [None]:
# Split the embedded 2020 points by their LOF label: 1 -> inlier, -1 -> outlier.
inlier_mask = y_pred_test == 1
outlier_mask = y_pred_test == -1
X_test_inliers = X_test_embedded[inlier_mask]
X_test_outliers = X_test_embedded[outlier_mask]

In [None]:
import matplotlib
import matplotlib.pyplot as plt

# Scatter the embedded points: training data in white, test inliers in violet,
# test outliers in gold.
plt.title("Novelty Detection with LOF")
s = 40
b1 = plt.scatter(X_train_embedded[:, 0], X_train_embedded[:, 1], c="white", s=s, edgecolors="k",
                 label="training samples")
b2 = plt.scatter(X_test_inliers[:, 0], X_test_inliers[:, 1], c="blueviolet", s=s, edgecolors="k",
                 label="test inliers")
# Separate handle for the outliers; it used to overwrite b2.
b3 = plt.scatter(X_test_outliers[:, 0], X_test_outliers[:, 1], c="gold", s=s, edgecolors="k",
                 label="test outliers")
# Without a legend the three colours are not explained anywhere in the figure.
plt.legend(loc="best");

In [None]:
# LOF score for every 2020 sample. Presumably lower means more abnormal (a later
# cell picks the minimum as the most extreme case) — confirm against the sklearn docs.
x_test_scores = clf.score_samples(X_test)

In [None]:
# Work on an explicit copy: assigning a column on a boolean-filtered slice of
# feature_df triggers pandas' SettingWithCopyWarning.
X_test_df = feature_df[feature_df["year"] == "2020"].copy()
# Attach the LOF score of each 2020 row (same row order as X_test).
X_test_df["loc_score"] = x_test_scores
# Keep only the rows that LOF labelled -1.
X_test_outliers_df = X_test_df.iloc[y_pred_test == -1]

X_test_outliers_df

In [None]:
# Row(s) among the flagged 2020 samples with the lowest loc_score.
lowest_score = X_test_outliers_df["loc_score"].min()
X_test_outliers_df.loc[X_test_outliers_df["loc_score"] == lowest_score]

In [None]:
# All 2019 rows, in the same order as X_train.
X_train_df = feature_df.loc[feature_df["year"] == "2019"]
X_train_df

In [None]:
# The LOF-based detection does not seem to work: "Happy Sock" already occurs in
# the 2019 data.
X_train_df.loc[X_train_df["ean_name"] == "Happy Sock"]

# Next: try a different outlier-detection method.

In [None]:
import cleanlab
from cleanlab.outlier import OutOfDistribution

In [None]:
# Fit cleanlab's OutOfDistribution estimator on the 2019 vectors and score those
# same vectors in one call.
ood = OutOfDistribution()
train_features = np.array(X_train)
train_outlier_scores = ood.fit_score(features=train_features)

In [None]:
# Positions of the 15 lowest training scores (ascending), and the matching rows.
top_train_outlier_idxs = np.argsort(train_outlier_scores)[:15]
X_train_df.iloc[top_train_outlier_idxs]

In [None]:
# Score the 2020 vectors with the estimator that was fitted on the 2019 vectors.
test_outlier_scores = ood.score(features=np.array(X_test))

In [None]:
# Positions of the 40 lowest test scores (ascending), and the matching 2020 rows.
top_outlier_idxs = np.argsort(test_outlier_scores)[:40]
X_test_df.iloc[top_outlier_idxs]

In [None]:
# Take the 2.5th percentile of the outlier scores in the training data as the
# threshold. The threshold used to be computed from the test scores, which
# contradicted this intent and flags 2.5% of the test set by construction.
threshold = np.percentile(train_outlier_scores, 2.5)

fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(10, 5))
# Shared x-range so the two histograms are directly comparable.
plt_range = [min(train_outlier_scores.min(), test_outlier_scores.min()),
             max(train_outlier_scores.max(), test_outlier_scores.max())]

# One panel per score set; the red line marks the threshold in both.
panels = (
    ("train_outlier_scores", train_outlier_scores),
    ("test_outlier_scores", test_outlier_scores),
)
for ax, (name, scores) in zip(axes, panels):
    ax.hist(scores, range=plt_range, bins=50)
    ax.set(title=f"{name} distribution", ylabel="Frequency")
    ax.axvline(x=threshold, color='red', linewidth=2)

In [None]:
# Treat every 2020 row whose outlier score is below 0.4 as an outlier.
X_test_df.iloc[np.flatnonzero(test_outlier_scores < 0.4)]

In [None]:
# "Eiwitbrood" occurs in both X_test and X_train, so it may be an outlier but it
# is not a novelty.
X_train_df.loc[X_train_df["ean_name"] == "Eiwitbrood"]

In [None]:
# Same 0.4 cut-off applied to the training data: 2019 rows scoring below it.
X_train_df.iloc[np.flatnonzero(train_outlier_scores < 0.4)]