# Exercise 1 - Anomaly Detection

Generate a dataset from the following artificial data:

In [None]:
# numpy is imported in this cell because it runs before the Solution's import
# cell; without it a fresh kernel ("Restart & Run All") fails with NameError.
import numpy as np

# Seed the global NumPy generator so the data set is reproducible
np.random.seed(42)

# Generate train data: 100 Gaussian points (scaled by 0.3), duplicated and
# shifted to form two clusters around (2, 2) and (-2, -2)
X_inliers = 0.3 * np.random.randn(100, 2)
X_inliers = np.r_[X_inliers + 2, X_inliers - 2]

# Generate some outliers: 20 points drawn uniformly from the [-4, 4) square
X_outliers = np.random.uniform(low=-4, high=4, size=(20, 2))
X = np.r_[X_inliers, X_outliers]

# Ground-truth labels: 1 for inliers, -1 for outliers.
# The outliers are the last n_outliers rows of X.
n_outliers = len(X_outliers)
ground_truth = np.ones(len(X), dtype=int)
ground_truth[-n_outliers:] = -1

Apply the Local Outlier Factor and get the following metrics:
* Recall
* F1-Score
* Precision
* Accuracy
* ROC AUC

(In this exercise you can compute the metrics from the predicted labels, which for the Local Outlier Factor are obtained with `fit_predict`.)

## Solution

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn.neighbors import LocalOutlierFactor
import sklearn.metrics as metrics

In [None]:
def plot_confussion_matrix(matrix):
    """Draw a 2x2 confusion matrix as an annotated heat map.

    `matrix` is wrapped in a DataFrame whose rows are captioned
    'True Normal' / 'True Fraud' and whose columns are captioned
    'Pred Normal' / 'Pred Fraud', so it must have exactly two rows and two
    columns, in that order.

    Note: this calls ``sns.set(font_scale=1.4)``, which is a seaborn-wide
    setting and not limited to this figure.
    """
    row_captions = ['True Normal', 'True Fraud']
    column_captions = ['Pred Normal', 'Pred Fraud']
    labelled_matrix = pd.DataFrame(matrix, index=row_captions, columns=column_captions)

    plt.figure(figsize=(8, 4))
    sns.set(font_scale=1.4)
    plt.title('Confussion Matrix')
    # fmt='g' prints the counts without scientific notation
    sns.heatmap(
        labelled_matrix,
        annot=True,
        annot_kws={"size": 16},
        fmt='g',
    )
    
def model_reporting(y_real, y_pred, pos_label=None):
    """Print binary-classification metrics and plot the confusion matrix.

    Parameters
    ----------
    y_real : array-like
        Ground-truth labels.
    y_pred : array-like
        Predicted hard labels (not scores).
    pos_label : optional
        Label of the anomalous ("Fraud") class, used as the positive class
        for F1, sensitivity (recall) and precision. When None it is inferred:
        -1 if the labels follow the scikit-learn outlier convention
        (-1 = outlier, 1 = inlier), otherwise 1 (the scikit-learn default,
        which matches the usual 0 = normal / 1 = fraud encoding).

    Returns
    -------
    None. The metrics are printed and the confusion matrix is drawn with
    plot_confussion_matrix.
    """
    observed = set(np.unique(y_real).tolist()) | set(np.unique(y_pred).tolist())

    if pos_label is None:
        follows_outlier_convention = -1 in observed and observed <= {-1, 1}
        pos_label = -1 if follows_outlier_convention else 1

    # confusion_matrix orders labels ascending by default. When the anomalous
    # label sorts first (e.g. -1 vs 1) that would put the anomalies in the
    # row/column captioned "Normal", so the order is made explicit:
    # [normal, anomalous]. In every other case the default order is kept.
    matrix_labels = None
    if len(observed) == 2 and pos_label == min(observed):
        matrix_labels = [max(observed), pos_label]

    confussion_matrix = metrics.confusion_matrix(y_real, y_pred, labels=matrix_labels)
    # With hard 0/1-style predictions the ROC AUC is the mean of the two
    # per-class recalls, so it does not depend on which class is "positive".
    roc_auc = metrics.roc_auc_score(y_real, y_pred)
    metrica_f1 = metrics.f1_score(y_real, y_pred, pos_label=pos_label)
    accuracy = metrics.accuracy_score(y_real, y_pred)
    sensitivity = metrics.recall_score(y_real, y_pred, pos_label=pos_label)
    precision = metrics.precision_score(y_real, y_pred, pos_label=pos_label)

    print('\tAUC of ROC Curve is: {}'.format(round(roc_auc,2)))
    print('\tF1 Score: {}'.format(round(metrica_f1,2)))
    print("\tAccuracy: {}".format(round(accuracy,3)))
    print("\tSensitivity:{}".format(round(sensitivity,3)))
    print("\tPrecission: {}".format(round(precision,3)))
    plot_confussion_matrix(confussion_matrix)

In [None]:
# First look at the raw, unlabelled point cloud
fig, ax = plt.subplots()
ax.scatter(X[:, 0], X[:, 1])
ax.set(title="Generated data set", xlabel="X1", ylabel="X2")
plt.show()

In [None]:
# Tabular view of the data set: the two coordinates plus the ground-truth label
df = pd.DataFrame(X, columns=["X1", "X2"]).assign(Target=ground_truth)

# Pairwise plots of the columns, coloured by the Target label
sns.pairplot(
    data=df,
    hue="Target",
    palette="viridis",
)

In [None]:
# Local Outlier Factor used for outlier detection: 20 neighbours per sample,
# with contamination=0.1 as the expected proportion of outliers in the data.
clf = LocalOutlierFactor(
    n_neighbors=20,
    contamination=0.1,
)
# In outlier-detection mode the estimator has no predict, decision_function
# or score_samples methods, so the labels of the training samples are
# obtained with fit_predict.
y_pred = clf.fit_predict(X)
# per-sample LOF scores exposed by the fitted estimator
X_scores = clf.negative_outlier_factor_
# number of samples whose predicted label differs from the ground truth
n_errors = (ground_truth != y_pred).sum()

In [None]:
# Print the metrics and draw the confusion matrix for the LOF predictions
model_reporting(y_real=ground_truth, y_pred=y_pred)

In [None]:
plt.title("Local Outlier Factor (LOF)")
plt.scatter(X[:, 0], X[:, 1], color="k", s=3.0, label="Data points")
# plot circles with radius proportional to the outlier scores:
# the scores are rescaled to [0, 1], where the most negative score
# (the strongest outlier) maps to 1
radius = (X_scores.max() - X_scores) / (X_scores.max() - X_scores.min())
plt.scatter(
    X[:, 0],
    X[:, 1],
    s=1000 * radius,
    edgecolors="r",
    facecolors="none",
    label="Outlier scores",
)
plt.axis("tight")
plt.xlim((-5, 5))
plt.ylim((-5, 5))
plt.xlabel("prediction errors: %d" % (n_errors))
legend = plt.legend(loc="upper left")
# Legend.legendHandles was renamed to Legend.legend_handles in Matplotlib 3.7
# and the old name was later removed; support both so the cell runs on old
# and new Matplotlib versions.
legend_handles = getattr(legend, "legend_handles", None)
if legend_handles is None:
    legend_handles = legend.legendHandles
# Use fixed marker sizes in the legend instead of the per-point sizes
legend_handles[0].set_sizes([10])
legend_handles[1].set_sizes([20])
plt.show()