# Semantic-Sense Anomaly Detection Comparison

## Imports

In [20]:

import numpy as np
import pandas as pd
from io import StringIO
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.neighbors import LocalOutlierFactor
import inspect
from semantic_sense import AnomalyDetector




## Data Loading

In [17]:
# Read the food items and drop the ItemID column in a single expression,
# so `df` holds only the remaining columns.
df = pd.read_csv("food.csv").drop(columns=["ItemID"])
df.head(10)

Unnamed: 0,Item,Price
0,Sandwich,10
1,Sandwich,20
2,Bread,10
3,Bread,20
4,Meat,10
5,Meat,10


## Traditional Approach

In [21]:
# Traditional baseline: one-hot encode the categorical columns, standardise the
# numeric columns, and score every row with Local Outlier Factor (LOF).

# LOF settings for this 6-row toy dataset.
LOF_N_NEIGHBORS = 2        # tiny dataset -> small neighbourhood
LOF_CONTAMINATION = 2 / 6  # expected anomaly share: 2 of the 6 rows get flagged

# Split by data type
categorical_cols = df.select_dtypes(include=["object", "category"]).columns.tolist()
numeric_cols     = df.select_dtypes(include=["number"]).columns.tolist()

print("Categorical cols:", categorical_cols)
print("Numeric cols    :", numeric_cols)

# OneHotEncoder: the keyword that requests a dense array differs between
# scikit-learn versions (`sparse_output` vs. the older `sparse`), so inspect the
# constructor signature and pass whichever one this installation accepts.
sig = inspect.signature(OneHotEncoder)
if "sparse_output" in sig.parameters:
    enc = OneHotEncoder(sparse_output=False, handle_unknown="ignore")
else:
    enc = OneHotEncoder(sparse=False, handle_unknown="ignore")

# When a column group is empty, fall back to a (n_rows, 0) array so that the
# horizontal stack below still works.
X_cat = enc.fit_transform(df[categorical_cols]) if categorical_cols else np.empty((len(df), 0))
scaler = StandardScaler()
X_num  = scaler.fit_transform(df[numeric_cols]) if numeric_cols else np.empty((len(df), 0))

# Numeric features first, then the one-hot columns.
X_traditional = np.hstack([X_num, X_cat])
print("Traditional feature matrix shape:", X_traditional.shape)

lof = LocalOutlierFactor(
    n_neighbors=LOF_N_NEIGHBORS,
    contamination=LOF_CONTAMINATION,
    novelty=False,
)
labels = lof.fit_predict(X_traditional)      # -1 anomaly, 1 normal
scores = -lof.negative_outlier_factor_       # sign flipped so higher = more anomalous

# Attach the scores and flags to a copy so `df` itself stays unchanged for later cells.
trad_out = df.copy()
trad_out["trad_lof_score"]  = scores
trad_out["trad_is_anomaly"] = (labels == -1).astype(int)
trad_out = trad_out.sort_values("trad_lof_score", ascending=False).reset_index(drop=True)

print("\n=== Traditional LOF (sorted by score) ===")
display(trad_out)


Categorical cols: ['Item']
Numeric cols    : ['Price']
Traditional feature matrix shape: (6, 4)

=== Traditional LOF (sorted by score) ===


Unnamed: 0,Item,Price,trad_lof_score,trad_is_anomaly
0,Sandwich,20,1.25,1
1,Bread,20,1.25,1
2,Sandwich,10,1.0,0
3,Bread,10,1.0,0
4,Meat,10,1.0,0
5,Meat,10,1.0,0


It’s clear that price is the main factor here. One-hot encoding treats Sandwich, Bread, and Meat as unrelated categories with no notion of similarity between them, so the differences in price stand out more: the two rows priced at 20 are the ones flagged.

## Using semantic-sense

In [23]:
# Hybrid-mode detector with numeric_weight=0.1.
det_hybrid = AnomalyDetector(numeric_weight=0.1, mode="hybrid")
out_hybrid = det_hybrid.detect(df, top_percent=20)

# Keep only the columns of interest and list rows by decreasing centroid distance.
hybrid_view = out_hybrid[["Item", "Price", "row_text", "centroid_distance", "rank", "is_anomaly"]]
display(hybrid_view.sort_values(by="centroid_distance", ascending=False))


Converting rows to text: 100%|██████████| 6/6 [00:00<?, ?it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 30.86it/s]
Converting rows to text: 100%|██████████| 6/6 [00:00<?, ?it/s]


Unnamed: 0,Item,Price,row_text,centroid_distance,rank,is_anomaly
0,Meat,10,"Item: Meat, Price: 10",0.108942,2,1
1,Meat,10,"Item: Meat, Price: 10",0.108942,1,1
2,Bread,20,"Item: Bread, Price: 20",0.105362,3,0
3,Sandwich,20,"Item: Sandwich, Price: 20",0.095404,4,0
4,Bread,10,"Item: Bread, Price: 10",0.089077,5,0
5,Sandwich,10,"Item: Sandwich, Price: 10",0.072168,6,0


Now we can see that Meat is flagged as the anomaly. Using semantic understanding, the model treats Bread and Sandwich as similar, so Meat stands out despite the price differences among the other rows. The impact of numeric columns can be adjusted with the `numeric_weight` parameter — setting it to 1 makes numerical values highly influential.

In [25]:
# Same hybrid-mode detection, this time with numeric_weight=1.
# NOTE: despite the `_w3` suffix in these names, the weight used here is 1.
det_hybrid_w3 = AnomalyDetector(numeric_weight=1, mode="hybrid")
out_hybrid_w3 = det_hybrid_w3.detect(df, top_percent=20)

# Keep only the columns of interest and list rows by decreasing centroid distance.
hybrid_w3_view = out_hybrid_w3[["Item", "Price", "row_text", "centroid_distance", "rank", "is_anomaly"]]
display(hybrid_w3_view.sort_values(by="centroid_distance", ascending=False))


Converting rows to text: 100%|██████████| 6/6 [00:00<?, ?it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 31.96it/s]
Converting rows to text: 100%|██████████| 6/6 [00:00<?, ?it/s]


Unnamed: 0,Item,Price,row_text,centroid_distance,rank,is_anomaly
0,Bread,20,"Item: Bread, Price: 20",0.627779,1,1
1,Sandwich,20,"Item: Sandwich, Price: 20",0.621453,2,1
2,Meat,10,"Item: Meat, Price: 10",0.174485,3,0
3,Meat,10,"Item: Meat, Price: 10",0.174485,4,0
4,Bread,10,"Item: Bread, Price: 10",0.172492,5,0
5,Sandwich,10,"Item: Sandwich, Price: 10",0.157374,6,0


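We can see that price now plays the major role: the two rows priced at 20 take the top two ranks. However, Meat is the next closest to being flagged, showing up at 3rd and 4th rank ahead of the Bread and Sandwich rows at the same price, due to the influence of semantic understanding.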