In [1]:
# === Notebook cell 1: Imports + dataset ===
import numpy as np
import pandas as pd
from io import StringIO

# Traditional baseline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.neighbors import LocalOutlierFactor
import inspect

# Your module
from semantic_sense import AnomalyDetector




  from .autonotebook import tqdm as notebook_tqdm


In [5]:
# Read the food dataset and discard the ItemID column in a single expression.
df = pd.read_csv("food.csv").drop(columns=["ItemID"])
df.head(10)

Unnamed: 0,Color,Price
0,Sandwich,10
1,Sandwich,20
2,Bread,10
3,Bread,20
4,Meat,10
5,Meat,10


In [6]:
# === Notebook cell 2: Traditional anomaly detection (LOF on one-hot + scaled numeric) ===

# Partition the columns by dtype: object/category columns are one-hot encoded,
# numeric columns are standardized.
categorical_cols = list(df.select_dtypes(include=["object", "category"]).columns)
numeric_cols     = list(df.select_dtypes(include=["number"]).columns)

print("Categorical cols:", categorical_cols)
print("Numeric cols    :", numeric_cols)

# The keyword that requests a dense array differs between scikit-learn versions
# ("sparse_output" vs. "sparse"); use whichever one the installed
# OneHotEncoder signature exposes.
sig = inspect.signature(OneHotEncoder)
dense_kwarg = "sparse_output" if "sparse_output" in sig.parameters else "sparse"
enc = OneHotEncoder(handle_unknown="ignore", **{dense_kwarg: False})

# A column group that is empty contributes a zero-width block, so the
# horizontal concatenation below always works.
if categorical_cols:
    X_cat = enc.fit_transform(df[categorical_cols])
else:
    X_cat = np.empty((len(df), 0))

scaler = StandardScaler()
if numeric_cols:
    X_num = scaler.fit_transform(df[numeric_cols])
else:
    X_num = np.empty((len(df), 0))

# Numeric features first, one-hot features after.
X_traditional = np.concatenate([X_num, X_cat], axis=1)
print("Traditional feature matrix shape:", X_traditional.shape)

# Tiny dataset -> small neighborhood; contamination = 1/6 is meant to flag about
# one row. NOTE(review): the recorded output below flags no row at all —
# presumably because the two highest scores tie at the threshold; confirm.
lof = LocalOutlierFactor(n_neighbors=2, contamination=1/6, novelty=False)
labels = lof.fit_predict(X_traditional)      # -1 anomaly, 1 normal
scores = -lof.negative_outlier_factor_       # sign flipped: higher = more anomalous

# Attach score and flag to a copy of the data, most anomalous rows first.
trad_out = (
    df.assign(
        trad_lof_score=scores,
        trad_is_anomaly=(labels == -1).astype(int),
    )
    .sort_values("trad_lof_score", ascending=False)
    .reset_index(drop=True)
)

print("\n=== Traditional LOF (sorted by score) ===")
display(trad_out)


Categorical cols: ['Color']
Numeric cols    : ['Price']
Traditional feature matrix shape: (6, 4)

=== Traditional LOF (sorted by score) ===


Unnamed: 0,Color,Price,trad_lof_score,trad_is_anomaly
0,Sandwich,20,1.25,0
1,Bread,20,1.25,0
2,Sandwich,10,1.0,0
3,Bread,10,1.0,0
4,Meat,10,1.0,0
5,Meat,10,1.0,0


In [8]:
# === Cell 2: Semantic-Sense — TEXT mode ===
# Row -> "Col: val, ..." string -> embedding -> centroid distance
det_text = AnomalyDetector(mode="text")

# top_percent=20 on 6 rows: the recorded output below flags 2 rows (both
# "Meat" rows), not 1 — TODO confirm how detect() rounds the cutoff.
out_text = det_text.detect(df, top_percent=20)

# Show only the ranking-related columns, largest centroid distance first.
display(out_text[["Color","Price","row_text","centroid_distance","rank","is_anomaly"]]
        .sort_values("centroid_distance", ascending=False))


Converting rows to text: 100%|██████████| 6/6 [00:00<?, ?it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 42.30it/s]
Converting rows to text: 100%|██████████| 6/6 [00:00<?, ?it/s]


Unnamed: 0,Color,Price,row_text,centroid_distance,rank,is_anomaly
0,Meat,10,"Color: Meat, Price: 10",0.0949,2,1
1,Meat,10,"Color: Meat, Price: 10",0.0949,1,1
2,Bread,20,"Color: Bread, Price: 20",0.086467,3,0
3,Bread,10,"Color: Bread, Price: 10",0.078854,4,0
4,Sandwich,20,"Color: Sandwich, Price: 20",0.074243,5,0
5,Sandwich,10,"Color: Sandwich, Price: 10",0.064681,6,0


In [14]:
# === Cell 3: Semantic-Sense — HYBRID (numeric_weight = 0.1) ===
# Text embedding + scaled numeric features
# The weight actually passed is 0.1; the recorded output below was produced with it.
det_hybrid = AnomalyDetector(mode="hybrid", numeric_weight=0.1)
out_hybrid = det_hybrid.detect(df, top_percent=20)

# Show only the ranking-related columns, largest centroid distance first.
display(out_hybrid[["Color","Price","row_text","centroid_distance","rank","is_anomaly"]]
        .sort_values("centroid_distance", ascending=False))


Converting rows to text: 100%|██████████| 6/6 [00:00<?, ?it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 66.04it/s]
Converting rows to text: 100%|██████████| 6/6 [00:00<?, ?it/s]


Unnamed: 0,Color,Price,row_text,centroid_distance,rank,is_anomaly
0,Meat,10,"Color: Meat, Price: 10",0.096857,2,1
1,Meat,10,"Color: Meat, Price: 10",0.096857,1,1
2,Bread,20,"Color: Bread, Price: 20",0.095738,3,0
3,Sandwich,20,"Color: Sandwich, Price: 20",0.083605,4,0
4,Bread,10,"Color: Bread, Price: 10",0.081205,5,0
5,Sandwich,10,"Color: Sandwich, Price: 10",0.067036,6,0


In [15]:
# === Cell 4: Semantic-Sense — HYBRID (numeric_weight = 1) ===
# Increase influence of numeric columns (Price) relative to the numeric_weight=0.1 run.
# NOTE: the "_w3" suffix in the variable names is stale — the weight passed is 1, not 3.
# In the recorded output below, the two Price=20 rows rank highest.
det_hybrid_w3 = AnomalyDetector(mode="hybrid", numeric_weight=1)
out_hybrid_w3 = det_hybrid_w3.detect(df, top_percent=20)

# Show only the ranking-related columns, largest centroid distance first.
display(out_hybrid_w3[["Color","Price","row_text","centroid_distance","rank","is_anomaly"]]
        .sort_values("centroid_distance", ascending=False))


Converting rows to text: 100%|██████████| 6/6 [00:00<00:00, 18710.65it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 166.41it/s]
Converting rows to text: 100%|██████████| 6/6 [00:00<00:00, 8952.62it/s]


Unnamed: 0,Color,Price,row_text,centroid_distance,rank,is_anomaly
0,Bread,20,"Color: Bread, Price: 20",0.619745,1,1
1,Sandwich,20,"Color: Sandwich, Price: 20",0.612033,2,1
2,Meat,10,"Color: Meat, Price: 10",0.166902,3,0
3,Meat,10,"Color: Meat, Price: 10",0.166902,4,0
4,Bread,10,"Color: Bread, Price: 10",0.16654,5,0
5,Sandwich,10,"Color: Sandwich, Price: 10",0.154015,6,0
