This notebook fits a new model using both the DINOv2 embeddings and our 10 hand-labeled features.

### loading in hand-labeled feats

In [28]:
import pandas as pd
import numpy as np
import json
import os

In [29]:
# Read the two hand-labeled sheets (exported as CSV) and tag every row with
# the name of the sheet it came from, so the rows stay distinguishable after stacking.
df_alex = pd.read_csv("phase2_data/alex_labeled.csv").assign(label="Alex")
df_kelly = pd.read_csv("phase2_data/kelly_labeled.csv").assign(label="Kelly")

# Stack both sheets into one table with a fresh 0..n-1 index.
df = pd.concat((df_alex, df_kelly), ignore_index=True)

In [30]:
# Give the verbose spreadsheet headers short snake_case names, then turn the
# Y/N style answers in those columns into numeric 0/1 flags.
# NOTE(review): rename() silently ignores keys that are not present, so every key
# below (including the apostrophe encoding in the "husband" header) has to match
# the CSV header exactly — confirm against the exported sheets.
df = df.rename(columns={
    "Young Little Kid (Kelly)": "young_kid",
    "In_Europe(Alex)": "in_europe",
    "Board Games (Kelly)": "board_games",
    "Kellyâ€™s Husband (Kelly)": "husband",
    "Close up pictures of people ~ selfies (Kelly)": "close_up",
    "Vertical lines emphasis (Alex)": "vertical_lines",
    "Framing (Alex)": "framing",
    "One Point Perspective (Alex)": "one_point_perspective",
    "Flat Perspective (Kelly)": "flat_perspective",
    "Vast color distribution (Kelly)": "vast_color"
})

feature_cols = [
    "young_kid", "in_europe", "board_games", "husband",
    "close_up", "vertical_lines", "framing",
    "one_point_perspective", "flat_perspective", "vast_color"
]

# Spreadsheet codes -> numeric flag. "M" is counted as 0, the same as "N".
# NOTE(review): "M" presumably stands for "maybe" — confirm that 0 is the intended value.
flag_codes = {"Y": 1, "N": 0, "M": 0, "1": 1, "0": 0}

# Opt in to the future pandas behaviour so replace() does not silently downcast.
pd.set_option('future.no_silent_downcasting', True)
for col in feature_cols:
    # Normalise case/whitespace first so " y" and "Y" are treated alike.
    raw_codes = df[col].astype(str).str.upper().str.strip()
    df[col] = pd.to_numeric(raw_codes.replace(flag_codes), errors="coerce")

    # errors="coerce" turns anything unrecognised (including blank cells) into NaN
    # without a trace; report those here instead of letting them surface later
    # as a model-fitting error.
    unexpected = raw_codes[~df[col].isin([0, 1])]
    if not unexpected.empty:
        print(
            f"{col}: {len(unexpected)} value(s) are not a recognised 0/1 code:",
            sorted(unexpected.unique()),
        )

In [31]:
# Preview the first rows of the cleaned table to check the renamed columns and numeric flags.
df.head()

Unnamed: 0,filename,young_kid,in_europe,board_games,husband,close_up,vertical_lines,framing,one_point_perspective,flat_perspective,vast_color,label
0,1,1,0,0,0,0,0,0,1,1,0,Alex
1,2,1,0,0,0,0,1,0,1,0,0,Alex
2,3,0,0,0,0,0,0,1,1,0,0,Alex
3,4,0,0,0,0,0,0,0,1,0,0,Alex
4,5,0,0,0,0,0,0,1,1,0,0,Alex


### load dino-v2 embeddings

In [32]:
def _parse_embedding_key(full_path):
    """Derive ``(label, image_number)`` from one embedding path.

    The label is taken from the file name prefix ("alex"/"kelly", case-insensitive),
    falling back to the parent folder name. The image number is made of every
    decimal digit in the file name *without its extension*, so an extension such
    as ".jp2" or ".mp4" cannot leak digits into the number.

    Returns None (after printing the reason) when no label or no digits are found.
    """
    path = str(full_path).replace("\\", "/")  # accept Windows-style separators
    parts = path.split("/")
    base = parts[-1]  # e.g. "Alex-Image01.jpg" or "Kelly-Image123.png"

    # infer label
    lowered = base.lower()
    if lowered.startswith("alex"):
        label = "Alex"
    elif lowered.startswith("kelly"):
        label = "Kelly"
    elif len(parts) >= 2 and parts[-2].lower() in ["alex", "kelly"]:
        # fallback to folder name if needed
        label = parts[-2].title()
    else:
        print("cant find label", full_path)
        return None

    # extract number from filename; isdecimal() only admits characters int() can parse
    stem = os.path.splitext(base)[0]
    digits = "".join(ch for ch in stem if ch.isdecimal())
    if not digits:
        print("no digits", base)
        return None
    return label, int(digits)


# JSON is UTF-8 by specification; do not depend on the platform default encoding.
with open("all_embeddings.json", "r", encoding="utf-8") as f:
    emb = json.load(f)

rows = []
for full_path, vec in emb.items():
    parsed = _parse_embedding_key(full_path)
    if parsed is None:
        continue
    label, img_number = parsed
    rows.append({
        "label": label,
        "filename": img_number,
        "embedding": np.array(vec).reshape(-1)  # flatten to a 1-D vector
    })

# Explicit columns keep the frame mergeable even if every entry was skipped.
emb_df = pd.DataFrame(rows, columns=["label", "filename", "embedding"])
print("emb_df shape:", emb_df.shape)
emb_df.head()

emb_df shape: (485, 3)


Unnamed: 0,label,filename,embedding
0,Alex,119,"[1.5550662279129028, 0.11290228366851807, 0.61..."
1,Alex,131,"[-3.759308338165283, -0.48579519987106323, 3.7..."
2,Alex,125,"[-0.6113118529319763, 0.36881011724472046, 1.2..."
3,Alex,247,"[7.148937225341797, 0.12083685398101807, -3.45..."
4,Alex,27,"[3.2177608013153076, -0.3513256311416626, -1.9..."


### merge the embeddings and label tables

In [33]:
# Join hand-labeled rows to their embeddings on (label, image number).
merge_keys = ["label", "filename"]
merged = df.merge(emb_df, on=merge_keys, how="inner")
print("merged shape:", merged.shape)
print(merged[["label", "filename"] + feature_cols].head())

# An inner join drops unmatched rows and multiplies rows on repeated keys without
# saying so; surface both cases, since either one silently changes the dataset.
labeled_keys = set(zip(df["label"], df["filename"]))
embedded_keys = set(zip(emb_df["label"], emb_df["filename"]))
without_embedding = labeled_keys - embedded_keys
without_labels = embedded_keys - labeled_keys
if without_embedding:
    print(
        f"warning: {len(without_embedding)} labeled image(s) have no embedding and were dropped, e.g.",
        sorted(without_embedding, key=str)[:5],
    )
if without_labels:
    print(
        f"warning: {len(without_labels)} embedding(s) have no labeled row and were dropped, e.g.",
        sorted(without_labels, key=str)[:5],
    )
repeated_rows = int(merged.duplicated(subset=merge_keys).sum())
if repeated_rows:
    print(f"warning: {repeated_rows} merged row(s) repeat an existing (label, filename) key")

merged shape: (485, 13)
  label  filename  young_kid  in_europe  board_games  husband  close_up  \
0  Alex         1          1          0            0        0         0   
1  Alex         2          1          0            0        0         0   
2  Alex         3          0          0            0        0         0   
3  Alex         4          0          0            0        0         0   
4  Alex         5          0          0            0        0         0   

   vertical_lines  framing  one_point_perspective  flat_perspective  \
0               0        0                      1                 1   
1               1        0                      1                 0   
2               0        1                      1                 0   
3               0        0                      1                 0   
4               0        1                      1                 0   

   vast_color  
0           0  
1           0  
2           0  
3           0  
4           0  


### comparing baseline embeddings svc model vs. embeddings + 10 features svc model

In [34]:
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

In [35]:
# Model inputs taken from the merged table:
#   X_emb  - the per-row embedding vectors stacked into one 2-D array
#   X_feat - the hand-labeled feature columns as floats
#   y      - the "label" column, used as the classification target
X_emb = np.stack(list(merged["embedding"]))
X_feat = merged[feature_cols].to_numpy(dtype=float)
y = merged["label"].to_numpy()

# Split all three arrays in a single call so their rows stay aligned;
# 20% is held out for validation, stratified on the label, with a fixed seed.
split_options = dict(test_size=0.2, random_state=42, stratify=y)
(
    X_emb_tr, X_emb_val,
    X_feat_tr, X_feat_val,
    y_tr, y_val,
) = train_test_split(X_emb, X_feat, y, **split_options)

In [36]:
# Baseline model: an SVC trained on the embeddings alone.
# fit() returns the estimator itself, so construction and training are chained.
clf_base = SVC(gamma="scale", probability=False, random_state=123).fit(X_emb_tr, y_tr)

# Score on the held-out validation rows.
y_pred_base = clf_base.predict(X_emb_val)
acc_base = accuracy_score(y_true=y_val, y_pred=y_pred_base)

In [37]:
# Augmented model: same SVC settings, but each embedding row is extended
# with the hand-labeled feature columns (appended on the right).
# NOTE(review): the embedding values and the 0/1 flags are joined without any
# rescaling; with a distance-based kernel the flags may carry very little weight,
# which could explain an accuracy identical to the baseline — TODO confirm and
# consider standardizing the inputs of both models.
X_tr_aug = np.concatenate([X_emb_tr, X_feat_tr], axis=1)
X_val_aug = np.concatenate([X_emb_val, X_feat_val], axis=1)

clf_aug = SVC(gamma="scale", probability=False, random_state=123).fit(X_tr_aug, y_tr)

# Score on the held-out validation rows.
y_pred_aug = clf_aug.predict(X_val_aug)
acc_aug = accuracy_score(y_true=y_val, y_pred=y_pred_aug)

In [38]:
# Report both validation accuracies side by side, rounded to three decimals.
print("baseline accuracy (embeddings only): {:.3f}".format(acc_base))
print("embeddings + 10 features accuracy:   {:.3f}".format(acc_aug))

baseline accuracy (embeddings only): 0.887
embeddings + 10 features accuracy:   0.887
