In [None]:
import os
import json
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
from imagehat.parsers.jpeg_parser import JPEGParser

In [None]:

def extract_ifd_order_vector(metadata):
    """Collect the recorded IFD position of every tag under ``"APP1 Info"``.

    Every section found in ``metadata["APP1 Info"]`` is scanned (not only the
    0th IFD). For each tag entry that is a dict and carries a non-``None``
    ``"IFD Tag Order"`` value, that value is stored under the key
    ``"<section>_<tag>"``.

    Parameters
    ----------
    metadata : dict
        Parsed metadata for one image. Only the ``"APP1 Info"`` key is read.

    Returns
    -------
    dict
        Mapping of ``"<section>_<tag>"`` to the tag's ``"IFD Tag Order"``
        value. Empty when ``"APP1 Info"`` is missing, is not a dict, or
        contains no tag with an order value.
    """
    tag_positions = {}

    ifds = metadata.get("APP1 Info")
    # A missing key or a JSON null / scalar here means there is nothing to scan.
    if not isinstance(ifds, dict):
        return tag_positions

    for section, section_data in ifds.items():
        # Sections may hold plain values instead of tag tables; skip those.
        if not isinstance(section_data, dict):
            continue
        for tag, tagdata in section_data.items():
            # A section can mix tag dicts with scalar bookkeeping values;
            # only dict entries can carry an "IFD Tag Order".
            if not isinstance(tagdata, dict):
                continue
            order = tagdata.get("IFD Tag Order")
            # Compare against None explicitly so that position 0 is kept.
            if order is not None:
                tag_positions[f"{section}_{tag}"] = order

    return tag_positions


In [None]:

def build_order_dataset(json_folder):
    """Build a tag-order feature table from a folder tree of metadata JSON files.

    Recursively walks ``json_folder``, loads every file whose name ends with
    ``".json"``, converts it to a tag-order mapping with
    ``extract_ifd_order_vector`` and derives a label from the ``file_name``
    stored under ``"General File Info"``.

    Parameters
    ----------
    json_folder : str or path-like
        Root directory to search for ``.json`` files.

    Returns
    -------
    df : pandas.DataFrame
        One row per JSON file and one column per tag key seen in any file.
        Tags absent from a file are filled with ``-1``.
    labels : list of str
        One label per row, in the same order as ``df``. The label is the part
        of the stored file name's basename before the first ``"_0_"``; it is
        the whole basename when that marker is absent and ``""`` when no
        ``file_name`` is recorded.
        NOTE(review): presumably file names look like ``<brand_model>_0_...``
        - confirm against the data set.

    Raises
    ------
    json.JSONDecodeError
        If a ``.json`` file does not contain valid JSON.
    """
    rows = []
    labels = []

    for root, dirs, files in os.walk(json_folder):
        # os.walk yields entries in filesystem order, which varies between
        # machines. Sorting in place fixes the traversal order so rows and
        # labels come out the same on every run.
        dirs.sort()
        for file in sorted(files):
            if not file.endswith(".json"):
                continue

            full_path = os.path.join(root, file)
            # Read as UTF-8 explicitly instead of the platform default
            # encoding, which may be a legacy code page.
            with open(full_path, "r", encoding="utf-8") as f:
                metadata = json.load(f)

            tag_vector = extract_ifd_order_vector(metadata)

            # Label comes from the image file name recorded inside the JSON,
            # not from the JSON file's own name.
            filename = metadata.get("General File Info", {}).get("file_name", "")
            brand_model = os.path.basename(filename).split("_0_")[0]

            rows.append(tag_vector)
            labels.append(brand_model)

    # Files do not share the same tag set; -1 marks "tag not present".
    df = pd.DataFrame(rows).fillna(-1)
    return df, labels