In [2]:
from sklearn import svm
from datasets import load_dataset
from scipy.special import softmax

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Names of the per-model score columns that are combined into one feature vector.
models = [
    "convnextv2",
    "mobilenetv3",
    "vit",
]

In [22]:
# Load the "val" configuration of the evaluation-results dataset.
# Alternative configuration: load_dataset("pufanyi/SC4000_eval_results", "full")
data = load_dataset("pufanyi/SC4000_eval_results", name="val")

In [23]:
# Convert the "train" and "validation" splits to pandas DataFrames.
train_df = data["train"].to_pandas()
val_df = data["validation"].to_pandas()

In [24]:
# Preview the first rows of the training frame (ids, labels and per-model score vectors).
train_df.head()

Unnamed: 0,image_id,answer,convnextv2,convnextv2_balanced,mobilenetv3,vit
0,1959106502.jpg,2,"[-1.9520471096038818, -0.8462232351303101, 5.1...","[-0.24982061982154846, -1.0279805660247803, 3....","[0.03455905243754387, 0.016511911526322365, 0....","[-2.3550519943237305, 0.15317581593990326, 5.0..."
1,893564766.jpg,3,"[-2.7128067016601562, -3.259507179260254, 0.79...","[0.2573392689228058, -1.3081119060516357, -0.2...","[1.689208875177428e-05, 2.0160097847110592e-05...","[-1.9509457349777222, -3.168455123901367, -0.4..."
2,2252885950.jpg,3,"[-3.029195547103882, -3.0657451152801514, -2.0...","[-0.5064132809638977, -1.412938117980957, -1.0...","[1.8458380282027065e-06, 1.121260538639035e-06...","[-1.9304018020629883, -3.3717763423919678, -1...."
3,2178418518.jpg,3,"[-2.6217901706695557, -3.044633150100708, 0.66...","[-0.23686741292476654, -1.254176139831543, -0....","[4.259888100932585e-06, 7.009319961071014e-05,...","[-3.38240647315979, -2.7183454036712646, 1.395..."
4,244407780.jpg,0,"[2.413464069366455, 0.088779516518116, -1.9206...","[1.5419367551803589, 0.20424342155456543, -1.0...","[0.39770635962486267, 0.04494056478142738, 0.0...","[3.041616201400757, -0.7818437814712524, -1.71..."


In [25]:
# Preview the first rows of the validation frame; it has the same columns as train_df.
val_df.head()

Unnamed: 0,image_id,answer,convnextv2,convnextv2_balanced,mobilenetv3,vit
0,1032082353.jpg,3,"[-2.8055851459503174, -1.899994969367981, -0.1...","[-0.8705260753631592, -0.35763120651245117, -0...","[1.8842243889594101e-06, 4.885403177468106e-05...","[-2.6817591190338135, -1.4198451042175293, 0.3..."
1,3453071010.jpg,4,"[-0.26134613156318665, -1.9008660316467285, -1...","[1.422821044921875, -0.6947699189186096, -0.02...","[0.004472201690077782, 0.2566199004650116, 0.0...","[-0.6916424632072449, -0.5975199937820435, -1...."
2,3619872017.jpg,3,"[-2.263319969177246, 0.07364509254693985, -1.6...","[-0.481817364692688, -0.6947422027587891, -1.2...","[1.7813044905778952e-06, 0.0002525941818021238...","[-1.1457653045654297, -0.06673234701156616, -1..."
3,2405023635.jpg,3,"[0.45248937606811523, -0.7792124152183533, -0....","[1.325195550918579, -0.4810410141944885, -1.31...","[0.15254716575145721, 0.09175295382738113, 0.0...","[-0.7659131288528442, -1.7290306091308594, -0...."
4,807086334.jpg,2,"[-2.7944529056549072, -2.382516384124756, 3.72...","[-1.1005042791366577, -1.034427523612976, 2.86...","[0.0007713254308328032, 0.0009335664799436927,...","[-2.2144246101379395, -1.9241547584533691, 5.0..."


In [26]:
import numpy as np


def extract_features(item, model_names=None, probability_models=("mobilenetv3",)):
    """Build one flat feature vector from a row of per-model class scores.

    For every model name the stored score vector is read from ``item``.
    Vectors belonging to ``probability_models`` are used as they are (they are
    treated as already being probabilities); every other vector is first
    passed through ``softmax``. The element-wise square root of each resulting
    vector is appended to the output, in the order of ``model_names``.

    Args:
        item: Row-like object (e.g. a pandas Series or a dict) that can be
            indexed by model name and yields a 1-D sequence of scores.
        model_names: Iterable of model names to combine. When ``None`` the
            notebook-level ``models`` list is used, so existing one-argument
            calls behave exactly as before.
        probability_models: Collection of model names whose vectors skip the
            softmax step.

    Returns:
        list: The concatenated square-rooted values as plain Python floats.
    """
    if model_names is None:
        # Backward-compatible fallback to the list defined earlier in the notebook.
        model_names = models

    features = []
    for name in model_names:
        scores = np.asarray(item[name])
        if name not in probability_models:
            # Casting to float64 reproduces the precision of the former
            # ``.tolist()`` -> ``np.array`` round trip without the extra copies.
            scores = np.asarray(softmax(scores), dtype=np.float64)
        features.extend(np.sqrt(scores).tolist())
    return features

In [27]:
# Training set for the classifier: one stacked feature vector per row, plus the labels.
train_data = dict(
    X=[extract_features(row) for _, row in train_df.iterrows()],
    y=train_df["answer"],
)

In [28]:
# Fit a linear SVM on the stacked per-model features.
# random_state is pinned so that Restart & Run All reproduces the same fit;
# without it the solver's internal shuffling may vary between runs.
model = svm.LinearSVC(random_state=0)
model.fit(train_data["X"], train_data["y"])

In [29]:
# Validation set built the same way as train_data: stacked features plus the labels.
val_data = dict(
    X=[extract_features(row) for _, row in val_df.iterrows()],
    y=val_df["answer"],
)

In [30]:
# Score the fitted classifier on the validation features and labels.
model.score(val_data["X"], val_data["y"])

0.9345794392523364

In [31]:
# Score the same classifier on the data it was fitted on, for comparison with the validation score.
model.score(train_data["X"], train_data["y"])

0.9283489096573209

In [32]:
import pickle
from pathlib import Path

# Directory that receives the pickled classifier; the relative path is
# resolved against the current working directory.
model_path = Path("../../assets/")
print("Creating model directory", model_path.resolve().absolute())
model_path.mkdir(parents=True, exist_ok=True)

# Serialise the fitted classifier into that directory.
path = model_path.joinpath("model_selection.pkl")
print("Saving model to", path.resolve().absolute())
with path.open("wb") as f:
    pickle.dump(model, f)

Creating model directory /data/pufanyi/project/SC4000/assets
Saving model to /data/pufanyi/project/SC4000/assets/model_selection.pkl


In [33]:
import pickle

# Reload the classifier from the pickle file to check the save/load round trip.
# pickle.load can execute arbitrary code, so only use it on files you trust.
with path.open("rb") as f:
    model = pickle.load(f)

In [34]:
# Display the estimator that was just loaded from disk.
model

In [35]:
# Predict labels for the validation features with the reloaded classifier.
model.predict(val_data["X"])

array([3, 4, 3, 3, 2, 3, 3, 3, 3, 3, 4, 3, 2, 3, 3, 3, 3, 3, 3, 3, 4, 3,
       3, 3, 3, 3, 3, 3, 4, 3, 3, 4, 1, 3, 3, 3, 0, 3, 3, 1, 1, 3, 2, 3,
       1, 3, 3, 3, 3, 3, 3, 3, 1, 4, 1, 3, 4, 2, 3, 3, 3, 3, 3, 2, 3, 4,
       3, 2, 2, 3, 4, 0, 1, 3, 3, 3, 3, 4, 3, 1, 3, 0, 3, 3, 3, 2, 3, 3,
       2, 3, 3, 3, 2, 3, 2, 3, 4, 4, 0, 3, 0, 2, 0, 3, 4, 2, 3])