In [1]:
import pickle
from pathlib import Path

from datasets import load_dataset
from scipy.special import softmax
from sklearn import svm

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Column names of the per-model outputs; the order here fixes the order in which
# each model's values are concatenated into a feature vector.
models = [
    "convnextv2",
    "convnextv2_balanced",
    "mobilenetv3",
    "vit",
]

In [3]:
# Load the "val" configuration of the evaluation-results dataset.
data = load_dataset(
    path="pufanyi/SC4000_eval_results",
    name="val",
)

In [4]:
# Convert the "train" and "validation" splits to pandas DataFrames.
train_df = data["train"].to_pandas()
val_df = data["validation"].to_pandas()

In [5]:
# Preview the first rows of the training split.
train_df.head()

Unnamed: 0,image_id,answer,convnextv2,convnextv2_balanced,mobilenetv3,vit,__index_level_0__
0,1586338305.jpg,3,"[-3.2536468505859375, -2.6047475337982178, -0....","[-0.25814253091812134, -1.3590772151947021, -0...","[1.9638902813312598e-05, 5.641267853206955e-05...","[-3.321620225906372, -3.069734573364258, -0.71...",351
1,4109154808.jpg,3,"[-3.1794118881225586, -2.796578884124756, -1.2...","[-0.34996867179870605, -0.8078905940055847, -0...","[1.0344053436028844e-07, 5.012622636968445e-07...","[-2.587888479232788, -2.8572161197662354, -1.1...",530
2,634816572.jpg,3,"[-3.539937973022461, -1.7975764274597168, -0.7...","[-0.22549781203269958, -0.8406252861022949, -0...","[9.457328269490972e-05, 0.00013533516903407872...","[-3.160341739654541, -1.1152867078781128, -0.2...",896
3,3885885091.jpg,3,"[-2.65594744682312, -2.5383026599884033, -1.20...","[-0.16385968029499054, -0.8772090673446655, -1...","[7.579084194730967e-05, 0.0008043970447033644,...","[-1.7944451570510864, -1.4936530590057373, -1....",947
4,3920433539.jpg,0,"[2.8496909141540527, -0.3478693962097168, -1.6...","[3.1631290912628174, 0.20580849051475525, -0.6...","[0.9674176573753357, 0.007927113212645054, 0.0...","[3.7792317867279053, 0.06791925430297852, -2.1...",936


In [6]:
# Preview the first rows of the validation split.
val_df.head()

Unnamed: 0,image_id,answer,convnextv2,convnextv2_balanced,mobilenetv3,vit,__index_level_0__
0,1032082353.jpg,3,"[-2.8055851459503174, -1.899994969367981, -0.1...","[-0.8705260753631592, -0.35763120651245117, -0...","[1.8842243889594101e-06, 4.885403177468106e-05...","[-2.6817591190338135, -1.4198451042175293, 0.3...",644
1,3453071010.jpg,4,"[-0.26134613156318665, -1.9008660316467285, -1...","[1.422821044921875, -0.6947699189186096, -0.02...","[0.004472201690077782, 0.2566199004650116, 0.0...","[-0.6916424632072449, -0.5975199937820435, -1....",629
2,3619872017.jpg,3,"[-2.263319969177246, 0.07364509254693985, -1.6...","[-0.481817364692688, -0.6947422027587891, -1.2...","[1.7813044905778952e-06, 0.0002525941818021238...","[-1.1457653045654297, -0.06673234701156616, -1...",70
3,2405023635.jpg,3,"[0.45248937606811523, -0.7792124152183533, -0....","[1.325195550918579, -0.4810410141944885, -1.31...","[0.15254716575145721, 0.09175295382738113, 0.0...","[-0.7659131288528442, -1.7290306091308594, -0....",962
4,807086334.jpg,2,"[-2.7944529056549072, -2.382516384124756, 3.72...","[-1.1005042791366577, -1.034427523612976, 2.86...","[0.0007713254308328032, 0.0009335664799436927,...","[-2.2144246101379395, -1.9241547584533691, 5.0...",787


In [7]:
def extract_features(item):
    """Flatten one row of per-model outputs into a single feature list.

    The notebook-level ``models`` list is walked in order. For each name, the
    values stored under ``item[name]`` are appended to the result. Every model
    except ``"mobilenetv3"`` is passed through ``softmax`` first; the
    ``"mobilenetv3"`` values are appended exactly as stored.

    Args:
        item: Mapping-like row (e.g. a DataFrame row) indexable by model name,
            where each entry is a sequence of numbers.

    Returns:
        A flat list holding the concatenated values of all models.
    """
    row_vector = []
    for name in models:
        scores = item[name]
        # NOTE(review): mobilenetv3 is presumably stored as probabilities already
        # (the previewed rows show values in [0, 1]) -- confirm with the producer.
        row_vector.extend(
            scores if name == "mobilenetv3" else softmax(scores).tolist()
        )
    return row_vector

In [8]:
# Build the training set: one feature vector per row, labels from the "answer" column.
train_data = dict(
    X=[extract_features(row) for _index, row in train_df.iterrows()],
    y=train_df["answer"],
)

In [9]:
# Inspect the first five feature vectors built for the training rows.
train_data["X"][:5]

[[5.852179436769535e-05,
  0.00011197756681502813,
  0.0007472451631166313,
  0.9977942764731161,
  0.001287979002584616,
  0.11079855054548592,
  0.03684717701475819,
  0.07752478168634168,
  0.6804142264100597,
  0.0944152643433546,
  1.9638902813312598e-05,
  5.641267853206955e-05,
  0.0002275739097967744,
  0.9991175532341003,
  0.0005786268739029765,
  4.73346268847763e-08,
  6.995181405043448e-05,
  8.998943610657598e-05,
  0.0009513991834279767,
  0.9909101716455101,
  0.007978487920904933],
 [7.15758987198148e-05,
  0.0001049612685588745,
  0.0004903474162151081,
  0.9970265516768901,
  0.0023065637396162725,
  0.0800520579765199,
  0.05064068099651519,
  0.0449306763354796,
  0.7650252655104209,
  0.05935131918106442,
  1.0344053436028844e-07,
  5.012622636968445e-07,
  0.0005619212170131505,
  0.9952425956726074,
  0.004194903187453747,
  5.389963586566182e-09,
  9.670310653364886e-05,
  7.387081961778408e-05,
  0.00038999558236761005,
  0.9974899677486305,
  0.00194946274285

In [10]:
# Create a support-vector classifier with no constructor arguments and fit it on
# the training feature vectors and their labels.
model = svm.SVC()
model.fit(X=train_data["X"], y=train_data["y"])

In [11]:
# Build the validation set the same way as the training set: one feature vector
# per row, labels from the "answer" column.
val_data = dict(
    X=[extract_features(row) for _index, row in val_df.iterrows()],
    y=val_df["answer"],
)

In [12]:
# Score the fitted classifier on the validation features and labels
# (for scikit-learn classifiers, `score` reports mean accuracy).
model.score(val_data["X"], val_data["y"])

0.9345794392523364

In [13]:
# Persist the fitted classifier to disk with pickle.
# `pickle` and `Path` are imported in the notebook's top import cell.
# NOTE: the target directory is relative to the kernel's working directory, so
# the resolved location depends on where the notebook is run from.
model_path = Path("../../assets/")

print("Creating model directory", model_path.resolve().absolute())

# parents/exist_ok make this safe to re-run and to run on a fresh checkout.
model_path.mkdir(parents=True, exist_ok=True)

path = model_path / "model_selection.pkl"

print("Saving model to", path.resolve().absolute())

with open(path, "wb") as f:
    pickle.dump(model, f)

Creating model directory /data/pufanyi/project/SC4000/assets
Saving model to /data/pufanyi/project/SC4000/assets/model_selection.pkl


In [14]:
# Reload the classifier from `path` and rebind `model` to the unpickled object,
# so the following cells exercise the saved file rather than the in-memory model.
# `pickle` is imported in the notebook's top import cell.
# NOTE(security): pickle.load can execute arbitrary code while unpickling. This is
# acceptable here only because `path` is the file written by the previous cell;
# never point it at a pickle from an untrusted source.
with open(path, "rb") as f:
    model = pickle.load(f)

In [15]:
# Display the estimator that was just loaded back from disk.
model

In [16]:
# Predict labels for the validation feature vectors with the reloaded model.
model.predict(val_data["X"])

array([3, 4, 3, 3, 2, 3, 3, 3, 3, 3, 4, 3, 2, 3, 3, 3, 3, 3, 3, 3, 4, 3,
       3, 3, 3, 3, 3, 3, 4, 3, 3, 4, 1, 3, 3, 3, 3, 3, 3, 1, 1, 3, 2, 3,
       1, 3, 3, 3, 3, 3, 3, 3, 1, 4, 1, 3, 4, 2, 3, 3, 3, 3, 3, 2, 3, 4,
       3, 2, 2, 3, 4, 0, 1, 3, 3, 3, 3, 4, 3, 1, 3, 0, 3, 3, 3, 2, 3, 3,
       2, 3, 3, 3, 2, 3, 2, 3, 4, 4, 0, 3, 0, 2, 0, 3, 4, 2, 3, 2, 3, 3,
       3, 4, 1, 1, 4, 3, 3, 1, 3, 3, 3, 3, 3, 3, 2, 3, 2, 0, 3, 3, 3, 4,
       1, 4, 2, 4, 0, 3, 3, 2, 3, 1, 3, 3, 3, 3, 3, 3, 2, 3, 3, 3, 1, 1,
       1, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 3, 3, 3, 3, 1, 4, 2, 4, 1, 3,
       3, 3, 4, 1, 3, 0, 3, 2, 3, 3, 0, 3, 3, 3, 3, 3, 3, 3, 4, 3, 3, 3,
       3, 3, 1, 3, 4, 3, 3, 3, 4, 3, 3, 2, 3, 3, 0, 3])