In [714]:
import os
import random

import pandas as pd
import numpy as np
from PIL import Image
from PIL import ImageEnhance, ImageOps

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression

In [715]:
seed = 42
random.seed(seed)

img_size = (256, 256)

root_path = "E:\IOAI\kits\preonia-2025\starter_kit"

# Data preparation

In [716]:
def extract_features(img: Image.Image):
    arr = np.array(img, dtype=np.float32).flatten()
    return arr / 255.0


def augment_image(img: Image.Image):
    """
    Apply a random augmentation from a set of transformations.
    """
    transforms = [
        lambda i: ImageOps.mirror(i),
        lambda i: i.rotate(15),
        lambda i: i.rotate(-15),
        lambda i: ImageEnhance.Contrast(i).enhance(1.3),
        lambda i: ImageEnhance.Brightness(i).enhance(1.2),
        lambda i: ImageEnhance.Color(i).enhance(1.5),
    ]
    return random.choice(transforms)(img)


def clean_df(df):
    return df

def prep_features(df: pd.DataFrame):  
    df["CodeID"] = df["CodeID"].astype(str) 

    feats = []
    for idx in df["CodeID"]:
        img = Image.open(os.path.join(root_path, "train", f"{idx}.jpg")).convert("RGB").resize(img_size)
        feats.append(extract_features(img))
    df["features"] = feats

    return df

In [717]:
df = pd.read_csv("train_data.csv")
df = clean_df(df)

df_train = prep_features(df)

In [718]:
df_train.head()

Unnamed: 0,CodeID,Label,features
0,211,1,"[0.34901962, 0.3529412, 0.3019608, 0.36862746,..."
1,163,1,"[0.22745098, 0.15686275, 0.87058824, 0.2274509..."
2,166,1,"[0.0, 0.02745098, 0.09019608, 0.0, 0.02745098,..."
3,5184,0,"[0.20784314, 0.18431373, 0.08627451, 0.1725490..."
4,5323,0,"[0.93333334, 0.93333334, 0.93333334, 0.9294117..."


In [719]:
# balance by simple up-sampling
minor = df_train[df.Label == 1]
major = df_train[df.Label == 0]
upsampled_min = minor.sample(len(major), replace=True, random_state=seed)
df_bal = pd.concat([major, upsampled_min]).sample(frac=1, random_state=seed)

In [720]:
X = np.vstack(df_bal["features"].values)
y = df_bal["Label"].values

# EDA

In [721]:
df_bal["features"].map(lambda x: x.shape).value_counts()

features
(196608,)    260
Name: count, dtype: int64

# Model selection

In [None]:
data = []
aug_multiplier = 4

for _, row in df_bal.iterrows():
    cid, lbl = str(row.CodeID), row.Label
    img = Image.open(os.path.join(root_path, 'train', f"{cid}.jpg")).convert('RGB').resize(img_size)
    # original
    data.append((extract_features(img), lbl))
    # augmentations
    for _ in range(aug_multiplier):
        aug_img = augment_image(img)
        data.append((extract_features(aug_img), lbl))

X_all, y_all = zip(*data)
X = np.vstack(X_all)
y = np.array(y_all)

In [723]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=seed, stratify=y
)

In [724]:
def evaluate(clf):
    scores = cross_val_score(clf, X_train, y_train, scoring='accuracy', cv=3, n_jobs=-1)
    score = scores.mean() - scores.std()

    clf.fit(X_train, y_train)
    acc = accuracy_score(y_test, clf.predict(X_test))
    return score, acc

In [725]:
lr = LogisticRegression(C=10, max_iter=500)

evaluate(lr)

(0.8799824029337273, 0.9663648124191462)

In [726]:
svc = SVC(C=10, kernel="rbf", class_weight="balanced")

evaluate(svc)

(0.9239606516837671, 0.9793014230271668)

In [727]:
clf = svc

clf.fit(X_train, y_train)

# Submission

In [728]:
# get test images array
test_ids = [
    int(f.split(".")[0])
    for f in os.listdir(os.path.join(root_path, "test"))
    if f.endswith(".jpg")
]

X_val = np.array([
    extract_features(
            Image.open(os.path.join(root_path, "test", f"{i}.jpg"))
            .convert("RGB")
            .resize(img_size)
    ) for i in test_ids
])

X_val.shape



(117, 196608)

In [729]:
# subtask 1
img129 = Image.open(f"{root_path}\\train\\129.jpg")
subtask1 = img129.size[0] * img129.size[1]

# subtask 2
def count_inbalance(df: pd.DataFrame):
    counts = df["Label"].value_counts().sort_values(ascending=True)
    ratio = counts.iloc[0] / counts.iloc[1]
    return ratio # should be closer to 1.0

subtask2 = count_inbalance(df_train)
print([('train', subtask2), ('train augmented', count_inbalance(df_bal))])

# subtask 3
subtask3 = clf.predict(X_val)

[('train', 0.7153846153846154), ('train augmented', 1.0)]


In [730]:
def build_subtask_df(id, answer):
    idx = range(len(test_ids)) if id == 3 else [0]
    return pd.DataFrame(
        {
            "subtaskID": id,
            "datapointID": (test_ids if id == 3 else 0),
            "answer": answer,
        },
        index=idx,
    )

subtasks = [
    (1, subtask1),
    (2, subtask2),
    (3, subtask3)
]

submission = pd.concat([build_subtask_df(id, answer) for id, answer in subtasks], ignore_index=True)

In [731]:
submission.head()

Unnamed: 0,subtaskID,datapointID,answer
0,1,0,353808.0
1,2,0,0.715385
2,3,0,0.0
3,3,1,0.0
4,3,10,0.0


In [732]:
submission.to_csv("submission.csv", index=False)