In [32]:
import torch
from torch.utils.data import DataLoader
import torchvision.transforms as transforms
import torchvision.datasets as datasets
import numpy as np

In [33]:
import numpy as np
import random
import os

In [34]:
from sklearn.decomposition import PCA
from sklearn.neighbors import KNeighborsClassifier

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.preprocessing import FunctionTransformer

from sklearn.metrics import accuracy_score

In [35]:
SEED = 28
img_sz = 128
DATA_PATH = "../data/astro_dataset_maxia/astro_dataset_maxia"

In [36]:
random.seed(SEED)
torch.manual_seed(SEED)
np.random.seed(SEED)

In [37]:
blur_transform = mega_transform = transforms.Compose([
    transforms.Resize((img_sz, img_sz)),
    transforms.GaussianBlur(3),
    transforms.ToTensor()
])

bright_transform = mega_transform = transforms.Compose([
    transforms.Resize((img_sz, img_sz)),
    transforms.ColorJitter(0.4, 0.6),
    transforms.ToTensor()
])

mega_transform = transforms.Compose([
    transforms.Resize((img_sz, img_sz)),
    transforms.ColorJitter(0.4, 0.6),
    transforms.GaussianBlur(3),
    transforms.ToTensor()
])

transformations = [blur_transform, bright_transform, mega_transform]

In [38]:
TRAIN_PATH = os.path.join(DATA_PATH, "training")
VAL_PATH = os.path.join(DATA_PATH, "validation")
TEST_PATH = os.path.join(DATA_PATH, "test")

In [39]:
def dataset_to_numpy(group_of_images):
    X_list, y_list = [], []
    for img, label in group_of_images:
        arr = img.numpy().reshape(-1)
        y_list.append(label)
        X_list.append(arr)
    X = np.stack(X_list, axis=0)
    y = np.array(y_list)
    return X, y

In [40]:
data = [None for _ in range(len(transformations))]

for i in range(len(transformations)):
    transformation = transformations[i]
    train_group = datasets.ImageFolder(TRAIN_PATH, transform=transformation)
    val_group   = datasets.ImageFolder(VAL_PATH,   transform=transformation)
    test_group  = datasets.ImageFolder(TEST_PATH,  transform=transformation)

    class_names = train_group.classes
    num_classes = len(class_names)
    X_train, y_train = dataset_to_numpy(train_group)
    X_val,   y_val   = dataset_to_numpy(val_group)
    X_test,  y_test  = dataset_to_numpy(test_group)

    data[i] = {"X_train": X_train, "X_val": X_val, "X_test": X_test,
        "y_train": y_train, "y_val": y_val, "y_test": y_test}

data

[{'X_train': array([[0.49019608, 0.49019608, 0.49803922, ..., 0.24313726, 0.24313726,
          0.23921569],
         [0.04313726, 0.04313726, 0.03921569, ..., 0.52156866, 0.5254902 ,
          0.5254902 ],
         [0.02352941, 0.02352941, 0.01960784, ..., 0.30980393, 0.30980393,
          0.30980393],
         ...,
         [0.01176471, 0.00392157, 0.00784314, ..., 0.        , 0.        ,
          0.        ],
         [0.01176471, 0.00392157, 0.00784314, ..., 0.        , 0.        ,
          0.        ],
         [0.00392157, 0.00392157, 0.00392157, ..., 0.        , 0.        ,
          0.        ]], shape=(2416, 49152), dtype=float32),
  'X_val': array([[0.03529412, 0.04313726, 0.04313726, ..., 0.45882353, 0.45490196,
          0.45490196],
         [0.00784314, 0.00784314, 0.01176471, ..., 0.94509804, 0.9607843 ,
          0.96862745],
         [0.06666667, 0.06666667, 0.06666667, ..., 0.00392157, 0.00392157,
          0.00392157],
         ...,
         [0.00392157, 0.00392157

In [41]:
# in data, 0 - blur transform, 1 - bright transform, 2 - mega (both) transform

In [42]:
feature_transforms = {
    "normal": Pipeline([
        ("scaler", StandardScaler())
    ]),
    "rooted": Pipeline([
        ("sqrt", FunctionTransformer(np.sqrt, validate=False)),
        ("scaler", StandardScaler())
    ]),
    "squared": Pipeline([
        ("square", FunctionTransformer(np.square, validate=False)),
        ("scaler", StandardScaler())
    ]),
    "cubed": Pipeline([
        ("cube", FunctionTransformer(lambda x: np.power(x, 3), validate=False)),
        ("scaler", StandardScaler())
    ]),
}

feature_transforms.keys()


dict_keys(['normal', 'rooted', 'squared', 'cubed'])

In [None]:
experiments = {
    "mega_normal": (2, "normal"),
    "mega_square": (2, "squared"),
    "blur_cubed": (0, "cubed"),
    "bright_rooted": (1, "rooted"),
}

experiments

{'mega_normal': (2, 'normal'),
 'mega_square': (2, 'squared'),
 'blur_cubed': (0, 'cubed'),
 'bright_rooted': (1, 'rooted')}

In [44]:
def knn_trainer(model, preproc_index, feature_transform, ks):

    # if preproc_index == 0:
    #     preproc_name = "Blur"
    # elif preproc_index == 1:
    #     preproc_name = "Bright"
    # else:
    #     preproc_name = "Mega"

    print(model)

    processed_data = data[preproc_index]
    X_train, y_train = processed_data["X_train"], processed_data["y_train"]
    X_val, y_val = processed_data["X_val"], processed_data["y_val"]

    pipe = feature_transforms[feature_transform]

    X_train = pipe.fit_transform(X_train)
    X_val   = pipe.transform(X_val)

    print("New shapes:", X_train.shape, X_val.shape)

    res = {}
    for k in ks:
        knn = KNeighborsClassifier(k, metric="euclidean", weights="distance",
            algorithm="brute", n_jobs=-1)

        knn.fit(X_train, y_train)

        y_train_pred = knn.predict(X_train)
        y_val_pred   = knn.predict(X_val)

        train_acc = accuracy_score(y_train, y_train_pred)
        val_acc   = accuracy_score(y_val,   y_val_pred)

        res[k] = {
            "train_acc": train_acc,
            "val_acc":   val_acc,
        }

        print("k:", k, "Train Acciracy:", train_acc, "Val Accuracy:", val_acc)

    return res


In [45]:
ks = [1, 3, 5, 7, 9, 11]

total_res = {}

for model, (preproc_index, feature_transform) in experiments.items():
    res = knn_trainer(model, preproc_index, feature_transform, ks)
    total_res[model] = res

mega_normal
New shapes: (2416, 49152) (658, 49152)
k: 1 Train Acciracy: 1.0 Val Accuracy: 0.8054711246200608
k: 3 Train Acciracy: 1.0 Val Accuracy: 0.7872340425531915
k: 5 Train Acciracy: 1.0 Val Accuracy: 0.7689969604863222
k: 7 Train Acciracy: 1.0 Val Accuracy: 0.7583586626139818
k: 9 Train Acciracy: 1.0 Val Accuracy: 0.7446808510638298
k: 11 Train Acciracy: 1.0 Val Accuracy: 0.7310030395136778
mega_square
New shapes: (2416, 49152) (658, 49152)
k: 1 Train Acciracy: 1.0 Val Accuracy: 0.8100303951367781
k: 3 Train Acciracy: 1.0 Val Accuracy: 0.7811550151975684
k: 5 Train Acciracy: 1.0 Val Accuracy: 0.7644376899696048
k: 7 Train Acciracy: 1.0 Val Accuracy: 0.7477203647416414
k: 9 Train Acciracy: 1.0 Val Accuracy: 0.729483282674772
k: 11 Train Acciracy: 1.0 Val Accuracy: 0.7218844984802432
blur_cubed
New shapes: (2416, 49152) (658, 49152)
k: 1 Train Acciracy: 1.0 Val Accuracy: 0.8753799392097265
k: 3 Train Acciracy: 1.0 Val Accuracy: 0.8449848024316109
k: 5 Train Acciracy: 1.0 Val Accura

In [46]:
test_res = {}

for model, (preproc_index, feature_transform) in experiments.items():
    res = total_res[model]
    
    optimal_k = 0
    optimal_val_acc = 0

    for k, kmap in res.items():
        if kmap['val_acc'] > optimal_val_acc:
            optimal_k = k
            optimal_val_acc = kmap['val_acc']
    
    print(model, optimal_k, optimal_val_acc)

    proc_data = data[preproc_index]
    X_train, y_train = proc_data["X_train"], proc_data["y_train"]
    X_val, y_val = proc_data["X_val"], proc_data["y_val"]
    X_test, y_test = proc_data["X_test"], proc_data["y_test"]

    pipe = feature_transforms[feature_transform]

    X_train_val = np.vstack([X_train, X_val])
    y_train_val = np.concatenate([y_train, y_val])

    X_train_val_Z = pipe.fit_transform(X_train_val)
    X_test_Z      = pipe.transform(X_test)

    knn_best = KNeighborsClassifier(
        n_neighbors=optimal_k,
        weights="distance",
        metric="euclidean",
        n_jobs=-1
    )
    knn_best.fit(X_train_val_Z, y_train_val)

    y_test_pred = knn_best.predict(X_test_Z)
    test_acc = accuracy_score(y_test, y_test_pred)

    test_res[model] = {
        "optimal_k": optimal_k,
        "test_accuracy": test_acc,
    }

    print("Optimal K Test Acccuracy", test_acc)

print(test_res)


mega_normal 1 0.8054711246200608
Optimal K Test Acccuracy 0.7942028985507247
mega_square 1 0.8100303951367781
Optimal K Test Acccuracy 0.782608695652174
blur_cubed 1 0.8753799392097265
Optimal K Test Acccuracy 0.8434782608695652
bright_rooted 1 0.756838905775076
Optimal K Test Acccuracy 0.7652173913043478
{'mega_normal': {'optimal_k': 1, 'test_accuracy': 0.7942028985507247}, 'mega_square': {'optimal_k': 1, 'test_accuracy': 0.782608695652174}, 'blur_cubed': {'optimal_k': 1, 'test_accuracy': 0.8434782608695652}, 'bright_rooted': {'optimal_k': 1, 'test_accuracy': 0.7652173913043478}}


In [None]:
# since blur gave good results with cube, testing with just blur transform

new_exps = {
    "blur_normal": (0, "normal"),
    "blur_square": (0, "squared"),
    "blur_cubed": (0, "cubed"),
    "blur_rooted": (0, "rooted"),
}

In [50]:
k = [1]
new_total_res = {}

for model, (preproc_index, feature_transform) in new_exps.items():
    res = knn_trainer(model, preproc_index, feature_transform, k)
    total_res[model] = res

blur_normal
New shapes: (2416, 49152) (658, 49152)
k: 1 Train Acciracy: 1.0 Val Accuracy: 0.8054711246200608
blurs_square
New shapes: (2416, 49152) (658, 49152)
k: 1 Train Acciracy: 1.0 Val Accuracy: 0.8100303951367781
blur_cubed
New shapes: (2416, 49152) (658, 49152)
k: 1 Train Acciracy: 1.0 Val Accuracy: 0.8753799392097265
blur_rooted
New shapes: (2416, 49152) (658, 49152)
k: 1 Train Acciracy: 1.0 Val Accuracy: 0.756838905775076


In [51]:
new_test_res = {}

for model, (preproc_index, feature_transform) in experiments.items():
    res = new_total_res[model]
    
    optimal_k = 0
    optimal_val_acc = 0

    for k, kmap in res.items():
        if kmap['val_acc'] > optimal_val_acc:
            optimal_k = k
            optimal_val_acc = kmap['val_acc']
    
    print(model, optimal_k, optimal_val_acc)

    proc_data = data[preproc_index]
    X_train, y_train = proc_data["X_train"], proc_data["y_train"]
    X_val, y_val = proc_data["X_val"], proc_data["y_val"]
    X_test, y_test = proc_data["X_test"], proc_data["y_test"]

    pipe = feature_transforms[feature_transform]

    X_train_val = np.vstack([X_train, X_val])
    y_train_val = np.concatenate([y_train, y_val])

    X_train_val_Z = pipe.fit_transform(X_train_val)
    X_test_Z      = pipe.transform(X_test)

    knn_best = KNeighborsClassifier(
        n_neighbors=optimal_k,
        weights="distance",
        metric="euclidean",
        n_jobs=-1
    )
    knn_best.fit(X_train_val_Z, y_train_val)

    y_test_pred = knn_best.predict(X_test_Z)
    test_acc = accuracy_score(y_test, y_test_pred)

    new_test_res[model] = {
        "optimal_k": optimal_k,
        "test_accuracy": test_acc,
    }

    print("Optimal K Test Acccuracy", test_acc)

print(new_test_res)


mega_normal 1 0.8054711246200608
Optimal K Test Acccuracy 0.7942028985507247
mega_square 1 0.8100303951367781
Optimal K Test Acccuracy 0.782608695652174
blur_cubed 1 0.8753799392097265
Optimal K Test Acccuracy 0.8434782608695652
bright_rooted 1 0.756838905775076
Optimal K Test Acccuracy 0.7652173913043478
{'mega_normal': {'optimal_k': 1, 'test_accuracy': 0.7942028985507247}, 'mega_square': {'optimal_k': 1, 'test_accuracy': 0.782608695652174}, 'blur_cubed': {'optimal_k': 1, 'test_accuracy': 0.8434782608695652}, 'bright_rooted': {'optimal_k': 1, 'test_accuracy': 0.7652173913043478}}
