In [1]:
from keras.src.applications.resnet import ResNet50
from keras.src.applications.resnet import preprocess_input
from keras.src.utils.image_utils import img_to_array
from keras.src.utils.image_utils import load_img
from sklearn.model_selection import train_test_split
import pandas as pd

import numpy as np
from pandas import Series

# Pretrained ResNet50 used only as a feature extractor: ImageNet weights,
# no top classification layer, 224x224 three-channel inputs.
model = ResNet50(
    include_top=False,
    weights="imagenet",
    input_shape=(224, 224, 3),
)


def extract_images(img_path: Series, prefix):
    """Load every image named in ``img_path`` and run it through ``preprocess_input``.

    Args:
        img_path: Series of file names.
        prefix: Value prepended to each file name with ``+`` (no path
            separator is inserted).

    Returns:
        A Series with one element per entry of ``img_path``: the image loaded
        with ``target_size=(224, 224)``, converted by ``img_to_array`` and
        then passed to ``preprocess_input``.
    """
    full_paths = img_path.apply(lambda name: prefix + name)
    loaded = full_paths.apply(lambda path: load_img(path, target_size=(224, 224)))
    as_arrays = loaded.apply(img_to_array)
    return as_arrays.apply(preprocess_input)


# Dataset: https://www.kaggle.com/datasets/phucthaiv02/butterfly-image-classification
df_train = pd.read_csv('Training_set.csv')

# Keep the target column as its own single-column frame, remove it from the
# table, then load and preprocess the image named in each row's 'filename'.
labels = df_train[['label']]
df_train = df_train.drop(columns=['label'])
feats_train = extract_images(img_path=df_train['filename'], prefix='./train/')

2024-12-23 22:37:18.048018: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1734986238.118189  139477 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1734986238.139372  139477 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-12-23 22:37:18.314433: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
I0000 00:00:1734986242.608647  139477 gpu_device.cc:2022] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 55

In [2]:
# Join the per-image arrays held in feats_train into a single array.
batch_train = np.stack(list(feats_train))

# Split the batch into smaller batches
def process_in_batches(b, batch_size=16):
    """Lazily yield consecutive slices of ``b``, each ``batch_size`` long.

    The last slice is shorter when ``len(b)`` is not a multiple of
    ``batch_size``; an empty ``b`` yields nothing.
    """
    whole, leftover = divmod(len(b), batch_size)
    # A non-zero remainder means one extra, partially filled slice.
    total = whole + (1 if leftover else 0)
    for start in range(0, total * batch_size, batch_size):
        yield b[start:start + batch_size]

# Run the model over 64-item slices of batch_train and join the per-slice
# outputs along the first axis.
features_train = np.concatenate([
    model.predict(chunk, verbose=0)
    for chunk in process_in_batches(batch_train, batch_size=64)
])
# Collapse every axis after the first so each sample becomes one flat row.
train_flattened = features_train.reshape(len(features_train), -1)

I0000 00:00:1734986248.814154  139594 service.cc:148] XLA service 0x7f70700015c0 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1734986248.814442  139594 service.cc:156]   StreamExecutor device (0): NVIDIA GeForce RTX 3070 Ti, Compute Capability 8.6
2024-12-23 22:37:28.871500: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:268] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
I0000 00:00:1734986249.180454  139594 cuda_dnn.cc:529] Loaded cuDNN version 90300
I0000 00:00:1734986251.682016  139594 device_compiler.h:188] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


In [3]:
from datasketch import MinHashLSH, MinHash
# Number of permutations; the same value is passed as num_perm to every
# MinHash and to the MinHashLSH index.
k = 128
# Module-level tally, reset to zero whenever this cell is executed.
counter = 0
def create_minhash_features(fs, progress_every=100):
    """Build one MinHash signature per feature vector.

    Every value of a vector is converted with ``str()``, UTF-8 encoded and fed
    to that vector's MinHash, which is created with ``num_perm=k``.

    Args:
        fs: Iterable of feature vectors; each vector is itself iterable.
        progress_every: Print the running count after every this many
            vectors. Pass 0 or None to disable progress output.

    Returns:
        A list of MinHash objects, in the same order as ``fs``.
    """
    # NOTE(review): the hashed tokens carry no position information, so two
    # vectors are compared only through the strings of the values they
    # contain. Confirm this is the intended notion of similarity.
    minhash_features = []
    # The count is local to the call (enumerate) rather than a module-level
    # global, so the progress output does not depend on earlier invocations.
    for processed, vector in enumerate(fs, start=1):
        if progress_every and processed % progress_every == 0:
            print(processed)
        m = MinHash(num_perm=k)
        for val in vector:
            m.update(str(val).encode('utf8'))
        minhash_features.append(m)
    return minhash_features

# Create MinHash signatures for the training set
train_minhashes = create_minhash_features(train_flattened)

100
200
300
400
500
600
700
800
900
1000


In [4]:
# Split signatures, labels and flattened features in a single call so the
# three train/test partitions are produced from the same shuffle.
hashes_train, hashes_test, labels_train, labels_test, features_train, features_test = train_test_split(
    train_minhashes, labels, train_flattened, test_size=0.2, random_state=420
)

lsh = MinHashLSH(threshold=0.1, num_perm=k)

# Each key encodes the position inside hashes_train so that a hit can be
# mapped back to its training label.
for i, minhash in enumerate(hashes_train):
    lsh.insert(f'img_{i}', minhash)

# Hoisted out of the loop: the label arrays do not change between queries.
train_label_values = labels_train['label'].values
test_label_values = labels_test['label'].values

# No `global` statement is needed here: this is already module scope, and
# declaring a name global after assigning it is a SyntaxError when the cell
# is compiled as a plain script.
correct_pred = 0
for i, minhash in enumerate(hashes_test):
    res = lsh.query(minhash)
    if res:
        # query() returns candidate keys without ranking them, so pick the
        # candidate with the highest estimated Jaccard similarity instead of
        # whatever happens to be first in the list.
        candidate_positions = [int(key.split('_')[1]) for key in res]
        closest = max(candidate_positions, key=lambda pos: hashes_train[pos].jaccard(minhash))
        if train_label_values[closest] == test_label_values[i]:
            correct_pred += 1
# Queries with no candidates count as misses (denominator is the full test set).
print(f'Accuracy: {100 * correct_pred / len(hashes_test):.2f}%')

Accuracy: 0.00%


In [5]:
from sklearn.neighbors import KNeighborsClassifier

# Baseline: 5-nearest-neighbours on the flattened features. The targets are
# passed as the 1-D 'label' column; handing scikit-learn the single-column
# DataFrame makes fit() emit a DataConversionWarning.
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(features_train, labels_train['label'])
accuracy_knn = knn.score(features_test, labels_test['label'])
print(f'KNN Accuracy: {accuracy_knn * 100:.2f}%')

  return self._fit(X, y)


KNN Accuracy: 35.50%


## Висновок
Цей датасет не придатний для звичайної «простої» класифікації, як-от відрізнення метеликів від велосипедів. Він містить 6499 зображень метеликів, а завдання полягає в тому, щоб визначити конкретний підвид метелика. Якщо алгоритму LSH згодувати весь датасет (усі 6499 розмічених зображень), то точність становитиме 1.5%, що все ще є дуже низьким показником. LSH не варто застосовувати для класифікації за такими деталізованими ознаками, як підвид метелика, адже призначення LSH — оптимізація пошуку шляхом «згладжування ознак».