In [9]:
import tensorflow as tf
from tensorflow.keras.preprocessing.image import ImageDataGenerator
import numpy as np
import pickle
import os
from collections import defaultdict
import pandas as pd

In [10]:
AUTOTUNE = tf.data.experimental.AUTOTUNE

TRAIN_DIR = 'train_augmented'
TEST_DIR = 'test'
RES_DIR = 'results'

train_features_path = os.path.join(RES_DIR, 'train_features')
test_features_path = os.path.join(RES_DIR, 'test_features')
test_labels_path = os.path.join(RES_DIR, 'test_labels')
scaler_path = os.path.join(RES_DIR, 'scaler')
scaled_train_path = os.path.join(RES_DIR, 'scaled_train')
scaled_test_path = os.path.join(RES_DIR, 'scaled_test')
pca_path = os.path.join(RES_DIR, 'pca')
best_params_svm_path = os.path.join(RES_DIR, 'best_params_svm')

BATCH_SIZE = 32
IMG_WIDTH = 224
IMG_HEIGHT = 224

In [15]:
# construct pandas dataframes
def construct_dataframe(dir):
    label_dirs = os.listdir(dir)
    dataframe_data = []
    
    for label in label_dirs:
        label_path = os.path.join(dir, label)
        if os.path.isdir(label_path):
            for image_name in os.listdir(label_path):
                image_path = os.path.join(label_path, image_name)
                dataframe_data.append([image_path, label])
    
    df = pd.DataFrame(dataframe_data, columns=['filepath', 'label'])
    return df


train_df = construct_dataframe(TRAIN_DIR)
test_df = construct_dataframe(TEST_DIR)
print(test_df)

                         filepath     label
0    images/test/negative/342.jpg  negative
1    images/test/negative/159.jpg  negative
2    images/test/negative/198.jpg  negative
3    images/test/negative/275.jpg  negative
4    images/test/negative/484.jpg  negative
..                            ...       ...
371        images/test/mink/5.jpg      mink
372       images/test/mink/42.jpg      mink
373        images/test/mink/0.jpg      mink
374       images/test/mink/44.jpg      mink
375       images/test/mink/43.jpg      mink

[376 rows x 2 columns]


In [16]:
# image_gen = ImageDataGenerator(
#     featurewise_center=True,
#     featurewise_std_normalization=True,
#     rotation_range=20,
#     width_shift_range=0.2,
#     height_shift_range=0.2,
#     horizontal_flip=True,
#     preprocessing_function=tf.keras.applications.resnet50.preprocess_input
# )

image_gen = ImageDataGenerator(
    preprocessing_function=tf.keras.applications.resnet50.preprocess_input
)

train_flow = image_gen.flow_from_dataframe(
    train_df,
    x_col='filepath',
    y_col='label',
    batch_size=BATCH_SIZE,
    target_size=(IMG_HEIGHT, IMG_WIDTH),
    seed=42,
    shuffle=False,
    interpolation="bilinear",
)

test_flow = image_gen.flow_from_dataframe(
    test_df,
    x_col='filepath',
    y_col='label',
    batch_size=BATCH_SIZE,
    target_size=(IMG_HEIGHT, IMG_WIDTH),
    seed=42,
    shuffle=False,
    interpolation="bilinear",
)

Found 2505 validated image filenames belonging to 1 classes.
Found 376 validated image filenames belonging to 2 classes.


In [17]:
from tensorflow.keras.applications.resnet50 import ResNet50
import pandas as pd

In [None]:
def extract_resnet(X):
    resnet_model = ResNet50(weights='imagenet', include_top=False, classes=False)

    features_array = resnet_model.predict(X)
    features_array = np.reshape(features_array, (-1, features_array.shape[1]*features_array.shape[2]*
                                                 features_array.shape[3]))
        
    return features_array


train_features = extract_resnet(train_flow)
with open(train_features_path, 'wb') as f:
    pickle.dump(train_features, f)

test_features = extract_resnet(test_flow)
with open(test_features_path, 'wb') as f:
    pickle.dump(test_features, f)

with open(test_labels_path, 'wb') as f:
    pickle.dump(np.where(test_df['label'].to_numpy()=='positive', 1, -1), f)

In [7]:
# load resnet features
with open(train_features_path, 'rb') as f:
    train_features = pickle.load(f)

with open(test_features_path, 'rb') as f:
    test_features = pickle.load(f)
with open(test_labels_path, 'rb') as f:
    test_labels = pickle.load(f)
# print(test_labels)

In [8]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

In [9]:
# Apply standard scaler to output from resnet50
ss = StandardScaler()
ss.fit(train_features)
with open(scaler_path, 'wb') as f:
    pickle.dump(ss, f)

train_features = ss.transform(train_features)
test_features = ss.transform(test_features)

with open(scaled_train_path, 'wb') as f:
    pickle.dump(train_features, f)

with open(scaled_test_path, 'wb') as f:
    pickle.dump(test_features, f)

    
# Take PCA to reduce feature space dimensionality
pca = PCA(n_components=512, whiten=True)
pca = pca.fit(train_features)

with open(pca_path 'wb') as f:
    pickle.dump(pca, f)

print('Explained variance percentage = %0.2f' % sum(pca.explained_variance_ratio_))

train_features = pca.transform(train_features)
test_features = pca.transform(test_features)

Explained variance percentage = 0.76


In [10]:
from sklearn.model_selection import ParameterGrid
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.ensemble import IsolationForest
from sklearn import svm

In [11]:
# Train classifier and obtain predictions for OC-SVM
grid = {
    'gamma': [0.001, 0.0001, 0.00001, 0.000001, 'auto', 'scale'],
    'kernel': ['rbf', 'linear'],
    'nu': [0.01, 0.02, 0.05, 0.08, 0.10, 0.12, 0.14, 0.16, 0.18, 0.2, 0.22, 0.24, 0.26],
    'shrinking': [True, False]
}
# grid = {
#     'gamma': [0.001],
#     'kernel': ['rbf'],
#     'nu': [0.08],
#     'shrinking': [False]
# }


oc_svm = svm.OneClassSVM()
best_score = -1

for params in ParameterGrid(grid):
    oc_svm.set_params(**params)
    oc_svm.fit(train_features)
    
    y_pred = oc_svm.predict(test_features)
    score = f1_score(test_labels, y_pred)
    print(score)
    
    if score > best_score:
        best_score = score
        best_params = params
        best_y_pred = y_pred

    
print(best_params)
print('Best score:', best_score)
print()

with open(best_params_svm_path, 'wb') as f:
    pickle.dump(best_params, f)

0.23058823529411762
0.23058823529411762
0.23058823529411762
0.23058823529411762
0.23474178403755872
0.23474178403755872
0.23058823529411762
0.23058823529411762
0.23058823529411762
0.23058823529411762
0.23058823529411762
0.23058823529411762
0.23058823529411762
0.23058823529411762
0.23058823529411762
0.23058823529411762
0.23058823529411762
0.23058823529411762
0.23058823529411762
0.23058823529411762
0.23058823529411762
0.23058823529411762
0.2311320754716981
0.2311320754716981
0.2311320754716981
0.2311320754716981
0.16770186335403728
0.152317880794702
0.15576323987538943
0.16000000000000003
0.2098360655737705
0.2027027027027027
0.16835016835016836
0.18118466898954705
0.19444444444444445
0.1741935483870968
0.19548872180451127
0.21960784313725493
0.21926910299003322
0.22471910112359553
0.17177914110429449
0.16314199395770393
0.1796875
0.18374558303886926
0.1346153846153846
0.134185303514377
0.16494845360824742
0.16783216783216787
0.16091954022988508
0.18532818532818532
0.15709969788519634
0.

In [None]:
# grid = {
#     'gamma': [0.001, 0.0001, 0.00001, 0.000001, 'auto', 'scale'],
#     'kernel': ['rbf', 'linear'],
#     'nu': [0.01, 0.02, 0.05, 0.08, 0.10, 0.12],
#     'shrinking': [True, False]
# }
grid = {
    'contamination': [0.01, 0.08],
    'max_features': [1, 2, 3, 4, 5],
    'max_samples': [1, 2, 3, 4, 5],
    'n_estimators': [40, 100, 120, 150, 300]
}


if_clf = IsolationForest()
best_score = -1

for params in ParameterGrid(grid):
    if_clf.set_params(**params)
    if_clf.fit(train_features)
    
    y_pred = if_clf.predict(test_features)
    score = f1_score(test_labels, y_pred)
    print(score)
    
    if score > best_score:
        best_score = score
        best_params = params
        best_y_pred = y_pred

    
print(best_params)
print('Best score:', best_score)

with open(best_params_if_path, 'wb') as f:
    pickle.dump(best_params, f)

In [38]:
print(best_y_pred, test_labels)

[ 1 -1 -1 -1  1 -1 -1 -1 -1  1 -1  1  1 -1 -1  1  1  1  1  1  1  1  1  1
  1  1  1  1  1  1  1 -1 -1 -1 -1  1  1  1 -1  1 -1  1  1  1 -1  1  1 -1
  1 -1 -1 -1 -1  1  1 -1  1 -1  1  1 -1 -1 -1  1 -1 -1  1  1  1  1 -1  1
 -1  1 -1 -1 -1 -1  1 -1  1  1 -1  1  1 -1  1 -1  1  1 -1 -1  1 -1  1  1
 -1  1 -1  1 -1 -1  1 -1 -1 -1 -1  1  1  1  1  1  1 -1  1 -1 -1  1  1 -1
 -1  1  1 -1 -1  1  1 -1  1  1  1 -1 -1 -1 -1 -1  1  1  1  1 -1  1 -1 -1
 -1  1  1 -1 -1 -1  1 -1  1 -1  1  1 -1 -1 -1 -1 -1 -1  1  1  1  1  1  1
 -1 -1 -1  1  1 -1  1 -1 -1  1  1  1 -1  1 -1  1  1  1 -1 -1  1  1  1 -1
 -1  1  1  1 -1  1  1  1 -1 -1  1  1  1 -1 -1 -1  1  1  1 -1  1  1  1  1
  1 -1  1  1 -1  1 -1 -1 -1 -1 -1 -1 -1  1 -1 -1 -1  1  1 -1  1  1 -1 -1
 -1 -1 -1 -1 -1  1  1  1 -1  1 -1  1  1 -1  1  1 -1  1 -1 -1  1 -1 -1 -1
  1  1 -1  1  1  1 -1  1 -1  1 -1  1  1 -1 -1 -1  1  1 -1  1 -1 -1  1  1
 -1 -1 -1  1 -1 -1 -1 -1 -1  1  1  1  1 -1 -1  1 -1  1 -1 -1  1  1  1  1
  1 -1  1  1  1  1 -1 -1  1 -1 -1 -1  1  1  1  1  1