In [1]:
import warnings
warnings.filterwarnings('always')
warnings.filterwarnings('ignore')

In [2]:
from tensorflow.python.client import device_lib
# Sanity check: show the compute devices TensorFlow reports for this machine.
local_devices = device_lib.list_local_devices()
local_devices

[name: "/device:CPU:0"
 device_type: "CPU"
 memory_limit: 268435456
 locality {
 }
 incarnation: 12815932513595719851, name: "/device:GPU:0"
 device_type: "GPU"
 memory_limit: 9214062756
 locality {
   bus_id: 1
   links {
   }
 }
 incarnation: 1864648334589026242
 physical_device_desc: "device: 0, name: GeForce GTX 1080 Ti, pci bus id: 0000:01:00.0, compute capability: 6.1"]

In [3]:
import os
import numpy as np
np.random.seed(777)

import keras.backend as K
from keras.preprocessing.image import ImageDataGenerator

import sklearn
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score
import xgboost

# from sklearn.model_selection import train_test_split
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier

from tqdm import tqdm
import matplotlib.pyplot as plt
%matplotlib inline

Using TensorFlow backend.


In [4]:
# Square input resolution every image is resized to by the generators.
img_height = img_width = 224

# Number of images in each split; used as the number of single-image batches
# pulled from the corresponding generator.
nb_train_samples, nb_validation_samples, nb_test_samples = 386, 199, 155

In [5]:
# Root folders of the three image splits (one sub-folder per class is expected
# by flow_from_directory).
train_dir, validation_dir, test_dir = (
    'data_reduced/train/',
    'data_reduced/validation',
    'data_reduced/test',
)

In [6]:
# Keep the seed as a value: np.random.seed() returns None, so assigning its
# result (as was done before) passed seed=None to every generator and the
# shuffle order was not actually pinned.
random_seed = 777
np.random.seed(random_seed)

# Only rescale pixels to [0, 1].
# featurewise_center / featurewise_std_normalization were dropped: they need
# train_datagen.fit(<sample of training images>) to compute the dataset mean and
# std, which was never called, so Keras merely warned and applied nothing.
# Removing the flags keeps the produced pixel values unchanged and makes the
# train/validation preprocessing identical to the test preprocessing below.
train_datagen = ImageDataGenerator(rescale=1. / 255)

# batch_size=1 so that each call to .next() yields exactly one image/label pair.
train_generator = train_datagen.flow_from_directory(
    train_dir,
    target_size=(img_height, img_width),
    batch_size=1,
    seed=random_seed,
    shuffle=True,
    class_mode='categorical')

validation_generator = train_datagen.flow_from_directory(
    validation_dir,
    target_size=(img_height, img_width),
    batch_size=1,
    seed=random_seed,
    shuffle=True,
    class_mode='categorical')

test_datagen = ImageDataGenerator(rescale=1. / 255)

# The test split is read in directory order (shuffle=False).
test_generator = test_datagen.flow_from_directory(
    test_dir,
    target_size=(img_height, img_width),
    batch_size=1,
    seed=random_seed,
    shuffle=False,
    class_mode='categorical')

Found 386 images belonging to 2 classes.
Found 199 images belonging to 2 classes.
Found 155 images belonging to 2 classes.


In [9]:
# Pull one single-image batch per training sample, then split the batches into
# an image array and an integer label vector (argmax over the one-hot labels).
train_batches = [train_generator.next() for _ in tqdm(range(nb_train_samples))]
X_train = np.asarray([images[0] for images, _ in train_batches])
y_train = np.asarray([labels[0] for _, labels in train_batches]).argmax(axis=1)
# np.save('data/npy/X_train.npy', X_train)
# np.save('data/npy/y_train.npy', y_train)

100%|███████████████████████████████████████████████████████████████████████████████| 386/386 [00:01<00:00, 356.45it/s]


In [10]:
# Collapse every image into a single feature row (one row per sample).
X_train = X_train.reshape(X_train.shape[0], -1)

In [11]:
# Pull one single-image batch per validation sample, then split the batches into
# an image array and an integer label vector (argmax over the one-hot labels).
validation_batches = [validation_generator.next() for _ in tqdm(range(nb_validation_samples))]
X_validation = np.asarray([images[0] for images, _ in validation_batches])
y_validation = np.asarray([labels[0] for _, labels in validation_batches]).argmax(axis=1)
# np.save('data/npy/X_validation.npy', X_validation)
# np.save('data/npy/y_validation.npy', y_validation)

100%|███████████████████████████████████████████████████████████████████████████████| 199/199 [00:00<00:00, 426.55it/s]


In [12]:
# Collapse every image into a single feature row (one row per sample).
X_validation = X_validation.reshape(X_validation.shape[0], -1)

In [13]:
# Pull one single-image batch per test sample, then split the batches into
# an image array and an integer label vector (argmax over the one-hot labels).
test_batches = [test_generator.next() for _ in tqdm(range(nb_test_samples))]
X_test = np.asarray([images[0] for images, _ in test_batches])
y_test = np.asarray([labels[0] for _, labels in test_batches]).argmax(axis=1)
# np.save('data/npy/X_test.npy', X_test)
# np.save('data/npy/y_test.npy', y_test)

100%|███████████████████████████████████████████████████████████████████████████████| 155/155 [00:00<00:00, 360.83it/s]


In [14]:
# Collapse every image into a single feature row (one row per sample).
X_test = X_test.reshape(X_test.shape[0], -1)

In [15]:
# Report the shape of every feature matrix / label vector, split by split.
for caption, array in (
    ("X_train shape:", X_train),
    ("y_train shape:", y_train),
    ("X_validation shape:", X_validation),
    ("y_validation shape:", y_validation),
    ("X_test shape:", X_test),
    ("y_test shape:", y_test),
):
    print(caption, array.shape)
# plt.imshow(X_train[0])
# plt.show()

X_train shape: (386, 150528)
y_train shape: (386,)
X_validation shape: (199, 150528)
y_validation shape: (199,)
X_test shape: (155, 150528)
y_test shape: (155,)


____

In [17]:
# Persist the flattened train/validation arrays and read them back under the
# names used by the modelling cells below.
dataset_path = 'models/bottleneck_datasets.npz'

# Create the target folder on a fresh checkout instead of failing in np.savez.
os.makedirs(os.path.dirname(dataset_path), exist_ok=True)

# Arrays are stored positionally, hence the arr_0 ... arr_3 keys on reload.
np.savez(dataset_path, X_train, y_train, X_validation, y_validation)

# np.load returns an archive object holding an open file handle; the context
# manager closes it once the four arrays have been read into memory.
with np.load(dataset_path) as data:
    train_x = data['arr_0']
    train_y = data['arr_1']
    val_x = data['arr_2']
    val_y = data['arr_3']

train_x.shape, train_y.shape, val_x.shape, val_y.shape

((386, 150528), (386,), (199, 150528), (199,))

In [20]:
import time
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import accuracy_score, f1_score, make_scorer
from sklearn.model_selection import cross_val_score, cross_validate, StratifiedShuffleSplit
from sklearn.svm import LinearSVC
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

# Metrics gathered for every fold (other F1 flavours: f1_micro, f1_weighted).
scoring = {'accuracy': 'accuracy', 'f1': 'f1_macro'}

svm_bottleneck_SVC = SVC(
    C=1.0,
    gamma='auto',
    probability=True,
    tol=0.001,
    verbose=False,
    decision_function_shape='ovr',
)

# 5-fold cross-validation on the training split, three folds in parallel.
t0 = time.time()
scores = cross_validate(
    svm_bottleneck_SVC, train_x, train_y,
    scoring=scoring, cv=5, return_train_score=True, n_jobs=3, verbose=1,
)
print('finished in %.2fs' % (time.time() - t0))

mean_fit_time = np.mean(scores['fit_time'])
mean_score_time = np.mean(scores['score_time'])
print('Models took an average %ds to train and another %ds to score' %
      (mean_fit_time, mean_score_time))

# Mean and spread across folds, for each metric on the train and held-out parts.
for template, key in (
    ('Average accuracy was %.3f +- %.3f during training', 'train_accuracy'),
    ('Average accuracy was %.3f +- %.3f during testing', 'test_accuracy'),
    ('Average F1 was %.3f +- %.3f during training', 'train_f1'),
    ('Average F1 was %.3f +- %.3f during testing', 'test_f1'),
):
    fold_values = scores[key]
    print(template % (np.mean(fold_values), np.std(fold_values)))

[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done   5 out of   5 | elapsed:  1.4min finished


finished in 85.48s
Models took an average 27s to train and another 2s to score
Average accuracy was 0.905 +- 0.019 during training
Average accuracy was 0.907 +- 0.015 during testing
Average F1 was 0.809 +- 0.046 during training
Average F1 was 0.814 +- 0.031 during testing


In [19]:
import time
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import accuracy_score, f1_score, make_scorer
from sklearn.model_selection import cross_val_score, cross_validate, StratifiedShuffleSplit
from sklearn.svm import LinearSVC
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

# Metrics gathered for every fold (other F1 flavours: f1_micro, f1_weighted).
scoring = {'accuracy': 'accuracy', 'f1': 'f1_macro'}

# random_state pins the solver's internal data shuffling. Without it the folds
# are fitted in separate worker processes (n_jobs=3) with unseeded RNGs, so the
# reported scores could change from run to run. 777 matches the notebook seed.
svm_bottleneck_LinearSVC = LinearSVC(penalty='l2', loss='squared_hinge', tol=0.0001, C=1.0, multi_class='ovr',
                           verbose=0, max_iter=1000, random_state=777)


# 5-fold cross-validation on the training split, three folds in parallel.
t0 = time.time()
scores = cross_validate(svm_bottleneck_LinearSVC, train_x, train_y, scoring=scoring, cv=5,
                        return_train_score=True, n_jobs=3, verbose=1)

print('finished in %.2fs' % (time.time() - t0))

print('Models took an average %ds to train and another %ds to score' %
      (np.mean(scores['fit_time']), np.mean(scores['score_time'])))

# Mean and spread across folds, for each metric on the train and held-out parts.
print('Average accuracy was %.3f +- %.3f during training' %
      (np.mean(scores['train_accuracy']), np.std(scores['train_accuracy'])))

print('Average accuracy was %.3f +- %.3f during testing' %
      (np.mean(scores['test_accuracy']), np.std(scores['test_accuracy'])))

print('Average F1 was %.3f +- %.3f during training' %
      (np.mean(scores['train_f1']), np.std(scores['train_f1'])))

print('Average F1 was %.3f +- %.3f during testing' %
      (np.mean(scores['test_f1']), np.std(scores['test_f1'])))

[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done   5 out of   5 | elapsed:  2.4min finished


finished in 145.94s
Models took an average 69s to train and another 0s to score
Average accuracy was 1.000 +- 0.000 during training
Average accuracy was 0.870 +- 0.017 during testing
Average F1 was 1.000 +- 0.000 during training
Average F1 was 0.768 +- 0.025 during testing


### Predict on Test Set

In [None]:
# NOTE(review): `lda`, `pca`, `train_x_lda` and `train_x_pca` are not created in
# any cell visible in this notebook. They must be fitted/transformed before this
# cell can run on a fresh kernel -- TODO confirm where they come from.

# Project the validation features with both reducers. The PCA projection was
# previously used below (val_x_pca) without ever being computed.
val_x_lda = lda.transform(X_validation)
val_x_pca = pca.transform(X_validation)


# One identically configured RBF SVC per feature representation.
svms = [SVC(C=1.0, gamma='auto', probability=True, tol=0.001,
            verbose=False, decision_function_shape='ovr') for _ in range(3)]

t0 = time.time()
svms[0].fit(train_x_lda, train_y)  # LDA-projected features
svms[1].fit(train_x_pca, train_y)  # PCA-projected features
svms[2].fit(train_x, train_y)      # raw features, no dimensionality reduction
print('finished in %.1fs' % (time.time() - t0))

# Test-set projections are prepared here but not scored in this cell.
test_x_lda = lda.transform(X_test)
test_x_pca = pca.transform(X_test)

lda_preds  = svms[0].predict(val_x_lda)
pca_preds  = svms[1].predict(val_x_pca)
nodr_preds = svms[2].predict(val_x)

# Validation accuracy and macro-F1 for each representation (LDA, PCA, raw).
print(accuracy_score(val_y, lda_preds), f1_score(val_y, lda_preds, average='macro'))
print(accuracy_score(val_y, pca_preds), f1_score(val_y, pca_preds, average='macro'))
print(accuracy_score(val_y, nodr_preds), f1_score(val_y, nodr_preds, average='macro'))

In [None]:
# RBF SVC variance estimated s^2 = (0.003)^2

def inverse_variance_weighting(predictions, variances):
    """Combine several estimates into one inverse-variance weighted mean.

    Each estimate is weighted by ``1 / variance`` so that more certain
    estimates contribute more::

        result = sum(predictions_i / variances_i) / sum(1 / variances_i)

    Parameters
    ----------
    predictions : array-like
        One entry per estimator along the first axis. Entries may be scalars
        or arrays (e.g. one value per sample); the first axis is reduced.
    variances : array-like
        Strictly positive variance of each estimator, same length as
        ``predictions``. Either one value per estimator or the same shape as
        ``predictions``.

    Returns
    -------
    numpy scalar or numpy.ndarray
        The weighted mean; a scalar for 1-D ``predictions``, otherwise an array
        with the first axis removed.

    Raises
    ------
    ValueError
        If the inputs are empty, their lengths differ, ``variances`` has more
        dimensions than ``predictions``, or any variance is not positive.
    """
    predictions = np.asarray(predictions, dtype=float)
    variances = np.asarray(variances, dtype=float)

    if len(predictions) != len(variances):
        # Raise instead of exiting the interpreter so the caller (and the
        # notebook kernel) can handle the problem.
        raise ValueError('Predictions-variances mismatch.')
    if len(predictions) == 0:
        raise ValueError('At least one prediction is required.')
    if variances.ndim > predictions.ndim:
        raise ValueError('variances has more dimensions than predictions.')
    if np.any(variances <= 0):
        raise ValueError('Variances must be strictly positive.')

    weights = 1.0 / variances
    # Append singleton axes so per-estimator weights broadcast over any
    # trailing (per-sample) axes of the predictions.
    weights = weights.reshape(weights.shape + (1,) * (predictions.ndim - weights.ndim))

    weighted_sum = np.sum(weights * predictions, axis=0)
    weight_total = np.sum(weights, axis=0)
    return weighted_sum / weight_total

In [None]:
# VotingClassifier expects a list of (name, estimator) pairs, not bare estimators.
# NOTE(review): `model` (described by the author as the inception model, first
# top model) and `svm_bottleneck` are not defined in any cell visible in this
# notebook; `model` presumably needs a scikit-learn compatible wrapper to be
# usable here -- TODO confirm both before running.
voting_clf = VotingClassifier([('inception', model), ('svm', svm_bottleneck)])
voting_clf