In [1]:
import os
import sys
import json

# Optional GPU pin, left disabled. NOTE(review): the Build Model section requests gpus=2,
# so exposing a single device here would conflict with it.
#os.environ['CUDA_VISIBLE_DEVICES']='1'
# Add the parent directory to the module search path before the project imports run
# (presumably where `config` lives — TODO confirm).
sys.path.append('..')

import config

In [2]:
from utils.dataset import ImageNetDataset
from utils.transform import train_transform, valid_transform
from utils.generator import generator
from utils.callbacks import get_callbacks

Using TensorFlow backend.


# Prepare data

In [3]:
# Labels mapping: read the class map from disk, then invert it so that the first element
# of every entry points back at the entry's key.
with open(config.MAP_CLS) as map_file:
    label_to_class = json.load(map_file)

folder_to_label = dict(
    (class_entry[0], label) for label, class_entry in label_to_class.items()
)

In [4]:
# Prepare datasets: one ImageNetDataset per split, sharing the folder -> label lookup
# and differing only in source directory and transform.
train_dataset = ImageNetDataset(
    config.TRAIN_DIR,
    folder_to_label,
    transform=train_transform,
)
valid_dataset = ImageNetDataset(
    config.VALID_DIR,
    folder_to_label,
    transform=valid_transform,
)

In [5]:
# Prepare generators: wrap each dataset in a batch generator for Keras.
train_gen = generator(
    train_dataset,
    batch_size=256,
    num_workers=12,
    shuffle=True,
)
# NOTE(review): validation is drawn one sample at a time and shuffled; together with a
# fixed validation_steps in the fit call this may score a different subset each epoch —
# confirm against utils.generator.
valid_gen = generator(
    valid_dataset,
    batch_size=1,
    num_workers=12,
    shuffle=True,
)

# Build Model

In [6]:
from nn_models.dpn import DPN92
from nn_models.se_models.se_resnet import SEResNet18, SEResNet34
from keras.optimizers import SGD
from keras.utils import multi_gpu_model

In [7]:
# Prepare model: instantiate the SEResNet34 variant with an input_shape of (224, 224, 3).
model = SEResNet34(
    input_shape=(224, 224, 3),
)

In [8]:
# Keep a handle on the single-device template before wrapping it: the Keras documentation
# for multi_gpu_model says to call .save() / .save_weights() on the template model (the
# argument passed in) rather than on the returned wrapper. Previously the only reference
# to the template was overwritten here.
# `model` is still rebound to the wrapper, so every later cell keeps working unchanged.
# NOTE: run this cell once per freshly built model — re-running it would wrap the wrapper.
template_model = model
model = multi_gpu_model(template_model, gpus=2)

In [9]:
# Optional warm start, left disabled: uncomment to load previously saved weights into `model`.
# NOTE(review): the path points at the se_resnet18 checkpoint folder while this notebook
# builds SEResNet34 — confirm the file matches the architecture before enabling.
#model.load_weights('../../checkpoints/se_resnet18/weights_ep210.h5')

In [10]:
# SGD with momentum; the learning rate given here is the starting value only
# (the training log shows a LearningRateScheduler callback adjusting it later).
opt = SGD(lr=0.6, momentum=0.9)

# Same compile call, spelled with explicit keywords.
model.compile(
    optimizer=opt,
    loss='categorical_crossentropy',
    metrics=[
        'categorical_accuracy',
        'top_k_categorical_accuracy',
    ],
)

In [11]:
# Callback list for this run; checkpoints are selected on validation loss.
# The contents and order of the returned list are decided by utils.callbacks.get_callbacks.
callbacks = get_callbacks(
    'se_resnet34',
    checkpoints_dir=config.CHECKPOINTS_DIR,
    monitor='val_loss',
    log_dir=config.LOGS_DIR,
)

Instructions for updating:
Use the retry module or similar alternatives.


# Train

In [21]:
from keras import backend as K

# Overwrite the optimizer's learning-rate variable in place with a small fixed value.
# NOTE(review): this cell sits before the fit call but carries a later execution count;
# on a top-to-bottom run it replaces the lr=0.6 set at compile time, and the
# LearningRateScheduler seen in the training log may override it again — confirm intent.
K.set_value(model.optimizer.lr, 0.0005)

In [13]:
# Copy of the callback list without its final entry.
# NOTE(review): the fit cell below passes `callbacks`, not `new_callbacks` — confirm which
# list is meant to be used.
new_callbacks = callbacks[:len(callbacks) - 1]

In [14]:
# Training run: 200 epochs of 5000 generator steps each, starting from epoch 0, with
# 10000 validation steps after every epoch and the full callback list attached.
fit_kwargs = dict(
    steps_per_epoch=5000,
    initial_epoch=0,
    epochs=200,
    validation_data=valid_gen,
    validation_steps=10000,
    callbacks=callbacks,
)
model.fit_generator(train_gen, **fit_kwargs)

Epoch 1/200

Epoch 00001: LearningRateScheduler reducing learning rate to 0.6000000238418579.

Epoch 00001: val_loss improved from inf to 4.58462, saving model to /home/user/project/checkpoints/se_resnet34/weights-0.1897.h5
Epoch 2/200

Epoch 00002: LearningRateScheduler reducing learning rate to 0.6000000238418579.

Epoch 00002: val_loss improved from 4.58462 to 4.05150, saving model to /home/user/project/checkpoints/se_resnet34/weights-0.2635.h5
Epoch 3/200

Epoch 00003: LearningRateScheduler reducing learning rate to 0.6000000238418579.

Epoch 00003: val_loss improved from 4.05150 to 3.80157, saving model to /home/user/project/checkpoints/se_resnet34/weights-0.3071.h5
Epoch 4/200

Epoch 00004: LearningRateScheduler reducing learning rate to 0.6000000238418579.

Epoch 00004: val_loss improved from 3.80157 to 3.28695, saving model to /home/user/project/checkpoints/se_resnet34/weights-0.3886.h5
Epoch 5/200

Epoch 00005: LearningRateScheduler reducing learning rate to 0.6000000238418579


Epoch 00020: val_loss did not improve
Epoch 21/200

Epoch 00021: LearningRateScheduler reducing learning rate to 0.6000000238418579.

Epoch 00021: val_loss did not improve
Epoch 22/200

Epoch 00022: LearningRateScheduler reducing learning rate to 0.6000000238418579.

Epoch 00022: val_loss did not improve
Epoch 23/200

Epoch 00023: LearningRateScheduler reducing learning rate to 0.6000000238418579.

Epoch 00023: val_loss did not improve
Epoch 24/200

Epoch 00024: LearningRateScheduler reducing learning rate to 0.6000000238418579.

Epoch 00024: val_loss did not improve
Epoch 25/200

Epoch 00025: LearningRateScheduler reducing learning rate to 0.6000000238418579.

Epoch 00025: val_loss did not improve
Epoch 26/200

Epoch 00026: LearningRateScheduler reducing learning rate to 0.6000000238418579.

Epoch 00026: val_loss improved from 2.44193 to 2.40386, saving model to /home/user/project/checkpoints/se_resnet34/weights-0.5499.h5
Epoch 27/200

Epoch 00027: LearningRateScheduler reducing lear


Epoch 00041: val_loss improved from 2.35500 to 1.65993, saving model to /home/user/project/checkpoints/se_resnet34/weights-0.6543.h5
Epoch 42/200

Epoch 00042: LearningRateScheduler reducing learning rate to 0.06000000238418579.

Epoch 00042: val_loss did not improve
Epoch 43/200

Epoch 00043: LearningRateScheduler reducing learning rate to 0.06000000238418579.

Epoch 00043: val_loss improved from 1.65993 to 1.65623, saving model to /home/user/project/checkpoints/se_resnet34/weights-0.6412.h5
Epoch 44/200

Epoch 00044: LearningRateScheduler reducing learning rate to 0.06000000238418579.

Epoch 00044: val_loss did not improve
Epoch 45/200

Epoch 00045: LearningRateScheduler reducing learning rate to 0.06000000238418579.

Epoch 00045: val_loss did not improve
Epoch 46/200

Epoch 00046: LearningRateScheduler reducing learning rate to 0.06000000238418579.

Epoch 00046: val_loss did not improve
Epoch 47/200

Epoch 00047: LearningRateScheduler reducing learning rate to 0.06000000238418579.



Epoch 00062: val_loss did not improve
Epoch 63/200

Epoch 00063: LearningRateScheduler reducing learning rate to 0.06000000238418579.

Epoch 00063: val_loss did not improve
Epoch 64/200

Epoch 00064: LearningRateScheduler reducing learning rate to 0.06000000238418579.

Epoch 00064: val_loss did not improve
Epoch 65/200

Epoch 00065: LearningRateScheduler reducing learning rate to 0.06000000238418579.

Epoch 00065: val_loss did not improve
Epoch 66/200

Epoch 00066: LearningRateScheduler reducing learning rate to 0.06000000238418579.

Epoch 00066: val_loss improved from 1.58144 to 1.56885, saving model to /home/user/project/checkpoints/se_resnet34/weights-0.6604.h5
Epoch 67/200

Epoch 00067: LearningRateScheduler reducing learning rate to 0.06000000238418579.

Epoch 00067: val_loss did not improve
Epoch 68/200

Epoch 00068: LearningRateScheduler reducing learning rate to 0.06000000238418579.

Epoch 00068: val_loss did not improve
Epoch 69/200

Epoch 00069: LearningRateScheduler reducin

Epoch 84/200

Epoch 00084: LearningRateScheduler reducing learning rate to 0.006000000052154064.

Epoch 00084: val_loss did not improve
Epoch 85/200

Epoch 00085: LearningRateScheduler reducing learning rate to 0.006000000052154064.

Epoch 00085: val_loss did not improve
Epoch 86/200

Epoch 00086: LearningRateScheduler reducing learning rate to 0.006000000052154064.

Epoch 00086: val_loss did not improve
Epoch 87/200

Epoch 00087: LearningRateScheduler reducing learning rate to 0.006000000052154064.

Epoch 00087: val_loss did not improve
Epoch 88/200

Epoch 00088: LearningRateScheduler reducing learning rate to 0.006000000052154064.

Epoch 00088: val_loss improved from 1.45256 to 1.43831, saving model to /home/user/project/checkpoints/se_resnet34/weights-0.6864.h5
Epoch 89/200

Epoch 00089: LearningRateScheduler reducing learning rate to 0.006000000052154064.

Epoch 00089: val_loss improved from 1.43831 to 1.40206, saving model to /home/user/project/checkpoints/se_resnet34/weights-0.68


Epoch 00105: val_loss did not improve
Epoch 106/200

Epoch 00106: LearningRateScheduler reducing learning rate to 0.006000000052154064.

Epoch 00106: val_loss did not improve
Epoch 107/200

Epoch 00107: LearningRateScheduler reducing learning rate to 0.006000000052154064.

Epoch 00107: val_loss did not improve
Epoch 108/200

Epoch 00108: LearningRateScheduler reducing learning rate to 0.006000000052154064.

Epoch 00108: val_loss did not improve
Epoch 109/200

Epoch 00109: LearningRateScheduler reducing learning rate to 0.006000000052154064.

Epoch 00109: val_loss did not improve
Epoch 110/200

Epoch 00110: LearningRateScheduler reducing learning rate to 0.006000000052154064.

Epoch 00110: val_loss did not improve
Epoch 111/200

Epoch 00111: LearningRateScheduler reducing learning rate to 0.006000000052154064.

Epoch 00111: val_loss improved from 1.39724 to 1.37183, saving model to /home/user/project/checkpoints/se_resnet34/weights-0.6855.h5
Epoch 112/200

Epoch 00112: LearningRateSche


Epoch 00126: val_loss did not improve
Epoch 127/200

Epoch 00127: LearningRateScheduler reducing learning rate to 0.0006000000284984708.

Epoch 00127: val_loss did not improve
Epoch 128/200

Epoch 00128: LearningRateScheduler reducing learning rate to 0.0006000000284984708.

Epoch 00128: val_loss did not improve
Epoch 129/200

Epoch 00129: LearningRateScheduler reducing learning rate to 0.0006000000284984708.

Epoch 00129: val_loss did not improve
Epoch 130/200

Epoch 00130: LearningRateScheduler reducing learning rate to 0.0006000000284984708.

Epoch 00130: val_loss did not improve
Epoch 131/200

Epoch 00131: LearningRateScheduler reducing learning rate to 0.0006000000284984708.

Epoch 00131: val_loss did not improve
Epoch 132/200

Epoch 00132: LearningRateScheduler reducing learning rate to 0.0006000000284984708.

Epoch 00132: val_loss did not improve
Epoch 133/200

Epoch 00133: LearningRateScheduler reducing learning rate to 0.0006000000284984708.

Epoch 00133: val_loss did not imp

Process Process-1927:
Process Process-1924:
Process Process-1873:
Process Process-1929:
Process Process-1930:
Process Process-1880:
Process Process-1928:
Process Process-1878:
Process Process-1921:
Process Process-1882:
Process Process-1884:
Process Process-1925:
Process Process-1879:
Process Process-1922:
Process Process-1877:
Process Process-1883:
Process Process-1926:
Process Process-1932:
Traceback (most recent call last):
  File "/opt/anaconda/anaconda3/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
  File "/opt/anaconda/anaconda3/lib/python3.6/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
Traceback (most recent call last):
Traceback (most recent call last):
Process Process-1875:
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Process Process-1874:
Traceback (most recent call last):
Traceback (most recent call l

  File "/opt/anaconda/anaconda3/lib/python3.6/site-packages/torch/utils/data/dataloader.py", line 96, in _worker_loop
    r = index_queue.get(timeout=MANAGER_STATUS_CHECK_INTERVAL)
  File "/opt/anaconda/anaconda3/lib/python3.6/site-packages/torch/utils/data/dataloader.py", line 96, in _worker_loop
    r = index_queue.get(timeout=MANAGER_STATUS_CHECK_INTERVAL)
  File "/opt/anaconda/anaconda3/lib/python3.6/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
  File "/opt/anaconda/anaconda3/lib/python3.6/multiprocessing/queues.py", line 104, in get
    if not self._poll(timeout):
  File "/opt/anaconda/anaconda3/lib/python3.6/multiprocessing/connection.py", line 257, in poll
    return self._poll(timeout)
  File "/opt/anaconda/anaconda3/lib/python3.6/multiprocessing/connection.py", line 257, in poll
    return self._poll(timeout)
  File "/opt/anaconda/anaconda3/lib/python3.6/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **s

  File "/opt/anaconda/anaconda3/lib/python3.6/multiprocessing/connection.py", line 911, in wait
    ready = selector.select(timeout)
  File "/opt/anaconda/anaconda3/lib/python3.6/multiprocessing/connection.py", line 414, in _poll
    r = wait([self], timeout)
  File "/opt/anaconda/anaconda3/lib/python3.6/multiprocessing/connection.py", line 257, in poll
    return self._poll(timeout)
  File "/opt/anaconda/anaconda3/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
  File "/opt/anaconda/anaconda3/lib/python3.6/multiprocessing/connection.py", line 414, in _poll
    r = wait([self], timeout)
  File "/opt/anaconda/anaconda3/lib/python3.6/site-packages/torch/utils/data/dataloader.py", line 106, in <listcomp>
    samples = collate_fn([dataset[i] for i in batch_indices])
  File "/opt/anaconda/anaconda3/lib/python3.6/multiprocessing/connection.py", line 911, in wait
    ready = selector.select(timeout)
  File "/opt/anaconda/anaconda3/lib/python3.6/selectors.py",

KeyboardInterrupt
KeyboardInterrupt
  File "/opt/anaconda/anaconda3/lib/python3.6/selectors.py", line 376, in select
    fd_event_list = self._poll.poll(timeout)
KeyboardInterrupt
  File "/opt/anaconda/anaconda3/lib/python3.6/multiprocessing/queues.py", line 104, in get
    if not self._poll(timeout):
  File "/opt/anaconda/anaconda3/lib/python3.6/selectors.py", line 376, in select
    fd_event_list = self._poll.poll(timeout)
  File "/opt/anaconda/anaconda3/lib/python3.6/site-packages/albumentations/core/composition.py", line 58, in __call__
    data = self.run_transforms_if_needed(need_to_run, data)
KeyboardInterrupt
  File "/opt/anaconda/anaconda3/lib/python3.6/site-packages/torch/utils/data/dataloader.py", line 106, in _worker_loop
    samples = collate_fn([dataset[i] for i in batch_indices])
KeyboardInterrupt
  File "/opt/anaconda/anaconda3/lib/python3.6/multiprocessing/connection.py", line 414, in _poll
    r = wait([self], timeout)
  File "/opt/anaconda/anaconda3/lib/python3.6/sit

KeyboardInterrupt: 

In [23]:
# Save the final weights under this notebook's own architecture folder. The previous
# hard-coded '../../checkpoints/se_resnet18/...' path filed SEResNet34 weights under the
# se_resnet18 run and could overwrite that run's file of the same name on a full re-run.
# NOTE(review): assumes checkpoints for this run live in <CHECKPOINTS_DIR>/se_resnet34,
# as the ModelCheckpoint paths in the training log suggest — confirm.
# NOTE: `model` is the multi_gpu_model wrapper here, so the file holds the wrapper's weights.
final_weights_dir = os.path.join(config.CHECKPOINTS_DIR, 'se_resnet34')
os.makedirs(final_weights_dir, exist_ok=True)  # make sure the target folder exists before writing
model.save_weights(os.path.join(final_weights_dir, 'weights_ep310.h5'))