[View in Colaboratory](https://colab.research.google.com/github/shenshutao/machine_learning/blob/master/bde.ipynb)

# Prepare environment

In [0]:
import sys
# Swap whatever Keras is installed for the pinned 2.1.4 release, using the
# kernel's own interpreter; `yes |` answers pip's uninstall confirmation prompt.
!yes | {sys.executable} -m pip uninstall keras
!{sys.executable} -m pip install keras==2.1.4


In [0]:
import keras
# Confirm the pinned version is the one actually imported.
# The call form of print works on both Python 2 and Python 3.
print(keras.__version__)

In [0]:
# Install google-drive-ocamlfuse and fuse from the alessandro-strada PPA.
!apt-get install -y -qq software-properties-common python-software-properties module-init-tools
!add-apt-repository -y ppa:alessandro-strada/ppa 2>&1 > /dev/null
!apt-get update -qq 2>&1 > /dev/null
!apt-get -y install -qq google-drive-ocamlfuse fuse
# Authenticate the Colab user and fetch the application-default credentials,
# whose client id/secret are passed to google-drive-ocamlfuse below.
from google.colab import auth
auth.authenticate_user()
from oauth2client.client import GoogleCredentials
creds = GoogleCredentials.get_application_default()
import getpass
# First headless run: only the output line containing "URL" is shown.
!google-drive-ocamlfuse -headless -id={creds.client_id} -secret={creds.client_secret} < /dev/null 2>&1 | grep URL
# Read the verification code without echoing it, then pipe it into a second headless run.
vcode = getpass.getpass()
!echo {vcode} | google-drive-ocamlfuse -headless -id={creds.client_id} -secret={creds.client_secret}

In [0]:
# Create the mount point (no error if it already exists) and mount Google Drive on it.
!mkdir -p drive
!google-drive-ocamlfuse drive

In [0]:
# List the working directory to check that `drive` is present.
!ls

In [0]:
# Show the project folder on Drive, then copy the dataset archive to the working directory.
!ls drive/bde
!cp drive/bde/bigdata.zip .

In [0]:
!apt install unzip
!unzip bigdata.zip
!mv -f bigdata train_all
!ls

# Define the CNN model

In [0]:
def preprocess_input(x):
    """Rescale an image array and return it.

    Applies ``x / 255``, then ``- 0.5``, then ``* 2`` with augmented
    assignments, so a mutable array argument is modified in place. For values
    in [0, 255] the result lies in [-1, 1].

    NOTE(review): presumably ``x`` is always a float array (an in-place true
    division of an integer NumPy array fails) -- confirm against callers.
    """
    x /= 255.  # normalization
    x -= 0.5  # consider the black padding
    x *= 2.
    return x

In [0]:
from keras.applications.resnet50 import ResNet50, Flatten
from keras.models import Model
from keras.layers import Dense
from keras.optimizers import Adam, SGD
from keras.preprocessing.image import ImageDataGenerator
from keras.callbacks import ModelCheckpoint, TensorBoard
from keras import backend as K
from keras.utils import multi_gpu_model

def train_resNet(train_data_dir, validate_data_dir, res_dir, model_file_name, weight_file_name, img_width=224, img_height=224):
    """Fine-tune an ImageNet-pretrained ResNet50 on a class-per-folder image set.

    Training runs in two phases:

    1. All ResNet50 layers are frozen and only the new softmax head is
       trained for 2 epochs.
    2. Every layer is unfrozen and the whole network is trained for 50 epochs
       with Adam at a learning rate of 0.0001, checkpointing after each epoch.

    Args:
        train_data_dir: Directory with one sub-folder per class, used for training.
        validate_data_dir: Directory with the same layout, used for validation.
        res_dir: Output directory for the saved files; created if missing.
        model_file_name: File name inside ``res_dir`` for the full model, saved
            after every fine-tuning epoch and once more when training ends.
        weight_file_name: File name inside ``res_dir`` for the checkpoint that
            is only overwritten when ``val_acc`` improves.
        img_width: First element of the generators' ``target_size`` and of the
            spatial part of the model input shape.
        img_height: Second element of the same tuples.

    NOTE(review): Keras documents ``target_size`` as ``(height, width)``, so
    for non-square sizes the two names are effectively swapped. The order is
    kept as-is because do_predict uses the same order; it makes no difference
    for the square 224x224 default.
    """
    import os  # local import: the notebook never imports `os` at cell level

    if not os.path.exists(res_dir):
        os.makedirs(res_dir)

    batch_size = 40
    # Single source of truth for the label order; the softmax head is sized from it.
    class_names = ['Beach', 'City', 'Forest', 'Mountain', 'Village']
    target_size = (img_width, img_height)

    # 2. augmentation (may try more)
    train_datagen = ImageDataGenerator(
        preprocessing_function=preprocess_input,
        featurewise_center=False,  # set input mean to 0 over the dataset
        samplewise_center=False,  # set each sample mean to 0
        featurewise_std_normalization=False,  # divide inputs by std of the dataset
        channel_shift_range=100,
        rotation_range=10,  # randomly rotate images in the range (degrees, 0 to 180)
        width_shift_range=0.05,  # randomly shift images horizontally (fraction of total width)
        height_shift_range=0.05,  # randomly shift images vertically (fraction of total height)
        horizontal_flip=True,  # randomly flip images
        vertical_flip=False,
        shear_range=0.05,
        zoom_range=0.05,
        fill_mode='nearest')

    # Validation images only get the same pixel rescaling, no augmentation.
    validate_datagen = ImageDataGenerator(
        preprocessing_function=preprocess_input
    )

    train_generator = train_datagen.flow_from_directory(
        train_data_dir,
        target_size=target_size,
        batch_size=batch_size,
        classes=class_names,
        class_mode='categorical')

    validate_generator = validate_datagen.flow_from_directory(
        validate_data_dir,
        target_size=target_size,
        classes=class_names,
        batch_size=batch_size,
        class_mode='categorical')

    # Integer division yields 0 steps when a set holds fewer images than one
    # batch; always run at least one step so training/validation actually happen.
    train_steps = max(1, train_generator.n // batch_size)
    validate_steps = max(1, validate_generator.n // batch_size)

    # 3. model structure: pre-trained convolutional base + custom classifier head
    if K.image_data_format() == 'channels_first':
        the_input_shape = (3, img_width, img_height)
    else:
        the_input_shape = (img_width, img_height, 3)
    # include_top=False drops ResNet50's own final fully connected layer.
    base_model = ResNet50(weights='imagenet', include_top=False, input_shape=the_input_shape)

    x = base_model.output
    x = Flatten(input_shape=base_model.output_shape[1:])(x)
    predictions = Dense(len(class_names), activation='softmax', name='fc05')(x)

    # First phase: freeze every ResNet50 layer so only the randomly
    # initialised head is trained.
    for layer in base_model.layers:
        layer.trainable = False

    # this is the final model we will train
    model = Model(inputs=base_model.input, outputs=predictions)

    # 4. compile the model (must happen *after* setting layers to non-trainable)
    model.compile(optimizer='Adam', loss='categorical_crossentropy', metrics=['accuracy'])

    # 5. train the head on the new data for a few epochs
    model.fit_generator(
        train_generator,
        steps_per_epoch=train_steps,
        epochs=2,
        validation_data=validate_generator,
        validation_steps=validate_steps)

    # 6. fine tune: layers before this index stay frozen, the rest are trained.
    frozen_layer_count = 0  # currently 0 frozen layers with a low learning rate works best.
    for layer in model.layers[:frozen_layer_count]:
        layer.trainable = False
    for layer in model.layers[frozen_layer_count:]:
        layer.trainable = True

    # Recompile so the new trainable flags take effect; Adam with a small learning rate.
    model.compile(optimizer=Adam(lr=0.0001), loss='categorical_crossentropy', metrics=['accuracy'])

    # Checkpoint 1 keeps only the best epoch by val_acc; checkpoint 2 is
    # overwritten after every epoch.
    check_pointer1 = ModelCheckpoint(monitor='val_acc', filepath=os.path.join(res_dir, weight_file_name),
                                     verbose=1, save_best_only=True, mode='auto', period=1)
    check_pointer2 = ModelCheckpoint(monitor='val_acc', filepath=os.path.join(res_dir, model_file_name),
                                     verbose=1, save_best_only=False, save_weights_only=False, mode='auto',
                                     period=1)

    tensorboard_callback = TensorBoard(log_dir='/content/logs', histogram_freq=0, write_graph=True, write_images=True)

    model.fit_generator(
        train_generator,
        steps_per_epoch=train_steps,
        epochs=50,
        validation_data=validate_generator,
        validation_steps=validate_steps,
        callbacks=[check_pointer1, check_pointer2, tensorboard_callback])

    model.save(os.path.join(res_dir, model_file_name))



# Split data into train / validate set

In [0]:
import numpy as np
import shutil

def split_data(train_data_dir, test_data_dir, test_precentage):
    """Randomly re-split a class-per-folder dataset into train and test parts.

    Every visible entry under ``test_data_dir/<class>/`` is first moved back
    to ``train_data_dir/<class>/`` (undoing any earlier split). Then each
    visible entry under ``train_data_dir/<class>/`` is moved to
    ``test_data_dir/<class>/`` with probability ``test_precentage``.

    Names starting with '.' are ignored at both levels, and top-level entries
    that are not directories are skipped instead of raising.

    Args:
        train_data_dir: Root folder holding one sub-folder per class.
        test_data_dir: Root folder receiving the held-out files; created
            (together with the per-class sub-folders) if missing.
        test_precentage: Probability in [0, 1] that a file is moved to the
            test set; 0 moves nothing and 1 moves everything.
    """
    import os  # local import: the notebook never imports `os` at cell level

    def _class_dirs(root):
        # Visible sub-directories of `root`; stray files are not class folders.
        return [name for name in os.listdir(root)
                if not name.startswith('.') and os.path.isdir(os.path.join(root, name))]

    def _visible_entries(folder):
        return [name for name in os.listdir(folder) if not name.startswith('.')]

    def _ensure_dir(path):
        if not os.path.exists(path):
            os.makedirs(path)

    _ensure_dir(test_data_dir)

    # put all test data back to train data folder
    for class_name in _class_dirs(test_data_dir):
        src_dir = os.path.join(test_data_dir, class_name)
        dst_dir = os.path.join(train_data_dir, class_name)
        # The class folder may exist only on the test side; create it so the move succeeds.
        _ensure_dir(dst_dir)
        for file_name in _visible_entries(src_dir):
            shutil.move(os.path.join(src_dir, file_name), os.path.join(dst_dir, file_name))

    # redo split train / test
    for class_name in _class_dirs(train_data_dir):
        src_dir = os.path.join(train_data_dir, class_name)
        dst_dir = os.path.join(test_data_dir, class_name)
        _ensure_dir(dst_dir)
        for file_name in _visible_entries(src_dir):
            # One uniform draw in [0, 1) per file.
            if np.random.rand() < test_precentage:
                shutil.move(os.path.join(src_dir, file_name), os.path.join(dst_dir, file_name))


In [0]:
# Move each image from train_all/ to validate_all/ with probability 0.2.
split_data('train_all', 'validate_all', 0.2)

# Start TensorBoard

In [0]:
# Download and unpack the ngrok binary that the tunnel cell runs as ./ngrok.
! wget https://bin.equinox.io/c/4VmDzA7iaHb/ngrok-stable-linux-amd64.zip
! unzip ngrok-stable-linux-amd64.zip

In [0]:
# Print the current working directory.
!pwd

In [0]:
LOG_DIR = '/content/logs'
get_ipython().system_raw(
    'tensorboard --logdir {} --host 0.0.0.0 --port 6006 &'
    .format(LOG_DIR)
)

get_ipython().system_raw('./ngrok http 6006 &')

! export PYTHONIOENCODING=utf8
! curl -s http://localhost:4040/api/tunnels | python -c \
    "import sys, json; print(json.load(sys.stdin)['tunnels'][0]['public_url'])"

# Train the model

In [0]:
import keras
print keras.__version__

# For big image.
from PIL import ImageFile
ImageFile.LOAD_TRUNCATED_IMAGES = True

train_resNet('train_all', 'validate_all', 'resNet', 'resnet_final.h5', 'tmp_weights_resnet_final.h5', 224, 224)

# Store the model into google drive
!cp -rf resNet drive/bde

2.1.4
Found 1332 images belonging to 5 classes.
Found 312 images belonging to 5 classes.
Epoch 1/2
Epoch 2/2
Epoch 1/50

Epoch 00001: val_acc improved from -inf to 0.92857, saving model to resNet/tmp_weights_resnet_final.h5

Epoch 00001: saving model to resNet/resnet_final.h5
Epoch 2/50

Epoch 00002: val_acc improved from 0.92857 to 0.97426, saving model to resNet/tmp_weights_resnet_final.h5

Epoch 00002: saving model to resNet/resnet_final.h5
Epoch 3/50

Epoch 00003: val_acc improved from 0.97426 to 0.97426, saving model to resNet/tmp_weights_resnet_final.h5

Epoch 00003: saving model to resNet/resnet_final.h5
Epoch 4/50
 2/33 [>.............................] - ETA: 38s - loss: 0.0261 - acc: 0.9875


Epoch 00004: val_acc did not improve

Epoch 00004: saving model to resNet/resnet_final.h5
Epoch 5/50

Epoch 00005: val_acc improved from 0.97426 to 0.97426, saving model to resNet/tmp_weights_resnet_final.h5

Epoch 00005: saving model to resNet/resnet_final.h5
Epoch 6/50

Epoch 00006: val_acc did not improve

Epoch 00006: saving model to resNet/resnet_final.h5
Epoch 7/50

Epoch 00007: val_acc did not improve

Epoch 00007: saving model to resNet/resnet_final.h5
Epoch 8/50

Epoch 00008: val_acc improved from 0.97426 to 0.99265, saving model to resNet/tmp_weights_resnet_final.h5



Epoch 00008: saving model to resNet/resnet_final.h5
Epoch 9/50

Epoch 00009: val_acc did not improve

Epoch 00009: saving model to resNet/resnet_final.h5
Epoch 10/50

Epoch 00010: val_acc did not improve

Epoch 00010: saving model to resNet/resnet_final.h5
Epoch 11/50

Epoch 00011: val_acc did not improve

Epoch 00011: saving model to resNet/resnet_final.h5
Epoch 12/50

In [0]:
# List the contents of the logs directory.
!ls logs

# Do prediction

In [0]:
!ls drive/bde
!cp drive/bde/pictures.zip .

!apt install unzip
!unzip pictures.zip
!ls

In [0]:
import numpy as np
import pandas as pd
from keras.models import load_model
from keras.preprocessing import image

# Pillow reads this flag from PIL.ImageFile. Assigning it on
# keras.preprocessing.image only creates an unused module attribute, so
# truncated files would still fail to load.
from PIL import ImageFile
ImageFile.LOAD_TRUNCATED_IMAGES = True

def do_predict(model_h5, weights_h5, img_width, img_height, input_folder, output_file):
    """Classify every image in a folder and write the predictions to a CSV file.

    Args:
        model_h5: Path of the saved Keras model passed to ``load_model``.
        weights_h5: Path of weights to load on top of the model, or ``None``
            to keep the weights stored in ``model_h5``.
        img_width: First element of the ``target_size`` used to load images.
        img_height: Second element of the ``target_size``.
        input_folder: Folder whose entries (except names starting with '.')
            are loaded as images.
        output_file: CSV path; one row per successfully predicted image with
            columns ``id`` (file name) and ``category`` (argmax class index).

    Images that cannot be loaded or predicted are reported on stdout and
    left out of the CSV; they do not abort the run.
    """
    import os  # local import: the notebook never imports `os` at cell level

    model = load_model(model_h5)
    if weights_h5 is not None:
        model.load_weights(weights_h5)

    rows = []
    # Sorted so the CSV row order does not depend on directory listing order.
    for file_name in sorted(os.listdir(input_folder)):
        if file_name.startswith('.'):
            continue
        try:
            img = image.load_img(os.path.join(input_folder, file_name),
                                 target_size=(img_width, img_height))
            # Add a leading batch axis: the model predicts on batches.
            batch = np.expand_dims(image.img_to_array(img), axis=0)
            y_prob = model.predict(preprocess_input(batch))
        except Exception as e:
            # Best effort: skip the bad file but say why it was skipped.
            print('Cannot predict image: ' + file_name + ' (' + str(e) + ')')
            continue
        rows.append([str(file_name), str(y_prob.argmax(axis=-1)[0])])

    df = pd.DataFrame(rows, columns=['id', 'category'])
    df.to_csv(output_file, index=False, header=True)
    print('Done')
    
# Load the final model, overlay the best-val_acc checkpoint, and classify everything in pictures/.
do_predict('resNet/resnet_final.h5', 'resNet/tmp_weights_resnet_final.h5', 224, 224, 'pictures', 'resNet/output_result.csv')

# Reformat output accordingly

In [0]:
import pandas as pd
# Reload the per-image predictions written by do_predict (columns: id, category).
df = pd.read_csv('resNet/output_result.csv')
df

In [0]:
# Class index -> label, in the same order as the `classes` list used in train_resNet.
cat = {
       0: 'Beach', 
       1: 'City',
       2: 'Forest',
       3: 'Mountain',
       4: 'Village'
      }

# The second underscore-separated token of the file name is used as the city.
df['city'] = df['id'].apply(lambda jpgname: jpgname.split('_')[1])
# Translate the numeric category into its label (None for an unknown index).
df['type'] = df['category'].apply(lambda c: cat.get(c))

In [0]:
# Number of images per (city, type) pair.
aggdata = df.groupby(['city', 'type']).agg({'type':['count']})

In [0]:
# Divide each (city, type) count by the per-city total (sum over index level 0).
pecentData = aggdata / aggdata.groupby(level=0).sum()

In [0]:
# Build one row per city with every type column initialised to 0.
rows = []
column_names = ['Destination', 'Mountain', 'Beach', 'Forest', 'City', 'Village']
for city in pecentData.index.levels[0]:  
  row = [city,0,0,0,0,0]
  rows.append(row)

# Note: this rebinds `df`, replacing the per-image predictions table.
df = pd.DataFrame(rows, columns=column_names)
df = df.set_index('Destination')

In [0]:
# index is a (city, type) pair; write that pair's share into the matching cell as a string.
for index, row in pecentData.iterrows():
  df.loc[index[0],index[1]] = str(row[0])
  
df

In [0]:
# Write the per-destination table, keeping the Destination index as the first column.
df.to_csv('geofile_shutao.csv', index=True, header=True)

In [0]:
# Copy the result file to Google Drive and list the folder to confirm it arrived.
!cp -rf geofile_shutao.csv drive/bde
!ls drive/bde