In [None]:
import shutil
from subprocess import call

import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm
import requests, glob, Augmentor, os, json
from zipfile import ZipFile

from keras.models import Model, load_model
from keras.layers import Dense, Flatten
from keras.preprocessing import image
from keras.utils import to_categorical
from keras.applications.vgg19 import VGG19, preprocess_input
from keras import optimizers
from keras import backend as K

from keras.callbacks import TensorBoard
from keras.callbacks import TerminateOnNaN
from keras.callbacks import ModelCheckpoint
from keras.callbacks import EarlyStopping
from keras.callbacks import CSVLogger
from keras.callbacks import History

from sklearn.model_selection import StratifiedKFold
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelBinarizer

In [None]:
def download_file(url, out_file):
    """Stream `url` to `out_file`, showing a tqdm progress bar.

    The data is first written to `out_file + '.part'` and only renamed to
    `out_file` once the whole body has been received, so an interrupted
    download never leaves a truncated file under the final name.

    Args:
        url: address to download.
        out_file: destination path on disk.

    Raises:
        requests.HTTPError: if the server answers with an error status.
    """
    chunk_size = 1024
    part_file = out_file + '.part'
    r = requests.get(url, stream=True)
    try:
        # Fail loudly instead of saving an HTTP error page as the archive.
        r.raise_for_status()
        # The header is optional (e.g. chunked responses); without it the
        # progress bar simply has no total.
        content_length = r.headers.get('content-length')
        total_chunks = None
        if content_length is not None:
            total_chunks = -(-int(content_length) // chunk_size)  # ceil division
        with open(part_file, 'wb') as f:
            for data in tqdm(iterable=r.iter_content(chunk_size=chunk_size),
                             total=total_chunks, unit='KB'):
                f.write(data)
    finally:
        r.close()
    os.replace(part_file, out_file)
    print('{} download Complete!'.format(out_file))

def extract_zipfile(data_path, zip_file):
    """Unpack `zip_file` into `data_path` unless that directory already exists.

    Args:
        data_path: target directory; its existence is treated as "already
            extracted" and makes the call a no-op.
        zip_file: path of the archive to unpack.
    """
    if os.path.isdir(data_path):
        return
    with ZipFile(zip_file, 'r') as archive:
        print('Extracting all the files now ...')
        archive.extractall(data_path)
        print('Done!')

def explore_data(data_path, labels_list):
    """Count the PNG images of every label and preview one image per label.

    Args:
        data_path: directory containing one sub-directory per label.
        labels_list: names of the label sub-directories to scan.

    Returns:
        (images_count, X, y): images_count[i] is the number of PNG files found
        for labels_list[i]; X is the flat list of image paths and y holds the
        label name of each path, in the same order as X.
    """
    images_count = []
    X = []
    y = []
    # Four previews per row. Keep at least the original 3 rows and add more
    # when there are more than 12 labels, so subplot never runs out of cells.
    n_cols = 4
    n_rows = max(3, -(-len(labels_list) // n_cols))
    plt.figure(1)
    print('No of images in:')
    for idx, label in enumerate(labels_list):
        label_path = os.path.join(data_path, label)
        # glob returns files in arbitrary order; sort so runs are repeatable.
        images_list = sorted(glob.glob(os.path.join(label_path, '*.png')))
        num_images = len(images_list)
        images_count.append(num_images)
        X += images_list
        y += [label] * num_images

        plt.subplot(n_rows, n_cols, idx+1)
        # A label without any PNG keeps an empty, titled panel instead of
        # raising IndexError.
        if images_list:
            plt.imshow(plt.imread(images_list[0]))
        plt.title(label)
        plt.axis('off')
        print('{} directory: {}'.format(label, num_images))
    print()
    plt.show()
    return images_count, X, y

def plot_histogram(y, bins=86):
    """Show a histogram of the label values in `y`.

    Args:
        y: sequence of labels, one entry per image.
        bins: number of histogram bins; defaults to the previously
            hard-coded value of 86.
    """
    plt.hist(y, bins=bins)
    plt.xlabel('labels')
    plt.ylabel('no of images')
    plt.show()

def augment_images(path, num, avg_img):
    """Generate augmented PNGs for the images in `path` until it holds `avg_img`.

    Each operation (brightness, colour, contrast, rotation of up to 10
    degrees either way) is applied with probability 0.5. The pipeline's
    output directory is '.', which Augmentor resolves relative to `path`
    (per the Augmentor docs - verify for the installed version).

    Args:
        path: directory with the source images of one class.
        num: number of images currently in `path`.
        avg_img: desired number of images after augmentation.
    """
    missing = avg_img - num
    # Nothing to add. NOTE(review): Augmentor appears to treat sample(0) as
    # "process every image once", so a zero count must not reach it - confirm.
    if missing <= 0:
        return
    p = Augmentor.Pipeline(path, '.', save_format='png')
    p.random_brightness(.5, .25, .75)
    p.random_color(.5, .25, .75)
    p.random_contrast(.5, .25, .75)
    p.rotate(.5, 10, 10)
    p.sample(missing)

def add_dicts(d, e):
    """Merge `e` into `d` in place and return `d`.

    Keys missing from `d` are stored as-is (the value object is shared with
    `e`); keys present in both are combined with `+=`.
    """
    for name, value in e.items():
        if name in d:
            d[name] += value
        else:
            d[name] = value
    return d

def plot_metrics(d):
    """Plot every training metric in `d` together with its 'val_' counterpart.

    Args:
        d: mapping from metric name to a sequence of values. For each key
            without 'val_' in its name, the key 'val_' + name must exist too.
    """
    train_names = [name for name in d if 'val_' not in name]
    n_rows = len(train_names)
    plt.figure(figsize=(15, 25))
    for position, name in enumerate(train_names, start=1):
        val_name = 'val_' + name
        plt.subplot(n_rows, 2, position)
        plt.title(name)
        plt.plot(d[name], label=name)
        plt.plot(d[val_name], label=val_name)
        plt.legend()
    plt.show()

def recall(y_true, y_pred):
    """Recall computed over the given tensors: true positives / actual positives.

    Values are clipped to [0, 1] and rounded before counting; K.epsilon()
    in the denominator avoids division by zero.
    """
    hits = K.round(K.clip(y_true * y_pred, 0, 1))
    actual = K.round(K.clip(y_true, 0, 1))
    return K.sum(hits) / (K.sum(actual) + K.epsilon())

def precision(y_true, y_pred):
    """Precision computed over the given tensors: true positives / predicted positives.

    Values are clipped to [0, 1] and rounded before counting; K.epsilon()
    in the denominator avoids division by zero.
    """
    hits = K.round(K.clip(y_true * y_pred, 0, 1))
    predicted = K.round(K.clip(y_pred, 0, 1))
    return K.sum(hits) / (K.sum(predicted) + K.epsilon())

def f1(y_true, y_pred):
    """Harmonic mean of `precision` and `recall` for the given tensors.

    K.epsilon() is added to the denominator so the result is 0 rather than
    NaN when both precision and recall are 0 (no true positives).
    """
    rec = recall(y_true, y_pred)
    pre = precision(y_true, y_pred)
    return 2*((pre*rec)/(pre+rec+K.epsilon()))

def get_callbacks():
    """Build the list of Keras callbacks used during training.

    Returns:
        [TerminateOnNaN, TensorBoard, ModelCheckpoint, CSVLogger, History],
        in that order.
    """
    return [
        # Terminate on NaN
        TerminateOnNaN(),
        # TensorBoard logs go to ./Graph
        TensorBoard('./Graph', histogram_freq=0,
                    write_graph=True, write_images=True),
        # Model checkpoint: full model (not just weights), with period=5
        ModelCheckpoint('./output_data/weights.{epoch:02d}-{val_loss:.2f}.hdf5',
                        verbose=1, save_weights_only=False,
                        mode='auto', period=5),
        # CSV logger
        CSVLogger('./training.log'),
        # History
        History(),
    ]

def data_generator(X, y, batch_size):
    """Yield (images, one-hot labels) batches forever, cycling over the data.

    Args:
        X: sequence of image file paths.
        y: sequence of labels aligned with X. They are one-hot encoded with a
            LabelBinarizer fitted on this `y` only, so the columns depend on
            the labels present in this split.
        batch_size: number of samples per batch.

    Yields:
        (batch_X, batch_y): batch_X holds the images loaded at 299x299, passed
        through the VGG19 `preprocess_input` and then divided by 255; batch_y
        holds the matching rows of the encoded labels.

    Samples beyond the last full batch are never yielded, except when there
    are fewer than `batch_size` samples in total: then the single partial
    batch is yielded repeatedly.
    """
    encoder = LabelBinarizer()
    y = encoder.fit_transform(y)
    # At least one batch per cycle. With 0 the index below would never wrap
    # and every batch after the first would be empty.
    num_batches = max(1, len(X) // batch_size)
    idx = 0
    while True:
        start = idx * batch_size
        end = start + batch_size
        batch_y = y[start:end, :]
        batch_X = []
        for filename in X[start:end]:
            # target_size is documented as (height, width); channels are not part of it.
            img = image.load_img(filename, target_size=(299, 299))
            batch_X.append(image.img_to_array(img))
        batch_X = np.array(batch_X)
        batch_X = preprocess_input(batch_X)
        # NOTE(review): scaling by 255 after preprocess_input looks like a
        # double normalisation. Kept as-is because saved models were trained
        # with it - confirm before changing.
        batch_X /= 255
        yield batch_X, batch_y
        idx = (idx + 1) % num_batches

def get_model(lr, n_classes=None):
    """Build and compile a VGG19-based classifier with a new dense head.

    All VGG19 layers are frozen; only the added fc1/fc2/predictions layers
    are trainable.

    Args:
        lr: learning rate for the Adam optimizer.
        n_classes: size of the softmax output. Defaults to the notebook-level
            `num_labels`, which is what the original code always used.

    Returns:
        The compiled Keras model named "final_model".
    """
    if n_classes is None:
        # Backward-compatible fallback to the global set in the labels cell.
        n_classes = num_labels

    # Load model
    # include_top=False removes all the layers after the block5 conv stack
    base = VGG19(include_top=False, input_shape=(299, 299, 3))

    # Freeze all layers
    for layer in base.layers:
        layer.trainable = False

    # re-add the removed layers
    x = base.output
    x = Flatten(name="flatten")(x)
    x = Dense(4096, activation="relu", name="fc1")(x)
    x = Dense(4096, activation="relu", name="fc2")(x)
    x = Dense(n_classes, activation="softmax", name="predictions")(x)

    # Redefine the model
    model = Model(inputs=base.input, outputs=x, name="final_model")

    model.summary()

    adam = optimizers.Adam(lr=lr)

    # compile the model; the custom metrics are stored under their function
    # names, which load_model needs in custom_objects later
    model.compile(optimizer=adam, loss='categorical_crossentropy',
                  metrics=['accuracy', precision, recall, f1])

    return model

In [None]:
# Remote archive and the local paths used by the following cells.
data_path = './data'
zip_file = './data.zip'
url = 'https://vision.eng.au.dk/?download=/data/WeedData/NonsegmentedV2.zip'

# Download only when the archive is not on disk yet.
if not os.path.isfile(zip_file):
    download_file(url=url, out_file=zip_file)

In [None]:
ls

In [None]:
# Unpack the archive; a no-op when data_path already exists.
extract_zipfile(data_path=data_path, zip_file=zip_file)

In [None]:
ls

In [None]:
ls data/

In [None]:
# Every sub-directory of data_path is one class. Sort for a stable order and
# skip stray files (os.listdir returns entries in arbitrary order).
labels_list = sorted(
    entry for entry in os.listdir(data_path)
    if os.path.isdir(os.path.join(data_path, entry))
)
num_labels = len(labels_list)

print('Labels:')

for idx, label in enumerate(labels_list):
    print('{}. {}'.format(idx+1, label))

In [None]:
# Per-class image counts and labels of the data set as extracted.
images_count, _, y = explore_data(data_path=data_path, labels_list=labels_list)

In [None]:
# Mean number of images per class, rounded down.
avg_img = sum(images_count) // len(images_count)

print(avg_img)

In [None]:
# Class distribution before augmentation.
plot_histogram(y=y)

In [None]:
# The cells below read from and augment this copy, so the extracted
# originals stay untouched.
if not os.path.isdir('./data_copy'):
    # Portable replacement for `cp -a`; symlinks=True copies links as links,
    # and a failed copy raises instead of being silently ignored.
    shutil.copytree(data_path, './data_copy', symlinks=True)

data_path = './data_copy'

In [None]:
ls

In [None]:
# Top up every class that has fewer than avg_img files. The per-class check
# alone decides what to augment; the former outer guard on the hard-coded
# 'Black-grass' folder skipped all classes whenever that one class was large,
# and failed if the folder was missing.
for label in labels_list:
    label_path = os.path.join(data_path, label)
    images_list = os.listdir(label_path)
    if len(images_list) < avg_img:
        augment_images(label_path, len(images_list), avg_img)

In [None]:
# Re-scan after augmentation; X and y now describe the working copy.
_, X, y = explore_data(data_path=data_path, labels_list=labels_list)

In [None]:
# Class distribution after augmentation.
plot_histogram(y=y)

In [None]:
## Split data

# Seeded so the three splits are the same on every run.
X, y = shuffle(X, y, random_state=0)

# 10% of everything is held out for validation ...
X, X_valid, y, y_valid = train_test_split(X, y, test_size=0.1,
                                          random_state=0, stratify=y)

# ... and 20% of the remainder for testing.
# NOTE(review): X and y are overwritten, so re-running this cell without
# re-running the explore cell shrinks the training set again.
X, X_test, y, y_test = train_test_split(X, y, test_size=0.2,
                                        random_state=0, stratify=y)

print('Train:', len(X), len(y))
print('Valid:', len(X_valid), len(y_valid))
print('Test:', len(X_test), len(y_test))

In [None]:
# Where logs and trained models are written.
output_path = './output_data'
log_file = os.path.join(output_path, 'log.csv')

full_model_path = os.path.join(output_path, 'plant_vgg19.h5')
cross_model_path = os.path.join(output_path, 'plant_vgg19_cross.h5')

# exist_ok avoids the check-then-create race and makes the cell re-runnable.
os.makedirs(output_path, exist_ok=True)

In [None]:
# Training configuration. The trailing comments on batch_size and epochs are
# the values noted next to the current quick-run settings.
lr = 0.001
n_splits = 3
epochs = 1  # 10
batch_size = 1  # 32

In [None]:
# Build and compile the network used by the cross-validation cell.
model = get_model(lr=lr)

In [None]:
## Cross Validation

skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=0)

# Created outside the cache guard: the full-training cell appends to it even
# when the cross-validation checkpoint already exists and this run is skipped.
histories = []

if not os.path.isfile(cross_model_path):

    # Training
    print("Start cross-validation training...")

    temp_X = np.array(X)
    temp_y = np.array(y)

    # NOTE(review): the same model instance keeps training across folds, so
    # later folds validate on samples the model has already been fitted on.
    for train, val in skf.split(temp_X, temp_y):
        train_datagen = data_generator(temp_X[train], temp_y[train], batch_size)
        valid_datagen = data_generator(temp_X[val], temp_y[val], batch_size)

        history = model.fit_generator(train_datagen, steps_per_epoch=len(train)//batch_size,
                                      validation_data=valid_datagen, epochs=epochs,
                                      validation_steps=len(val)//batch_size)
        histories.append(history)

    model.save(cross_model_path)

    del model

In [None]:
## Full Training

batch_size = 1 #32

epochs = 1 #20

# The saved model refers to its custom metrics by the names of the functions
# it was compiled with in get_model: precision, recall and f1.
model = load_model(cross_model_path, custom_objects={'precision': precision,
                                                     'recall': recall,
                                                     'f1': f1})

if not os.path.isfile(full_model_path):

    print("Full training...")

    # `histories` does not exist yet when the cross-validation cell found its
    # checkpoint on disk and skipped training.
    try:
        histories
    except NameError:
        histories = []

    callbacks = get_callbacks()

    train_datagen = data_generator(X, y, batch_size)
    valid_datagen = data_generator(X_valid, y_valid, batch_size)

    history = model.fit_generator(train_datagen, steps_per_epoch=len(X)//batch_size,
                                  epochs=epochs, callbacks=callbacks,
                                  validation_data=valid_datagen,
                                  validation_steps=len(X_valid)//batch_size)

    histories.append(history)

    print("Save whole model...")
    model.save(full_model_path)

In [None]:
## Evaluate

batch_size = 32

# All three custom metrics the model was compiled with must be supplied,
# keyed by their function names.
model = load_model(full_model_path, custom_objects={'precision': precision,
                                                    'recall': recall,
                                                    'f1': f1})

test_datagen = data_generator(X_test, y_test, batch_size)

# NOTE(review): test_datagen is a plain Python generator; confirm that
# workers=12 is safe and useful for it in the installed Keras version.
Eval = model.evaluate_generator(test_datagen, steps = len(X_test)//batch_size, workers=12)
print(Eval)