In [6]:
import os
import shutil
import numpy as np
import pickle
import random
from imutils import paths

from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import f1_score
from keras.applications import VGG16
from keras.applications import imagenet_utils
from keras.preprocessing.image import img_to_array
from keras.preprocessing.image import load_img
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

### Config

In [2]:
# directories that are scanned for the images of each data split
TRAIN_DATA_PATH = '../train_data'
TEST_DATA_PATH = '../train_test_data'

# class names: the strings '1' through '5'
CLASSES = [str(class_id) for class_id in range(1, 6)]

# number of images pushed through the network per forward pass
BATCH_SIZE = 32

### Extract features using pre-trained CNNs

In [3]:
# the label encoder starts out unset; it is created and fitted on the
# labels of the first data split that gets processed
le = None

# VGG16 with ImageNet weights and without its top (classifier) layers,
# used purely as a feature extractor
model = VGG16(weights='imagenet', include_top=False)

In [4]:
# (split name, image directory) pairs; the features of each split are
# written to '<split name>.csv' in the current working directory
splits = [
    ('train', TRAIN_DATA_PATH),
    ('test', TEST_DATA_PATH),
]

# loop over the data splits
for split_name, split_path in splits:
    print(f'Processing {split_name}')

    # grab all image paths in the current split
    image_paths = list(paths.list_images(split_path))

    # randomly shuffle the image paths, then take the class label of each
    # image from the name of its parent directory
    # NOTE(review): no random seed is set, so the row order of the CSV
    # differs between runs -- confirm this is acceptable
    random.shuffle(image_paths)
    labels = [p.split(os.path.sep)[-2] for p in image_paths]

    # the encoder is fitted once, on the first split processed ('train');
    # later splits reuse the same label -> integer mapping
    if le is None:
        le = LabelEncoder()
        le.fit(labels)

    # loop-invariant, so computed once instead of on every iteration
    num_batches = int(np.ceil(len(image_paths) / float(BATCH_SIZE)))

    # the context manager guarantees the CSV is flushed and closed even if
    # loading an image or running the network raises part-way through
    csv_path = f'{split_name}.csv'
    with open(csv_path, 'w') as csv_file:
        # loop over the images in batches
        for (batch_idx, start) in enumerate(range(0, len(image_paths), BATCH_SIZE)):
            print('Processing batch {}/{}'.format(batch_idx + 1, num_batches))
            batch_paths = image_paths[start:start + BATCH_SIZE]
            batch_labels = le.transform(labels[start:start + BATCH_SIZE])
            batch_images = []

            # loop over the images in the current batch
            for image_path in batch_paths:
                # load the input image using the Keras helper utility
                # while ensuring the image is resized to 224x224 pixels
                image = load_img(image_path, target_size=(224, 224))
                image = img_to_array(image)

                # preprocess the image by (1) adding a leading batch axis and
                # (2) applying the ImageNet preprocessing (mean subtraction)
                image = np.expand_dims(image, axis=0)
                image = imagenet_utils.preprocess_input(image)

                # add the image to the batch
                batch_images.append(image)

            # pass the images through the network and use the outputs as
            # our actual features, one flattened vector per image
            batch_images = np.vstack(batch_images)
            features = model.predict(batch_images, batch_size=BATCH_SIZE)
            # -1 flattens whatever volume the network emits per image
            # (7 * 7 * 512 values for 224x224 inputs)
            features = features.reshape((features.shape[0], -1))

            # one CSV row per image: encoded class label, then the features
            for (label, vec) in zip(batch_labels, features):
                row = ','.join(str(v) for v in vec)
                csv_file.write('{},{}\n'.format(label, row))

# serialize the label encoder to disk
with open('label_encoder.pickle', 'wb') as f:
    pickle.dump(le, f)

Processing train
Processing batch 1/176
Processing batch 2/176
Processing batch 3/176
Processing batch 4/176
Processing batch 5/176
Processing batch 6/176
Processing batch 7/176
Processing batch 8/176
Processing batch 9/176
Processing batch 10/176
Processing batch 11/176
Processing batch 12/176
Processing batch 13/176
Processing batch 14/176
Processing batch 15/176
Processing batch 16/176
Processing batch 17/176
Processing batch 18/176
Processing batch 19/176
Processing batch 20/176
Processing batch 21/176
Processing batch 22/176
Processing batch 23/176
Processing batch 24/176
Processing batch 25/176
Processing batch 26/176
Processing batch 27/176
Processing batch 28/176
Processing batch 29/176
Processing batch 30/176
Processing batch 31/176
Processing batch 32/176
Processing batch 33/176
Processing batch 34/176
Processing batch 35/176
Processing batch 36/176
Processing batch 37/176
Processing batch 38/176
Processing batch 39/176
Processing batch 40/176
Processing batch 41/176
Processi

### Train an ML model

In [5]:
def load_data_split(splitPath):
    """Load one data split from a CSV file of extracted features.

    Each non-blank line must have the form ``label,f1,f2,...,fn``: the first
    field is the class label and the remaining fields are the feature values.

    Args:
        splitPath: path to the CSV file to read.

    Returns:
        A tuple ``(data, labels)`` of NumPy arrays. ``data`` holds one row of
        float features per CSV line; ``labels`` holds the label of each line
        as a string, exactly as it appears in the file.

    Raises:
        OSError: if the file cannot be opened.
        ValueError: if a feature field cannot be parsed as a float.
    """
    # initialize the data and labels
    data = []
    labels = []

    # the context manager closes the file even if a row fails to parse
    with open(splitPath) as split_file:
        for line in split_file:
            line = line.strip()

            # skip blank lines (e.g. a stray trailing newline); they would
            # otherwise yield an empty label and an empty feature vector
            if not line:
                continue

            # first field is the class label, the rest are the features
            label, *values = line.split(',')
            data.append(np.array(values, dtype='float'))
            labels.append(label)

    # return a tuple of the data and labels as NumPy arrays
    return (np.array(data), np.array(labels))

# paths to the training and testing CSV files of extracted features
training_path = 'train.csv'
testing_path = 'test.csv'

# load the data from disk
print('[INFO] loading data...')
(trainX, trainY) = load_data_split(training_path)
(testX, testY) = load_data_split(testing_path)

# load the label encoder from disk; the context manager closes the file
# NOTE: unpickling executes arbitrary code -- only load a pickle file that
# this notebook itself produced
with open('label_encoder.pickle', 'rb') as f:
    le = pickle.load(f)

# train the model
# NOTE(review): this rebinds `model`, which the feature-extraction cell uses
# for the VGG16 network; re-run the model-loading cell before extracting
# features again
print('[INFO] training model...')
model = LogisticRegression(solver='lbfgs', multi_class='auto')
model.fit(trainX, trainY)

# evaluate the model; the report rows are named after the encoder's classes
print('[INFO] evaluating...')
preds = model.predict(testX)
print(classification_report(testY, preds, target_names=le.classes_))

# serialize the model to disk
print('[INFO] saving model...')
with open('lr_model.pickle', 'wb') as f:
    pickle.dump(model, f)

[INFO] loading data...
[INFO] training model...
[INFO] evaluating...
              precision    recall  f1-score   support

           1       0.83      0.86      0.85       212
           2       0.96      0.97      0.96       116
           3       0.96      0.93      0.94        91
           4       0.92      0.93      0.92        83
           5       0.77      0.71      0.74       121

   micro avg       0.87      0.87      0.87       623
   macro avg       0.88      0.88      0.88       623
weighted avg       0.87      0.87      0.87       623

[INFO] saving model...




In [7]:
# weighted-average F1 score of the predictions on the test split
f1 = f1_score(y_true=testY, y_pred=preds, average='weighted')
print(f1)

0.8708200211344395
