In [None]:
# Notebook setup: plotting backend, third-party imports and global plot defaults.
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from mpl_toolkits.axes_grid1 import ImageGrid
# Default figure size and font size applied to every matplotlib figure below.
plt.rcParams['figure.figsize'] = [16, 10]
plt.rcParams['font.size'] = 16

import os
from tqdm import tqdm # Fancy progress bars

import seaborn as sns
from keras.preprocessing import image
from keras.applications import xception
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix
# Show the entries available under the relative input directory.
print(os.listdir("../input"))

In [None]:
# Make IPython display the value of every top-level expression in a cell, not
# only the last one. Later cells rely on this when they end with several bare
# expressions (for example `train.head(5)` followed by `train.shape`).
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'

Loading up Keras Models

In [None]:
# List the contents of the pretrained-models input directory.
!ls ../input/keras-pretrained-models/

In [None]:
# Make sure the Keras cache directory (~/.keras) and its `models` sub-directory
# exist. `exist_ok=True` replaces the separate exists-then-create checks, so the
# cell is safe to re-run and cannot fail if the directory appears in between.
cache_dir = os.path.expanduser(os.path.join('~', '.keras')) # Cache
models_dir = os.path.join(cache_dir, 'models')
# makedirs also creates the parent `cache_dir` when it is missing.
os.makedirs(models_dir, exist_ok=True)

Copy a selection of the pretrained model files into the Keras cache directory so that Keras can access them.

In [None]:
# Copy every file whose name starts with `xception` from the pretrained-models
# input directory into the ~/.keras/models directory created in the cell above.
!cp ../input/keras-pretrained-models/xception* ~/.keras/models

In [None]:
# List the cache directory to confirm which files it now contains.
!ls ~/.keras/models

In [None]:
# List the contents of the plant-seedlings dataset directory.
!ls ../input/plant-seedlings-classification

Preparing the Dataset for the Model

In [None]:
# Class names used as labels. The position of a name in this list is the
# integer class id used throughout the notebook.
CATEGORIES = [
    'Black-grass',
    'Charlock',
    'Cleavers',
    'Common Chickweed',
    'Common wheat',
    'Fat Hen',
    'Loose Silky-bent',
    'Maize',
    'Scentless Mayweed',
    'Shepherds Purse',
    'Small-flowered Cranesbill',
    'Sugar beet',
]
NUM_CATEGORIES = len(CATEGORIES)

In [None]:
# Tunable settings: per-class cap on training images and the random seed.
SAMPLE_PER_CATEGORY = 200
SEED = 7

# Dataset locations, all relative to the dataset root directory.
data_dir = '../input/plant-seedlings-classification/'
train_dir, test_dir = (os.path.join(data_dir, split) for split in ('train', 'test'))
sample_submission = pd.read_csv(os.path.join(data_dir, 'sample_submission.csv'))

for split_dir in (train_dir, test_dir):
    print(split_dir)

In [None]:
# Preview the first ten rows of the sample submission file loaded above.
sample_submission.head(10)

In [None]:
# The training images are stored in one sub-folder per plant species, so the
# number of entries in a species folder is that species' training-image count.
for category in CATEGORIES:
    category_dir = os.path.join(train_dir, category)
    n_images = len(os.listdir(category_dir))
    print('{} {} images'.format(category, n_images))

**Creating Our Aggregate Training Sample for the CNN:**

In [None]:
# Walk the per-species training folders and collect one record per image:
#   file        - image path relative to `data_dir` ('train/<category>/<name>')
#   category_id - integer index of the species in CATEGORIES
#   category    - species name
# The records are then turned into a single pandas DataFrame.
train_records = []
for category_id, category in enumerate(CATEGORIES):
    # os.listdir returns entries in arbitrary order. Sorting keeps the row
    # order, and therefore any later "first N rows per category" selection,
    # identical from run to run.
    for file in sorted(os.listdir(os.path.join(train_dir, category))):
        train_records.append(['train/{}/{}'.format(category, file), category_id, category])
train = pd.DataFrame(train_records, columns=['file', 'category_id', 'category'])
train.head(5)  # Preview of the training DataFrame
train.shape  # Expected: (number of images, 3)

**Creating Our Training Set**

In [None]:
# Keep at most SAMPLE_PER_CATEGORY images per species, then shuffle the rows.
# Within each species the rows are ordered by file name before slicing, so the
# chosen images do not depend on the incoming row order. That keeps the result
# reproducible and lets the cell be re-run without changing it.
train = pd.concat([
    train[train['category'] == c].sort_values('file')[:SAMPLE_PER_CATEGORY]
    for c in CATEGORIES
])
# Seed the shuffle: the positional train/validation masks built later are
# applied to this row order, so an unseeded shuffle would change the split on
# every run.
train = train.sample(frac=1, random_state=SEED).reset_index(drop=True)
train.head(5)
train.shape

**Creating Our Test Set**

In [None]:
# One row per file in the test directory: `filepath` is the path relative to
# `data_dir`, `file` is the bare file name.
test = pd.DataFrame(
    [['test/{}'.format(name), name] for name in os.listdir(test_dir)],
    columns=['filepath', 'file'],
)
test.head(5)
test.shape

**Reading an Image to an Array**

In [None]:
def read_img(filepath, size):
    """Load one dataset image and return it as an array.

    Args:
        filepath: Path of the image relative to the module-level ``data_dir``.
        size: Value forwarded as ``target_size`` to ``image.load_img``.

    Returns:
        The result of ``image.img_to_array`` applied to the loaded image.
    """
    full_path = os.path.join(data_dir, filepath)
    return image.img_to_array(image.load_img(full_path, target_size=size))

**Loading and Visualizing Sample Images (Training Examples)**

In [None]:
# Show up to NUM_CATEGORIES sample images for each species in a square grid.
fig = plt.figure(1, figsize=(NUM_CATEGORIES, NUM_CATEGORIES))
grid = ImageGrid(fig, 111, nrows_ncols=(NUM_CATEGORIES, NUM_CATEGORIES), axes_pad=0.05)

# Collect the image paths species by species, so consecutive grid cells hold
# images of the same species.
sample_paths = [
    path
    for category in CATEGORIES
    for path in train.loc[train['category'] == category, 'file'].values[:NUM_CATEGORIES]
]

for cell, path in enumerate(sample_paths):
    ax = grid[cell]
    img = read_img(path, (224, 224))
    ax.imshow(img / 255.)
    ax.axis('off')
    # In the last column, write the species name (the second component of the
    # relative path) to the right of the image.
    if (cell + 1) % NUM_CATEGORIES == 0:
        ax.text(250, 112, path.split('/')[1], verticalalignment='center')
plt.show();

**Train-Validation Split**

In [None]:
# Random train/validation split: draw one uniform number per row. Rows whose
# draw is below 0.8 (about 80% of them) go to training, the rest to validation.
np.random.seed(seed=SEED)
draws = np.random.random(len(train))
train_idx = draws < 0.8
valid_idx = ~train_idx

# Integer class labels for each side of the split.
labels = train['category_id'].values
ytr = labels[train_idx]
yv = labels[valid_idx]
len(ytr)
len(yv)

**Run Examples through the Pre-trained Xception Model to Extract Xception Features / Representations:  
(pre-classification step)**

In [None]:
# Specify parameters:
INPUT_SIZE = 299
POOLING = 'avg'
x_train = np.zeros((len(train), INPUT_SIZE, INPUT_SIZE, 3), dtype='float32')
for i, file in tqdm(enumerate(train['file'])): # tqdm is a progress bar
    img = read_img(file, (INPUT_SIZE, INPUT_SIZE)) # Read the filepath into an array via our function call
    x = xception.preprocess_input(np.expand_dims(img.copy(), axis=0)) # Pre-process that into a format for Xception model
    x_train[i] = x # Set the i-th example in our initialized zero-4D-array to the particular example
print('Train Images shape: {} size: {:,}'.format(x_train.shape, x_train.size))

In [None]:
# Apply the boolean split masks to the preprocessed image array.
Xtr, Xv = x_train[train_idx], x_train[valid_idx]
print((Xtr.shape, Xv.shape, ytr.shape, yv.shape))

In [None]:
# Build Xception with ImageNet weights and without its classification head.
# With `pooling=POOLING` the model output is the pooled feature representation.
xception_bottleneck = xception.Xception(
    weights='imagenet',
    include_top=False,
    pooling=POOLING,
)

# Run both splits through the network with the same prediction settings.
predict_options = dict(batch_size=32, verbose=1)
train_x_bf = xception_bottleneck.predict(Xtr, **predict_options)
valid_x_bf = xception_bottleneck.predict(Xv, **predict_options)

# Check the output dimensions.
print("Xception train bottleneck-features shape: {} size: {:,}".format(train_x_bf.shape, train_x_bf.size))
print("Xception valid bottleneck-features shape: {} size: {:,}".format(valid_x_bf.shape, valid_x_bf.size))

**Logistic Regression Classification Using the Xception Bottleneck Features:**

In [None]:
# Fit a multinomial logistic-regression classifier on the training bottleneck
# features (`fit` returns the fitted estimator itself).
logreg = LogisticRegression(
    multi_class='multinomial',
    solver='lbfgs',
    random_state=SEED,
).fit(train_x_bf, ytr)

# Validation-set outputs: per-class probabilities and the predicted class ids.
valid_probs = logreg.predict_proba(valid_x_bf)
valid_preds = logreg.predict(valid_x_bf)

In [None]:
# Fraction of validation images whose predicted class matches the true class.
valid_accuracy = accuracy_score(yv, valid_preds)
print("Validation Xception Accuracy: {}".format(valid_accuracy))

Illustrating the Results: Confusion Matrix

In [None]:
# Confusion matrix of true vs. predicted class on the validation set.
# Passing `labels` explicitly fixes the matrix at NUM_CATEGORIES x NUM_CATEGORIES
# with rows/columns in CATEGORIES order, even if a class is absent from both
# `yv` and `valid_preds`. Otherwise the matrix would shrink and no longer line
# up with the abbreviations used as tick labels.
cnf_matrix = confusion_matrix(yv, valid_preds, labels=list(range(NUM_CATEGORIES)))
# Short axis labels, in the same order as CATEGORIES.
abbreviation = ['BG', 'Ch', 'Cl', 'CC', 'CW', 'FH', 'LSB', 'M', 'SM', 'SP', 'SFC', 'SB']
pd.DataFrame({'class': CATEGORIES, 'abbreviation': abbreviation})

In [None]:
# Plot the confusion matrix as an annotated heatmap and save it to disk.
fig, ax = plt.subplots(1)
# fmt='d' prints the integer counts exactly. The default annotation format
# ('.2g') would show counts of 100 or more in scientific notation (e.g. 1e+02).
ax = sns.heatmap(cnf_matrix, ax=ax, cmap=plt.cm.Greens, annot=True, fmt='d')
ax.set_xticklabels(abbreviation)
ax.set_yticklabels(abbreviation)
ax.set_title('Confusion Matrix')
ax.set_ylabel('True Class')
ax.set_xlabel('Predicted Class')
fig.savefig('Confusion matrix.png', dpi=300)
plt.show();

Finalization and Creating the Submission

In [None]:
# Load every test image, apply the Xception preprocessing and store the result
# in one pre-allocated float32 array of shape
# (len(test), INPUT_SIZE, INPUT_SIZE, 3).
x_test = np.zeros((len(test), INPUT_SIZE, INPUT_SIZE, 3), dtype='float32')
# Wrap the Series itself (not enumerate(...)) so tqdm can read its length and
# show a real progress bar with a total and an ETA.
for i, filepath in enumerate(tqdm(test['filepath'])):
    img = read_img(filepath, (INPUT_SIZE, INPUT_SIZE))
    # Preprocess a copy with a leading batch axis; `img` itself is left untouched.
    x = xception.preprocess_input(np.expand_dims(img.copy(), axis=0))
    x_test[i] = x
print("Test images dataset shape: {} size: {:,}".format(x_test.shape, x_test.size))

In [None]:
# Extract bottleneck features for the test images with the same Xception model.
test_x_bf = xception_bottleneck.predict(
    x_test,
    batch_size=32,
    verbose=1,
)
bf_shape, bf_size = test_x_bf.shape, test_x_bf.size
print('Xception test bottleneck features shape: {} size: {:,}'.format(bf_shape, bf_size))

# Predicted class ids from the fitted logistic-regression classifier.
test_preds = logreg.predict(test_x_bf)

In [None]:
# Attach the predicted class id and its species name to each test row, then
# write the (file, species) pairs to the submission CSV.
test['category_id'] = test_preds
test['species'] = test['category_id'].map(lambda class_id: CATEGORIES[class_id])
test.to_csv('submission.csv', columns=['file', 'species'], index=False)