In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
# Show what the input directory contains.
input_listing = os.listdir("../input")
print(input_listing)

# Any results you write to the current directory are saved as output.

In [None]:
%matplotlib inline
import datetime as dt
import matplotlib.pyplot as plt
# Notebook-wide matplotlib defaults: figure size and base font size.
plt.rcParams.update({'figure.figsize': [16, 10], 'font.size': 16})
import numpy as np
import os
import pandas as pd
import seaborn as sns
from keras.applications import xception
from keras.preprocessing import image
from mpl_toolkits.axes_grid1 import ImageGrid
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix
from tqdm import tqdm

In [None]:
# Class names; a class's position in this list is its numeric category id.
CATEGORIES = [
    'Black-grass',
    'Charlock',
    'Cleavers',
    'Common Chickweed',
    'Common wheat',
    'Fat Hen',
    'Loose Silky-bent',
    'Maize',
    'Scentless Mayweed',
    'Shepherds Purse',
    'Small-flowered Cranesbill',
    'Sugar beet',
]
NUM_CATEGORIES = len(CATEGORIES)


In [None]:
# Experiment configuration: per-class sample cap, RNG seed and dataset locations.
SAMPLE_PER_CATEGORY, SEED = 200, 1987
data_dir = '../input/'
train_dir, test_dir = (os.path.join(data_dir, split) for split in ('train', 'test'))

In [None]:
# Report how many entries each class folder in the training directory holds.
for category in CATEGORIES:
    n_images = len(os.listdir(os.path.join(train_dir, category)))
    print('{} {} images'.format(category, n_images))

In [None]:
# Build the training index: one row per image holding its path relative to data_dir,
# its numeric class id and its class name.
# os.listdir returns entries in arbitrary order, so each listing is sorted to keep the
# row order (and any later "first N rows per category" selection) identical between runs.
train = pd.DataFrame(
    [
        ['train/{}/{}'.format(category, file), category_id, category]
        for category_id, category in enumerate(CATEGORIES)
        for file in sorted(os.listdir(os.path.join(train_dir, category)))
    ],
    columns=['file', 'category_id', 'category'],
)
train.shape

In [None]:
# Keep at most SAMPLE_PER_CATEGORY rows per class, then shuffle the rows.
# random_state=SEED makes the shuffle repeatable; without it every run produced a
# differently ordered frame and therefore a different train/validation split.
train = pd.concat([train[train['category'] == c][:SAMPLE_PER_CATEGORY] for c in CATEGORIES])
train = train.sample(frac=1, random_state=SEED).reset_index(drop=True)
train.shape

In [None]:
# Index the test images: path relative to data_dir plus the bare file name.
# The path column is named 'file_path' so it agrees with the test frame that is
# rebuilt (and read via test['file_path']) further down in this notebook.
test = pd.DataFrame(
    [['test/{}'.format(file), file] for file in os.listdir(test_dir)],
    columns=['file_path', 'file'],
)
test.shape

In [None]:
def read_img(filepath, size):
    """Load one image from the dataset and return it as an array.

    Args:
        filepath: Image path relative to ``data_dir``; it is joined onto
            ``data_dir`` inside this function.
        size: Value passed to ``image.load_img`` as ``target_size``.

    Returns:
        The output of ``image.img_to_array`` for the loaded image.
    """
    full_path = os.path.join(data_dir, filepath)
    return image.img_to_array(image.load_img(full_path, target_size=size))

In [None]:
# Preview grid: up to NUM_CATEGORIES sample images per category, filled cell by cell.
fig = plt.figure(1, figsize=(NUM_CATEGORIES, NUM_CATEGORIES))
grid = ImageGrid(fig, 111, nrows_ncols=(NUM_CATEGORIES, NUM_CATEGORIES), axes_pad=0.05)
preview_paths = [
    path
    for category in CATEGORIES
    for path in train[train['category'] == category]['file'].values[:NUM_CATEGORIES]
]
for cell, filepath in enumerate(preview_paths):
    ax = grid[cell]
    ax.imshow(read_img(filepath, (224, 224)) / 255.)
    ax.axis('off')
    # On the last cell of each grid row, write the category folder name to its right.
    if (cell + 1) % NUM_CATEGORIES == 0:
        ax.text(250, 112, filepath.split('/')[1], verticalalignment='center')
plt.show();

In [None]:
# Seeded random split: rows whose uniform draw is below 0.8 go to training,
# the remaining rows go to validation.
np.random.seed(seed=SEED)
rnd = np.random.random(len(train))
train_idx, valid_idx = rnd < 0.8, rnd >= 0.8
category_ids = train['category_id'].values
ytr, yv = category_ids[train_idx], category_ids[valid_idx]
len(ytr), len(yv)

In [None]:
from keras.applications.resnet50 import ResNet50,preprocess_input, decode_predictions
INPUT_SIZE=224
POOLING='avg'
# Load every training image into one float32 array of shape
# (len(train), INPUT_SIZE, INPUT_SIZE, 3).
x_train=np.zeros((len(train),INPUT_SIZE,INPUT_SIZE,3),dtype=np.float32)
# read_img already prefixes data_dir, so the relative path is passed through unchanged.
# Joining data_dir a second time only resolved correctly because data_dir is '../input/'.
# tqdm wraps the column itself (not enumerate) so the progress bar knows the total.
for i,file in enumerate(tqdm(train['file'])):
    img = read_img(file,(INPUT_SIZE,INPUT_SIZE))
    # preprocess_input is the ResNet50 one imported above; it must be swapped if the backbone changes.
    x=preprocess_input(np.expand_dims(img.copy(),axis=0))
    x_train[i]=x[0]
print('Train image shape: {} size: {:,}'.format(x_train.shape,x_train.size))

In [None]:
# Slice the image tensor with the same boolean masks that produced ytr / yv.
xtrain, xvalid = x_train[train_idx], x_train[valid_idx]
print((xtrain.shape,xvalid.shape))

In [None]:
from keras.preprocessing import image
# ResNet50 with ImageNet weights and without its classification top.
# Despite the "vgg" in the variable name, the network built here is ResNet50.
vgg_bottleneck = ResNet50(
    include_top=False,
    weights='imagenet',
)

In [None]:
# Run both splits through the feature extractor, then report the resulting shapes.
train_vgg_bf = vgg_bottleneck.predict(xtrain, verbose=1, batch_size=32)
valid_vgg_bf = vgg_bottleneck.predict(xvalid, verbose=1, batch_size=32)
for split_name, feats in (('train', train_vgg_bf), ('valid', valid_vgg_bf)):
    print('VGG {} bottleneck features shape: {} size: {:,}'.format(split_name, feats.shape, feats.size))

In [None]:
# Flatten each image's feature map into a single vector.
# The row count is taken from the array and -1 lets NumPy infer the flattened width, so this
# keeps working when the split sizes, input resolution or backbone change (the previous
# hard-coded 1899 / 501 / 100352 only matched one specific configuration).
train_vgg_bf=train_vgg_bf.reshape(len(train_vgg_bf),-1)
valid_vgg_bf=valid_vgg_bf.reshape(len(valid_vgg_bf),-1)

In [None]:
import keras
# One-hot encode the integer class ids. The class count comes from NUM_CATEGORIES
# (len(CATEGORIES)) instead of a literal 12 so it tracks the category list.
one_hot_labels = keras.utils.to_categorical(ytr, num_classes=NUM_CATEGORIES)
valid_labels = keras.utils.to_categorical(yv, num_classes=NUM_CATEGORIES)

In [None]:
from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout

# Fully connected classifier trained on the flattened bottleneck features.
# The input width is read from the feature matrix; this assumes train_vgg_bf has
# already been flattened to 2-D (n_samples, n_features) by the preceding cell.
feature_dim = train_vgg_bf.shape[1]

model=Sequential()

model.add(Dense(1000, input_dim=feature_dim, activation='relu',kernel_initializer='uniform'))
# The Dropout layers used to be constructed and immediately discarded, so the network
# trained without any dropout. They are now actually added to the layer stack.
model.add(Dropout(0.3))

model.add(Dense(500,activation='sigmoid'))
model.add(Dropout(0.4))

model.add(Dense(150,activation='sigmoid'))
model.add(Dropout(0.2))

# One output unit per category, turned into class probabilities by softmax.
model.add(Dense(units=NUM_CATEGORIES))
model.add(Activation('softmax'))

model.compile(loss='categorical_crossentropy', optimizer="adam", metrics=['accuracy'])
model.summary()

In [None]:
# Train the classifier and evaluate on the validation split after every epoch.
history= model.fit(train_vgg_bf,one_hot_labels, epochs=10, batch_size=128,validation_data=(valid_vgg_bf, valid_labels))
print(history.history.keys())
# Depending on the Keras version the accuracy metric is logged as 'acc' or 'accuracy';
# pick whichever key is present so the plot does not fail with a KeyError.
acc_key = 'acc' if 'acc' in history.history else 'accuracy'
plt.plot(history.history[acc_key])
plt.plot(history.history['val_' + acc_key])
plt.title('model accuracy')
plt.ylabel('accuracy')
# history holds one value per epoch, and the second curve is the validation split
# passed as validation_data (not the test set).
plt.xlabel('epoch')
plt.legend(['train', 'validation'], loc='upper left')
plt.show()
model.save_weights('fc_model.h5')

Extra work: predict on the test set and write the submission file

In [None]:
# Rebuild the test index: path relative to data_dir plus the bare file name.
test = pd.DataFrame(
    [['test/{}'.format(name), name] for name in os.listdir(test_dir)],
    columns=['file_path', 'file'],
)
test.head(2)

In [None]:
# Load every test image into one float32 array of shape
# (len(test), INPUT_SIZE, INPUT_SIZE, 3).
x_test = np.zeros((len(test), INPUT_SIZE, INPUT_SIZE, 3), dtype='float32')
# read_img already prefixes data_dir, so the relative path is passed through unchanged.
# Joining data_dir a second time only resolved correctly because data_dir is '../input/'.
# tqdm wraps the column itself (not enumerate) so the progress bar knows the total.
for i, file in enumerate(tqdm(test['file_path'])):
    img = read_img(file, (INPUT_SIZE, INPUT_SIZE))
    x = preprocess_input(np.expand_dims(img.copy(), axis=0))
    x_test[i] = x[0]
print('test Images shape: {} size: {:,}'.format(x_test.shape, x_test.size))

In [None]:
# Run the test images through the same feature extractor used for the training data.
test_x_bf = vgg_bottleneck.predict(x_test, verbose=1, batch_size=32)
bf_shape, bf_size = test_x_bf.shape, test_x_bf.size
print('Test bottleneck features shape: {} size: {:,}'.format(bf_shape, bf_size))


In [None]:
# Flatten each test feature map into one vector per image. The row count comes from the
# array and -1 lets NumPy infer the width, replacing the hard-coded 794 x 100352 that only
# matched one specific test-set size and backbone.
test_x_bf=test_x_bf.reshape(len(test_x_bf),-1)

In [None]:
# Softmax outputs of the trained classifier for every flattened test feature vector.
test_preds = model.predict(test_x_bf)

In [None]:
# Sanity check: taking the argmax over the class axis should leave one label per test image.
test_preds.argmax(axis=1).shape

In [None]:
# Turn the highest-scoring class index into a species name and write the submission file.
test_pred_one = test_preds.argmax(axis=1)
test['category_id'] = test_pred_one
test['species'] = test['category_id'].map(dict(enumerate(CATEGORIES)))
submission = test[['file', 'species']]
submission.to_csv('submission.csv', index=False)

In [None]:
# Shell escape: list the files in the current working directory.
!ls