In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _subfolders, names in os.walk('/kaggle/input')
    for name in names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

import keras
from keras import backend as K
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten
from keras.layers import Conv2D, MaxPool2D
from keras.applications import ResNet50, VGG16
from keras.optimizers import Adam

from sklearn.metrics import fbeta_score
from sklearn.model_selection import train_test_split

import cv2
import os
from tqdm import tqdm

import time

from os import listdir
import csv

## Read Data

In [None]:
# Load the training labels table and preview its first five rows.
train_csv_path = "../input/planets-dataset/planet/planet/train_classes.csv"
df = pd.read_csv(train_csv_path)
df.head()

In [None]:
# Count how many images carry each label.
# `split` holds, per image, the list of tags obtained by splitting on spaces.
label_list = {}
split = df['tags'].map(lambda x: x.split(' '))
for image_tags in split.values:
    for label in image_tags:
        # Start at 0 for an unseen label and add 1 for every occurrence,
        # so the first sighting is counted as 1 (not 0).
        label_list[label] = label_list.get(label, 0) + 1

print("There are {} unique labels in our dataset".format(len(label_list)))

In [None]:
# Bar chart of label occurrence against label name, showing which labels
# appear most often in the dataset.
label_names = list(label_list.keys())
label_counts = list(label_list.values())
idxs = range(len(label_counts))

plt.figure(figsize=(18, 6))
plt.title('Classes')
plt.xticks(idxs, label_names, rotation=-45)
plt.bar(idxs, label_counts);

In [None]:
# Preview three consecutive training images, starting at a randomly chosen row.
plt.rc('axes', grid = True)

n_preview = 3
_, ax = plt.subplots(1, n_preview, figsize=(20, 20))
random_img = np.random.randint(0, len(df) - n_preview)
preview_rows = df[random_img:random_img + n_preview].values
for i, (file, label) in enumerate(preview_rows):
    img_path = '../input/planets-dataset/planet/planet/train-jpg/{}.jpg'.format(file)
    img = cv2.imread(img_path)
    # The image is converted from BGR to RGB channel order before display.
    rgb_img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    ax[i].imshow(rgb_img)
    ax[i].set_title('{} - {}'.format(file, label))

plt.show()

## Splitting our data into train and test data

In [None]:

# Collect the unique labels in sorted order. The position of a label in this
# list is the column it occupies in the target / prediction matrices, so the
# order must be stable: iterating a plain set of strings can yield a different
# order on every kernel restart, which makes results non-reproducible.
label_collection = split.values
labels = sorted({label for image_tags in label_collection for label in image_tags})

def load_data(df, labels, resize,
              img_dir='../input/planets-dataset/planet/planet/train-jpg'):
    """Load images and build their multi-hot label vectors.

    Parameters
    ----------
    df : pandas.DataFrame
        Two-column frame; each row is ``(image name without extension,
        space-separated tags)``.
    labels : sequence of str
        All known labels. A label's position in this sequence is its column
        index in the returned target matrix.
    resize : tuple of int
        Target size, passed straight to ``cv2.resize`` as ``dsize``.
    img_dir : str, optional
        Directory that contains the ``<name>.jpg`` files. Defaults to the
        training image folder of the dataset.

    Returns
    -------
    X_train : numpy.ndarray
        float16 array of the resized images, scaled by 1/255.
    y_train : numpy.ndarray
        uint8 array with one row per image and ``len(labels)`` columns; a 1
        marks a tag that is present on the image.

    Raises
    ------
    FileNotFoundError
        If an image cannot be read from ``img_dir``.
    KeyError
        If a row contains a tag that is not in ``labels``.
    """
    X_train = []
    y_train = []

    label_map = {l: i for i, l in enumerate(labels)}
    # Size the target vector from the label list instead of a hard-coded 17.
    n_labels = len(labels)

    for f, tags in df.values:
        path = '{}/{}.jpg'.format(img_dir, f)
        img = cv2.imread(path)
        if img is None:
            # cv2.imread signals failure by returning None; fail with a clear
            # message here instead of an opaque error inside cv2.resize.
            raise FileNotFoundError('Could not read image: {}'.format(path))

        targets = np.zeros(n_labels)
        for t in tags.split(' '):
            targets[label_map[t]] = 1

        X_train.append(cv2.resize(img, resize))
        y_train.append(targets)

    y_train = np.array(y_train, np.uint8)
    X_train = np.array(X_train, np.float16) / 255.

    return X_train, y_train

In [None]:
# Load every training image at 64x64 together with its label vector.
image_size = (64, 64)
X, y = load_data(df, labels, resize=image_size)

In [None]:
# Hold out 20% of the samples for validation; the remaining 80% are used for training.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=33,  # fixed seed so the split is the same on every run
)

## Model Building

In [None]:
def f_beta_score(y_true, y_pred):
    """Soft F-beta metric (beta squared = 4, i.e. F2) built from Keras backend ops.

    The sums run over every element of the tensors passed in, so precision and
    recall are pooled across all samples and labels rather than computed per
    sample. ``K.epsilon()`` is added to the true-positive sum and to the final
    denominator.

    Parameters
    ----------
    y_true : tensor
        Ground-truth label indicators.
    y_pred : tensor
        Predicted scores, same shape as ``y_true``.

    Returns
    -------
    tensor
        Scalar F-beta value.
    """
    beta_sq = 4
    eps = K.epsilon()

    true_pos = K.sum(y_true * y_pred) + eps
    false_pos = K.sum(y_pred) - true_pos
    false_neg = K.sum(y_true) - true_pos

    precision = true_pos / (true_pos + false_pos)
    recall = true_pos / (true_pos + false_neg)

    numerator = (beta_sq + 1) * (precision * recall)
    denominator = beta_sq * precision + recall + eps
    return numerator / denominator

In [None]:
#### CNN
# Four convolutional stages (8, 16, 32 and 64 filters) followed by a dense head
# that ends in 17 sigmoid outputs.
conv_kwargs = dict(kernel_size=(3, 3), activation='relu')

model = Sequential([
    # Stage 1: 8 filters
    Conv2D(8, padding='same', input_shape=(64, 64, 3), **conv_kwargs),
    Conv2D(8, **conv_kwargs),
    MaxPool2D(pool_size=(2, 2)),
    Dropout(0.1),

    # Stage 2: 16 filters (no pooling in this stage)
    Conv2D(16, padding='same', **conv_kwargs),
    Conv2D(16, **conv_kwargs),
    Dropout(0.1),

    # Stage 3: 32 filters
    Conv2D(32, padding='same', **conv_kwargs),
    Conv2D(32, **conv_kwargs),
    Conv2D(32, **conv_kwargs),
    MaxPool2D(pool_size=(2, 2)),
    Dropout(0.1),

    # Stage 4: 64 filters
    Conv2D(64, padding='same', **conv_kwargs),
    Conv2D(64, **conv_kwargs),
    Conv2D(64, **conv_kwargs),
    MaxPool2D(pool_size=(2, 2)),
    Dropout(0.1),

    # Classification head
    Flatten(),
    Dense(128, activation='relu'),
    Dense(17, activation='sigmoid'),
])

In [None]:
# Compile with binary cross-entropy and the Adam optimizer, tracking the custom
# F-beta metric, then print the layer summary.
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=[f_beta_score],
)
model.summary()

In [None]:
# Train on the training split for 5 epochs, using the validation split as
# validation data.
model_fit = model.fit(
    x=X_train,
    y=y_train,
    validation_data=(X_val, y_val),
    epochs=5,
    batch_size=64,
    verbose=1,
)

## Calculating F-beta Score

In [None]:
# Run the trained model on the validation images to get its per-label outputs.
y_pred = model.predict(x=X_val, batch_size=64)

In [None]:
# Threshold above which a model output counts as a predicted label.
cutoff = 0.45

# Turn predictions and ground truth into 0/1 matrices with y_pred's dtype:
# 1 where the value exceeds the cutoff, 0 elsewhere.
y_pred_classes = (y_pred > cutoff).astype(y_pred.dtype)
y_test_classes = (y_val > cutoff).astype(y_pred.dtype)

In [None]:
# F-beta score on the validation split, averaged per sample.
# beta=2 (F2, recall weighted higher than precision) matches the
# `beta_squared = 4` used by the `f_beta_score` training metric; the previous
# beta=0.5 measured a different, precision-weighted quantity.
score = fbeta_score(y_test_classes, y_pred_classes, average="samples", beta=2)

print("F beta score: ", score)
print("Error: %.2f%%" % (100 - score * 100))

In [None]:
# Display learning curve
def learning_curve(model_fit, key='acc', ylim=(0.8, 1.01)):
    """Plot one recorded metric for the training and validation runs.

    Parameters
    ----------
    model_fit : object
        Object whose ``history`` attribute maps metric names to per-epoch
        values (as returned by ``model.fit``).
    key : str
        Metric to plot; ``history[key]`` and ``history['val_' + key]`` are drawn.
    ylim : tuple
        Limits applied to the y axis.

    Raises
    ------
    KeyError
        If ``key`` or ``'val_' + key`` is missing from the history.
    """
    history = model_fit.history

    plt.figure(figsize=(12, 6))
    # Training series first, then its validation counterpart.
    for series_name in (key, 'val_' + key):
        plt.plot(history[series_name])

    plt.title('Learning Curve')
    plt.ylabel(key.title())
    plt.xlabel('Epoch')
    plt.ylim(ylim)
    plt.legend(['train', 'test'], loc='best')
    plt.show()

In [None]:
# Plot the training and validation loss recorded during fitting.
learning_curve(model_fit, 'loss', (0, 1))

## Setting up our test folder

In [None]:
# Make a directory to hold all files from the test-jpg and test-jpg-additional folders.
# exist_ok=True keeps this cell re-runnable: os.mkdir would raise
# FileExistsError if the folder is already there from a previous run.
os.makedirs("./test", exist_ok=True)

In [None]:
# NOTE(review): distutils is deprecated and removed in Python 3.12;
# shutil.copytree(src, dst, dirs_exist_ok=True) is the replacement on Python >= 3.8.
from distutils.dir_util import copy_tree

src1 = "../input/planets-dataset/test-jpg-additional/test-jpg-additional"
src2 = "../input/planets-dataset/planet/planet/test-jpg"
to = "./test"

# Copy BOTH test sets into the working folder. Previously only `src2` was
# copied, so the images under `src1` never reached ./test.
# Looping (instead of ending the cell on a bare copy_tree call) also avoids
# echoing the full list of copied files as cell output.
for src in (src2, src1):
    copy_tree(src, to)

In [None]:
# Check whether there is a total of 61191 files in our test folder.
test_file_count = len(listdir("./test"))
print(test_file_count)

In [None]:
# kaggle submission
# Load every test image, predict its labels and write the submission CSV.
X_test = []
submission = []
# Sorted so the rows of the submission file come out in the same order on every run.
for file in sorted(listdir('./test')):
    filename, extension = os.path.splitext(file)
    if extension.lower() != '.jpg':
        continue  # ignore anything in the folder that is not a jpg image

    img = cv2.imread(os.path.join('./test', file))
    if img is None:
        # cv2.imread returns None on failure; report the offending file
        # instead of crashing later inside cv2.resize.
        raise ValueError('Could not read image: {}'.format(file))

    X_test.append(cv2.resize(img, (64, 64)))
    submission.append(filename)

X_test = np.array(X_test, np.float16) / 255

y_test = model.predict(X_test, batch_size=64)

# Built once: boolean-mask indexing below picks the label names whose
# predicted score exceeds the threshold.
label_names = np.array(labels)

with open('understanding_the_amazon_from_space_final.csv', 'w', newline='') as csvfile:
    csv_writer = csv.writer(csvfile, delimiter=',',
                            quotechar='|', quoting=csv.QUOTE_MINIMAL)
    csv_writer.writerow(('image_name', 'tags'))
    for image, scores in zip(submission, y_test):
        # 0.45 is the same cutoff used when scoring the validation predictions.
        csv_writer.writerow((image, ' '.join(label_names[scores > 0.45])))