# Plan

*  **Explore the files**  
The files present in the dataset are examined in this section.  
*  **Labels analysis**
Analysis of the labels found in the train dataset  
*  **Creating the One Hot Encoding of the labels**  
This is a `Multilabel Classification` problem, hence a One Hot Encoding of the labels needs to be done  
*   **Create the Data Generators**                   
Create the Train and Test Data Generators  
* **Flow from DataFrame**     
This section defines the function that lets a Keras image generator read the image paths and labels from a DataFrame  
* **Model Creation**
This is the section where the Actual Model is built
* **Test Data Preparation**     
The Test Data is prepared so that it can be sent to the Model
* **Map Predictions**
The predictions are mapped for the Submission File


# Explore the files

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
print(os.listdir("../input"))

# Any results you write to the current directory are saved as output.

In [None]:
from os import listdir

In [None]:
listdir('../input/train-jpg')[:10]

In [None]:
len(listdir('../input/train-jpg'))

In [None]:
test_path = '../input/test-jpg-v2/'

In [None]:
listdir(test_path)[:10]

In [None]:
len(listdir('../input/test-jpg-v2'))

# Labels analysis

In [None]:
import pandas as pd

In [None]:
labels_df = pd.read_csv('../input/train_v2.csv')

In [None]:
labels_df.head()

In [None]:
# Collect every tag occurrence (duplicates included) into one flat list:
# each row's space-separated tag string is split and the pieces are chained.
from itertools import chain
per_image_tags = (tag_string.split(" ") for tag_string in labels_df['tags'].values)
labels_list = list(chain.from_iterable(per_image_tags))

In [None]:
labels_list[:10]

In [None]:
labels_set = set(labels_list)

In [None]:
len(labels_list)

In [None]:
import numpy as np
len(np.unique(np.array(labels_list)))

In [None]:
len(labels_set)

In [None]:
labels_set

In [None]:
# Add one indicator column per label: 1.0 when the image carries that tag, else 0.0.
# The tag string is split into whole tokens before the membership test. A plain
# substring test (`c_label in finding`) would also fire for any label that happens
# to be contained in another one (e.g. 'cloudy' inside 'partly_cloudy').
for c_label in labels_set:
    labels_df[c_label] = labels_df['tags'].map(
        lambda finding, label=c_label: 1.0 if label in finding.split(' ') else 0.0)

In [None]:
labels_df.head()

# Creating the OHE vector for the labels

In [None]:
# Freeze the label order. Sorting (instead of relying on set iteration order, which
# for strings can differ between interpreter runs) keeps the column order of the
# one-hot vectors -- and therefore the meaning of each model output and of any
# saved weights -- reproducible across sessions.
labels_set = sorted(labels_set)

In [None]:
labels_df[labels_set].values

In [None]:
len(labels_df) == len(labels_df[labels_set]) 

In [None]:
labels_df.apply(lambda x: [x[labels_set].values],1)[:10]

In [None]:
# Store each row's one-hot label vector as a single array-valued column.
# Slicing the indicator columns once selects the same values as the former row-wise
# `apply`, without a Python-level call per row, and produces numeric float32
# arrays instead of object-dtype ones, so `np.stack` later yields a numeric matrix.
labels_df['labels_vec'] = list(labels_df[labels_set].values.astype(np.float32))

In [None]:
labels_df['labels_vec'].head()

In [None]:
labels_df.head()

In [None]:
train_data = pd.DataFrame()

In [None]:
train_data['image_path'] = listdir('../input/train-jpg')

In [None]:
train_data['image_name'] = train_data['image_path'].map(lambda x: os.path.splitext(os.path.basename(x))[0])

In [None]:
train_data.head()

In [None]:
# Attach the label information to the file listing. No `on=` is given, so pandas
# joins on the columns the two frames have in common -- presumably only
# `image_name` (TODO confirm against the columns of train_v2.csv).
train_data = pd.merge(train_data,labels_df)

In [None]:
train_data.head()

In [None]:
train_data['path'] = '../input/train-jpg/' + train_data['image_path']

In [None]:
train_data.head()

# Create Data Generators

In [None]:
from keras.preprocessing.image import ImageDataGenerator
# Size passed as `target_size` to every generator flow created later in the notebook.
IMG_SIZE = (128, 128)
# Shared image generator configuration: per-sample normalisation plus light random
# augmentation (parameter meanings per the Keras ImageDataGenerator documentation).
core_idg = ImageDataGenerator(samplewise_center=True,  # normalise each image on its own ...
                              samplewise_std_normalization=True,  # ... mean and std
                              horizontal_flip = True,  # random left/right flips enabled
                              vertical_flip = False,  # up/down flips disabled
                              height_shift_range= 0.05,  # random vertical shift
                              width_shift_range=0.1,  # random horizontal shift
                              rotation_range=5,  # random rotation
                              shear_range = 0.1,  # random shear
                              fill_mode = 'reflect',  # how pixels exposed by a transform are filled
                              zoom_range=0.15)  # random zoom

In [None]:
path_col = 'path'

In [None]:
base_dir = os.path.dirname(train_data[path_col].values[0])

In [None]:
base_dir

# Flow from DataFrame

In [None]:
def flow_from_dataframe(img_data_gen, in_df, path_col, y_col, **dflow_args):
    """Create a Keras directory iterator and re-point it at the rows of a DataFrame.

    The iterator is first built with ``flow_from_directory`` on the folder of
    the first path in ``in_df[path_col]``. Its file list, targets and sample
    counts are then overwritten with the DataFrame's contents, and its index
    array is rebuilt for the new size.

    Args:
        img_data_gen: object exposing ``flow_from_directory`` (an
            ``ImageDataGenerator`` in this notebook).
        in_df: DataFrame with one row per image.
        path_col: name of the column holding the full image paths.
        y_col: name of the column holding the per-image targets; the values
            are combined with ``np.stack``.
        **dflow_args: forwarded unchanged to ``flow_from_directory``.

    Returns:
        The patched iterator.
    """
    image_paths = in_df[path_col].values
    root_dir = os.path.dirname(image_paths[0])
    n_images = in_df.shape[0]

    print('## Ignore next message from keras, values are replaced anyways')
    iterator = img_data_gen.flow_from_directory(
        root_dir, class_mode='sparse', **dflow_args)

    # Swap what the directory scan found for the DataFrame's own contents.
    iterator.filenames = image_paths
    iterator.classes = np.stack(in_df[y_col].values)
    iterator.samples = n_images
    iterator.n = n_images
    # The index array must be rebuilt only after `n` holds the new size.
    iterator._set_index_array()
    # Paths in `filenames` are already complete, so no directory prefix is wanted.
    iterator.directory = ''

    print('Reinserting dataframe: {} images'.format(n_images))
    return iterator

In [None]:
from sklearn.model_selection import train_test_split

# Hold out a quarter of the labelled images for validation; the fixed seed keeps
# the split identical from run to run.
train_df, valid_df = train_test_split(train_data, test_size=0.25, random_state=2018)
print('train', len(train_df), 'validation', len(valid_df))

In [None]:
# Training batches come from the augmenting generator.
train_gen = flow_from_dataframe(core_idg, train_df,
                                path_col='path',
                                y_col='labels_vec',
                                target_size=IMG_SIZE,
                                batch_size=32)

# Evaluation images get the same per-sample normalisation as the training images
# but none of the random flips/shifts/rotations/zooms, so validation loss reflects
# the model rather than augmentation noise.
eval_idg = ImageDataGenerator(samplewise_center=True,
                              samplewise_std_normalization=True)

valid_gen = flow_from_dataframe(eval_idg, valid_df,
                                path_col='path',
                                y_col='labels_vec',
                                target_size=IMG_SIZE,
                                batch_size=256)  # we can use much larger batches for evaluation

# Use a fixed dataset for evaluating the algorithm: one big batch of up to 1024
# validation images, drawn once and reused for every epoch.
test_X, test_Y = next(flow_from_dataframe(eval_idg,
                                          valid_df,
                                          path_col='path',
                                          y_col='labels_vec',
                                          target_size=IMG_SIZE,
                                          batch_size=1024))  # one big batch

In [None]:
t_x, t_y = next(train_gen)

In [None]:
t_x.shape[1:]

In [None]:
img_dim = t_x.shape[1:]

# Vgg16

In [None]:
from keras.applications.vgg16 import VGG16
from keras.models import Model
from keras.layers import Dense, Input, Flatten, Dropout
from keras.layers.normalization import BatchNormalization
from sklearn.metrics import fbeta_score

In [None]:
num_classes = len(labels_set)

In [None]:
# Network input shaped like a single image from the training generator
# (img_dim is t_x.shape[1:], i.e. the batch shape without the batch axis).
input_tensor = Input(shape=img_dim)
# VGG16 without its fully-connected top layers (include_top=False), used as the feature extractor.
base_model = VGG16(include_top=False,input_shape=img_dim)
    
# Batch-normalise the raw input before it enters the VGG16 base.
bn = BatchNormalization()(input_tensor)
x = base_model(bn)
# Flatten the feature maps so a dense layer can be attached.
x = Flatten()(x)
# One sigmoid unit per label: each label is scored independently (multilabel output).
output = Dense(num_classes, activation='sigmoid')(x)
model = Model(input_tensor, output)
model.summary()

# Finetune Vgg16

In [None]:
from keras.callbacks import ModelCheckpoint, EarlyStopping, ReduceLROnPlateau, History
from keras.optimizers import Adam

In [None]:
# Callbacks handed to the training call below.
history = History()
callbacks = [history, 
             # Stop training once val_loss stops improving (patience=3, min_delta=1e-4).
             EarlyStopping(monitor='val_loss', patience=3, verbose=1, min_delta=1e-4),
             # Multiply the learning rate by 0.1 when val_loss plateaus (patience=1), never below 1e-7.
             ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=1, cooldown=0, min_lr=1e-7, verbose=1),
             # Keep only the best weights seen so far in 'weights.best.hdf5'
             # (save_best_only=True, weights only; the monitored quantity is the callback's default).
             ModelCheckpoint(filepath='weights.best.hdf5', verbose=1, save_best_only=True, 
                             save_weights_only=True, mode='auto')]

In [None]:
model.compile(optimizer=Adam(lr=1e-4), loss='binary_crossentropy', metrics = ['accuracy'])

In [None]:
# One epoch should be one full pass over the training frame. The step count is
# therefore derived from the batch size the training generator actually yields
# (a hard-coded 128 here disagreed with the generator's 32 and cut every epoch to
# a quarter of the data) and rounded up to a whole number of batches, since a
# fractional step count is not meaningful.
batch_size = train_gen.batch_size
steps_per_epoch = int(np.ceil(len(train_df) / batch_size))

In [None]:
steps_per_epoch

In [None]:
model.fit_generator(train_gen,steps_per_epoch=steps_per_epoch,validation_data = (test_X, test_Y), 
                                  epochs = 25,callbacks = callbacks)

In [None]:
model.load_weights('weights.best.hdf5')

# Test Data Preparation

In [None]:
import glob
from glob import glob
# List the test JPEGs in a deterministic (sorted) order. `recursive=True` was
# dropped because it only has an effect on patterns containing '**'.
test_image_paths = sorted(glob(test_path + '*.jpg'))

In [None]:
test_image_paths[:10]

In [None]:
X_test = pd.DataFrame()

In [None]:
X_test['path'] = test_image_paths

In [None]:
submission = pd.read_csv('../input/sample_submission_v2.csv')

In [None]:
submission.head()

In [None]:
X_test['image_name'] = X_test['path'].map(lambda x: os.path.splitext(os.path.basename(x))[0])

In [None]:
X_test.head()

In [None]:
# Test-time generator: same per-sample normalisation as training, but no random
# augmentation and -- crucially -- no shuffling. Keras directory iterators shuffle
# by default, which would return the predictions in a random order that no longer
# lines up with the rows of X_test used to build the submission.
test_idg = ImageDataGenerator(samplewise_center=True,
                              samplewise_std_normalization=True)
test_gen = flow_from_dataframe(test_idg, X_test,
                               path_col='path',
                               y_col='image_name',  # placeholder targets; ignored at prediction time
                               target_size=IMG_SIZE,
                               batch_size=256,  # we can use much larger batches for evaluation
                               shuffle=False)

In [None]:
pred_Y =  model.predict_generator(test_gen,verbose = 1)

In [None]:
# Lookup from model output position to label name, following the order of labels_set.
y_map = dict(enumerate(labels_set))

In [None]:
y_map

In [None]:
train_data.head()

In [None]:
thresholds = [0.2] * len(labels_set)

In [None]:
thresholds

## Map Predictions

**Details for Understanding (You Can Skip)**

In [None]:
# Print the first nine prediction vectors. The loop stops on the tenth row, which
# stays bound to `prediction` for the inspection cells that follow.
i = 0
for i, prediction in enumerate(pred_Y, start=1):
    if i == 10:
        break
    print(prediction)

In [None]:
type(prediction)

In [None]:
prediction

In [None]:
for i, value in enumerate(prediction):
    print(i, value)

**Actual Mapping**

In [None]:
len(prediction)

In [None]:
len(thresholds)

In [None]:
len(y_map)

In [None]:
# For every image, keep the names of the labels whose predicted score exceeds
# that label's threshold.
predictions_label = [
    [y_map[idx] for idx, score in enumerate(scores) if score > thresholds[idx]]
    for scores in pred_Y
]

In [None]:
predictions_label[:10]

In [None]:
submission = pd.DataFrame()

In [None]:
submission['image_name'] = X_test['image_name']

In [None]:
# Join each image's predicted labels into one space-separated string, the same
# layout the `tags` column of the training CSV is split from.
tags_list = [' '.join(map(str, image_tags)) for image_tags in predictions_label]

In [None]:
submission['tags'] = tags_list

In [None]:
submission.head()

In [None]:
submission.to_csv('predictions.csv',index = False)