# Imports

In [None]:
import os
import io
import sys
import numpy as np 
import pandas as pd 
import bson
import cv2
import matplotlib.pyplot as plt
from skimage.io import imread , imshow
from PIL import Image
import seaborn as sns
from keras.models import Sequential
from keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout
from tensorflow.keras.optimizers import RMSprop,SGD,Adam

from sklearn.model_selection import train_test_split

from tensorflow.keras.utils import to_categorical
import keras
from keras.preprocessing.image import load_img, img_to_array
import tensorflow as tf

import base64

from io import BytesIO

from skimage import color

%matplotlib inline
import matplotlib.image as mpimg

from tensorflow.keras.preprocessing.image import ImageDataGenerator 

# Importation des données

In [None]:
# Category lookup table; later cells read its 'category_id' and
# 'category_level1' .. 'category_level3' columns.
category_name_df = pd.read_csv('../input/cdiscount-image-classification-challenge/category_names.csv')
# Iterator over the documents of train.bson; it is consumed by create_dataframe
# in a later cell.
# NOTE(review): the file objects opened inline in this cell are never closed
# explicitly -- presumably acceptable for a notebook session, confirm.
data_file = bson.decode_file_iter(open('../input/cdiscount-image-classification-challenge/train.bson', 'rb'))
# Not referenced again in the visible part of this file.
sample_submission_file = pd.read_csv('../input/cdiscount-image-classification-challenge/sample_submission.csv')

# Additional document iterators; none of them is referenced again in the
# visible part of this file (train_bson_file reads the same file as data_file).
test_bson_file =  bson.decode_file_iter(open('../input/cdiscount-image-classification-challenge/test.bson', 'rb'))
train_bson_file = bson.decode_file_iter(open('../input/cdiscount-image-classification-challenge/train.bson', 'rb'))
train_example_bson_file = bson.decode_file_iter(open('../input/cdiscount-image-classification-challenge/train_example.bson', 'rb'))

# Définition des méthodes utilisées

In [None]:
def encode_b64(data) :
    """Return the Base64 encoding of ``data`` as a bytes object."""
    return base64.b64encode(data)

def create_dataframe(path, max_products=1000000):
    """Flatten product documents into a DataFrame with one row per image.

    Args:
        path: Iterable of product documents (despite its name, this is not a
            filesystem path). Each document must provide ``'_id'``,
            ``'category_id'`` and ``'imgs'``; ``'imgs'`` is a sequence of
            dicts holding the raw image bytes under ``'picture'``.
        max_products: Maximum number of documents read from ``path``.
            ``None`` reads every document. Defaults to the previously
            hard-coded limit of 1,000,000.

    Returns:
        pandas.DataFrame with the columns ``_id``, ``category_id`` and
        ``picture``; ``picture`` is the Base64 text of the image bytes.
    """
    from itertools import islice

    _ids = []
    category_ids = []
    imgs = []

    # islice stops after max_products documents without pulling an extra one
    # from the underlying iterator.
    for product in islice(path, max_products):
        product_id = product['_id']
        category_id = product['category_id']
        for img_dict in product['imgs']:
            # Base64 text keeps the binary picture storable in a CSV column.
            encoded_picture = base64.b64encode(img_dict['picture']).decode('utf-8')

            _ids.append(product_id)
            category_ids.append(category_id)
            imgs.append(encoded_picture)
    return pd.DataFrame({'_id':_ids, 'category_id':category_ids, 'picture':imgs})

def assure_path_exists(dir):
    """Create the directory ``dir`` unless the path already exists.

    Missing parent directories are created as well. When the path already
    exists, ``'Folder Exists'`` is printed and nothing is created.

    Args:
        dir: Path of the directory to create.
    """
    if os.path.exists(dir):
        print('Folder Exists')
        return
    # makedirs handles nested paths; exist_ok avoids a crash if the directory
    # appears between the check above and the creation.
    os.makedirs(dir, exist_ok=True)
        
def create_category_id_folders(list_category_id, dir) :
    """Ensure ``dir`` exists and holds one sub-folder per category id.

    Each sub-folder is named after ``str(category_id)``.
    """
    assure_path_exists(dir)
    sub_folders = (os.path.join(dir, str(category)) for category in list_category_id)
    for folder in sub_folders:
        assure_path_exists(folder)
        
def write_images_to_category_folder(data, list_category_id, dir) :
    """Write the Base64 pictures of the selected categories as JPEG files.

    Every row of ``data`` whose ``'category_id'`` is in ``list_category_id``
    is decoded from its ``'picture'`` column and written to
    ``<dir>/<category_id>/<category_id>_<n>.jpg``. ``n`` is one running
    counter shared by all categories. The category folders must already exist.

    Args:
        data: DataFrame with ``'category_id'`` and ``'picture'`` columns.
        list_category_id: Iterable of (hashable) category ids to export.
        dir: Root folder containing one sub-folder per category id.
    """
    # Built once so each row costs a hash lookup instead of a list scan.
    wanted = set(list_category_id)
    written = 0
    for _, row in data.iterrows():
        category_id = row['category_id']
        if category_id not in wanted:
            continue
        file_name = str(category_id) + '_' + str(written) + '.jpg'
        dest_path = os.path.join(dir, str(category_id), file_name)
        with open(dest_path, 'wb') as f:
            f.write(base64.b64decode(row['picture']))
        written += 1
            
def write_train_test_images(data, list_category_id, df_category_id, train_dir, test_dir, nbr_train, nbr_test) :
    """Split the pictures of the selected categories into train/test folders.

    Rows of ``data`` are visited in order. For each selected category the
    first ``nbr_train`` pictures are written below ``train_dir`` and the next
    ``nbr_test`` pictures below ``test_dir``; further pictures are skipped.
    Files are named ``<category_id>_<n>.jpg`` inside the ``<category_id>``
    sub-folder, where ``n`` is the per-category counter. The sub-folders must
    already exist.

    Args:
        data: DataFrame with ``'category_id'`` and ``'picture'`` (Base64 text)
            columns.
        list_category_id: Iterable of (hashable) category ids to export.
        df_category_id: DataFrame listing the category ids in a
            ``'categories'`` column. A ``'count'`` column set to 0 is added to
            it, as before.
        train_dir: Root folder of the training images.
        test_dir: Root folder of the test/validation images.
        nbr_train: Number of training pictures per category.
        nbr_test: Number of test pictures per category.

    Raises:
        KeyError: If a selected category id is missing from
            ``df_category_id['categories']``.
    """
    # Kept for backward compatibility: callers may rely on this column.
    df_category_id['count'] = 0
    wanted = set(list_category_id)
    written = dict.fromkeys(df_category_id['categories'], 0)
    nbr_total = nbr_train + nbr_test
    for _, row in data.iterrows():
        category_id = row['category_id']
        if category_id not in wanted:
            continue
        seen = written[category_id]
        # Strict comparisons: exactly nbr_train train and nbr_test test files.
        if seen < nbr_train:
            target_dir = train_dir
        elif seen < nbr_total:
            target_dir = test_dir
        else:
            continue
        file_name = str(category_id) + '_' + str(seen) + '.jpg'
        dest_path = os.path.join(target_dir, str(category_id), file_name)
        with open(dest_path, 'wb') as f:
            f.write(base64.b64decode(row['picture']))
        written[category_id] = seen + 1
            
def plot_image(i, label, imgs):
    """Draw image ``i`` of the next batch with its category names as x-label.

    Args:
        i: Position of the image inside the batch returned by ``next(imgs)``.
        label: Sequence mapping a class index to a category id -- presumably
            in the same order as the generator's class indices; verify
            against the caller.
        imgs: Iterator yielding ``(images, labels)`` batches; the labels are
            expected to be one-hot style rows (the first positive entry is
            taken as the class index).

    Raises:
        IndexError: If the category id is missing from ``category_name_df``.
    """
    plt.grid(False)
    plt.xticks([])
    plt.yticks([])

    # Only one batch is fetched; the former unused ``imgs[i]`` lookup loaded
    # an extra batch and could fail for an ``i`` beyond the batch count.
    images, index = next(imgs)
    lbl_i = np.argwhere(index[i]>0)[0][0]
    category_id = label[lbl_i]

    plt.imshow(images[i])

    # Single lookup of the category row; columns 1..3 hold the level names.
    category_row = category_name_df[category_name_df['category_id']==category_id].iloc[0]
    title = category_row.iloc[1] + '\n' + category_row.iloc[2] + '\n' + category_row.iloc[3]

    plt.xlabel("{}".format(title), color='blue')
    
    
def _category_title(category_id):
    """Return the three level names of ``category_id`` joined by newlines."""
    row = category_name_df[category_name_df['category_id']==category_id].iloc[0]
    return row.iloc[1] + '\n' + row.iloc[2] + '\n' + row.iloc[3]


def plot_image_pred(i, predictions_array, true_label, imgs):
    """Draw image ``i`` of the next batch with predicted and true categories.

    The x-label shows the predicted category names, the confidence of the
    prediction and, in parentheses, the true category names. It is blue when
    the prediction is correct and red otherwise.

    Args:
        i: Position of the image inside the batch returned by ``next(imgs)``.
        predictions_array: Per-class scores for this image.
        true_label: Sequence mapping a class index to a category id --
            presumably ordered like the generator's class indices; verify
            against the caller.
        imgs: Iterator yielding ``(images, labels)`` batches with one-hot
            style label rows.

    Raises:
        IndexError: If a category id is missing from ``category_name_df``.
    """
    plt.grid(False)
    plt.xticks([])
    plt.yticks([])

    images, index = next(imgs)
    lbl_i = np.argwhere(index[i]>0)[0][0]

    plt.imshow(images[i])

    predicted_label = np.argmax(predictions_array)
    # Compare class indices with each other; the previous code compared the
    # predicted index with a category id, which practically never matched.
    label_color = 'blue' if predicted_label == lbl_i else 'red'

    # Map both class indices to category ids before looking up their names;
    # the predicted index is not a row position of category_name_df.
    predicted_title = _category_title(true_label[predicted_label])
    title = _category_title(true_label[lbl_i])

    plt.xlabel("{} {:2.0f}% \n ({})".format(predicted_title,
                                100*np.max(predictions_array),
                                title),
                                color=label_color, fontsize=30)

def plot_value_array(i, predictions_array, true_label, imgs):
    """Draw the per-class scores of one prediction as a bar chart.

    The bar of the predicted class is coloured red and the bar of the true
    class blue (blue wins when both coincide, as it is applied last).

    Args:
        i: Position of the sample inside the batch returned by ``next(imgs)``.
        predictions_array: Per-class scores; its length defines the number of
            bars.
        true_label: Unused; kept so existing calls keep working.
        imgs: Iterator yielding ``(images, labels)`` batches with one-hot
            style label rows.
    """
    _, index = next(imgs)
    lbl_i = np.argwhere(index[i]>0)[0][0]

    # Derived from the data instead of the hard-coded 194 classes.
    n_classes = len(predictions_array)

    plt.grid(False)
    plt.xticks(range(n_classes))
    plt.yticks([])
    thisplot = plt.bar(range(n_classes), predictions_array, color="#777777")
    plt.ylim([0, 1])
    predicted_label = np.argmax(predictions_array)

    thisplot[predicted_label].set_color('red')
    thisplot[lbl_i].set_color('blue')
    

def decode_images(item_imgs):
    """Compose the pictures of one product into a single image grid.

    One picture gives a 1x1 canvas, two a 1x2 canvas and more a 2x2 canvas;
    each grid cell is 180x180 pixels and pictures are placed row by row.

    Args:
        item_imgs: Sequence of dicts holding encoded image bytes under
            ``'picture'``.

    Returns:
        uint8 array of shape ``(ny * 180, nx * 180, 3)``.
    """
    cell = 180
    picture_count = len(item_imgs)
    nx = 2 if picture_count > 1 else 1
    ny = 2 if picture_count > 2 else 1
    canvas = np.zeros((ny * cell, nx * cell, 3), dtype=np.uint8)
    for position, img_dict in enumerate(item_imgs):
        picture = decode(img_dict['picture'])
        height, width, _ = picture.shape
        grid_row, grid_col = divmod(position, nx)
        top = grid_row * cell
        left = grid_col * cell
        canvas[top:top + height, left:left + width] = picture
    return canvas


def decode(data):
    """Decode encoded image bytes into a colour image in RGB channel order."""
    raw_bytes = np.asarray(bytearray(data), dtype=np.uint8)
    bgr_image = cv2.imdecode(raw_bytes, cv2.IMREAD_COLOR)
    # The decoded image is converted from BGR to RGB channel order.
    rgb_image = cv2.cvtColor(bgr_image, cv2.COLOR_BGR2RGB)
    return rgb_image

# Définition des variables statiques

In [None]:
# Working folders: a generic data folder plus the train/validation image roots.
IMAGES_DIR = os.path.join('.', 'data')
train_dir = "./train/"
validation_dir = "./test/"

# Create the folders in the same order as before.
for required_dir in (IMAGES_DIR, train_dir, validation_dir):
    assure_path_exists(required_dir)

# Batch size shared by the image generators.
bs = 50

# Both generators only rescale pixel values by 1/255.
train_datagen = ImageDataGenerator(rescale=1.0/255.)
test_datagen = ImageDataGenerator(rescale=1.0/255.)


# Affichage des images et leurs catégories

In [None]:
# Preview the first ``max_counter`` products, ``n`` per figure row.
max_counter = 16
counter = 0
n = 4

# Every column after the first one holds a category level name.
level_tags = category_name_df.columns[1:]

# The context manager closes the BSON file once the preview is done; it was
# previously opened inline and never closed.
with open('../input/cdiscount-image-classification-challenge/train.bson', 'rb') as preview_stream:
    for item in bson.decode_file_iter(preview_stream):
        # Start a new figure for every group of n products.
        if counter % n == 0:
            plt.figure(figsize=(14, 12))

        mask = category_name_df['category_id'] == item['category_id']
        plt.subplot(1, n, counter % n + 1)
        cat_levels = category_name_df[mask][level_tags].values.tolist()[0]
        # Truncate long level names so the titles stay readable.
        cat_levels = [c[:25] for c in cat_levels]
        title = str(item['category_id']) + '\n\n'
        title += '\n'.join(cat_levels)
        plt.title(title+'\n')
        plt.imshow(decode_images(item['imgs']))
        plt.axis('off')

        counter += 1
        if counter == max_counter:
            break

# Affichage du nombre des catégories et des niveaux

In [None]:
# Report the number of distinct values of the id column and of each level.
for caption, column_name in (
    ("Unique categories: ", 'category_id'),
    ("Unique level 1 categories: ", 'category_level1'),
    ("Unique level 2 categories: ", 'category_level2'),
    ("Unique level 3 categories: ", 'category_level3'),
):
    print(caption, len(category_name_df[column_name].unique()))

# Affichage des noms des catégories du premier niveau

In [None]:
# Print every distinct level-1 category name on its own line.
level1_names = category_name_df['category_level1'].unique()
for level1_name in level1_names:
    print(level1_name)

# Affichage des statistiques pour le premier niveau

In [None]:
# Horizontal count plot of category_name_df rows per level-1 category.
plt.figure(figsize=(12,12))
# The return value is bound to ``_`` because it is not used afterwards.
_ = sns.countplot(y=category_name_df['category_level1'])

# Création du DataFrame et enregistrement au format CSV

In [None]:
# Flatten the training documents into one row per picture (this consumes the
# data_file iterator), then show the first rows.
df = create_dataframe(data_file)
df.head()

In [None]:
# Folder receiving the CSV export.
assure_path_exists('./csv')

In [None]:
# Persist the flattened data without the DataFrame index.
df.to_csv("./csv/data.csv", index=False)

# Lecture des données depuis le fichier .csv

In [None]:
# Reload the exported rows from disk and show (rows, columns).
data = pd.read_csv('./csv/data.csv')
data.shape

# Nombre d'images par catégorie

In [None]:
# Number of rows (one per picture) for every category_id.
df_count = data.groupby(['category_id'])['_id'].count()

In [None]:
# Display the counts.
df_count

In [None]:
# Same counts as a DataFrame: index is category_id, the '_id' column is the count.
df_category_count = pd.DataFrame(df_count)

In [None]:
# Display the count table.
df_category_count

# Catégories qui ont plus de 1500 images

In [None]:
# Keep only the categories with strictly more than 1500 pictures.
df_count_1500 = df_category_count[df_category_count['_id']>1500]

In [None]:
df_count_1500

In [None]:
# Category ids of the retained categories (the index of the count table).
list_category_id_1500 = list(df_count_1500.index.values)

In [None]:
list_category_id_1500

In [None]:
# Number of retained categories.
len(list_category_id_1500)
# list_category_id_1500.count()

In [None]:
# Single-column DataFrame of the retained ids (the column is named 0 here).
df_category_id_1500 = pd.DataFrame(list_category_id_1500)
df_category_id_1500

In [None]:
# Rename column 0 to 'categories', the name read by write_train_test_images.
df_category_id_1500 = df_category_id_1500.rename({0: 'categories'}, axis='columns')
df_category_id_1500

# Création des dossiers

In [None]:
# One sub-folder per retained category below the train and validation roots.
# create_category_id_folders(list_category_id_1500, IMAGES_DIR)
create_category_id_folders(list_category_id_1500, train_dir)
create_category_id_folders(list_category_id_1500, validation_dir)

In [None]:
# Display the category_id column of the reloaded data.
data['category_id']

# Ecriture des images dans les dossiers appropriés

In [None]:
# Write the pictures of the retained categories into the train and validation
# folders (nbr_train=1200, nbr_test=300 per category).
write_train_test_images(data, list_category_id_1500, df_category_id_1500, train_dir, validation_dir, 1200, 300)

# Visualisation des images

In [None]:
# i = 0
# for i in range(0,3):
#     plt.figure(figsize=(16, 16))
#     plt.subplot(2,3,i+1)
#     plot_image(i, list_category_id_1500, validation_generator)
#     plt.show()
#     i+=1

# Construction du réseau de neurones

In [None]:
# One output unit per retained category instead of a hard-coded 194, so the
# network always matches the number of class folders created above.
num_classes = len(list_category_id_1500)

# Four convolution/pooling stages followed by a dense classifier head with
# dropout; input images are 180x180 RGB.
model = tf.keras.models.Sequential([
    tf.keras.layers.Conv2D(16,(3,3),activation = "relu" , input_shape = (180,180,3)) ,
    tf.keras.layers.MaxPooling2D(2,2),
    tf.keras.layers.Conv2D(32,(3,3),activation = "relu") ,
    tf.keras.layers.MaxPooling2D(2,2),
    tf.keras.layers.Conv2D(64,(3,3),activation = "relu") ,
    tf.keras.layers.MaxPooling2D(2,2),
    tf.keras.layers.Conv2D(128,(3,3),activation = "relu"),
    tf.keras.layers.MaxPooling2D(2,2),
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(550,activation="relu"),
    tf.keras.layers.Dropout(0.1,seed = 2019),
    tf.keras.layers.Dense(400,activation ="relu"),
    tf.keras.layers.Dropout(0.3,seed = 2019),
    tf.keras.layers.Dense(300,activation="relu"),
    tf.keras.layers.Dropout(0.4,seed = 2019),
    tf.keras.layers.Dense(200,activation ="relu"),
    tf.keras.layers.Dropout(0.2,seed = 2019),
    # Softmax output: the model already returns class probabilities.
    tf.keras.layers.Dense(num_classes,activation = "softmax")
])

In [None]:
# Print the layer-by-layer summary of the network.
model.summary()

In [None]:
# Pass the configured optimizer instance itself; the string 'adam' silently
# ignored it and used a default-configured optimizer instead.
adam=Adam(learning_rate=0.001)
model.compile(optimizer=adam, loss='categorical_crossentropy', metrics = ['acc'])

In [None]:
# Options shared by both directory iterators: batches of ``bs`` images resized
# to 180x180 with categorical label rows.
flow_options = dict(batch_size=bs,
                    class_mode='categorical',
                    target_size=(180,180))

# NOTE(review): no shuffle argument is passed, so the library default applies
# to both iterators -- confirm that the validation order matches what the
# prediction/plotting cells expect.
train_generator = train_datagen.flow_from_directory(train_dir, **flow_options)

validation_generator = test_datagen.flow_from_directory(validation_dir, **flow_options)

In [None]:
# Train for 30 epochs of 1200 // bs training batches, validating on
# 300 // bs batches after each epoch.
# NOTE(review): 1200 and 300 equal the per-category picture counts passed to
# write_train_test_images, not dataset totals -- confirm that drawing only
# this many batches per epoch is intended.
history = model.fit(train_generator,
                    validation_data=validation_generator,
                    steps_per_epoch=1200 // bs,
                    epochs=30,
                    validation_steps=300 // bs,
#                     verbose=2
                             )

# Evaluation des résultats

In [None]:
# Plot the four recorded training curves on one set of axes, in legend order.
for history_key in ("acc", "val_acc", "loss", "val_loss"):
    plt.plot(history.history[history_key])
plt.title("model accuracy")
plt.xlabel("Epoch")
plt.ylabel("Accuracy")
plt.legend(["Accuracy","Validation Accuracy","loss","Validation Loss"])
plt.show()

# Test et évaluation des résultats

In [None]:
# The network's last layer already applies softmax, so it is used directly.
# Wrapping it in a second Softmax layer squashed the probabilities towards a
# uniform distribution and distorted the reported confidences.
probability_model = model

In [None]:
# Class scores for the batches produced by the validation generator.
# NOTE(review): the plotting cell pairs predictions[i] with images drawn from
# later next(validation_generator) calls -- verify that both refer to the
# same samples.
predictions = probability_model.predict(validation_generator)

In [None]:
# Display the raw prediction array.
predictions

In [None]:
# i = 0
# for i in range(0,5):
#     plt.figure(figsize=(6,3))
#     plt.subplot(1,2,1)
#     plot_image_pred(i, predictions[i], list_category_id_1500, validation_generator)
#     plt.subplot(1,2,2)
#     plot_value_array(i, predictions[i],  list_category_id_1500, validation_generator)
#     plt.show()
#     i+=1

In [None]:
# Grid of predictions: each sample uses two neighbouring cells, the picture on
# the left and its score bar chart on the right.
num_rows, num_cols = 10, 1
num_images = num_rows*num_cols
grid_columns = 2*num_cols
plt.figure(figsize=(25*2*num_cols, 4*2*2*num_rows))
for i in range(num_images):
    picture_cell = 2*i+1
    plt.subplot(num_rows, grid_columns, picture_cell)
    plot_image_pred(i, predictions[i], list_category_id_1500, validation_generator)
    plt.subplot(num_rows, grid_columns, picture_cell+1)
    plot_value_array(i, predictions[i],  list_category_id_1500, validation_generator)
plt.tight_layout()
plt.show()