In [None]:
# Basic imports
import os
import numpy as np
import pandas as pd

# Tensorflow/Keras imports
from keras.applications.vgg16 import preprocess_input
from keras.preprocessing.image import load_img
from keras.preprocessing.image import img_to_array
from keras import Model
from keras.layers import Conv2D, MaxPooling2D, Input, UpSampling2D


In [None]:
# Locate the raw images relative to the directory the notebook is run from.
path = os.getcwd()
raw_data_directory = os.path.join(path, 'Raw_data')

# os.listdir returns entries in arbitrary order. Sorting fixes the row order of
# every array/frame derived from this list, so the seeded train/test split and
# the filename <-> feature alignment are the same on every run.
list_filename = sorted(os.listdir(raw_data_directory))
list_filename_full = [os.path.join(raw_data_directory, f) for f in list_filename]

# Image size used when loading (target_size=(WIDTH, HEIGHT)) and as the model
# input shape (WIDTH, HEIGHT, NCHANNELS).
# NOTE(review): Keras documents target_size as (img_height, img_width), so WIDTH
# appears to be the number of pixel rows here - confirm the naming is intended.
WIDTH = 128
HEIGHT = 192
NCHANNELS = 3

In [None]:
def load_images(f, width=128, height=192):
    """Load a single image file as a float32 array divided by 255.

    Parameters
    ----------
    f : str
        Path of the image file to read.
    width, height : int
        Forwarded to ``load_img`` as ``target_size=(width, height)``.
        NOTE(review): Keras documents ``target_size`` as
        ``(img_height, img_width)``, so ``width`` looks like the row count of
        the returned array - confirm before renaming anything.

    Returns
    -------
    The array produced by ``img_to_array`` for the RGB-decoded image, cast to
    float32 and divided by 255.
    """
    resized = load_img(f, target_size=(width, height), color_mode='rgb')
    pixels = img_to_array(resized).astype('float32')
    return pixels / 255.

In [None]:
import multiprocessing as mp
from sklearn.model_selection import train_test_split

# Load images with parallel processing.
# The pool is used as a context manager so the worker processes are always
# released, including when loading one of the files raises inside starmap.
with mp.Pool(mp.cpu_count()) as pool:
    images = pool.starmap(load_images, [(f, WIDTH, HEIGHT) for f in list_filename_full])
images = np.array(images)

# Split into training and validation sets (seeded, 80% / 20%).
x_train, x_test = train_test_split(images, test_size=0.2, random_state=123)

In [None]:

# Convolutional autoencoder: the input image is also the reconstruction target.
input_img = Input(shape=(WIDTH, HEIGHT, NCHANNELS))

# Encoder: three (3x3 conv, 2x2 max-pool) stages; `encoded` is the bottleneck.
x = input_img
for _stage in range(3):
    x = Conv2D(3, (3, 3), padding='same', activation='relu')(x)
    x = MaxPooling2D((2, 2), padding='same')(x)
encoded = x

# Decoder: three (3x3 conv, 2x2 upsampling) stages, then a sigmoid conv so the
# output lies in the same [0, 1] range as the rescaled input images.
for _stage in range(3):
    x = Conv2D(3, (3, 3), padding='same', activation='relu')(x)
    x = UpSampling2D((2, 2))(x)
decoded = Conv2D(3, (3, 3), padding='same', activation='sigmoid')(x)

autoencoder = Model(input_img, decoded)
autoencoder.compile(loss='binary_crossentropy', optimizer='adam')


In [None]:
from keras.callbacks import TensorBoard

# Train the autoencoder to reproduce its own input; the held-out images are
# only used to monitor the validation loss. Logs go to ./CNN/ for TensorBoard.
tensorboard_logger = TensorBoard(log_dir='./CNN/')
autoencoder.fit(
    x_train,
    x_train,
    validation_data=(x_test, x_test),
    batch_size=128,
    epochs=50,
    shuffle=True,
    callbacks=[tensorboard_logger],
)

In [None]:
import matplotlib.pyplot as plt

# Reconstruct the held-out images and compare them with the originals.
decoded_imgs = autoencoder.predict(x_test)

# Never request more panels than there are held-out images.
n = min(4, len(x_test))
fig, axes = plt.subplots(2, n, figsize=(8, 4), squeeze=False)
for i in range(n):
    # Image index and column index are both 0-based, so the first held-out
    # image is shown and exactly n images are needed (the previous 1-based
    # loop skipped image 0 and read index n).
    # Top row: original, bottom row: reconstruction.
    for ax, img in zip(axes[:, i], (x_test[i], decoded_imgs[i])):
        ax.imshow(img)
        ax.get_xaxis().set_visible(False)
        ax.get_yaxis().set_visible(False)
# No visible effect on these RGB panels; kept so the session-wide default
# colormap ends up in the same state as before for any later plotting.
plt.gray()
plt.show()

In [None]:
# Extracting features

# DNN: reuse the trained encoder half (input -> bottleneck) as a feature
# extractor and flatten each bottleneck map into one row per image.
encoder = Model(input_img, encoded)
DNN_features = encoder.predict(images).reshape(len(images), -1)
DNN_features.shape



In [None]:
# PCA
from sklearn.decomposition import PCA

# Flatten every image into a single row vector (on a copy of the pixel data),
# then project with PCA. n_components=0.8 is a float in (0, 1), which
# scikit-learn documents as "keep enough components to reach that fraction of
# explained variance".
pca = PCA(n_components=0.8)
images_vectorized = np.reshape(images, (len(images), -1)).copy()
PCA_features = pca.fit_transform(images_vectorized)

In [None]:
# Clustering
from sklearn.cluster import KMeans
from sklearn.pipeline import make_pipeline
from sklearn import preprocessing


N_CLUSTERS = 10


def _kmeans_on_features(features, ids, n_clusters=N_CLUSTERS):
    """Cluster ``features`` with KMeans and build the frame used for plotting/export.

    The features are standardized (centered and scaled by ``StandardScaler``)
    before a seeded KMeans is fitted on them.

    Parameters
    ----------
    features : 2-D array, one row per image.
    ids : sequence of file names, in the same row order as ``features``.
    n_clusters : number of KMeans clusters.

    Returns
    -------
    (clf, frame) where ``clf`` is the fitted StandardScaler + KMeans pipeline
    and ``frame`` has the columns '0' and '1' (the first two columns of
    ``features``, unscaled), 'class' (cluster label as a string) and 'id'.
    """
    clf = make_pipeline(preprocessing.StandardScaler(),
                        KMeans(n_clusters=n_clusters, random_state=0))
    clf.fit(features)

    frame = pd.DataFrame(features[:, 0:2])
    frame['class'] = [str(label) for label in clf['kmeans'].labels_]
    frame['id'] = ids
    frame.columns = ['0', '1', 'class', 'id']
    return clf, frame


# KMeans on the autoencoder (DNN) features
clf_kmeans_DNN, Ydf_DNN = _kmeans_on_features(DNN_features, list_filename)

# KMeans on the PCA features
clf_kmeans_PCA, Ydf_PCA = _kmeans_on_features(PCA_features, list_filename)

In [None]:
from sklearn.manifold import TSNE

# T-SNE

# t-SNE is stochastic. Seeding it with the same seed the KMeans steps use makes
# the embedding - and the clusters built on top of it - repeatable for a given
# DNN_features array.
DNN_TSNE_features = TSNE(n_components=2, random_state=0).fit_transform(DNN_features)

# Standardize the 2-D embedding, then cluster it with a seeded KMeans.
clf_kmeans_DNN_TSNE = make_pipeline(preprocessing.StandardScaler(), KMeans(n_clusters=N_CLUSTERS, random_state=0))
clf_kmeans_DNN_TSNE.fit(DNN_TSNE_features)

# Frame for plotting/export: embedding coordinates, cluster label (as a string
# so it can be used as a categorical factor) and the source file name.
colors = [str(elt) for elt in clf_kmeans_DNN_TSNE['kmeans'].labels_]
Ydf_DNN_TSNE = pd.DataFrame(DNN_TSNE_features[:, 0:2])
Ydf_DNN_TSNE['class'] = colors
Ydf_DNN_TSNE['id'] = list_filename
Ydf_DNN_TSNE.columns = ['0', '1', 'class', 'id']

In [None]:
import pandas as pd
import bokeh.plotting as bpl
import bokeh.models as bmo
from bokeh.palettes import d3
bpl.output_file("line.html")

source = bpl.ColumnDataSource(Ydf_DNN)

# Sort the cluster labels numerically so a given cluster always gets the same
# colour, whatever order the labels first appear in the frame.
factors = sorted(Ydf_DNN['class'].unique(), key=int)
# d3 Category10 only ships palettes for 3 to 10 colours: take the smallest one
# that fits and trim it, so fewer than 3 clusters no longer raises KeyError.
palette = d3['Category10'][max(3, len(factors))][:len(factors)]
color_map = bmo.CategoricalColorMapper(factors=factors, palette=palette)

# create figure and plot
# width/height replace plot_width/plot_height, which Bokeh 3 removed.
p = bpl.figure(width=1400, height=1000)
# legend_field groups the legend by the values of the 'class' column;
# legend_label would create one entry literally named "class".
p.scatter(x='0', y='1',
          color={'field': 'class', 'transform': color_map},
          legend_field='class', source=source)
p.title.text = "DNN"
bpl.show(p)


In [None]:
bpl.output_file("line2.html")

source = bpl.ColumnDataSource(Ydf_PCA)

# Sort the cluster labels numerically so a given cluster always gets the same
# colour, whatever order the labels first appear in the frame.
factors = sorted(Ydf_PCA['class'].unique(), key=int)
# d3 Category10 only ships palettes for 3 to 10 colours: take the smallest one
# that fits and trim it, so fewer than 3 clusters no longer raises KeyError.
palette = d3['Category10'][max(3, len(factors))][:len(factors)]
color_map = bmo.CategoricalColorMapper(factors=factors, palette=palette)

# create figure and plot
# width/height replace plot_width/plot_height, which Bokeh 3 removed.
p = bpl.figure(width=1400, height=1000)
# legend_field groups the legend by the values of the 'class' column;
# legend_label would create one entry literally named "class".
p.scatter(x='0', y='1',
          color={'field': 'class', 'transform': color_map},
          legend_field='class', source=source)
p.title.text = "PCA"
bpl.show(p)


In [None]:
bpl.output_file("line3.html")

source = bpl.ColumnDataSource(Ydf_DNN_TSNE)

# Sort the cluster labels numerically so a given cluster always gets the same
# colour, whatever order the labels first appear in the frame.
factors = sorted(Ydf_DNN_TSNE['class'].unique(), key=int)
# d3 Category10 only ships palettes for 3 to 10 colours: take the smallest one
# that fits and trim it, so fewer than 3 clusters no longer raises KeyError.
palette = d3['Category10'][max(3, len(factors))][:len(factors)]
color_map = bmo.CategoricalColorMapper(factors=factors, palette=palette)

# create figure and plot
# width/height replace plot_width/plot_height, which Bokeh 3 removed.
p = bpl.figure(width=1400, height=1000)
# legend_field groups the legend by the values of the 'class' column;
# legend_label would create one entry literally named "class".
p.scatter(x='0', y='1',
          color={'field': 'class', 'transform': color_map},
          legend_field='class', source=source)
p.title.text = "DNN_TSNE"
bpl.show(p)


In [None]:
import shutil

def cluster2folder(clf, df, folder_name):
    """Export a clustering as one sub-folder of symlinks per cluster.

    Under ``<path>/<folder_name>`` (``path`` is the notebook-level base
    directory), one directory is (re)created for every distinct label in
    ``clf['kmeans'].labels_``; a directory that already exists is deleted
    first. Every row of ``df`` then gets a symlink named after its 'id' inside
    the directory named after its 'class', pointing at
    ``<path>/Raw_data/<id>``.

    Parameters
    ----------
    clf : fitted pipeline whose 'kmeans' step exposes ``labels_``.
    df : DataFrame with an 'id' column (file name) and a 'class' column
        (cluster label of that file).
    folder_name : name of the results folder, relative to ``path``.
    """
    # Build directories and links from the same root, so both agree even when
    # the current working directory is no longer `path`.
    target_root = os.path.join(path, folder_name)

    for label in pd.unique(clf['kmeans'].labels_):
        cluster_dir = os.path.join(target_root, str(label))
        if os.path.isdir(cluster_dir):
            shutil.rmtree(cluster_dir)
        # makedirs also creates the results folder itself on the first run.
        os.makedirs(cluster_dir)

    # Iterate over the rows of the frame that was passed in (not a notebook
    # global), pairing each file name with its own cluster label.
    for file_id, label in zip(df['id'], df['class']):
        f_src = os.path.join(path, 'Raw_data', file_id)
        f_tar = os.path.join(target_root, str(label), file_id)
        os.symlink(f_src, f_tar)

# Export each clustering (DNN, PCA, DNN + t-SNE) to its own results folder.
for _clf, _frame, _folder in (
    (clf_kmeans_DNN, Ydf_DNN, 'Results_DNN'),
    (clf_kmeans_PCA, Ydf_PCA, 'Results_PCA'),
    (clf_kmeans_DNN_TSNE, Ydf_DNN_TSNE, 'Results_DNN_TSNE'),
):
    cluster2folder(_clf, _frame, _folder)
