# Overview

- Goal: Cluster the articles by image similarity and compare the clusters with the categories provided in the articles table. How well do the categories group similar items together? Are there groups of similar items that the existing categorization does not capture?
- The first part of this notebook was taken from this notebook https://www.kaggle.com/hamditarek/similar-image-cnn-cosine-similarity, and then extended to look into clustering. 
- A side output of this notebook is a CSV of the feature vectors, so that others can load it directly without having to repeat the transformation.

# Library installations

In [None]:
!pip install --upgrade numpy

In [None]:
!pip install --upgrade pycaret

# Data Loading and Feature Generation
- This step is taken from https://www.kaggle.com/hamditarek/similar-image-cnn-cosine-similarity

In [None]:
import os
import cv2
import numpy as np
import pandas as pd
from sklearn.utils import shuffle
from sklearn.preprocessing import LabelBinarizer
from keras.applications.xception import Xception,preprocess_input
import tensorflow as tf
from keras.preprocessing import image
from keras.layers import Input
from keras.backend import reshape
from sklearn.neighbors import NearestNeighbors
import matplotlib.pyplot as plt

In [None]:
# Upper bound on the number of image paths kept by load_data().
N_IMAGE = 10000

# Root directory that load_data() walks to collect image files.
images_dir = '../input/h-and-m-personalized-fashion-recommendations/images'

def getImagePaths(path):
    """
    Collect the full path of every file found below a directory.

    The directory tree is walked recursively with os.walk and the paths are
    returned in the order the walk yields them (no sorting is applied).

    parameters: path(string) - Root directory to walk
    returns: list of strings - one "directory/filename" path per file
    """
    return [
        os.path.join(folder, fname)
        for folder, _subdirs, fnames in os.walk(path)
        for fname in fnames
    ]

def preprocess_img(img_path):
    """
    Load one image from disk and prepare it as a single-image batch.

    The image is read with OpenCV, resized to 225x225 with nearest-neighbour
    interpolation, given a leading batch axis and passed through the Xception
    preprocess_input function.

    parameters: img_path(string) - Path of the image file
    returns: array with a leading batch axis of length 1
    raises: ValueError - if OpenCV cannot read the file
    """
    dsize = (225,225)
    new_image = cv2.imread(img_path)
    # cv2.imread reports failure by returning None instead of raising; stop here
    # with the offending path rather than with an opaque error inside cv2.resize.
    if new_image is None:
        raise ValueError('Could not read image: ' + str(img_path))
    # NOTE(review): cv2.imread yields BGR channel order, while the ImageNet
    # weights presumably expect RGB - confirm whether a conversion is wanted.
    new_image = cv2.resize(new_image, dsize, interpolation=cv2.INTER_NEAREST)
    new_image = np.expand_dims(new_image, axis=0)
    new_image = preprocess_input(new_image)
    return new_image

def load_data():
    """
    Return the image paths that will be processed.

    parameters: none (reads the module-level images_dir and N_IMAGE)
    returns: list of strings - at most N_IMAGE paths found under images_dir
    """
    all_paths = getImagePaths(images_dir)
    return all_paths[:N_IMAGE]

def model():
    """
    Build a frozen Xception network used as feature extractor.

    The network is created with the 'imagenet' weights and without its top
    layers, and every layer is marked as non-trainable.

    returns: the Xception model instance
    """
    backbone = Xception(weights='imagenet', include_top=False)
    for lyr in backbone.layers:
        lyr.trainable = False
    return backbone

def feature_extraction(image_data,model):
    """
    Run the model on the given input and return the result as a flat vector.

    parameters: image_data - input handed unchanged to model.predict
                model - object exposing a predict(image_data) method
    returns: one-dimensional numpy array holding all predicted values
    """
    prediction = model.predict(image_data)
    # np.array makes an independent copy; reshape(-1) collapses it to one axis.
    return np.array(prediction).reshape(-1)

In [None]:
# features=[]
# output=load_data()
# main_model=model()
# #Limiting the data for training
# for i in output[:N_IMAGE-1]:
#     new_img=preprocess_img(i)
#     features.append(feature_extraction(new_img,main_model))
# feature_vec = np.array(features)

In [None]:
# dfFeatures = pd.DataFrame(feature_vec)

In [None]:
# dfFeatures.head()

# Get article_id and merge back to articles table

In [None]:
def getImagePaths_articleID(path):
    """
    Collect the bare file name of every file found below a directory.

    Only the file names (extension included) are returned, without their
    directory, in the order os.walk yields them.

    parameters: path(string) - Root directory to walk
    returns: list of strings - one file name per file
    """
    return [
        fname
        for _folder, _subdirs, fnames in os.walk(path)
        for fname in fnames
    ]

In [None]:
# list_article_id = getImagePaths_articleID(images_dir)

In [None]:
# dfFeatures['article_id'] = [x[1:-4] for x in list_article_id[:N_IMAGE-1]]
# dfFeatures.head()

In [None]:
# dfFeatures.to_csv('dfFeatures.csv')

# Clustering Visualization and EDA


### Load saved csv data

In [None]:
import random

p = 0.5  # probability with which each data row of the csv is kept
# A dedicated, seeded generator makes the row sample (and everything derived
# from it) reproducible, in line with the fixed random_state used for PCA and
# KMeans, and leaves the global random state untouched.
_rng = random.Random(123)
dfFeatures = pd.read_csv('../input/clustering-based-on-image-similarity-vs-categories/dfFeatures.csv',
                         header=0,
                         # row 0 is the header and must never be skipped
                         skiprows=lambda i: i>0 and _rng.random() > p)

In [None]:
# Preview the first rows of the loaded feature table.
dfFeatures.head()

In [None]:
# 'Unnamed: 0' is presumably the index column saved with the csv; it is not an
# image feature and must not reach PCA. drop() returns a new frame, so the
# result has to be assigned back for the column to actually disappear.
if 'Unnamed: 0' in dfFeatures.columns:
    dfFeatures = dfFeatures.drop('Unnamed: 0',axis=1)
else:
    print("No columns needed to be drop")

In [None]:
# Keep the ids aside; every other column is used as input for PCA.
dfFeatures_articleid = dfFeatures.loc[:, 'article_id']
dfFeatures_features = dfFeatures.drop(columns=['article_id'])

In [None]:
from sklearn.decomposition import PCA

# Number of principal components to keep.
n = 10
# random_state is fixed so repeated runs give the same projection.
dfFeaturesPCA = PCA(n_components=n, random_state=123).fit_transform(dfFeatures_features)

In [None]:
# Wrap the PCA output in a DataFrame and preview it.
dfFeaturesPCA = pd.DataFrame(dfFeaturesPCA)
dfFeaturesPCA.head()

In [None]:
# Name the component columns once and keep the list: the clustering cells
# select the PCA columns through cols_pca, so it has to exist from here on.
cols_pca = ['pca'+str(i+1) for i in range(10) ]
# Both objects carry a default integer index, so the ids line up row by row.
dfFeaturesPCA['article_id'] = dfFeatures_articleid
dfFeaturesPCA.columns = cols_pca + ['article_id']
dfFeaturesPCA.head()

### With smaller data, we can merge with articles dataset and save it

In [None]:
dfArticles = pd.read_csv(
    '../input/h-and-m-personalized-fashion-recommendations/articles.csv'
)
# Right join: every row of the PCA table is kept and the article attributes
# are attached where the article_id matches.
dfArticlesFeaturesPCA = pd.merge(
    dfArticles, dfFeaturesPCA, how='right', on='article_id'
)
dfArticlesFeaturesPCA.head(n=3)

In [None]:
# List the columns available after the merge.
dfArticlesFeaturesPCA.columns

In [None]:
# Save the merged article + PCA table to the working directory for reuse.
dfArticlesFeaturesPCA.to_csv('dfArticlesFeaturesPCA.csv')

## Do clustering based on PCA 10 and visualize it

In [None]:
# from pycaret.clustering import *
# s = setup(dfArticlesFeaturesPCA[cols_pca], normalize = True)
# kmeans = create_model('kmeans')
# plot_model(kmeans, plot = 'elbow')

In [None]:
# Preview the PCA component columns that feed the clustering step
# (cols_pca must already be defined in the session when this runs).
dfArticlesFeaturesPCA[cols_pca].head()

In [None]:
from sklearn.cluster import KMeans

# Cluster the PCA components into 8 groups; the seed keeps labels repeatable.
arrKMeans = (
    KMeans(n_clusters=8, random_state=123)
    .fit(dfArticlesFeaturesPCA.loc[:, cols_pca])
    .labels_
)
# Store the cluster label of each article next to its attributes.
dfArticlesFeaturesPCA['KMeans'] = arrKMeans
dfArticlesFeaturesPCA.head()

In [None]:
import seaborn as sns
import warnings
# NOTE(review): this silences every warning for the rest of the session,
# including deprecation notices raised by later plotting calls.
warnings.filterwarnings("ignore")

In [None]:
# Combined with image visualization below

# Article attributes whose distribution is plotted for every cluster.
list_categories = ['product_group_name', 'graphical_appearance_name', 'colour_group_name','index_group_name']
N_categories = len(list_categories)
# Walk the cluster labels actually present instead of a hard-coded range(8),
# so the loop keeps working when the number of clusters changes.
for i in sorted(dfArticlesFeaturesPCA.KMeans.unique()):
    subset = dfArticlesFeaturesPCA[dfArticlesFeaturesPCA.KMeans==i]
    # One row of count plots per cluster, one panel per attribute.
    fig,axes = plt.subplots(1,N_categories,figsize=(N_categories*5,3))
    for j, categ in enumerate(list_categories):
        # The column must be passed as x=...: recent seaborn versions no
        # longer accept it as a positional argument.
        sns.countplot(x=categ,data=subset,ax=axes[j])

In [None]:
# Count the articles of cluster 0 per value of the category column.
# NOTE(review): categ is whatever the previous plotting loop left behind -
# confirm this is the intended column.
subset = dfArticlesFeaturesPCA[dfArticlesFeaturesPCA.KMeans==0]
# 'article_id' has to be a column label (string), and the method that turns
# the group keys back into a column is reset_index.
subset_group = subset.groupby(categ)['article_id'].count().reset_index()

In [None]:
import umap

# Names of the ten PCA component columns: pca1 ... pca10.
cols_pca = ['pca' + str(k) for k in range(1, 11)]
# Fit a UMAP embedding with default settings on the PCA components.
mapper = umap.UMAP().fit(dfArticlesFeaturesPCA.loc[:, cols_pca])

In [None]:
import umap.plot
# Plot the fitted embedding, coloured by KMeans cluster label.
umap.plot.points(mapper, labels=dfArticlesFeaturesPCA.KMeans)

In [None]:
# Same embedding, coloured by the index_group_name column for comparison.
umap.plot.points(mapper, labels=dfArticlesFeaturesPCA.index_group_name)

## Visualize Image

In [None]:
N_CLUSTERS = 8
N_SAMPLEITEM = 10

# cluster label -> array with the first N_SAMPLEITEM article ids of that cluster
sample_items = {}
for i in range(N_CLUSTERS):
    subset = dfArticlesFeaturesPCA.loc[dfArticlesFeaturesPCA['KMeans'] == i]
    sample_items[i] = subset['article_id'].head(N_SAMPLEITEM).to_numpy()
sample_items

In [None]:
# Distinct product groups present in the merged table.
dfArticlesFeaturesPCA['product_group_name'].unique()

In [None]:
import matplotlib.image as mpimg

def getImgFromArticle(article_id):
    """
    Read the image that belongs to an article.

    The file is looked up as <root>/<first three characters of the padded
    id>/<padded id>.jpg below the H&M images directory.

    parameters: article_id(int or string) - id with or without leading zeros
    returns: the image as returned by matplotlib.image.imread
    """
    # zfill instead of a fixed '0' prefix: gives the same result for the usual
    # nine-digit integer ids, and also works for ids that are already padded
    # strings or that lost more than one leading zero when parsed as integers.
    # Assumes file names are ten characters long - TODO confirm against dataset.
    padded_id = str(article_id).zfill(10)
    subfolder = padded_id[:3]
    filename = padded_id + '.jpg'
    filename_root = '../input/h-and-m-personalized-fashion-recommendations/images/'
    filename_path = filename_root + subfolder + '/' + filename
    img = mpimg.imread(filename_path)
    return img


# Article attributes profiled for every cluster.
list_categories = [
    'product_group_name',
    'graphical_appearance_name',
    'colour_group_name',
    'index_group_name',
]
N_categories = len(list_categories)

# Distinct values of each attribute over the whole table, used as a common
# category order so the count plots of different clusters are comparable.
dict_hueorder = {
    column: dfArticlesFeaturesPCA[column].unique()
    for column in list_categories
}

def VisualizeSamples(sample_items):
    """
    Show sample images and category profiles for each cluster.

    For every cluster in sample_items two figures are drawn: a row with the
    sample article images, and a row of count plots (one per entry of
    list_categories) over all articles assigned to that cluster.

    parameters: sample_items(dict) - cluster label -> sequence of article ids
    returns: None (figures are created through matplotlib)
    """
    for i, article_ids in sample_items.items():
        # Images
        # squeeze=False keeps axes two-dimensional even for a single column,
        # so axes[0] is always an indexable row of panels.
        fig, axes = plt.subplots(1,N_SAMPLEITEM,figsize=(N_SAMPLEITEM * 3, 3),squeeze=False)
        plt.suptitle('Image for Cluster ' + str(i))
        # A cluster may hold fewer than N_SAMPLEITEM articles; zip stops at the
        # shorter side instead of indexing past the end of the samples.
        for ax, article_id in zip(axes[0], article_ids):
            ax.imshow(getImgFromArticle(article_id))

        # Profiling
        subset = dfArticlesFeaturesPCA[dfArticlesFeaturesPCA.KMeans==i]
        fig,axes = plt.subplots(1,N_categories,figsize=(N_categories*8,3),squeeze=False)
        plt.suptitle('Profiling for Cluster ' + str(i))
        for ax, categ in zip(axes[0], list_categories):
            # The column must be passed as x=...: recent seaborn versions no
            # longer accept it as a positional argument.
            sns.countplot(x=categ,data=subset,ax=ax,orient='v',order=dict_hueorder[categ])
            ax.tick_params(size=4,labelrotation=70)
        
            
# Render the sample images and the category profile of every cluster.
VisualizeSamples(sample_items)

# Comment
The grouping based on image feature vector is not so neat yet - WIP