In [1]:
# importation des librairies

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import cv2
import pickle

from sklearn import cluster, utils, metrics, decomposition
from scipy.cluster import hierarchy

cv2.__version__

'4.4.0'

In [2]:
# load the product dataset (the "image" and "category" columns are used by later cells)

raw_data = pd.read_csv("data/data.csv")

In [3]:
# image-processing steps, demonstrated on one example image
# (every intermediate result is saved under img/ for inspection)

img = cv2.imread("data/Images/0bff0eda7a6677dc1acb0477a1f7a121.jpg")
if img is None:
    # cv2.imread signals a missing or unreadable file by returning None
    # instead of raising; stop here with a clear error
    raise FileNotFoundError("data/Images/0bff0eda7a6677dc1acb0477a1f7a121.jpg")

# conversion to grayscale
new_img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
cv2.imwrite("img/img_grise.jpg", new_img)

# histogram equalisation
new_img = cv2.equalizeHist(new_img)
cv2.imwrite("img/img_egalisee.jpg", new_img)

# resize to 300x300 px: the image is first centred on a white square canvas
# whose side is the longest image dimension, so the aspect ratio is preserved
side = max(img.shape[0:2])
square = np.full((side, side), 255, dtype=np.uint8)
ax, ay = (side - img.shape[1]) // 2, (side - img.shape[0]) // 2
square[ay:img.shape[0] + ay, ax:ax + img.shape[1]] = new_img
new_img = cv2.resize(square, (300, 300))
cv2.imwrite("img/img_taille.jpg", new_img)

# noise removal (non-local means)
new_img = cv2.fastNlMeansDenoising(new_img, h=8)
cv2.imwrite("img/img_debruitee.jpg", new_img)

True

In [4]:
# définition de la fonction traitant les images

def transform_img(img_filename, img_dir="data/Images/", size=300, denoise_h=8, prefix="gr"):
    """
    Preprocess one image and save the result next to the original.

    Steps: grayscale conversion, histogram equalisation, centring on a white
    square canvas (keeps the aspect ratio), resizing to ``size`` x ``size``
    pixels, and non-local-means denoising.

    Parameters
    ----------
    img_filename : str
        File name of the source image inside ``img_dir``.
    img_dir : str, default "data/Images/"
        Directory holding the source image; the processed image is written
        to the same directory.
    size : int, default 300
        Side length, in pixels, of the square output image.
    denoise_h : float, default 8
        Filter strength passed to ``cv2.fastNlMeansDenoising``.
    prefix : str, default "gr"
        Prefix added to ``img_filename`` to build the output file name.

    Returns
    -------
    str
        File name (not the full path) of the processed image.

    Raises
    ------
    FileNotFoundError
        If the source image cannot be read.
    OSError
        If the processed image cannot be written.
    """
    import os  # function-scope import keeps the cell self-contained

    src_path = os.path.join(img_dir, img_filename)
    img = cv2.imread(src_path)
    if img is None:
        # cv2.imread returns None instead of raising when the file is missing
        # or cannot be decoded
        raise FileNotFoundError(f"Cannot read image: {src_path}")

    new_img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

    new_img = cv2.equalizeHist(new_img)

    # paste the image in the middle of a white square whose side is the
    # longest image dimension, then scale the square down/up to the target size
    height, width = img.shape[:2]
    side = max(height, width)
    square = np.full((side, side), 255, dtype=np.uint8)
    ax, ay = (side - width) // 2, (side - height) // 2
    square[ay:ay + height, ax:ax + width] = new_img
    new_img = cv2.resize(square, (size, size))

    new_img = cv2.fastNlMeansDenoising(new_img, h=denoise_h)

    new_filename = prefix + img_filename
    dst_path = os.path.join(img_dir, new_filename)
    # cv2.imwrite reports failure through its boolean return value
    if not cv2.imwrite(dst_path, new_img):
        raise OSError(f"Cannot write image: {dst_path}")

    return new_filename

In [5]:
# application du traitement d'image à chaque image du jeu de données

def adding_treated_image(line):
    """
    Process the image referenced by one DataFrame row.

    The file named in the row's "image" field is passed to transform_img and
    the file name it returns is stored under "treated_image". The completed
    row is returned.
    """
    treated_filename = transform_img(line["image"])
    line["treated_image"] = treated_filename
    return line

# row-wise apply: every row of raw_data gets its "treated_image" file name
data = raw_data.apply(adding_treated_image, axis="columns")

data

Unnamed: 0.1,Unnamed: 0,product_name,image,text,category,treated_image
0,0,Elegance Polyester Multicolor Abstract Eyelet ...,55b85ea15a1536d46b7190ad6fff8ce7.jpg,Elegance Polyester Multicolor Abstract Eyelet ...,Home Furnishing,gr55b85ea15a1536d46b7190ad6fff8ce7.jpg
1,1,Sathiyas Cotton Bath Towel,7b72c92c2f6c40268628ec5f14c6d590.jpg,Sathiyas Cotton Bath Towel\nSpecifications of ...,Baby Care,gr7b72c92c2f6c40268628ec5f14c6d590.jpg
2,2,Eurospa Cotton Terry Face Towel Set,64d5d4a258243731dc7bbb1eef49ad74.jpg,Eurospa Cotton Terry Face Towel Set\nKey Featu...,Baby Care,gr64d5d4a258243731dc7bbb1eef49ad74.jpg
3,3,SANTOSH ROYAL FASHION Cotton Printed King size...,d4684dcdc759dd9cdf41504698d737d8.jpg,SANTOSH ROYAL FASHION Cotton Printed King size...,Home Furnishing,grd4684dcdc759dd9cdf41504698d737d8.jpg
4,4,Jaipur Print Cotton Floral King sized Double B...,6325b6870c54cd47be6ebfbffa620ec7.jpg,Jaipur Print Cotton Floral King sized Double B...,Home Furnishing,gr6325b6870c54cd47be6ebfbffa620ec7.jpg
...,...,...,...,...,...,...
1045,1045,Oren Empower Extra Large Self Adhesive Sticker,958f54f4c46b53c8a0a9b8167d9140bc.jpg,Oren Empower Extra Large Self Adhesive Sticker...,Baby Care,gr958f54f4c46b53c8a0a9b8167d9140bc.jpg
1046,1046,Wallmantra Large Vinyl Sticker Sticker,fd6cbcc22efb6b761bd564c28928483c.jpg,Wallmantra Large Vinyl Sticker Sticker\nWallma...,Baby Care,grfd6cbcc22efb6b761bd564c28928483c.jpg
1047,1047,Uberlyfe Extra Large Pigmented Polyvinyl Films...,5912e037d12774bb73a2048f35a00009.jpg,Uberlyfe Extra Large Pigmented Polyvinyl Films...,Baby Care,gr5912e037d12774bb73a2048f35a00009.jpg
1048,1048,Wallmantra Medium Vinyl Sticker Sticker,c3edc504d1b4f0ba6224fa53a43a7ad6.jpg,Wallmantra Medium Vinyl Sticker Sticker\nBuy W...,Baby Care,grc3edc504d1b4f0ba6224fa53a43a7ad6.jpg


In [6]:
# visualise the keypoints and descriptors detected by ORB

# detector configured with nfeatures=500
orb = cv2.ORB_create(nfeatures=500)
# denoised example image written by the step-by-step processing cell
img = cv2.imread("img/img_debruitee.jpg")
kp, des = orb.detectAndCompute(img, None)
# draw the detected keypoints on the image and save the result
img2 = cv2.drawKeypoints(img, kp, None, flags=0)
cv2.imwrite("img/pts_interet.jpg", img2)
print(des)

[[ 75 191 209 ... 134 131 221]
 [141 120  21 ...  51  11 155]
 [193 157  62 ...  80 193  11]
 ...
 [142 183  58 ... 124  15 247]
 [ 61  91  83 ... 175 114 149]
 [ 57 123  88 ... 131 106 153]]


In [7]:
# keypoint detection with ORB

# detector (nfeatures=500) read as a global by descript_img
orb = cv2.ORB_create(nfeatures=500)

def descript_img(line):
    """
    Compute the ORB descriptors of one row's processed image.

    The image named by the row's "treated_image" field is read from
    data/Images/ and passed to the module-level `orb` detector. The resulting
    descriptors are stored under "descriptors"; the keypoints themselves are
    discarded. The completed row is returned.
    """
    img_path = "data/Images/" + line["treated_image"]
    _, descriptors = orb.detectAndCompute(cv2.imread(img_path), None)
    line["descriptors"] = descriptors
    return line

# row-wise apply: adds the "descriptors" column (rebinds `data`)
data = data.apply(descript_img, axis="columns")

data

Unnamed: 0.1,Unnamed: 0,product_name,image,text,category,treated_image,descriptors
0,0,Elegance Polyester Multicolor Abstract Eyelet ...,55b85ea15a1536d46b7190ad6fff8ce7.jpg,Elegance Polyester Multicolor Abstract Eyelet ...,Home Furnishing,gr55b85ea15a1536d46b7190ad6fff8ce7.jpg,"[[191, 253, 252, 87, 180, 190, 191, 190, 123, ..."
1,1,Sathiyas Cotton Bath Towel,7b72c92c2f6c40268628ec5f14c6d590.jpg,Sathiyas Cotton Bath Towel\nSpecifications of ...,Baby Care,gr7b72c92c2f6c40268628ec5f14c6d590.jpg,"[[47, 219, 122, 232, 249, 221, 93, 219, 143, 9..."
2,2,Eurospa Cotton Terry Face Towel Set,64d5d4a258243731dc7bbb1eef49ad74.jpg,Eurospa Cotton Terry Face Towel Set\nKey Featu...,Baby Care,gr64d5d4a258243731dc7bbb1eef49ad74.jpg,"[[67, 151, 221, 193, 178, 42, 61, 191, 174, 57..."
3,3,SANTOSH ROYAL FASHION Cotton Printed King size...,d4684dcdc759dd9cdf41504698d737d8.jpg,SANTOSH ROYAL FASHION Cotton Printed King size...,Home Furnishing,grd4684dcdc759dd9cdf41504698d737d8.jpg,"[[58, 243, 107, 227, 118, 211, 251, 252, 71, 1..."
4,4,Jaipur Print Cotton Floral King sized Double B...,6325b6870c54cd47be6ebfbffa620ec7.jpg,Jaipur Print Cotton Floral King sized Double B...,Home Furnishing,gr6325b6870c54cd47be6ebfbffa620ec7.jpg,"[[5, 61, 123, 3, 246, 191, 179, 242, 200, 237,..."
...,...,...,...,...,...,...,...
1045,1045,Oren Empower Extra Large Self Adhesive Sticker,958f54f4c46b53c8a0a9b8167d9140bc.jpg,Oren Empower Extra Large Self Adhesive Sticker...,Baby Care,gr958f54f4c46b53c8a0a9b8167d9140bc.jpg,"[[70, 81, 164, 150, 205, 84, 188, 141, 142, 18..."
1046,1046,Wallmantra Large Vinyl Sticker Sticker,fd6cbcc22efb6b761bd564c28928483c.jpg,Wallmantra Large Vinyl Sticker Sticker\nWallma...,Baby Care,grfd6cbcc22efb6b761bd564c28928483c.jpg,"[[36, 31, 247, 69, 246, 75, 131, 218, 22, 236,..."
1047,1047,Uberlyfe Extra Large Pigmented Polyvinyl Films...,5912e037d12774bb73a2048f35a00009.jpg,Uberlyfe Extra Large Pigmented Polyvinyl Films...,Baby Care,gr5912e037d12774bb73a2048f35a00009.jpg,"[[11, 112, 90, 197, 67, 148, 126, 84, 41, 96, ..."
1048,1048,Wallmantra Medium Vinyl Sticker Sticker,c3edc504d1b4f0ba6224fa53a43a7ad6.jpg,Wallmantra Medium Vinyl Sticker Sticker\nBuy W...,Baby Care,grc3edc504d1b4f0ba6224fa53a43a7ad6.jpg,"[[83, 134, 231, 223, 182, 187, 238, 183, 127, ..."


In [8]:
# stack the descriptors of every image into one (n_descriptors, 32) array

# Collect the per-image arrays first and concatenate once: concatenating
# inside the loop re-copies the whole accumulated array at every iteration.
descriptor_blocks = []
for i, desc in data["descriptors"].items():
    # rows without a descriptor array (e.g. no keypoint was detected) are
    # reported and skipped; any other error is no longer silently swallowed
    if not isinstance(desc, np.ndarray):
        print(f"Problème avec l'image de l'article {i}")
        continue
    descriptor_blocks.append(desc)

# the empty float64 (0, 32) seed keeps the result's dtype and shape unchanged,
# including when no image has descriptors
descriptors_all = np.concatenate([np.empty((0, 32)), *descriptor_blocks])

print(f"\nNombre total de descripteurs : {descriptors_all.shape[0]}")


Nombre total de descripteurs : 474512


In [9]:
# stratified sample of the data (10%)

# fraction of the rows to keep; the sample size follows the dataset size
# instead of being hard-coded
SAMPLE_FRACTION = 0.10

sample_data = utils.resample(data,
                             n_samples=int(round(len(data) * SAMPLE_FRACTION)),
                             replace=False,
                             stratify=data["category"],
                             random_state=42)

# number of sampled rows per category
sample_data.groupby(by="category").size()

category
Baby Care                     15
Beauty and Personal Care      15
Computers                     15
Home Decor & Festive Needs    15
Home Furnishing               15
Kitchen & Dining              15
Watches                       15
dtype: int64

In [10]:
# stack the ORB descriptors of the sampled images

# Collect the per-image arrays first and concatenate once (a concatenate per
# iteration re-copies the accumulated array every time).
sample_blocks = []
for i, desc in enumerate(sample_data["descriptors"]):
    # i is the position of the row within the sample; rows without a
    # descriptor array are reported and skipped, other errors are not hidden
    if not isinstance(desc, np.ndarray):
        print(f"Problème avec l'image de l'article {i}")
        continue
    sample_blocks.append(desc)

# the empty float64 (0, 32) seed keeps the result's dtype and shape unchanged
descriptors_sample = np.concatenate([np.empty((0, 32)), *sample_blocks])

descriptors_sample.shape

(47426, 32)

In [11]:
# group the descriptors into 2000 visual words with k-means
# NOTE(review): the model is fitted on descriptors_all (every image), not on
# descriptors_sample built from the stratified sample — confirm this is intended.

km = cluster.KMeans(n_clusters=2000, init="k-means++", n_init=1, random_state=42)
km.fit(descriptors_all)

KMeans(n_clusters=2000, n_init=1, random_state=42)

In [12]:
# count the visual words of each image (bag-of-visual-words histogram)

# One column per k-means cluster and one row per image; both sizes are taken
# from the fitted model and the data instead of being hard-coded.
n_visual_words = km.n_clusters
word_counts = np.zeros((len(data), n_visual_words), dtype=int)
for row, desc in enumerate(data["descriptors"]):
    # rows without descriptors keep an all-zero histogram
    if not isinstance(desc, np.ndarray) or len(desc) == 0:
        continue
    # a single predict call labels all descriptors of the image at once;
    # bincount then tallies how many descriptors fall in each visual word
    word_counts[row] = np.bincount(km.predict(desc), minlength=n_visual_words)

features_freq = pd.DataFrame(word_counts, index=data.index, columns=range(n_visual_words))

features_freq

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1990,1991,1992,1993,1994,1995,1996,1997,1998,1999
0,0,0,1,0,0,0,0,0,0,1,...,0,0,0,2,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,1
2,0,0,0,0,0,1,0,0,0,0,...,0,0,0,1,0,0,0,0,0,1
3,0,0,0,0,0,0,0,0,0,1,...,1,0,0,1,0,0,1,0,0,0
4,0,0,0,0,1,0,1,0,0,0,...,0,0,1,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1045,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
1046,0,0,0,0,0,0,0,0,0,0,...,0,0,2,0,1,0,0,1,0,0
1047,0,0,0,0,1,2,0,0,0,1,...,0,0,0,0,0,0,0,0,1,0
1048,0,1,0,0,0,0,0,0,1,0,...,0,0,1,0,1,0,0,0,0,0


In [13]:
# convert the counts into a tf-idf matrix

nb_documents, nb_words = features_freq.shape
counts = features_freq.astype("float64")

# tf: share of each visual word among the descriptors of an image.
# An image with no descriptor at all keeps a row of zeros (its total is
# replaced by 1 to avoid a 0/0 division).
total_lines = counts.sum(axis=1)
term_freq = counts.div(total_lines.where(total_lines > 0, 1.0), axis=0)

# idf: number of documents / number of documents containing the word
# (plain ratio, no logarithm — same formula as the previous loop version).
# A word that appears in no document gets idf 0 instead of a division by zero.
doc_freq = (counts > 0).sum(axis=0).to_numpy()
idf = np.zeros(nb_words, dtype="float64")
present = doc_freq > 0
idf[present] = nb_documents / doc_freq[present]

# element-wise tf * idf, computed for the whole matrix at once
tf_idf_matrix = term_freq.mul(idf, axis=1)

In [14]:
# display the tf-idf matrix
tf_idf_matrix

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1990,1991,1992,1993,1994,1995,1996,1997,1998,1999
0,0.0,0.000000,0.017934,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.015390,...,0.000000,0.000000,0.000000,0.033995,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
1,0.0,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,...,0.012626,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.011574
2,0.0,0.000000,0.000000,0.0,0.000000,0.013228,0.000000,0.0,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.016190,0.000000,0.000000,0.000000,0.000000,0.000000,0.010044
3,0.0,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.014842,...,0.011094,0.000000,0.000000,0.016393,0.000000,0.000000,0.010821,0.000000,0.000000,0.000000
4,0.0,0.000000,0.000000,0.0,0.011091,0.000000,0.012437,0.0,0.000000,0.000000,...,0.000000,0.000000,0.011822,0.000000,0.000000,0.011758,0.000000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1045,0.0,0.000000,0.000000,0.0,0.011299,0.000000,0.000000,0.0,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.011979,0.000000,0.000000,0.000000,0.000000
1046,0.0,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,...,0.000000,0.000000,0.025029,0.000000,0.010846,0.000000,0.000000,0.012245,0.000000,0.000000
1047,0.0,0.000000,0.000000,0.0,0.011299,0.026733,0.000000,0.0,0.000000,0.014811,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.012971,0.000000
1048,0.0,0.012288,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.016548,0.000000,...,0.000000,0.000000,0.013639,0.000000,0.011820,0.000000,0.000000,0.000000,0.000000,0.000000


In [15]:
# agglomerative (Ward) clustering cut into 7 clusters, scored against the categories

linkage_matrix = hierarchy.linkage(tf_idf_matrix, method="ward")

clusters = hierarchy.fcluster(linkage_matrix, t=7, criterion="maxclust")

# adjusted Rand index between the clusters and the product categories
metrics.adjusted_rand_score(clusters, data["category"])

0.024789467882759746

In [16]:
# k-means clustering into 7 clusters, scored against the categories

# random_state makes the reported score reproducible; n_init is set
# explicitly so the number of restarts does not depend on the library default.
# Note: this rebinds `km`, replacing the 2000-visual-word model fitted earlier;
# re-run that cell before recounting visual words.
km = cluster.KMeans(n_clusters=7, n_init=10, random_state=42)
km.fit(tf_idf_matrix)

# adjusted Rand index between the k-means labels and the product categories
metrics.adjusted_rand_score(km.labels_, data["category"])

0.014275311405382293

In [17]:
# dimensionality reduction

# random_state fixes the result: depending on the scikit-learn version,
# svd_solver="auto" may pick the randomized solver for this shape, which is
# stochastic without a seed
pca = decomposition.PCA(n_components=300, random_state=42)
reduced_data = pca.fit_transform(tf_idf_matrix)

# indices of the components at which the cumulative explained variance exceeds 85%
print(np.where(pca.explained_variance_ratio_.cumsum()>0.85))

(array([269, 270, 271, 272, 273, 274, 275, 276, 277, 278, 279, 280, 281,
       282, 283, 284, 285, 286, 287, 288, 289, 290, 291, 292, 293, 294,
       295, 296, 297, 298, 299]),)


In [18]:
# keep only the leading components needed to exceed 85% of cumulative explained variance

cumulative_variance = pca.explained_variance_ratio_.cumsum()
# searchsorted(side="right") gives the index of the first component whose
# cumulative ratio is > 0.85; +1 turns that index into a component count.
# The count is capped at the number of available columns, which also keeps
# this cell safe to re-run.
n_kept = min(int(np.searchsorted(cumulative_variance, 0.85, side="right")) + 1,
             reduced_data.shape[1])
reduced_data = reduced_data[:, :n_kept]
reduced_data.shape

(1050, 270)

In [19]:
# agglomerative (Ward) clustering of the reduced data into 7 clusters, scored against the categories

linkage_matrix = hierarchy.linkage(reduced_data, method="ward")

clusters = hierarchy.fcluster(linkage_matrix, t=7, criterion="maxclust")

# adjusted Rand index between the clusters and the product categories
metrics.adjusted_rand_score(clusters, data["category"])

0.014282408461212894

In [20]:
# k-means clustering of the reduced data into 7 clusters, scored against the categories

# random_state makes the reported score reproducible; n_init is set
# explicitly so the number of restarts does not depend on the library default.
# Note: this rebinds `km` (the 2000-visual-word model was fitted under the
# same name earlier in the notebook).
km = cluster.KMeans(n_clusters=7, n_init=10, random_state=42)
km.fit(reduced_data)

# adjusted Rand index between the k-means labels and the product categories
metrics.adjusted_rand_score(km.labels_, data["category"])

0.018194958142128386

In [21]:
# save the sampled data
# NOTE(review): the "descriptors" column holds NumPy arrays, which to_csv writes
# as their text representation; large arrays are presumably abbreviated with
# "..." and could not be rebuilt from this file — confirm whether the column is
# needed downstream. The row index is also written as an extra unnamed column.

sample_data.to_csv("data/sample_data.csv")