## Shopee 데이터를 이용한 유사 상품 이미지 매칭모델 만들기

Source : https://www.kaggle.com/competitions/shopee-product-matching

### 1. 라이브러리 불러오기

In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive so the
# dataset paths used below resolve.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# utils
import os
from glob import glob
from termcolor import colored

# for data analysis
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# for image
import cv2
from PIL import Image

# for KNN Search (ML)
from sklearn.neighbors import NearestNeighbors

# for image embedding (DL)
import tensorflow as tf
from tensorflow.keras.applications import ResNet101

### 2. 데이터 불러오기

In [None]:
# Root folder of the sample data. It ends with "/" because the paths below
# are built by plain string concatenation.
base_path = "/content/drive/MyDrive/Colab Notebooks/data/shopee_samples/"
# Load the train and test tables from the CSV files under base_path.
train = pd.read_csv(base_path + "train_samples.csv")
test = pd.read_csv(base_path + "test.csv")

In [None]:
# Bare expression as the cell's last line: the notebook renders the train table.
train

In [None]:
# Bare expression as the cell's last line: the notebook renders the test table.
test

In [None]:
# Print the DataFrame.info() summary of the train table.
train.info()

In [None]:
# Print the DataFrame.info() summary of the test table.
test.info()

### 3. EDA

#### 3-1. Target value EDA

In [None]:
# Report how many distinct label_group values occur in train; the count is
# passed through termcolor's colored() with 'yellow'.
print(f"Number of Unique Label Groups: {colored(train.label_group.nunique(), 'yellow')}")

In [None]:
# Ten most frequent label groups: shown first as a table, then as a bar chart.

label_group_counts = train.label_group.value_counts()
top10 = label_group_counts.iloc[:10]
top10_df = pd.DataFrame({"Id": top10.index, "Count": top10.values})
display(top10_df)

sns.set_palette("Set2")
plt.figure(figsize=(12, 6))
# `order` pins the bars to the frequency ranking held in top10_df.
bar_ax = sns.barplot(data=top10_df, x="Id", y="Count", order=top10_df.Id)
plt.xticks(rotation=45)
bar_ax.set_xlabel("Label Group", fontsize=14)
bar_ax.set_ylabel("Image Count", fontsize=14)
bar_ax.set_title("Top10 Label Groups by Image Count")
plt.show()

#### 3-2. Image features EDA

In [None]:
# Image folders under base_path; the trailing "/" is kept because later code
# appends file names to these strings directly.
train_img_folder = base_path + "train_images/"
test_img_folder = base_path + "test_images/"

In [None]:
def getImagePaths(path):
    """Return the sorted paths of all ``*.jpg`` files directly inside ``path``.

    Args:
        path: Folder to search, with or without a trailing path separator.

    Returns:
        List of matching file paths in lexicographic order. The list is empty
        when the folder has no ``.jpg`` files.
    """
    # os.path.join adds the separator only when it is missing, so "dir" and
    # "dir/" search the same folder; plain concatenation would turn "dir"
    # into the pattern "dir*.jpg" and match nothing inside the folder.
    pattern = os.path.join(path, "*.jpg")
    # glob yields entries in filesystem order; sorting makes slices such as
    # paths[0:16] reproducible between runs and machines.
    return sorted(glob(pattern))

In [None]:
# Collect the .jpg paths found in the train and test image folders.
train_img_paths = getImagePaths(train_img_folder)
test_img_paths = getImagePaths(test_img_folder)

In [None]:
# Report how many image paths were collected for each split.
print(f"Number of train images: {len(train_img_paths)}\n")
print(f"Number of test images: {len(test_img_paths)}\n")

In [None]:
def get_image_Shape(img_paths):
    """Report whether every image in ``img_paths`` has the same array shape.

    The first readable image sets the reference shape and every later image
    is compared against it.

    Args:
        img_paths: Iterable of image file paths to load with ``cv2.imread``.

    Returns:
        A message string: the first path whose shape differs from the
        reference, the first path that could not be read, a note that there
        was nothing to check, or a confirmation that all shapes match.
    """
    reference_shape = None
    for img_path in img_paths:
        img = cv2.imread(img_path)
        # cv2.imread returns None for a missing or undecodable file instead
        # of raising, so check before touching .shape.
        if img is None:
            return f"Could not read image {img_path}."
        if reference_shape is None:
            reference_shape = img.shape
        elif img.shape != reference_shape:
            return f"Has Different image shape in {img_path} with {img.shape}."
    if reference_shape is None:
        return "No images to check."
    # Reached only after the loop has visited every image, so the "same shape"
    # verdict covers the whole collection rather than just the first element.
    return f"Has Same image shape, {reference_shape}"

In [None]:
# Print the report string returned by get_image_Shape for each image set.
print(get_image_Shape(train_img_paths))
print(get_image_Shape(test_img_paths))

In [None]:
def get_image_pixelmat(img_path):
    """Load an image and flatten it into a (pixels, channels) matrix.

    Prints the minimum and maximum pixel values and the shape of the
    flattened matrix.

    Args:
        img_path: Path of the image file to load with ``cv2.imread``.

    Returns:
        2-D array with one row per pixel and one column per channel.
    """
    img = cv2.imread(img_path)
    lowest, highest = np.min(img), np.max(img)
    print("Min pixel value : ", lowest)
    print("Max pixel value : ", highest)

    # Merge the two spatial axes into one and keep the channel axis.
    n_pixels = img.shape[0] * img.shape[1]
    n_channels = img.shape[2]
    pixel_matrix = img.reshape((n_pixels, n_channels))
    print("Shape of pixel matrix : ", pixel_matrix.shape)
    return pixel_matrix

In [None]:
# Inspect the first train image; as the cell's last expression, the returned
# pixel matrix is rendered as well.
get_image_pixelmat(train_img_paths[0])

In [None]:
def display_multiple_img(img_paths, nrows, ncols, title):
    """Show images in an ``nrows`` x ``ncols`` grid under a common title.

    Images fill the grid row by row. Paths beyond ``nrows * ncols`` are
    ignored and are not loaded.

    Args:
        img_paths: Iterable of image file paths to load with ``cv2.imread``.
        nrows: Number of grid rows.
        ncols: Number of grid columns.
        title: Figure-level title.
    """
    # squeeze=False always returns a 2-D array of Axes, so a 1x1 grid can be
    # flattened the same way as any other grid.
    fig, axes = plt.subplots(nrows=nrows, ncols=ncols, figsize=(16, 8), squeeze=False)
    plt.suptitle(title, fontsize=20)
    # zip stops at the shorter input, so surplus paths are skipped without
    # a catch-all except, which would also hide real errors.
    for axis, img_path in zip(axes.ravel(), img_paths):
        img = cv2.imread(img_path)
        # OpenCV loads channels as BGR; matplotlib expects RGB.
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
        axis.imshow(img)
        axis.set_axis_off()
    plt.tight_layout()
    plt.show()

In [None]:
# Preview the first 16 collected train image paths on a 4x4 grid.
display_multiple_img(train_img_paths[0:16], 4, 4, "Train Images")

In [None]:
# Preview the test images on a 1x3 grid.
display_multiple_img(test_img_paths, 1, 3, "Test Images")

### 4. 이미지 임베딩하여 가까운 이미지 찾기

- pretrained model(ResNet 101)을 이용하여 Image를 embedding vector로 만듭니다.

- KNN Search를 이용하여 가까운 이미지를 찾습니다.

In [None]:
class DataGenerator(tf.keras.utils.Sequence):
    """Keras ``Sequence`` yielding batches of resized images listed in a DataFrame.

    Each batch is a float32 array of shape
    ``(batch, img_size, img_size, 3)``. No labels are returned.

    Args:
        df: DataFrame with an ``image`` column holding file names.
        img_size: Side length in pixels that every image is resized to.
        batch_size: Maximum number of images per batch.
        path: Folder prefix prepended to each file name by string
            concatenation, so it must end with a path separator.
    """

    def __init__(self, df, img_size=256, batch_size=32, path=train_img_folder):
        # Run the base initializer; newer Keras versions warn when a Sequence
        # subclass skips it.
        super().__init__()
        self.df = df
        self.img_size = img_size
        self.batch_size = batch_size
        self.path = path
        self.indexes = np.arange(len(self.df))

    def __len__(self):
        """Return the number of batches per epoch (the last one may be partial)."""
        # Ceiling division: a trailing partial batch still counts as a batch.
        return (len(self.df) + self.batch_size - 1) // self.batch_size

    def __getitem__(self, index):
        """Return batch number ``index`` as an image array."""
        start = index * self.batch_size
        batch_indexes = self.indexes[start:start + self.batch_size]
        return self.__data_generation(batch_indexes)

    def __data_generation(self, indexes):
        """Load and resize the images at the given row positions.

        Raises:
            FileNotFoundError: If an image cannot be read from disk.
        """
        X = np.zeros((len(indexes), self.img_size, self.img_size, 3), dtype='float32')
        image_names = self.df.iloc[indexes]["image"]
        for i, image_name in enumerate(image_names):
            img_file = self.path + image_name
            img = cv2.imread(img_file)
            # cv2.imread returns None on failure; raise here with the path
            # rather than letting cv2.resize fail with an opaque error.
            if img is None:
                raise FileNotFoundError(f"Could not read image: {img_file}")
            # NOTE(review): pixels are passed on exactly as cv2 loads them
            # (BGR channel order, 0-255 range) with no model-specific
            # preprocessing; confirm this is what the embedding model expects.
            X[i] = cv2.resize(img, (self.img_size, self.img_size))
        return X

In [None]:
# ResNet101 with ImageNet weights and no classification head
# (include_top=False); pooling="avg" makes the model output one pooled
# vector per input image.
model = ResNet101(weights="imagenet", include_top=False, pooling="avg", input_shape=None)
# Batches of up to 128 train images; img_size and path use DataGenerator's defaults.
train_gen = DataGenerator(train, batch_size=128)

In [None]:
# Run the model over every batch from train_gen to get the image embeddings.
image_embeddings = model.predict(train_gen, verbose=1)

In [None]:
# Fit a nearest-neighbour index on the embeddings, then query it with the
# same embeddings to get the K closest rows for every image.
# NOTE(review): because the query set is the fitted set, each row's nearest
# neighbour is presumably the row itself - confirm before using it as a match.
K = 10
knn = NearestNeighbors(n_neighbors=K)
knn.fit(image_embeddings)
distances, indices = knn.kneighbors(image_embeddings)

In [None]:
# For each of the first five query images, draw its nearest neighbours as
# ROWS figures of COLS images each.
ROWS = 2
COLS = 4
for c in range(5):
    print("Cluster ",c)
    # `indices` holds row positions in the embedding matrix, so select rows by
    # position (iloc) instead of by index label. The slice length follows the
    # grid size, which must not exceed the number of neighbours that were
    # queried.
    t = train.iloc[indices[c, :ROWS * COLS]]
    for k in range(ROWS):
        plt.figure(figsize=(20, 5))
        for j in range(COLS):
            row = COLS * k + j
            # Read the file name by column name, as DataGenerator does,
            # instead of relying on the column's position.
            name = t["image"].iloc[row]
            img = cv2.imread(train_img_folder + name)

            # OpenCV loads BGR; reverse the channel axis to get RGB for matplotlib.
            img = img[:, :, ::-1]

            plt.subplot(1, COLS, j + 1)
            plt.axis('off')
            plt.imshow(img)
        plt.show()

### References

1. https://en.wikipedia.org/wiki/Data_analysis#Initial_data_analysis

2. https://www.kaggle.com/ishandutta/v7-shopee-indepth-eda-one-stop-for-all-your-needs

3. https://www.kaggle.com/heyytanay/shopee-eda-understanding-the-competition?scriptVersionId=60038837

4. https://www.kaggle.com/ruchi798/shopee-eda-rapids-preprocessing-w-b?scriptVersionId=59674647

5. https://www.kaggle.com/isaienkov/shopee-data-understanding-and-analysis