In [None]:
# List the GPUs visible to this runtime (one line per device).
!nvidia-smi -L

GPU 0: Tesla P100-PCIE-16GB (UUID: GPU-0553a41a-41e3-9e81-a52e-420df0134a7a)


# Download Dataset

In [None]:
# Removed
# Competition Use Only

!mkdir '/content/feature_data/'

# Code

In [None]:
import numpy as np
import pandas as pd
import joblib

import tensorflow as tf
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.applications import EfficientNetB0, ResNet50

from scipy.spatial import distance
from scipy.stats import skew, kurtosis

from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from lightgbm.sklearn import LGBMClassifier

from PIL import Image, ImageFile
ImageFile.LOAD_TRUNCATED_IMAGES = True

In [None]:
# Project folders. DATA_DIR and OUTPUTS_DIR are derived from PROJECT_DIR by
# plain string concatenation, so PROJECT_DIR must keep its trailing slash.
# NOTE(review): the path looks like a mounted Google Drive — confirm the mount
# happens before this cell runs.
PROJECT_DIR = '/content/drive/MyDrive/Colab Projects/product-pair-matching/'
DATA_DIR = PROJECT_DIR+'data/'
OUTPUTS_DIR = PROJECT_DIR+'outputs/'

# Folders holding the training / test images referenced by the CSVs below.
TRAIN_IMG_DIR = '/content/data/training_img/training_img'
TEST_IMG_DIR = '/content/data/test_img/test_img'

# Image geometry: TARGET_SIZE is the resize target used by the image
# generators, INPUT_SHAPE is the matching model input shape (adds 3 channels).
TARGET_SIZE = (224, 224)
INPUT_SHAPE = (224, 224, 3)
# Batch size used by the image generators.
BATCH_SIZE = 256

# index_col=0 makes the first CSV column the DataFrame index, so these frames
# do not necessarily carry a 0..n-1 RangeIndex.
train_df = pd.read_csv(DATA_DIR+'raw/new_training_set.csv', index_col=0)
test_df = pd.read_csv(DATA_DIR+'raw/new_test_set.csv', index_col=0)

## Feature Extractor Model

In [None]:
def create_model():
    """Build a frozen ResNet50-based image feature extractor.

    Clears the current Keras session, loads ResNet50 with ImageNet weights
    and no classification head, taps the output of the
    ``conv5_block3_2_bn`` layer and global-average-pools it into one flat
    vector per image.

    Returns:
        tf.keras.Model: non-trainable model mapping images of shape
        ``INPUT_SHAPE`` to pooled feature vectors.
    """
    tf.keras.backend.clear_session()
    print('Create model...')

    backbone = ResNet50(
        weights='imagenet',
        include_top=False,
        input_shape=INPUT_SHAPE,
    )

    # Intermediate layer used as the feature tap for ResNet50.
    # (For EfficientNetB0 the equivalent tap was 'block7a_project_bn'.)
    tap_layer = backbone.get_layer('conv5_block3_2_bn')
    pooled = tf.keras.layers.GlobalAveragePooling2D(name="avg_pool")(tap_layer.output)

    extractor = tf.keras.models.Model(inputs=backbone.input, outputs=pooled)
    # Inference only: freeze every weight.
    extractor.trainable = False
    return extractor

In [None]:
# Build the feature extractor once and persist it as an HDF5 file.
# Despite its name, MODEL_DIR is the path of the saved model file.
# Alternative location on the project drive, currently disabled:
# MODEL_DIR = OUTPUTS_DIR+'extractor/resnet50-notop.h5'
MODEL_DIR = '/content/models/resnet50-notop.h5'

# `model` is a notebook-level global reused by the feature-extraction cells.
model = create_model()
print('Save model...')
model.save(MODEL_DIR)
print(MODEL_DIR)

Create model...
Save model...
/content/models/resnet50-notop.h5


## Feature Extraction

### Create feature vector

In [None]:
def calculate_distance(vect_1, vect_2):
    """Compute 11 pairwise/shape statistics for two feature vectors.

    Args:
        vect_1: 1-D feature vector of the first image.
        vect_2: 1-D feature vector of the second image.

    Returns:
        list: 11 values, in this order: euclidean, braycurtis, canberra,
        chebyshev, cityblock, cosine and minkowski distances between the
        two vectors, then skew(vect_1), skew(vect_2), kurtosis(vect_1),
        kurtosis(vect_2). NaN/inf entries are replaced via
        ``np.nan_to_num`` before the skew/kurtosis computation only.
    """
    # NOTE(review): minkowski is called without `p`; SciPy's default order is
    # 2, which would make it equal to the euclidean entry — confirm whether
    # this duplicate feature is intended.
    pairwise_metrics = (
        distance.euclidean,
        distance.braycurtis,
        distance.canberra,
        distance.chebyshev,
        distance.cityblock,
        distance.cosine,
        distance.minkowski,
    )
    features = [metric(vect_1, vect_2) for metric in pairwise_metrics]

    # Distribution-shape statistics of each vector on its own.
    finite_1 = np.nan_to_num(vect_1)
    finite_2 = np.nan_to_num(vect_2)
    features.extend([
        skew(finite_1),
        skew(finite_2),
        kurtosis(finite_1),
        kurtosis(finite_2),
    ])
    return features

def create_image_gen(df, x_col, img_dir):
    """Create an unshuffled, label-free image iterator over one column of ``df``.

    Args:
        df: DataFrame whose ``x_col`` column holds image filenames.
        x_col: name of the filename column to read.
        img_dir: directory the filenames are resolved against.

    Returns:
        The iterator produced by ``ImageDataGenerator.flow_from_dataframe``:
        images resized to ``TARGET_SIZE``, pixel values scaled by 1/255,
        batches of ``BATCH_SIZE``, in dataframe order (``shuffle=False``).
    """
    # NOTE(review): pixels are scaled to [0, 1]; confirm this matches the
    # preprocessing the pretrained backbone expects.
    flow_options = dict(
        dataframe=df,
        directory=img_dir,
        x_col=x_col,
        y_col=None,
        class_mode=None,       # inputs only, no targets
        target_size=TARGET_SIZE,
        classes=None,
        batch_size=BATCH_SIZE,
        shuffle=False,         # keep row order so features line up with df
    )
    return ImageDataGenerator(rescale=1./255).flow_from_dataframe(**flow_options)

def create_image_feature(df, model, img_dir, dir):
    """Extract pair features for every row of ``df`` and dump them to disk.

    For each row, the images named in the ``image_1`` and ``image_2``
    columns are run through ``model``; the two resulting vectors are
    concatenated with the 11 statistics from ``calculate_distance``.

    Args:
        df: DataFrame with ``image_1`` and ``image_2`` filename columns.
        model: Keras model used as feature extractor (``predict`` is called
            on an image generator).
        img_dir: directory containing the image files.
        dir: output *file* path for the joblib dump (name kept for
            backward compatibility even though it shadows the builtin).

    Returns:
        np.ndarray: array with one row per row of ``df`` and
        ``2 * d + 11`` columns, where ``d`` is the model's output width.

    Raises:
        ValueError: if the number of extracted vectors differs from
            ``len(df)``, e.g. because the generator skipped filenames it
            could not validate; continuing would silently misalign
            features with rows/labels.
    """
    print('Create image data generator...')
    image_1_gen = create_image_gen(df, 'image_1', img_dir)
    image_2_gen = create_image_gen(df, 'image_2', img_dir)

    print('Calculate image vector...')
    # The generators already yield batches of BATCH_SIZE; Keras documents
    # that `batch_size` should not be specified for generator input.
    vect_1 = model.predict(image_1_gen, verbose=1)
    vect_2 = model.predict(image_2_gen, verbose=1)

    if not (len(vect_1) == len(vect_2) == len(df)):
        raise ValueError(
            'Feature/row count mismatch: '
            f'image_1={len(vect_1)}, image_2={len(vect_2)}, rows={len(df)}. '
            'Some image files were probably skipped by the generator.'
        )

    print('Calculate distance...')
    # Named `pair_stats` (not `distance`) so the imported
    # scipy.spatial.distance module is not shadowed inside this function.
    pair_stats = np.array([
        calculate_distance(v1, v2) for v1, v2 in zip(vect_1, vect_2)
    ])

    image_feat = np.concatenate([vect_1, vect_2, pair_stats], axis=1)
    print('Save image vector...')
    joblib.dump(image_feat, dir)
    print(dir)
    return image_feat

In [None]:
# Extract and persist the pair features for the training set.
# Despite the *_DIR suffix, TRAIN_VECTOR_DIR / TEST_VECTOR_DIR are the paths
# of the joblib dump files written by create_image_feature.
print('TRAIN DATA')
TRAIN_VECTOR_DIR = DATA_DIR+'interim/train_image_vector.pkl'
train_image_vector = create_image_feature(train_df, model, TRAIN_IMG_DIR, TRAIN_VECTOR_DIR)
print(train_image_vector.shape)

# Same pipeline for the test set, reusing the same extractor model.
print('TEST DATA')
TEST_VECTOR_DIR = DATA_DIR+'interim/test_image_vector.pkl'
test_image_vector = create_image_feature(test_df, model, TEST_IMG_DIR, TEST_VECTOR_DIR)
print(test_image_vector.shape)


TRAIN DATA
Create image data generator...
Found 10181 validated image filenames.
Found 10181 validated image filenames.
Calculate image vector...
Calculate distance...
Save image vector...
/content/drive/MyDrive/Colab Projects/product-pair-matching/data/interim/train_image_vector.pkl
(10181, 1035)
TEST DATA
Create image data generator...
Found 32580 validated image filenames.
Found 32580 validated image filenames.
Calculate image vector...
Calculate distance...
Save image vector...
/content/drive/MyDrive/Colab Projects/product-pair-matching/data/interim/test_image_vector.pkl
(32580, 1035)


### Create feature dataframe

In [None]:
def create_feature_df(df, vect, dir, label_col=None):
    """Turn a pair-feature matrix into a named DataFrame and save it as CSV.

    The columns of ``vect`` are interpreted as
    ``[image_1 vector | image_2 vector | 11 distance/statistic features]``
    and named ``img_1_*``, ``img_2_*`` and ``img_dist_*`` respectively.
    The per-image vector length is inferred from the width of ``vect``
    (512 for a 1035-column matrix).

    Args:
        df: source DataFrame; only read when ``label_col`` is given.
        vect: 2-D array-like of shape ``(n_rows, 2 * vect_len + 11)``.
        dir: output CSV *file* path (name kept for backward compatibility).
        label_col: optional name of the label column in ``df`` to append.

    Returns:
        pd.DataFrame: the feature frame (plus ``label_col`` if requested),
        with a fresh 0..n-1 index.

    Raises:
        ValueError: if the width of ``vect`` cannot be split into two
            equal-length image vectors plus 11 distance features, or if
            ``df[label_col]`` has a different number of rows than ``vect``.
    """
    dist_len = 11
    n_rows, n_cols = np.shape(vect)
    vect_len, leftover = divmod(n_cols - dist_len, 2)
    if n_cols < dist_len or leftover:
        raise ValueError(
            f'vect has {n_cols} columns; expected 2 * vect_len + {dist_len}.'
        )

    col_list = [f'img_1_{i}' for i in range(vect_len)] + \
                [f'img_2_{i}' for i in range(vect_len)] + \
                [f'img_dist_{i}' for i in range(dist_len)]

    feats_df = pd.DataFrame(
        data=vect,
        columns=col_list)

    if label_col is not None:
        # Assign by position, not by index: `df` may carry a non-default
        # index (e.g. read with index_col=0), and assigning the Series
        # directly would align on that index and yield NaN/misplaced labels.
        labels = df[label_col].to_numpy()
        if len(labels) != n_rows:
            raise ValueError(
                f'{label_col!r} has {len(labels)} rows but vect has {n_rows}.'
            )
        feats_df[label_col] = labels

    print('Save dataframe...')
    feats_df.to_csv(dir, index=False)
    print(dir)
    return feats_df

In [None]:
# Build and save the training feature table, including the 'Label' column.
# Despite the *_DIR suffix, TRAIN_DF_DIR / TEST_DF_DIR are CSV file paths.
print('TRAIN DATA')
TRAIN_DF_DIR = DATA_DIR+'clean/train_image_df.csv'
train_image_df = create_feature_df(train_df, train_image_vector, TRAIN_DF_DIR, 'Label')
print(train_image_df.shape)

# Test feature table: no label column is passed, so only features are saved.
print('TEST DATA')
TEST_DF_DIR = DATA_DIR+'clean/test_image_df.csv'
test_image_df = create_feature_df(test_df, test_image_vector, TEST_DF_DIR)
print(test_image_df.shape)

TRAIN DATA
Save dataframe...
/content/drive/MyDrive/Colab Projects/product-pair-matching/data/clean/train_image_df.csv
(10181, 1036)
TEST DATA
Save dataframe...
/content/drive/MyDrive/Colab Projects/product-pair-matching/data/clean/test_image_df.csv
(32580, 1035)
