# Product Similarity with EfficientNet

In this notebook we will use a pretrained CNN (EfficientNetB0) to create embeddings for our product images. Afterwards, we will save these embeddings to disk. In a following notebook, we will use them to find similar products.

In [None]:
import os
import pandas as pd

def get_article_images_df(path='../input/h-and-m-personalized-fashion-recommendations/images',
                          extensions=('.jpg',)):
    """Index every article image found under ``path``.

    The directory tree is walked recursively. Each file whose extension
    matches ``extensions`` contributes one row; the article id is the file
    name without its extension.

    Parameters
    ----------
    path : str
        Root directory that contains the image files (possibly nested).
    extensions : str or iterable of str, optional
        File extensions (with leading dot) that count as images. Matching is
        case-insensitive. Files with any other extension are skipped so that
        stray non-image files never reach the image loader.

    Returns
    -------
    pandas.DataFrame
        Columns ``article_id`` (file name stem) and ``image`` (full path),
        ordered deterministically by directory and then file name. Empty when
        ``path`` does not exist or holds no matching files.
    """
    if isinstance(extensions, str):
        extensions = (extensions,)
    allowed = tuple(ext.lower() for ext in extensions)

    article_ids = []
    image_paths = []
    for dirname, dirnames, filenames in os.walk(path):
        # os.walk yields entries in arbitrary filesystem order. Sorting the
        # sub-directories in place (which steers the traversal) and the file
        # names keeps the row order -- and therefore the order of anything
        # computed row by row from this frame -- reproducible across runs.
        dirnames.sort()
        for filename in sorted(filenames):
            stem, ext = os.path.splitext(filename)
            if ext.lower() not in allowed:
                continue
            article_ids.append(stem)
            image_paths.append(os.path.join(dirname, filename))
    return pd.DataFrame({'article_id': article_ids, 'image': image_paths})

In [None]:
# Build the article_id -> image path table from the default images directory.
df = get_article_images_df()

In [None]:
import cv2
import numpy as np
import tensorflow as tf


class DataGenerator(tf.keras.utils.Sequence):
    """Keras ``Sequence`` that serves batches of resized RGB images.

    Parameters
    ----------
    df : pandas.DataFrame
        Must have an ``image`` column holding the file path of each image.
        Batches follow the row order of this frame.
    img_size : int, optional
        Every image is resized to ``img_size x img_size`` pixels.
    batch_size : int, optional
        Number of images per batch; the final batch may be smaller.
    **kwargs
        Forwarded to the base-class initializer.
    """

    def __init__(self, df, img_size=256, batch_size=32, **kwargs):
        # Newer Keras releases expect Sequence subclasses to run the base
        # initializer; with no extra kwargs this is also valid on older ones.
        super().__init__(**kwargs)
        self.df = df
        self.img_size = img_size
        self.batch_size = batch_size
        self.indexes = np.arange(len(self.df))

    def __len__(self):
        """Return the number of batches per epoch (ceil of rows / batch_size)."""
        return (len(self.df) + self.batch_size - 1) // self.batch_size

    def __getitem__(self, index):
        """Return batch number ``index`` as a float32 image array."""
        start = index * self.batch_size
        indexes = self.indexes[start:start + self.batch_size]
        return self.__data_generation(indexes)

    def __data_generation(self, indexes):
        """Load, colour-convert and resize the images at row positions ``indexes``.

        Returns a float32 array of shape
        ``(len(indexes), img_size, img_size, 3)`` with raw pixel values
        (no rescaling is applied here).

        Raises
        ------
        ValueError
            If an image file cannot be read.
        """
        size = (self.img_size, self.img_size)
        X = np.zeros((len(indexes), *size, 3), dtype='float32')
        paths = self.df['image'].iloc[indexes]
        for i, path in enumerate(paths):
            img = cv2.imread(path)
            if img is None:
                # cv2.imread signals failure by returning None instead of
                # raising; fail here with the offending path rather than
                # with an opaque error inside cv2.resize.
                raise ValueError(f'Could not read image file: {path!r}')
            # OpenCV decodes to BGR channel order; convert to RGB, which is
            # what the ImageNet-pretrained Keras models expect.
            img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
            X[i] = cv2.resize(img, size)
        return X

## Image Embeddings

Let's create the embeddings using EfficientNetB0, the lightest model of the EfficientNet series.

In [None]:
from tensorflow.keras.applications import EfficientNetB0
from sklearn.neighbors import NearestNeighbors

# ImageNet-pretrained EfficientNetB0 without its classification head;
# pooling='avg' requests average-pooled features as the model output.
model = EfficientNetB0(
    include_top=False,
    weights='imagenet',
    input_shape=None,
    pooling='avg',
)
# Feed every image listed in `df` through the network, 32 at a time.
train_gen = DataGenerator(df, batch_size=32)
image_embeddings = model.predict(train_gen, verbose=1)

In [None]:
# Write the embedding matrix to disk in NumPy's .npy format.
np.save('hm_embeddings_effb0.npy', image_embeddings)

## KNN Training

In [None]:
# Number of neighbours the index returns per query.
KNN = 12
print(f'image embeddings shape is {image_embeddings.shape}')
# Fit the nearest-neighbour index on the image embeddings.
knn = NearestNeighbors(n_neighbors=KNN)
knn.fit(image_embeddings)

In [None]:
from joblib import dump, load
# Persist the fitted nearest-neighbour model to 'knn.joblib' in the working directory.
dump(knn, 'knn.joblib')

A follow-up notebook will use the fitted KNN model and the saved embeddings to find similar products.