# Problem Statement

Determine if images of two products are similar.

# Libraries

In [15]:
# Basic

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import shutil, os, time
from pathlib import Path

# CNN
from keras.preprocessing.image import ImageDataGenerator
from keras.models import Sequential
from keras.layers import Convolution2D
from keras.layers import MaxPool2D
from keras.layers import Flatten
from keras.layers import Dense
from keras.preprocessing import image

# Reading the Data File

In [2]:
df_train = pd.read_csv('train.csv')
df_test = pd.read_csv('test.csv')

In [None]:
df_train.head(2)

In [3]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 34250 entries, 0 to 34249
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   posting_id   34250 non-null  object
 1   image        34250 non-null  object
 2   image_phash  34250 non-null  object
 3   title        34250 non-null  object
 4   label_group  34250 non-null  int64 
dtypes: int64(1), object(4)
memory usage: 1.3+ MB


In [4]:
df_train['label_group'] = df_train['label_group'].astype('str')
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 34250 entries, 0 to 34249
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   posting_id   34250 non-null  object
 1   image        34250 non-null  object
 2   image_phash  34250 non-null  object
 3   title        34250 non-null  object
 4   label_group  34250 non-null  object
dtypes: object(5)
memory usage: 1.3+ MB


# Reading the Image Data

In [5]:
TrainingImagePath = 'C:\\Users\\ASHUTOSH DAS\\Documents\\Compettitions\\Kaggle\\shopee-product-matching\\train_images'
TestingImagePath =  'C:\\Users\\ASHUTOSH DAS\\Documents\\Compettitions\\Kaggle\\shopee-product-matching\\test_images'

## Reading Images from Data Frame

In [6]:
train_datagen = ImageDataGenerator(rescale=1./255,
                                  shear_range = 0.1,
                                  zoom_range = 0.1,
                                  horizontal_flip = True)

training_set = train_datagen.flow_from_dataframe(dataframe = df_train,
                                                directory = TrainingImagePath,
                                                x_col = 'image',
                                                y_col = 'label_group',
                                                target_size = (64,64),
                                                class_mode = 'categorical')

validation_set = train_datagen.flow_from_dataframe(dataframe = df_train,
                                                directory = TrainingImagePath,
                                                x_col = 'image',
                                                y_col = 'label_group',
                                                target_size = (64,64),
                                                class_mode = 'categorical')

test_set = [x for x in list(os.listdir('test_images/'))]

Found 34250 validated image filenames belonging to 11014 classes.
Found 34250 validated image filenames belonging to 11014 classes.


## Creating a List of Images and Classes

In [7]:
TrainClasses = training_set.class_indices

ResultMap = {}
for facevalue, faceName in zip(TrainClasses.values(), TrainClasses.keys()):
    ResultMap[facevalue] = faceName
    
print('Mapping of Image and ID: ', ResultMap)

OutputNeurons = len(ResultMap)
print('\n The number of output Neurons: ', OutputNeurons)

Mapping of Image and ID:  {0: '1000051365', 1: '1000106726', 2: '1000644922', 3: '1000673727', 4: '100112565', 5: '1001292672', 6: '1001549466', 7: '1001789326', 8: '1002023044', 9: '1002199397', 10: '1002409299', 11: '1002765200', 12: '100304050', 13: '1003252496', 14: '1003513025', 15: '100467555', 16: '1005010065', 17: '1005412638', 18: '100596628', 19: '1006632886', 20: '1006682573', 21: '1006857895', 22: '1006973111', 23: '10069919', 24: '100773610', 25: '1007825488', 26: '1007887132', 27: '1008524614', 28: '1008700839', 29: '1009103687', 30: '1009426481', 31: '1009607791', 32: '1010342048', 33: '1010522046', 34: '1010747501', 35: '101129666', 36: '1011405834', 37: '1011603387', 38: '1011931446', 39: '1012512895', 40: '1012631413', 41: '1013017519', 42: '1013219548', 43: '1013339050', 44: '1013980683', 45: '1014266988', 46: '1014487628', 47: '1014722657', 48: '1014849774', 49: '1014916122', 50: '1015072454', 51: '1015815580', 52: '1016089803', 53: '1016161298', 54: '1016178132', 5

# Defining the CNN Model

In [8]:
classifier = Sequential()

classifier.add(Convolution2D(filters = 64, kernel_size = (5,5), strides = (2,2), input_shape = (64,64,3),
                            activation = 'relu'))
classifier.add(Convolution2D(filters = 32, kernel_size = (3,3), strides = (1,1), input_shape = (64,64,3),
                            activation = 'softmax'))


classifier.add(MaxPool2D(pool_size = (5,5)))
classifier.add(MaxPool2D(pool_size = (4,4)))
classifier.add(Flatten())
classifier.add(Dense(64, activation='relu'))

classifier.add(Dense(OutputNeurons, activation='softmax'))

classifier.compile(loss='categorical_crossentropy', optimizer = 'adam', metrics=["accuracy"])

## Fitting the Model

In [9]:
StartTime = time.time()

classifier.fit_generator(training_set,
                       steps_per_epoch = 30,
                       epochs = 10,
                       validation_data = validation_set,
                       validation_steps = 20)

EndTime = time.time()

print('################# Total Time Taken: ', round((EndTime-StartTime)/60, 2), 'Minutes ##########')

Instructions for updating:
Please use Model.fit, which supports generators.
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
################# Total Time Taken:  2.62 Minutes ##########


In [21]:
# Generating Predictions

testImage = 'C:\\Users\\ASHUTOSH DAS\\Documents\\Compettitions\\Kaggle\\shopee-product-matching\\test_images\\0006c8e5462ae52167402bac1c2e916e.jpg'

test_im = image.load_img(testImage, target_size=(64,64))
test_image = image.img_to_array(test_im)
test_image=np.expand_dims(test_image,axis=0)
result = classifier.predict(test_image, verbose=0)

    
print('Predictions are: ', ResultMap[np.argmax(result)])

Predictions are:  1141798720


In [29]:
# Generating Predictions

testImage = [x for x in test_set]

matches = []

for i in range(len(testImage)):
    test_img_dir = '\\'.join([TestingImagePath,testImage[i]])
    test_im = image.load_img(test_img_dir, target_size=(64,64))
    test_image = image.img_to_array(test_im)
    test_image=np.expand_dims(test_image,axis=0)
    result = classifier.predict(test_image, verbose=0)
    matches.append(ResultMap[np.argmax(result)])
    
# Creating DataFrame with Predictions

sub_proj = pd.DataFrame()
sub_proj['posting_id'] = df_test['posting_id']
sub_proj['matches'] = matches

# Exproting the Predictions

sub_proj.to_csv('submission.csv')