# **Shopee Product Matching**
![](https://aseanrecords.world/wp-content/uploads/product/userfiles/upload/images/cach-seo-tren-shopee.png)

# Table Of Contents

* Description
* Import Libraries
* Load The Data
* Exploratory Data Analysis
* Prediction By Text 
* KNN Model
* Prediction By Image 


# Description (from Kaggle)


Shopee is the leading e-commerce platform in Southeast Asia and Taiwan. Customers appreciate its easy, secure, and fast online shopping experience tailored to their region. The company also provides strong payment and logistical support along with a 'Lowest Price Guaranteed' feature on thousands of Shopee's listed products.

Two different images of similar wares may represent the same product or two completely different items. Retailers want to avoid misrepresentations and other issues that could come from conflating two dissimilar products. Currently, a combination of deep learning and traditional machine learning analyzes image and text information to compare similarity. But major differences in images, titles, and product descriptions prevent these methods from being entirely effective.

## Import Libraries

In [None]:
# Load libraries
import os
import re
import numpy as np
import pandas as pd
from tqdm.autonotebook import tqdm
import cv2
import random

import re
import nltk
nltk.download('popular')

import torch
import torch.nn as nn
import torchvision
import torchvision.models as models
import torchvision.transforms as transforms

import seaborn as sns
import matplotlib.pyplot as plt

import cudf, cuml, cupy
from cuml.feature_extraction.text import TfidfVectorizer
from cuml.neighbors import NearestNeighbors
from PIL import Image
from wordcloud import WordCloud, STOPWORDS


In [None]:
# Function that displays images
def plot_img(df_plot, cols=4, rows=4):
    """Show images from ``df_plot`` in a grid, one matplotlib figure per grid row.

    Parameters
    ----------
    df_plot : pandas.DataFrame
        Frame whose column at position 1 holds image file names relative to
        the train-images directory used below.
    cols : int
        Number of images per figure.
    rows : int
        Maximum number of figures to draw.

    At most ``rows * cols`` images are drawn. If ``df_plot`` has fewer rows
    than that, only the available images are shown instead of indexing past
    the end of the frame.

    Raises
    ------
    FileNotFoundError
        If an image file cannot be read by OpenCV.
    """
    path = '../input/shopee-product-matching/train_images/'
    # Never index past the end of the frame, even if the grid is larger.
    n_images = min(len(df_plot), rows * cols)
    for k in range(rows):
        first = cols * k
        if first >= n_images:
            # Nothing left to draw; avoid creating empty figures.
            break
        plt.figure(figsize=(20, 5))
        for j in range(min(cols, n_images - first)):
            image = df_plot.iloc[first + j, 1]
            img = cv2.imread(path + image)
            if img is None:
                # cv2.imread signals an unreadable file by returning None.
                raise FileNotFoundError(f'Could not read image: {path + image}')
            # OpenCV loads BGR; matplotlib expects RGB.
            img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
            plt.subplot(1, cols, j + 1)
            plt.axis('off')
            plt.imshow(img)
        plt.show()

## Load The Data

In [None]:
# Locations of the competition files
path = '../input/shopee-product-matching/'
train_path = '../input/shopee-product-matching/train_images'
test_path = '../input/shopee-product-matching/test_images'

# Training metadata
data = pd.read_csv(path + 'train.csv')

# Create the full-path feature: each image file name joined onto the train-images directory
data["path"] = data["image"].map(lambda file_name: os.path.join(train_path, file_name))

# Exploratory Data Analysis

In [None]:
# Preview the first 3 rows of the training metadata
data.iloc[:3]

In [None]:
# Plot the first 8 images as 2 figures of 4 images each
plot_img(data.head(8), cols=4, rows=2)

In [None]:
# Show the first 8 titles
data['title'].head(8)

In [None]:
# Report the dimensions of the training frame
print(f'Shape: {data.shape}')
# Report how many distinct label groups exist (missing values, if any, count as one)
n_label_groups = data["label_group"].nunique(dropna=False)
print(f'Unique label_groups = {n_label_groups}')

In [None]:
# Print the min/max number of samples per label group

# Split the frame by label group in a single pass. The previous version
# re-filtered the whole frame once per row, which is quadratic in the number
# of rows. sort=False keeps the groups in order of first appearance.
num_groups = {label: rows for label, rows in data.groupby('label_group', sort=False)}
# Number of rows in each label group
len_groups = {label: len(rows) for label, rows in num_groups.items()}

print(f'Maximum sum of label groups : {max(len_groups.values())}')
print(f'Minimum sum of label groups : {min(len_groups.values())}')

# Prediction By Text 

## Wordcloud

In [None]:
# Build one word cloud from all titles, skipping the wordcloud package's stop words
stopwords = set(STOPWORDS)
cloud_options = dict(
    width=800,
    height=800,
    background_color='white',
    min_font_size=10,
    stopwords=stopwords,
)
wordcloud = WordCloud(**cloud_options).generate(' '.join(data['title']))

# Plot the WordCloud image
plt.figure(figsize=(8, 8), facecolor=None)
plt.imshow(wordcloud)
plt.axis("off")
plt.tight_layout(pad=0)

plt.show()


### Cleaning The Title

In [None]:
# Define cleaning function for the title

# Built once instead of on every call: clean_text runs once per title, and
# reloading the stop-word list / re-creating the stemmer each time is wasted
# work. A frozenset also makes the membership test O(1) instead of a list scan.
_STOPWORDS = frozenset(nltk.corpus.stopwords.words("english"))
_STEMMER = nltk.stem.porter.PorterStemmer()
_NON_WORD_RE = re.compile(r'[^\w\s]')


def clean_text(text):
    """Return a normalized version of ``text`` for vectorization.

    The input is converted to ``str``, lower-cased and stripped; every
    character that is neither a word character nor whitespace is removed;
    English stop words are dropped; the remaining tokens are Porter-stemmed
    and joined back with single spaces.
    """
    # Clean
    text = _NON_WORD_RE.sub('', str(text).lower().strip())

    # Tokenize, remove stop words, stem, and join back into one string
    return " ".join(
        _STEMMER.stem(word)
        for word in text.split()
        if word not in _STOPWORDS
    )


data["title"] = data["title"].apply(clean_text)

In [None]:
# Transform from pandas to cudf: reload the CSV with cudf, then copy over the
# columns that were modified or added on the pandas side
data_cu = cudf.read_csv('../input/shopee-product-matching/train.csv')
for column in ('title', 'path'):
    data_cu[column] = data[column].values

## Tfidf Vectorization
![](https://miro.medium.com/max/638/1*Uucq42G4ntPGJKzI84b3aA.png)

In [None]:
# Sentence embeddings by Tfidf: fit the vectorizer on the cleaned titles and
# convert the result to a dense array
model = TfidfVectorizer(binary=True)
text_embeddings = (
    model
    .fit_transform(data_cu['title'])
    .toarray()
)

## KNN Model


In [None]:
# Fit the titles to a KNN model with 51 neighbors, then look up the
# neighbors of every title embedding
N_NEIGHBORS = 51
model = NearestNeighbors(n_neighbors=N_NEIGHBORS)
model.fit(text_embeddings)
distances, indices = model.kneighbors(text_embeddings)

For our test group, three samples were randomly selected from label groups that contain more than 10 items, so that there are enough matches to display.

In [None]:
# Neighbors closer than this distance are treated as matches
DISTANCE = 1.15

# Function for plotting the nearest neighbors
def plot_knn_test(index_test):
    """Plot the neighbor distances of one sample and return its predicted matches.

    Parameters
    ----------
    index_test : int
        Row position of the query sample in ``distances`` / ``indices``.

    Returns
    -------
    pandas.DataFrame
        The rows of ``data_cu`` selected as matches; their ``title`` and
        ``label_group`` columns are also printed.
    """
    row_distances = cupy.asnumpy(distances[index_test])

    # Size the x axis from the row itself rather than a hard-coded neighbor count.
    plt.figure(figsize=(10, 3))
    plt.plot(np.arange(len(row_distances)), row_distances, 'o-')
    plt.ylabel('Distance')
    plt.xlabel('Index')
    plt.show()

    # Number of neighbors below the threshold. It is used as a prefix length
    # below, which presumes each row of `distances` is sorted ascending.
    # NOTE(review): confirm against the KNN output ordering.
    counter = int((row_distances < DISTANCE).sum())

    neighbor_ids = cupy.asnumpy(indices[index_test, :counter])
    predictions = data_cu.loc[neighbor_ids].to_pandas()
    print(predictions[['title', 'label_group']])
    return predictions


# Show the text-based matches for the three test samples;
# each entry is (sample index, grid columns, grid rows)
for sample_index, n_cols, n_rows in ((177, 4, 4), (183, 3, 1), (187, 4, 4)):
    plot_img(plot_knn_test(sample_index), n_cols, n_rows)

# Prediction By Image 

## Resnet18 model for feature extraction

Inspired by : 
https://becominghuman.ai/extract-a-feature-vector-for-any-image-with-pytorch-9717561d1d4c

![](https://miro.medium.com/max/3960/1*1BEDb6N5T4ZRZVb31IpKsw.png)

In [None]:
# Pretrained ResNet-18 used for feature extraction
model = models.resnet18(pretrained=True)
# Keep a handle on the 'avgpool' layer so its output can be captured later
layer = model.avgpool
# Put the model in evaluation mode
model.eval()

In [None]:
# Preprocessing pipeline: resize, center-crop, convert to tensor, normalize.
# NOTE: this assignment rebinds the name `transforms`, which the import cell
# bound to the torchvision.transforms module, so the individual steps are
# referenced through `torchvision.transforms` explicitly.
_preprocessing_steps = [
    torchvision.transforms.Resize(256),
    torchvision.transforms.CenterCrop(224),
    torchvision.transforms.ToTensor(),
    torchvision.transforms.Normalize(
        mean=[0.485, 0.456, 0.406],
        std=[0.229, 0.224, 0.225],
    ),
]
transforms = torchvision.transforms.Compose(_preprocessing_steps)

In [None]:
# Extract the feature vector function
def get_vector(image):
    """Return the 512-element 'avgpool' feature vector of ``image``.

    Parameters
    ----------
    image : PIL.Image.Image
        Image to embed. PIL images that are not in RGB mode are converted
        first, because the preprocessing pipeline normalizes with three
        per-channel means and standard deviations.

    Returns
    -------
    torch.Tensor
        1-D tensor of length 512 copied from the output of ``layer``.
    """
    # Grayscale / palette / RGBA inputs would not match the 3-channel Normalize.
    if isinstance(image, Image.Image) and image.mode != "RGB":
        image = image.convert("RGB")

    # Create a PyTorch tensor with the transformed image
    t_img = transforms(image)
    # Buffer for the feature vector; the 'avgpool' layer has an output size of 512
    my_embedding = torch.zeros(512)

    # Forward hook that copies the flattened layer output into the buffer
    def copy_data(m, i, o):
        my_embedding.copy_(o.flatten())

    # Attach the hook to the selected layer
    h = layer.register_forward_hook(copy_data)
    try:
        # Run the model on the transformed image (batch of one, no gradients)
        with torch.no_grad():
            model(t_img.unsqueeze(0))
    finally:
        # Always detach the hook, even if the forward pass raises; otherwise
        # stale hooks would accumulate on the layer.
        h.remove()
    # Return the feature vector
    return my_embedding

In [None]:
# Feature extraction & vectorization for all the images

img_list_vectors = []
for img_path in tqdm(data["path"]):
    # Open each file in a context manager so its handle is closed as soon as
    # the vector has been computed, instead of leaving it to garbage collection.
    with Image.open(img_path) as img:
        img_list_vectors.append(get_vector(img))


In [None]:
# Plotting the nearest samples by cosine similarity
def plot_test_pic(test_pic):
    """Return the rows of ``data`` whose image vector is similar to ``test_pic``.

    Parameters
    ----------
    test_pic : torch.Tensor
        1-D feature vector of the query image.

    Returns
    -------
    pandas.DataFrame
        Rows of ``data`` (original index preserved) whose entry in
        ``img_list_vectors`` has cosine similarity to ``test_pic`` strictly
        greater than the threshold. Empty if nothing matches.
    """
    THRESHOLD = 0.8

    if img_list_vectors:
        # Compare the query against all stored vectors in one batched call
        # instead of one Python-level similarity computation per image.
        gallery = torch.stack(img_list_vectors)
        cos_sim = nn.functional.cosine_similarity(
            test_pic.unsqueeze(0), gallery, dim=1, eps=1e-6
        )
        matches = torch.nonzero(cos_sim > THRESHOLD).flatten().tolist()
    else:
        matches = []

    # img_list_vectors is positionally aligned with data, so select by position.
    return data.iloc[matches]

In [None]:
# Retrieve and display the images most similar to sample 177.
# The query image is opened in a context manager so its file handle is closed.
with Image.open(data["path"][177]) as query_image:
    query_vector = get_vector(query_image)
predictions = plot_test_pic(query_vector)
plot_img(predictions, 4, 4)

In [None]:
# Retrieve and display the images most similar to sample 183.
# The query image is opened in a context manager so its file handle is closed.
with Image.open(data["path"][183]) as query_image:
    query_vector = get_vector(query_image)
predictions = plot_test_pic(query_vector)
plot_img(predictions, 4, 1)

In [None]:
# Retrieve and display the images most similar to sample 187.
# The query image is opened in a context manager so its file handle is closed.
with Image.open(data["path"][187]) as query_image:
    query_vector = get_vector(query_image)
predictions = plot_test_pic(query_vector)
plot_img(predictions, 5, 3)

## Improvements still to be made:
* Find the optimal distance threshold for the KNN model
* Find the optimal threshold for cosine similarity
* Combine the predictions from the two models into a single model
* Find a solution to the runtime problem when comparing cosine similarity