## Import necessary libraries

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import matplotlib.pyplot as plt
import matplotlib.image as mpimg


In [None]:
def plt_imgs(df, n_cols=4, n_rows=4, img_dir=None):
    """Plot the leading rows of ``df`` as an ``n_rows`` x ``n_cols`` image grid.

    Each subplot shows the file named in the row's ``image`` column and is
    titled with the row's ``label_group`` value. Rows are taken in positional
    order, filling the grid row by row.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``image`` and ``label_group`` columns.
    n_cols, n_rows : int
        Grid dimensions. At most ``n_rows * n_cols`` images are drawn.
    img_dir : str, optional
        Prefix prepended to every image file name. When omitted, the
        notebook-level ``train_imgs_dir`` is used.

    Returns
    -------
    None
    """
    if img_dir is None:
        # Resolved at call time, so the global only has to exist before the
        # first call, not before this definition.
        img_dir = train_imgs_dir

    # Never index past the end of the frame: a grid larger than the frame
    # simply leaves its trailing cells empty instead of raising.
    n_images = min(n_rows * n_cols, len(df))
    if n_images <= 0:
        # Nothing to draw (empty frame or a zero-sized grid).
        return

    plt.figure(figsize=(n_cols*4.2, n_rows*4.2))
    for index in range(n_images):
        row = df.iloc[index]
        plt.subplot(n_rows, n_cols, index+1)
        image = mpimg.imread(img_dir + row['image'])
        plt.imshow(image, interpolation='nearest')
        plt.axis('off')
        plt.title(row['label_group'])

## Read data

In [None]:
# Input locations: every path hangs off the competition's root directory.
root_dir = '/kaggle/input/shopee-product-matching/'
train_imgs_dir = f'{root_dir}train_images/'
test_imgs_dir = f'{root_dir}test_images/'

# Load the train / test tables and the sample submission.
train = pd.read_csv(f'{root_dir}train.csv')
test = pd.read_csv(f'{root_dir}test.csv')
submission = pd.read_csv(f'{root_dir}sample_submission.csv')


In [None]:
# First look at the training table: its dimensions, then the leading rows.
print(f'Train shape: {train.shape}')
train.head()

In [None]:
# Number of distinct values per column (column-wise counting is the default axis).
train.nunique()

We have 34250 rows in training data, with 5 columns.
    1. "posting_id" column has unique values
    2. "image" column has some duplicates
    3. "image_phash" also has some duplicates
    4. "title" has some duplicates
    5. "label_group" has some duplicates
    
Let's look at a sample of each column's duplicates.

In [None]:
# image duplicates sample
# A fixed random_state keeps the sampled rows (and the figure the next cell
# builds from them) identical on every Restart & Run All.
dup_img = train[train.image.duplicated()].sample(2, random_state=42).image.tolist()
train[train.image.isin(dup_img)]

In [None]:
# Show every row that uses one of the sampled duplicate image files.
tmp = train.loc[train['image'].isin(dup_img)]
n_rows = 2
# Floor division: when the row count is not a multiple of n_rows, the
# trailing rows are not plotted.
n_cols = len(tmp) // n_rows
plt_imgs(tmp, n_cols=n_cols, n_rows=n_rows)

We can see that this is just a duplicated row, except for posting_id which is a unique identifier.

In [None]:
# image_phash duplicate sample
# A fixed random_state keeps the sampled hashes (and the figure the next cell
# builds from them) identical on every Restart & Run All.
dup_phash = train[train.image_phash.duplicated()].sample(2, random_state=42).image_phash.tolist()
train[train.image_phash.isin(dup_phash)]

In [None]:
# Show every row whose pHash is one of the sampled duplicates, grouped by label.
tmp = train.loc[train['image_phash'].isin(dup_phash)].sort_values(by='label_group')
n_rows = 2
# Floor division: when the row count is not a multiple of n_rows, the
# trailing rows are not plotted.
n_cols = len(tmp) // n_rows
plt_imgs(tmp, n_cols=n_cols, n_rows=n_rows)

So we can see that the same image may be stored in multiple files under different names, and that such copies can be detected because they share the same pHash.

In [None]:
# title duplicates sample
# A fixed random_state keeps the sampled titles (and the figure the next cell
# builds from them) identical on every Restart & Run All.
dup_title = train[train.title.duplicated()].sample(5, random_state=42).title.tolist()
train[train.title.isin(dup_title)].sort_values(['label_group'])

In [None]:
# Show every row that carries one of the sampled duplicate titles, grouped by label.
tmp = train.loc[train['title'].isin(dup_title)].sort_values(by='label_group')
n_rows = 2
# Floor division: when the row count is not a multiple of n_rows, the
# trailing rows are not plotted.
n_cols = len(tmp) // n_rows
plt_imgs(tmp, n_cols=n_cols, n_rows=n_rows)

We can see here the cases that the algorithm is supposed to solve, as some items have the same title, but completely different images.

Duplicate titles may have different image, image_phash but the same label_group.

In [None]:
# label_group duplicate sample
# A fixed random_state keeps the sampled groups (and the figure the next cell
# builds from them) identical on every Restart & Run All.
dup_label = train[train.label_group.duplicated()].sample(5, random_state=42).label_group.tolist()
train[train.label_group.isin(dup_label)].sort_values(by=['label_group'])

In [None]:
# Show every row that belongs to one of the sampled label groups.
tmp = train.loc[train['label_group'].isin(dup_label)].sort_values(by='label_group')
n_rows = 7
# Floor division: when the row count is not a multiple of n_rows, the
# trailing rows are not plotted.
n_cols = len(tmp) // n_rows
plt_imgs(tmp, n_cols=n_cols, n_rows=n_rows)

So the final note on this dataframe is that label_group represents an item whose postings might have slightly different titles and different images, but they are the same item after all. 

image_phash represents a fingerprint shared by images that are similar to each other, and so far the samples showed that such images represent the same item. But is it possible for images of totally different items to have the same image_phash? This can be found out by searching for rows that have the same image_phash but a different label_group. 

In [None]:
# For each perceptual hash, count how many distinct label groups it appears under.
phash_labels = (
    train
    .groupby('image_phash')['label_group']
    .nunique()
    .reset_index()
)
# Keep only the hashes that are spread over more than one label group.
phash_labels.loc[phash_labels['label_group'] > 1]

So evidently we have 147 image_phash values that are associated with more than one label_group.
Let's look more into them.

In [None]:
# Hashes that occur under more than one label group, as a plain list.
phash_mult_labels = phash_labels.loc[phash_labels['label_group'] > 1, 'image_phash'].tolist()
# Rows belonging to the first five of those hashes, grouped by hash.
train.loc[train['image_phash'].isin(phash_mult_labels[:5])].sort_values(by='image_phash')

We can see that train_2018235992 and train_1810772318 have the same image and title but different label groups. I think that if items are going to be judged to be the same or not based on label_group, this column needs to be cleaned in order to obtain good results; if it is left the way it is, the model will be fed contradictory inputs, where items with literally the same title and the same pHash are labelled as not being the same.

In [None]:
# Plot the rows of the first five multi-label hashes, grouped by hash.
tmp = train.loc[train['image_phash'].isin(phash_mult_labels[:5])].sort_values(by='image_phash')
n_rows = 3
# Floor division: when the row count is not a multiple of n_rows, the
# trailing rows are not plotted.
n_cols = len(tmp) // n_rows
plt_imgs(tmp, n_cols=n_cols, n_rows=n_rows)

We can see that two items with literally the same images have different label groups.

In [None]:
# Rebuild the list of hashes that occur under more than one label group.
phash_mult_labels = phash_labels.loc[phash_labels['label_group'] > 1, 'image_phash'].tolist()
# Rows belonging to the hashes at positions 120..130 of that list.
train.loc[train['image_phash'].isin(phash_mult_labels[120:131])]

In [None]:
# Plot the rows of the multi-label hashes at positions 120..130, grouped by hash.
tmp = train.loc[train['image_phash'].isin(phash_mult_labels[120:131])].sort_values(by='image_phash')
n_rows = 7
# Floor division: when the row count is not a multiple of n_rows, the
# trailing rows are not plotted.
n_cols = len(tmp) // n_rows
plt_imgs(tmp, n_cols=n_cols, n_rows=n_rows)

I think that this highlights a very important point: it will be impossible to tag these images based only on the image itself. Either there must be some difference in their titles that puts them in different label groups, or these are simply mislabelled rows that need to be corrected.

Further exploration may be:
- looking into images with the same image_phash
- images with the same label_group
- exploring the title column separately and in relationship with other columns

In [None]:
# Preview the leading rows of the test table loaded from test.csv.
test.head()

## text preprocessing

In [None]:
import nltk
# Fetch the NLTK 'stopwords' and 'wordnet' resources.
# NOTE(review): none of the visible cells reference nltk afterwards (the
# TfidfVectorizer below is configured with stop_words='english' rather than an
# NLTK list) — confirm these downloads are still needed.
nltk.download('stopwords')
nltk.download('wordnet')

In [None]:
# Replace every run of characters other than ASCII letters and digits with a
# single space, overwriting the title column of the training table.
train['title'] = train['title'].str.replace('[^A-Za-z0-9]+', ' ', regex=True)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from time import time
import random

def inspect(vectoriser, X):
    """Fit ``vectoriser`` on ``X`` and print a short report about the result.

    The report lists the number of feature columns, the time spent in
    ``fit_transform``, the first 100 vocabulary tokens in sorted order, and
    an example of the tokens recorded in ``vectoriser.stop_words_``.

    Parameters
    ----------
    vectoriser : object
        Must provide ``fit_transform(X)`` returning something with a
        2-element ``shape``, and expose ``vocabulary_`` and ``stop_words_``
        after fitting.
    X : iterable
        Raw documents handed to ``fit_transform`` unchanged.

    Returns
    -------
    None
        Everything is reported through ``print``.
    """
    # Fit and transform; only the fit itself is timed, not the printing.
    start = time()
    n_features = vectoriser.fit_transform(X).shape[1]
    elapsed = time() - start
    print(f"There are {n_features} columns.\n")
    print(f"Took {round(elapsed, 2)} seconds.\n")

    # Inspect tokens (iterating the vocabulary mapping yields its keys)
    tokens = sorted(vectoriser.vocabulary_)
    print(f"Example tokens: {tokens[:100]}\n")

    # Inspect ignored tokens
    ignored = vectoriser.stop_words_
    if len(ignored) == 0:
        print("No token is ignored.")
    elif len(ignored) > 50:
        # random.sample() requires a sequence; passing a set was deprecated in
        # Python 3.9 and is a TypeError from 3.11 on, so sample from a list.
        print(f"Example ignored tokens: {random.sample(sorted(ignored), 50)}")
    else:
        print(f"Example ignored tokens: {ignored}")

In [None]:
# TF-IDF over purely alphabetic tokens with English stop words removed;
# min_df / max_df bound how rare / how common a term may be to stay in the vocabulary.
vectoriser = TfidfVectorizer(
    token_pattern=r'[A-Za-z]+',
    stop_words='english',
    min_df=30,
    max_df=.7,
)
inspect(vectoriser, train['title'])

In [None]:
# Dimensions of the TF-IDF matrix produced for the training titles.
vectoriser.transform(train['title']).shape

In [None]:
import numpy as np 
from scipy.spatial.distance import cdist

# TF-IDF representation of the titles; the vectoriser returns a SciPy sparse matrix.
vectorised = vectoriser.transform(train.title)
# cdist() only accepts dense 2-D arrays, so it fails on this sparse matrix, and
# metric='cosine' yields distances (1 - similarity) rather than similarities.
# The vectoriser keeps its default L2 row normalisation, so the dot product of
# two rows already is their cosine similarity; the product stays sparse.
sim_matrix = vectorised @ vectorised.T
sim_matrix

In [None]:
# Dimensions of the TF-IDF matrix computed for the training titles.
vectorised.shape

1. Simplest approach is to classify similarity based on title and pHash similarity
2. title similarity could be calculated using cosine similarity of tfidf vectors generated from training data
3. pHash similarity could be calculated using a distance metric between pairs of pHashes
4. After generating a similarity matrix using either title or pHash, training data should be generated using similarities to 
    predict whether the two images are the same or not
5. More complex methods should utilize the images (Not necessarily complex as the two images could be identical in shape and
    flattened then concatenated and directly fed to a logistic regression algorithm to predict whether they are the same or not).

Work in progress...