## Setting

In [None]:
! pip install contractions
! pip install Unidecode
! pip install word2number

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import cv2
from tqdm.notebook import tqdm as note_book_tqdm

import contractions
import nltk
from wordcloud import WordCloud, STOPWORDS
from unidecode import unidecode
from word2number import w2n
import re

note_book_tqdm.pandas(desc="progress: ")

import tensorflow as tf
nltk.download('punkt')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

print('TF', tf.__version__)

In [None]:
plt.rcParams["font.size"] = 16

## Data Reading

In [None]:
df_train = pd.read_csv('../input/shopee-product-matching/train.csv')

## Data observation

### Train Data shape and sample rows

In [None]:
print('shape', df_train.shape)
df_train.head()

### Train Data info

In [None]:
df_train.info()

### label_group to index

In [None]:
# Re-encode label_group as consecutive integers, numbered in the order the
# original labels are returned by unique().
unique_groups = df_train['label_group'].unique()
label_mapper = dict(zip(unique_groups, np.arange(len(unique_groups))))
df_train['label_group'] = df_train['label_group'].map(label_mapper)

In [None]:
df_train.head()

### Number of items per label group 

In [None]:
# Number of rows in each label group, most populous group first.
label_groups = df_train['label_group'].value_counts(ascending=False)
print('Unique Item Count', label_groups.size)

##### Memo
- The number of items differs from one label group to another.

In [None]:
# Line plot of the group sizes held in label_groups (sorted descending),
# drawn against the positional index of each group.
fig, ax = plt.subplots(figsize=(20, 5))
group_positions = np.arange(len(label_groups))
ax.plot(group_positions, label_groups.values)
ax.set_ylabel('LabelGroup Item Count', size=14)
ax.set_xlabel('Index of Unique Item', size=14)
ax.set_title('Number of items per group', size=16)
plt.show()

### Visualizing Images

#### plot random images

In [None]:
def plot_random_images(images_count):
    """Show a random sample of training images in a near-square grid.

    Args:
        images_count: Number of image names to sample from
            ``df_train['image']``.

    Raises:
        FileNotFoundError: If a sampled image cannot be read from disk.
    """
    plot_list = df_train['image'].sample(n=images_count).tolist()

    # Smallest grid side that fits every image. np.sqrt returns a float and
    # plt.subplot requires integer row/column counts, so convert explicitly;
    # this also covers perfect squares such as 16, where no rounding up occurs.
    size = int(np.ceil(np.sqrt(images_count)))

    plt.figure(figsize=(20, 20))

    for ind, image_id in enumerate(plot_list):
        plt.subplot(size, size, ind + 1)
        image_path = f'../input/shopee-product-matching/train_images/{image_id}'
        image = cv2.imread(image_path)
        if image is None:
            # cv2.imread signals failure by returning None instead of raising.
            raise FileNotFoundError(f'Could not read image: {image_path}')
        # OpenCV loads BGR; matplotlib expects RGB.
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        plt.imshow(image)
        plt.title(image_id, fontsize=12)
        plt.axis("off")
    plt.show()

In [None]:
plot_random_images(15)

#### plot group images

In [None]:
def plot_group_images(group_id, df):
    """Plot every image of one label group and print a short summary.

    Args:
        group_id: Value of the ``label_group`` column to select.
        df: DataFrame with ``label_group``, ``image`` and ``title`` columns.

    Raises:
        FileNotFoundError: If an image of the group cannot be read from disk.
    """
    # Filter once and reuse the result for both plotting and the summary.
    sample = df[df['label_group'] == group_id]
    plot_list = sample['image'].tolist()
    images_count = len(plot_list)

    # Smallest integer grid side that fits every image; plt.subplot requires
    # integers, and np.sqrt alone yields a float for perfect squares.
    size = int(np.ceil(np.sqrt(images_count)))

    plt.figure(figsize=(20, 20))

    for ind, image_id in enumerate(plot_list):
        plt.subplot(size, size, ind + 1)
        image_path = f'../input/shopee-product-matching/train_images/{image_id}'
        image = cv2.imread(image_path)
        if image is None:
            # cv2.imread signals failure by returning None instead of raising.
            raise FileNotFoundError(f'Could not read image: {image_path}')
        # OpenCV loads BGR; matplotlib expects RGB.
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

        plt.imshow(image)
        plt.title(image_id, fontsize=6)
        plt.axis("off")
    plt.show()

    # Count distinct titles only; DataFrame.nunique() would report a value for
    # every column, which does not match the message.
    unique_titles = sample['title'].nunique()
    print(f'Total number of items in group {group_id}: {len(sample)}, number of unique titles: {unique_titles}')

In [None]:
# Bar chart of the first 50 entries of label_groups (sorted by size, descending).
top_groups = label_groups.iloc[:50]
fig, ax = plt.subplots(figsize=(20, 5))
ax.bar(top_groups.index.values.astype('str'), top_groups.values)
plt.xticks(rotation=45)
ax.set_ylabel('Duplicate Count', size=14)
ax.set_xlabel('Label Group', size=14)
ax.set_title('Top 50 Duplicated Items', size=16)
plt.show()

In [None]:
plot_group_images(106, df_train)

In [None]:
plot_group_images(48, df_train)

In [None]:
plot_group_images(307, df_train)

In [None]:
plot_group_images(979, df_train)

In [None]:
plot_group_images(252, df_train)

In [None]:
plot_group_images(283, df_train)

In [None]:
plot_group_images(714, df_train)

##### Memo
- Several images within the same group are exactly identical.

### Remove Duplicated Data

In [None]:
def remove_duplicated_row(df, col_name):
    """Drop rows that repeat a value in ``col_name`` and print the effect.

    Args:
        df: DataFrame containing ``col_name`` and a ``label_group`` column.
        col_name: Column whose repeated values mark a row as a duplicate.

    Returns:
        A new DataFrame keeping the first row of each distinct ``col_name``
        value; ``df`` itself is not modified.
    """
    # Deduplicate the frame that was passed in rather than a module global,
    # so the function works for any DataFrame.
    df_removed = df.drop_duplicates([col_name])
    print('Original data size', len(df))
    print('Reamoved data size', len(df_removed))

    # Both group counts are derived from the argument so the before/after
    # comparison always refers to the same data.
    original_label_groups = df['label_group'].value_counts(ascending=False)
    removed_duplicated_label_groups = df_removed['label_group'].value_counts(ascending=False)
    print('Original Unique Item Count', len(original_label_groups))
    print('Reamoved Unique Item Count', len(removed_duplicated_label_groups))

    return df_removed

#### Same Image name

In [None]:
df_removed_same_image = remove_duplicated_row(df_train, 'image')

In [None]:
plot_group_images(979, df_removed_same_image)

#### Same title

In [None]:
df_removed_same_title = remove_duplicated_row(df_train, 'title')

In [None]:
plot_group_images(979, df_removed_same_title)

#### Same image_phash

In [None]:
df_removed_same_phash = remove_duplicated_row(df_train, 'image_phash')

In [None]:
plot_group_images(979, df_removed_same_phash)

#### Same dhash

In [None]:
def dhash(image_name, hashSize=8):
    """Compute a difference hash of a training image as a Python integer.

    The image is converted to grayscale and resized with
    ``cv2.resize(gray, (hashSize + 1, hashSize))``; every pixel is then
    compared with its left neighbour, and bit ``i`` of the result is set when
    the ``i``-th comparison (in flattened order) is true.

    Args:
        image_name: File name inside the ``train_images`` directory.
        hashSize: Size parameter of the comparison grid.

    Returns:
        A non-negative integer encoding the comparison results.

    Raises:
        FileNotFoundError: If the image cannot be read from disk.
    """
    image_path = f'../input/shopee-product-matching/train_images/{image_name}'
    image = cv2.imread(image_path)
    if image is None:
        # cv2.imread returns None on failure; fail here with a clear message
        # instead of an opaque error inside cvtColor.
        raise FileNotFoundError(f'Could not read image: {image_path}')
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    resized = cv2.resize(gray, (hashSize + 1, hashSize))
    # True where a pixel is brighter than the one immediately to its left.
    diff = resized[:, 1:] > resized[:, :-1]
    return sum(2 ** i for i, v in enumerate(diff.flatten()) if v)

In [None]:
df_train.loc[:, 'dhash'] = df_train.image.progress_apply(dhash)

In [None]:
df_removed_same_dhash = remove_duplicated_row(df_train, 'dhash')

In [None]:
plot_group_images(979, df_removed_same_dhash)

### Distribution of image shapes

In [None]:
def get_image_shape(image_name):
    """Return the ``shape`` of the array cv2 loads for a training image.

    Args:
        image_name: File name inside the ``train_images`` directory.
    """
    return cv2.imread(
        f'../input/shopee-product-matching/train_images/{image_name}'
    ).shape

In [None]:
df_train.loc[:, 'shape'] = df_train.image.progress_apply(get_image_shape)

In [None]:
# Arrays returned by cv2.imread are laid out as (rows, cols, channels), i.e.
# (height, width, channels), so the first tuple element is the height and the
# second is the width.
df_train[['height', 'width', 'chanel']] = pd.DataFrame(df_train['shape'].tolist(), index=df_train.index)

In [None]:
df_train.head()

In [None]:
# Frequency of each distinct image shape, most common first.
images_shape = df_train['shape'].value_counts(ascending=False)
print('image shape variations', images_shape.size)
images_shape.head(15)

In [None]:
# Scatter of the per-image 'width' and 'height' columns of df_train.
fig, ax = plt.subplots(figsize=(10, 8))
ax.scatter(df_train['width'], df_train['height'], alpha=0.3)
ax.set_xlabel('image width')
ax.set_ylabel('image height')
plt.show()

##### Memo
- Most of the images are square in shape.
- The most common size is 640×640.

### Distribution of title length

In [None]:
# Number of characters in each raw title.
df_train['title_length'] = df_train['title'].progress_apply(len)

In [None]:
df_train.head()

In [None]:
title_length = df_train['title_length'].value_counts(ascending=False)
title_length

### (TODO)Analyze the title column
- wordcloud
- preprocess
- etc

In [None]:
# The stopword list is a separate NLTK resource from 'punkt'. Fetch it on
# demand when it is not installed instead of failing with LookupError.
try:
    stop_words = set(stopwords.words('english'))
except LookupError:
    nltk.download('stopwords')
    stop_words = set(stopwords.words('english'))


def text_preprocess(title):
    """Normalize a product title for text analysis.

    Steps, in order: transliterate to ASCII with ``unidecode``, expand
    contractions, lowercase, replace every run of characters other than ASCII
    letters and digits with a single space, tokenize, drop tokens found in
    ``stop_words``, and re-join the remaining tokens with single spaces.

    Args:
        title: Raw title string.

    Returns:
        The cleaned title as a single space-separated string.
    """
    # Convert accented characters
    title = unidecode(title)
    # Expand contractions
    title = contractions.fix(title)
    # Lowercase all text
    title = title.lower()
    # Remove special characters
    title = re.sub(r"[^a-zA-Z0-9]+", ' ', title)
    # Split the title into a word list
    words = word_tokenize(title)
    # Remove stopwords and rebuild the string
    return ' '.join(w for w in words if w not in stop_words)

In [None]:
df_train.loc[:, 'clean_title'] = df_train.title.progress_apply(text_preprocess)

In [None]:
# Number of characters in each cleaned title.
df_train['clean_title_length'] = df_train['clean_title'].progress_apply(len)

In [None]:
df_train.head()

In [None]:
clean_title_length = df_train['clean_title_length'].value_counts(ascending=False)
clean_title_length.head()

In [None]:
# Side-by-side histograms of title lengths before and after cleaning.
fig = plt.figure(figsize=(20, 8))
fig.suptitle("Difference in the number of characters between the original title and the clean title")
ax1 = fig.add_subplot(1, 2, 1)
ax2 = fig.add_subplot(1, 2, 2)
ax1.set_xlabel('original title length')
ax1.set_ylabel('item count')
ax2.set_xlabel('clean title length')
ax2.set_ylabel('item count')
# Each axis shows the series its label names: original lengths on the left,
# cleaned lengths on the right.
ax1.bar(title_length.index, title_length.values, width=1, color='b', alpha=0.5)
ax2.bar(clean_title_length.index, clean_title_length.values, width=1, color='r', alpha=0.5)
plt.show()

# Simple model for the same titles

In [None]:
test = pd.read_csv('../input/shopee-product-matching/test.csv')

In [None]:
# For every distinct title, gather the posting_ids sharing it and assign each
# of those postings the same space-separated match string.
# NOTE: every id is followed by a space, so the string ends with a trailing
# space; the later `matches_x + matches_y` concatenation uses it as separator.
check = test.groupby(['title']).count().reset_index()['title'].tolist()
a = []
b = []
for item in check:
    res = test.loc[test['title'] == item, 'posting_id'].tolist()
    ans = "".join(str(id_item) + " " for id_item in res)
    a.extend(res)
    b.extend([ans] * len(res))

In [None]:
# Table pairing each posting_id with its title-based match string.
submission1 = pd.DataFrame({'posting_id': a, 'matches': b})
submission1

In [None]:
# For every distinct image_phash, gather the posting_ids sharing it and assign
# each of those postings the same space-separated match string (no trailing
# space).
check = test.groupby(['image_phash']).count().reset_index()['image_phash'].tolist()
a = []
b = []
for item in check:
    res = test.loc[test['image_phash'] == item, 'posting_id'].tolist()
    ans = " ".join(str(id_item) for id_item in res)
    a.extend(res)
    b.extend([ans] * len(res))

In [None]:
# Table pairing each posting_id with its image_phash-based match string.
submission2 = pd.DataFrame({'posting_id': a, 'matches': b})
submission2

In [None]:
# Builds submission2 again from the current contents of `a` and `b`.
submission2 = pd.DataFrame({'posting_id': a, 'matches': b})
submission2

In [None]:
# Combine the title-based and phash-based matches of each posting.
sub = pd.merge(submission1, submission2, on='posting_id', how='inner')
# Join the two id lists with an explicit separator so the result does not
# depend on whether either input string ends with a space.
sub['list'] = sub['matches_x'].str.strip() + ' ' + sub['matches_y'].str.strip()
sub

In [None]:
# Deduplicate the ids of every row. dict.fromkeys keeps the first occurrence
# of each id in its original position, so the output order is deterministic
# (a set would not guarantee any order), and split() without an argument
# ignores repeated or stray spaces instead of producing empty ids.
final = [
    ' '.join(dict.fromkeys(match_list.split()))
    for match_list in sub['list']
]

submission = pd.DataFrame()
submission['posting_id'] = sub['posting_id']
submission['matches'] = final
submission

In [None]:
submission.to_csv('submission.csv', index=False)