In [None]:
import numpy as np
import pandas as pd
import os
import plotly.express as px
import matplotlib.pyplot as plt
import seaborn as sns
import cv2
from wordcloud import WordCloud, STOPWORDS
from PIL import Image
import random
#Text Color
from termcolor import colored

#NLP
from sklearn.feature_extraction.text import CountVectorizer

#WordCloud
from wordcloud import WordCloud, STOPWORDS

#Text Processing
import re
import nltk

# Load Dataset

### CSV Files

In [None]:
# Read the three competition CSV files (train, test, sample submission) in one pass
train, test, sample = (
    pd.read_csv(f'../input/shopee-product-matching/{csv_name}.csv')
    for csv_name in ('train', 'test', 'sample_submission')
)

### Image Files

In [None]:
#Image Folder Paths
train_jpg_directory = '../input/shopee-product-matching/train_images'
test_jpg_directory = '../input/shopee-product-matching/test_images'

In [None]:
def getImagePaths(path):
    """
    Collect the full path of every file found under a directory tree.

    parameters: path(string) - Root directory to walk (sub-directories included)
    returns: list of strings - One joined path per file, in os.walk order;
             empty when the directory has no files or does not exist
    """
    return [
        os.path.join(folder, file_name)
        for folder, _, file_names in os.walk(path)
        for file_name in file_names
    ]

In [None]:
#Get complete image paths for train and test datasets
train_images_path = getImagePaths(train_jpg_directory)
test_images_path = getImagePaths(test_jpg_directory)

# Tabular Exploration

### Dataset Head

In [None]:
train.head()

In [None]:
test.head()

In [None]:
train.info()

This shows us that there are no NaN values in the training dataset. One less thing to worry about!

### Dataset Size

In [None]:
print(f"Training Dataset Shape: {colored(train.shape, 'yellow')}")
print(f"Test Dataset Shape: {colored(test.shape, 'yellow')}")

### Column-wise Unique Values

In [None]:
# Print, per column, how many distinct values it holds
for col in train.columns:
    distinct_count = len(train[col].unique())
    print(f"{col}:{colored(str(distinct_count), 'yellow')}")

Thus, only the `posting_id` column has all-unique values. All the other columns contain duplicate values.

### Number of Images in Each Directory

In [None]:
print(f"Number of train images: {colored(len(train_images_path), 'yellow')}")
print(f"Number of test images:  {colored(len(test_images_path), 'yellow')}")

In [None]:
def display_multiple_img(images_paths, rows, cols):
    """
    Function to Display Images from Dataset on a rows x cols grid.

    Only the first rows * cols paths are drawn; any extra paths are ignored
    without being read from disk. Files that OpenCV cannot read are reported
    and their grid cell is left blank. Grid cells without an image are blank.

    parameters: images_paths(iterable of strings) - Paths of Images to be displayed
                rows(int) - No. of Rows in Output
                cols(int) - No. of Columns in Output
    """
    # squeeze=False always yields a 2-D array of axes, so 1x1 and 1xN grids
    # can be flattened the same way as larger ones
    figure, ax = plt.subplots(nrows=rows, ncols=cols, figsize=(16, 8), squeeze=False)
    cells = ax.ravel()

    # Hide every axis up front so unused or skipped cells stay empty
    for cell in cells:
        cell.set_axis_off()

    # zip stops at the shorter of the two, so surplus paths are never loaded
    for cell, image_path in zip(cells, images_paths):
        image = cv2.imread(image_path)
        if image is None:
            # cv2.imread signals an unreadable file by returning None
            print(f"Could not read image: {image_path}")
            continue
        # OpenCV loads images as BGR; matplotlib expects RGB
        cell.imshow(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))

    plt.tight_layout()
    plt.show()

### Train Images

In [None]:
# A 5x5 grid holds 25 images, so pass exactly 25 paths
display_multiple_img(train_images_path[100:125], 5, 5)

### Test Images

In [None]:
display_multiple_img(test_images_path, 1, 3)

# Image Title Exploration

### Wordcloud

In [None]:
# Build a word cloud from all training titles joined into a single string,
# ignoring the stop words shipped with the wordcloud package
stopwords = set(STOPWORDS)
wordcloud_options = dict(
    width=800,
    height=800,
    background_color='white',
    min_font_size=10,
    stopwords=stopwords,
)
wordcloud = WordCloud(**wordcloud_options).generate(' '.join(train['title']))

# Render the word cloud without axes or padding
plt.figure(figsize=(8, 8), facecolor=None)
plt.imshow(wordcloud)
plt.axis("off")
plt.tight_layout(pad=0)

plt.show()

### Preprocessing Text

In [None]:
# Filled on first use so the stop-word corpus is requested from NLTK only once
_ENGLISH_STOPWORDS = None


def _english_stopwords():
    """Return NLTK's English stop words as a frozenset, loading them on first call."""
    global _ENGLISH_STOPWORDS
    if _ENGLISH_STOPWORDS is None:
        _ENGLISH_STOPWORDS = frozenset(nltk.corpus.stopwords.words("english"))
    return _ENGLISH_STOPWORDS


def preprocess_text(text, flg_stemm=False, flg_lemm=True):
    """
    Normalise a piece of text for exploration.

    Steps: lowercase and strip the text, delete every character that is neither
    a word character nor whitespace, split on whitespace, drop English stop
    words, then optionally stem and/or lemmatise each remaining token.

    parameters: text - Value to clean; it is converted with str() first
                flg_stemm(bool) - Apply NLTK's PorterStemmer to each token
                flg_lemm(bool) - Apply NLTK's WordNetLemmatizer to each token
    returns: (string) - Remaining tokens joined by single spaces
             (empty string when no token is left)
    """
    stop_words = _english_stopwords()

    ## clean (convert to lowercase and remove punctuations and characters and then strip)
    text = re.sub(r'[^\w\s]', '', str(text).lower().strip())

    ## Tokenize (convert from string to list) and remove Stopwords
    lst_text = [word for word in text.split() if word not in stop_words]

    ## Stemming (remove -ing, -ly, ...)
    if flg_stemm:
        ps = nltk.stem.porter.PorterStemmer()
        lst_text = [ps.stem(word) for word in lst_text]

    ## Lemmatisation (convert the word into root word)
    if flg_lemm:
        lem = nltk.stem.wordnet.WordNetLemmatizer()
        lst_text = [lem.lemmatize(word) for word in lst_text]

    ## back to string from list
    return " ".join(lst_text)

In [None]:
#Clean Title (lemmatise only, no stemming)
train["clean_title"] = train["title"].apply(preprocess_text, flg_stemm=False, flg_lemm=True)

In [None]:
#Length of Title (all characters, spaces included)
train['clean_title_len'] = train['clean_title'].apply(len)

#Word Count (pieces obtained by splitting on single spaces)
train['clean_title_word_count'] = [
    len(str(title).split(" ")) for title in train['clean_title']
]

#Character Count (characters inside words only, separating spaces excluded)
train['clean_title_char_count'] = [
    sum(map(len, str(title).split(" "))) for title in train['clean_title']
]

#Average Word Length
train['clean_title_avg_word_length'] = train['clean_title_char_count'].div(
    train['clean_title_word_count']
)

In [None]:
train.head()

### Distribution Plots

In [None]:
def plot_distribution(x, title, data=None):
    """
    Function to Plot an 800x500 Plotly histogram of one column.

    parameters: x(string) - Name of the column to plot
                title(string) - Title of the figure
                data(DataFrame) - Frame holding the column; defaults to the
                                  notebook-level `train` frame when omitted
    """
    # Resolve the default at call time so the current `train` frame is used
    if data is None:
        data = train

    fig = px.histogram(
        data,
        x=x,
        width=800,
        height=500,
        title=title,
    )

    fig.show()

In [None]:
plot_distribution(x = 'clean_title_len', title = 'Title Length Distribution')

In [None]:
plot_distribution(x = 'clean_title_word_count', title = 'Word Count Distribution')

In [None]:
plot_distribution(x = 'clean_title_char_count', title = 'Character Count Distribution')

In [None]:
plot_distribution(x = 'clean_title_avg_word_length', title = 'Average Word Length Distribution')

# Label Group Exploration

## Display Duplicated Items from Train Data
Using the column label_group which is the ground truth, we can display examples of duplicated items.

In [None]:
# Number of rows per label group
groups = train.label_group.value_counts()

# Line plot of every label group's size, in value_counts order
fig, ax = plt.subplots(figsize=(20, 5))
ax.plot(np.arange(len(groups)), groups.values)
ax.set_ylabel('Duplicate Count', size=14)
ax.set_xlabel('Index of Unique Item', size=14)
ax.set_title('Duplicate Count vs. Unique Item Count', size=16)
plt.show()

# Bar plot of the first 50 label groups; ids are cast to strings for the x axis
fig, ax = plt.subplots(figsize=(20, 5))
first_50_groups = groups.iloc[:50]
ax.bar(first_50_groups.index.values.astype('str'), first_50_groups.values)
plt.xticks(rotation=45)
ax.set_ylabel('Duplicate Count', size=14)
ax.set_xlabel('Label Group', size=14)
ax.set_title('Top 50 Duplicated Items', size=16)
plt.show()

In [None]:

def _plot_image_grid(image_files, img_dir, suptitle=None):
    """Draw every listed image file from img_dir on a near-square grid."""
    num = len(image_files)
    if num == 0:
        # plt.subplots cannot build a 0x0 grid, so report and stop instead
        print("No matching images to plot.")
        return

    # Smallest near-square grid that fits all images; for a perfect-square
    # count this is the same k x k grid as before
    n_cols = int(np.ceil(np.sqrt(num)))
    n_rows = int(np.ceil(num / n_cols))

    # squeeze=False keeps `ax` two-dimensional even for a 1x1 grid
    fig, ax = plt.subplots(nrows=n_rows, ncols=n_cols, figsize=(10, 10), squeeze=False)
    if suptitle is not None:
        fig.suptitle(suptitle)

    cells = ax.ravel()
    for cell in cells:
        cell.axis('off')

    for cell, image_file in zip(cells, image_files):
        img = cv2.imread(os.path.join(img_dir, image_file))
        if img is None:
            # cv2.imread returns None for a file it cannot read; leave the cell blank
            print(f"Could not read image: {image_file}")
            continue
        # Reverse the channel axis: OpenCV loads BGR, matplotlib expects RGB
        cell.imshow(img[:, :, ::-1])

    plt.show()


def plot_from_label(group):
    """
    Plot all train images whose `label_group` equals the given value.

    parameters: group - Value compared against train['label_group']
    """
    IMG_PATHS = "../input/shopee-product-matching/train_images/"
    image_list = train[train['label_group'] == group]['image'].tolist()
    _plot_image_grid(image_list, IMG_PATHS)


def plot_from_title(title):
    """
    Plot all train images whose `title` equals the given string exactly.

    parameters: title(string) - Value compared against train['title'];
                                also shown as the figure title
    """
    IMG_PATHS = "../input/shopee-product-matching/train_images/"
    image_list = train[train['title'] == title]['image'].tolist()
    _plot_image_grid(image_list, IMG_PATHS, suptitle=f"Product Name: {title}")

In [None]:
# Plotting Products based on Image Label Group
plot_from_label(994676122)

In [None]:
# Product Images with Same Name
plot_from_title("Monde Boromon Cookies 1 tahun+ 120gr")

### Count of Unique Label Groups

In [None]:
print(f"No. of Unique Label Groups: {colored(train.label_group.nunique(), 'yellow')}")

### Image Label Groups by No. of Images

In [None]:
# Count rows per label group once and keep the 15 largest groups.
# (The `top10_*` names are kept unchanged although they hold 15 entries.)
label_group_counts = train['label_group'].value_counts().head(15)
top10_names = label_group_counts.index.tolist()
top10_values = label_group_counts.tolist()
plt.style.use('dark_background')

plt.figure(figsize=(20, 10))
# The label ids are numeric; pass `order` explicitly so the bars follow the
# count ranking instead of any ordering seaborn derives from the ids
sns.barplot(x=top10_names, y=top10_values, order=top10_names)
plt.xticks(rotation=45)
plt.xlabel("Label Group")
plt.ylabel("Image Count")
plt.title("Top-15 Label Groups by Image Count")
plt.show()

## Basic Image Exploration

Just look at image dimensions, confirm it's 3 band (RGB), byte scaled (0-255).

In [None]:
first = cv2.imread(train_images_path[0])
dims = np.shape(first)
print(dims)

In [None]:
np.min(first), np.max(first)

# Compute Baseline CV Score

A baseline is to predict all items with the same image_phash as being duplicates. Let's calculate the CV score for this submission.

In [None]:
# Ground truth: for each row, the posting_ids that share its label_group
tmp = train.groupby('label_group')['posting_id'].unique().to_dict()
train['target'] = train['label_group'].map(tmp)

# Baseline prediction: for each row, the posting_ids that share its image_phash
tmp = train.groupby('image_phash')['posting_id'].unique().to_dict()
train['oof'] = train['image_phash'].map(tmp)

def getMetric(col):
    """
    Build a row-wise scoring function for the predictions stored under `col`.

    parameters: col(string) - Key of the row entry holding the predicted ids
    returns: function taking a row (with a `target` attribute and a `col` entry)
             and returning 2 * |target ∩ prediction| / (|target| + |prediction|)
    """
    def f1score(row):
        truth = row.target
        predicted = row[col]
        # intersect1d returns the unique ids present in both collections
        shared = len(np.intersect1d(truth, predicted))
        return 2 * shared / (len(truth) + len(predicted))
    return f1score

train['f1'] = train.apply(getMetric('oof'),axis=1)
print('CV score for baseline =', train.f1.mean())