In [None]:
import numpy as np
import pandas as pd
import os
import plotly.express as px
import matplotlib.pyplot as plt
import seaborn as sns
import cv2

# Helper Functions

In [None]:
def get_image_paths(path:str) -> list:
    """Collect the full path of every file found beneath a directory.

    The directory tree is traversed with ``os.walk``, so files located in
    nested sub-directories are included as well.

    Args:
        path (string): Root directory to search

    Returns:
         list: Full paths (containing directory joined with the file name)
    """
    return [
        os.path.join(root, name)
        for root, _subdirs, names in os.walk(path)
        for name in names
    ]

When the image file is read with the **OpenCV** function **imread()**, the order of colors is **BGR** (blue, green, red). On the other hand, in **Pillow**, the order of colors is assumed to be **RGB** (red, green, blue).

**Therefore, if you want to use both Pillow functions and OpenCV functions, you need to convert between BGR and RGB.**

You can use the OpenCV function `cvtColor()` or simply reverse the channel axis of the ndarray (e.g. `image[:, :, ::-1]`).

In [None]:
def display_multiple_images(images_paths: list, rows:int, cols:int, figsize: tuple = (16, 8)):
    """
    Function to Display Images from Dataset in a rows x cols grid.

    Each image is read with OpenCV (BGR channel order) and converted to RGB
    before being drawn with matplotlib. Grid cells that receive no image are
    left blank. Paths beyond the ``rows * cols`` capacity of the grid are not
    drawn, and unreadable files are skipped; both cases emit a warning.

    Args:
        images_paths (list): Paths of Images to be displayed
        rows (int): No. of Rows in Output
        cols (int): No. of Columns in Output
        figsize (tuple): Figure size (width, height) in inches passed to
            ``plt.subplots``. Defaults to (16, 8).
    """
    import warnings

    # squeeze=False makes plt.subplots always return a 2-D array of Axes, so
    # a 1x1 grid (which would otherwise yield a bare Axes object) also works.
    _, grid = plt.subplots(nrows=rows, ncols=cols, figsize=figsize, squeeze=False)
    axes = grid.ravel()

    # Hide ticks/frames on every cell up front so unused cells stay blank.
    for axis in axes:
        axis.set_axis_off()

    if len(images_paths) > len(axes):
        warnings.warn(
            f"Got {len(images_paths)} images for a {rows}x{cols} grid; "
            f"only the first {len(axes)} are displayed."
        )

    # zip stops at the shorter sequence, so surplus paths are never indexed.
    for axis, image_path in zip(axes, images_paths):
        image = cv2.imread(image_path)
        if image is None:
            # cv2.imread signals a missing/unreadable file by returning None
            # rather than raising; skip it and leave the cell empty.
            warnings.warn(f"Could not read image: {image_path}")
            continue
        axis.imshow(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))

    plt.tight_layout()
    plt.show()


# Load the Dataset

In [None]:
# Root directory of the competition data; the paths below are derived from it
# so the location only has to be changed in one place.
DATA_DIR = '../input/shopee-product-matching'

# Training table read from train.csv.
train_df = pd.read_csv(f'{DATA_DIR}/train.csv')
# Full paths of every file found under the training image directory.
train_images_path = get_image_paths(f'{DATA_DIR}/train_images')

In [None]:
# Dimensions of the training table as (rows, columns).
train_df.shape

In [None]:
# Preview the first rows of the training table.
train_df.head()

# Exploratory Data Analysis

In [None]:
# Shape of the array of distinct label_group values, i.e. (number of distinct groups,).
train_df['label_group'].unique().shape

## Number of postings per product

The `label_group` column is what ties postings to a product:

1. **label_group:** ID code for all postings that map to the same product. Not provided for the test set.



In [None]:
# Count the postings in each label_group and order the groups largest first.
group_size_df = (
    train_df
    .groupby('label_group')
    .size()
    .reset_index(name='group_size')
    .sort_values(by='group_size', ascending=False)
)
group_size_df.head(10)

A **histogram** is a graphical representation commonly used to visualize the distribution of numerical data. When exploring a dataset, you'll often want to get a quick understanding of the distribution of certain numerical variables within it. You can do this by using a histogram. A histogram divides the values within a numerical variable into “bins”, and counts the number of observations that fall into each bin. By visualizing these binned counts in a columnar fashion, we can obtain a very immediate and intuitive sense of the distribution of values within a variable.



In [None]:
# Distribution of group sizes: how many label groups have each number of postings.
# A title and a descriptive x-axis label let the figure stand on its own.
fig = px.histogram(
    group_size_df,
    x="group_size",
    title="Number of postings per product (label_group)",
    labels={"group_size": "Postings per label_group"},
)
fig.show()

Get a random product with a given number of postings (9 and 16 in the cells below) and visualize the postings' images.

## Get some samples of images belonging to the same product

In [None]:
# Draw one random label_group that has exactly 9 postings.
# No random_state is passed, so the row shown can differ between runs.
group_size_df.loc[group_size_df['group_size'].eq(9)].sample(n=1)

In [None]:
# Values of the 'image' column for every row in label_group 373674159.
sample_images = train_df.loc[train_df['label_group'] == 373674159, 'image'].tolist()

In [None]:
# Prefix each sampled image name with the training image directory.
full_path_sample_images = [
    os.path.join('../input/shopee-product-matching/train_images', img)
    for img in sample_images
]

full_path_sample_images

In [None]:
# Show the sampled postings' images in a 3x3 grid.
display_multiple_images(full_path_sample_images, rows=3, cols=3)


In [None]:
# Draw one random label_group that has exactly 16 postings.
# No random_state is passed, so the row shown can differ between runs.
group_size_df.loc[group_size_df['group_size'].eq(16)].sample(n=1)

In [None]:
# Values of the 'image' column for every row in label_group 4166375360.
sample_images = train_df.loc[train_df['label_group'] == 4166375360, 'image'].tolist()

# Prefix each sampled image name with the training image directory.
full_path_sample_images = [
    os.path.join('../input/shopee-product-matching/train_images', img)
    for img in sample_images
]

In [None]:
# Show the sampled postings' images in a 4x4 grid.
display_multiple_images(full_path_sample_images, rows=4, cols=4)
