In [None]:
 !pip install -q rich

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
import cv2
import glob
import random

from rich import print as _pprint

In [None]:
def cprint(string):
    """Print `string` through rich, wrapped in a ``[black]`` markup tag.

    Markup tags already present inside `string` are passed along unchanged.
    """
    markup = f"[black]{string}[/black]"
    _pprint(markup)

In [None]:
# Read train.csv from the competition input folder into a DataFrame.
train_file = pd.read_csv(
    '../input/shopee-product-matching/train.csv'
)

In [None]:
# Preview the first five rows of the training table.
train_file.head(n=5)

In [None]:
# Read test.csv from the competition input folder into a DataFrame.
test_file = pd.read_csv(
    '../input/shopee-product-matching/test.csv'
)

In [None]:
# Preview the first five rows of the test table.
test_file.head(n=5)

In [None]:
# Read sample_submission.csv from the competition input folder into a DataFrame.
sample_sub = pd.read_csv(
    '../input/shopee-product-matching/sample_submission.csv'
)

In [None]:
# Preview the first five rows of the sample submission.
sample_sub.head(n=5)

In [None]:
# Collect the paths of every .jpg file in the train and test image folders.
# The generator is unpacked in order: first the train matches, then the test matches.
total_train_files, total_test_files = (
    glob.glob(pattern)
    for pattern in (
        "../input/shopee-product-matching/train_images/*.jpg",
        "../input/shopee-product-matching/test_images/*.jpg",
    )
)

# Report how many files each pattern matched.
cprint(
    f"[green]Total Training Images: {len(total_train_files)}[/green]"
)
cprint(
    f"[red]Total Testing Images: {len(total_test_files)}[/red]"
)

In [None]:
def plot(num):
    """Display `num` randomly chosen training images in a square grid.

    Each image is titled with its file name. Files that OpenCV cannot decode
    are left as a blank (titled) cell instead of aborting the whole figure.

    Parameters
    ----------
    num : int
        How many images to show. Must be a positive perfect square
        (1, 4, 9, 16, ...) so the images exactly fill the grid.

    Raises
    ------
    ValueError
        If `num` is smaller than 1, or larger than the number of files in
        the training image folder.
    AssertionError
        If `num` is not a perfect square.
    """
    IMG_PATHS = "../input/shopee-product-matching/train_images/"
    if num < 1:
        raise ValueError("Number of Images must be at least 1!")

    sq_num = int(round(float(np.sqrt(num))))
    assert sq_num * sq_num == num, "Number of Images must be a perfect Square!"
    num = sq_num * sq_num  # normalise to a plain int for slicing below

    image_ids = os.listdir(IMG_PATHS)
    if num > len(image_ids):
        # Fail before any figure is created rather than with an IndexError mid-plot.
        raise ValueError(
            f"Requested {num} images but only {len(image_ids)} files are available"
        )
    random.shuffle(image_ids)

    # squeeze=False keeps `ax` two-dimensional even for a 1x1 grid, so
    # ax[row, col] indexing is valid for every grid size (including plot(1)).
    fig, ax = plt.subplots(
        nrows=sq_num, ncols=sq_num, figsize=(10, 10), squeeze=False
    )

    for idx, image_id in enumerate(image_ids[:num]):
        cell = ax[idx // sq_num, idx % sq_num]
        cell.axis('off')
        cell.set_title(f'{image_id}', fontsize=6.5)

        img = cv2.imread(os.path.join(IMG_PATHS, image_id))
        if img is None:
            # cv2.imread signals an unreadable file by returning None, not by raising.
            continue
        # OpenCV loads channels as BGR; reverse the last axis to get RGB for matplotlib.
        cell.imshow(img[:, :, ::-1])

    plt.show()
    
def plot_from_label(group):
    """Display every training image whose `label_group` equals `group`.

    Rows are looked up in the notebook-level `train_file` DataFrame. The
    images are laid out on the smallest near-square grid that holds all of
    them; surplus cells stay empty. Files that OpenCV cannot decode are left
    as blank cells.

    Parameters
    ----------
    group
        Value compared (with ``==``) against ``train_file['label_group']``.

    Raises
    ------
    ValueError
        If no row of `train_file` has this label group.
    """
    IMG_PATHS = "../input/shopee-product-matching/train_images/"
    matching_rows = train_file[train_file['label_group'] == group]
    image_list = matching_rows['image'].tolist()
    num = len(image_list)
    if num == 0:
        raise ValueError(f"No training images found for label group {group!r}")

    # Round the grid side *up* so that no image is dropped when the count is
    # not a perfect square; then use only as many rows as are needed.
    n_cols = int(np.sqrt(num))
    if n_cols * n_cols < num:
        n_cols += 1
    n_rows = -(-num // n_cols)  # ceiling division

    # squeeze=False keeps `ax` two-dimensional even for a single-cell grid.
    fig, ax = plt.subplots(
        nrows=n_rows, ncols=n_cols, figsize=(10, 10), squeeze=False
    )
    cells = ax.ravel()
    for cell in cells:
        cell.axis('off')

    for cell, image_name in zip(cells, image_list):
        img = cv2.imread(os.path.join(IMG_PATHS, image_name))
        if img is None:
            # cv2.imread signals an unreadable file by returning None, not by raising.
            continue
        # OpenCV loads channels as BGR; reverse the last axis to get RGB for matplotlib.
        cell.imshow(img[:, :, ::-1])

    plt.show()
    
def plot_from_title(title):
    """Display every training image whose `title` equals `title` exactly.

    Rows are looked up in the notebook-level `train_file` DataFrame. The
    images are laid out on the smallest near-square grid that holds all of
    them; surplus cells stay empty. The figure is captioned with the product
    title. Files that OpenCV cannot decode are left as blank cells.

    Parameters
    ----------
    title
        Value compared (with ``==``) against ``train_file['title']``.

    Raises
    ------
    ValueError
        If no row of `train_file` has this title.
    """
    IMG_PATHS = "../input/shopee-product-matching/train_images/"
    matching_rows = train_file[train_file['title'] == title]
    image_list = matching_rows['image'].tolist()
    num = len(image_list)
    if num == 0:
        raise ValueError(f"No training images found for title {title!r}")

    # Round the grid side *up* so that no image is dropped when the count is
    # not a perfect square; then use only as many rows as are needed.
    n_cols = int(np.sqrt(num))
    if n_cols * n_cols < num:
        n_cols += 1
    n_rows = -(-num // n_cols)  # ceiling division

    # squeeze=False keeps `ax` two-dimensional even for a single-cell grid.
    fig, ax = plt.subplots(
        nrows=n_rows, ncols=n_cols, figsize=(10, 10), squeeze=False
    )
    fig.suptitle(f"Product Name: {title}")
    cells = ax.ravel()
    for cell in cells:
        cell.axis('off')

    for cell, image_name in zip(cells, image_list):
        img = cv2.imread(os.path.join(IMG_PATHS, image_name))
        if img is None:
            # cv2.imread signals an unreadable file by returning None, not by raising.
            continue
        # OpenCV loads channels as BGR; reverse the last axis to get RGB for matplotlib.
        cell.imshow(img[:, :, ::-1])

    plt.show()

In [None]:
# Show 16 random training images as a 4x4 grid.
plot(num=16)

In [None]:
# Show images from label group 1141798720.
plot_from_label(group=1141798720)

In [None]:
# Show images from label group 994676122.
plot_from_label(group=994676122)

In [None]:
# Show images of the listings that carry exactly this title.
plot_from_title(title="Koko syubbanul muslimin koko azzahir koko baju")

In [None]:
# Show images of the listings that carry exactly this title.
plot_from_title(title="Monde Boromon Cookies 1 tahun+ 120gr")

In [None]:
sns.set_palette("tab20")

# Count images per label group once (value_counts sorts by descending count)
# and keep the 15 largest groups.
# NOTE: the `top10_*` names are kept unchanged for compatibility, but they
# hold 15 entries, matching the "Top-15" chart title below.
label_group_counts = train_file['label_group'].value_counts().head(15)
top10_names = label_group_counts.index.tolist()
top10_values = label_group_counts.tolist()

plt.figure(figsize=(10, 10))
# seaborn sorts numeric category values by magnitude unless `order` is given;
# passing it explicitly keeps the bars in descending-count order.
sns.barplot(x=top10_names, y=top10_values, order=top10_names)
plt.xticks(rotation=45)
plt.xlabel("Label Group")
plt.ylabel("Image Count")
plt.title("Top-15 Label Groups by Image Count")
plt.show()

In [None]:
# Take the five most frequent product titles (value_counts sorts by
# descending frequency) and draw an image grid for each one.
top5_products = train_file['title'].value_counts().head(5).index.tolist()
for title in top5_products:
    plot_from_title(title=title)