In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file found,
# one path per line.
for dirname, _, filenames in os.walk('/kaggle/input'):
    file_paths = [os.path.join(dirname, filename) for filename in filenames]
    if file_paths:
        # One print per directory; the emitted text is the same as printing
        # each path separately.
        print('\n'.join(file_paths))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import cv2, matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow.keras.applications import EfficientNetB0
# Print the version of the TensorFlow build imported above.
print('TF',tf.__version__)


In [None]:
# Restrict TensorFlow to LIMIT GB of RAM on the first GPU so that the rest of
# the card stays available for RAPIDS cuML KNN.
LIMIT = 12          # GB of GPU RAM TensorFlow is allowed to use
GPU_TOTAL_GB = 16   # GB of RAM assumed to be on the GPU; adjust for other hardware

gpus = tf.config.experimental.list_physical_devices('GPU')
limit_applied = False
if gpus:
  try:
    # memory_limit is passed as 1024*LIMIT, i.e. LIMIT GB expressed in MB.
    tf.config.experimental.set_virtual_device_configuration(
        gpus[0],
        [tf.config.experimental.VirtualDeviceConfiguration(memory_limit=1024*LIMIT)])
    limit_applied = True
    logical_gpus = tf.config.experimental.list_logical_devices('GPU')
    print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPUs")
  except RuntimeError as e:
    # NOTE(review): TensorFlow presumably raises this when the GPU was already
    # initialised before this cell ran - confirm against the TF docs.
    print(e)

# Only claim the restriction when it was actually put in place; previously the
# messages were printed even without a GPU or after a failed configuration.
if limit_applied:
  print('Restrict TensorFlow to max %iGB GPU RAM'%LIMIT)
  print('so RAPIDS can use %iGB GPU RAM'%(GPU_TOTAL_GB-LIMIT))
elif gpus:
  print('TensorFlow GPU RAM limit was NOT applied (see error above)')
else:
  print('No GPU found; TensorFlow GPU RAM limit was not applied')

In [None]:
# Load the training metadata CSV into `train`, print its (rows, columns)
# shape, and show the first rows as the cell output.
train = pd.read_csv('../input/shopee-product-matching/train.csv')
print(train.shape)
train.head()

In [None]:
# Load the test metadata CSV into `test`, print its (rows, columns) shape,
# and show the first rows as the cell output.
test = pd.read_csv('../input/shopee-product-matching/test.csv')
print(test.shape)
test.head()

In [None]:
# Directory prefix of the training images; used as the default `path` below.
BASE = '../input/shopee-product-matching/train_images/'

def _wrap_title(title, width=20):
    """Return `title` with a newline inserted after every character whose
    index is a non-zero multiple of `width`, so captions fit above a tile."""
    wrapped = ""
    for pos, ch in enumerate(title):
        wrapped += ch
        if pos != 0 and pos % width == 0:
            wrapped += '\n'
    return wrapped

def displayDF(train, random=False, COLS=6, ROWS=4, path=BASE):
    """Plot up to ROWS figures, each holding COLS product images with titles.

    Parameters
    ----------
    train : DataFrame
        Rows to display. The image file name is read from the column at
        position 1 and the title from the column at position 3 (via `iloc`).
    random : bool
        If True, every tile shows a row drawn with `np.random.randint`
        (with replacement, unseeded). If False, rows are shown in order:
        tile `i` of figure `r` shows row `COLS*r + i`.
    COLS, ROWS : int
        Tiles per figure and number of figures.
    path : str
        Prefix concatenated with the file name to locate each image.

    In sequential mode the display stops once the frame is exhausted, and an
    empty frame draws nothing. An image that cannot be read leaves its tile
    blank (title only) instead of raising.
    """
    n_items = len(train)
    if n_items == 0:
        return
    # The row counter has its own name: the title-wrapping loop used to reuse
    # `k` and overwrite it, corrupting COLS*k + i for every tile after the first.
    for grid_row in range(ROWS):
        if not random and COLS*grid_row >= n_items:
            break  # no rows left for another figure
        plt.figure(figsize=(20,5))
        for i in range(COLS):
            if random:
                row = np.random.randint(0, n_items)
            else:
                row = COLS*grid_row + i
                if row >= n_items:
                    break  # last figure is only partially filled
            name = train.iloc[row,1]
            title = train.iloc[row,3]
            img = cv2.imread(path+name)
            plt.subplot(1,COLS,i+1)
            plt.title(_wrap_title(title))
            plt.axis('off')
            if img is not None:
                # OpenCV loads images as BGR while matplotlib expects RGB;
                # convert so colours are not swapped (as display_image does).
                plt.imshow(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))
        plt.show()
        
# Preview randomly chosen training rows using displayDF's default grid size.
# The rows come from np.random without a seed, so they differ between runs.
displayDF(train,random=True)

In [None]:
# Count how many rows fall into each label_group and show the tally.
gp = train.label_group.value_counts()
gp

In [None]:
# For the first five entries of `gp`, print the group id and display a random
# 4x6 grid of images from that group.
for k in range(5):
    group_id = gp.index[k]
    print('TOP %i DUPLICATED ITEM:'%(k+1), group_id)
    top = train[train['label_group'] == group_id]
    displayDF(top, random=True, ROWS=4, COLS=6)

In [None]:
# Number of distinct label groups in the training data.
train['label_group'].nunique()

In [None]:
# Tally rows per label group, then keep the groups whose tally equals the
# largest / smallest tally.
label_col = train['label_group']
train_labels_count = label_col.value_counts()
is_largest = train_labels_count == train_labels_count.max()
is_smallest = train_labels_count == train_labels_count.min()
most_freq = train_labels_count[is_largest]
less_freq = train_labels_count[is_smallest]

# Sorted unique label ids belonging to those extreme groups.
max_label = np.unique(label_col[label_col.isin(most_freq.index)].values)
lower_label = np.unique(label_col[label_col.isin(less_freq.index)].values)

print("Most frequency of the label group: ", max_label)
print("Less frequency of the label group: ", lower_label)

In [None]:
def display_image(images_paths, rows, cols):
    """Show the given images in a rows x cols grid.

    Parameters
    ----------
    images_paths : iterable of str
        Image file paths, placed into the grid in order.
    rows, cols : int
        Grid dimensions. Images beyond rows*cols are not shown.

    A path that cannot be read is reported with a printed message and its
    grid cell is left blank; every cell has its axis frame hidden.
    """
    # squeeze=False always yields a 2-D array of Axes, so flattening also
    # works for a 1x1 grid (where subplots would otherwise return one Axes).
    fig, axes = plt.subplots(nrows=rows, ncols=cols, figsize=(16,8), squeeze=False)
    flat_axes = axes.ravel()
    for ax in flat_axes:
        ax.set_axis_off()
    # zip stops at the shorter input, so surplus images are simply skipped
    # instead of relying on a swallowed IndexError.
    for ax, image_path in zip(flat_axes, images_paths):
        image = cv2.imread(image_path)
        if image is None:
            print('Could not read image:', image_path)
            continue
        # OpenCV loads BGR; matplotlib expects RGB.
        ax.imshow(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))
    plt.tight_layout()
    plt.show()

In [None]:
def path(group,m):
    """Return full paths of the training images selected by `group`.

    Parameters
    ----------
    group
        Value to match: a `label_group` value when m == 'l', a `title`
        value when m == 't'.
    m : str
        Selection mode, 'l' (match on label_group) or 't' (match on title).

    Returns
    -------
    list of str
        One path per matching row of the module-level `train` DataFrame,
        built by joining the image directory with the row's `image` value.

    Raises
    ------
    ValueError
        If `m` is neither 'l' nor 't'.
    """
    PATH = "../input/shopee-product-matching/train_images/"

    if m=='l':
        column = 'label_group'
    elif m=='t':
        column = 'title'
    else:
        # Previously an unknown mode fell through and failed later with an
        # UnboundLocalError on `z`; fail early with a clear message instead.
        raise ValueError("m must be 'l' (label_group) or 't' (title), got %r" % (m,))

    z = train['image'][train[column]==group].values
    return [os.path.join(PATH, filename) for filename in z]

In [None]:
# Read train.csv into `train_gf`, print its shape, and preview the first rows.
# NOTE(review): this is the same file already loaded into `train` earlier, and
# `train_gf` is not referenced by any other visible cell - confirm it is needed.
train_gf = pd.read_csv('../input/shopee-product-matching/train.csv')
print('train shape is', train_gf.shape )
train_gf.head()

In [None]:
# Show the images of label group 159351600 in a 3x3 grid.
display_image(path(159351600, 'l'), rows=3, cols=3)

In [None]:
# Show the images of label group 297977 stacked in a 2x1 grid.
display_image(path(297977, 'l'), rows=2, cols=1)

In [None]:
# List the column labels of the training DataFrame.
train.columns

In [None]:
## Count the distinct titles, as a first check for images sharing the same title
train['title'].nunique()

In [None]:
# Build a two-column table of titles and how many rows use each one,
# ordered from the most to the least used title.
title_counts = train['title'].value_counts()
titlechecks = title_counts.sort_values(ascending=False).reset_index()
titlechecks.columns = ['title', 'count']
titlechecks

In [None]:
# Show the images whose title is exactly "Viva Air Mawar" in a 3x3 grid.
display_image(path("Viva Air Mawar", 't'), rows=3, cols=3)

In [None]:
# Checking visually similar images in different label groups
# Same images with different titles
# Same titles have different images

In [None]:
# Read test.csv into `test`, print its shape, and preview the first rows.
# NOTE(review): an earlier cell already loads the same file into `test`;
# this cell repeats that read.
test = pd.read_csv('../input/shopee-product-matching/test.csv')
print( test.shape )
test.head()

In [None]:
# Read sample_submission.csv into `submis`, print its shape, and preview the
# first rows.
submis = pd.read_csv('../input/shopee-product-matching/sample_submission.csv')
print( submis.shape )
submis.head()