In this notebook, I will try to correct the labels in the training dataset.

# Preparation
import package
set data_path

In [None]:
# Root folder of the competition data; train.csv and train_images/ are read from here.
DATA_PATH = '../input/shopee-product-matching/'
import numpy as np
import pandas as pd
import cv2
import matplotlib.pyplot as plt
from tqdm import tqdm_notebook

In [None]:
# Load the training table and prefix the image column with the
# train_images/ directory so each entry becomes a path usable by cv2.imread.
train = pd.read_csv(DATA_PATH + 'train.csv')
train['image'] = DATA_PATH + 'train_images/' + train['image']
train.head()

In [None]:
# Print the column names, dtypes and non-null counts of the training table.
train.info()

In [None]:
# Build a label_group -> array of unique posting_ids lookup, then attach to
# every row the posting_ids that share its label_group (the row's own id included).
tmp = train.groupby('label_group').posting_id.agg('unique').to_dict()
train['target'] = train.label_group.map(tmp)
train.head()

# Analyzing

## Check how many labels are in the label_group column

In [None]:
# Number of distinct label groups in the training table.
len(train.label_group.unique())

## Check how many items are in each label group

In [None]:
# Number of postings per label group, largest groups first.
train_label = train.groupby('label_group').posting_id.count().sort_values(ascending=False)
train_label

In [None]:
# Bucket the group sizes into (0, 10], (10, 20], (20, 30], (30, 40], (40, 51]
# and count how many label groups fall in each bucket.
# NOTE(review): sizes above 51 would fall outside every bin and be dropped
# from the counts - confirm 51 is the largest group size.
pd.cut(train_label, bins=[0, 10, 20, 30, 40, 51]).value_counts()

As we can see here, most label groups contain 10 pictures or fewer. This may affect the training process because of the lack of samples per group.

## Check whether the pictures are similar in each label_group

In [None]:
def findByLabel(label_group_idx, figscale=2):
    """Show every image of one label group in a grid and return its rows.

    Rows are taken from the global ``train`` frame. Images are laid out in
    rows of at most five and are drawn in reverse row order (last matching
    row first), which is the order this function has always used.

    Args:
        label_group_idx: value matched against ``train.label_group``.
        figscale: side length, in inches, given to each grid cell.

    Returns:
        The matching rows of ``train`` after ``reset_index()`` (the previous
        index is kept as an ``index`` column). When nothing matches, the
        empty frame is returned and no figure is created.
    """
    train_a = train[train.label_group == label_group_idx].reset_index()
    count = len(train_a)
    if count == 0:
        # plt.subplots cannot build a grid with zero columns.
        return train_a

    col = min(count, 5)
    row = int(np.ceil(count / col))
    # squeeze=False always returns a 2-D array of axes, so a single image
    # (1x1 grid) is indexed the same way as any larger grid.
    fig, ax = plt.subplots(row, col, figsize=(col*figscale, row*figscale),
                           squeeze=False)
    cells = ax.ravel()

    for cell, path in zip(cells, train_a.image.iloc[::-1]):
        # cv2.imread yields BGR channel order while matplotlib expects RGB;
        # without the conversion red and blue are swapped on screen.
        cell.imshow(cv2.cvtColor(cv2.imread(path), cv2.COLOR_BGR2RGB))
        cell.set_xticks([])
        cell.set_yticks([])
    for cell in cells[count:]:
        # Hide the grid cells left over when count is not a multiple of col.
        cell.axis('off')

    fig.text(0.1, 0.95, 'label_group: {}'.format(label_group_idx))
    return train_a

## groupby image_phash
Count the pictures that share the same image_phash.

In [None]:
# Number of postings per image_phash, most frequently repeated hashes first.
train.groupby('image_phash').posting_id.count().sort_values(ascending=False)

## Check how many images are labelled wrongly

In [None]:
def countHashinGroups(train):
    """Find image hashes that appear under more than one label group.

    Args:
        train: DataFrame with ``image_phash`` and ``label_group`` columns.

    Returns:
        A ``(hash_li, count_li)`` pair of equally long lists: the hashes that
        occur in more than one label group (in order of first appearance in
        ``train``) and, for each, the number of distinct label groups it
        occurs in.

    Side effects:
        Prints the share and the absolute number of such hashes among all
        unique hashes.
    """
    total = len(train.image_phash.unique())

    # Keep one row per distinct (hash, label) pair and count the pairs per
    # hash: a single pass instead of one boolean mask per unique hash.
    # sort=False keeps the hashes in order of first appearance.
    labels_per_hash = (
        train[['image_phash', 'label_group']]
        .drop_duplicates()
        .groupby('image_phash', sort=False)
        .size()
    )
    conflicting = labels_per_hash[labels_per_hash > 1]
    hash_li = conflicting.index.tolist()  # hashes that are labelled inconsistently
    count_li = conflicting.tolist()       # number of label groups per such hash
    count = len(hash_li)

    # Guard the percentage against an empty frame (no hashes at all).
    share = 100 * count / total if total else 0.0
    # The threshold is "> 1", so the message says "more than 1 group".
    print('{:.2f}% of the image are labelled in more than 1 group'.format(share))
    print('{} out of {} are labelled in more than 1 group'.format(count, total))
    return hash_li, count_li

In [None]:
# Hashes in train that occur under more than one label_group, and how many
# label groups each of them spans.
hash_li, count_li = countHashinGroups(train)

In [None]:
# One bar per conflicting hash; the bar height is the number of label groups
# that hash appears in.
plt.figure(figsize=(20, 5))
plt.bar(range(len(count_li)), count_li, alpha=0.4)
plt.show()

In [None]:
def showSameHash(phash):
    """Draw one image grid per label group that contains the given hash.

    Looks up the rows of the global ``train`` whose ``image_phash`` equals
    ``phash``, plots each distinct label group found there with
    ``findByLabel``, and finally shows all figures.
    """
    matching_labels = train.loc[train.image_phash == phash, 'label_group'].unique()
    for group_id in matching_labels:
        findByLabel(group_id)
    plt.show()

Let's pick one of these hashes and take a closer look.

In [None]:
# Rows of one conflicting hash, picked by position in hash_li.
# NOTE(review): index 19 looks like an arbitrary example; it requires hash_li
# to hold at least 20 entries.
df = train[train.image_phash==hash_li[19]]
df

# Correct the wrong labels and re-label them

In [None]:
# Load an additional CSV of rows to be combined with train.
# NOTE(review): its schema is not visible here; the later cells assume it has
# at least image_phash and label_group columns - confirm.
morePicsPath = '../input/shopeemorepicscsv/trainMorePics_in kaggle.csv'
train2 = pd.read_csv(morePicsPath)
train2.head()

In [None]:
# Stack the original and the additional rows into one frame with a fresh
# 0..n-1 index.
# NOTE(review): train carries the derived 'target' column; rows from train2
# get NaN there unless train2 has that column too - confirm.
data = pd.concat([train, train2]).reset_index(drop=True)
data

In [None]:
# Number of rows per label group in the combined frame, largest first.
data.label_group.value_counts()

In [None]:
# Work on a copy so that data keeps the original labels; new_label_group
# starts out equal to label_group and receives the corrected labels.
new_data = data.copy()
new_data['new_label_group'] = new_data.label_group
new_data.head(2)

In [None]:
# Silence every warning for the rest of the notebook.
# NOTE(review): this blanket filter hides all warning categories, including
# pandas assignment warnings; consider narrowing it to a specific category.
import warnings
warnings.filterwarnings('ignore')

In [None]:
def makeOneLabel(train, phash):
    '''
    Merge all label groups that share one image hash into a single label.

    input: dataset, phash
        dataset needs the columns image_phash, label_group and
        new_label_group. Among the rows with the given phash, the most
        frequent label_group is chosen; every row of the dataset whose
        label_group is one of the labels seen for that phash then gets this
        label written to new_label_group.
    output: None
        the dataset is modified in place; a summary line is printed.
        If no row has the given phash, nothing is changed or printed.

    NOTE(review): rows are selected by the original label_group, so a later
    call for another hash can overwrite the new_label_group written here;
    chains of overlapping groups are not merged transitively.
    '''
    if 'new_label_group' not in train.columns:
        # A .loc assignment would silently create the column; keep the KeyError.
        raise KeyError('new_label_group')

    # Labels seen for this hash, most frequent first (computed once).
    label_counts = train.loc[train.image_phash == phash, 'label_group'].value_counts(ascending=False)
    if label_counts.empty:
        return
    allLabels = label_counts.index.tolist()
    label = label_counts.index[0]

    # A single label-based .loc write: no chained assignment (which may write
    # to a temporary copy) and no dependence on the index being 0..n-1.
    train.loc[train.label_group.isin(allLabels), 'new_label_group'] = label
    print('all label: {} to new label: {} \n'.format(allLabels, label))

In [None]:
# Apply the relabelling once for every conflicting hash.
# NOTE(review): hash_li was computed from train only, while new_data also
# contains the rows of train2; conflicts that exist only among the extra rows
# are not covered - confirm this is intended.
for phash in hash_li:
    makeOneLabel(new_data, phash)

In [None]:
# List the columns of the relabelled frame.
new_data.columns

In [None]:
# columns = ['image', 'label_group', 'new_label_group']
# new_data[columns].info()

In [None]:
# Save the relabelled frame to the working directory without the row index.
new_data.to_csv('./trainMorePics_labelCorrection.csv', index=False)

In [None]:
# Rows that ended up with one specific corrected label.
# NOTE(review): 3781511357 is presumably a hand-picked example id.
new_data[new_data.new_label_group == 3781511357]

In [None]:
# Number of distinct labels before and after the correction.
len(data.label_group.unique()),len(new_data.new_label_group.unique())