In [None]:
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt
from tqdm import tqdm
import math
import random
%matplotlib inline

In [None]:
def seed_everything(seed):
    """Seed the random sources used by this notebook.

    Exports ``seed`` as the ``PYTHONHASHSEED`` environment variable, then seeds
    Python's ``random`` module and NumPy's global generator with it.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seeder in (random.seed, np.random.seed):
        seeder(seed)


# Single seed shared by every stochastic step below.
SEED = 30
seed_everything(SEED)

In [None]:
# Location of the training table; kept in one place so both frames below are
# guaranteed to come from the same file.
TRAIN_CSV = '../input/shopee-product-matching/train.csv'

# `data` is the working copy: later cells oversample it and add columns to it.
data = pd.read_csv(TRAIN_CSV)

# `original_data` stays untouched and is used later as the lookup table when
# joining each row to its partner posting. Copying the parsed frame avoids
# reading and parsing the same CSV a second time.
original_data = data.copy()

## Data Exploration

In [None]:
# Preview the first rows of the freshly loaded training table.
data.head()

In [None]:
# (rows, columns) of the training table before any oversampling.
data.shape

In [None]:
# Print the number of distinct values in every column.
# Series.nunique hashes the values instead of sorting them (as np.unique does),
# so it also works on object columns that mix strings with missing values.
# dropna=False counts a missing value as one distinct value.
for col in data.columns:
    print(col, data[col].nunique(dropna=False))

In [None]:
# Map every label_group to the number of rows it has; later cells read `vc`.
group_sizes = data['label_group'].value_counts()
vc = dict(group_sizes)

# Distinct group sizes that occur, used for the min / max report.
counts = list(set(vc.values()))
smallest_group, largest_group = min(counts), max(counts)
total_rows = sum(vc.values())
print('min = ', smallest_group, ", max = ", largest_group, ', all = ', total_rows)

In [None]:
# Histogram of group sizes, with each bar annotated by its height.
fig, ax = plt.subplots(figsize=(15, 5))
arr = ax.hist(vc.values(), bins=52)
bar_heights, bin_edges = arr[0], arr[1]
# bin_edges has one more entry than bar_heights; zip stops after the last bar,
# pairing every bar with its left edge.
for left_edge, height in zip(bin_edges, bar_heights):
    ax.text(left_edge + 0.1, height + 200, str(height), rotation=90)
ax.set_xticks(list(range(52)))
fig.tight_layout()
plt.show()

## Oversampling

In [None]:
# Oversample small label groups by appending whole copies of their rows.
#
# A group with at most `small_group_limit` rows gets ceil(max_count / size)
# extra copies of itself, so it ends up with size * (1 + copies) rows
# (e.g. 2 -> 12, 3 -> 15). Larger groups are left untouched.
max_count = 10
small_group_limit = 5

# sort=False yields the groups in order of first appearance and keeps each
# group's rows in frame order, so the appended rows come out in the same order
# as a label-by-label boolean filter would produce.
grouped = data.groupby('label_group', sort=False)

# Collect the pieces and concatenate once: DataFrame.append no longer exists in
# pandas 2.x, and growing the frame inside the loop re-copies it every time.
pieces = [data]
for label_group, current_data in tqdm(grouped, total=grouped.ngroups):
    # Measure the group directly rather than reading a previously computed
    # count, so re-running this cell does not oversample again.
    group_size = len(current_data)
    if group_size > small_group_limit:
        continue
    duplicates = math.ceil(max_count / group_size)
    pieces.extend([current_data] * duplicates)

data = pd.concat(pieces, ignore_index=True)

In [None]:
# (rows, columns) after the oversampling step.
data.shape

In [None]:
# Recompute the rows-per-label_group mapping on the current `data` frame.
group_sizes = data['label_group'].value_counts()
vc = dict(group_sizes)

# Distinct group sizes that occur, used for the min / max report.
counts = list(set(vc.values()))
smallest_group, largest_group = min(counts), max(counts)
total_rows = sum(vc.values())
print('min = ', smallest_group, ", max = ", largest_group, ', all = ', total_rows)

In [None]:
# Histogram of the current group sizes, with each bar annotated by its height.
fig, ax = plt.subplots(figsize=(15, 5))
arr = ax.hist(vc.values(), bins=52)
bar_heights, bin_edges = arr[0], arr[1]
# bin_edges has one more entry than bar_heights; zip stops after the last bar,
# pairing every bar with its left edge.
for left_edge, height in zip(bin_edges, bar_heights):
    ax.text(left_edge + 0.1, height + 200, str(height), rotation=90)
ax.set_xticks(list(range(52)))
fig.tight_layout()
plt.show()

## Generate second image (pick a partner posting for every row)

In [None]:
# Placeholder for each row's partner posting. Despite its name, this column is
# filled with posting_id values by the pairing loop and is later joined against
# posting_id (presumably strings; confirm against train.csv). It is therefore
# created with object dtype instead of as an integer column that would have to
# be upcast when the ids are written into it.
data['label_group_2'] = None

In [None]:
# For every label group, choose one partner posting per row:
#   * ceil(same * group size) partners are drawn (without replacement) from the
#     group's own postings;
#   * the remaining partners are drawn one at a time from a randomly chosen
#     different group.
# Within a group the same-group partners are written to the first rows and the
# other-group partners to the rest, in frame order.
labels = data.label_group.unique()
same = 0.3

# Build the per-group lookups once instead of re-filtering the whole frame for
# every sampled partner. sort=False keeps groups in first-appearance order and
# rows in frame order, so every list handed to random.sample is the same list a
# boolean filter on `data` would produce and the random draws are unchanged.
grouped_ids = data.groupby('label_group', sort=False)['posting_id']
ids_by_label = {label: ids.tolist() for label, ids in grouped_ids}
rows_by_label = grouped_ids.indices  # label -> positional row numbers
label_list = list(labels)

# Partners are gathered in an object array and written to the frame in one
# assignment, which does not depend on the placeholder column's dtype.
partner_ids = np.empty(len(data), dtype=object)
for i in tqdm(range(len(labels))):
    label_group = labels[i]
    group_ids = ids_by_label[label_group]
    ln = len(group_ids)
    n_same = math.ceil(ln * same)
    same_group_ids = random.sample(group_ids, n_same)

    # Every label except the current one, in the original order.
    labels2 = label_list[:i] + label_list[i + 1:]
    other_groups_ids = []
    for j in range(ln - n_same):
        other_label_group = random.sample(labels2, 1)[0]
        other_groups_ids.append(random.sample(ids_by_label[other_label_group], 1)[0])

    same_group_ids.extend(other_groups_ids)
    partner_ids[rows_by_label[label_group]] = same_group_ids

data['label_group_2'] = partner_ids

In [None]:
# (rows, columns) after the partner column has been filled in.
data.shape

In [None]:
# Preview the first rows, now including the label_group_2 partner column.
data.head()

In [None]:
# Attach the partner posting's columns to every row: label_group_2 holds the
# partner's posting_id, which is looked up in the untouched original table.
# Columns coming from the partner get the '_g2' suffix.
data_merged = pd.merge(
    left=data,
    right=original_data,
    how='inner',
    left_on='label_group_2',
    right_on='posting_id',
    suffixes=('', '_g2'),
)

In [None]:
# The join key is no longer needed once the partner's columns are attached.
data_merged = data_merged.drop(columns=['label_group_2'])

In [None]:
# Pair label: 1 when both postings share a label_group, otherwise 0.
same_group = data_merged['label_group'].eq(data_merged['label_group_g2'])
data_merged['label'] = same_group.astype(int)

In [None]:
# (rows, columns) of the pair table.
data_merged.shape

In [None]:
# Preview the first rows of the pair table.
data_merged.head()

In [None]:
# Start with every pair flagged as training data; the split loop that follows
# sets the flag to False for the rows it holds out.
data_merged['train'] = True

In [None]:
# Hold out validation pairs group by group: within every label_group, sample
# ceil(n * valid / 2) of its positive pairs and ceil(n * valid / 2) of its
# negative pairs (n counted per class) and mark them as non-training rows.
labels = data_merged.label_group.unique()
valid = 0.2
for label_group in tqdm(labels):
    in_group = data_merged.label_group == label_group
    positive_rows = data_merged.index[in_group & (data_merged.label == 1)].tolist()
    negative_rows = data_merged.index[in_group & (data_merged.label == 0)].tolist()

    # Positives are sampled first, then negatives.
    held_out_rows = random.sample(positive_rows, math.ceil(len(positive_rows) * valid / 2.))
    held_out_rows += random.sample(negative_rows, math.ceil(len(negative_rows) * valid / 2.))

    data_merged.loc[held_out_rows, 'train'] = False

In [None]:
# Report the split: shape of the held-out positives, shape of the held-out
# negatives, then the ratio of held-out rows to training rows.
valid_rows = data_merged[data_merged.train == False]
train_rows = data_merged[data_merged.train == True]
for pair_label in (1, 0):
    print(valid_rows[valid_rows.label == pair_label].shape)
print(valid_rows.shape[0] / train_rows.shape[0])

In [None]:
# Write the complete pair table (training and held-out rows together, told
# apart by the `train` column) without the index.
data_merged.to_csv('data_merged.csv', index=False)

In [None]:
# Write the held-out rows and the training rows to separate files, validation
# first, without the index.
for train_flag, out_path in ((False, 'data_merged_valid.csv'), (True, 'data_merged_train.csv')):
    data_merged[data_merged.train == train_flag].to_csv(out_path, index=False)