# Data Preparation for Siamese Network

This notebook prepares data for training a Siamese network on the Shopee Product Matching competition data.

It prepares two types of image pairs:
1. Positive Image Pair (Both the images are similar or contain same products)
2. Negative Image Pair (Both the images are different or contain different products)

In [None]:
import numpy as np
import pandas as pd
from tqdm import tqdm
import json
import random

In [None]:
# Root directory that holds the competition input files
BASE_PATH = "../input/shopee-product-matching"

In [None]:
# Read the training table (train.csv under BASE_PATH) into a DataFrame
train_csv_path = BASE_PATH + "/train.csv"
train_df = pd.read_csv(train_csv_path)

In [None]:
# Preview the first rows of the loaded training table
train_df.head()

In [None]:
# Number of rows with a non-null posting_id; used below as the row count of train_df
TOTAL_ENTRIES = train_df['posting_id'].notna().sum()

In [None]:
# Group the rows by label group (column 4 of train_df).
# Each key is a label and each value is a list with one entry per row of that group,
# laid out as [column 0, column 1, column 2, column 3], i.e.
# [posting_id, image, image_phash, title] (later cells read image/phash/title from 1/2/3).
train_dict = {}

# A single itertuples pass replaces five scalar .iloc lookups per row. The slice keeps the
# original positional contract: the first TOTAL_ENTRIES rows and columns 0-4.
rows = train_df.iloc[:TOTAL_ENTRIES, :5].itertuples(index=False, name=None)
for posting_id, image, phash, title, label in tqdm(rows, total=TOTAL_ENTRIES):
    # setdefault creates the group's list on first sight of a label, then appends to it
    train_dict.setdefault(label, []).append([posting_id, image, phash, title])

In [None]:
# Inspect the entries stored under one label-group key (raises KeyError if the key is absent)
train_dict[249114794]

# Train Dict

This dictionary groups the data by label group, which makes it easy to build positive and negative pairs.

## Pairs
Pair data will be saved to a CSV file where the first four columns hold the first image's data and the next four columns hold the second image's data.
1. First Column - Image Name
2. Second Column - Image Phash
3. Third Column - Title or text
4. Fourth Column - Label group

So the pair in a single row will look like this:

[(Name, Phash, Title, Label), (Name, Phash, Title, Label)]

The CSV file will have one more column named 'similar'. A value of 1 in 'similar' means both images of the pair have the same label group, while 0 means they have different label groups.

# Create Positive Data Pair

In [None]:
# Build the positive-pair table: one row for every ordered pair of entries that share a
# label group. The nested loops pair each entry with itself as well, and emit both
# (a, b) and (b, a).
pos_pair_dict = {
    column_name: []
    for column_name in ('image_1', 'phash_1', 'title_1', 'label_1',
                        'image_2', 'phash_2', 'title_2', 'label_2',
                        'similar')
}

for group_label, group_rows in tqdm(train_dict.items()):
    for first in group_rows:
        for second in group_rows:
            # Entries hold image / phash / title at positions 1 / 2 / 3.
            pair_values = (
                first[1], first[2], first[3], group_label,
                second[1], second[2], second[3], group_label,
                1,  # 1 for positive pair
            )
            # The dict keeps its insertion order, so values line up with the column names.
            for column_name, cell in zip(pos_pair_dict, pair_values):
                pos_pair_dict[column_name].append(cell)

pos_pair_df = pd.DataFrame(data=pos_pair_dict)

In [None]:
# Sanity check: display the first two rows of the positive-pair table
pos_pair_df.head(2)

In [None]:
# Size of the positive-pair table (count of non-null image_1 values)
total_pos_pairs = pos_pair_df['image_1'].count()
print("TOTAL ENTRIES")
print(total_pos_pairs, "\n\n")

# Number of distinct values in each column
unique_per_column = pos_pair_df.nunique()
print("UNIQUE VALUES:")
print(unique_per_column)

In [None]:
# Shuffle the rows and renumber the index 0..n-1.
# A fixed random_state makes the row order (and the CSV written from it) reproducible
# across kernel restarts; without it every run produces a different file.
pos_pair_df = pos_pair_df.sample(frac=1, random_state=42).reset_index(drop=True)
pos_pair_df.head(2)

In [None]:
# Write the positive pairs to disk, leaving the row index out of the file
pos_pair_df.to_csv(path_or_buf='pos_pair.csv', index=False)

# Create Negative Data Pair

In [None]:
# Take an independent copy of the second image's columns; it is shuffled in the next
# cell so that it can be re-attached to the first images as negative pairs.
second_image_columns = ['image_2', 'phash_2', 'title_2', 'label_2']
partial_pair_df = pos_pair_df[second_image_columns].copy()
partial_pair_df.head(2)

In [None]:
# Shuffle the second-image rows and renumber the index 0..n-1 so that they no longer
# line up with the first images they were paired with.
# A fixed random_state keeps the resulting negative pairs reproducible between runs.
partial_pair_df = partial_pair_df.sample(frac=1, random_state=42).reset_index(drop=True)
partial_pair_df.head(2)

In [None]:
# Start from a copy of the positive pairs so the first image of every row is kept as is
neg_pair_df = pos_pair_df.copy()

# Overwrite the second image's columns with the shuffled ones from partial_pair_df
for second_col in ('image_2', 'phash_2', 'title_2', 'label_2'):
    neg_pair_df[second_col] = partial_pair_df[second_col]

# Mark every row as a negative pair (similar = 0)
neg_row_count = neg_pair_df['image_1'].count()
neg_pair_df['similar'] = [0] * neg_row_count
neg_pair_df.head(2)

In [None]:
# Count the rows of the negative-pair table whose two images still share a label group
# (such rows are not real negatives). A single vectorized column comparison replaces
# the per-row .iloc loop.
similar_counts = int((neg_pair_df['label_1'] == neg_pair_df['label_2']).sum())
print("Similar Pairs Found:", similar_counts)

In [None]:
# Re-shuffle data to remove similar pairs: every row whose two labels collide gets its
# second image replaced by the second image of a randomly chosen row with a different label.
TOTAL_NEG_COUNTS = neg_pair_df['image_1'].count()
print("Total Negative Pairs:", TOTAL_NEG_COUNTS)

# Seeded generator private to this cell: reproducible, and the global random state is untouched.
rng = random.Random(42)

# Columns 3 and 7 are label_1 and label_2. A row's labels are only rewritten while that
# row itself is being processed, so the colliding rows can be located once up front.
colliding_rows = np.flatnonzero(
    (neg_pair_df.iloc[:, 3] == neg_pair_df.iloc[:, 7]).to_numpy()
)

for ind in tqdm(colliding_rows):
    # NOTE: this loop never ends if no row carries a label_2 different from this row's label_1.
    while True:
        # randrange excludes its upper bound. random.randint(0, n) can return n, which is
        # one past the last valid row and makes .iloc raise IndexError.
        randInd = rng.randrange(int(TOTAL_NEG_COUNTS))
        if neg_pair_df.iloc[ind, 3] != neg_pair_df.iloc[randInd, 7]:
            # replace second image (columns 4-7: image_2, phash_2, title_2, label_2)
            # with the randomly selected one
            for col in range(4, 8):
                neg_pair_df.iloc[ind, col] = neg_pair_df.iloc[randInd, col]
            break

neg_pair_df.head(2)

In [None]:
# Re-count the rows whose two images share a label group; after the re-shuffle above this
# is expected to print 0. A vectorized column comparison replaces the per-row .iloc loop.
similar_counts = int((neg_pair_df['label_1'] == neg_pair_df['label_2']).sum())
print("Similar Pairs Found:", similar_counts)

In [None]:
# Write the negative pairs to disk, leaving the row index out of the file
neg_pair_df.to_csv(path_or_buf='neg_pair.csv', index=False)