# Importing packages

In [None]:
import pandas as pd
import numpy as np
import math
import matplotlib.pyplot as plt
import seaborn as sns
import cv2
from sklearn.model_selection import train_test_split

In [None]:
# Colab-only step: mount Google Drive so the '/content/drive/...' CSV paths
# read and written by the rest of the notebook resolve.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Loading Datasets

In [None]:
# Read the three competition CSVs (labelled postings, unlabelled test postings
# and the sample submission) from the shared drive.
train_csv = '/content/drive/Shareddrives/CIS 522 Final Project/Data/train.csv'
test_csv = '/content/drive/Shareddrives/CIS 522 Final Project/Data/test.csv'
sample_csv = '/content/drive/Shareddrives/CIS 522 Final Project/Data/sample_submission.csv'

train, test, sample = (
    pd.read_csv(csv_path) for csv_path in (train_csv, test_csv, sample_csv)
)

In [None]:
# Split at the label-group level (not the row level) so every posting of a
# product lands on the same side of the train/validation boundary.
unique_label_groups = train['label_group'].unique()
train_labels, valid_labels = train_test_split(
    unique_label_groups,
    test_size=0.1,
    shuffle=True,
    random_state=1,
)

In [None]:
# Materialise the two splits from the label-group assignment.
# Note: `train` is rebound to its own subset here, so this cell is not
# idempotent -- re-run the loading cell first if it has to be executed again.
posting_groups = train['label_group']
valid, train = (
    train[posting_groups.isin(valid_labels)],
    train[posting_groups.isin(train_labels)],
)

In [None]:
# Preview the first rows of the training split.
train.head()

Unnamed: 0,posting_id,image,image_phash,title,label_group
0,train_129225211,0000a68812bc7e98c42888dfb1c07da0.jpg,94974f937d4c2433,Paper Bag Victoria Secret,249114794
1,train_3386243561,00039780dfc94d01db8676fe789ecd05.jpg,af3f9460c2838f0f,"Double Tape 3M VHB 12 mm x 4,5 m ORIGINAL / DO...",2937985045
2,train_2288590299,000a190fdd715a2a36faed16e2c65df7.jpg,b94cb00ed3e50f78,Maling TTS Canned Pork Luncheon Meat 397 gr,2395904891
3,train_2406599165,00117e4fc239b1b641ff08340b429633.jpg,8514fc58eafea283,Daster Batik Lengan pendek - Motif Acak / Camp...,4093212188
5,train_2464356923,0013e7355ffc5ff8fb1ccad3e42d92fe.jpg,bbd097a7870f4a50,CELANA WANITA (BB 45-84 KG)Harem wanita (bisa...,2660605217


# Displaying main attributes of the datasets

In [None]:
# Summarise the training split: row count, column dtypes and non-null counts.
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 30757 entries, 0 to 34249
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   posting_id   30757 non-null  object
 1   image        30757 non-null  object
 2   image_phash  30757 non-null  object
 3   title        30757 non-null  object
 4   label_group  30757 non-null  int64 
dtypes: int64(1), object(4)
memory usage: 1.4+ MB


In [None]:
# Report how many distinct values each column of the training split holds.
for column_name, column_values in train.items():
    distinct_count = len(column_values.unique())
    print(column_name + ":" + str(distinct_count))

posting_id:30757
image:29110
image_phash:25847
title:29761
label_group:9912


In [None]:
# Summarise the validation split: row count, column dtypes and non-null counts.
valid.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3493 entries, 4 to 34246
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   posting_id   3493 non-null   object
 1   image        3493 non-null   object
 2   image_phash  3493 non-null   object
 3   title        3493 non-null   object
 4   label_group  3493 non-null   int64 
dtypes: int64(1), object(4)
memory usage: 163.7+ KB


In [None]:
# Report how many distinct values each column of the validation split holds.
for column_name, column_values in valid.items():
    distinct_count = len(column_values.unique())
    print(column_name + ":" + str(distinct_count))

posting_id:3493
image:3307
image_phash:2907
title:3366
label_group:1102


In [None]:
# Summarise the test set: row count, column dtypes and non-null counts.
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3 entries, 0 to 2
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   posting_id   3 non-null      object
 1   image        3 non-null      object
 2   image_phash  3 non-null      object
 3   title        3 non-null      object
dtypes: object(4)
memory usage: 224.0+ bytes


In [None]:
# Report how many distinct values each column of the test set holds.
for column_name, column_values in test.items():
    distinct_count = len(column_values.unique())
    print(column_name + ":" + str(distinct_count))

posting_id:3
image:3
image_phash:3
title:3


Description of the 5 attributes:

* `posting_id`: the ID code for the posting.
* `image` : the image id/md5sum.
* `image_phash` : a perceptual hash of the image.
* `title` : the product description for the posting.
* `label_group`:  ID code for all postings that map to the same product. Not provided for the test set.

Each row contains the data for a single posting. Multiple postings might share the exact same image ID but have different titles, or vice versa.

In [None]:
# Display the sample submission to show the expected output columns.
sample

Unnamed: 0,posting_id,matches
0,test_2255846744,test_2255846744
1,test_3588702337,test_3588702337
2,test_4015706929,test_4015706929


# Modifying the datasets to be compatible with the loss function

## Triplet Loss ([ref](https://en.wikipedia.org/wiki/Triplet_loss))

In [None]:
# (rows, columns) of the training split before any pairs are built.
train.shape

(30757, 5)

In [None]:
# Build every ordered (anchor, positive) pair of training postings that share
# a label group by self-joining on `label_group`, then drop the rows where a
# posting was paired with itself.
positive_train = (
    train
    .merge(train, on='label_group', how='inner', suffixes=['_anchor', '_positive'])
    .rename(columns={'label_group': 'label_group_positive'})
    .loc[lambda pairs: pairs['posting_id_anchor'] != pairs['posting_id_positive']]
)

In [None]:
# Build every ordered (anchor, positive) pair of validation postings that share
# a label group by self-joining on `label_group`, then drop the rows where a
# posting was paired with itself.
positive_valid = (
    valid
    .merge(valid, on='label_group', how='inner', suffixes=['_anchor', '_positive'])
    .rename(columns={'label_group': 'label_group_positive'})
    .loc[lambda pairs: pairs['posting_id_anchor'] != pairs['posting_id_positive']]
)

In [None]:
# Show how many anchor/positive pairs were produced, then preview the first rows.
print(positive_train.shape)
positive_train.head()

(146168, 9)


Unnamed: 0,posting_id_anchor,image_anchor,image_phash_anchor,title_anchor,label_group_positive,posting_id_positive,image_positive,image_phash_positive,title_positive
1,train_129225211,0000a68812bc7e98c42888dfb1c07da0.jpg,94974f937d4c2433,Paper Bag Victoria Secret,249114794,train_2278313361,f83b49a86a0ee8592e3bf0204da3fbdf.jpg,ac63931c3d4b42f6,PAPER BAG VICTORIA SECRET
2,train_2278313361,f83b49a86a0ee8592e3bf0204da3fbdf.jpg,ac63931c3d4b42f6,PAPER BAG VICTORIA SECRET,249114794,train_129225211,0000a68812bc7e98c42888dfb1c07da0.jpg,94974f937d4c2433,Paper Bag Victoria Secret
5,train_3386243561,00039780dfc94d01db8676fe789ecd05.jpg,af3f9460c2838f0f,"Double Tape 3M VHB 12 mm x 4,5 m ORIGINAL / DO...",2937985045,train_3423213080,8cbe4bf9706bc177fd61071ef776be8c.jpg,bfc6d01bc72c1d30,Double Tape VHB 3M ORIGINAL 12mm x 4.5mm Busa ...
6,train_3423213080,8cbe4bf9706bc177fd61071ef776be8c.jpg,bfc6d01bc72c1d30,Double Tape VHB 3M ORIGINAL 12mm x 4.5mm Busa ...,2937985045,train_3386243561,00039780dfc94d01db8676fe789ecd05.jpg,af3f9460c2838f0f,"Double Tape 3M VHB 12 mm x 4,5 m ORIGINAL / DO..."
9,train_2288590299,000a190fdd715a2a36faed16e2c65df7.jpg,b94cb00ed3e50f78,Maling TTS Canned Pork Luncheon Meat 397 gr,2395904891,train_3803689425,75dbd1e9f31f2d0f21d31c08b3e0b94e.jpg,eb969469cd384ac6,Maling Ham Pork Luncheon Meat TTS 397gr


In [None]:
# Draw one random candidate negative posting per anchor/positive pair.
# Sampling with replacement allows more draws than there are postings.
# A fixed seed keeps the generated triplets reproducible across runs, in line
# with the seeded train/validation split. Candidates that happen to share the
# anchor's label group are filtered out when the triplets are assembled.
NEGATIVE_SAMPLING_SEED = 1

negative_train = (
    train
    .sample(n=len(positive_train), replace=True, random_state=NEGATIVE_SAMPLING_SEED)
    .reset_index(drop=True)   # renumber rows 0..n-1
    .add_suffix('_negative')  # e.g. posting_id -> posting_id_negative
)

negative_valid = (
    valid
    .sample(n=len(positive_valid), replace=True, random_state=NEGATIVE_SAMPLING_SEED)
    .reset_index(drop=True)   # renumber rows 0..n-1
    .add_suffix('_negative')  # e.g. posting_id -> posting_id_negative
)

In [None]:
# Show how many candidate negatives were drawn, then preview the first rows.
print(negative_train.shape)
negative_train.head()

(146168, 5)


Unnamed: 0,posting_id_negative,image_negative,image_phash_negative,title_negative,label_group_negative
0,train_3712296585,160a4689871befec7f5d8d414844b401.jpg,b39ccc66cc338c33,Emina Cheek Lit Cream Blush,2770620676
1,train_3913190043,7da59b1a67f68341b28670feeb76dfb2.jpg,fc13826cf03a9673,OTAJI Oseng Tuna Asap 500gr,2963928868
2,train_1007590256,1a3013542c558927db0d8c1de61690a1.jpg,a8049efee5c1627a,{LAMPU TUMBLR / LAMPU NATAL / TWINKLE LIGHT / ...,2014040846
3,train_3799652696,911fc70ad0b6ce3e61a6860e5d7b0aae.jpg,813d7a3b34c74cc3,Mamypoko pants extra dry s38/m32/l30/xl26/xxl22,373674159
4,train_2196844112,1cc82ac06d66845d965bc4be0d00a16e.jpg,d715ee6b95821835,NM547 Kotak Tempat Tissue Bahan Kain / Kotak Tisu,175000399


In [None]:
# Display the anchor/positive pairs (the rendered output is truncated to the
# first and last rows).
positive_train

Unnamed: 0,posting_id_anchor,image_anchor,image_phash_anchor,title_anchor,label_group_positive,posting_id_positive,image_positive,image_phash_positive,title_positive
1,train_129225211,0000a68812bc7e98c42888dfb1c07da0.jpg,94974f937d4c2433,Paper Bag Victoria Secret,249114794,train_2278313361,f83b49a86a0ee8592e3bf0204da3fbdf.jpg,ac63931c3d4b42f6,PAPER BAG VICTORIA SECRET
2,train_2278313361,f83b49a86a0ee8592e3bf0204da3fbdf.jpg,ac63931c3d4b42f6,PAPER BAG VICTORIA SECRET,249114794,train_129225211,0000a68812bc7e98c42888dfb1c07da0.jpg,94974f937d4c2433,Paper Bag Victoria Secret
5,train_3386243561,00039780dfc94d01db8676fe789ecd05.jpg,af3f9460c2838f0f,"Double Tape 3M VHB 12 mm x 4,5 m ORIGINAL / DO...",2937985045,train_3423213080,8cbe4bf9706bc177fd61071ef776be8c.jpg,bfc6d01bc72c1d30,Double Tape VHB 3M ORIGINAL 12mm x 4.5mm Busa ...
6,train_3423213080,8cbe4bf9706bc177fd61071ef776be8c.jpg,bfc6d01bc72c1d30,Double Tape VHB 3M ORIGINAL 12mm x 4.5mm Busa ...,2937985045,train_3386243561,00039780dfc94d01db8676fe789ecd05.jpg,af3f9460c2838f0f,"Double Tape 3M VHB 12 mm x 4,5 m ORIGINAL / DO..."
9,train_2288590299,000a190fdd715a2a36faed16e2c65df7.jpg,b94cb00ed3e50f78,Maling TTS Canned Pork Luncheon Meat 397 gr,2395904891,train_3803689425,75dbd1e9f31f2d0f21d31c08b3e0b94e.jpg,eb969469cd384ac6,Maling Ham Pork Luncheon Meat TTS 397gr
...,...,...,...,...,...,...,...,...,...
176915,train_3351458915,feb6ab149584d8e81d2d1408cb12162f.jpg,fa36c5cb1e7560a0,Good time rainbow chocochip cookies 16 gram,4057565955,train_4129819290,fd561ac0bb264f2ee27abb1555aba3dc.jpg,ecd3d2c26c68936c,Good Time Rainbow Chocochips Cookies 16 gr
176918,train_866113781,fd58c6f8518d9f6ef1b52d2ed81e9aa6.jpg,eb1f8dc8d827a4d0,Kedaung Cangkir / Mug Enamel Loreng 10 cm,1313560418,train_2743870047,fd97a173d7c60d27e6459fe586797864.jpg,eb7f430dbc8190d8,KedaungHome Cangkir Doreng Latte KI-LATTE
176919,train_2743870047,fd97a173d7c60d27e6459fe586797864.jpg,eb7f430dbc8190d8,KedaungHome Cangkir Doreng Latte KI-LATTE,1313560418,train_866113781,fd58c6f8518d9f6ef1b52d2ed81e9aa6.jpg,eb1f8dc8d827a4d0,Kedaung Cangkir / Mug Enamel Loreng 10 cm
176922,train_4221982820,ff512b2f4ff8bb431bf089e87c212922.jpg,f94186ff8fa6b010,Sprei Lady Rose 180x200 King terlaris Keroppi,53836859,train_4063409014,ff7180bf7d0cf29f0b173952e6cf7af2.jpg,f94186ff8fa6b010,Sprei king ladyrose size 180x200 kerokeroppi


In [None]:
# Display the sampled candidate negatives (the rendered output is truncated to
# the first and last rows).
negative_train

Unnamed: 0,posting_id_negative,image_negative,image_phash_negative,title_negative,label_group_negative
0,train_3712296585,160a4689871befec7f5d8d414844b401.jpg,b39ccc66cc338c33,Emina Cheek Lit Cream Blush,2770620676
1,train_3913190043,7da59b1a67f68341b28670feeb76dfb2.jpg,fc13826cf03a9673,OTAJI Oseng Tuna Asap 500gr,2963928868
2,train_1007590256,1a3013542c558927db0d8c1de61690a1.jpg,a8049efee5c1627a,{LAMPU TUMBLR / LAMPU NATAL / TWINKLE LIGHT / ...,2014040846
3,train_3799652696,911fc70ad0b6ce3e61a6860e5d7b0aae.jpg,813d7a3b34c74cc3,Mamypoko pants extra dry s38/m32/l30/xl26/xxl22,373674159
4,train_2196844112,1cc82ac06d66845d965bc4be0d00a16e.jpg,d715ee6b95821835,NM547 Kotak Tempat Tissue Bahan Kain / Kotak Tisu,175000399
...,...,...,...,...,...
146163,train_2105140929,5ef4113e104832808a1cd8fcab350130.jpg,e4c89b995a66e598,[1800 GR] ENFAGROW A+ 3 A+3 Vanila Vanilla Vnl...,310610689
146164,train_3672279995,7b031e56878b89ba5b4d224a143b7d58.jpg,eaa594d09f7931c2,[ COD ] KURMA SAYER 1 KG / KURMA EMIRATES / KU...,3441283383
146165,train_88299288,88f6fca0531f8cea87c949eda7aeab2b.jpg,bf87458f3ac0d078,[READY!] Wardah Lightening Powder Foundation,1563437474
146166,train_3996498216,e8b29b4bd07abfb4863fc8f50e128dd0.jpg,bc4fc3b0384f47e0,Waistbag Pria Simple Buffback Termurah,2933728554


# Saving the resulting datasets

In [None]:
def _attach_negatives(positive_pairs, negative_samples):
    """Pair each anchor/positive row with one sampled negative row.

    Rows are combined by position, then triplets whose negative belongs to the
    same label group as the anchor/positive pair are discarded.

    Args:
        positive_pairs: frame with a `label_group_positive` column, one row per
            anchor/positive pair.
        negative_samples: frame with a `label_group_negative` column and the
            same number of rows as `positive_pairs`.

    Returns:
        A frame holding the columns of both inputs for every remaining triplet.
    """
    assert len(positive_pairs) == len(negative_samples), \
        'expected exactly one sampled negative per anchor/positive pair'
    # The positive frame still carries the gapped row labels left over from
    # dropping self-pairs (labels can exceed the row count), whereas the
    # negatives are numbered 0..n-1. Joining on those mismatched indexes
    # silently drops every pair whose label is >= n, so realign both frames
    # positionally before the index join.
    aligned_pairs = positive_pairs.reset_index(drop=True)
    aligned_negatives = negative_samples.reset_index(drop=True)
    triplets = aligned_pairs.merge(aligned_negatives, left_index=True, right_index=True)
    # A sampled "negative" from the anchor's own label group is not a negative.
    is_true_negative = triplets['label_group_positive'] != triplets['label_group_negative']
    return triplets[is_true_negative]


triplet_train = _attach_negatives(positive_train, negative_train)
triplet_valid = _attach_negatives(positive_valid, negative_valid)

In [None]:
# Persist both triplet tables to the shared drive (row index omitted).
triplet_outputs = (
    (triplet_train, '/content/drive/Shareddrives/CIS 522 Final Project/Data/triplet_train.csv'),
    (triplet_valid, '/content/drive/Shareddrives/CIS 522 Final Project/Data/triplet_valid.csv'),
)
for triplet_frame, csv_path in triplet_outputs:
    triplet_frame.to_csv(csv_path, index=False)

In [None]:
# Display the assembled anchor/positive/negative triplets (the rendered output
# is truncated to the first and last rows).
triplet_train

Unnamed: 0,posting_id_anchor,image_anchor,image_phash_anchor,title_anchor,label_group_positive,posting_id_positive,image_positive,image_phash_positive,title_positive,posting_id_negative,image_negative,image_phash_negative,title_negative,label_group_negative
1,train_129225211,0000a68812bc7e98c42888dfb1c07da0.jpg,94974f937d4c2433,Paper Bag Victoria Secret,249114794,train_2278313361,f83b49a86a0ee8592e3bf0204da3fbdf.jpg,ac63931c3d4b42f6,PAPER BAG VICTORIA SECRET,train_3913190043,7da59b1a67f68341b28670feeb76dfb2.jpg,fc13826cf03a9673,OTAJI Oseng Tuna Asap 500gr,2963928868
2,train_2278313361,f83b49a86a0ee8592e3bf0204da3fbdf.jpg,ac63931c3d4b42f6,PAPER BAG VICTORIA SECRET,249114794,train_129225211,0000a68812bc7e98c42888dfb1c07da0.jpg,94974f937d4c2433,Paper Bag Victoria Secret,train_1007590256,1a3013542c558927db0d8c1de61690a1.jpg,a8049efee5c1627a,{LAMPU TUMBLR / LAMPU NATAL / TWINKLE LIGHT / ...,2014040846
5,train_3386243561,00039780dfc94d01db8676fe789ecd05.jpg,af3f9460c2838f0f,"Double Tape 3M VHB 12 mm x 4,5 m ORIGINAL / DO...",2937985045,train_3423213080,8cbe4bf9706bc177fd61071ef776be8c.jpg,bfc6d01bc72c1d30,Double Tape VHB 3M ORIGINAL 12mm x 4.5mm Busa ...,train_3069587451,da80703ef767b8650acbee00f358280d.jpg,93e67c6a6159a6c1,Yazole 318 Jam Tangan Pria Original Business Q...,80347885
6,train_3423213080,8cbe4bf9706bc177fd61071ef776be8c.jpg,bfc6d01bc72c1d30,Double Tape VHB 3M ORIGINAL 12mm x 4.5mm Busa ...,2937985045,train_3386243561,00039780dfc94d01db8676fe789ecd05.jpg,af3f9460c2838f0f,"Double Tape 3M VHB 12 mm x 4,5 m ORIGINAL / DO...",train_3446162576,c86c2afa855bc03a79f14bde79548ffe.jpg,dea1c13e523ea3a1,Garnier Color Naturals Express Creme 3 - Cokla...,2910074820
9,train_2288590299,000a190fdd715a2a36faed16e2c65df7.jpg,b94cb00ed3e50f78,Maling TTS Canned Pork Luncheon Meat 397 gr,2395904891,train_3803689425,75dbd1e9f31f2d0f21d31c08b3e0b94e.jpg,eb969469cd384ac6,Maling Ham Pork Luncheon Meat TTS 397gr,train_4248829581,5107e9669190bbf101ebdbfabb1acab5.jpg,f0aab378aa113e72,[Per Pc] Liptint Sasimi Aloe Vera 99%,1544174053
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
146160,train_76590387,9097bcf9064f1e9d844e8c7cb08da267.jpg,ed68c693912db196,SET SETELAN WANITA WINK NSP L-XL,2130234857,train_3559120886,a0cc26b44f7bf8affe124ae56dd403c8.jpg,ed68c693912db196,TFS SETELAN WINK BABYTERRY/ BAJU TIDUR / PIYAM...,train_4121268960,17654eb57a2ba1e008cdfbd39cf23f86.jpg,bcf095a38ef0c658,GNC| OPPO A33 A53 A52 A92 A31 A91 A5 A9 A1K F1...,3353200223
146161,train_3559120886,a0cc26b44f7bf8affe124ae56dd403c8.jpg,ed68c693912db196,TFS SETELAN WINK BABYTERRY/ BAJU TIDUR / PIYAM...,2130234857,train_3653591340,489beec762724cb397cf6cb474cf4c86.jpg,fee8c0328137d927,TFS SET WINK / SETELAN WINK/SETELAN WANITA / S...,train_3301603537,c3a70c8fc6e6800c6edae4bc5987ebe1.jpg,ef9ed062d4974268,Tripod Handphone Camera Mini Spider Gurita Uni...,359918838
146162,train_3559120886,a0cc26b44f7bf8affe124ae56dd403c8.jpg,ed68c693912db196,TFS SETELAN WINK BABYTERRY/ BAJU TIDUR / PIYAM...,2130234857,train_76590387,9097bcf9064f1e9d844e8c7cb08da267.jpg,ed68c693912db196,SET SETELAN WANITA WINK NSP L-XL,train_720116982,ead0836ba145aab44e2500f562f08f83.jpg,f88dc7920d792996,"b""ERTOS WHY ACNE NIGHT CREAM / KRIM MALAM ERTO...",1041750302
146165,train_2948165449,489d034df9f72999d40e3261a7c621eb.jpg,d6b8e0844b4f3f32,BEAUSLIM,2514750348,train_852655362,97f7f5e7563d654b154936c3ec4a098e.jpg,96c460d83e463f8f,BeauSlim,train_88299288,88f6fca0531f8cea87c949eda7aeab2b.jpg,bf87458f3ac0d078,[READY!] Wardah Lightening Powder Foundation,1563437474
