In [None]:
import pandas as pd

df = pd.read_json('cameras.json.gz', compression='gzip', lines=True)
df.head()

# Exploratory Data Analysis

In [None]:
df.label = df.label.astype('object')

In [None]:
df.info()

In [None]:
df.brand_left.value_counts()

In [None]:
# Count the non-null values of every remaining column for each
# (label, brand_right) row against each brand_left column, including margin totals.
pd.pivot_table(
    df,
    index=['label', 'brand_right'],
    columns='brand_left',
    aggfunc='count',
    margins=True,
)

Cek Apa yang beda di Action Outdoor

In [None]:
# Positions 0-4 can be tried here, since the table above shows 5 such pairs.
same_brand_mismatch = (
    (df['brand_left'] == 'action outdoor')
    & (df['brand_right'] == 'action outdoor')
    & (df['label'] == 0)
)
df.loc[same_brand_mismatch, 'title_left'].iloc[0]

In [None]:
# Positions 0-4 can be tried here, since the table above shows 5 such pairs.
same_brand_mismatch = (
    (df['brand_left'] == 'action outdoor')
    & (df['brand_right'] == 'action outdoor')
    & (df['label'] == 0)
)
df.loc[same_brand_mismatch, 'title_right'].iloc[0]

In [None]:
# Print both titles of one randomly drawn matching pair, then of one
# randomly drawn non-matching pair.
print('Sample of Match Product')
temp = df.loc[df['label'] == 1].sample(n=1)
print(temp['title_left'].iloc[0])
print(temp['title_right'].iloc[0], '\n')

print('Sample of Not Match Product')
temp = df.loc[df['label'] == 0].sample(n=1)
print(temp['title_left'].iloc[0])
print(temp['title_right'].iloc[0])

In [None]:
from nltk import ngrams

def find_common(text1, text2, ngram):
  """Return the word n-grams that occur in both texts.

  Each text is split on whitespace, every run of ``ngram`` consecutive
  words is joined with single spaces, and the n-grams present in both
  texts are collected.

  Args:
    text1: First text.
    text2: Second text.
    ngram: Number of consecutive words per n-gram (1 = single words).

  Returns:
    Sorted list of the distinct shared n-grams. Empty when nothing is
    shared or when a text has fewer than ``ngram`` words.
  """

  def word_ngrams(text):
    words = text.split()
    # Sliding window: zip stops at the shortest slice, so only complete
    # n-grams are produced.
    return {' '.join(gram) for gram in zip(*(words[i:] for i in range(ngram)))}

  # Sorted so the result does not depend on set iteration order, which
  # changes between interpreter runs for strings.
  return sorted(word_ngrams(text1) & word_ngrams(text2))

In [None]:
# Find the n-grams (here: word pairs) that two sentences have in common
find_common('sepatu olahraga indonesia', 'sepatu puma dari indonesia', 2)

In [None]:
# Per-pair title statistics used by the EDA cells below.
# .copy() is taken AFTER the column selection so `temp` is an independent
# frame; adding columns to a bare selection triggers SettingWithCopyWarning.
temp = df[['title_left', 'title_right', 'label']].copy()

# Word counts of each title.
temp['title_left_len']   = temp['title_left'].apply(lambda title: len(title.split()))
temp['title_right_len']  = temp['title_right'].apply(lambda title: len(title.split()))
# Distinct words (1-grams) shared by the two titles, and how many there are.
temp['common_words']     = temp.apply(lambda row: find_common(row['title_left'], row['title_right'], 1), axis=1)
temp['common_words_len'] = temp['common_words'].apply(len)

In [None]:
temp.describe()

In [None]:
# Mean of each numeric feature per label.
# The numeric columns are named explicitly because `temp` also holds text and
# list columns; newer pandas (2.x) raises on averaging those instead of
# silently dropping them.
temp.pivot_table(
    values=['common_words_len', 'title_left_len', 'title_right_len'],
    columns='label',
    aggfunc='mean',
)

In [None]:
temp[['label', 'common_words_len']].boxplot(by='label', figsize=(10,10))

In [None]:
!git clone https://github.com/amueller/word_cloud.git
!cd word_cloud
!pip install word_cloud/.

In [None]:
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
import matplotlib.pyplot as plt

wordcloud = WordCloud(max_font_size=50, max_words=100, background_color="white", width=500, height=500)

In [None]:
# Build one word cloud from all left-hand titles joined into a single string.
text = " ".join(temp['title_left'])
show = wordcloud.generate(text)

# Render the generated image on a large canvas without axes.
plt.figure(figsize=(20, 10))
plt.imshow(show, interpolation='bilinear')
plt.axis("off")
plt.show()

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the pairs for evaluation; the fixed seed keeps the split reproducible.
# Each part is copied so the later in-place column assignments act on
# independent frames instead of triggering SettingWithCopyWarning.
train, test = (part.copy() for part in train_test_split(df, test_size=0.2, random_state=10))

print(len(train), len(test))

In [None]:
import re

def preprocessing(text):
  """Lower-case ``text`` (coerced to ``str``) and remove every character
  that is neither a word character nor a space."""
  lowered = str(text).lower()
  return re.sub(r'[^\w ]', '', lowered)

In [None]:
# Normalise both title columns of both splits in place.
for frame in (train, test):
  for column in ('title_left', 'title_right'):
    frame[column] = frame[column].apply(preprocessing)

# TF-IDF

In [None]:
# Flatten each split into plain Python lists. The combined list holds all left
# titles followed by all right titles, so pair i sits at positions i and i + n.
train_left_title  = train['title_left'].tolist()
train_right_title = train['title_right'].tolist()
train_title       = [*train_left_title, *train_right_title]
train_labels      = train['label'].tolist()
print(*map(len, (train_left_title, train_right_title, train_labels, train_title)))

test_left_title  = test['title_left'].tolist()
test_right_title = test['title_right'].tolist()
test_title       = [*test_left_title, *test_right_title]
test_labels      = test['label'].tolist()
print(*map(len, (test_left_title, test_right_title, test_labels, test_title)))

In [None]:
# Check the first left/right title pair together with its label
train_left_title[0], train_right_title[0], train_labels[0]

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

def get_similarity(vector):
    """Cosine similarity between row ``i`` and row ``i + n`` for each of the ``n`` pairs.

    Args:
        vector: 2-D dense array-like whose first half holds the left items and
            whose second half holds the matching right items, in the same
            order. ``n = len(vector) // 2``; a trailing unpaired row is ignored.

    Returns:
        List of ``n`` floats. A pair that contains an all-zero row scores 0.0.
    """
    rows = np.asarray(vector, dtype=float)
    n = len(rows) // 2
    if n == 0:
        return []

    left, right = rows[:n], rows[n:2 * n]
    # Row-wise dot products and norms in one vectorised pass, instead of one
    # pairwise library call per row pair.
    dots = np.einsum('ij,ij->i', left, right)
    norms = np.linalg.norm(left, axis=1) * np.linalg.norm(right, axis=1)
    # Guarded division: all-zero rows yield 0.0 rather than NaN.
    sims = np.divide(dots, norms, out=np.zeros_like(dots), where=norms > 0)
    return list(sims)

# Fit the TF-IDF vectorizer on the training titles only, then encode both
# splits as dense arrays.
vectorizer   = TfidfVectorizer()
vectorizer.fit(train_title)

train_vector, test_vector = (
    vectorizer.transform(titles).toarray() for titles in (train_title, test_title)
)

In [None]:
# Inspect the first training vector
train_vector[0]

In [None]:
# Example: two products have a similarity of 70%.
# That 70% is turned into 0 or 1 according to the threshold we choose or are given.

In [None]:
from sklearn import metrics

# Sweep the decision threshold over the test-pair similarities and keep the
# one with the highest weighted-average F1.
# NOTE(review): the threshold is selected on the same test labels it is scored
# on - consider choosing it on a separate validation split.
thresh         = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
cosine_sim_arr = np.array(get_similarity(test_vector))
best_value     = (-1, -1)  # (threshold, weighted F1)

for thrs in thresh:
    y_pred = np.where(cosine_sim_arr > thrs, 1, 0)
    report = metrics.classification_report(
        y_true=test_labels, y_pred=y_pred, labels=[0, 1], digits=4, output_dict=True
    )
    weighted_f1 = report['weighted avg']['f1-score']

    if weighted_f1 > best_value[1]:
        best_value = (thrs, weighted_f1)

    # Per-class F1, weighted F1, then a blank separator line.
    print(thrs, report['0']['f1-score'], report['1']['f1-score'], weighted_f1, end='\n\n')

print('best value: ', best_value)

In [None]:
# Label every test pair using the best threshold from the sweep and keep the
# predictions next to the titles and true labels for error analysis.
pred_label = np.where(cosine_sim_arr > best_value[0], 1, 0)

temp = test[['title_left', 'title_right', 'label']].assign(pred_label=pred_label)
temp.head()

Unnamed: 0,title_left,title_right,label,pred_label
1508,fujifilm fp 3000b instant black and white film...,335 camera bag black w black trim,0,0
803,knog lights qudos action video light for gopro...,nilox chest mount harness foolish acheter et o...,0,0
1638,c mara canon sx510 negra alkosto tienda online,canon eos rebel t5i 18 megapixel digital slr c...,0,0
1671,silicon power 32gb microsdhc prijzen tweakers,transcend 32 gb class 10 sdhc flash memory car...,0,0
871,gopro hero3 hdmi micro cable ahdmc 301,gopro wi fi remote charging cable unique photo,0,0


In [None]:
def get_status(y_true, y_pred):
  """Name the confusion-matrix cell of one prediction.

  A true label of 1 gives 'tp' when the prediction is 1 and 'fn' otherwise;
  any other true label gives 'tn' when the prediction is 0 and 'fp' otherwise.
  """
  if y_true == 1:
    return 'tp' if y_pred == 1 else 'fn'
  return 'tn' if y_pred == 0 else 'fp'


temp['status'] = temp.apply(lambda x: get_status(x['label'], x['pred_label']), axis=1)
temp.head()

Unnamed: 0,title_left,title_right,label,pred_label,status
1508,fujifilm fp 3000b instant black and white film...,335 camera bag black w black trim,0,0,tn
803,knog lights qudos action video light for gopro...,nilox chest mount harness foolish acheter et o...,0,0,tn
1638,c mara canon sx510 negra alkosto tienda online,canon eos rebel t5i 18 megapixel digital slr c...,0,0,tn
1671,silicon power 32gb microsdhc prijzen tweakers,transcend 32 gb class 10 sdhc flash memory car...,0,0,tn
871,gopro hero3 hdmi micro cable ahdmc 301,gopro wi fi remote charging cable unique photo,0,0,tn


In [None]:
# Show full cell contents instead of truncating long titles.
# None is the supported "no limit" value; the former -1 is deprecated.
pd.set_option('display.max_colwidth', None)

temp[temp.status == 'tp'].head()

  """Entry point for launching an IPython kernel.


Unnamed: 0,title_left,title_right,label,pred_label,status
36,olympus m zuiko digital ed 40 150mm f 2 8 pro lens buy at connection public sector solutions,used olympus m zuiko digital ed 40 150mm f 2 8 pro lens l excellent mirrorless,1,1,tp
242,sigma 70 300mm f4 5 6 dg macro nikon prijzen tweakers,sigma zoom telephoto 70 300mm f 4 5 6 dg macro autofocus lens for nikon af fumfie com,1,1,tp
423,action outdoor floating hand grip bobber acheter et offres sur bikeinn,action outdoor floating hand grip bobber scubastore,1,1,tp
115,veho muvi hd pro 1080p mini camcorder,veho vcc 005 muvi hdpro mini hd body camera action camcorder cameras unique photo,1,1,tp
440,wireless 2 4ghz 802 11b internet camera dcs 1000w d link deutschland,wireless 2 4ghz 802 11b internet camera dcs 1000w d link sweden,1,1,tp


In [None]:
temp[temp.status == 'tn'].head()

Unnamed: 0,title_left,title_right,label,pred_label,status
1508,fujifilm fp 3000b instant black and white film no longer available unique photo,335 camera bag black w black trim,0,0,tn
803,knog lights qudos action video light for gopro silver kopen en aanbiedingen bikeinn,nilox chest mount harness foolish acheter et offres sur swiminn,0,0,tn
1638,c mara canon sx510 negra alkosto tienda online,canon eos rebel t5i 18 megapixel digital slr camera with lens mm 55,0,0,tn
1671,silicon power 32gb microsdhc prijzen tweakers,transcend 32 gb class 10 sdhc flash memory card ts32gsdhc10e computer accessories peripherals page 4 laptops outlet direct,0,0,tn
871,gopro hero3 hdmi micro cable ahdmc 301,gopro wi fi remote charging cable unique photo,0,0,tn


In [None]:
temp[temp.status == 'fp'].head()

Unnamed: 0,title_left,title_right,label,pred_label,status
1334,nikon af s nikkor 35mm f 1 8g dx lens fumfie com,nikon af s nikkor 105mm f 2 8g if ed vr,0,1,fp
667,sony alpha a7rii mirrorless digital camera body only e c pro video systems inc professional cameras,sony alpha a6000 mirrorless camera body only black,0,1,fp
643,transcend 128gb sdxc ultimate uhs 1 memory card class 10 ts128gsdxc10u1,kingston 128gb sdxc flash memory card class 3 sda3,0,1,fp
1457,hd wireless n day night outdoor cloud camera dcs 2330l d link deutschland,wireless ac day night hd mini bullet cloud camera d link portugal,0,1,fp
1187,fujifilm s electronics accessories instax instant film everything but water,fujifilm fp 3000b instant black and white film no longer available unique photo,0,1,fp


In [33]:
temp[temp.status == 'fn'].head()

Unnamed: 0,title_left,title_right,label,pred_label,status
418,gopro helmet front mount ahfmt 001,helmet mount go pro accessories video cameras electronic orienteering devices at barrabes com,1,0,fn
304,transcend 16gb compact flash card 1000x camera photo accessories page 1851 all tech toys,transcend ts16gcf1000 dfd 16gb 50p cf 1000x 160mb s compactflash card,1,0,fn
176,64gb sdxc class10 uhs i card tradineur com,transcend ts64gsdxc10u1 64gb sdxc class 10 600x uhs i for 49 48,1,0,fn
67,sandisk 32gb sdhc flash memory card sdsdb 032g b35 label may change camera photo accessories page 6 all tech toys,transcend secure digital card sdhc class 4 32gb,1,0,fn
10,canon ef 100mm f 2 8l macro is usm prime lens for eos slr cameras at crutchfield com,canon ef macro lens 100 mm series prices cnet,1,0,fn


# Add more features

In [34]:
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1508 entries, 1014 to 1289
Data columns (total 22 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   id_left                 1508 non-null   int64 
 1   title_left              1508 non-null   object
 2   description_left        1100 non-null   object
 3   brand_left              596 non-null    object
 4   price_left              115 non-null    object
 5   specTableContent_left   208 non-null    object
 6   keyValuePairs_left      164 non-null    object
 7   category_left           1508 non-null   object
 8   cluster_id_left         1508 non-null   int64 
 9   identifiers_left        1508 non-null   object
 10  id_right                1508 non-null   int64 
 11  title_right             1508 non-null   object
 12  description_right       1051 non-null   object
 13  brand_right             583 non-null    object
 14  price_right             81 non-null     object
 15  s

In [36]:
# Print both descriptions of one random matching pair, then of one random
# non-matching pair.
# A dedicated name is used so this exploratory cell does not overwrite `temp`,
# the error-analysis frame (with its `status` column) that earlier cells read.
print('Sample of Match Product')
desc_sample = df[df.label == 1].sample(1)
print(desc_sample.description_left.iloc[0])
print(desc_sample.description_right.iloc[0], '\n')

print('Sample of Not Match Product')
desc_sample = df[df.label == 0].sample(1)
print(desc_sample.description_left.iloc[0])
print(desc_sample.description_right.iloc[0])

Sample of Match Product
gopro fixation lat rale fixez votre cam ra sur casque v hicule quipement et plus encore le bras articul tridirectionnel vous permet de cadrer vos images tr s facilement compatible avec toutes les ras hero3 hero2 hd hero original comprend adh sive incurv e
go pro camera side mount for attaching to the side of your helmet to a vehicle equipment etc the side mount can also be extended outwards so that you have a wider perspective and can also film yourself thanks to the 3 way adjustable pivot side mount for helmets 3 way adjust curved adhesive mount if you have any queries please contact one of our technical advisors 

Sample of Not Match Product
None
transcend ts32gsdhc10 secure digital high capacity sdhc ts32gsdhc10 flash memory


In [None]:
# Combine title and description into one text per product side.
# Missing descriptions are replaced by empty strings before concatenating.
for frame in (train, test):
    for side in ('left', 'right'):
        frame['description_' + side] = frame['description_' + side].fillna('')
        frame['title_desc_' + side]  = frame['title_' + side] + ' ' + frame['description_' + side]

# Plain lists per split; the combined list is all left texts followed by all right texts.
train_left_text  = train['title_desc_left'].tolist()
train_right_text = train['title_desc_right'].tolist()
train_text       = [*train_left_text, *train_right_text]
train_labels     = train['label'].tolist()
print(*map(len, (train_left_text, train_right_text, train_labels, train_text)))

test_left_text  = test['title_desc_left'].tolist()
test_right_text = test['title_desc_right'].tolist()
test_text       = [*test_left_text, *test_right_text]
test_labels     = test['label'].tolist()
print(*map(len, (test_left_text, test_right_text, test_labels, test_text)))

In [39]:
train_left_text[0], train_right_text[0], train_labels[0]

('disparador automatico a distancia rc 6 c maras disparador automatico a distancia rc 6 para c maras eos 550',
 'canon rs 60e3 camera remote control  series specs cnet ',
 0)

In [43]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

def get_similarity(vector):
    """Cosine similarity between row ``i`` and row ``i + n`` for each of the ``n`` pairs.

    Args:
        vector: 2-D dense array-like whose first half holds the left items and
            whose second half holds the matching right items, in the same
            order. ``n = len(vector) // 2``; a trailing unpaired row is ignored.

    Returns:
        List of ``n`` floats. A pair that contains an all-zero row scores 0.0.
    """
    rows = np.asarray(vector, dtype=float)
    n = len(rows) // 2
    if n == 0:
        return []

    left, right = rows[:n], rows[n:2 * n]
    # Row-wise dot products and norms in one vectorised pass, instead of one
    # pairwise library call per row pair.
    dots = np.einsum('ij,ij->i', left, right)
    norms = np.linalg.norm(left, axis=1) * np.linalg.norm(right, axis=1)
    # Guarded division: all-zero rows yield 0.0 rather than NaN.
    sims = np.divide(dots, norms, out=np.zeros_like(dots), where=norms > 0)
    return list(sims)

# Fit the TF-IDF vectorizer on the training title+description texts only,
# then encode both splits as dense arrays.
vectorizer   = TfidfVectorizer()
vectorizer.fit(train_text)

train_vector, test_vector = (
    vectorizer.transform(texts).toarray() for texts in (train_text, test_text)
)

In [45]:
train_vector[0]

array([0., 0., 0., ..., 0., 0., 0.])

In [46]:
from sklearn import metrics

# Sweep the decision threshold over the test-pair similarities and keep the
# one with the highest weighted-average F1.
# NOTE(review): the threshold is selected on the same test labels it is scored
# on - consider choosing it on a separate validation split.
thresh         = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
cosine_sim_arr = np.array(get_similarity(test_vector))
best_value     = (-1, -1)  # (threshold, weighted F1)

for thrs in thresh:
    y_pred = np.where(cosine_sim_arr > thrs, 1, 0)
    report = metrics.classification_report(
        y_true=test_labels, y_pred=y_pred, labels=[0, 1], digits=4, output_dict=True
    )
    weighted_f1 = report['weighted avg']['f1-score']

    if weighted_f1 > best_value[1]:
        best_value = (thrs, weighted_f1)

    # Per-class F1, weighted F1, then a blank separator line.
    print(thrs, report['0']['f1-score'], report['1']['f1-score'], weighted_f1, end='\n\n')

print('best value: ', best_value)

0.1 0.6893424036281179 0.5650793650793651 0.6561398457090067

0.2 0.8365758754863812 0.6528925619834711 0.7874964716139107

0.3 0.9040139616055847 0.6994535519125684 0.8493562859997787

0.4 0.8983050847457628 0.6385542168674698 0.8289007523232559

0.5 0.8859934853420195 0.5070422535211268 0.784739320225855

0.6 0.8835725677830942 0.434108527131783 0.7634776786143577

0.7 0.869701726844584 0.3025210084033613 0.718153439642035

0.8 0.8642745709828393 0.2434782608695652 0.6984004246298217

0.9 0.8549382716049383 0.12962962962962962 0.6611388725586256

best value:  (0.3, 0.8493562859997787)


# Experiment with Another Preprocessing

In [47]:
import re

def preprocessing(text):
  """Lower-case ``text`` (coerced to ``str``), remove characters that are
  neither word characters nor spaces, and insert a space at every
  digit/letter boundary."""
  cleaned = re.sub(r'[^\w ]', '', str(text).lower())
  # Digit-then-letter boundaries are split first, letter-then-digit second.
  for boundary in (r'([0-9])([a-z])', r'([a-z])([0-9])'):
    cleaned = re.sub(boundary, r'\1 \2', cleaned)
  return cleaned

preprocessing('Camera A53VV 100gb')

'camera a 53 vv 100 gb'

In [None]:
# Re-normalise both title columns of both splits in place with the current
# `preprocessing` definition.
for frame in (train, test):
  for column in ('title_left', 'title_right'):
    frame[column] = frame[column].apply(preprocessing)

In [49]:
# Flatten each split into plain Python lists. The combined list holds all left
# titles followed by all right titles, so pair i sits at positions i and i + n.
train_left_title  = train['title_left'].tolist()
train_right_title = train['title_right'].tolist()
train_title       = [*train_left_title, *train_right_title]
train_labels      = train['label'].tolist()
print(*map(len, (train_left_title, train_right_title, train_labels, train_title)))

test_left_title  = test['title_left'].tolist()
test_right_title = test['title_right'].tolist()
test_title       = [*test_left_title, *test_right_title]
test_labels      = test['label'].tolist()
print(*map(len, (test_left_title, test_right_title, test_labels, test_title)))

1508 1508 1508 3016
378 378 378 756


In [50]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

def get_similarity(vector):
    """Cosine similarity between row ``i`` and row ``i + n`` for each of the ``n`` pairs.

    Args:
        vector: 2-D dense array-like whose first half holds the left items and
            whose second half holds the matching right items, in the same
            order. ``n = len(vector) // 2``; a trailing unpaired row is ignored.

    Returns:
        List of ``n`` floats. A pair that contains an all-zero row scores 0.0.
    """
    rows = np.asarray(vector, dtype=float)
    n = len(rows) // 2
    if n == 0:
        return []

    left, right = rows[:n], rows[n:2 * n]
    # Row-wise dot products and norms in one vectorised pass, instead of one
    # pairwise library call per row pair.
    dots = np.einsum('ij,ij->i', left, right)
    norms = np.linalg.norm(left, axis=1) * np.linalg.norm(right, axis=1)
    # Guarded division: all-zero rows yield 0.0 rather than NaN.
    sims = np.divide(dots, norms, out=np.zeros_like(dots), where=norms > 0)
    return list(sims)

# Fit the TF-IDF vectorizer on the re-processed training titles only, then
# encode both splits as dense arrays.
vectorizer   = TfidfVectorizer()
vectorizer.fit(train_title)

train_vector, test_vector = (
    vectorizer.transform(titles).toarray() for titles in (train_title, test_title)
)

In [51]:
from sklearn import metrics

# Sweep the decision threshold over the test-pair similarities and keep the
# one with the highest weighted-average F1.
# NOTE(review): the threshold is selected on the same test labels it is scored
# on - consider choosing it on a separate validation split.
thresh         = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
cosine_sim_arr = np.array(get_similarity(test_vector))
best_value     = (-1, -1)  # (threshold, weighted F1)

for thrs in thresh:
    y_pred = np.where(cosine_sim_arr > thrs, 1, 0)
    report = metrics.classification_report(
        y_true=test_labels, y_pred=y_pred, labels=[0, 1], digits=4, output_dict=True
    )
    weighted_f1 = report['weighted avg']['f1-score']

    if weighted_f1 > best_value[1]:
        best_value = (thrs, weighted_f1)

    # Per-class F1, weighted F1, then a blank separator line.
    print(thrs, report['0']['f1-score'], report['1']['f1-score'], weighted_f1, end='\n\n')

print('best value: ', best_value)

0.1 0.6246973365617434 0.5481049562682216 0.6042321767478659

0.2 0.7866108786610878 0.6330935251798562 0.7455916916198064

0.3 0.8576923076923076 0.6864406779661018 0.8119345971040886

0.4 0.9025270758122743 0.7326732673267327 0.8571428571428571

0.5 0.9044368600682594 0.6705882352941176 0.8419534973640576

0.6 0.8885245901639344 0.5342465753424658 0.7938630041931187

0.7 0.870253164556962 0.3387096774193548 0.7282269947133156

0.8 0.8615863141524105 0.21238938053097342 0.6881236414122911

0.9 0.8496932515337424 0.05769230769230769 0.6380739517242585

best value:  (0.4, 0.8571428571428571)
