# Download Dataset

In [None]:
# Dataset download commands removed (competition use only).

!mkdir '/content/feature_data/'
!mkdir '/content/models/'

mkdir: cannot create directory ‘/content/feature_data/’: File exists
mkdir: cannot create directory ‘/content/models/’: File exists


In [None]:
# Third-party packages imported further down: fasttext (text embeddings),
# fuzzywuzzy and textdistance (string-similarity scores).
# python-Levenshtein is never imported directly in this notebook; presumably it
# is installed as fuzzywuzzy's optional speed-up -- TODO confirm.
# NOTE(review): no versions are pinned, so a fresh run may pick up newer releases.
!pip install fasttext -q
!pip install fuzzywuzzy -q
!pip install textdistance -q
!pip install python-Levenshtein -q

# Code

In [None]:
import numpy as np
import pandas as pd
import re
import joblib
import fasttext as ft

from scipy.spatial import distance
from scipy.stats import skew, kurtosis
from fuzzywuzzy import fuzz
import textdistance

from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from lightgbm.sklearn import LGBMClassifier

In [None]:
# Root of the project; every data/output path below is derived from it.
# NOTE(review): the path looks like a mounted Google Drive in Colab -- confirm
# the drive is mounted before running this cell.
PROJECT_DIR = '/content/drive/MyDrive/Colab Projects/product-pair-matching/'
DATA_DIR = PROJECT_DIR+'data/'
OUTPUTS_DIR = PROJECT_DIR+'outputs/'

# index_col=0 makes the first CSV column the row index of each frame, so the
# frames do not necessarily carry a plain 0..n-1 index.
train_df = pd.read_csv(DATA_DIR+'raw/new_training_set.csv', index_col=0)
test_df = pd.read_csv(DATA_DIR+'raw/new_test_set.csv', index_col=0)

## Clean text

In [None]:
# Filler tokens stripped from product titles before feature extraction.
# Several entries ('seler', 'salee', 'bisacod') look like deliberate captures of
# misspelled / run-together marketplace words -- presumably intentional.
default_stop_words = [
    'atau', 'dan', 'and', 'murah', 'grosir',
    'untuk', 'termurah', 'cod', 'terlaris', 'bisacod', 'terpopuler',
    'bisa', 'terbaru', 'tempat', 'populer', 'di', 'sale', 'bayar', 'flash',
    'promo', 'seler', 'in', 'salee', 'diskon', 'gila', 'starseller', 'seller',
]


def remove_stopwords(text):
    """Lower-case ``text`` and drop every token found in ``default_stop_words``.

    The input is cast to ``str`` first, split on any whitespace, filtered, and
    re-joined with single spaces, so runs of whitespace are collapsed as a side
    effect.
    """
    kept = []
    for token in str(text).lower().split():
        if token in default_stop_words:
            continue
        kept.append(token)
    return ' '.join(kept)

def preprocess_text(text):
    """Normalise a raw product title into lower-case alphanumeric tokens.

    Steps, in order:
      1. cast to ``str`` and lower-case;
      2. replace every ``&`` with the word ``and``;
      3. replace the first ``/`` with the word ``atau`` (any further slashes
         are turned into spaces by step 4);
      4. collapse every run of non-alphanumeric characters into one space;
      5. re-attach a lone ``s`` to the preceding word (``johnson s`` ->
         ``johnsons``);
      6. put spaces around every number so ``500ml`` becomes ``500 ml``.

    The result is stripped at both ends but may still contain doubled spaces
    between tokens.

    Parameters
    ----------
    text : object
        Raw title; anything accepted by ``str()``.

    Returns
    -------
    str
        The normalised title.
    """
    s = str(text).lower()
    s = re.sub('&', ' and ', s)
    # Pad the replacement with spaces (as done for '&'): without them 'a/b'
    # became 'aataub', gluing both neighbours together and hiding the 'atau'
    # token from the stop-word filter.
    s = re.sub('/', ' atau ', s, count=1)
    s = re.sub(r"[^a-zA-Z0-9]+", ' ', s)
    s = re.sub(' s ', 's ', s)
    # NOTE(review): the optional decimal part of this pattern can never match,
    # because every '.' was already replaced by a space two lines above.
    s = re.sub(r"([0-9]+(\.[0-9]+)?)", r" \1 ", s).strip()
    return s

def preprocess_text_df(df, txt_cols=['title_1', 'title_2'], func=preprocess_text):
    """Return a copy of ``df[txt_cols]`` with ``func`` applied to every cell.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the text columns; it is not modified.
    txt_cols : list of str
        Columns to transform (the default list is only read, never mutated).
    func : callable
        Called once per cell value; its return value replaces the cell.

    Returns
    -------
    pandas.DataFrame
        New frame containing only ``txt_cols``, transformed.
    """
    txt_df = df[txt_cols].copy()
    for col in txt_cols:
        # Map the column directly instead of DataFrame.apply(axis=1): no row
        # Series is built per record, and a zero-row frame still yields a
        # Series that can be assigned back.
        txt_df[col] = txt_df[col].map(func)
    return txt_df
    
def clean_text(df):
    """Clean ``title_1`` / ``title_2`` of ``df`` in place and return ``df``.

    Each title is first normalised with ``preprocess_text`` and then filtered
    with ``remove_stopwords``. The two columns of the frame passed in are
    overwritten, so the caller's frame is modified.
    """
    title_cols = ['title_1', 'title_2']
    print('Clean text...')
    # Order matters: stop words are matched against the normalised tokens.
    for step in (preprocess_text, remove_stopwords):
        df[title_cols] = preprocess_text_df(df, txt_cols=title_cols, func=step)
    return df

In [None]:
# Clean the title columns of both splits. clean_text overwrites the title
# columns of the frame it is given and returns that same frame, so after this
# cell train_df / test_df hold cleaned titles only (the raw text is gone).
print('TRAIN')
train_df = clean_text(train_df)
print('TEST')
test_df = clean_text(test_df)

TRAIN
Clean text...
TEST
Clean text...


## Feature Extractor Model

### Create text corpus

In [None]:
print('Concat text...')
# Stack the four cleaned title columns (train and test) into one column of
# text; this is the corpus the embedding model is trained on.
texts = pd.concat([train_df['title_1'], train_df['title_2'],
                   test_df['title_1'], test_df['title_2']], axis=0)
# Drop the old row labels directly and name the column explicitly. The earlier
# reset_index() + drop('index') + rename({0: ...}) chain only worked while the
# source index was unnamed and the concatenated Series had no name.
texts = texts.reset_index(drop=True).to_frame(name='Title')

print('Save text corpus...')
# One title per line, no header and no index column.
TXT_DIR = DATA_DIR+'interim/titles.txt'
texts.to_csv(TXT_DIR, header=False, index=False)
print(TXT_DIR)
texts.head()

Concat text...
Save text corpus...
/content/drive/MyDrive/Colab Projects/product-pair-matching/data/interim/titles.txt


Unnamed: 0,Title
0,johnsons top to toe hair body bath 500 ml
1,sandal humble
2,likuid likuit liquit baby pod liquid salt pod ...
3,6 pasang set anting tusuk bentuk lingkaran aks...
4,rorec natural skin care mask rorec sheet mask ...


### Create text model

In [None]:
# Alternative save location under the project outputs directory (currently disabled):
# TXT_EMB_DIR = OUTPUTS_DIR+'extractor/fasttext_emb_128.bin'
TXT_EMB_DIR = '/content/models/fasttext_emb_128.bin'
# Embedding size. create_feature_df hard-codes a vector length of 128, so the
# two values have to be changed together.
EMB_DIM = 128

print('Create text embedding model...')
# Unsupervised fastText model trained on the title corpus written to TXT_DIR,
# with minn=3 / maxn=6 and EMB_DIM-dimensional vectors.
# `model` is captured as the default argument of create_text_feature, so this
# cell must run before that function is defined.
model = ft.train_unsupervised(TXT_DIR, minn=3, maxn=6, dim=EMB_DIM)
print('Save model...')
model.save_model(TXT_EMB_DIR)
print(TXT_EMB_DIR)

Create text embedding model...
Save model...
/content/models/fasttext_emb_128.bin


## Feature Extraction

### Create feature

In [None]:
def calculate_distance(vect_1, vect_2, minkowski_p=3):
    """Return 11 pairwise / per-vector statistics for two embedding vectors.

    Parameters
    ----------
    vect_1, vect_2 : array-like
        1-D numeric vectors of equal length.
    minkowski_p : int or float, default 3
        Order of the Minkowski distance. scipy's own default is 2, which made
        that feature an exact duplicate of the euclidean distance; pass 2 to
        reproduce the old values.

    Returns
    -------
    list of float
        In this order: euclidean, braycurtis, canberra, chebyshev, cityblock,
        cosine and minkowski distance between the vectors, then skew of
        ``vect_1``, skew of ``vect_2``, kurtosis of ``vect_1`` and kurtosis of
        ``vect_2``.
    """
    return [
        distance.euclidean(vect_1, vect_2),
        distance.braycurtis(vect_1, vect_2),
        distance.canberra(vect_1, vect_2),
        distance.chebyshev(vect_1, vect_2),
        distance.cityblock(vect_1, vect_2),
        distance.cosine(vect_1, vect_2),
        # Explicit order so this is not just the euclidean distance again.
        distance.minkowski(vect_1, vect_2, p=minkowski_p),
        # nan_to_num replaces NaN / inf entries before the moment statistics.
        skew(np.nan_to_num(vect_1)),
        skew(np.nan_to_num(vect_2)),
        kurtosis(np.nan_to_num(vect_1)),
        kurtosis(np.nan_to_num(vect_2)),
    ]

def calculate_crafted(temp_df):
    """Compute the 29 hand-crafted pair features for ``title_1`` / ``title_2``.

    Works on a copy of ``temp_df``. Columns are appended in a fixed order
    (five length groups of three, three word-overlap columns, seven fuzzywuzzy
    scores, four textdistance scores) and the two title columns are dropped
    before returning, so the remaining column order is significant for callers
    that convert the result to a plain array.
    """
    df = temp_df.copy()

    def n_chars(title):
        return len(title)

    def n_non_space_chars(title):
        return len(title.replace(' ', ''))

    def n_distinct_chars(title):
        return len(set(title.replace(' ', '')))

    def n_words(title):
        return len(title.split())

    def n_distinct_words(title):
        return len(set(title.split()))

    # (column for title_1, column for title_2, absolute-difference column, counter)
    per_title_counts = [
        ('len_txt_1', 'len_txt_2', 'len_diff', n_chars),
        ('len_char_txt_1', 'len_char_txt_2', 'len_char_diff', n_non_space_chars),
        ('len_uniq_char_txt_1', 'len_uniq_char_txt_2', 'len_uniq_char_diff', n_distinct_chars),
        ('len_word_txt_1', 'len_word_txt_2', 'len_word_diff', n_words),
        ('len_uniq_word_txt_1', 'len_uniq_word_txt_2', 'len_uniq_word_diff', n_distinct_words),
    ]
    for col_1, col_2, col_diff, counter in per_title_counts:
        df[col_1] = df.title_1.apply(counter)
        df[col_2] = df.title_2.apply(counter)
        df[col_diff] = np.abs(df[col_1] - df[col_2])

    def score_pairs(scorer):
        # Row-wise evaluation of scorer(title_1, title_2).
        return df.apply(lambda row: scorer(row['title_1'], row['title_2']), axis=1)

    df['common_words'] = score_pairs(lambda a, b: len(set(a.split()) & set(b.split())))
    df['union_words'] = score_pairs(lambda a, b: len(set(a.split()) | set(b.split())))
    # The +1 in the denominator keeps the ratio defined when both titles are empty.
    df['jaccard_words'] = df.common_words / (df.union_words + 1)

    pair_scorers = [
        ('fuzz_qratio', fuzz.QRatio),
        ('fuzz_WRatio', fuzz.WRatio),
        ('fuzz_partial_ratio', fuzz.partial_ratio),
        ('fuzz_partial_token_set_ratio', fuzz.partial_token_set_ratio),
        ('fuzz_partial_token_sort_ratio', fuzz.partial_token_sort_ratio),
        ('fuzz_token_set_ratio', fuzz.token_set_ratio),
        ('fuzz_token_sort_ratio', fuzz.token_sort_ratio),
        ('txt_hamming', textdistance.hamming.normalized_similarity),
        ('txt_jaro_winkler', textdistance.jaro_winkler.normalized_similarity),
        ('txt_overlap', textdistance.overlap.normalized_similarity),
        ('txt_mra', textdistance.mra.normalized_similarity),
    ]
    for column, scorer in pair_scorers:
        df[column] = score_pairs(scorer)

    return df.drop(columns=['title_1', 'title_2'])

In [None]:
def create_text_feature(temp_df, dir, model=model):
    """Build the per-pair feature matrix and dump it to ``dir`` with joblib.

    For every row the output holds, side by side: the sentence vector of
    ``title_1``, the sentence vector of ``title_2``, the values returned by
    ``calculate_distance`` for the two vectors, and the columns produced by
    ``calculate_crafted``.

    ``dir`` is the target file path of the dump. ``model`` defaults to the
    module-level ``model`` object that existed when this function was defined.
    The concatenated array is returned.
    """
    titles = temp_df[['title_1', 'title_2']].copy()

    print('Calculate text vector...')
    emb_1, emb_2 = [], []
    for first, second in zip(titles['title_1'], titles['title_2']):
        emb_1.append(model.get_sentence_vector(first))
        emb_2.append(model.get_sentence_vector(second))

    print('Calculate distance...')
    pair_stats = [calculate_distance(a, b) for a, b in zip(emb_1, emb_2)]

    print('Calculate crafted feature...')
    crafted = calculate_crafted(titles)

    text_feat = np.concatenate([emb_1, emb_2, pair_stats, crafted.to_numpy()], axis=1)
    print('Save text vector...')
    joblib.dump(text_feat, dir)
    print(dir)
    return text_feat

In [None]:
# Build the feature matrix for each split and store it next to the data.
# create_text_feature dumps the array to the given path with joblib and also
# returns it, so the printed shape is (rows, features).
print('TRAIN DATA')
TRAIN_VECTOR_DIR = DATA_DIR+'interim/train_text_vector.pkl'
train_text_vector = create_text_feature(train_df, TRAIN_VECTOR_DIR)
print(train_text_vector.shape)

print('TEST DATA')
TEST_VECTOR_DIR = DATA_DIR+'interim/test_text_vector.pkl'
test_text_vector = create_text_feature(test_df, TEST_VECTOR_DIR)
print(test_text_vector.shape)

TRAIN DATA
Calculate text vector...
Calculate distance...
Calculate crafted feature...
Save text vector...
/content/drive/MyDrive/Colab Projects/product-pair-matching/data/interim/train_text_vector.pkl
(10181, 296)
TEST DATA
Calculate text vector...
Calculate distance...
Calculate crafted feature...
Save text vector...
/content/drive/MyDrive/Colab Projects/product-pair-matching/data/interim/test_text_vector.pkl
(32580, 296)


### Create feature dataframe

In [None]:
def create_feature_df(df, vect, dir, label_col=None,
                      vect_len=128, dist_len=11, crafted_len=29):
    """Wrap a feature matrix in a named DataFrame and save it as CSV.

    Parameters
    ----------
    df : pandas.DataFrame
        Source frame, used only to read ``label_col``; its rows must be in the
        same order as the rows of ``vect``.
    vect : array-like, shape (n_rows, 2 * vect_len + dist_len + crafted_len)
        Feature matrix laid out as title-1 vector, title-2 vector, distance
        features, crafted features.
    dir : str
        Path of the CSV file to write (written without the index).
    label_col : str, optional
        If given, this column of ``df`` is appended as the last column.
    vect_len, dist_len, crafted_len : int
        Widths of the individual feature groups; the defaults match the
        128-dimensional embedding, 11 distance values and 29 crafted features
        produced elsewhere in this notebook.

    Returns
    -------
    pandas.DataFrame
        The feature frame that was written to ``dir``.

    Raises
    ------
    ValueError
        If the width of ``vect`` does not match the generated column names, or
        the number of labels differs from the number of feature rows.
    """
    col_list = [f'txt_1_{i}' for i in range(vect_len)] + \
               [f'txt_2_{i}' for i in range(vect_len)] + \
               [f'txt_dist_{i}' for i in range(dist_len)] + \
               [f'txt_crafted_{i}' for i in range(crafted_len)]

    feats_df = pd.DataFrame(data=vect, columns=col_list)

    if label_col is not None:
        # Assign by position. feats_df has a fresh 0..n-1 index while df keeps
        # whatever index it was loaded with; assigning the Series itself would
        # align on index labels and silently misplace labels or insert NaN.
        feats_df[label_col] = df[label_col].to_numpy()

    print('Save dataframe...')
    feats_df.to_csv(dir, index=False)
    print(dir)
    return feats_df

In [None]:
# Turn both feature matrices into named DataFrames and write them as CSV.
# Only the training frame gets the 'Label' column appended, which is why its
# printed shape has one more column than the test frame.
print('TRAIN DATA')
TRAIN_DF_DIR = DATA_DIR+'clean/train_text_df.csv'
train_text_df = create_feature_df(train_df, train_text_vector, TRAIN_DF_DIR, 'Label')
print(train_text_df.shape)

print('TEST DATA')
TEST_DF_DIR = DATA_DIR+'clean/test_text_df.csv'
test_text_df = create_feature_df(test_df, test_text_vector, TEST_DF_DIR)
print(test_text_df.shape)


TRAIN DATA
Save dataframe...
/content/drive/MyDrive/Colab Projects/product-pair-matching/data/clean/train_text_df.csv
(10181, 297)
TEST DATA
Save dataframe...
/content/drive/MyDrive/Colab Projects/product-pair-matching/data/clean/test_text_df.csv
(32580, 296)
