# Product Clustering Approaches
## In this notebook, different approaches are compared for mapping a product title to its product cluster name


In [1]:
import pandas as pd
import numpy as np
import textdistance as td
import json
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
from torch.nn.utils.rnn import pad_sequence
from utils.helper_functions import read_dataframes, encode_sequence, tokenize
import itertools

In [2]:
# Load the three frames returned by the project helper (presumably the
# train / validation / test splits, going by the names — confirm in utils.helper_functions).
train_df, val_df, test_df  = read_dataframes()

In [3]:
train_df

Unnamed: 0,product_title,cluster_label,category
0,zanussi zwf71243w washing machines 7kg,zanussi zwf71243w,9
1,lg 43 lk5100pla led tv 43lk5100pla mw01,lg 43lk5100pla,8
2,hp amd opteron quad core 8380 2 5ghz fio kit 2...,hp amd opteron 8380 2 5ghz upgrade tray,0
3,bosch kiv87vf30g serie 4 70 30 integrated frid...,bosch kiv87vf30g integrated,4
4,amica einbau mikrowelle black design emw 13170,amica emw 13170 black,6
...,...,...,...
21181,intel xeon e3 1225 v6 4x 3 30ghz sockel 1151 b...,intel core e3 1225 v6 3 3ghz box,0
21182,1400rpm washing machine 8kg load class orange,gorenje w8543lo,9
21183,whirlpool uw8f2cxb 187cm freezer,whirlpool uw8 f2c xb uk stainless steel,3
21184,bosch smi50c12gb smi50c12gb,bosch smi50c12gb white,2


In [70]:
# Candidate labels: every distinct cluster label present in any of the three frames.
# They are sorted because set iteration order for strings changes between kernel
# restarts (hash randomisation); a fixed order keeps np.argmax tie-breaking, and
# therefore the reported accuracies, reproducible across runs.
all_labels = (
    train_df['cluster_label'].tolist()
    + val_df['cluster_label'].tolist()
    + test_df['cluster_label'].tolist()
)
# key=str keeps the sort well-defined even if a non-string value slips in.
cluster_labels = np.array(sorted(set(all_labels), key=str))

## Baseline approach: select the cluster label by direct word (substring) matching

In [15]:
def search_token(word, cluster_label):
    """Return True when ``word`` occurs inside ``cluster_label``.

    For string labels this is a substring test, so ``"43"`` matches the label
    ``"lg 43lk5100pla"`` even though it is not a whole token there.
    """
    return word in cluster_label

In [17]:
def search_similarity(text, cluster_labels):
    """Pick the label that contains the largest number of the title's words.

    Args:
        text: product title; split on single spaces into words.
        cluster_labels: NumPy array of candidate label strings.

    Returns:
        The element of ``cluster_labels`` with the highest hit count
        (the first one on ties, as decided by ``np.argmax``).
    """
    contains_word = np.vectorize(search_token)
    hit_counts = np.zeros(len(cluster_labels))
    for token in text.split(' '):
        # Boolean hit mask per label; adding it counts one hit for this token.
        hit_counts = hit_counts + contains_word(token, cluster_labels)
    best_position = np.argmax(hit_counts)
    return cluster_labels[best_position]

In [18]:
# Predict one cluster label per validation title with the substring baseline.
baseline_predictions = val_df['product_title'].apply(
    lambda title: search_similarity(title, cluster_labels)
)
val_df['predicted_cluster'] = baseline_predictions

In [21]:
# Share of validation rows whose predicted label equals the true label.
baseline_hits = val_df['predicted_cluster'] == val_df['cluster_label']
print('Accuracy on Validation Dataset : ', baseline_hits.sum() / len(val_df))

Accuracy on Validation Dataset :  0.46459926366468424


## Searching with the cluster label tokenized (whole-word matching)

In [143]:
def search_token_tokenized(word, cluster_label):
    """Return True when ``word`` equals one of the space-separated tokens of ``cluster_label``."""
    label_tokens = cluster_label.split(' ')
    return word in label_tokens

In [198]:
def search_similarity_tokenized(text, cluster_labels):
    """Pick the label sharing the most whole tokens with the title.

    Args:
        text: product title; split on single spaces into words.
        cluster_labels: NumPy array of candidate label strings.

    Returns:
        The element of ``cluster_labels`` with the highest number of
        title words found among its tokens (first one on ties).
    """
    token_in_label = np.vectorize(search_token_tokenized)
    hit_counts = np.zeros(len(cluster_labels))
    for token in text.split(' '):
        hit_counts = hit_counts + token_in_label(token, cluster_labels)
    best_position = np.argmax(hit_counts)
    return cluster_labels[best_position]

In [71]:
# NOTE: this writes to the same 'predicted_cluster' column as the substring
# baseline, so those earlier predictions are overwritten here.
tokenized_predictions = val_df['product_title'].apply(
    lambda title: search_similarity_tokenized(title, cluster_labels)
)
val_df['predicted_cluster'] = tokenized_predictions

In [72]:
# Share of validation rows whose predicted label equals the true label.
tokenized_hits = val_df['predicted_cluster'] == val_df['cluster_label']
print('Accuracy on Validation Dataset : ', tokenized_hits.sum() / len(val_df))

Accuracy on Validation Dataset :  0.5016992353440951


## Cosine Similarity

In [161]:
def cosine_similarity(text, cluster_label):
    """Score a title against a label with textdistance's cosine measure on their space-split tokens."""
    title_tokens = text.split(' ')
    label_tokens = cluster_label.split(' ')
    return td.cosine(title_tokens, label_tokens)

In [162]:
def search_max_cosine(text, cluster_labels):
    """Return the candidate label with the highest cosine score against ``text``.

    Args:
        text: product title string.
        cluster_labels: NumPy array of candidate label strings.

    Returns:
        The element of ``cluster_labels`` that maximises ``cosine_similarity``
        (the first one on ties, as decided by ``np.argmax``).
    """
    # otypes pins a float result. Without it np.vectorize infers the output
    # dtype from the first call only, so an integer first score would silently
    # truncate every later fractional score.
    cos_fun = np.vectorize(cosine_similarity, otypes=[float])
    match_scores = cos_fun(text, cluster_labels)
    return cluster_labels[np.argmax(match_scores)]

In [165]:
# Predict one cluster label per validation title by maximum cosine score.
cosine_predictions = val_df['product_title'].apply(
    lambda title: search_max_cosine(title, cluster_labels)
)
val_df['predicted_cluster_cosine'] = cosine_predictions

In [166]:
# Share of validation rows whose cosine-based prediction equals the true label.
cosine_hits = val_df['predicted_cluster_cosine'] == val_df['cluster_label']
print('Accuracy on Validation Dataset : ', cosine_hits.sum() / len(val_df))

Accuracy on Validation Dataset :  0.6367884451996602


## Dice Score

In [169]:
def dice_similarity(text, cluster_label):
    """Score a title against a label with textdistance's Sorensen-Dice measure on their space-split tokens."""
    title_tokens = text.split(' ')
    label_tokens = cluster_label.split(' ')
    return td.sorensen_dice(title_tokens, label_tokens)

In [170]:
def search_max_dice(text, cluster_labels):
    """Return the candidate label with the highest Dice score against ``text``.

    Args:
        text: product title string.
        cluster_labels: NumPy array of candidate label strings.

    Returns:
        The element of ``cluster_labels`` that maximises ``dice_similarity``
        (the first one on ties, as decided by ``np.argmax``).
    """
    # otypes pins a float result. Without it np.vectorize infers the output
    # dtype from the first call only, so an integer first score would silently
    # truncate every later fractional score.
    dice_fun = np.vectorize(dice_similarity, otypes=[float])
    match_scores = dice_fun(text, cluster_labels)
    return cluster_labels[np.argmax(match_scores)]

In [171]:
# Predict one cluster label per validation title by maximum Dice score.
dice_predictions = val_df['product_title'].apply(
    lambda title: search_max_dice(title, cluster_labels)
)
val_df['predicted_cluster_dice'] = dice_predictions

In [172]:
# Share of validation rows whose Dice-based prediction equals the true label.
dice_hits = val_df['predicted_cluster_dice'] == val_df['cluster_label']
print('Accuracy on Validation Dataset : ', dice_hits.sum() / len(val_df))

Accuracy on Validation Dataset :  0.6359388275276125


## Tversky Index (used as a similarity score)

In [183]:
# The two Tversky weighting coefficients, one per compared sequence
# (presumably title first, label second, matching the call order used below — TODO confirm).
TVERSKY_KS = (0.1, 0.4)
tversky = td.Tversky(ks=TVERSKY_KS)

In [184]:
def tversky_distance(text, cluster_label):
    """Apply the module-level ``tversky`` scorer to the space-split tokens of a title and a label.

    NOTE(review): despite the name, callers take the argmax of this value, so it
    is used as a similarity score.
    """
    title_tokens = text.split(' ')
    label_tokens = cluster_label.split(' ')
    return tversky(title_tokens, label_tokens)

In [185]:
def search_max_tversky(text, cluster_labels):
    """Return the candidate label with the highest Tversky score against ``text``.

    Args:
        text: product title string.
        cluster_labels: NumPy array of candidate label strings.

    Returns:
        The element of ``cluster_labels`` that maximises ``tversky_distance``
        (the first one on ties, as decided by ``np.argmax``).
    """
    # otypes pins a float result. Without it np.vectorize infers the output
    # dtype from the first call only, so an integer first score would silently
    # truncate every later fractional score.
    tversky_fun = np.vectorize(tversky_distance, otypes=[float])
    match_scores = tversky_fun(text, cluster_labels)
    return cluster_labels[np.argmax(match_scores)]

In [186]:
# Predict one cluster label per validation title by maximum Tversky score.
tversky_predictions = val_df['product_title'].apply(
    lambda title: search_max_tversky(title, cluster_labels)
)
val_df['predicted_cluster_tversky'] = tversky_predictions

In [187]:
# Share of validation rows whose Tversky-based prediction equals the true label.
tversky_hits = val_df['predicted_cluster_tversky'] == val_df['cluster_label']
print('Accuracy on Validation Dataset : ', tversky_hits.sum() / len(val_df))

Accuracy on Validation Dataset :  0.6346644010195412


# Clustering Using Pretrained LSTM Model Embeddings
## Extract Embeddings from LSTM Output

In [4]:
from create_vocab import CreateVocab

In [5]:
# Build the vocabulary object; later cells read vocab.word2idx and vocab.embedding_dim.
# The meaning of first_run / load_embeddings is defined in create_vocab (not visible here).
vocab = CreateVocab(first_run=False, load_embeddings=False)

In [6]:
# Read the saved cluster-label mapping from disk.
with open('model_files/cluster_label_mapping.json', 'r') as mapping_file:
    cluster_label_mapping = json.load(mapping_file)

In [7]:
# Candidate cluster names in file order: iterating the loaded JSON object yields
# its keys (assuming the top-level JSON value is an object/dict — TODO confirm).
unique_cluster_names = list(cluster_label_mapping)

In [8]:
# Empty frame that the following cells fill with one row per cluster name.
df = pd.DataFrame()

In [9]:
# One row per candidate cluster name, in the same order as unique_cluster_names.
df['cluster_label'] = unique_cluster_names

In [10]:
# Tokenize every cluster label with the project tokenizer.
label_tokens = df['cluster_label'].apply(lambda label: tokenize(label))
df['tokenized_x'] = label_tokens

In [11]:
# Encode each token list against the vocabulary's word -> index map.
label_encodings = df['tokenized_x'].apply(
    lambda tokens: encode_sequence(tokens, vocab.word2idx)
)
df['encoded_x'] = label_encodings

In [12]:
class TextDataset(Dataset):
    """Dataset over a DataFrame whose ``encoded_x`` column holds indexable pairs.

    Each ``encoded_x`` cell is read as ``cell[0]`` (a NumPy array, converted to
    a tensor) and ``cell[1]`` (returned unchanged; ``collate_fn`` in this
    notebook collects it as the sequence length).
    """

    def __init__(self, df):
        self.df = df

    def __len__(self):
        return self.df.shape[0]

    def __getitem__(self, idx):
        # DataLoader samplers yield positions 0..len-1, so look rows up by
        # position (.iloc) rather than by label (.loc). Label lookup breaks as
        # soon as the frame's index is not the default 0..n-1 range.
        encoded = self.df['encoded_x'].iloc[idx]
        x = torch.from_numpy(encoded[0])
        l = encoded[1]
        return x, l

In [13]:
# Only 'encoded_x' is read by TextDataset.__getitem__; 'tokenized_x' is passed along but unused.
cluster_label_dataset = TextDataset(df[['encoded_x', 'tokenized_x']])

In [14]:
def collate_fn(batch):
    """Merge ``(sequence_tensor, length)`` samples into one padded batch.

    Args:
        batch: list of 2-tuples; the first item is a 1-D tensor of arbitrary
            length, the second is that sample's scalar length.

    Returns:
        A pair ``(features, lengths)``: ``features`` is the batch-first stack
        of the sequences, right-padded with 0 to the longest one; ``lengths``
        is an int32 tensor holding the per-sample lengths.
    """
    sequences, seq_lengths = zip(*batch)
    padded = pad_sequence(sequences, batch_first=True, padding_value=0)
    length_tensor = torch.Tensor(seq_lengths).int()
    return padded, length_tensor

In [15]:
# shuffle=False keeps batches in dataset order, so the n-th extracted embedding
# belongs to the n-th cluster name.
cluster_label_dataloader = DataLoader(
    cluster_label_dataset,
    batch_size=32,
    shuffle=False,
    collate_fn=collate_fn,
    pin_memory=True,
)

In [16]:
# Use the first CUDA device when one is available, otherwise fall back to the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [17]:
class LSTM(nn.Module):
    """Single-layer bidirectional LSTM classifier that also returns its final hidden state.

    Args:
        vocab_size: number of rows in the embedding table.
        embedding_dim: width of each token embedding.
        embedding_matrix: optional NumPy array loaded as the embedding weights;
            ``None`` keeps the default initialisation.
        hidden_size: hidden width of each LSTM direction.
        num_output: width of the final linear layer's output.
    """

    def __init__(self, vocab_size, embedding_dim, embedding_matrix, hidden_size, num_output):
        super().__init__()
        self.embeddings_layer = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        if embedding_matrix is not None:
            pretrained_weights = torch.from_numpy(embedding_matrix)
            self.embeddings_layer.load_state_dict({'weight': pretrained_weights})
        self.lstm = nn.LSTM(embedding_dim, hidden_size, bidirectional=True)
        self.dropout = nn.Dropout(p=0.2)
        # Both directions are concatenated, hence 2 * hidden_size input features.
        self.dense = nn.Linear(2 * hidden_size, num_output)

    def forward(self, text_index, text_lengths):
        """Return ``(output, hidden)`` for a padded batch of token indices.

        ``hidden`` is the concatenation of the last two slices of the LSTM's
        final hidden state, shape ``(batch, 2 * hidden_size)``; ``output`` is
        the linear layer applied to the dropout of ``hidden``.
        """
        token_vectors = self.embeddings_layer(text_index)
        packed_tokens = nn.utils.rnn.pack_padded_sequence(
            token_vectors, text_lengths, batch_first=True, enforce_sorted=False
        )
        _, (final_hidden, _) = self.lstm(packed_tokens)
        hidden = torch.cat((final_hidden[-2], final_hidden[-1]), dim=1)
        output = self.dense(self.dropout(hidden))
        return output, hidden

In [18]:
# No pretrained embedding matrix is passed; the weights are expected to come from a checkpoint.
model = LSTM(
    vocab_size=len(vocab.word2idx),
    embedding_dim=vocab.embedding_dim,
    embedding_matrix=None,
    hidden_size=100,
    num_output=10,
).to(device)

In [19]:
# map_location remaps tensors that were saved from a GPU session onto the device
# selected for this kernel, so the checkpoint also loads on CPU-only machines.
model_dict = torch.load('model_files/lstm/epoch_4.tar', map_location=device)

In [20]:
# Restore the trained weights; the loaded checkpoint object is used directly as the state dict.
model.load_state_dict(model_dict)

<All keys matched successfully>

In [21]:
# Collects one tensor of final hidden states per batch of cluster labels.
cluster_label_embeddings = []

In [22]:
# Inference only: eval() switches dropout off and no_grad() stops autograd from
# recording a graph for every batch, which saves memory with no change to the
# hidden states being collected.
model.eval()
with torch.no_grad():
    for text, text_lengths in cluster_label_dataloader:
        text = text.to(device)
        outputs, embeddings = model(text, text_lengths)
        # Keep the hidden states on the CPU so the list does not pin GPU memory.
        cluster_label_embeddings.append(embeddings.detach().cpu())

In [23]:
# Flatten the per-batch tensors into one flat list with a vector per cluster label.
cluster_label_embeddings = [
    label_vector
    for batch_vectors in cluster_label_embeddings
    for label_vector in batch_vectors
]

In [24]:
# Each vector is the concatenation of two equal halves (see LSTM.forward); view
# it as two rows and average them. reshape(2, -1) infers the half width instead
# of hard-coding the hidden size of 100, so it keeps working if that changes.
for i in range(len(cluster_label_embeddings)):
    cluster_label_embeddings[i] = torch.mean(cluster_label_embeddings[i].reshape(2, -1), 0)

In [25]:
# Turn the list of per-label vectors into a single 2-D tensor (one row per label).
stacked_label_vectors = np.array(cluster_label_embeddings)
cluster_label_embeddings = torch.from_numpy(stacked_label_vectors)

## Embeddings for the cluster labels have now been extracted; next, embed the validation product titles

In [26]:
# Tokenize every validation title with the project tokenizer.
title_tokens = val_df['product_title'].apply(lambda title: tokenize(title))
val_df['tokenized_x'] = title_tokens

In [27]:
# Encode each token list against the vocabulary's word -> index map.
title_encodings = val_df['tokenized_x'].apply(
    lambda tokens: encode_sequence(tokens, vocab.word2idx)
)
val_df['encoded_x'] = title_encodings

In [28]:
# NOTE: TextDataset is already defined earlier in this notebook; this later
# definition shadows it for every cell that runs afterwards.
class TextDataset(Dataset):
    """Dataset over a DataFrame whose ``encoded_x`` column holds indexable pairs.

    Each ``encoded_x`` cell is read as ``cell[0]`` (a NumPy array, converted to
    a tensor) and ``cell[1]`` (returned unchanged; ``collate_fn`` in this
    notebook collects it as the sequence length).
    """

    def __init__(self, df):
        self.df = df

    def __len__(self):
        return self.df.shape[0]

    def __getitem__(self, idx):
        # DataLoader samplers yield positions 0..len-1, so look rows up by
        # position (.iloc) rather than by label (.loc). Label lookup breaks as
        # soon as the frame's index is not the default 0..n-1 range.
        encoded = self.df['encoded_x'].iloc[idx]
        x = torch.from_numpy(encoded[0])
        l = encoded[1]
        return x, l

In [29]:
# Only 'encoded_x' is read by TextDataset.__getitem__; 'tokenized_x' is passed along but unused.
val_dataset = TextDataset(val_df[['encoded_x', 'tokenized_x']])

In [30]:
# shuffle=False keeps batches in val_df row order, so the n-th extracted
# embedding belongs to the n-th validation row.
val_dataloader = DataLoader(
    val_dataset,
    batch_size=32,
    shuffle=False,
    collate_fn=collate_fn,
    pin_memory=True,
)

In [31]:
# Collects one tensor of final hidden states per batch of validation titles.
product_title_embeddings = []

In [32]:
# Inference only: eval() switches dropout off and no_grad() stops autograd from
# recording a graph for every batch, which saves memory with no change to the
# hidden states being collected.
model.eval()
with torch.no_grad():
    for text, text_lengths in val_dataloader:
        text = text.to(device)
        outputs, embeddings = model(text, text_lengths)
        # Keep the hidden states on the CPU so the list does not pin GPU memory.
        product_title_embeddings.append(embeddings.detach().cpu())

In [33]:
# Flatten the per-batch tensors into one flat list with a vector per validation title.
product_title_embeddings = [
    title_vector
    for batch_vectors in product_title_embeddings
    for title_vector in batch_vectors
]

In [34]:
# Each vector is the concatenation of two equal halves (see LSTM.forward); view
# it as two rows and average them. reshape(2, -1) infers the half width instead
# of hard-coding the hidden size of 100, so it keeps working if that changes.
for i in range(len(product_title_embeddings)):
    product_title_embeddings[i] = torch.mean(product_title_embeddings[i].reshape(2, -1), 0)

In [46]:
# Cosine similarity computed along dim=1; eps is the small constant PyTorch uses to avoid division by zero.
cos = nn.CosineSimilarity(dim=1, eps=1e-6)

In [36]:
# Turn the list of per-title vectors into a single 2-D tensor (one row per title).
stacked_title_vectors = np.array(product_title_embeddings)
product_title_embeddings = torch.from_numpy(stacked_title_vectors)

In [37]:
# Put the label embeddings on the same device the prediction loop moves each title vector to.
cluster_label_embeddings = cluster_label_embeddings.to(device)

In [57]:
# For every title vector, score it against all label embeddings and keep the
# name of the best-scoring label.
predicted_cluster_label = []
for title_vector in product_title_embeddings:
    # Shape (1, D) so it broadcasts against the (num_labels, D) label matrix.
    query = title_vector.to(device).unsqueeze(0)
    cos_score = cos(query, cluster_label_embeddings)
    best_position = int(torch.argmax(cos_score))
    predicted_cluster_label.append(unique_cluster_names[best_position])

In [62]:
val_df['cluster_label']

0                                 praktica luxmedia wp240
1       amd phenom ii x4 970 3 5ghz socket am3 2000mhz...
2                                    liebherr t1404 white
3                                          lg 43uk6300plb
4                      hisense rb381n4wc1 stainless steel
                              ...                        
7057                                          lg 55lf580v
7058                                 swan sm40010redn red
7059                            samsung galaxy a3 sm a310
7060                               praktica luxmedia z250
7061                      intel core i7 3770t 2 5ghz tray
Name: cluster_label, Length: 7062, dtype: object

In [61]:
# The list is assigned positionally, one prediction per val_df row in row order.
val_df['predicted_cluster_label'] = predicted_cluster_label

In [64]:
# Number (not fraction) of validation rows whose embedding-based prediction is correct.
embedding_hits = val_df['predicted_cluster_label'] == val_df['cluster_label']
embedding_hits.sum()

355

## The score is very low (355 correct out of 7,062 validation titles, about 5%); training the embeddings in a Siamese fashion would probably help improve it