# Product Clustering Approaches
## In this notebook, different approaches are compared for mapping a product title to its product cluster name


In [202]:
import pandas as pd
import numpy as np
import textdistance as td
from utils.helper_functions import read_dataframes

In [203]:
# Load the train / validation / test splits through the project helper.
# NOTE(review): read_dataframes comes from utils.helper_functions; its data
# source and split logic are not visible in this notebook -- confirm there.
train_df, val_df, test_df  = read_dataframes()

In [204]:
# Show the training split as the cell output (the rendering below is
# truncated by pandas, hence the "..." row).
train_df

Unnamed: 0,product_title,cluster_label,category
0,zanussi zwf71243w washing machines 7kg,zanussi zwf71243w,9
1,lg 43 lk5100pla led tv 43lk5100pla mw01,lg 43lk5100pla,8
2,hp amd opteron quad core 8380 2 5ghz fio kit 2...,hp amd opteron 8380 2 5ghz upgrade tray,0
3,bosch kiv87vf30g serie 4 70 30 integrated frid...,bosch kiv87vf30g integrated,4
4,amica einbau mikrowelle black design emw 13170,amica emw 13170 black,6
...,...,...,...
21181,intel xeon e3 1225 v6 4x 3 30ghz sockel 1151 b...,intel core e3 1225 v6 3 3ghz box,0
21182,1400rpm washing machine 8kg load class orange,gorenje w8543lo,9
21183,whirlpool uw8f2cxb 187cm freezer,whirlpool uw8 f2c xb uk stainless steel,3
21184,bosch smi50c12gb smi50c12gb,bosch smi50c12gb white,2


In [70]:
# Candidate label vocabulary: every distinct cluster label found in any split.
# The labels are sorted because the iteration order of a set of strings
# changes between Python processes (hash randomization), and the np.argmax
# calls in the search functions break ties by position. Without a fixed
# order, predictions and the reported accuracies differ from run to run.
# NOTE(review): assumes every cluster_label is a string (no NaN); sorted()
# raises TypeError on mixed str/float values -- confirm.
cluster_labels = np.array(sorted(set(
    train_df['cluster_label'].tolist()
    + val_df['cluster_label'].tolist()
    + test_df['cluster_label'].tolist()
)))

## Baseline approach: select the cluster label by direct word matching

In [15]:
def search_token(word, cluster_label):
    """Report whether ``word`` occurs anywhere inside ``cluster_label``.

    The check uses the ``in`` operator, so for strings it is a substring
    test: a word that is only part of a longer label token still matches.

    Args:
        word: Token to look for.
        cluster_label: Label text to search in.

    Returns:
        True if ``word`` is contained in ``cluster_label``, otherwise False.
    """
    return word in cluster_label

In [17]:
def search_similarity(text, cluster_labels):
    """Pick the cluster label that matches the most words of ``text``.

    ``text`` is split on single spaces and every resulting word is checked
    against every label with ``search_token``. Each hit adds one to that
    label's score; a word that appears twice in ``text`` is counted twice.

    Args:
        text: Product title to classify.
        cluster_labels: Indexable collection of candidate labels (a NumPy
            array of strings in this notebook).

    Returns:
        The label with the highest score. On ties ``np.argmax`` selects the
        first label in ``cluster_labels`` order.
    """
    contains_word = np.vectorize(search_token)
    totals = np.zeros(len(cluster_labels))
    for token in text.split(' '):
        # Boolean hit mask is added as 0/1 to the running per-label count.
        totals = totals + contains_word(token, cluster_labels)
    best_index = np.argmax(totals)
    return cluster_labels[best_index]

In [18]:
# Baseline prediction: run the word-matching search over every validation title.
val_df['predicted_cluster'] = val_df['product_title'].apply(search_similarity, args=(cluster_labels,))

In [21]:
# Fraction of validation rows whose predicted label equals the true label.
print('Accuracy on Validation Dataset : ', (val_df['cluster_label'] == val_df['predicted_cluster']).sum() / len(val_df))

Accuracy on Validation Dataset :  0.46459926366468424


## Searching with tokenized cluster labels

In [143]:
def search_token_tokenized(word, cluster_label):
    """Report whether ``word`` is one of the space-separated label tokens.

    Unlike a plain substring test, ``word`` has to equal a whole token of
    ``cluster_label`` (split on single spaces) to count as a match.

    Args:
        word: Token to look for.
        cluster_label: Label text whose tokens are searched.

    Returns:
        True if ``word`` equals one of the label's tokens, otherwise False.
    """
    label_tokens = cluster_label.split(' ')
    return word in label_tokens

In [198]:
def search_similarity_tokenized(text, cluster_labels):
    """Pick the cluster label sharing the most whole tokens with ``text``.

    ``text`` is split on single spaces and every word is compared against
    the tokens of every label via ``search_token_tokenized``. Each hit adds
    one to that label's score; repeated words in ``text`` count repeatedly.

    Args:
        text: Product title to classify.
        cluster_labels: Indexable collection of candidate labels (a NumPy
            array of strings in this notebook).

    Returns:
        The label with the highest score. On ties ``np.argmax`` selects the
        first label in ``cluster_labels`` order.
    """
    is_label_token = np.vectorize(search_token_tokenized)
    totals = np.zeros(len(cluster_labels))
    for token in text.split(' '):
        # Boolean hit mask is added as 0/1 to the running per-label count.
        totals = totals + is_label_token(token, cluster_labels)
    best_index = np.argmax(totals)
    return cluster_labels[best_index]

In [71]:
# Token-level prediction for every validation title.
# NOTE: this writes to the same 'predicted_cluster' column that the baseline
# cell fills, so the baseline predictions are replaced once this cell runs.
val_df['predicted_cluster'] = val_df['product_title'].apply(search_similarity_tokenized, args=(cluster_labels,))

In [72]:
# Fraction of validation rows whose predicted label equals the true label.
print('Accuracy on Validation Dataset : ', (val_df['cluster_label'] == val_df['predicted_cluster']).sum() / len(val_df))

Accuracy on Validation Dataset :  0.5016992353440951


## Cosine Similarity

In [161]:
def cosine_similarity(text, cluster_label):
    """Score a title against a label with textdistance's cosine measure.

    Both strings are split on single spaces and the resulting token lists
    are handed to ``td.cosine``.

    Args:
        text: Product title.
        cluster_label: Candidate cluster label.

    Returns:
        Whatever ``td.cosine`` returns for the two token lists.
        NOTE(review): the caller takes the argmax, i.e. treats this as a
        similarity (higher = closer) -- confirm against textdistance.
    """
    title_tokens = text.split(' ')
    label_tokens = cluster_label.split(' ')
    return td.cosine(title_tokens, label_tokens)

In [162]:
def search_max_cosine(text, cluster_labels):
    """Return the cluster label with the highest cosine score for ``text``.

    Args:
        text: Product title to classify.
        cluster_labels: Indexable collection of candidate labels (a NumPy
            array of strings in this notebook).

    Returns:
        The label whose ``cosine_similarity`` score against ``text`` is
        largest. On ties ``np.argmax`` selects the first label in
        ``cluster_labels`` order.
    """
    # np.vectorize broadcasts the single title against every label. The
    # scores come straight from this call, so no zero-filled buffer is
    # needed beforehand.
    cos_fun = np.vectorize(cosine_similarity)
    match_scores = cos_fun(text, cluster_labels)
    return cluster_labels[np.argmax(match_scores)]

In [165]:
# Cosine-based prediction for every validation title.
val_df['predicted_cluster_cosine'] = val_df['product_title'].apply(search_max_cosine, args=(cluster_labels,))

In [166]:
# Fraction of validation rows whose cosine prediction equals the true label.
print('Accuracy on Validation Dataset : ', (val_df['cluster_label'] == val_df['predicted_cluster_cosine']).sum() / len(val_df))

Accuracy on Validation Dataset :  0.6367884451996602


## Dice Score

In [169]:
def dice_similarity(text, cluster_label):
    """Score a title against a label with textdistance's Sorensen-Dice measure.

    Both strings are split on single spaces and the resulting token lists
    are handed to ``td.sorensen_dice``.

    Args:
        text: Product title.
        cluster_label: Candidate cluster label.

    Returns:
        Whatever ``td.sorensen_dice`` returns for the two token lists.
        NOTE(review): the caller takes the argmax, i.e. treats this as a
        similarity (higher = closer) -- confirm against textdistance.
    """
    title_tokens = text.split(' ')
    label_tokens = cluster_label.split(' ')
    return td.sorensen_dice(title_tokens, label_tokens)

In [170]:
def search_max_dice(text, cluster_labels):
    """Return the cluster label with the highest Dice score for ``text``.

    Args:
        text: Product title to classify.
        cluster_labels: Indexable collection of candidate labels (a NumPy
            array of strings in this notebook).

    Returns:
        The label whose ``dice_similarity`` score against ``text`` is
        largest. On ties ``np.argmax`` selects the first label in
        ``cluster_labels`` order.
    """
    # np.vectorize broadcasts the single title against every label. The
    # scores come straight from this call, so no zero-filled buffer is
    # needed beforehand.
    dice_fun = np.vectorize(dice_similarity)
    match_scores = dice_fun(text, cluster_labels)
    return cluster_labels[np.argmax(match_scores)]

In [171]:
# Dice-based prediction for every validation title.
val_df['predicted_cluster_dice'] = val_df['product_title'].apply(search_max_dice, args=(cluster_labels,))

In [172]:
# Fraction of validation rows whose Dice prediction equals the true label.
print('Accuracy on Validation Dataset : ', (val_df['cluster_label'] == val_df['predicted_cluster_dice']).sum() / len(val_df))

Accuracy on Validation Dataset :  0.6359388275276125


## Tversky Index (similarity)

In [183]:
# Tversky measure from textdistance, configured once and reused by
# tversky_distance.
# NOTE(review): ks=(0.1, 0.4) look like the two Tversky weighting
# coefficients; why these values were chosen is not recorded here -- confirm.
tversky = td.Tversky(ks=(0.1, 0.4))

In [184]:
def tversky_distance(text, cluster_label):
    """Score a title against a label with the module-level ``tversky`` measure.

    Both strings are split on single spaces and the resulting token lists
    are handed to the ``tversky`` callable.

    Args:
        text: Product title.
        cluster_label: Candidate cluster label.

    Returns:
        Whatever calling ``tversky`` returns for the two token lists.
        NOTE(review): despite the function name, the caller takes the
        argmax, i.e. treats the value as a similarity (higher = closer) --
        confirm against textdistance.
    """
    title_tokens = text.split(' ')
    label_tokens = cluster_label.split(' ')
    return tversky(title_tokens, label_tokens)

In [185]:
def search_max_tversky(text, cluster_labels):
    """Return the cluster label with the highest Tversky score for ``text``.

    Args:
        text: Product title to classify.
        cluster_labels: Indexable collection of candidate labels (a NumPy
            array of strings in this notebook).

    Returns:
        The label whose ``tversky_distance`` score against ``text`` is
        largest. On ties ``np.argmax`` selects the first label in
        ``cluster_labels`` order.
    """
    # np.vectorize broadcasts the single title against every label. The
    # scores come straight from this call, so no zero-filled buffer is
    # needed beforehand.
    tversky_fun = np.vectorize(tversky_distance)
    match_scores = tversky_fun(text, cluster_labels)
    return cluster_labels[np.argmax(match_scores)]

In [186]:
# Tversky-based prediction for every validation title.
val_df['predicted_cluster_tversky'] = val_df['product_title'].apply(search_max_tversky, args=(cluster_labels,))

In [187]:
# Fraction of validation rows whose Tversky prediction equals the true label.
print('Accuracy on Validation Dataset : ', (val_df['cluster_label'] == val_df['predicted_cluster_tversky']).sum() / len(val_df))

Accuracy on Validation Dataset :  0.6346644010195412


In [193]:
import json

In [194]:
# Load the cluster-label mapping stored under model_files/.
# JSON interchange files are UTF-8, so the encoding is named explicitly
# rather than relying on the platform default (which is not UTF-8 on every
# system).
# NOTE(review): the structure of the mapping is not visible in this notebook.
with open('model_files/cluster_label_mapping.json', 'r', encoding='utf-8') as f:
    cluster_label_mapping = json.load(f)

In [213]:
# Read the aggregate CSV into a DataFrame.
# NOTE(review): presumably the raw dataset the train/val/test splits were
# derived from -- confirm against utils.helper_functions.
df = pd.read_csv('data/pricerunner_aggregate.csv')

In [234]:
# Show the product title stored at index label 35307 (a single row/column
# lookup instead of selecting the row and then the column).
print(df.loc[35307, 'Product Title'])

smeg fab28 60cm retro style left hand hinge fridge with icebox red


In [219]:
import pickle

In [222]:
# Load the serialized model stored under model_files/.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file,
# so this is only safe for artifacts produced by this project -- never point
# it at an untrusted pickle.
# NOTE(review): the file name suggests a random-forest classifier; the type
# of the loaded object is not checked here.
with open('model_files/random_forest_classification.pkl', 'rb') as f:
    model = pickle.load(f)