# **This notebook measures the similarity between two product titles using word2vec embeddings and cosine similarity. The resulting feature can be used together with the product image to find similar products.**

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
import numpy as np
np.random.seed(2018)
from gensim.models import Word2Vec
import nltk
nltk.download('wordnet')
stemmer = SnowballStemmer('english')

from numpy import dot
from numpy.linalg import norm
# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Root folder of the competition data. It is defined before the reads so that
# every path in the notebook derives from this single constant.
DATA_PATH = '../input/shopee-product-matching/'

train_df = pd.read_csv(DATA_PATH + 'train.csv')
test_df = pd.read_csv(DATA_PATH + 'test.csv')

# Add a target column to the dataframe

In [None]:
# Prefix each image file name with the train-image folder. The guard makes the
# cell idempotent: rows that already carry the prefix (because the cell was run
# before) are left untouched instead of being prefixed a second time.
image_dir = DATA_PATH + 'train_images/'
has_prefix = train_df['image'].str.startswith(image_dir, na=False)
train_df['image'] = train_df['image'].where(has_prefix, image_dir + train_df['image'])

# For every row, the target is the array of all posting_ids that share the
# row's label_group (the row's own posting_id included).
tmp = train_df.groupby('label_group').posting_id.agg('unique').to_dict()
train_df['target'] = train_df.label_group.map(tmp)

# Show only the first rows rather than the whole frame.
train_df.head()

In [None]:
# Look at one raw title (row label 5) before any cleaning is applied.
train_df.loc[5, 'title']


# Data Cleaning

In [None]:

def lemmatize_stemming(text):
    """Reduce a single token to its stem.

    The token is first lemmatized as a verb with WordNet, and the resulting
    lemma is then passed through the notebook-level Snowball ``stemmer``.

    Parameters
    ----------
    text : str
        One token.

    Returns
    -------
    str
        The stemmed lemma.
    """
    lemma = WordNetLemmatizer().lemmatize(text, pos='v')
    return stemmer.stem(lemma)

def preprocess(text):
    """Turn a raw title into a list of cleaned, stemmed tokens.

    The text is tokenised with gensim's ``simple_preprocess``. A token is kept
    only if it is not a gensim stopword, is longer than three characters, and
    is not the literal placeholder ``'xxxx'``. Every kept token is passed
    through ``lemmatize_stemming``.

    Parameters
    ----------
    text : str
        Raw title.

    Returns
    -------
    list of str
        Cleaned tokens, in their original order.
    """
    stopwords = gensim.parsing.preprocessing.STOPWORDS
    return [
        lemmatize_stemming(tok)
        for tok in gensim.utils.simple_preprocess(text)
        if tok not in stopwords and len(tok) > 3 and tok != 'xxxx'
    ]

In [None]:
# Clean every training title and collect the token lists in a plain Python list.
processed_docs = train_df['title'].map(preprocess).tolist()


In [None]:
processed_docs[:10] # preview the token lists of the first ten cleaned titles

# Word2vec model

I chose an embedding dimension of 50, which means that each word is represented by a vector of 50 numbers.

In [None]:
def word2vec_model(docs=None, vector_size=50, epochs=300):
    """Build and train a Word2Vec model on tokenised titles.

    Calling the function without arguments behaves exactly as before: it trains
    50-dimensional vectors for 300 epochs on the notebook-level
    ``processed_docs``.

    Parameters
    ----------
    docs : list of list of str, optional
        Tokenised documents. They are iterated more than once (vocabulary
        building, then training), so pass a re-iterable container rather than
        a one-shot generator. Defaults to the notebook-level ``processed_docs``.
    vector_size : int, default 50
        Dimensionality of the word vectors.
    epochs : int, default 300
        Number of training passes over ``docs``.

    Returns
    -------
    gensim.models.Word2Vec
        The trained model.

    Notes
    -----
    ``min_count=1`` keeps every token seen in ``docs`` in the vocabulary.
    Per the gensim documentation, training randomness is controlled by
    Word2Vec's own ``seed`` argument (left at its default here), not by
    ``np.random.seed``.
    """
    if docs is None:
        docs = processed_docs

    w2v_model = Word2Vec(min_count=1,
                         window=3,
                         vector_size=vector_size,
                         sample=6e-5,
                         alpha=0.03,
                         min_alpha=0.0007,
                         negative=20)

    w2v_model.build_vocab(docs)
    # total_examples comes from the corpus_count recorded by build_vocab.
    w2v_model.train(docs, total_examples=w2v_model.corpus_count, epochs=epochs, report_delay=1)

    return w2v_model

In [None]:
# Train the model on the cleaned titles, then write it to a file named
# 'word2vec_model' (relative path, so it lands in the current working directory).
w2v_model = word2vec_model()
w2v_model.save('word2vec_model')

# Getting embedding vector

In [None]:
# Take the model's word-vector store (.wv); it is what the lookups below index into.
emb_vec = w2v_model.wv

In [None]:
emb_vec['anak'] # Returns the embedding vector stored for the token 'anak'

# Finding similarity between two vectors using cosine similarity

In [None]:

def _sentence_vector(tokens, model, dim):
    """Sum the embeddings of the tokens that `model` contains; unknown tokens are skipped."""
    total = np.zeros(dim)
    for token in tokens:
        # Skipping out-of-vocabulary tokens avoids a KeyError on unseen words.
        if token in model:
            total = np.add(total, model[token])
    return total


def find_similarity(sen1, sen2, model):
    """Cosine similarity between two sentences.

    Each sentence is cleaned with ``preprocess`` and represented by the sum of
    the embeddings of its tokens. Tokens that are missing from ``model`` are
    ignored.

    Parameters
    ----------
    sen1, sen2 : str
        Raw sentences (for example product titles).
    model : mapping from token to vector
        Word-vector lookup supporting ``token in model`` and ``model[token]``,
        such as the ``.wv`` attribute of a trained Word2Vec model. The vector
        dimension is read from ``model.vector_size`` when that attribute
        exists, and falls back to 50 (the previously hard-coded size)
        otherwise.

    Returns
    -------
    float
        Cosine similarity of the two summed vectors, or ``0.0`` when either
        sentence has no token known to ``model`` (its summed vector has zero
        norm, so the cosine is undefined).
    """
    dim = getattr(model, 'vector_size', 50)

    sen_vec1 = _sentence_vector(preprocess(sen1), model, dim)
    sen_vec2 = _sentence_vector(preprocess(sen2), model, dim)

    denom = norm(sen_vec1) * norm(sen_vec2)
    if denom == 0:
        # Avoid a 0/0 division that would return NaN.
        return 0.0

    return dot(sen_vec1, sen_vec2) / denom
    

In [None]:
# Compare a longer title with a shorter one that repeats its leading words.
find_similarity('Bubble Wrap ( Hanya tambahan packing)', 'Bubble wrap',emb_vec )

In [None]:
# Compare two titles whose only shared word is 'wanita'.
find_similarity('Atasan Rajut Wanita LISDIA SWEATER', 'CELANA WANITA  (BB 45-84 KG)Harem wanita (bisa cod)',emb_vec )

In [None]:
def combine_for_sub(row):
    """Merge a row's three prediction lists into one submission string.

    Parameters
    ----------
    row : object with ``preds``, ``preds2`` and ``preds3`` attributes
        Each attribute is a sequence of string ids.

    Returns
    -------
    str
        The sorted, de-duplicated ids joined by single spaces.
    """
    parts = [getattr(row, column) for column in ('preds', 'preds2', 'preds3')]
    unique_ids = np.unique(np.concatenate(parts))
    return ' '.join(unique_ids)

def combine_for_cv(row):
    """Merge a row's three prediction lists into one array of unique ids.

    Parameters
    ----------
    row : object with ``preds``, ``preds2`` and ``preds3`` attributes
        Each attribute is a sequence of ids.

    Returns
    -------
    numpy.ndarray
        The sorted, de-duplicated union of the three sequences.
    """
    return np.unique(np.concatenate((row.preds, row.preds2, row.preds3)))
# NOTE(review): COMPUTE_CV, test and getMetric are not defined in any cell shown
# in this notebook (the data above is loaded as train_df / test_df), and no
# visible cell creates the preds / preds2 / preds3 columns that the combine_*
# helpers read. Unless they come from code outside this view, this cell raises
# NameError on a fresh kernel — confirm before relying on it.
if COMPUTE_CV:
    # Ground truth: all posting_ids that share the row's label_group.
    tmp = test.groupby('label_group').posting_id.agg('unique').to_dict()
    test['target'] = test.label_group.map(tmp)
    # Combined predictions per row, kept as an array for scoring.
    test['oof'] = test.apply(combine_for_cv,axis=1)
    # getMetric('oof') is applied row-wise; presumably it returns a per-row F1
    # scorer comparing 'oof' with 'target' — TODO confirm against its definition.
    test['f1'] = test.apply(getMetric('oof'),axis=1)
    print('Accuracy or CV Score =', test.f1.mean() )

# Space-separated string of combined predictions per row.
test['matches'] = test.apply(combine_for_sub,axis=1)

Accuracy or CV Score = 0.9248077230326005


# Feel free to use this notebook, and please upvote if you like the work.
# Thank You