In [None]:
import os
import numpy as np
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
import pandas as pd

# Kaggle dataset locations (read-only inputs) and the writable working directory.
SOURCE_DIR = '/kaggle/input/shopee-product-matching'
TRAIN_IMAGE = 'train_images'
OPENCV_EAST = '/kaggle/input/opencv-east/frozen_east_text_detection.pb'
OUTPUT_DIR = '/kaggle/working'

# IMG_DIR is the first directory under the training-image folder that actually
# contains files; it stays '' when no such directory is found.
# A nested for/break only left the inner loop, so the walk kept going and
# IMG_DIR ended up as the *last* directory with files; next() stops at the first.
IMG_DIR = next(
    (dirname
     for dirname, _, filenames in os.walk(os.path.join(SOURCE_DIR, TRAIN_IMAGE))
     if filenames),
    '',
)

In [None]:
# Load the training metadata shipped with the dataset and preview the first rows.
train_csv_path = os.path.join(SOURCE_DIR, 'train.csv')
train_df = pd.read_csv(train_csv_path)
train_df.head()

In [None]:
# memory_usage='deep' makes pandas inspect object columns for their real
# memory footprint instead of reporting a shallow estimate.
train_df.info(memory_usage='deep')

**Analysis of image height and width (large images use more memory)**

In [None]:
#Lets take a sample dataframe before execution
from PIL import Image
#PIL Image.open is lazy function and does not load image into memory
def get_ht_wd(filename, file_path=IMG_DIR):
    """Return the ``(width, height)`` of an image file.

    Despite the function name, the tuple is ordered width first, then height
    (the order of PIL's ``Image.size`` and the order the caller unpacks).

    Args:
        filename: image file name, resolved against ``file_path``.
        file_path: directory holding the image; the default is the value
            ``IMG_DIR`` had when this function was defined.

    Returns:
        ``(width, height)`` as reported by PIL, or ``(None, None)`` when the
        file could not be opened (the failure is printed, not raised).
    """
    W, H = None, None
    try:
        # Image.open is lazy, but it keeps the file open; the context manager
        # releases the handle so thousands of calls do not leak descriptors.
        with Image.open(os.path.join(file_path, filename)) as img:
            W, H = img.size
    except Exception as e:
        # Deliberately best-effort: report which file failed and why, then
        # fall through so the caller gets (None, None) for this row.
        print('unable to read file {!r} in {!r}: {}'.format(filename, file_path, e))
    return W, H

# Measure every training image, then split the (width, height) tuples into
# two separate columns aligned on the original index.
train_df['dimension'] = train_df['image'].apply(get_ht_wd)
dimension_rows = train_df['dimension'].tolist()
train_df[['width', 'height']] = pd.DataFrame(dimension_rows, index=train_df.index)
train_df.head(2)

In [None]:
# Width-versus-height scatter of all training images.
fig, ax = plt.subplots(figsize=(10, 10))
ax.scatter(x='width', y='height', data=train_df)
ax.set(xlabel='width', ylabel='height')
plt.show()

**From the scatter plot above we can conclude that the images are square, i.e. height = width.**

In [None]:
# One row per label_group: image file names joined by ',' and titles joined by '|'.
join_rules = {'image': ','.join, 'title': '|'.join}
grouped_df = train_df.groupby('label_group', as_index=False).agg(join_rules)
# Number of images in a group = number of ',' separators + 1.
grouped_df['duplicate_count'] = 1 + grouped_df['image'].str.count(',')
grouped_df.head()

In [None]:
# Distribution of group sizes, i.e. how many listings share one label_group.
fig, ax = plt.subplots(figsize=(10,10))
ax.hist(grouped_df['duplicate_count'])
# Label the figure so it can be read on its own when the notebook is skimmed.
ax.set_xlabel('listings per label_group')
ax.set_ylabel('number of label_groups')
ax.set_title('Distribution of label_group sizes')
plt.show()
# Original reading of this plot: most images are repeated about 5 times in the dataset.
# NOTE(review): hist() uses equal-width bins (10 by default), so the tallest bar
# covers a range of group sizes rather than a single value - confirm the claim
# with grouped_df['duplicate_count'].value_counts().

In [None]:
pip install google_trans_new

In [None]:
#Sample Analysis
from google_trans_new import google_translator
import traceback
# Shared translator instance used to turn the Indonesian titles into English.
translator = google_translator()
def translate_to_english(txt):
    """Translate ``txt`` with the module-level ``translator`` and tidy the result.

    The translated text is split on '|' and every piece is stripped, so
    '|'-separated titles come back without padding around the separator.

    Args:
        txt: text to translate.

    Returns:
        The translated, trimmed string. When translation or trimming fails the
        traceback is printed and whatever had been obtained up to that point
        is returned ('' if the translation call itself failed).
    """
    eng_txt = ''
    try:
        eng_txt = translator.translate(txt)
        eng_txt = '|'.join(part.strip() for part in eng_txt.split('|'))  # trim spaces
    except Exception:
        # print_exc() writes the traceback itself and returns None, so wrapping
        # it in print() only added a stray "None" line.
        traceback.print_exc()
    # Returning after the try block (not from `finally`) lets KeyboardInterrupt
    # and SystemExit propagate instead of being silently swallowed.
    return eng_txt
# Work on a small sample: every listing that belongs to the first five label groups.
sample_grp_labels = grouped_df['label_group'].iloc[:5].values
in_sample = train_df['label_group'].isin(sample_grp_labels)
# reset_index() keeps the old train_df index as an 'index' column.
sample_df = train_df[in_sample].copy().reset_index()
sample_df['title_eng'] = sample_df['title'].apply(translate_to_english)
sample_df['title_eng']

In [None]:
#convert images to 320 x 320 for processing
import shutil
import cv2
# Target size (in pixels) for the resized copies.
MAX_HEIGHT = 320
MAX_WIDTH = 320
# makedirs(..., exist_ok=True) keeps this cell re-runnable: os.mkdir raised
# FileExistsError as soon as the directory was left over from an earlier run.
os.makedirs(os.path.join(OUTPUT_DIR, TRAIN_IMAGE), exist_ok=True)
def resize_img(file_name,
               file_path=os.path.join(SOURCE_DIR, TRAIN_IMAGE),
               output_path=os.path.join(OUTPUT_DIR, TRAIN_IMAGE)):
    """Write a MAX_WIDTH x MAX_HEIGHT copy of one image into ``output_path``.

    Args:
        file_name: image file name; used for both the source and the copy.
        file_path: directory the source image is read from.
        output_path: directory the resized copy is written to.

    Raises:
        OSError: if OpenCV cannot read the source image.
    """
    source = os.path.join(file_path, file_name)
    im = cv2.imread(source)
    if im is None:
        # cv2.imread reports failure by returning None instead of raising.
        raise OSError('unable to read image {}'.format(source))
    ht, w = im.shape[:2]
    # `ht < MAX_HEIGHT & w < MAX_WIDTH` parsed as the chained comparison
    # `ht < (MAX_HEIGHT & w) < MAX_WIDTH`; `and` expresses the intended test.
    # Cubic interpolation suits enlarging, area interpolation suits shrinking.
    if ht < MAX_HEIGHT and w < MAX_WIDTH:
        interpolation = cv2.INTER_CUBIC
    else:
        interpolation = cv2.INTER_AREA
    # Always resize to the fixed target; passing the scale factors as dsize
    # produced images only a few pixels wide for small inputs.
    im = cv2.resize(im, (MAX_WIDTH, MAX_HEIGHT), interpolation=interpolation)
    cv2.imwrite(os.path.join(output_path, file_name), im)

# Resize only the sample images; apply() is used here for its side effect
# (resize_img writes a file per row and returns nothing).
sample_df['image'].apply(resize_img)


In [None]:
from matplotlib.colors import rgb_to_hsv
class ImageAnalysis():
    """Show an image side by side with its HSV and grayscale versions."""

    def __init__(self, file_path):
        """Remember the directory that image file names are resolved against."""
        # Number of panels in the comparison figure (original, hsv, gray).
        self.display_cols = 3
        self.file_path = file_path

    def get_RGB_HSV_GRAY(self, img):
        """Load ``img`` and return it as ``(rgb, hsv, gray)`` arrays.

        Args:
            img: image file name inside ``self.file_path``.

        Returns:
            Tuple of the RGB image, its HSV conversion (computed from the RGB
            data scaled to [0, 1]) and the single-channel grayscale image.

        Raises:
            OSError: if OpenCV cannot read the file.
        """
        path = os.path.join(self.file_path, img)
        image = cv2.imread(path)
        if image is None:
            # cv2.imread reports failure by returning None instead of raising.
            raise OSError('unable to read image {}'.format(path))
        # OpenCV loads images as BGR while matplotlib displays RGB.
        morig = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        # Convert RGB to HSV. matplotlib's rgb_to_hsv assumes values in [0, 1],
        # so the 0-255 channel values are scaled down first.
        mhsv = rgb_to_hsv(morig / 255.0)
        # Convert BGR to grayscale.
        gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
        return morig, mhsv, gray

    def plot_color_change(self, img, txt):
        """Print ``txt`` and plot the original, HSV and grayscale panels of ``img``.

        Args:
            img: image file name inside ``self.file_path``.
            txt: text printed above the figure (the caller passes the title).
        """
        orig, hsv, gray = self.get_RGB_HSV_GRAY(img)
        # (panel title, pixel data, extra imshow keyword arguments)
        panels = (
            ('origin', orig, {}),
            ('hsv', hsv, {}),
            ('gray', gray, {'cmap': 'gray'}),
        )

        fig = plt.figure(figsize=(15, 15))
        print(txt)
        for position, (panel_title, pixels, imshow_kwargs) in enumerate(panels, 1):
            ax = fig.add_subplot(1, self.display_cols, position)
            ax.imshow(pixels, **imshow_kwargs)
            ax.axis('off')
            ax.set_title(panel_title)
        plt.show()

            
RESIZED_DIR = os.path.join(OUTPUT_DIR, TRAIN_IMAGE)

# Plot the colour-space comparison for every listing of the first sample group.
ia = ImageAnalysis(RESIZED_DIR)
mask = sample_df['label_group'] == sample_grp_labels[0]
sample_df.loc[mask].apply(
    lambda row: ia.plot_color_change(row['image'], row['title']), axis=1)


In [None]:
# Approach
# There are three steps  :
# Find similarity in title text
# Find similarity in Image
# Combine similarity of title text and image and predict

# 1) Find Similarity in title text
import nltk
from nltk.corpus import wordnet as wn
from nltk.tokenize import word_tokenize
from gensim import corpora
import gensim
import re
from gensim.parsing.preprocessing import remove_stopwords

class TextActions:
    """Stateless helpers that turn a title into a list of cleaned word tokens."""

    def __init__(self):
        pass

    @staticmethod
    def lemmatize(words):
        """Return the unique lemmas of ``words`` in first-seen order.

        Each word is replaced by ``wn.morphy(word)``; a word is kept unchanged
        when morphy returns None. Duplicates are dropped. A dict is used
        instead of a set so the result order is the same on every run.
        """
        lemmas = {}
        for word in words:
            lemma = wn.morphy(word)
            lemmas[word if lemma is None else lemma] = None
        return list(lemmas)

    @staticmethod
    def clean_sentence(sentence):
        """Lower-case ``sentence`` and keep only non-stopword words longer than 3 letters."""
        # make all text lowercase
        sentence = sentence.lower()
        # replace every run of non-letter characters with a single space
        sentence = re.sub('[^a-zA-Z]+', ' ', sentence)
        sentence = remove_stopwords(sentence)

        # removing short words (3 characters or fewer)
        sentence = ' '.join([w for w in sentence.split() if len(w)>3])
        return sentence

    @staticmethod
    def tokenize(sentence):
        """Split ``sentence`` into tokens with nltk's ``word_tokenize``."""
        return word_tokenize(sentence)

    @staticmethod
    def remove_duplicate(lst):
        """Return the unique items of ``lst``, keeping their first-seen order."""
        return list(dict.fromkeys(lst))

    @staticmethod
    def spell_checker(corpus, words):
        """Placeholder: not implemented yet, returns None."""
        pass #TODO:symspell

    @staticmethod
    def fetch_req_words(sentence):
        """Run the pipeline clean_sentence -> tokenize -> lemmatize on ``sentence``."""
        sentence = TextActions.clean_sentence(sentence)
        words = TextActions.tokenize(sentence)
        words = TextActions.lemmatize(words)
        return words

ta = TextActions()

# Tokenise every translated sample title into its cleaned, lemmatised words.
sample_df['word_tokens'] = sample_df['title_eng'].map(ta.fetch_req_words)

sample_df['word_tokens'].head()

In [None]:
import itertools
class TitleAnalysis:
    """Visualise which vocabulary tokens occur in which title."""

    def __init__(self):
        # Sorted list of unique tokens; None until create_dictionary() runs.
        self.dictionary = None

    def create_dictionary(self, tokens):
        """Store the unique tokens of ``tokens`` (a list of token lists).

        The tokens are sorted so the x axis of the plot is the same on every run.
        """
        self.dictionary = sorted(set(itertools.chain.from_iterable(tokens)))

    def plot_titlewise_words(self, tokens, titles):
        """Scatter one row of '*' markers per title over the token vocabulary.

        Args:
            tokens: list of token lists, one per title.
            titles: y-axis labels, one per entry of ``tokens``.
        """
        if self.dictionary is None:
            # Build the vocabulary on demand instead of failing on None.
            self.create_dictionary(tokens)
        fig, ax = plt.subplots(figsize=(15,10))
        for title_no, words in enumerate(tokens, 1):
            # a title without tokens simply contributes no markers
            ax.scatter(words, [title_no] * len(words), marker='*')
        ax.set_xticks(self.dictionary)
        ax.set_xticklabels(self.dictionary, rotation =70, fontsize=16)
        ax.set_xlabel('token', fontsize=16)
        # One tick per title; a fixed [1, 2, 3] only matched three titles.
        ax.set_yticks(list(range(1, len(titles) + 1)))
        ax.set_yticklabels(titles, rotation =45, fontsize=16)
        ax.set_ylabel('title', fontsize=16)
        plt.show()
        

# `ta` is rebound here: from this cell on it refers to a TitleAnalysis instance.
ta = TitleAnalysis()
# Tokens and translated titles of the sample group selected by `mask`.
group_rows = sample_df[mask]
group_tokens = group_rows['word_tokens'].to_list()
ta.create_dictionary(group_tokens)
ta.plot_titlewise_words(group_tokens, group_rows['title_eng'].to_list())

In [None]:
from gensim.models import Word2Vec
MODEL_DIR = 'model'
w2v_model = 'product_word2vec.model'
# makedirs(..., exist_ok=True) keeps this cell re-runnable: os.mkdir raised
# FileExistsError once the directory existed from an earlier run.
os.makedirs(os.path.join(OUTPUT_DIR, MODEL_DIR), exist_ok=True)

# Train a small Word2Vec model on the token lists of the sample titles.
common_texts = sample_df['word_tokens'].to_list()
# NOTE(review): `size=` is the gensim 3.x keyword; gensim 4 renamed it to
# `vector_size` - confirm against the installed version.
model = Word2Vec(sentences=common_texts, size=100,
                 window=5, min_count=1, workers=4, min_alpha=0.0007)

model.save(os.path.join(OUTPUT_DIR, MODEL_DIR, w2v_model))


In [None]:
from gensim.models import KeyedVectors
# The file was written by Word2Vec.save(), so it is loaded with the matching
# Word2Vec.load(); the rest of the notebook reads the vectors through model.wv.
model = Word2Vec.load(os.path.join(OUTPUT_DIR, MODEL_DIR, w2v_model))
'|'.join(model.wv.vocab)

In [None]:
from sklearn.decomposition import PCA
# Project the word vectors onto their first two principal components.
words = list(model.wv.vocab)
# Index through model.wv (indexing the model object itself is deprecated in
# gensim 3.x); using `words` for both lookup and labels keeps rows and
# annotations in the same order.
X = model.wv[words]
pca = PCA(n_components=2)
result = pca.fit_transform(X)
fig, ax = plt.subplots()
ax.scatter(result[:, 0], result[:, 1])
for i, word in enumerate(words):
    ax.annotate(word, xy=(result[i, 0], result[i, 1]))
ax.set_xlabel('principal component 1')
ax.set_ylabel('principal component 2')
ax.set_title('Word2Vec vocabulary projected with PCA')
plt.show()

In [None]:
#using word movers distance to calc similarity between title
#ON same products
# Pairwise Word Mover's Distance between the sample titles, drawn as a grid of
# colour-coded numbers (green: <= 0.50, blue: between 0.50 and 0.75, white: >= 0.75).
fig, ax = plt.subplots(figsize=(16, 16))

sample_df = sample_df.sort_values(by='label_group')
ticks = (sample_df['label_group'].astype(str) + ',' + sample_df['title_eng'].str.slice(0, 10) + '..').to_list()
token_lists = sample_df['word_tokens'].to_list()
# Coordinates are positions in the sorted frame. Using the index labels as
# coordinates put the numbers at the pre-sort positions, while the tick labels
# below are laid out in sorted order, so labels and values did not line up.
for x, token1 in enumerate(token_lists):
    for y, token2 in enumerate(token_lists):
        mv_distance = round(model.wv.wmdistance(token1, token2), 3)
        # A one-point line draws nothing visible; the call is kept because it
        # makes the axes limits cover (x, y) so the annotation is in view.
        ax.plot(x, y)
        if mv_distance >= 0.75:
            c = 'white' # no similarity
        elif mv_distance > 0.50:
            c = 'blue' # can be similar
        else:
            c = 'green' # is similar
        ax.annotate(str(mv_distance), (x, y), bbox=dict(facecolor=c, pad=5))
ax.set_xticks(range(len(ticks)))
ax.set_xticklabels(ticks, rotation =90, fontsize=14)
ax.set_yticks(range(len(ticks)))
ax.set_yticklabels(ticks, fontsize=14)
ax.set_title("Word Mover's Distance between titles")
plt.show()
# a smaller distance means the sentences are more similar

In [None]:
#using cosine similarity to calc similarity between title
# Pairwise cosine similarity between the sample titles, drawn as a grid of
# colour-coded numbers (green: >= 0.75, blue: between 0.50 and 0.75, white: otherwise).
fig, ax = plt.subplots(figsize=(16, 16))

sample_df = sample_df.sort_values(by='label_group')
ticks = (sample_df['label_group'].astype(str) + ',' + sample_df['title_eng'].str.slice(0, 10) + '..').to_list()
token_lists = sample_df['word_tokens'].to_list()
# Coordinates are positions in the sorted frame so that every number sits on
# the tick labels of its own pair of titles (index labels are not in sorted order).
for x, token1 in enumerate(token_lists):
    for y, token2 in enumerate(token_lists):
        cosine_similarity = round(model.wv.n_similarity(token1, token2), 3)
        # A one-point line draws nothing visible; the call is kept because it
        # makes the axes limits cover (x, y) so the annotation is in view.
        ax.plot(x, y)
        # Both bounds test cosine_similarity; the lower bound used to read
        # `mv_distance`, a leftover value from the Word Mover's Distance cell.
        if cosine_similarity >= 0.75:
            c = 'green'
        elif cosine_similarity > 0.50:
            c = 'blue'
        else:
            c = 'white'
        ax.annotate(str(cosine_similarity), (x, y), bbox=dict(facecolor=c, pad=5))
ax.set_xticks(range(len(ticks)))
ax.set_xticklabels(ticks, rotation =90, fontsize=14)
ax.set_yticks(range(len(ticks)))
ax.set_yticklabels(ticks, fontsize=14)
ax.set_title('Cosine similarity between titles')
plt.show()

In [None]:
#
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
d2v_model = 'product_doc2vec.model'
# The model is written into the model directory created for the Word2Vec model.
d2v_path = os.path.join(OUTPUT_DIR, MODEL_DIR, d2v_model)

# Tag every token list with its position in sample_df.
common_texts = sample_df['word_tokens'].to_list()
documents = [TaggedDocument(tokens, [tag]) for tag, tokens in enumerate(common_texts)]
model = Doc2Vec(documents=documents, vector_size=100,
                window=5, min_count=1, workers=4, min_alpha=0.0007)

# Save the model, then rebind `model` to the copy loaded back from disk.
model.save(d2v_path)
model = Doc2Vec.load(d2v_path)

In [None]:
# Tokens of the sample row whose index label is 0.
test = sample_df.loc[0,'word_tokens']

# Infer a vector for those tokens and look up the stored document vectors
# most similar to it.
new_vector = model.infer_vector(test)
similar = model.docvecs.most_similar([new_vector])
print(similar)



**Conclusion: WMD and cosine similarity on word2vec or doc2vec vectors do not work here.**

In [None]:
#using Levenshtein distance
#Levenshtein distance between two words is the minimum number of single-character edits
#(insertions, deletions or substitutions) required to change one word into the other
from fuzzywuzzy import fuzz
        
def compare_title(title1, title2):
    """Score two titles with three fuzzywuzzy ratios.

    Returns:
        ``(ratio, partial_ratio, token_set_ratio)`` for the pair, in that order.
    """
    scorers = (fuzz.ratio, fuzz.partial_ratio, fuzz.token_set_ratio)
    return tuple(score(title1, title2) for score in scorers)

import matplotlib.colors as mcolors
# Score bands and their colours (not used by the loop below, which applies
# the same thresholds directly).
bounds=[0, 50, 75, 101]
cols = ['white', 'blue', 'green']
cmap, norm = mcolors.from_levels_and_colors(bounds, cols)

fig, ax = plt.subplots(figsize=(16, 16))

sample_df = sample_df.sort_values(by='label_group')
ticks = (sample_df['label_group'].astype(str) + ',' + sample_df['title_eng'].str.slice(0, 10) + '..').to_list()
titles = sample_df['title_eng'].to_list()

# title_match[i][j] holds the partial_ratio score of sorted title i against title j.
title_match = []
# Coordinates are positions in the sorted frame so that every number sits on
# the tick labels of its own pair of titles (index labels are not in sorted order).
for x, title1 in enumerate(titles):
    match = []
    for y, title2 in enumerate(titles):
        c1, c2, c3 = compare_title(title1, title2)
        match.append(c2)
        # Higher score = more similar: green >= 75, blue between 50 and 75.
        if c2 >= 75:
            c = 'green'
        elif c2 > 50:
            c = 'blue'
        else:
            c = 'white'
        # A one-point line draws nothing visible; the call is kept because it
        # makes the axes limits cover (x, y) so the annotation is in view.
        ax.plot(x, y)
        ax.annotate(str(c2), (x, y), bbox=dict(facecolor=c, pad=5))
    title_match.append(match)

ax.set_xticks(range(len(ticks)))
ax.set_xticklabels(ticks, rotation =90, fontsize=14)
ax.set_yticks(range(len(ticks)))
ax.set_yticklabels(ticks, fontsize=14)
# The numbers are partial_ratio similarity scores, not raw edit distances.
ax.set_title('Partial-ratio similarity between titles (fuzzywuzzy)')
plt.show()

In [None]:
#Lets topic model to see if we can segregate titles into groups
# Build a gensim Dictionary from the token lists of the sample titles.
dictionary = corpora.Dictionary(sample_df.loc[:,'word_tokens'])
dictionary

In [None]:
# Bag-of-words corpus: one doc2bow() result per sample title.
corpus = [dictionary.doc2bow(text) for text in sample_df.loc[:,'word_tokens']]
NUM_TOPICS = 10
# Fixed seed so that topic numbering and contents are the same on every run;
# without it, any note that refers to a topic by number goes stale on re-run.
LDA_RANDOM_STATE = 42
ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics = NUM_TOPICS, id2word=dictionary,
                                           passes=15, random_state=LDA_RANDOM_STATE)
topics = ldamodel.print_topics(num_words=4)
for topic in topics:
    print(topic)

In [None]:
import pyLDAvis.gensim
# Raise pandas' column-width display limit to 5000 characters.
pd.options.display.max_colwidth = 5000
# Prepare the pyLDAvis view of the LDA model; mds='tsne' selects t-SNE for the
# topic layout.
lda_display = pyLDAvis.gensim.prepare(ldamodel, corpus, dictionary,  mds='tsne')
pyLDAvis.enable_notebook()
pyLDAvis.display(lda_display)

In [None]:
# Check which topic the LDA model assigns to one sample title.
SAMPLE_ROW_LABEL = 17  # index label of the title to inspect
text = sample_df.loc[SAMPLE_ROW_LABEL, 'word_tokens']
print(text)
new_doc_bow = [dictionary.doc2bow(text)]
doc_topics = ldamodel.get_document_topics(new_doc_bow[0])
print(doc_topics)
# Show the most probable topic of this document. It is looked up from the
# (topic_id, probability) pairs instead of hard-coding topic 4, which only
# matched one particular training run.
best_topic = max(doc_topics, key=lambda pair: pair[1], default=None)
(best_topic[0], dict(topics)[best_topic[0]]) if best_topic is not None else None