In [None]:
import pandas as pd
import numpy as np
import re
from gensim.models import Word2Vec
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.decomposition import TruncatedSVD
from sklearn.manifold import TSNE
import random
from collections import Counter
import plotly.express as px
from sklearn.neighbors import NearestNeighbors
from nltk.stem import WordNetLemmatizer
wordnet_lemmatizer = WordNetLemmatizer()
from sklearn.neighbors import NearestNeighbors
import seaborn as sns 
from collections import defaultdict
from wordcloud import WordCloud, STOPWORDS
stopwords = set(STOPWORDS)
import cv2
import os

In [None]:
# Load the competition's train and test metadata tables in one pass.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../input/shopee-product-matching/train.csv",
        "../input/shopee-product-matching/test.csv",
    )
)

In [None]:
def len_seq(data):
    """Return the number of whitespace-separated tokens in the string ``data``."""
    tokens = data.split()
    return len(tokens)
# Token count per title, stored alongside the titles.
train["number_tokens"] = train["title"].apply(len_seq)

# Interactive histogram of the per-title token counts.
fig = px.histogram(
    train,
    x="number_tokens",
    title='Number tokens distribution',
    width=800,
    height=500,
)
fig.show()

The token-count distribution above helps choose the Word2Vec hyperparameters `window` and `negative`.

In [None]:
# Matches maximal runs of word characters (letters, digits, underscore).
TOKEN_RE = re.compile(r'[\w]+')


def tokenize_text_simple_regex(txt, min_token_size=2):
    """Lower-case ``txt``, split it into word tokens and lemmatize each one.

    ``txt`` is converted with ``str()`` first, so non-string values are
    tokenized by their string form. Tokens shorter than ``min_token_size``
    characters are dropped; the rest are passed through
    ``wordnet_lemmatizer.lemmatize(..., pos="v")``.

    Returns the list of lemmatized tokens in their original order.
    """
    lowered = str(txt).lower()
    lemmas = []
    for token in TOKEN_RE.findall(lowered):
        if len(token) < min_token_size:
            continue
        lemmas.append(wordnet_lemmatizer.lemmatize(token, pos="v"))
    return lemmas

In [None]:
def tokenize_corpus(texts, tokenizer=tokenize_text_simple_regex, **tokenizer_kwargs):
    """Apply ``tokenizer`` to every element of ``texts``.

    Extra keyword arguments are forwarded unchanged to each ``tokenizer``
    call. Returns a list with one tokenizer result per input text, in order.
    """
    tokenized = []
    for text in texts:
        tokenized.append(tokenizer(text, **tokenizer_kwargs))
    return tokenized

In [None]:
# Wrap every title with its image perceptual hash on both sides so the hash
# becomes a token of the same "sentence" as the title words.
# NOTE: this overwrites `title` in place, so running the cell twice wraps twice.
for frame in (train, test):
    frame['title'] = frame['image_phash'] + ' ' + frame['title'] + ' ' + frame['image_phash']
del frame  # do not leave an extra reference to `test` behind

In [None]:
# Tokenize the train titles followed by the test titles into one corpus.
corpus = tokenize_corpus(
    pd.concat([train['title'], test['title']]).tolist()
)

In [None]:
# Word2Vec hyperparameters, gathered in one place so they are easy to tune.
# NOTE(review): gensim documents that a fixed `seed` only gives fully
# reproducible vectors with a single worker — confirm whether that matters here.
w2v_params = dict(
    vector_size=100,
    window=15,
    min_count=1,
    sg=1,  # skip-gram
    negative=7,
    epochs=1000,
    seed=42,
    workers=6,
)
model = Word2Vec(sentences=corpus, **w2v_params)

In [None]:
def plot_vectors(vectors, labels, how='tsne', ax=None):
    """Scatter a 2-D projection of ``vectors`` and annotate each point.

    Parameters
    ----------
    vectors : array-like
        Row vectors to project; passed directly to ``fit_transform``.
    labels : iterable
        Labels paired with the projected points in order (pairing stops at
        the shorter of the two).
    how : {'tsne', 'svd'}
        Projection to use: ``TSNE()`` or ``TruncatedSVD()`` with their
        default settings.
    ax : matplotlib Axes, optional
        Axes to draw on. Defaults to the current axes (``plt.gca()``).

    Raises
    ------
    ValueError
        If ``how`` is not one of the supported projections.
    """
    # Validate first so an unknown method fails with a clear message instead
    # of an unbound ``projections`` variable further down.
    if how not in ('tsne', 'svd'):
        raise ValueError(f"how must be 'tsne' or 'svd', got {how!r}")
    if ax is None:
        ax = plt.gca()

    if how == 'tsne':
        projections = TSNE().fit_transform(vectors)
    else:
        projections = TruncatedSVD().fit_transform(vectors)

    # Only the first two projected components are plotted.
    x = projections[:, 0]
    y = projections[:, 1]
    ax.scatter(x, y)
    for cur_x, cur_y, cur_label in zip(x, y, labels):
        ax.annotate(cur_label, (cur_x, cur_y))

In [None]:
def n_grams(ngram, data):
    """Count word n-grams over an iterable of texts.

    Each text is lower-cased and split on whitespace, tokens found in the
    module-level ``stopwords`` set are dropped, and every run of ``ngram``
    consecutive remaining tokens is counted as one space-joined n-gram.

    Parameters
    ----------
    ngram : int
        Number of tokens per n-gram.
    data : iterable of str
        Texts to scan.

    Returns
    -------
    pandas.DataFrame
        Columns ``word`` (the n-gram) and ``wordcount``, most frequent first.
        When no n-gram is found the frame is empty but still has both columns.
    """
    counts = Counter()
    for text in data:
        # str.split() never yields blank tokens, so only stop words need filtering.
        tokens = [w for w in text.lower().split() if w not in stopwords]
        shifted = [tokens[i:] for i in range(ngram)]
        counts.update(" ".join(gram) for gram in zip(*shifted))
    # Stable ascending sort followed by a reversal: among equal counts the
    # most recently first-seen n-gram comes first (same order as before).
    ranked = sorted(counts.items(), key=lambda item: item[1])[::-1]
    # Passing the column names here keeps the empty case from raising.
    return pd.DataFrame(ranked, columns=["word", "wordcount"])

In [None]:
# Title trigrams: show both ends of the frequency-sorted table.
df_3_grams = n_grams(ngram=3, data=train["title"])
print(df_3_grams.head(n=20))
print(df_3_grams.tail(n=5))

In [None]:
# Probe words whose embeddings are inspected visually.
test_words = ['jam','tangan','wanita','xiaomi','redmi','note','somebymi','yuja','niacin','mm','3m']
# Keep only the probe words present in the trained vocabulary ...
gensim_words = [word for word in test_words if word in model.wv.key_to_index]
# ... and stack their vectors row-wise, in the same order as `gensim_words`.
gensim_vectors = np.stack([model.wv[word] for word in gensim_words])

In [None]:
fig, ax = plt.subplots(figsize=(10, 10))
# Label with `gensim_words`, the list the vectors were built from. `test_words`
# may contain out-of-vocabulary entries, which would shift every later label.
plot_vectors(gensim_vectors, gensim_words, how='svd', ax=ax)

In [None]:
# Hash value used as the running example: it is compared against
# train['image_phash'] and looked up in the Word2Vec vocabulary below.
example_hash = 'a6f319f924ad708c'

In [None]:
def draw_img_hash(hash):
    """Show the first training image whose ``image_phash`` equals ``hash``.

    Parameters
    ----------
    hash : str
        Value to look up in ``train['image_phash']``.

    Raises
    ------
    ValueError
        If no row of ``train`` has this hash.
    FileNotFoundError
        If the image file cannot be read.
    """
    # Filter on the ``hash`` argument (not the notebook-level ``example_hash``)
    # so that each call draws the image belonging to the hash it was given.
    matching_images = train.loc[train['image_phash'] == hash, 'image'].tolist()
    if not matching_images:
        raise ValueError(f"no training image with image_phash == {hash!r}")
    image_name = matching_images[0]

    image_path = os.path.join('../input/shopee-product-matching/train_images/', image_name)
    image = cv2.imread(image_path)
    if image is None:  # cv2.imread signals an unreadable file by returning None
        raise FileNotFoundError(image_path)
    # Convert from OpenCV's BGR channel order to RGB before handing to matplotlib.
    image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

    plt.figure(figsize=(5, 5))
    plt.title(image_name, fontsize=12)
    plt.axis("off")
    plt.imshow(image)
    plt.show()

In [None]:
# Call the image helper with the example hash defined above.
draw_img_hash(example_hash)

In [None]:
# Tokens closest to the example hash in the embedding space
# (topn=10 is gensim's default, spelled out here).
nn = model.wv.most_similar(example_hash, topn=10)
nn

In [None]:
# Call the image helper with another hash — presumably one of the neighbours
# listed by most_similar above; verify against that cell's output.
draw_img_hash('e69999663199cc93')

In [None]:
# Call the image helper with a further hash — presumably also taken from the
# most_similar output above; verify against that cell's output.
draw_img_hash('bce5c11a96393cc6')

W2V is working :)

In [None]:
import gc

# Drop the exploration-time objects and force a collection to release memory
# before the prediction stage; the trained `model` is kept.
del train, test, corpus
gc.collect()

In [None]:
# Read test.csv once. With 3 or fewer rows, fall back to train.csv instead
# (presumably the visible test.csv is only a tiny placeholder — confirm).
df = pd.read_csv("../input/shopee-product-matching/test.csv")
if len(df) <= 3:
    df = pd.read_csv("../input/shopee-product-matching/train.csv")

In [None]:
# One embedding per row of `df`, looked up by the row's image perceptual hash.
# Hashes missing from the vocabulary get an all-zero vector. Its length comes
# from the model, so it stays correct if `vector_size` is changed.
zero_vector = [0.0] * model.wv.vector_size
embeds = [
    model.wv[phash].tolist() if phash in model.wv.key_to_index else list(zero_vector)
    for phash in df['image_phash'].tolist()
]

In [None]:
# kneighbors() raises if more neighbours are requested than there are fitted
# samples, so cap the count for small inputs.
n_neighbors = min(50, len(embeds))
neighbors_model = NearestNeighbors(n_neighbors=n_neighbors, metric='cosine').fit(embeds)
# Row k holds the cosine distances / positional indices of row k's neighbours.
text_distances, text_indices = neighbors_model.kneighbors(embeds)

In [None]:
# For every row, join the posting_ids of all neighbours closer than the
# cosine-distance threshold into one space-separated string.
max_cosine_distance = 0.17
all_posting_ids = df['posting_id'].to_numpy()

predictions = []
for k in range(df.shape[0]):
    is_close = text_distances[k] < max_cosine_distance
    matched = list(all_posting_ids[text_indices[k, is_close]])
    # A posting always matches itself. It can be absent from its own list when
    # its vector is all zeros, or when more identical vectors exist than the
    # number of neighbours requested.
    if all_posting_ids[k] not in matched:
        matched.insert(0, all_posting_ids[k])
    predictions.append(' '.join(matched))

In [None]:
# Attach the match strings and write the two submission columns, without the index.
df['matches'] = predictions
df.to_csv('submission.csv', columns=['posting_id', 'matches'], index=False)