# Import libraries

In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm_notebook as tqdm
from emoji import UNICODE_EMOJI
import jieba.posseg as pseg
import jieba
import jieba.analyse
import os
import io

# Load dataset

In [2]:
train_en = pd.read_csv('train_en.csv')
test = pd.read_csv('test.csv')
train_tcn = pd.read_csv('train_tcn.csv')

# Jieba on chinese

Do word embeddings of chinese and english on different categories to map words/translations. 27 for english, 23 for chinese. 7 exactly the same.

In [29]:
a = train_tcn.product_title.apply(lambda x: str(x)).apply(jieba.lcut_for_search)

In [23]:
a

0    [Gucci,  , Gucci,  , Guilty,  , Pour,  , Femme...
1    [（, 二手, ）, PS4,  , GTA,  , 5,  , 俠盜, 獵車, 手, 5,...
2                                              [百獸, 卡]
3                           [nac,  , nac, 活氧全, 效柔, 衣素]
4    [#, Nike, 耐吉, 官方, F, ., C, .,  , 男子, 足球, 長, 褲,...
5    [火影, 忍者, 火影忍者, 六道, 鸣人, cos, 睡衣, 卡卡, 西宇智, 波佐助, ...
6      [第二, 二代, 第二代, 新, 巴克, 史萊姆,  , 210ml,  , 限量, 限量版]
7    [aaL, 皮商旋, ., 全新, 鐵鍋, 上, 五花, 五花肉, 造型, 立體, 冰箱, ...
8    [現貨, 24, 小時, 快速, 出貨, chanel, 香奈兒, 58601, 女包, 經...
9    [【, 拇指, 小拇指, 鞋坊, 】, Adidas,  , Superstar,  , 8...
Name: product_title, dtype: object

In [31]:
#Set of all words present in training
zh_training=set([element for list_ in a.values for element in list_])

In [78]:
#All chinese words present in training and fasttext
temp_zh=df_zh[df_zh.index.isin(zh_training)].index.to_list()

In [80]:
len(temp_zh)

155089

In [79]:
zh_training - set(temp_zh)

{'饃紙',
 '899475',
 '李明泰',
 'INFLUENCER',
 '旋風杯',
 '風格直',
 '夏沙灘',
 '手竿架桿',
 '卡盤',
 '小藍瓶',
 '78GP01',
 '裝書車',
 '興櫃坊',
 '農家樂壁',
 'GAMAKATSU',
 '僅男款',
 '白織',
 '裝沙發',
 '330Ci',
 'ZERIA',
 '潮隱形',
 '艾妮雅',
 '管麥克風',
 '8880192',
 '離心管',
 '沉水馬達',
 '環微',
 '裙速',
 'XHello',
 '兒童理',
 '襪小白襪',
 '朱慧君',
 '挡风玻璃',
 '停產四神',
 '離凝露',
 'TGUCCI',
 '潮媽秋款',
 'Sunnylife',
 '雀點',
 'Dscn8207',
 'X550',
 'GROZ',
 '包高爾夫托',
 'Q5MC1',
 '鐵牛頭',
 '852457',
 '喬盾',
 '全四冊',
 '種針頭',
 '純銀鎖',
 '熱銷貓',
 '眼筆液',
 '劉海碎',
 '宅魚',
 '亮麗桃',
 'WZ5718BY',
 '褲丹寧',
 '素原',
 '8000dpi',
 '車羅伊',
 '西西小鋪',
 '畫鋪',
 'stin',
 '萌洛麗塔',
 '06510',
 '縮片',
 '爐餐',
 '假劉',
 'WX836',
 '陰陽鞋',
 'lnvicta',
 '熱棒燒',
 '867034495',
 'F838',
 '籠貓',
 'ท',
 '恒溫電',
 '套靜',
 'GYQ41Y',
 '清錫渣',
 '舞重',
 'BWSr',
 'GPK01',
 '旅彩條',
 '芭樂雅',
 '商務均',
 '胡芷美肌',
 '贈貓',
 'gucci3D',
 '攜式學',
 '圍後備',
 '棉質圓',
 '鼓剎',
 '帶腳燒',
 '貓專賣',
 '風風口',
 'ysk',
 '套耳',
 '鋼木多層',
 '優惠壓',
 'IMMID',
 'Attack002',
 '視雪紡',
 'BH1001',
 '870022603',
 '離帽',
 'iclub',
 '風雙色',
 '環保口',
 'Dermaspa',
 'B25568',
 '裝拉線',

In [33]:
len(zh_training)

483133

In [45]:
483133-155089

328044

# NLTK on English

Lowercase title, remove emojis, separate words with slash, keep all numbers!

In [55]:
import nltk
from nltk.tokenize import word_tokenize
import re
import string
from itertools import chain

In [248]:
# Pre processing
def preprocess(sample_text):
    sample_text = sample_text.lower() #All lowercase
    sample_text = ('').join([i if i.isalnum() else ' ' for i in sample_text]) # Remove weird characters
    sample_text = sample_text.strip() #Strip
    sample_text = re.sub(' +', ' ', sample_text) #Trim whitespace
    sample_text = (('').join([x for x in sample_text if x not in UNICODE_EMOJI])) # Remove emoji stuck to word
    sample_text = word_tokenize(sample_text) #Word tokenizer
    sample_text = [j for k in [i.split('/') for i in sample_text] for j in k] #Split those joined with /
    sample_text = [x for x in sample_text if x not in UNICODE_EMOJI] #Remove emojis
    sample_text = [''.join(c for c in s if c not in string.punctuation) for s in sample_text] # Remove punctuations
    sample_text = [i for i in sample_text if i!= ''] #Trim extra spaces
#     sample_text = sample_text.translate(str.maketrans("","", string.punctuation))
    return sample_text

In [250]:
sample_text = train_en.iloc[7].product_title
print(sample_text)
a = preprocess(sample_text)
print(a)

【COD】Chanel Black/White Sneaker Shoes For Women
['cod', 'chanel', 'black', 'white', 'sneaker', 'shoes', 'for', 'women']


In [251]:
# Cross check dataframe
en_check = train_en.copy()
en_check['tokenize'] = train_en.product_title.apply(lambda x: preprocess(x))
en_check

Unnamed: 0,product_title,category,tokenize
0,Recollections Color Splash Clear Stamps & Stencil,Hobbies & Stationery,"[recollections, color, splash, clear, stamps, ..."
1,"soap,lotion scrub set 400",Health & Personal Care,"[soap, lotion, scrub, set, 400]"
2,Spigen Galaxy S10e Case Tough Armor Gunmetal,Mobile Accessories,"[spigen, galaxy, s10e, case, tough, armor, gun..."
3,Acrylic Lanalon Bright Red,Hobbies & Stationery,"[acrylic, lanalon, bright, red]"
4,303 FLAT SHEET/Blanket 100% cotton,Home & Living,"[303, flat, sheet, blanket, 100, cotton]"
5,Korean Set,Women's Apparel,"[korean, set]"
6,High-grade keychain,Home & Living,"[high, grade, keychain]"
7,【COD】Chanel Black/White Sneaker Shoes For Women,Women Shoes,"[cod, chanel, black, white, sneaker, shoes, fo..."
8,Cat eyeglasses,Women's Apparel,"[cat, eyeglasses]"
9,Baby shoes by Stride Rite (BRAND NEW) (3-6 mon...,Babies & Kids,"[baby, shoes, by, stride, rite, brand, new, 3,..."


In [252]:
# Create bag of words from the english training set
bow_en = set(np.hstack(train_en.product_title.apply(lambda x: preprocess(x))))

In [253]:
len(bow_en)

139244

In [254]:
#All chinese words present in training and fasttext
temp_en=df_en[df_en.index.isin(bow_en)].index.to_list()

In [255]:
len(temp_en)

56853

In [289]:
df_en[df_en.index.isin(bow_en)].to_csv('en_embeddings.csv')

In [265]:
en_check[en_check.tokenize.apply(lambda x: 'infiniwhite' in x)]

Unnamed: 0,product_title,category,tokenize
148883,Infiniwhite BEAUTY CREAM (Whipp Cream) 10g,Health & Personal Care,"[infiniwhite, beauty, cream, whipp, cream, 10g]"
271599,Whitening Korean BEAUTY WHIP SET by Infiniwhite,Health & Personal Care,"[whitening, korean, beauty, whip, set, by, inf..."


In [259]:
test = "𝐒𝐄𝐓 𝐂𝐎𝐋𝐋𝐄𝐂𝐓𝐈𝐎𝐍 - Set of 4 Pairs"
test.lower()

'𝐒𝐄𝐓 𝐂𝐎𝐋𝐋𝐄𝐂𝐓𝐈𝐎𝐍 - set of 4 pairs'

In [257]:
# Items in bag of words english that aren't present in en word embed
set(bow_en) - set(temp_en)

{'prijoles',
 'kalea',
 'olavie',
 'cheekers',
 'covel',
 'limecrimes',
 'mk5865',
 'jomalone',
 'heki',
 'longsleves',
 'l252',
 'ginto',
 'av19',
 'w223',
 'pd38',
 'essrntial',
 'bq1525',
 'anastasie',
 'buho',
 '20122194',
 'frshener',
 'lcste',
 'el12',
 '1381g',
 'chynna',
 'zofia',
 'e4020',
 'trinka',
 'necklacewith',
 'michealkors',
 '5moulds',
 'benguet',
 '380629850',
 'sisleyn2',
 'beatter',
 'e5500',
 'topnet',
 'nicholeleebag',
 'k56v',
 '6139237216',
 'elegantrose',
 'wp55',
 'retaso',
 'mr03',
 'jaykensport',
 's2888',
 'rtw37',
 'psre453',
 'db6x',
 'lihgts',
 'jcot',
 'asdasdwqe123',
 '41a1h',
 '196627061',
 'akritax',
 'adolphii',
 'junradvillafrancedignadice',
 'cirtain',
 'snifflease',
 'umaku',
 '1763167625',
 'hongke',
 'giyo',
 '41532',
 'salveo',
 'oppof7',
 'e071',
 '4df',
 '90217',
 'sisyama',
 '3d4390',
 'pw23y',
 '0215205',
 '64x20x15cm',
 'evebaby',
 'leya6677',
 'voltplex',
 'undearmour',
 '60pg',
 '96tabs',
 'yuqiss',
 '4535549570',
 'trhirt',
 'tessel',

In [73]:
train_en[train_en.product_title.str.contains('-')]

Unnamed: 0,product_title,category
6,High-grade keychain,Home & Living
9,Baby shoes by Stride Rite (BRAND NEW) (3-6 mon...,Babies & Kids
10,ATI-ATIHAN COSTUME,Women Accessories
12,FACE MASK NON-WOVEN 50GSM,Health & Personal Care
13,[OFFICIAL] 2019 BTS 5TH MUSTER (MAGIC SHOP) BL...,"Toys, Games & Collectibles"
30,Pineng PN-968 with LCD Display and Flashlight ...,Mobile Accessories
32,4-WAY NURSING / BREASTFEEDING TOPS,Women's Apparel
46,Marvel Spider-Man Magazine Issue No. 09,Hobbies & Stationery
49,New Casio Gshock GA400 GA-400 Vampire Autoligh...,Men's Bags & Accessories
50,YFCB-008-2 Multifunctional Motor Mask Camoufla...,Sports & Travel


In [76]:
en_training=set([element for list_ in a.values for element in list_])

In [81]:
#All chinese words present in training and fasttext
temp_en=df_en[df_en.index.isin(en_training)].index.to_list()

In [97]:
len(temp_en)

99406

In [98]:
len(temp_zh)

155089

# Word embeddings from fasttext

In [35]:
import fasttext
import fasttext.util
import codecs

In [3]:
def load_vectors(embedding_path):
    fin = io.open(embedding_path + '.vec', 'r', encoding='utf-8', newline='\n', errors='ignore')
    n, d = map(int, fin.readline().split())
    print(n,d)
    words = []
    vectors = np.zeros((n,d))
    data = {}
    for idx, line in enumerate(fin):
        tokens = line.rstrip().split(' ')
        words.append(tokens[0])
        vectors[idx,:] += [float(val) for val in tokens[1:]]
#         data[tokens[0]] = map(float, tokens[1:])
        if idx % 10000 == 0:
            print('.', end = ' ')
    np.save(embedding_path + ".npy", vectors)
    with open(embedding_path + ".txt", 'w', encoding='utf-8') as output:
        for row in words:
            output.write(row + '\n')
    return words, vectors

In [11]:
en_words, en_tokens = load_vectors('cc.en.300')

2000000 300
. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 

In [45]:
zh_words, zh_tokens = load_vectors('cc.zh.300')

2000000 300
. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 

In [197]:
import codecs
def load_word_emb_binary(embedding_file_name_w_o_suffix):
    print("Loading binary word embedding from {0}.txt and {0}.npy".format(embedding_file_name_w_o_suffix))

    with codecs.open(embedding_file_name_w_o_suffix + '.txt', 'r', 'utf-8') as f_in:
        index2word = [line.strip() for line in f_in]

    wv = np.load(embedding_file_name_w_o_suffix + '.npy')
    word_embedding_map = {}
    for i, w in enumerate(index2word):
        word_embedding_map[w] = wv[i]

    return word_embedding_map

In [198]:
vec_en = load_word_emb_binary('cc.en.300')

Loading binary word embedding from cc.en.300.txt and cc.en.300.npy


In [199]:
vec_zh = load_word_emb_binary('cc.zh.300')

Loading binary word embedding from cc.zh.300.txt and cc.zh.300.npy


In [200]:
# Load embedding file
zh_tokens = np.load('cc.zh.300.npy')
en_tokens = np.load('cc.en.300.npy')

In [201]:
df_en = pd.DataFrame(vec_en.items(), columns=['words', 'embedding']).set_index('words')
df_zh = pd.DataFrame(vec_zh.items(), columns=['words', 'embedding']).set_index('words')

# Unsupervised learning

In [None]:
!python unsupervised.py --src_lang zh --tgt_lang en --src_emb cc.zh.300.vec --tgt_emb cc.en.300.vec --n_refinement 5 --normalize_embeddings center

In [None]:
import os
import pickle

if not os.path.isfile("params.pkl"):
    with open("params.pkl",'wb') as file:
        pickle.dump(, file)

# Getting nearest neighbor

In [58]:
def get_w2v(sentence, model):
    """
    :param sentence: inputs a single sentences whose word embedding is to be extracted.
    :param model: inputs glove model.
    :return: returns numpy array containing word embedding of all words    in input sentence.
    """
    return np.array([model.get(val, np.zeros(100)) for val in sentence.split()], dtype=np.float64)

In [198]:
def get_nn(word, src_emb, tgt_emb, df_tgt, K=5):
    print("Nearest neighbors of \"%s\":" % word)
    word_emb = get_w2v(word, src_emb).T
    scores = (tgt_emb / np.linalg.norm(tgt_emb, 2, 1)[:, None]).dot(word_emb / np.linalg.norm(word_emb))
    k_best = scores.reshape(1,-1)[0].argsort()[-5:][::-1]
    for i, idx in enumerate(k_best):
        print('%.4f - %s' % (scores[idx], df_tgt.iloc[idx].name))

In [201]:
df_zh[df_zh.index == '貓']

Unnamed: 0_level_0,embedding
words,Unnamed: 1_level_1
貓,"[-0.2967, 0.0171, 0.4615, 0.3601, 0.0896, -0.2..."


In [206]:
get_nn('booty', vec_en, en_tokens, df_en)

Nearest neighbors of "booty":
1.0000 - booty
0.7237 - booty.
0.6722 - bootys
0.6376 - bootay
0.6169 - Booty


In [153]:
test_scores[270830]

array([0.25989007])

In [142]:
test_scores.reshape(1,-1).argsort()[-5:][::-1]

array([[ 32690, 615543, 413467, ..., 368506, 960130, 270830]], dtype=int64)

In [130]:
test_scores.argsort()[-5:][::-1]

array([[0],
       [0],
       [0],
       [0],
       [0]], dtype=int64)

In [162]:
list(vec_en.keys())

{'，': array([ 8.000e-03,  3.360e-02,  5.720e-01, -1.105e-01, -2.000e-02,
         1.950e-02, -1.290e-02, -2.580e-02,  2.490e-02, -1.000e-03,
         1.820e-02, -7.400e-03, -2.880e-02, -1.160e-02, -3.550e-02,
         4.820e-02,  4.110e-02,  8.000e-04,  2.610e-02, -3.870e-02,
         1.040e-02, -3.500e-03,  7.150e-02, -1.000e-04, -3.970e-02,
        -3.530e-02,  5.460e-02, -4.620e-02, -4.390e-02,  6.210e-02,
        -2.760e-02,  1.510e-02, -5.280e-02, -1.600e-03,  7.400e-03,
        -3.120e-02,  3.200e-03, -8.430e-02, -6.980e-02, -3.090e-02,
         1.780e-02,  8.100e-03, -2.030e-02,  1.040e-02, -1.739e-01,
        -8.100e-03, -9.520e-02, -1.180e-02,  2.110e-02,  2.140e-02,
        -3.240e-02,  8.500e-03, -1.140e-02, -4.785e-01, -2.160e-02,
         4.690e-02, -2.600e-03,  1.520e-02, -4.380e-02,  5.540e-02,
        -4.410e-02, -2.560e-02, -8.400e-03, -1.210e-02, -3.720e-02,
        -1.090e-02,  7.040e-02,  2.080e-02, -4.570e-02, -2.520e-02,
        -1.150e-02,  4.140e-02,  7.800e-03,

In [156]:
k_best = test_scores.reshape(1,-1)[0].argsort()[-5:][::-1]
for i, idx in enumerate(k_best):
    print(idx)
#     print('%.4f - %s' % (scores[idx], tgt_id2word[idx]))

270830
960130
368506
555694
97995


In [113]:
get_nn('cat', vec_en, zh_tokens)

Nearest neighbors of "cat":


array([[0],
       [0],
       [0],
       [0],
       [0]], dtype=int64)

In [96]:
word_emb = test
(vec_zh / np.linalg.norm(vec_zh, 2, 1)[:, None]).dot(word_emb / np.linalg.norm(word_emb))

AttributeError: 'dict' object has no attribute 'conjugate'

In [105]:
word_emb = test.T
(zh_tokens / np.linalg.norm(zh_tokens, 2, 1)[:, None]).dot(word_emb / np.linalg.norm(word_emb))

array([[-0.02646693],
       [-0.00920756],
       [-0.01300759],
       ...,
       [-0.00100558],
       [-0.04740634],
       [ 0.01967714]])

In [82]:
np.linalg.norm(tgt_emb.values(), 2, 1)

AttributeError: 'dict_values' object has no attribute 'conjugate'

In [79]:
tgt_emb = vec_zh
(tgt_emb / np.linalg.norm(tgt_emb, 2, 1)[:, None])

AttributeError: 'dict' object has no attribute 'conjugate'

In [69]:
test = get_w2v('bleh', vec_en)

In [None]:
# printing nearest neighbors in the target space
src_word = 'cat'
get_nn(src_word, src_embeddings, src_id2word, tgt_embeddings, tgt_id2word, K=5)