## Train Custom Word2Vec Model on Product Descriptions

Word vectors are trained on the product descriptions so that they reflect how words are used in this domain. Each item is then represented by the average of the word vectors of the words in its description.

In [1]:
import pandas as pd
import numpy as np

In [2]:
import gensim
from gensim.models import FastText
from gensim.models import Word2Vec
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
import nltk
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *

In [4]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /home/jovyan/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [None]:
data_all = pd.read_pickle("data/processed_data/articles_info.pkl")

In [3]:
data_all.dtypes

anbieter_artikelnummer                 object
anbieterID                            float64
erstanlageDatum           datetime64[ns, UTC]
stueck_pro_ve                         float64
artikelID                             float64
anbietermarktplatz                     object
warengruppe                            object
groupID_1                             float64
groupID_2                             float64
preis_euro                            float64
text_clean                             object
dtype: object

In [4]:
data = data_all.text_clean.tolist()
len(data)

750206

Create Stopwords to delete from descriptions

In [5]:
# Domain-specific filler words (list retrieved from a previous project) that are
# filtered out in addition to gensim's built-in STOPWORDS.
my_stop_words = STOPWORDS.union({
    'article', 'assort', 'brand', 'color', 'colour',
    'design', 'diameter', 'dimension', 'group', 'height',
    'high', 'length', 'material', 'model', 'pack',
    'package', 'piece', 'price', 'product', 'products',
    'quality', 'size', 'type', 'weight', 'width',
})

In [6]:
my_stop_words

frozenset({'a',
           'about',
           'above',
           'across',
           'after',
           'afterwards',
           'again',
           'against',
           'all',
           'almost',
           'alone',
           'along',
           'already',
           'also',
           'although',
           'always',
           'am',
           'among',
           'amongst',
           'amoungst',
           'amount',
           'an',
           'and',
           'another',
           'any',
           'anyhow',
           'anyone',
           'anything',
           'anyway',
           'anywhere',
           'are',
           'around',
           'article',
           'as',
           'assort',
           'at',
           'back',
           'be',
           'became',
           'because',
           'become',
           'becomes',
           'becoming',
           'been',
           'before',
           'beforehand',
           'behind',
           'being',
           'below'

Define and test process: 
- remove stopwords
- remove words less than 4 characters
- lemmatize words

In [10]:
def preprocess(text):
    """Tokenise a product description into lemmatised content words.

    The text is tokenised with ``gensim.utils.simple_preprocess``. Tokens that
    are in ``my_stop_words`` or that have 3 or fewer characters are dropped,
    and every remaining token is lemmatised as a verb (``pos='v'``) with the
    WordNet lemmatizer.

    Parameters
    ----------
    text : str
        Raw description text.

    Returns
    -------
    list of str
        Lemmatised tokens in their original order; duplicates are kept.
    """
    # Build the lemmatizer once per call instead of once per token.
    lemmatizer = WordNetLemmatizer()
    return [
        lemmatizer.lemmatize(token, pos='v')
        for token in gensim.utils.simple_preprocess(text)
        # The filter runs on the raw token, before lemmatisation, so a lemma
        # may still coincide with a stop word or be shorter than 4 characters.
        if token not in my_stop_words and len(token) > 3
    ]

In [15]:
preprocess(data[500])

['serum',
 'model',
 'body',
 'firm',
 'verona',
 'olive',
 'serum',
 'model',
 'body',
 'firm',
 'promote',
 'slimming',
 'body',
 'model',
 'contain',
 'drenaliptm',
 'have',
 'anti',
 'celulitowych']

In [17]:
# Tokenise every description. Each entry is passed through str() first so that
# non-string values (presumably missing descriptions) do not break preprocess().
preprocessed_data = [preprocess(str(description)) for description in data]

Train word2vec model for all words that are in the corpus at least 3 times

In [20]:
# Train Word2Vec on the tokenised descriptions: 150-dimensional vectors, a context
# window of 5 tokens, ignoring tokens that occur fewer than 3 times, with 4 worker threads.
# NOTE(review): `size=` is the gensim 3.x keyword; gensim >= 4.0 calls it `vector_size`.
model = Word2Vec(preprocessed_data, size=150, window=5, min_count=3, workers=4) 
# Persist the trained model so it can be reloaded without retraining.
model.save("data/models/word2vec/word2vec.model")
# Training was killed on the author's local machine (presumably out of memory);
# the plan was to rerun it on Google Cloud.

In [46]:
model.wv.most_similar('christmas')

[('santa', 0.5143343210220337),
 ('claus', 0.5046408176422119),
 ('noel', 0.4972369968891144),
 ('snowman', 0.49606654047966003),
 ('sledge', 0.47944319248199463),
 ('xmas', 0.47759997844696045),
 ('reindeer', 0.47422361373901367),
 ('icicle', 0.46171554923057556),
 ('gingerbread', 0.45593175292015076),
 ('tannenbaum', 0.4540959298610687)]

In [32]:
model.wv['cup']

array([-2.1599779e+00,  6.8682367e-01, -7.2076142e-01, -4.7736767e-01,
        1.4280242e+00, -3.6823744e-01,  1.0172700e+00, -8.3808619e-01,
       -1.8237360e+00,  4.6261322e-02,  2.2415426e+00, -5.5890721e-01,
        2.5289288e+00,  1.7290318e-01, -4.0860283e-01,  1.7178463e+00,
       -1.5374784e-01,  9.7109646e-01, -2.5287887e-01, -4.0118721e-01,
        3.3619389e-02,  2.7784282e-01, -4.7964957e-01, -4.9002728e-01,
       -4.9541393e-01, -1.1239982e+00,  6.8191260e-01, -7.4850720e-01,
       -2.8618386e-01, -1.0420500e+00,  7.3444045e-01, -8.5160333e-01,
        9.9954742e-01,  9.1576286e-02,  1.7469946e+00,  1.3060845e+00,
        2.4147463e+00, -2.1092894e+00, -5.4620767e-01, -8.5532054e-02,
        3.7522030e-01,  1.0530726e+00, -2.0046477e+00, -4.4788820e-01,
       -1.2595268e+00, -4.9259755e-01, -1.3246691e+00,  3.2946162e-02,
       -3.7333685e-01,  1.9794825e-01, -3.5582104e-01, -1.1775303e+00,
        2.4583805e+00, -1.4190321e+00, -6.0393763e-01, -2.9245725e+00,
      

In [53]:
word_vectors = model.wv

Function to calculate the average vector across all words in the description

In [54]:
def get_mean_vector(word2vec_model, words):
    """Average the word vectors of all in-vocabulary ``words``.

    Parameters
    ----------
    word2vec_model
        A trained gensim model exposing its vectors as ``.wv``, or a
        keyed-vectors object itself.
    words : iterable of str
        Tokens of a single description.

    Returns
    -------
    numpy.ndarray or list
        The element-wise mean of the vectors of the known words, or an empty
        list when none of the words is in the vocabulary.
    """
    # Use the vectors of the model that was passed in rather than the
    # notebook-level `word_vectors` global, so the vocabulary check and the
    # lookup always refer to the same model. Indexing the keyed vectors also
    # avoids the deprecated `model[words]` access on the model object.
    keyed_vectors = getattr(word2vec_model, "wv", word2vec_model)

    # remove out-of-vocabulary words
    known_words = [word for word in words if word in keyed_vectors]
    if not known_words:
        return []
    return np.mean(keyed_vectors[known_words], axis=0)

In [72]:
get_mean_vector(model, preprocessed_data[1])

  """


array([ 0.19540575, -0.07194629, -0.34633487, -0.24592285, -1.0357192 ,
       -0.24799103,  0.2567575 ,  0.20529896,  1.3067074 ,  1.9637139 ,
        0.8820851 , -0.4385833 , -0.03654981, -0.7472502 ,  0.29228267,
        2.221755  , -0.01760486,  0.01895112,  0.07876571,  0.508968  ,
        0.64661425,  0.598393  ,  1.0541649 , -1.2646081 ,  0.02103028,
        0.9427252 , -0.66634536, -2.2871926 ,  0.32233435, -0.9723202 ,
        0.23202752,  2.888791  , -0.25944984, -1.2893883 , -0.8821279 ,
       -1.8014927 , -0.6933597 , -0.94349897,  0.97366196,  0.4523265 ,
       -0.48284388, -1.2377995 , -0.2942778 ,  0.02108408, -0.44196057,
       -0.90491664,  1.7155467 , -0.98383105,  0.3765968 , -0.09718102,
       -2.3913531 , -0.19783945, -1.0304915 ,  1.4013278 ,  0.02132689,
       -0.22636081,  0.5579515 ,  0.17752208, -0.556667  ,  0.02837368,
       -1.5287467 ,  0.08335496,  0.9742878 ,  0.1800721 ,  0.12457419,
        0.9014499 ,  0.50994945, -0.35209292,  0.76253396, -0.32

In [74]:
# One mean vector per item, in the same order as preprocessed_data.
item_vec = [get_mean_vector(model, tokens) for tokens in preprocessed_data]

  """


In [75]:
len(item_vec)

750206

In [76]:
np.save("data/processed_data/item_text_vec.npy", item_vec)

## Test Performance

In [77]:
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
# Compared with the top matches returned by plain word2vec, the averaged description
# vectors seem to work better: some of the odd matches from there do not get high scores here.

In [91]:
sim = cosine_similarity(item_vec[99999].reshape(1,-1), item_vec[569553].reshape(1,-1))
sim

array([[0.22667369]], dtype=float32)

## Save array in item data

In [102]:
df = pd.DataFrame({"text_vec": item_vec})

In [103]:
df.head()

Unnamed: 0,text_vec
0,"[0.7984229, 0.09205151, -0.30848747, 0.669923,..."
1,"[0.19540575, -0.07194629, -0.34633487, -0.2459..."
2,"[0.5111147, 0.018874569, 0.22680217, 0.7675127..."
3,"[0.34992826, -0.059115075, -0.25212124, -0.110..."
4,"[0.13192588, -0.19249268, -0.3639368, -0.47083..."


In [104]:
# Attach the text vectors to the article data by joining on the row index.
item_text_vec = data_all.merge(df, left_index=True, right_index=True)

In [105]:
len(item_text_vec)

750206

In [106]:
item_text_vec.head()

Unnamed: 0,anbieter_artikelnummer,anbieterID,erstanlageDatum,stueck_pro_ve,artikelID,anbietermarktplatz,warengruppe,groupID_1,groupID_2,preis_euro,text_clean,text_vec
0,0000400435550,4004.0,2016-03-04 14:39:47+00:00,12.0,11608786.0,DE,Reiseartikel_5199,,,2.1,Luggage strap with velcro colorful A tear-resi...,"[0.7984229, 0.09205151, -0.30848747, 0.669923,..."
1,000040053900017-002,4005.0,2018-07-09 12:51:10+00:00,1.0,17618200.0,DE,sonstige Taschen_5202,,,59.95,"Leonardo Verrelli Genuine Leather Bag, Color: ...","[0.19540575, -0.07194629, -0.34633487, -0.2459..."
2,0000403433309,4034.0,2018-11-08 13:04:42+00:00,12.0,18402615.0,DE,Kerzen & Kerzenhalter_5107,190.0,878.0,2.79,Tealight holder Buddha Tealight holder Buddha ...,"[0.5111147, 0.018874569, 0.22680217, 0.7675127..."
3,0000403434036,4034.0,2016-11-29 15:31:47+00:00,20.0,13334081.0,DE,Lichterketten_5114,,,2.1,Christmas LED fairy lights made of copper wire...,"[0.34992826, -0.059115075, -0.25212124, -0.110..."
4,00004034LB-03,4034.0,2017-11-15 14:27:01+00:00,10.0,16079725.0,DE,Lichterketten_5114,385.0,405.0,1.89,LED balloon with fairy lights Transparent ball...,"[0.13192588, -0.19249268, -0.3639368, -0.47083..."


In [109]:
# Persist the article data together with the new `text_vec` column.
# NOTE(review): this is the same path that `data_all` is read from at the top of the
# notebook, so the input file is overwritten. On a re-run the loaded frame would already
# contain `text_vec`, and the index merge would then presumably produce suffixed duplicate
# columns — confirm this is intended or write to a separate file.
item_text_vec.to_pickle('data/processed_data/articles_info.pkl')

In [126]:
np.array(item_text_vec.text_vec[0:10])

array([array([ 7.98422873e-01,  9.20515135e-02, -3.08487475e-01,  6.69923007e-01,
       -8.83945048e-01, -1.14545323e-01, -7.62080789e-01, -1.32942057e+00,
       -3.56621355e-01, -6.27002239e-01,  6.35893226e-01, -9.33547735e-01,
        8.77977908e-02,  1.89592123e+00,  4.07814831e-02,  4.49965030e-01,
        1.37192857e+00, -5.65821864e-03,  8.82907510e-01, -4.31870818e-02,
        1.00704181e+00, -1.06144154e+00,  2.58876998e-02, -2.71237977e-02,
       -4.70394403e-01, -2.95297932e-02, -3.74819785e-01, -5.82692921e-01,
       -8.64614919e-02, -2.53110975e-01, -1.39879727e+00, -4.12181728e-02,
        2.22234502e-01, -1.16490865e+00, -2.92082518e-01, -3.31293613e-01,
       -1.90424919e-01,  2.85656661e-01, -5.19328177e-01,  6.61127627e-01,
       -1.05312538e+00, -1.47793770e+00, -1.93734303e-01,  2.87531823e-01,
       -8.30013514e-01, -8.83850515e-01, -5.10800302e-01,  2.68803775e-01,
        1.14465284e+00, -1.23454869e+00, -1.49396777e+00,  2.92633325e-01,
        3.43452811

In [None]:
test = item_text_vec.text_vec[0:10]

In [153]:
sim = cosine_similarity(item_text_vec.text_vec[99999].reshape(1,-1), test.tolist())
sim

array([[ 0.2663017 ,  0.07443871,  0.0929203 ,  0.2621449 ,  0.36728087,
         0.35635257,  0.33802235,  0.4352266 ,  0.40028778, -0.11413221]],
      dtype=float32)