In [4]:
import itertools
from itertools import combinations, groupby
import collections
import numpy as np
import pandas as pd

# Embeddings
from gensim.models import Word2Vec
import os
print(os.listdir("../input"))

# Any results you write to the current directory are saved as output.

['products.csv', 'departments.csv', 'sample_submission.csv', 'order_products__prior.csv', 'order_products__train.csv', 'aisles.csv', 'orders.csv']


In [5]:
products = pd.read_csv("../input/products.csv")
departments = pd.read_csv("../input/departments.csv")
aisles = pd.read_csv("../input/aisles.csv")

In [6]:
prod_names = list(products['product_name'])
product_table = pd.DataFrame(prod_names, columns=['Products'])

In [7]:
# Make everything lowercase.
product_table['Products_mod'] = product_table['Products'].str.lower()

# Clean special characters.
product_table['Products_mod'] = product_table['Products_mod'].str.replace('\W', ' ')

# Split products into terms: Tokenize.
product_table['Products_mod'] = product_table['Products_mod'].str.split()
product_table.head()

Unnamed: 0,Products,Products_mod
0,Chocolate Sandwich Cookies,"[chocolate, sandwich, cookies]"
1,All-Seasons Salt,"[all, seasons, salt]"
2,Robust Golden Unsweetened Oolong Tea,"[robust, golden, unsweetened, oolong, tea]"
3,Smart Ones Classic Favorites Mini Rigatoni Wit...,"[smart, ones, classic, favorites, mini, rigato..."
4,Green Chile Anytime Sauce,"[green, chile, anytime, sauce]"


In [8]:
# Add product and aisle information
enriched_prods = pd.merge(products, departments, on="department_id")
enriched_prods = pd.merge(enriched_prods, aisles, on="aisle_id")

In [9]:
enriched_prods[['product_id', 'product_name', 'department', 'aisle']]

Unnamed: 0,product_id,product_name,department,aisle
0,1,Chocolate Sandwich Cookies,snacks,cookies cakes
1,78,Nutter Butter Cookie Bites Go-Pak,snacks,cookies cakes
2,102,Danish Butter Cookies,snacks,cookies cakes
3,172,Gluten Free All Natural Chocolate Chip Cookies,snacks,cookies cakes
4,285,Mini Nilla Wafers Munch Pack,snacks,cookies cakes
5,331,Organic Lemon Gingersnap,snacks,cookies cakes
6,449,Chips Ahoy! Chewy Cookies,snacks,cookies cakes
7,559,Cookie Chips Crunchy Dark Chocolate Chocolate ...,snacks,cookies cakes
8,569,Golden Cupcakes 8 Pack,snacks,cookies cakes
9,574,Crunch Vanilla Sugar Mini Cookies,snacks,cookies cakes


In [10]:
# Append the tokenized column
product_table = pd.merge(enriched_prods[['product_name', 'department', 'aisle', 'department_id','aisle_id']], product_table, left_on="product_name", right_on="Products")
product_table.head()

Unnamed: 0,product_name,department,aisle,department_id,aisle_id,Products,Products_mod
0,Chocolate Sandwich Cookies,snacks,cookies cakes,19,61,Chocolate Sandwich Cookies,"[chocolate, sandwich, cookies]"
1,Nutter Butter Cookie Bites Go-Pak,snacks,cookies cakes,19,61,Nutter Butter Cookie Bites Go-Pak,"[nutter, butter, cookie, bites, go, pak]"
2,Danish Butter Cookies,snacks,cookies cakes,19,61,Danish Butter Cookies,"[danish, butter, cookies]"
3,Gluten Free All Natural Chocolate Chip Cookies,snacks,cookies cakes,19,61,Gluten Free All Natural Chocolate Chip Cookies,"[gluten, free, all, natural, chocolate, chip, ..."
4,Mini Nilla Wafers Munch Pack,snacks,cookies cakes,19,61,Mini Nilla Wafers Munch Pack,"[mini, nilla, wafers, munch, pack]"


In [11]:
w2vec_model = Word2Vec(list(product_table['Products_mod']), size=20, window=5, min_count=1, workers=4)

In [12]:
# Create  dictionaries to obtain product vectors

prod_word = dict()
for w in w2vec_model.wv.vocab:
    prod_word[w] = w2vec_model[w]

  """


In [13]:
display(list(prod_word.items())[:2])

[('chocolate',
  array([-0.7037704 ,  2.9269016 ,  1.4313685 ,  1.3194381 , -0.9869902 ,
          2.4666147 , -0.29313523, -3.6432633 ,  0.04991385,  0.1246125 ,
          2.8057313 ,  1.1690453 ,  0.21009387, -2.641713  ,  0.20084678,
         -0.11150149, -0.17785619, -1.7056086 , -1.7563633 ,  1.5764931 ],
        dtype=float32)),
 ('sandwich',
  array([-0.10577309,  0.43264827,  0.20517507,  0.5205766 , -1.27768   ,
          1.033573  ,  0.47063234, -1.1926118 , -0.8809756 ,  0.0500845 ,
          0.5143323 ,  0.44254893, -1.275482  , -1.8803289 ,  0.8199226 ,
          0.21343723,  0.6112116 , -0.5708859 , -0.9103039 ,  1.4354948 ],
        dtype=float32))]

In [24]:
# VECTOR CALCULATION FOR PRODUCTS
# Cycle through each word in the product name to generate the vector.
prods_w2v = dict()
for index, row in product_table.iterrows():
    word_vector = list()
    #print(row['Products_mod'])
    for word in row['Products_mod']:
        word_vector.append(prod_word[word])
    
    prods_w2v[row['Products']] = np.average(word_vector, axis=0)

In [26]:
display(list(prods_w2v.items())[:2])

[('Chocolate Sandwich Cookies',
  array([-0.21136063,  1.8709946 ,  0.7193214 ,  1.0028884 , -1.219966  ,
          1.7773724 ,  0.31743476, -2.5429494 , -0.32189876,  0.37106046,
          1.7924148 ,  0.66265345, -0.70871663, -2.2408504 ,  0.85238934,
          0.08909017, -0.11694751, -1.1796118 , -1.2502959 ,  1.4848709 ],
        dtype=float32)),
 ('Nutter Butter Cookie Bites Go-Pak',
  array([-0.4859861 ,  1.0834678 ,  0.3179979 ,  0.44258466, -0.6815359 ,
          1.0952351 ,  0.1932453 , -1.3168287 , -0.74037933,  0.23711114,
          0.901165  ,  0.03421742, -0.34427187, -1.0652884 ,  0.56833726,
          0.13860272,  0.05961088, -0.09044382, -0.8173497 ,  1.0002911 ],
        dtype=float32))]

In [16]:
product_table['vectors'] = prods_w2v.values()

In [17]:
product_table.head()

Unnamed: 0,product_name,department,aisle,department_id,aisle_id,Products,Products_mod,vectors
0,Chocolate Sandwich Cookies,snacks,cookies cakes,19,61,Chocolate Sandwich Cookies,"[chocolate, sandwich, cookies]","(-0.21136063, 1.8709946, 0.7193214, 1.0028884,..."
1,Nutter Butter Cookie Bites Go-Pak,snacks,cookies cakes,19,61,Nutter Butter Cookie Bites Go-Pak,"[nutter, butter, cookie, bites, go, pak]","(-0.4859861, 1.0834678, 0.3179979, 0.44258466,..."
2,Danish Butter Cookies,snacks,cookies cakes,19,61,Danish Butter Cookies,"[danish, butter, cookies]","(-0.80961174, 1.6220757, 0.96441585, 0.689753,..."
3,Gluten Free All Natural Chocolate Chip Cookies,snacks,cookies cakes,19,61,Gluten Free All Natural Chocolate Chip Cookies,"[gluten, free, all, natural, chocolate, chip, ...","(-0.3179844, 0.8896972, 0.6882824, 1.1404625, ..."
4,Mini Nilla Wafers Munch Pack,snacks,cookies cakes,19,61,Mini Nilla Wafers Munch Pack,"[mini, nilla, wafers, munch, pack]","(-0.105717584, 0.508803, -0.07997906, 0.141530..."
