<a href="https://colab.research.google.com/github/pgosar/AlphaHacks/blob/main/brand_embeddings/FoodBrandEmbedding.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install fire
!pip install wikipedia

Collecting fire
[?25l  Downloading https://files.pythonhosted.org/packages/11/07/a119a1aa04d37bc819940d95ed7e135a7dcca1c098123a3764a6dcace9e7/fire-0.4.0.tar.gz (87kB)
[K     |████████████████████████████████| 92kB 2.7MB/s 
Building wheels for collected packages: fire
  Building wheel for fire (setup.py) ... [?25l[?25hdone
  Created wheel for fire: filename=fire-0.4.0-py2.py3-none-any.whl size=115943 sha256=d30d59e519d5b7b47e362ceff079ff93e240b3f8e0d0f34a727b761b1ef6074e
  Stored in directory: /root/.cache/pip/wheels/af/19/30/1ea0cad502dcb4e66ed5a690279628c827aea38bbbab75d5ed
Successfully built fire
Installing collected packages: fire
Successfully installed fire-0.4.0
Collecting wikipedia
  Downloading https://files.pythonhosted.org/packages/67/35/25e68fbc99e672127cc6fbb14b8ec1ba3dfef035bf1e4c90f78f24a80b7d/wikipedia-1.4.0.tar.gz
Building wheels for collected packages: wikipedia
  Building wheel for wikipedia (setup.py) ... [?25l[?25hdone
  Created wheel for wikipedia: filename=wi

In [None]:
import io
import os
import shutil
import re
import string
import tensorflow as tf
import numpy as np

import logging
import wikipedia
import random

import json
import google

import nltk
from nltk.corpus import stopwords

from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, Embedding, GlobalAveragePooling1D
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization

In [None]:
# Fetch the GloVe 6B archive and extract it. `cache_dir='.'` together with an
# empty `cache_subdir` keeps the download in the current working directory.
glove_url = "http://nlp.stanford.edu/data/glove.6B.zip"
glove_embeddings = tf.keras.utils.get_file(
    fname="glove.6B.zip",
    origin=glove_url,
    extract=True,
    cache_dir='.',
    cache_subdir='',
)

Downloading data from http://nlp.stanford.edu/data/glove.6B.zip


In [None]:
# Fetch the NLTK stop-word corpus read through `nltk.corpus.stopwords` below.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
# Path of the 200-dimensional GloVe vectors; GloveEmbeddings uses it as its default.
# NOTE(review): absolute Colab path — presumably where the glove.6B.zip download
# above is extracted; confirm before running outside Colab.
GLOVE_PATH = "/content/glove.6B.200d.txt"

In [None]:
class GloveEmbeddings:
    """Loader for pre-trained GloVe word vectors stored as plain text."""

    # Default vectors file; a `{dim}` placeholder in the path, if present, is
    # filled in with the embedding dimension.
    GLOVE_DIR = GLOVE_PATH
    # Default value substituted for `{dim}` in the path.
    EMBEDDING_DIM = 200

    @staticmethod
    def get_dict_word_embedding(path=GLOVE_DIR, embedding_dim=EMBEDDING_DIM):
        """Read a GloVe text file into a ``word -> vector`` dictionary.

        Each non-empty line is split on whitespace: the first field is the
        word, the remaining fields are parsed as float32 coefficients.

        Args:
            path: File path, optionally containing a ``{dim}`` placeholder.
            embedding_dim: Value substituted for ``{dim}`` in ``path``.

        Returns:
            dict mapping each word to a 1-D float32 ``numpy`` array.

        Raises:
            OSError: If the file cannot be opened.
            ValueError: If a coefficient cannot be parsed as a float.
        """
        word2emb = dict()
        # `with` closes the handle even when a malformed line raises; the
        # encoding is explicit so loading does not depend on the platform locale.
        with open(path.format(dim=embedding_dim), encoding='utf-8') as f:
            for line in f:
                values = line.split()
                if not values:
                    # Blank line (e.g. trailing newline at end of file).
                    continue
                word2emb[values[0]] = np.asarray(values[1:], dtype='float32')
        return word2emb

In [None]:
# Smoke-test the loader: display only the vocabulary size instead of echoing
# the entire word -> vector dictionary into the cell output.
len(GloveEmbeddings.get_dict_word_embedding())

In [None]:
# Tokens that build_embeddings skips when averaging word vectors.
# NOTE(review): `stopwords.words()` is called without a language, which appears
# to return the stop words of every language in the corpus — confirm whether
# `stopwords.words('english')` was intended for English Wikipedia text.
IGNORE_WORDS = set(stopwords.words())

In [None]:
# NOTE(review): DEFAULT_SET_BRANDS is not assigned in any cell visible here, so
# unless it is defined elsewhere this cell raises NameError on a fresh kernel —
# TODO define it or remove the cell.
DEFAULT_SET_BRANDS

In [None]:
# Module-level logger used by build_embeddings and query for their messages.
logger = logging.getLogger(__name__)

# Load data

In [None]:
# English Wikipedia (MediaWiki) API URL prefixes. Each ends with an open query
# parameter, so the caller appends a title, search term, or page id.
# Only `page_content_api` is referenced by the code visible in this notebook.
title_api = "https://en.wikipedia.org/w/api.php?action=query&format=json&titles="
search_api = "https://en.wikipedia.org/w/api.php?action=opensearch&limit=1&namespace=0&format=json&search="
pageid_api = "https://en.wikipedia.org/w/api.php?action=query&prop=extracts|revisions&rvslots=*&rvprop=content&format=json&pageids="
summary_api = "https://en.wikipedia.org/w/api.php?action=query&prop=extracts&exsentences=10&exlimit=1&explaintext=1&format=json&pageids="
summary_api_title = "https://en.wikipedia.org/w/api.php?action=query&prop=extracts&exsentences=10&exlimit=1&explaintext=1&format=json&titles="
page_content_api = "https://en.wikipedia.org/w/api.php?action=query&prop=extracts|revisions&exlimit=1&explaintext=1&format=json&pageids="

# Input: JSON file loaded into `brand_dict` (iterated as name -> page id by build_embeddings).
brand_list_fp = "/content/cleaned_brand_list.json"
# Output: JSON file the brand embeddings are written to and later queried from.
emb_save_fp = "embeddings.json"

In [None]:

def load_checkpoint(inPath):
  """Parse the JSON document stored at `inPath` and return the resulting object."""
  with open(inPath) as handle:
    return json.load(handle)

# Brand list loaded from `brand_list_fp`; build_embeddings iterates it as
# (brand name, Wikipedia page id) pairs.
brand_dict = load_checkpoint(brand_list_fp)

In [None]:
# Number of entries loaded from the brand list.
len(brand_dict)

1122

In [None]:
import fire
import json
import codecs
import requests
from string import punctuation

In [None]:
# The first 100 entries of `brand_dict`.
# NOTE(review): not referenced by any other visible cell — presumably intended
# for quick trial runs; confirm or remove.
test_dict = {k: brand_dict[k] for k in list(brand_dict)[:100]}

In [None]:
def _mean_token_embedding(text, wrd2emb, set_ignore_words):
  """Return the mean word vector of the usable tokens in `text`, or None.

  Tokens come from whitespace splitting, are lower-cased and stripped of
  leading/trailing punctuation. Tokens in `set_ignore_words` or absent from
  `wrd2emb` are dropped. None is returned when nothing is left to average.
  """
  list_emb = list()
  for token in text.split():
    token = token.lower().strip(punctuation)
    if token in set_ignore_words:
      continue

    emb = wrd2emb.get(token)
    if emb is not None:
      list_emb.append(emb)

  if not list_emb:
    return None
  return np.array(list_emb).mean(axis=0)


def build_embeddings(brand_list = brand_dict, fpath_save = emb_save_fp, set_ignore_words = IGNORE_WORDS):
  """Build one averaged GloVe embedding per brand and save them as JSON.

  For every ``name -> page id`` entry of `brand_list`, the page extract is
  requested from `page_content_api`, tokenised, and the GloVe vectors of the
  remaining tokens are averaged. Brands whose page cannot be fetched, has no
  extract, or yields no known token are skipped with a warning instead of
  aborting the run or writing NaN into the output file.

  Args:
    brand_list: Mapping of brand name to Wikipedia page id.
    fpath_save: Path of the JSON file to write (``name -> list of floats``).
    set_ignore_words: Tokens excluded from the average.

  Returns:
    None. The result is written to `fpath_save`.
  """
  logger.info("building knowledge base")
  dict_brand_name_emb = dict()

  wrd2emb = GloveEmbeddings.get_dict_word_embedding()

  # Iterate the argument, not the global brand list, so callers can pass a subset.
  for page, page_id in brand_list.items():
    # The API response keys its pages by the string form of the page id.
    page_id = str(page_id)
    try:
      # A timeout keeps one stalled request from hanging the whole build.
      result = requests.get(page_content_api + page_id, timeout=30)
      result.raise_for_status()
      content = result.json()['query']['pages'][page_id]['extract']
    except (requests.RequestException, KeyError, ValueError) as err:
      logger.warning("skipping `{}` (pageid {}): {!r}".format(page, page_id, err))
      continue

    brand_emb = _mean_token_embedding(content, wrd2emb, set_ignore_words)
    if brand_emb is None:
      # Averaging an empty list would produce NaN, which is not valid JSON.
      logger.warning("skipping `{}` (pageid {}): no embeddable tokens".format(page, page_id))
      continue

    dict_brand_name_emb[page] = brand_emb.tolist()

  logger.info("saving knowledge base to: `{}`".format(fpath_save))
  with codecs.open(fpath_save, 'w', encoding='utf-8') as fp:
    json.dump(dict_brand_name_emb, fp, separators=(',', ':'), indent=4)

  logger.info("knowledge base compiled")
  print("knowledge base compiled")

In [None]:
# Run the embedding build with its default arguments (output goes to `emb_save_fp`).
build_embeddings()

knowledge base compiled


In [None]:
# File size in megabytes (bytes / 1e6).
# NOTE(review): this path differs from `emb_save_fp` and is not written by any
# visible cell — confirm the file still exists or point this at `emb_save_fp`.
os.path.getsize("/content/data/brand_emb.json")/1000000

0.23748

In [None]:
# Ad-hoc check of what the `wikipedia` package returns for the title "501".
# NOTE(review): the title is passed as bytes (the result of `.encode`), not as
# str — confirm that is intended.
wikipedia.summary("501".encode("ascii", "ignore"), auto_suggest = False)

'Year 501 (DI) was a common year starting on Monday (link will display the full calendar) of the Julian calendar. At the time, it was known as the Year of the Consulship of Avienus and Pompeius (or, less frequently, year 1254 Ab urbe condita). The denomination 501 for this year has been used since the early medieval period, when the Anno Domini calendar era became the prevalent method in Europe for naming years.\n\n'

In [None]:
import operator
def query(target_brand_name, top_n=None, kb_fpath=emb_save_fp, dict_kb=None):
    """Rank all other brands by Euclidean distance to `target_brand_name`.

    Args:
        target_brand_name: Key of the brand to compare against.
        top_n: Keep only the `top_n` closest brands; ``None`` or ``0`` keeps
            the full ranking.
        kb_fpath: JSON file of ``name -> embedding`` read when `dict_kb` is
            not supplied.
        dict_kb: Already-loaded ``name -> embedding`` mapping; passing it
            avoids re-reading the file on every call.

    Returns:
        List of ``(brand name, distance)`` tuples sorted by ascending distance,
        excluding the target brand itself.

    Raises:
        KeyError: If `target_brand_name` is not in the knowledge base.
    """
    if dict_kb is None:
        with codecs.open(kb_fpath, encoding='utf-8') as fp:
            dict_kb = json.load(fp)

    if target_brand_name not in dict_kb:
        raise KeyError("brand `{}` is not in the knowledge base".format(target_brand_name))

    target_brand_emb = np.array(dict_kb[target_brand_name])

    dict_brand_name_emb_distance = {
        candidate_brand_name: np.linalg.norm(target_brand_emb - np.array(candidate_emb))
        for candidate_brand_name, candidate_emb in dict_kb.items()
        if candidate_brand_name != target_brand_name
    }

    # sorted() is stable, so equally distant brands keep knowledge-base order.
    sorted_dict = sorted(dict_brand_name_emb_distance.items(), key=operator.itemgetter(1))

    if top_n:
        sorted_dict = sorted_dict[: top_n]

    logger.debug("{}: {}".format(target_brand_name, sorted_dict))

    return sorted_dict


In [None]:
# Look up the Wikipedia page id stored for one brand.
brand_dict["Winchester Cheese Company"]

'23325827'

In [None]:
# The five brands whose embeddings are closest to General Mills, nearest first.
query("General Mills", top_n = 5)

[('George Weston Limited', 0.5822406671597511),
 ("Kellogg's", 0.661554623753576),
 ('Snak King', 0.6649193996563848),
 ('The Hershey Company', 0.6821557864586464),
 ('Rich Products', 0.7020520728561569)]

In [None]:
# Size of the saved embeddings file in megabytes (bytes / 1e6).
os.path.getsize(emb_save_fp)/1000000

6.66232

In [38]:
import pandas as pd 

# Load the saved embeddings as a DataFrame; per the rendered output this gives
# one column per brand and one row per embedding dimension.
pd.read_json(emb_save_fp)

Unnamed: 0,All Joy Foods,BOS Ice Tea,Cevital,Choppies,Colcom Foods,Distell Group Limited,Famous Brands,Kenya Wine Agencies Limited,Les Domaines Agricoles,Meat Corporation of Namibia,Melcom,Nile Breweries Limited,Pioneer Foods,SOMED,Tiger Brands,Tilda Uganda,Tongaat Hulett,Grupo Arcor,La Serenísima,Molinos Río de la Plata,SanCor,The a2 Milk Company,Baiada Poultry,Balfours,Beerenberg Farm,Bega Cheese,Bellamy's Australia,Bickford's Australia,Boost Juice,Bulla Dairy Foods,Bundaberg Brewed Drinks,Camperdown Dairy International,Canberra Milk,Darrell Lea,Dick Smith Foods,Ernest Hillier Chocolates,Ferguson Plarre Bakehouses,Frosty Boy,Haigh's Chocolates,Huon Aquaculture,...,Cuulong Fish,Habeco,Hai Ha Confectionery,Hanoimilk,Highlands Coffee,Huda Beer,Kinh Do Corporation,Sabeco,Trung Nguyên,Vinacafe,Vinamilk,Lingerie,Asset management,Private equity,Venture capital,Manufacturing,Rolling stock,Retail,Department,Marketing research,Automation integrator,Computer-aided design,Electronic design automation,Information technology,Photovoltaics,passenger,Privacy policy,Bega_Dairy_%26_Drinks,Dachan_Food_(Asia),Debauve_%26_Gallais,C._Hahne_M%C3%BChlenwerke_GmbH_%26_Co._KG,Coppenrath_%26_Wiese,Oishi_(company),Sumol_%2B_Compal,Korea_Tobacco_%26_Ginseng_Corporation,Gunnar_Dafg%C3%A5rd_AB,A._L._Simpkin_%26_Co._Ltd,F._Duerr_%26_Sons,"R._Torre_%26_Company,_Inc.",Vanberg_%26_DeWulf
0,0.074192,0.027856,0.071841,0.079619,0.020883,0.040522,0.013133,-0.003113,-0.051152,-0.033607,-0.034942,-0.008649,0.038141,0.056857,-0.015924,0.119242,-0.031964,-0.082654,-0.120472,-0.022841,-0.098334,-0.113776,0.069628,0.023818,0.017667,-0.110788,-0.005278,-0.001800,-0.054884,-0.034992,-0.014592,-0.072434,-0.203176,-0.093231,-0.078920,0.012508,0.002781,-0.054339,0.109125,0.046649,...,0.128804,-0.038619,0.133489,0.032283,0.046662,0.043294,0.002742,-0.059457,0.123889,-0.014712,-0.038465,0.181295,0.234002,0.129615,0.098864,0.023997,0.248072,0.149938,0.047187,0.087970,0.122870,0.140429,0.104005,0.100363,0.100668,0.200260,0.089264,-0.165903,0.110367,0.066419,-0.011297,-0.028329,0.000159,0.013051,0.058343,-0.029813,0.012200,0.000435,-0.094443,-0.046832
1,0.135835,0.015963,0.093612,0.103230,0.092092,0.167960,0.095780,0.147136,-0.027545,0.056857,0.097581,0.046334,0.133917,0.054467,0.077409,-0.009044,0.068723,0.120719,0.099507,0.099443,0.095967,0.172751,0.071992,0.051387,0.036436,0.107756,0.188516,0.067191,0.137108,0.062237,0.126994,0.105592,0.075372,0.047124,0.113112,0.086751,0.187658,0.053890,0.096107,0.088445,...,0.082183,0.165911,0.104471,0.062728,0.108206,0.092825,0.053790,0.151087,0.149415,0.124074,0.046088,-0.083205,0.107043,0.042639,0.101048,0.170997,0.003256,0.155234,0.088153,0.162668,0.290251,0.185572,0.229134,0.188149,0.113691,0.055729,0.201848,0.141530,0.146554,0.044222,0.000631,0.014609,0.116095,0.103357,0.126926,0.161783,0.068761,0.060058,0.144078,0.175595
2,-0.088113,-0.094814,-0.053202,-0.074755,0.004973,-0.048460,-0.156109,-0.022231,0.013406,-0.031598,-0.138336,-0.124331,-0.089807,-0.072766,-0.194249,-0.050860,-0.062843,-0.133196,-0.095649,-0.063238,-0.029579,-0.134319,-0.120414,-0.145083,-0.093600,-0.138683,-0.201249,-0.131731,-0.155363,-0.146323,-0.137577,-0.174701,-0.177057,-0.232148,-0.143223,-0.133806,-0.165298,-0.123355,-0.220570,-0.126350,...,-0.120460,-0.150971,-0.063705,0.028609,-0.104679,-0.061142,-0.099342,-0.132694,-0.051077,-0.060472,-0.036197,-0.200831,0.045143,-0.028613,-0.070636,-0.081281,-0.025371,-0.027197,0.074705,0.031279,0.062163,0.019838,0.020619,0.073787,-0.022916,-0.117600,0.135398,-0.188348,-0.026922,-0.112814,-0.027228,-0.090717,-0.132578,-0.107636,-0.057813,-0.012551,-0.133118,-0.084207,-0.079820,-0.117924
3,-0.004758,0.011038,-0.029369,0.006895,-0.033487,-0.055559,0.085752,-0.021968,-0.063232,0.048071,-0.216577,0.064299,0.077164,0.021803,-0.005468,0.021236,-0.002922,-0.024461,-0.116354,-0.068251,-0.095495,-0.044077,-0.092331,-0.041632,-0.066623,0.043679,-0.109556,-0.029358,-0.027737,-0.023421,-0.078966,-0.098639,-0.051424,-0.134563,-0.073284,-0.074714,-0.020969,-0.019409,-0.126995,-0.095120,...,-0.140617,-0.106957,-0.054061,-0.021994,-0.113235,-0.063667,-0.072230,-0.121741,-0.085689,-0.101692,-0.026037,-0.135621,-0.014650,-0.097826,-0.090235,-0.046308,0.029217,-0.113898,-0.187271,-0.110174,-0.048867,-0.146340,-0.150806,-0.136318,-0.041943,0.051586,-0.141342,0.002585,-0.155423,-0.144675,0.049787,-0.012974,-0.193549,-0.012383,-0.126619,-0.074002,-0.058396,-0.020076,-0.097344,-0.076790
4,0.103900,0.094004,0.080591,0.076292,0.025045,0.129601,0.043351,0.053008,0.206151,0.058733,0.013721,0.085327,0.128128,0.155556,0.031216,0.031381,0.107281,0.079044,0.061460,0.038535,0.044851,0.102430,-0.031461,0.007772,0.040580,0.090553,0.038432,0.047402,0.078397,0.065309,0.023821,0.019276,0.069039,0.040409,0.034968,0.026649,0.043859,0.016879,0.007419,0.015308,...,0.045238,0.074042,0.100220,0.184308,0.178922,0.119948,0.200564,0.132254,0.198363,0.126698,0.138415,0.188984,0.078137,0.014306,0.043172,0.105482,0.026035,0.104761,0.092226,0.216257,0.225447,0.320748,0.249209,0.186855,0.156928,0.059431,0.129284,0.119832,-0.055788,0.161184,0.216585,0.036210,0.030312,0.042237,0.080785,0.104493,0.008702,0.072699,0.076119,0.032204
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
195,0.110888,0.123693,0.036231,0.079908,0.143337,0.173187,0.110908,0.127962,0.188742,0.153091,0.114933,0.088451,0.119737,0.173204,0.122053,0.139565,0.132850,0.111900,0.138122,0.028347,0.141317,0.115779,0.054555,0.128109,0.084141,0.183763,0.048415,0.162754,0.133023,0.086266,0.133487,0.111430,0.321342,0.084147,0.129618,0.146897,0.123714,0.041983,0.128821,0.158463,...,0.098501,0.242317,0.084279,0.146087,0.161478,0.212622,0.137093,0.100584,0.071265,0.110812,0.200906,-0.179667,-0.132628,-0.047696,-0.017923,-0.148314,-0.042848,-0.140653,0.259766,-0.118915,-0.138946,-0.242857,-0.188619,-0.132365,-0.191517,0.001101,-0.025577,0.284773,0.112592,0.062849,0.017675,0.027082,0.135356,0.228786,0.096255,0.007474,0.023365,0.106523,0.185806,0.124506
196,-0.174846,-0.105083,-0.056254,-0.108234,-0.171403,-0.161709,-0.135759,-0.070925,0.124069,-0.039690,-0.112447,-0.238017,-0.181977,0.047991,-0.081321,-0.131509,-0.057806,-0.173794,-0.138489,-0.151998,-0.115284,0.023184,0.005942,-0.084875,-0.196716,-0.132522,-0.078023,-0.166158,-0.209678,-0.126429,-0.258729,-0.025592,-0.152642,-0.058691,-0.132697,-0.146447,-0.017483,-0.169942,-0.148225,0.028826,...,-0.040335,-0.166456,-0.304036,-0.235774,-0.234186,-0.244056,-0.069095,-0.083039,-0.171778,-0.100768,-0.092989,0.023340,0.124109,-0.127383,-0.129485,-0.112451,-0.074855,-0.132354,0.013803,-0.025300,0.023132,0.003633,0.017824,-0.078463,-0.095247,-0.063405,0.044623,-0.151336,-0.141952,-0.006182,-0.185774,-0.161022,-0.078927,-0.183723,-0.133391,-0.349682,-0.140841,-0.169111,-0.259888,-0.080331
197,-0.068551,-0.129912,-0.017350,-0.021815,-0.024989,0.095313,-0.037252,0.095853,0.110349,-0.092770,0.011530,0.051904,0.005280,0.099900,-0.046912,0.063208,0.061946,-0.002469,-0.030833,-0.058418,0.000531,-0.020339,0.055384,0.111253,0.099684,0.008566,-0.001383,-0.010196,-0.018336,0.016165,-0.058280,-0.014710,-0.012833,0.077132,0.041611,0.045559,0.095658,-0.061162,0.080566,0.036006,...,-0.138820,-0.009498,-0.032717,-0.106032,-0.080722,-0.110939,-0.054183,0.093714,-0.215951,-0.154324,-0.083775,0.108236,0.056096,0.107141,0.062616,0.020570,0.149472,0.095808,0.038069,0.023496,0.154102,0.035556,0.020408,0.002641,0.059436,0.006707,0.023817,0.049363,-0.004066,0.010650,-0.039783,0.040151,-0.048723,-0.137280,-0.006258,0.000201,0.102774,0.145666,0.015614,-0.048650
198,0.028004,-0.091832,-0.013943,0.043212,0.040353,0.218866,0.103120,0.127007,0.042272,-0.065158,-0.078183,0.060976,0.116456,0.146992,0.117395,-0.055671,0.095772,0.124607,0.167881,0.111623,0.031482,0.114395,-0.122910,0.137155,-0.001637,0.089874,0.120354,0.117888,0.037479,-0.026719,0.122031,0.029107,0.016849,0.068909,0.107148,0.105563,-0.020743,0.011959,0.061338,0.021789,...,-0.073344,0.117347,0.059783,-0.044737,0.046825,0.179003,0.073245,0.065447,-0.046708,0.011681,0.041925,-0.129603,0.146136,0.170599,0.114767,0.043451,-0.021687,-0.032169,0.051241,0.060271,-0.105791,-0.046848,-0.010750,0.051161,-0.129876,-0.187283,-0.067526,0.144154,0.011429,0.141525,-0.068822,0.077004,0.117597,0.118248,0.111990,0.110061,-0.002312,0.097829,0.128533,0.196876


In [None]:
def query_list(list_target_brand_name, top_n=None, kb_fpath=emb_save_fp):
    """Run `query` for several brands against a single load of the knowledge base.

    Args:
        list_target_brand_name: Iterable of brand names to look up.
        top_n: Forwarded to `query`; limits each ranking to the closest brands.
        kb_fpath: JSON file of ``name -> embedding``; defaults to the same file
            `query` reads.

    Returns:
        dict mapping each target brand name to the ranking returned by `query`.
    """
    # Load the file once and hand the dict to every query() call.
    with codecs.open(kb_fpath, encoding='utf-8') as fp:
        dict_kb = json.load(fp)

    return {
        target_brand_name: query(target_brand_name, top_n=top_n, dict_kb=dict_kb)
        for target_brand_name in list_target_brand_name
    }