In [1]:
import pandas as pd
from google.cloud import bigquery
import fasttext

In [2]:
country_id = 1

# The country filter is sent as a named query parameter (@country_id) instead
# of being interpolated into the SQL text, so the statement itself is constant
# and the value is typed by BigQuery rather than pasted into the string.
products_query = """
    SELECT *
    FROM `peya-food-and-groceries.user_rodrigo_benitez.product_categories`
    WHERE country_id = @country_id
    """

client = bigquery.Client()

# NOTE(review): INT64 assumes the country_id column is integer-typed (the
# original query compared it to the integer literal 1) - confirm.
query_config = bigquery.QueryJobConfig(
    query_parameters=[
        bigquery.ScalarQueryParameter("country_id", "INT64", country_id),
    ]
)

# Run the query and materialise the full result as a pandas DataFrame.
products_ds = client.query(products_query, job_config=query_config).to_dataframe()



In [3]:
# TO CSV

In [4]:
# Do not truncate long cell contents when rendering DataFrames.
pd.options.display.max_colwidth = None

# Peek at five random rows of the query result.
products_ds.sample(n=5)

Unnamed: 0,partner_id,partner_name,partner_description,business_type_id,business_type_name,country_id,legacyId_section,section_name,product_legacy_id,product_id,product_name,product_description,gtin,category_level_1_2_3
34228,369621,Kiosco 24 - La Teja,,2,Market,1,5006384,"Alfajores, chocolates y golosinas",90743096,90743096,Garoto Tabletas Jumbo Leche 90Gr,,7891008168884.0,Snacks | Confectionary | Chocolates
29266,322204,Salon 13,,2,Market,1,4008025,Limpieza De La Casa,72181693,72181693,Higienol Sin Fin Plus Papel Higienico Simple Hoja 4Rollosx50M,,7730219010707.0,Home / Pet | Disposables | Paper Products
112664,306455,Madre Tierra Paso Molino,,2,Market,1,5583591,Gluten Free,98397124,98397124,Vainilla Repostera X60Ml,,,
57210,249118,Autoservicio Corner Shop,,2,Market,1,2819707,Despensa,100456946,100456946,Esencia De Vainilla Monte Cudi Frasco 60 Cc,,7730177000369.0,Packaged Foods | Cooking / Condiments / Baking / Herbs / Spices | Cooking / Baking
60039,161038,PedidosYa Market 2,,2,Market,1,5372192,Insecticidas,79509175,79509175,Tableta Insecticida Jupiter 24 Unidades,,7840001005986.0,Home / Pet | Household | Pest Control


In [5]:
# Keep only the three columns used downstream; missing values become empty
# strings so the later string concatenation never sees NaN.
model_info = products_ds.loc[:, ['product_id', 'product_name', 'category_level_1_2_3']].fillna('')

In [6]:
# Five random rows of the reduced frame, for a quick visual check.
model_info.sample(n=5)

Unnamed: 0,product_id,product_name,category_level_1_2_3
198517,6566481,Queso Casancrem Clasico 320 g,
253342,90690557,Pamplona De Cerdo - 0.5 Kg,
31470,84268738,Garbanzos Revelacion 400 Grms,Packaged Foods | Canned / Jarred / Instant Meals | Canned Vegetables
256358,101713621,Chocolate Milka Aireado 100 G.,Snacks | Confectionary | Chocolates
205514,98439726,Premezcla Cocina Mix Exent Sin Gluten 500 G,Packaged Foods | Cooking / Condiments / Baking / Herbs / Spices | Cooking / Baking


Proceso de limpieza de texto. Aquí se modifican cosas como:
- se eliminan números
- se normaliza el texto a minúsculas (lowercase)
- se elimina simbología especial (Ñ, tildes, diéresis) con la librería `unidecode`
- TODO: ¿eliminar unidades de medida? (tokenizar y ver palabras de 1, 2 y 3 gramas)

In [7]:
import re
from unidecode import unidecode

def preprocess_text(text: str) -> str:
    """Normalize a raw product text before it is fed to the embedding model.

    Steps, in order:
      1. newlines are replaced by spaces;
      2. the text is passed through ``unidecode``;
      3. every character that is neither a word character nor whitespace
         is removed;
      4. standalone digit runs and standalone single lowercase letters are
         removed, together with any spaces that follow them;
      5. all remaining digits are removed;
      6. the result is lowercased and stripped of surrounding whitespace.

    Args:
        text: Raw text to clean.

    Returns:
        The cleaned text.
    """
    # ``flags`` must be passed by keyword: the fourth positional parameter of
    # ``re.sub`` is ``count``, so a positional ``re.UNICODE`` (== 32) caps the
    # number of replacements at 32 instead of setting a flag.
    word = re.sub('\n', ' ', text, flags=re.UNICODE)
    word = unidecode(word)
    word = re.sub(r'[^\w\s]', '', word, flags=re.UNICODE)
    # NOTE(review): this runs before lower(), so standalone uppercase letters
    # (e.g. "Y", or the "G" left over from "40G") are not removed - confirm
    # whether that is intended.
    word = re.sub(r"\b(\d+|[a-z])\b *", '', word, flags=re.UNICODE)
    # Strip digits that are glued to letters ("40G" -> "G").
    word = re.sub(r'[0-9]+', '', word)
    word = word.lower()
    word = word.strip()
    return word

In [8]:
# Join the category path and the product name with a single space, then run
# the combined text through the cleaning function.
model_info['full_text'] = model_info['category_level_1_2_3'].add(' ').add(model_info['product_name'])
model_info['preprocessed_text'] = model_info['full_text'].map(preprocess_text)

# Spot-check five random rows of the result.
model_info.sample(n=5)

Unnamed: 0,product_id,product_name,category_level_1_2_3,full_text,preprocessed_text
16331,49791334,Snack Trofeu Salado 40G Pimienta,Snacks | Other Snacks |,Snacks | Other Snacks | Snack Trofeu Salado 40G Pimienta,snacks other snacks snack trofeu salado g pimienta
115207,98246228,Helado Conaprole 1L Frutos Del Bosque,Frozen | Ice Cream / Desserts |,Frozen | Ice Cream / Desserts | Helado Conaprole 1L Frutos Del Bosque,frozen ice cream desserts helado conaprole l frutos del bosque
80681,67625299,Mini Croquetas Congeladas Jamon Y Queso 300G Artico,Frozen | Frozen Convenience / Bakery | Convenience Food,Frozen | Frozen Convenience / Bakery | Convenience Food Mini Croquetas Congeladas Jamon Y Queso 300G Artico,frozen frozen convenience bakery convenience food mini croquetas congeladas jamon y queso g artico
10741,72983728,Galleta Salvado Tripack 360G Maestro Cubano,Packaged Foods | Breakfast / Spreads | Crackers,Packaged Foods | Breakfast / Spreads | Crackers Galleta Salvado Tripack 360G Maestro Cubano,packaged foods breakfast spreads crackers galleta salvado tripack g maestro cubano
216949,49679002,Chorizo Cantimpalos Unidad,Meat / Seafood | Meat | Pork,Meat / Seafood | Meat | Pork Chorizo Cantimpalos Unidad,meat seafood meat pork chorizo cantimpalos unidad


In [9]:
# Write the cleaned texts, one per row, to the file used as training input.
# NOTE(review): the CSV header row ("preprocessed_text") is written as the
# first line; if this file is read as plain text for training, that header is
# presumably treated as a training sentence - confirm whether header=False is wanted.
# NOTE(review): absolute, user-specific path; consider a configurable data directory.
model_info['preprocessed_text'].to_frame().to_csv(
    path_or_buf='/home/rodrigobenitez/Documents/GitHub/Proyecto-Final-IMF/datasets/model_input.csv',
    index=False,
)

In [10]:
# model = fasttext.train_unsupervised('/home/rodrigobenitez/Documents/GitHub/Proyecto-Final-IMF/datasets/model_input.csv')

In [11]:
# model.save_model('/home/rodrigobenitez/Documents/GitHub/Proyecto-Final-IMF/Model/no_supervisado.bin')

In [12]:
# Load the fastText model from disk; the training and saving calls in the
# cells above are commented out, so this file must already exist.
# NOTE(review): absolute, user-specific path.
model = fasttext.load_model('/home/rodrigobenitez/Documents/GitHub/Proyecto-Final-IMF/Model/no_supervisado.bin')



In [13]:
# Sanity check of the embedding space: print the nearest-neighbour words of
# each probe word.
text_words = ['aceite']
for word in text_words:
    print(f'Similares a {word}:')
    # Each neighbour is a pair; only its second element (the word) is printed.
    print([neighbor for _similarity, neighbor in model.get_nearest_neighbors(word)])

Similares a aceite:
['aceit', 'aceita', 'acete', 'oliva', 'olivar', 'extvirgen', 'olivas', 'olivo', 'oliovita', 'olivares']


In [14]:
# One sentence vector per cleaned product text, in row order of model_info.
product_embeddings = list(map(model.get_sentence_vector, model_info['preprocessed_text']))

In [15]:
# Inspect column dtypes and non-null counts before converting product_id.
model_info.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 298790 entries, 0 to 298789
Data columns (total 5 columns):
 #   Column                Non-Null Count   Dtype 
---  ------                --------------   ----- 
 0   product_id            298790 non-null  object
 1   product_name          298790 non-null  object
 2   category_level_1_2_3  298790 non-null  object
 3   full_text             298790 non-null  object
 4   preprocessed_text     298790 non-null  object
dtypes: object(5)
memory usage: 11.4+ MB


In [16]:
# Convert product_id from object dtype to a numeric dtype.
model_info['product_id'] = model_info['product_id'].pipe(pd.to_numeric)

In [17]:
from annoy import AnnoyIndex

# Annoy index using the 'angular' metric; the vector size is taken from the
# length of the first product embedding.
annoy_index = AnnoyIndex(len(product_embeddings[0]), 'angular')

# Disabled draft: add one item per product and build the index with 10 trees.
# NOTE(review): the draft uses product_id as the Annoy item id. Annoy's
# documentation states that add_item allocates memory for max(id)+1 items,
# and the product_id values shown in the samples above are in the tens of
# millions - confirm before re-enabling (a dense 0..n-1 id plus a lookup
# back to product_id would avoid this).
# for product, product_embedding in zip(model_info.iterrows(), product_embeddings):
#     product_id = product[1]['product_id']
#     annoy_index.add_item(product_id, product_embedding)

# annoy_index.build(10)

In [18]:
# similar_products = annoy_index.get_nns_by_item(66153365, 10)
# products.loc[similar_products]