In [1]:
import pandas as pd
from google.cloud import bigquery
import fasttext

In [2]:
# Country whose rows are pulled; interpolated into the WHERE clause below.
country_id = 1

# BigQuery client created with no explicit arguments.
client = bigquery.Client()

# Every column of the product/category table, restricted to the selected country.
products_query = f"""
    SELECT * 
    FROM `peya-food-and-groceries.user_rodrigo_benitez.product_categories`
    WHERE country_id = {country_id}
    """

# Submit the query, then materialise its result set as a pandas DataFrame.
query_job = client.query(products_query)
products_ds = query_job.to_dataframe()



In [3]:
# TO CSV

In [4]:
# Do not truncate long text columns when DataFrames are rendered.
pd.options.display.max_colwidth = None

# Preview five randomly chosen rows (no seed is set, so the rows differ between runs).
products_ds.sample(n=5)

Unnamed: 0,partner_id,partner_name,partner_description,business_type_id,business_type_name,country_id,legacyId_section,section_name,product_legacy_id,product_id,product_name,product_description,gtin,category_level_1_2_3
122161,244820,Maxi Kiosco 24 Hrs,,2,Market,1,3030364,"Golosinas, alfajores y chocolates",52234781,52234781,Chicles Beldent Sabor Mentol 10 g,,77942395.0,"Snacks | Confectionary | Candies, Mint / Gum"
192081,233576,Punto Shop 3,,2,Market,1,5801636,"Infusiones, Endulzantes y Otros",101734240,101734240,Dulce De Leche Conaprole 250 g,,7730105005121.0,Packaged Foods | Breakfast / Spreads | Spreads
68145,384245,De Casa Foods - Congelados,,2,Market,1,5340856,Empanadas pack 6 unidades,94631325,94631325,Caprese,,,
261238,190119,PedidosYa Market 5,,2,Market,1,1925709,Aguas y aguas saborizadas,28194824,28194824,Agua Salus Cero Limón Clásica 1.5 L,,7730400003204.0,Beverages | Water | Flavoured
55273,351717,El Propio Market,,2,Market,1,4925039,Snacks,89613004,89613004,Papas Pringles Original Navidad - 149Gr,,38000201141.0,Snacks | Salty Snacks | Chips / Crisps


In [5]:
# Keep only the identifier, the name and the category path, replacing missing
# values with empty strings.
model_columns = ['product_id', 'product_name', 'category_level_1_2_3']
model_info = products_ds.loc[:, model_columns].fillna('')

In [6]:
# Spot-check five random rows of the reduced frame.
model_info.sample(n=5)

Unnamed: 0,product_id,product_name,category_level_1_2_3
122176,49790967,Chocolate con Leche Oreo Milka 55g,Snacks | Confectionary | Chocolates
119841,94190777,Agua Vitale Sin Gas 625 Ml,Beverages | Water | Still
105952,98339364,Vino Rosado Clasico Santa Teresa Tetra 1 L,BWS | Wine / Sparkling Wine |
284009,92129201,Café Soluble Bracafé 170 Gr + Recarga 50 Gr Gratis,Packaged Foods | Tea / Coffee | Coffee
26417,78354893,Sesamo Pelado La Abundancia 200 G,Packaged Foods | Pasta / Rice / Grains | Rice


Proceso de limpieza de texto. Aquí se aplican las siguientes transformaciones:
- se eliminan los números
- se normaliza el texto a minúsculas (lowercase)
- se elimina la simbología especial (Ñ, tildes, diéresis) con la librería `unidecode`
- PENDIENTE: ¿eliminar las unidades de medida? (tokenizar y revisar las palabras de 1, 2 y 3 gramas)

In [7]:
import re
from unidecode import unidecode

def preprocess_text(text):
    """Normalise a product text into lowercase words without digits or punctuation.

    Steps, in order: newlines become spaces, the text is passed through
    ``unidecode`` (transliteration to ASCII), punctuation is removed,
    standalone numbers and standalone lowercase single letters are removed
    together with the spaces that follow them, any remaining digits are
    removed, and the result is lowercased and stripped of surrounding
    whitespace.

    Args:
        text: The raw string to clean.

    Returns:
        The cleaned string (possibly empty).
    """
    # The fourth positional parameter of re.sub is ``count``, not ``flags``.
    # Passing re.UNICODE positionally (integer value 32) silently capped each
    # substitution at 32 replacements, so the flag is now passed by keyword.
    word = re.sub('\n', ' ', text, flags=re.UNICODE)
    word = unidecode(word)
    word = re.sub(r'[^\w\s]', '', word, flags=re.UNICODE)
    # NOTE(review): this runs before lower(), so uppercase single letters
    # (e.g. the "G" in "240 G") are kept; confirm whether that is intended.
    word = re.sub(r"\b(\d+|[a-z])\b *", '', word, flags=re.UNICODE)
    # Digits glued to letters (e.g. "500Cc") are not standalone tokens, so
    # they are stripped separately here.
    word = re.sub(r'[0-9]+', '', word)
    word = word.lower()
    word = word.strip()
    return word

In [8]:
# Raw model text: category path followed by the product name, separated by one space.
category_text = model_info['category_level_1_2_3']
name_text = model_info['product_name']
model_info['full_text'] = category_text + ' ' + name_text

# Clean every raw text with preprocess_text, keeping row order.
model_info['preprocessed_text'] = [
    preprocess_text(raw_text) for raw_text in model_info['full_text']
]

model_info.sample(n=5)

Unnamed: 0,product_id,product_name,category_level_1_2_3,full_text,preprocessed_text
287240,84129615,Pan Para Chivitos Los Sorchantes 4 Un. 240 G,Bread / Bakery | Bread | Packaged,Bread / Bakery | Bread | Packaged Pan Para Chivitos Los Sorchantes 4 Un. 240 G,bread bakery bread packaged pan para chivitos los sorchantes un g
222160,101715256,Limpiador Lysoform Baño Gatillo 500Cc,Home / Pet | Cleaning / Laundry | Cleaning Products,Home / Pet | Cleaning / Laundry | Cleaning Products Limpiador Lysoform Baño Gatillo 500Cc,home pet cleaning laundry cleaning products limpiador lysoform bano gatillo cc
55879,101698150,Bebida isotónica Powerade manzana 995 cc.,Beverages | Juice / Ice Tea / Sports / Energy | Sports,Beverages | Juice / Ice Tea / Sports / Energy | Sports Bebida isotónica Powerade manzana 995 cc.,beverages juice ice tea sports energy sports bebida isotonica powerade manzana cc
170076,98440105,Puré De Papa Monte Cudine 125 G,Packaged Foods | Cooking / Condiments / Baking / Herbs / Spices | Cooking / Baking,Packaged Foods | Cooking / Condiments / Baking / Herbs / Spices | Cooking / Baking Puré De Papa Monte Cudine 125 G,packaged foods cooking condiments baking herbs spices cooking baking pure de papa monte cudine g
31062,25178885,Esponja Virulana Siempre Limpia Unidad,Home / Pet | Disposables | Cleaning Accesories,Home / Pet | Disposables | Cleaning Accesories Esponja Virulana Siempre Limpia Unidad,home pet disposables cleaning accesories esponja virulana siempre limpia unidad


In [9]:
# Write only the cleaned text column, without the index, to the model input file.
# NOTE(review): the file still starts with the "preprocessed_text" header line;
# confirm that is acceptable for whatever consumes this file.
model_input_path = '/home/rodrigobenitez/Documents/GitHub/Proyecto-Final-IMF/datasets/model_input.csv'
model_info.to_csv(model_input_path, columns=['preprocessed_text'], index=False)

In [10]:
# model = fasttext.train_unsupervised('/home/rodrigobenitez/Documents/GitHub/Proyecto-Final-IMF/datasets/model_input.csv')

In [11]:
# model.save_model('/home/rodrigobenitez/Documents/GitHub/Proyecto-Final-IMF/Model/no_supervisado.bin')

In [12]:
# Load the previously saved fastText model from disk; the cells that train and
# save it are commented out, so this file must already exist.
model_path = '/home/rodrigobenitez/Documents/GitHub/Proyecto-Final-IMF/Model/no_supervisado.bin'
model = fasttext.load_model(model_path)



In [13]:
# Sanity check of the embeddings: print the nearest neighbours of a few probe words.
text_words = ['aceite']
for word in text_words:
    print(f'Similares a {word}:')
    # Each neighbour is a pair; only its second element (the word) is shown.
    neighbours = model.get_nearest_neighbors(word)
    print([neighbour_word for _score, neighbour_word in neighbours])

Similares a aceite:
['aceit', 'aceita', 'acete', 'oliva', 'olivar', 'extvirgen', 'olivas', 'olivo', 'oliovita', 'olivares']


In [14]:
# One sentence vector per cleaned product text, in the same order as model_info's rows.
product_embeddings = []
for cleaned_text in model_info['preprocessed_text']:
    product_embeddings.append(model.get_sentence_vector(cleaned_text))

In [15]:
# Inspect column dtypes and non-null counts before converting product_id to a number.
model_info.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 298790 entries, 0 to 298789
Data columns (total 5 columns):
 #   Column                Non-Null Count   Dtype 
---  ------                --------------   ----- 
 0   product_id            298790 non-null  object
 1   product_name          298790 non-null  object
 2   category_level_1_2_3  298790 non-null  object
 3   full_text             298790 non-null  object
 4   preprocessed_text     298790 non-null  object
dtypes: object(5)
memory usage: 11.4+ MB


In [16]:
# product_id is stored as object dtype at this point; convert it to a numeric dtype.
numeric_product_ids = pd.to_numeric(model_info['product_id'])
model_info['product_id'] = numeric_product_ids

In [None]:
from annoy import AnnoyIndex

# Approximate nearest-neighbour index over the product sentence vectors,
# using angular distance.
embedding_dim = len(product_embeddings[0])
annoy_index = AnnoyIndex(embedding_dim, 'angular')

# Annoy allocates memory for max(item_id) + 1 items, so raw product_id values
# (in the tens of millions here) must not be used as item ids. Items are
# therefore keyed by dense row position, which is also the order in which
# product_embeddings was built from model_info.
product_id_by_item = model_info['product_id'].tolist()
# Reverse lookup; if a product_id appears on several rows, the last row wins.
item_by_product_id = {
    product_id: item for item, product_id in enumerate(product_id_by_item)
}

for item, product_embedding in enumerate(product_embeddings):
    annoy_index.add_item(item, product_embedding)

# Build the forest with 10 trees.
annoy_index.build(10)

# Query by product id:
#   neighbours = annoy_index.get_nns_by_item(item_by_product_id[some_product_id], 10)
#   model_info.iloc[neighbours]

In [18]:
# similar_products = annoy_index.get_nns_by_item(66153365, 10)
# products.loc[similar_products]