In [1]:
import pandas as pd
from google.cloud import bigquery
import fasttext

In [2]:
# Country whose rows are requested; interpolated into the WHERE clause below.
country_id = 1

# SQL text sent to BigQuery: all columns of the product_categories table,
# restricted to rows whose country_id equals the value above.
products_query = f"""
    SELECT * 
    FROM `peya-food-and-groceries.user_rodrigo_benitez.product_categories`
    WHERE country_id = {country_id}
    """

# Create a client with no explicit arguments, run the query and load the
# full result into a pandas DataFrame.
client = bigquery.Client()
products_ds = client.query(products_query).to_dataframe()



In [3]:
# TO CSV

In [4]:
# Do not truncate long cell values when DataFrames are displayed.
pd.set_option('display.max_colwidth', None)

# Preview five random rows of the raw query result.
products_ds.sample(5)

Unnamed: 0,partner_id,partner_name,partner_description,business_type_id,business_type_name,country_id,legacyId_section,section_name,product_legacy_id,product_id,product_name,product_description,gtin,category_level_1_2_3
32940,183068,Punto Shop 33,,2,Market,1,5801283,Galletas dulces y saladas,101712106,101712106,Galletas Oreo Mini 50g,,7622300841461,Snacks | Confectionary | Cookies
115103,330924,Unni Mercado,,2,Market,1,4178886,Lácteos y huevos,76169928,76169928,Huevo Blanco Prodhin Pack 15 unidades,,7730239000016,Dairy / Chilled / Eggs | Dairy / Eggs | Eggs
81082,171720,Punto Shop 96,,2,Market,1,5800985,Limpieza del hogar,101695261,101695261,Jabón Líquido Nevex Para Diluir 500 ml + Botella 3 L Vacía,,7730755001955,Home / Pet | Cleaning / Laundry | Laundry
184797,253456,Sluckis - Mercedes,,2,Market,1,2889716,Cuidado personal,49790311,49790311,Crema Sleep For Men 40 G,,7730287002222,Personal Care / Baby / Health | Personal Care / Beauty | Skin Care
53126,168524,Carniceria Ecomarket 2,,2,Market,1,1587914,Pollo,19698159,19698159,Suprema De Pollo 0.8 kg,peso aproximado,2850200308301,Meat / Seafood | Poultry | Chicken


In [5]:
# Keep only the identifier, the name and the category path, and replace
# missing values with empty strings so later string operations never see NaN.
model_info = products_ds.loc[:, ['product_id', 'product_name', 'category_level_1_2_3']].fillna('')

In [6]:
# Preview five random rows of the reduced frame.
model_info.sample(5)

Unnamed: 0,product_id,product_name,category_level_1_2_3
284312,101697924,Bebida isotónica Powerade mountain blast 500 cc.,Beverages | Juice / Ice Tea / Sports / Energy | Sports
171823,41391655,Harina 000 Cañuelas Paquete 1 kg,Packaged Foods | Cooking / Condiments / Baking / Herbs / Spices | Cooking / Baking
103580,105226128,Obleas Hipopo Vainilla 84 G.,Snacks | Confectionary | Other Confectionary
217505,93620160,Café Viaggio En Cápsulas Cioccolato Intensidad 8 55 Gr Caja De 10 Cápsulas,Packaged Foods | Tea / Coffee | Coffee
273872,100964371,Te Lady Grey 10 Un Twinings,Packaged Foods | Tea / Coffee | Tea


Proceso de limpieza de texto. Aquí se aplican las siguientes transformaciones:
- se eliminan los números
- se normaliza el texto a minúsculas (lowercase)
- se eliminan los símbolos especiales (Ñ, tildes, diéresis) con la librería `unidecode`
- TODO (pendiente de decidir): eliminar las unidades de medida (tokenizar y revisar los n-gramas de 1, 2 y 3 palabras)

In [7]:
import re
from unidecode import unidecode

def preprocess_text(text):
    """Normalise a product name into lowercase ASCII words.

    Steps, in order:
      1. newlines become spaces;
      2. ``unidecode`` transliterates the text to ASCII (accents, tildes and
         diaereses are dropped);
      3. every character that is neither a word character nor whitespace is
         removed;
      4. stand-alone numbers and stand-alone single lowercase letters are
         removed together with the spaces that follow them;
      5. any digits still attached to letters are removed;
      6. the result is lowercased and stripped of surrounding whitespace.

    Args:
        text: The raw string to clean.

    Returns:
        The cleaned string (possibly empty).
    """
    # The fourth positional argument of re.sub is `count`, not `flags`.
    # Passing re.UNICODE (== 32) positionally capped each substitution at 32
    # replacements and applied no flag at all, so flags go by keyword here.
    word = re.sub('\n', ' ', text, flags=re.UNICODE)
    word = unidecode(word)
    word = re.sub(r'[^\w\s]', '', word, flags=re.UNICODE)
    # NOTE: this runs before lowercasing, so stand-alone UPPERCASE letters
    # (e.g. a unit written as "G") are kept and only lowercased afterwards.
    word = re.sub(r"\b(\d+|[a-z])\b *", '', word, flags=re.UNICODE)
    word = re.sub(r'[0-9]+', '', word)
    return word.lower().strip()

In [8]:
# Disabled alternative: build the text from the category path plus the product name.
# model_info['full_text'] = model_info['category_level_1_2_3'] + ' ' + model_info['product_name']
# Clean every product name with preprocess_text and store the result in a new column.
model_info['preprocessed_text'] = model_info['product_name'].apply(preprocess_text)

# Preview five random rows to compare the cleaned text with the original name.
model_info.sample(5)

Unnamed: 0,product_id,product_name,category_level_1_2_3,preprocessed_text
242528,90527487,Champignones Revelacion Laminados lata 184G,Packaged Foods | Canned / Jarred / Instant Meals | Canned Vegetables,champignones revelacion laminados lata g
278566,84544922,Refresco Fanta Naranja 2.25 L,,refresco fanta naranja l
288143,89076882,Cigarros Fiesta Box 20Un,Smoking / Tobacco | Tobacco | Cigarette Sticks,cigarros fiesta box un
126119,45707342,Vino Adobe Sauvignon Blanco Organico - 750ml,BWS | Wine / Sparkling Wine | White,vino adobe sauvignon blanco organico ml
263826,48849548,"Agua Salus Frutté Naranja 1,65 L",Beverages | Water | Flavoured,agua salus frutte naranja l


In [9]:
# Summarise columns, non-null counts and dtypes after adding the cleaned text.
model_info.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 298790 entries, 0 to 298789
Data columns (total 4 columns):
 #   Column                Non-Null Count   Dtype 
---  ------                --------------   ----- 
 0   product_id            298790 non-null  object
 1   product_name          298790 non-null  object
 2   category_level_1_2_3  298790 non-null  object
 3   preprocessed_text     298790 non-null  object
dtypes: object(4)
memory usage: 9.1+ MB


In [10]:
# Write only the cleaned-text column to disk, without the index column.
# NOTE(review): to_csv still emits the column name as the first line; if this
# file is consumed as raw text for training, that line becomes part of the
# corpus -- confirm whether header=False is wanted.
# NOTE(review): the destination is an absolute path on one user's machine.
model_info[['preprocessed_text']].to_csv('/home/rodrigobenitez/Documents/GitHub/Proyecto-Final-IMF/datasets/model_input.csv', index=False)

In [11]:
# model = fasttext.train_unsupervised('/home/rodrigobenitez/Documents/GitHub/Proyecto-Final-IMF/datasets/model_input.csv')

In [12]:
# model.save_model('/home/rodrigobenitez/Documents/GitHub/Proyecto-Final-IMF/Model/no_supervisado.bin')

In [13]:
# Load a previously saved fastText model from disk (the training and saving
# cells in this notebook are commented out).
# NOTE(review): the model file is an absolute path on one user's machine.
model = fasttext.load_model('/home/rodrigobenitez/Documents/GitHub/Proyecto-Final-IMF/Model/no_supervisado.bin')



In [14]:
# Sanity check of the embeddings: print the nearest vocabulary entries for
# each probe word.
text_words = ['aceite']
for word in text_words:
    print(f'Similares a {word}:')
    # Each neighbour is a pair; only its second element (the word) is shown.
    print([neighbor for _score, neighbor in model.get_nearest_neighbors(word)])

Similares a aceite:
['aceit', 'aceita', 'acete', 'oliva', 'olivar', 'extvirgen', 'olivas', 'olivo', 'oliovita', 'olivares']


In [15]:
# One sentence vector per cleaned product name, in the same order as the rows
# of model_info.
product_embeddings = list(map(model.get_sentence_vector, model_info['preprocessed_text']))

In [16]:
# Re-check the frame's columns, non-null counts and dtypes.
model_info.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 298790 entries, 0 to 298789
Data columns (total 4 columns):
 #   Column                Non-Null Count   Dtype 
---  ------                --------------   ----- 
 0   product_id            298790 non-null  object
 1   product_name          298790 non-null  object
 2   category_level_1_2_3  298790 non-null  object
 3   preprocessed_text     298790 non-null  object
dtypes: object(4)
memory usage: 9.1+ MB


In [17]:
# Replace the product_id column with its numeric conversion (overwrites the
# column on the existing frame).
model_info['product_id'] = pd.to_numeric(model_info['product_id'])

In [None]:
from annoy import AnnoyIndex

# Approximate nearest-neighbour index over the product embeddings, using
# angular distance and the dimensionality of the first embedding.
annoy_index = AnnoyIndex(len(product_embeddings[0]), 'angular')

# Annoy allocates storage for max(item_id) + 1 vectors, so item ids must be
# small, dense integers. Using raw product_id values as ids would reserve
# space for every integer up to the largest product_id, and repeated
# product_ids would overwrite each other. The row position is used instead:
# item i of the index is row i of model_info / product_embeddings.
# This list maps an Annoy item id back to its product_id.
annoy_item_product_ids = model_info['product_id'].tolist()

for item_id, product_embedding in enumerate(product_embeddings):
    annoy_index.add_item(item_id, product_embedding)

# Build the forest with 10 trees; no more items can be added afterwards.
# Neighbours returned by get_nns_by_item / get_nns_by_vector are row
# positions: use model_info.iloc[...] or annoy_item_product_ids[...] on them.
annoy_index.build(10)

In [None]:
# similar_products = annoy_index.get_nns_by_item(66153365, 10)
# products.loc[similar_products]

In [None]:
# q = "coca cola light"
# model.get_sentence_vector(q)
# vector = model.get_sentence_vector(q)