In [2]:
#cargo paquetes
import pandas as pd
import requests
import numpy as np
from datetime import datetime
import re

## Extraigo datos a partir de las APIs de desarrolladores
https://developers.mercadolibre.com.ar/es_ar/api-docs-es

#### Traigo las categorías generales

In [None]:
# General categories endpoint of the API (site MLA, as given in the URL path)
URL = "https://api.mercadolibre.com/sites/MLA/categories"

In [3]:
# Send the request and keep the response.
# The timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() fails fast on an HTTP error instead of letting an error
# payload be treated as the category list further down.
r = requests.get(url=URL, timeout=30)
r.raise_for_status()

# Parse the JSON body holding the general categories
categories = r.json()

#### Traigo las categorias (children categories)

In [None]:
# Parameters for the per-category lookup.
# NOTE: URL is rebound here; the cells above that read URL must run before this one.
URL = "https://api.mercadolibre.com/categories/"
# Keys read from each category dict
category_id_key = 'id'
category_name_key= 'name'
# Key of the category response that holds the child categories
subcats_key = 'children_categories'
# Output file for the child-categories table
categories_dataset_filename = 'categories.csv'


In [None]:
# Fetch the child categories of every general category
children_categories = []

for c in categories:
    category_id = c.get(category_id_key)
    category_name = c.get(category_name_key)
    # Timeout so a stalled connection cannot block the loop indefinitely
    r = requests.get(url=URL + category_id, timeout=30)
    # Report an HTTP failure as such, instead of as an opaque KeyError on the
    # subcategory key of an error payload
    r.raise_for_status()
    c_info = r.json()[subcats_key]
    for ci in c_info:
        # Tag every child with the id and name of the category it came from
        ci['mother_id'] = category_id
        ci['mother_name'] = category_name
        children_categories.append(ci)

# Build the DataFrame and save it
children_categories_df = pd.DataFrame(children_categories)
children_categories_df.to_csv(categories_dataset_filename)

#### Traigo items ids por categoria

In [None]:
# Parameters for the item search.
# NOTE: URL is rebound again here, now pointing at the search endpoint.
URL = "https://api.mercadolibre.com/sites/MLA/search"

# Output file for the (item id, category id) table
items_category_dataset_filename = 'items_categoria.csv'

In [None]:
# Fetch the item ids of every child category, up to MAX_ITEMS_PER_CATEGORY each
items = []

# Upper bound of offsets requested per category (same 1000 cap as before)
MAX_ITEMS_PER_CATEGORY = 1000
# Offset step between requests; presumably the API's default page size — TODO confirm
PAGE_SIZE = 50

for cc in children_categories:
    category_id = cc.get('id')
    # A category without a count is treated as empty instead of failing on a
    # None-vs-int comparison
    n_items = cc.get('total_items_in_this_category') or 0
    # A single loop covers both the "more than the cap" and the "fewer" case
    for offset in range(0, min(n_items, MAX_ITEMS_PER_CATEGORY), PAGE_SIZE):
        # Query-string parameters sent to the search endpoint
        PARAMS = {'category': category_id, 'offset': offset}
        # Timeout so a stalled connection cannot block the whole crawl
        r = requests.get(url=URL, params=PARAMS, timeout=30)
        data = r.json()
        # Responses without a 'results' entry are skipped, as before
        items_ = data.get('results', None)
        if items_ is not None:
            # Distinct name for the inner loop so it does not shadow the paging variable
            for item in items_:
                items.append({'id': item.get('id', None), 'categoria_id': category_id})

# Build the DataFrame and save it
items_categoria = pd.DataFrame(items)
items_categoria.to_csv(items_category_dataset_filename)

#### Traigo data de items a partir de sus ids

In [None]:
# Parameters for the item-detail download.
# NOTE: URL is rebound again here, now pointing at the items endpoint.
URL = 'https://api.mercadolibre.com/items'

# Output file (pickle) for the item bodies
items_dataset_filename = 'items_body.pkl'

In [None]:
# Successful multiget responses (one list per request)
items2 = []
# Batches of ids whose request failed or returned an unexpected payload
items_no_encontrados = []

n_items = items_categoria.shape[0]

# Walk the ids in chunks of 20, the multiget limit of the items API
for i in range(0, n_items, 20):

    # Keep only real string ids: str.join raises TypeError on None/NaN entries
    ids = [item_id for item_id in items_categoria.id[i:i+20] if isinstance(item_id, str)]
    if not ids:
        continue
    PARAMS = {'ids': ",".join(ids)}
    try:
        r = requests.get(url=URL, params=PARAMS, timeout=30)
        # Parse the JSON body
        items_2 = r.json()
    except (requests.RequestException, ValueError):
        # Network failure, timeout or non-JSON body: record the batch and move on.
        # Unlike a bare except, this lets KeyboardInterrupt and real bugs through.
        items_no_encontrados.append(ids)
        continue

    if isinstance(items_2, list):
        items2.append(items_2)
    else:
        # A non-list payload (e.g. an error object) would corrupt the flattening
        # step that iterates every response as a list of entries
        items_no_encontrados.append(ids)
        


In [None]:
# Flatten the per-request lists produced by the download loop into one list
flat_items = []
for respuesta in items2:
    flat_items.extend(respuesta)

# One row per multiget entry
items_df = pd.DataFrame(flat_items)

# Each entry also carries a status code; keep only the body of the code-200 ones
solo_ok = items_df.code.eq(200)
items_body = pd.DataFrame(items_df[solo_ok].body.tolist())

# Persist the DataFrame
items_body.to_pickle(items_dataset_filename)

## Generación de dataset

In [None]:
# Parameter: output file of the final feature dataset
datasetfinal_filename= 'items_featselected.csv.gz'

In [None]:
# Keep a single row per item id in both tables
items_categoria = items_categoria.drop_duplicates(subset=['id'])
items_body = items_body.drop_duplicates(subset=['id'])

# Join on the item id to bring in the category id that matches the categories table;
# validate makes the merge fail if ids are not unique on both sides
items = pd.merge(
    items_body,
    items_categoria,
    on='id',
    how='inner',
    validate='one_to_one',
)

### Generación de features

funciones auxiliares

In [None]:
# Helper: read one key from a field that holds a dictionary
def traer_dictvalue(diccionario, key):
    """Return the value stored under ``key`` in ``diccionario``.

    Parameters
    ----------
    diccionario : dict or missing value
        Cell content, normally a dict; None/NaN when the field is absent.
    key : hashable
        Key to look up.

    Returns
    -------
    The value for ``key``, or None when the key is missing or the cell does
    not hold a dict (None, NaN, lists, strings, ...).
    """
    # Checking the type directly avoids pd.isnull on list-likes (which returns an
    # array with an ambiguous truth value) and AttributeError on non-dict values.
    if not isinstance(diccionario, dict):
        return None
    return diccionario.get(key)

# Helper: difference between each value and the mean of its category
def diff_mean_cat(x, categorias=None):
    """Return ``x`` minus the mean of ``x`` within each category.

    Parameters
    ----------
    x : pandas.Series
        Values to centre.
    categorias : pandas.Series, optional
        Group labels aligned with ``x``. When omitted, the ``categoria_id``
        column of the notebook-level ``items`` DataFrame is used, which is the
        original behaviour.

    Returns
    -------
    pandas.Series
        Same index as ``x``; each value minus its group mean.
    """
    if categorias is None:
        # Default kept for existing callers; relies on the global `items`
        categorias = items['categoria_id']
    promedio_categoria = x.groupby(categorias).transform('mean')
    return x - promedio_categoria


# Helper: number of days between a date (given as a string) and today
def daydiff_today(fecha, hoy=None):
    """Return the signed number of calendar days from today to ``fecha``.

    Parameters
    ----------
    fecha : str or missing value
        Timestamp string starting with ``YYYY-MM-DD``; anything from the first
        ``T`` onwards is discarded.
    hoy : datetime.date or datetime.datetime, optional
        Reference day. Defaults to the current local date. Passing it
        explicitly makes the result reproducible across runs.

    Returns
    -------
    int or None
        Negative for past dates, 0 for today, positive for future dates;
        None when ``fecha`` is missing.
    """
    if pd.isnull(fecha):
        return None

    if hoy is None:
        hoy = datetime.today()
    # datetime is a subclass of date, so this reduces either input to a plain date
    if isinstance(hoy, datetime):
        hoy = hoy.date()

    fecha_date = datetime.strptime(re.sub('T.*$', '', fecha), '%Y-%m-%d').date()
    # Compare calendar dates only. Subtracting a "now" that carries the time of
    # day from a midnight date made .days floor to one day less than the real
    # difference (e.g. today -> -1) for almost any execution time.
    return (fecha_date - hoy).days
    
# Helper: length of a list-valued field
def largo_lista(lista):
    """Return ``len(lista)``, or 0 when the value has no length.

    Missing cells (None, NaN) have no ``len`` and count as 0.
    """
    try:
        return len(lista)
    except TypeError:
        # Only "object has no len()" is expected here; a bare except would also
        # hide interrupts and unrelated errors.
        return 0
    
# Helper: flag whether a value is present in a list-valued field
def valor_en_lista(lista, valor):
    """Return 1 if ``valor in lista`` holds, else 0.

    Values that do not support membership tests (None, NaN) yield 0.
    """
    try:
        return int(valor in lista)
    except TypeError:
        # Only "argument is not iterable" is expected here; a bare except would
        # also hide interrupts and unrelated errors.
        return 0
            

In [None]:
# Flag fields.
# Series.notna() treats both NaN and None as missing, so the flags also work when
# a column is object dtype (e.g. every value is None); np.isnan raises TypeError there.
items['flg_descuento'] = items['original_price'].notna().astype(int)
items['flg_official_store'] = items['official_store_id'].notna().astype(int)
items['flg_brand_verified'] = items.tags.apply(lambda x: valor_en_lista(x, 'brand_verified'))


# Date-related fields: day offsets computed by daydiff_today
items['daydiff_stoptime'] = items.stop_time.apply(daydiff_today)
items['daydiff_starttime'] = items.start_time.apply(daydiff_today)
items['daydiff_update'] = items.last_updated.apply(daydiff_today)


# Fields pulled out of nested dictionaries
# seller_address -> 'state' -> 'name'
items['provincia'] = items.seller_address.apply(traer_dictvalue, key='state').apply(traer_dictvalue, key='name')
items['freeshipping'] = items.shipping.apply(traer_dictvalue, key='free_shipping')


# Quantity-related fields
items['diff_intquan_avlquan'] = items['initial_quantity'] - items['available_quantity']
items['pct_avlquan_intquan'] = np.divide(items['available_quantity'], items['initial_quantity'])



# Other fields
# Number of pictures
items['n_pictures'] = items.pictures.apply(largo_lista)
# Number of attributes
items['n_atribbutes'] = items.attributes.apply(largo_lista)
# Discount amount; original_price is coerced to numeric first so that a column
# made only of None values subtracts as NaN instead of raising
items['valor_descuento'] = pd.to_numeric(items['original_price']) - items['price']


# Fields relative to the item's category (value minus the category mean)
items['diff_mean_cat_avlquan'] = diff_mean_cat(items['available_quantity'])
items['diff_mean_cat_iniquan'] = diff_mean_cat(items['initial_quantity'])
items['diff_mean_cat_pct_avliniquan'] = diff_mean_cat(items['pct_avlquan_intquan'])
items['diff_mean_cat_diff_iniavlquan'] = diff_mean_cat(items['diff_intquan_avlquan'])
items['diff_mean_cat_price'] = diff_mean_cat(items['price'])


In [None]:
# Columns kept in the final dataset, in output order
campos_de_interes = [
    # raw item fields
    'id', 'price', 'valor_descuento',
    'initial_quantity', 'available_quantity', 'sold_quantity',
    'listing_type_id', 'condition', 'status', 'categoria_id',
    # flags
    'flg_descuento', 'flg_official_store', 'flg_brand_verified',
    # date offsets
    'daydiff_stoptime', 'daydiff_starttime', 'daydiff_update',
    # nested-dictionary fields
    'provincia', 'freeshipping',
    # quantities and counts
    'diff_intquan_avlquan', 'pct_avlquan_intquan',
    'n_pictures', 'n_atribbutes',
    # category-relative fields
    'diff_mean_cat_avlquan', 'diff_mean_cat_iniquan',
    'diff_mean_cat_pct_avliniquan', 'diff_mean_cat_diff_iniavlquan',
    'diff_mean_cat_price',
]

# Select those columns and write the file, without the index column
items_featureselected = items.loc[:, campos_de_interes]
items_featureselected.to_csv(datasetfinal_filename, index=False)

Posible mejora: traer el dato de cantidad de preguntas por producto.