In [65]:
# importo librerias
import ast
from datetime import datetime
import pickle
from collections import namedtuple
from functools import wraps
import logging

import pandas as pd
import numpy as np
from scipy.stats import entropy

from meli_pull_apidata import MeliApiClient

In [107]:

logger = logging.getLogger()
logger.setLevel(logging.DEBUG)
logging.info("test")

INFO:root:test


In [67]:
# Decorators

def log_shape(func):
    @wraps(func)
    def wrapper(*args, **kwargs):
        result = func(*args, **kwargs)
        logging.info("%s,%s" % (func.__name__, result.shape))
        return result
    return wrapper

def log_dtypes(func):
    @wraps(func)
    def wrapper(*args, **kwargs):
        result = func(*args, **kwargs)
        logging.info("%s,%s" % (func.__name__, result.dtypes))
        return result
    return wrapper

#### Funciones Útiles

In [82]:
# util functions
def read_dict(name_file):
    with open(name_file, 'rb') as f:
        dic = pickle.load(f)
    return dic

def dict_topandas(dict_file, _type=1):
    return pd.json_normalize(
            [dict_file[k] 
             if dict_file.get(k) else 0 
             for k in dict_file],
            sep='_'
            )
def parse_timestamp(text):
    try:
        return pd.to_datetime(text)
    except ValueError as e:
        return np.nan

def parse_nested_value(vals):
    if vals is not None and vals == vals:
        if type(vals) == list:
            if vals[0].startswith('MLA'):
                return vals[0][4:]
            return vals[0]
        elif type(vals) == str:
            return vals
    else:
        return np.nan
    
    
def time2datetime_converter(col):
    timepart = (df[col].astype(str)
                   .str.replace('\.0$', '')  # NaNs force float dtype
                   .str.pad(4, fillchar='0'))
    return pd.to_datetime(dff[col] + ' ' +
                           timepart.str.slice(0, 2) + ':' +
                           timepart.str.slice(2, 4),
                           errors='coerce')

def parse_boolean(val):
    if val and val == val and type(val) == str:
        if val.lower() == 'true':
            return True
        elif val.lower() == 'false':
            return False
    else:
        return np.nan
    
   
# pandas df functions
@log_shape
@log_dtypes
def general_process_columns(df, columns_nt):
    df = df.copy()
    # dict para mapear nombre de columnas
    colname_mapper = {src:dest for src, dest, _ in columns_nt}
    # convertimos o parseamos las columnas segun corresponda
    for col in columns_nt:
        try:
            df[col.src] = [col.convert(val) 
                          if val == val and val is not None else np.nan 
                          for val in df[col.src]]
        except ValueError as e:
            print(col.src, e)
        except KeyError as ke:
            print(col.src, ke)
        #except TypeError as te:
        #    print(col.src, te)
    return df.rename(columns=colname_mapper)


@log_shape
@log_dtypes
def parse_tags(df, coltag):
    nest = pd.get_dummies(
                df[coltag].apply(pd.Series),
                prefix='attr'
                )
    lcol = nest.columns.tolist()
    scol = set()
    scol.update(lcol)
    for att_col in scol:
        df[att_col] = nest[att_col].T.sum()
                              
    return df

@log_shape
@log_dtypes
def parse_prices(df, col='prices_prices'):
    df = df.copy()
    def count_prom(lines):
        if type(lines) == 'list':
            count = 0
            for line in lines:
                if line.get('type') == 'promotion':
                    count += 1
            return count
        return 0
    df[col] = list(map(count_prom, df[col]))
    return df        



In [89]:
Column = namedtuple('Column', 'src dest convert')
sites_schema = [
    Column('id', 'id', str),
    Column('title', 'pub_title', str),
    Column('seller_registration_date',
           'seller_registration_date', parse_timestamp),
    #Column('seller_tags', 'seller_tags', pd.Categorical),
    Column('seller_seller_reputation_level_id', 
           'seller_reputation_level', parse_nested_value),
    Column('seller_seller_reputation_power_seller_status', 
           'power_seller_status', parse_nested_value),
    #Column('seller_seller_reputation_transactions_completed',
    #       'seller_transactions_completed', int),

    Column('seller_seller_reputation_transactions_ratings_negative', 
           'seller_ratings_negative', int),
    Column('seller_seller_reputation_transactions_ratings_neutral', 
           'seller_ratings_neutral', int),
    Column('seller_seller_reputation_transactions_ratings_positive', 
           'seller_ratings_positive', int),
    Column('seller_seller_reputation_transactions_total', 
           'seller_transactions_total', int),
    Column('seller_seller_reputation_metrics_sales_completed', 
           'seller_sales_completed', int),
    Column('seller_seller_reputation_metrics_claims_value', 
           'seller_claims_value', int),
    Column('seller_seller_reputation_metrics_claims_rate', 
           'seller_claims_rate', float),
    Column('seller_seller_reputation_metrics_cancellations_rate', 
           'seller_cancellations_rate', float),
    Column('seller_seller_reputation_metrics_cancellations_value',
           'seller_cancellations_value', int),
    Column('seller_seller_reputation_metrics_delayed_handling_time_value',
           'seller_delayed_handling_value', int),
    Column('seller_seller_reputation_metrics_delayed_handling_time_rate',
           'seller_delayed_handling_rate', float),
    Column('sale_price', 'sale_price', float),
    Column('buying_mode', 'buying_mode', pd.Categorical),
    Column('listing_type_id', 'listing_type', pd.Categorical),
    Column('condition', 'condition', pd.Categorical),
    #Column('accepts_mercadopago', 'accepts_mercadopago', parse_boolean),
    Column('installments_quantity', 'installments_quantity', int),
]
items_schema = [
    Column('id', 'id', str),
    Column('base_price', 'base_price', float),
    Column('initial_quantity', 'initial_quantity', int), 
    Column('available_quantity', 'available_quantity', int),
    Column('start_time', 'start_time', parse_timestamp),
    Column('stop_time', 'stop_time', parse_timestamp),
    Column('sold_quantity', 'sold_quantity', int), 
    #Column('international_delivery_mode', 'international_delivery_mode', parse_boolean), 
    Column('status', 'status', bool),#parse_boolean),
    # Column('warranty', 'warranty',parse_text) 
    Column('date_created', 'date_created', parse_timestamp),
    #Column('date_updated', 'date_updated', parse_timestamp),
    Column('health', 'health', float),
    #Column('site_id', 'site_id', pd.Categorical),
    Column('price', 'price', float),
    Column('currency_id', 'currency_id', pd.Categorical),
    Column('available_quantity', 'available_quantity', int),
    Column('sold_quantity', 'sold_quantity', int),
    Column('accepts_mercadopago', 'accepts_mercadopago', str), # parse_boolean),
    Column('original_price', 'original_price', float),
    #Column('category_id', 'category_id', pd.Categorical),
    Column('domain_id', 'domain', parse_nested_value),
    Column('catalog_listing', 'catalog_listing', bool),#parse_boolean),
    Column('seller_id', 'seller_id', int),
    Column('shipping_free_shipping', 'shipping_free_shipping', bool),#parse_boolean),
    Column('shipping_mode', 'shipping_mode', parse_nested_value),
    Column('seller_address_state_name', 'seller_address_state_name', parse_nested_value),
    Column('seller_contact_webpage', 'seller_contact_webpage', str)
    ]

In [79]:
list_columns_drop_from_sites = [
    'attributes', 'tags', 'prices_prices'
    ]

In [90]:
sites_column_list = [x.dest for x in sites_schema]
items_column_list = [x.dest for x in items_schema]
master_column_list = sites_column_list + items_column_list
master_column_list

['id',
 'pub_title',
 'seller_registration_date',
 'seller_reputation_level',
 'power_seller_status',
 'seller_ratings_negative',
 'seller_ratings_neutral',
 'seller_ratings_positive',
 'seller_transactions_total',
 'seller_sales_completed',
 'seller_claims_value',
 'seller_claims_rate',
 'seller_cancellations_rate',
 'seller_cancellations_value',
 'seller_delayed_handling_value',
 'seller_delayed_handling_rate',
 'sale_price',
 'buying_mode',
 'listing_type',
 'condition',
 'installments_quantity',
 'id',
 'base_price',
 'initial_quantity',
 'available_quantity',
 'start_time',
 'stop_time',
 'sold_quantity',
 'status',
 'date_created',
 'health',
 'price',
 'currency_id',
 'available_quantity',
 'sold_quantity',
 'accepts_mercadopago',
 'original_price',
 'domain',
 'catalog_listing',
 'seller_id',
 'shipping_free_shipping',
 'shipping_mode',
 'seller_address_state_name',
 'seller_contact_webpage']

Resultados de la api en local - no en repo -

In [72]:
ls results/*

results/meli_items_api_data.p
results/meli_items_metrics_api_data_new.p
results/meli_items_metrics_api_data.p
results/meli_sites_api_data.p
results/meli_users_metrics_api_data_new.p
results/meli_users_metrics_api_data.p

results/old:
meli_items_api_data.p  meli_sites_api_data_2021-03-07T22_22_38.p


In [73]:
# A DF
sites = read_dict('results/meli_sites_api_data.p')
items = read_dict('results/meli_items_api_data.p')
# Quedan asi para ser integrados en la construccion #build 
item_metrics = read_dict('results/meli_items_metrics_api_data.p')
user_metrics = read_dict('results/meli_users_metrics_api_data.p')
len(sites), len(items), len(item_metrics), len(user_metrics)

(28059, 28059, 6690, 2686)

### Build Dataset

In [74]:
sites_df = dict_topandas(sites)
items_df = dict_topandas(items)

In [75]:
id2attr = {_id: len(d) 
           for _id, d 
           in zip(items_df['id'], 
                  items_df['attributes'])}

In [97]:
df = (sites_df
      .pipe(parse_tags, 'tags')
      .pipe(parse_prices, 'prices_prices')
      .pipe(general_process_columns, sites_schema)
      .drop(list_columns_drop_from_sites, axis=1)
      .merge(items_df.pipe(
                  general_process_columns,
                  items_schema),
             on='id', how='inner', suffixes=("_remove", None))
      # seleccionamos las columnas
      [master_column_list]
      .assign(n_attributes=list(
          map(lambda x: id2attr.get(x, 0), items_df.id))
          )
      .assign(item_visits=list(
          map(lambda x: item_metrics.get(x, 0), items_df.id))
          )
      .assign(user_visits=list(
          map(lambda x: user_metrics.get(x, 0), items_df.seller_id))
          )
      .drop('seller_id', axis=1)
      #.drop('id', axis=1)
      #.assign(item_id=df.id.values[:,0])
      #.drop('id', axis=1)
      )
df.head()

Unnamed: 0,id,pub_title,seller_registration_date,seller_reputation_level,power_seller_status,seller_ratings_negative,seller_ratings_neutral,seller_ratings_positive,seller_transactions_total,seller_sales_completed,...,original_price,domain,catalog_listing,shipping_free_shipping,shipping_mode,seller_address_state_name,seller_contact_webpage,n_attributes,item_visits,user_visits
0,MLA871657598,Smart Tv Rca X55andtv Led 4k 55 100v/240v,2016-10-04 15:46:28-04:00,5_green,platinum,0.0,0.0,0.0,12283.0,1965.0,...,,MLA-TELEVISIONS,True,False,not_specified,Buenos Aires,,30,34,438015
1,MLA885144534,Smart Tv Philips 6600 Series 50pud6654/77 Led ...,2016-10-04 15:46:28-04:00,5_green,platinum,0.0,0.0,0.0,12283.0,1965.0,...,,MLA-TELEVISIONS,True,True,me2,Buenos Aires,,33,352,438015
2,MLA910600162,Smart Tv Samsung Series 7 Un43tu7000gczb Led 4...,2013-07-31 08:14:50-04:00,3_yellow,,0.0,0.0,0.0,86020.0,12295.0,...,,MLA-TELEVISIONS,True,False,custom,Capital Federal,,36,1071,1369576
3,MLA895877679,Tv Box Noga Pc Ultra Estándar 4k 8gb Negro C...,2002-04-13 02:00:00-04:00,5_green,platinum,0.0,0.0,0.0,19134.0,2956.0,...,,MLA-STREAMING_MEDIA_DEVICES,True,True,me2,Buenos Aires,,29,1528,116539
4,MLA877436109,Smart Tv Tedge Ntv504k Led 4k 50 220v,2019-03-19 08:41:57-04:00,5_green,platinum,0.0,0.0,0.0,367677.0,60652.0,...,48999.0,MLA-TELEVISIONS,True,True,me2,Buenos Aires,,58,44250,3444269


In [101]:
# retoques mas
#df['domain'] = list(map(lambda x:x[4:], df.domain))
df['seller_reputation_level'] = list(x.split('_')[0] if x==x else np.nan for x in df.seller_reputation_level)
df['domain'] = list(x[4:] if x==x else np.nan for x in df.domain)
#df['item_id'] = df.id.values[:,0]
df.drop('id', axis=1)
df.head()

Unnamed: 0,id,pub_title,seller_registration_date,seller_reputation_level,power_seller_status,seller_ratings_negative,seller_ratings_neutral,seller_ratings_positive,seller_transactions_total,seller_sales_completed,...,original_price,domain,catalog_listing,shipping_free_shipping,shipping_mode,seller_address_state_name,seller_contact_webpage,n_attributes,item_visits,user_visits
0,MLA871657598,Smart Tv Rca X55andtv Led 4k 55 100v/240v,2016-10-04 15:46:28-04:00,5,platinum,0.0,0.0,0.0,12283.0,1965.0,...,,TELEVISIONS,True,False,not_specified,Buenos Aires,,30,34,438015
1,MLA885144534,Smart Tv Philips 6600 Series 50pud6654/77 Led ...,2016-10-04 15:46:28-04:00,5,platinum,0.0,0.0,0.0,12283.0,1965.0,...,,TELEVISIONS,True,True,me2,Buenos Aires,,33,352,438015
2,MLA910600162,Smart Tv Samsung Series 7 Un43tu7000gczb Led 4...,2013-07-31 08:14:50-04:00,3,,0.0,0.0,0.0,86020.0,12295.0,...,,TELEVISIONS,True,False,custom,Capital Federal,,36,1071,1369576
3,MLA895877679,Tv Box Noga Pc Ultra Estándar 4k 8gb Negro C...,2002-04-13 02:00:00-04:00,5,platinum,0.0,0.0,0.0,19134.0,2956.0,...,,STREAMING_MEDIA_DEVICES,True,True,me2,Buenos Aires,,29,1528,116539
4,MLA877436109,Smart Tv Tedge Ntv504k Led 4k 50 220v,2019-03-19 08:41:57-04:00,5,platinum,0.0,0.0,0.0,367677.0,60652.0,...,48999.0,TELEVISIONS,True,True,me2,Buenos Aires,,58,44250,3444269


In [105]:
df.to_pickle('master_1.df')