In [40]:
import pandas as pd
import matplotlib as plt
import numpy as np

# Finger 1
Extraemos cierta información sobre el conjunto de datos.

In [42]:
# Show up to 25 columns when a DataFrame is rendered.
pd.options.display.max_columns = 25

In [43]:
# Every column listed here is mapped to the pandas 'category' dtype.
tipos_de_dato = {
    columna: 'category'
    for columna in (
        'event', 'url', 'model', 'condition', 'storage', 'color',
        'staticpage', 'search_engine', 'channel', 'new_vs_returning',
        'city', 'region', 'country', 'device_type',
        'operating_system_version', 'browser_version',
    )
}
# Load the events file, applying the dtype mapping held in tipos_de_dato.
df = pd.read_csv(
    '../../fiuba-trocafone-tp1-final-set/events.csv',
    dtype=tipos_de_dato,
    low_memory=False,
)

## Clasifico los datos
Espero que las siguientes columnas sean categóricas: event, url, model, condition, storage, color, ...
person: identificador del usuario que visita la página. Parece ser un número en hexadecimal. Podríamos pasarlo a binario o a decimal, pero no ganaríamos nada, dado que no resulta útil calcular promedios, varianzas, etc. sobre un identificador.
sku: identificador de producto relacionado al evento. Numérico.

In [44]:
# Preview the first five rows of the loaded events.
df.head(n=5)

Unnamed: 0,timestamp,event,person,url,sku,model,condition,storage,color,skus,search_term,staticpage,campaign_source,search_engine,channel,new_vs_returning,city,region,country,device_type,screen_resolution,operating_system_version,browser_version
0,2018-05-31 23:38:05,ad campaign hit,0004b0a2,/comprar/iphone/iphone-5s,,,,,,,,,criteo,,,,,,,,,,
1,2018-05-31 23:38:05,visited site,0004b0a2,,,,,,,,,,,,Paid,New,Camaragibe,Pernambuco,Brazil,Smartphone,360x640,Android 6,Chrome Mobile 39
2,2018-05-31 23:38:09,viewed product,0004b0a2,,2694.0,iPhone 5s,Bom,32GB,Cinza espacial,,,,,,,,,,,,,,
3,2018-05-31 23:38:40,checkout,0004b0a2,,2694.0,iPhone 5s,Bom,32GB,Cinza espacial,,,,,,,,,,,,,,
4,2018-05-29 13:29:25,viewed product,0006a21a,,15338.0,Samsung Galaxy S8,Bom,64GB,Dourado,,,,,,,,,,,,,,


### Timestamp

In [6]:
# True when no entry of the timestamp column is null.
df['timestamp'].notna().all()

No hay nulos en la columna de timestamp, por lo que la transformamos a fecha.

In [54]:
# Replace the timestamp column with its datetime-parsed version.
df['timestamp'] = df['timestamp'].pipe(pd.to_datetime)

### Sku
Los valores de 'sku' son principalmente enteros, pero dado que varias entradas tienen el valor NaN, se los tratará como floats. Sin embargo, al tratar de convertir los datos a float se encontraron valores que aparecen como 'undefined'.

In [49]:
# Probe whether the whole sku column can be parsed as numbers; the parsed
# result is discarded, only the failure is reported.
try:
    df['sku'].pipe(pd.to_numeric)
except ValueError:
    print('No todos los valores de sku son numericos!')

# Keep the rows whose sku is the literal string 'undefined' and display them.
sku_no_definido = df.loc[df['sku'] == 'undefined']
sku_no_definido

No todos los valores de sku son numericos!


Unnamed: 0,timestamp,event,person,url,sku,model,condition,storage,color,skus,search_term,staticpage,campaign_source,search_engine,channel,new_vs_returning,city,region,country,device_type,screen_resolution,operating_system_version,browser_version
368097,2018-05-22 13:53:14,checkout,602b3649,,undefined,,,,,,,,,,,,,,,,,,
387149,2018-05-22 13:44:53,checkout,655402b0,,undefined,,,,,,,,,,,,,,,,,,


Se ve que son solo dos y que casi todas sus columnas son nulas (solo tienen valor timestamp, event, person y sku), así que sospechamos que son eventos que no sirven realmente. Para confirmarlo podemos buscar otros eventos de dichas personas y ver si hay otro en un timestamp cercano y con datos en más columnas.

In [50]:
# Users that own at least one event whose sku is 'undefined'.
personas = df.loc[df['sku'] == 'undefined', 'person']
# Display every event recorded for those users.
df.loc[df['person'].isin(personas)]

Unnamed: 0,timestamp,event,person,url,sku,model,condition,storage,color,skus,search_term,staticpage,campaign_source,search_engine,channel,new_vs_returning,city,region,country,device_type,screen_resolution,operating_system_version,browser_version
368096,2018-05-22 13:53:02,visited site,602b3649,,,,,,,,,,,,Direct,New,Unknown,Unknown,Brazil,Computer,1366x768,Windows 7,Chrome 66.0
368097,2018-05-22 13:53:14,checkout,602b3649,,undefined,,,,,,,,,,,,,,,,,,
387149,2018-05-22 13:44:53,checkout,655402b0,,undefined,,,,,,,,,,,,,,,,,,
387150,2018-05-22 13:44:53,visited site,655402b0,,,,,,,,,,,,Direct,New,Unknown,Unknown,Brazil,Computer,1366x768,Windows 7,Chrome 66.0


Se ve que en ambos casos existe otro evento de tipo 'visited site' a pocos segundos del evento de 'checkout', por lo cual podemos descartarlos sabiendo que existe otro con más información que representa la visita del usuario.


In [9]:
# Remove the rows collected in sku_no_definido (sku == 'undefined').
# Rebinding df with errors='ignore' instead of inplace=True keeps this cell
# re-runnable: labels that were already removed no longer raise KeyError.
df = df.drop(index=sku_no_definido.index, errors='ignore')

Ahora podemos transformar el 'sku' a un tipo numérico.

In [10]:
# Replace the sku column with its numeric-parsed version.
df['sku'] = df['sku'].pipe(pd.to_numeric)

## Moda de los eventos

In [20]:
# Most frequent value(s) of the event column.
df.loc[:, 'event'].mode()

0    viewed product
Name: event, dtype: category
Categories (11, object): [ad campaign hit, brand listing, checkout, conversion, ..., searched products, staticpage, viewed product, visited site]

In [33]:
# The 5 products (identified by sku) with the most conversion events:
# keep only conversions, count rows per sku, and rank by the timestamp count.
(
    df.loc[df['event'] == 'conversion']
    .groupby('sku')
    .count()
    .sort_values('timestamp', ascending=False)
    .head(5)
)

Unnamed: 0_level_0,timestamp,event,person,url,model,condition,storage,color,skus,search_term,staticpage,campaign_source,search_engine,channel,new_vs_returning,city,region,country,device_type,screen_resolution,operating_system_version,browser_version
sku,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
6371.0,30,30,30,0,30,30,30,30,0,0,0,0,0,0,0,0,0,0,0,0,0,0
6370.0,18,18,18,0,18,18,18,18,0,0,0,0,0,0,0,0,0,0,0,0,0,0
7631.0,17,17,17,0,17,17,17,17,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2692.0,16,16,16,0,16,16,16,16,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3371.0,14,14,14,0,14,14,14,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [37]:
# The 10 users with the most recorded activity (i.e. the most events):
# count rows per person and rank by the timestamp count.
# head(10) replaces the previous iloc[0:11], which selected 11 rows.
(
    df.groupby('person')
    .count()
    .sort_values('timestamp', ascending=False)
    .head(10)
)

Unnamed: 0_level_0,timestamp,event,url,sku,model,condition,storage,color,skus,search_term,staticpage,campaign_source,search_engine,channel,new_vs_returning,city,region,country,device_type,screen_resolution,operating_system_version,browser_version
person,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
71492f2b,2771,2771,61,1367,1368,1367,1367,1367,1232,15,4,61,0,103,103,103,103,103,103,103,103,103
6b07be7b,2770,2770,53,1137,1137,1137,1137,1137,1269,71,14,59,6,263,263,263,263,263,263,263,263,263
285101a1,2140,2140,74,1085,1085,1085,1085,1085,906,7,0,74,12,61,61,61,61,61,61,61,61,61
3e34b3cf,1877,1877,4,1497,1497,1497,1497,1497,236,57,1,4,0,139,139,139,139,139,139,139,139,139
2d5a84c1,1800,1800,61,1528,1528,1528,1528,1528,63,6,0,61,6,142,142,142,142,142,142,142,142,142
5f25cb5d,1797,1797,52,686,687,686,686,686,833,12,32,53,37,151,151,151,151,151,151,151,151,151
5af7e2bc,1773,1773,220,734,735,734,734,734,647,23,7,220,2,157,157,157,157,157,157,157,157,157
3b5a5833,1705,1705,33,1403,1405,1403,1403,1403,73,0,1,33,25,168,168,168,168,168,168,168,168,168
d7e60792,1352,1352,7,1213,1213,1213,1213,1213,41,1,3,7,8,80,80,80,80,80,80,80,80,80
cd6e0b8d,1254,1254,19,1181,1181,1181,1181,1181,28,1,0,19,3,23,23,23,23,23,23,23,23,23
