In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
%matplotlib inline

# Colormaps kept for later plots: the built-in red-blue map, and a two-colour
# map made of pure red and pure blue.
cm = plt.cm.RdBu
cm_bright = ListedColormap(colors=['#FF0000', '#0000FF'])

import warnings
warnings.filterwarnings('ignore')

pd.set_option('display.max_rows', None) # para mostrar todas las filas
pd.set_option('display.max_columns', None) # para mostrar todas las columnas

In [2]:
# Load the January 2020 event log. The file is comma-separated with '.' as the
# decimal mark, which are the read_csv defaults.
df_jan = pd.read_csv('./2020-Jan.csv')

In [3]:
# Preview the first five rows, transposed so that each column reads as a row.
df_jan.head(5).transpose()

Unnamed: 0,0,1,2,3,4
event_time,2020-01-01 00:00:00 UTC,2020-01-01 00:00:01 UTC,2020-01-01 00:00:01 UTC,2020-01-01 00:00:01 UTC,2020-01-01 00:00:02 UTC
event_type,view,view,view,view,view
product_id,1005073,1005192,100063693,5100816,100014325
category_id,2232732093077520756,2232732093077520756,2053013552427434207,2232732103831716449,2232732103294845523
category_code,construction.tools.light,construction.tools.light,apparel.shirt,apparel.shoes,apparel.shoes.step_ins
brand,samsung,meizu,turtle,xiaomi,intel
price,1130.02,205.67,136.43,29.95,167.2
user_id,519698804,527767423,519046195,518269232,587748686
user_session,69b5d72f-fd6e-4fed-aa23-1286b2ca89a0,7f596032-ccbf-4643-9bad-e36a209512b4,d1e2f343-84bb-49bd-b13d-ca0f1ed9910e,0444841c-38ef-410c-b11f-7b35ea4e5991,31b7d4cf-dfac-4895-9927-90fa3254f860


In [4]:
# Number of distinct values in each column.
# dropna=False counts NaN as a value of its own, matching len(x.unique()).
df_jan.nunique(dropna=False)

event_time        2641539
event_type              3
product_id         227608
category_id          1201
category_code         136
brand                5084
price               73596
user_id           4385985
user_session     13847855
dtype: int64

In [5]:
## user_session looks like a bare session identifier, so the column should probably be dropped.
## Note the gap in unique values between user_id (4,385,985) and user_session (13,847,855): on average there are several sessions per user.

In [6]:
## Open question: why are there more distinct category_id values (1,201) than category_code values (136)? How do the two differ?
## product_id does not identify any specific product for us either; it looks like an internal store identifier.

In [7]:
# Identifier-like columns that will be removed from the frame.
drop_columns = [
    'user_session',
    'category_id',
    'product_id',
]

In [8]:
# Remove the columns listed in drop_columns.
# Rebinding the name (instead of dropping in place) together with
# errors='ignore' keeps this cell safe to re-run: once the columns are gone, a
# second execution is a no-op rather than a KeyError.
df_jan = df_jan.drop(columns=drop_columns, errors='ignore')

In [9]:
# Preview the first five rows again, transposed, to check the remaining columns.
df_jan.head(5).transpose()

Unnamed: 0,0,1,2,3,4
event_time,2020-01-01 00:00:00 UTC,2020-01-01 00:00:01 UTC,2020-01-01 00:00:01 UTC,2020-01-01 00:00:01 UTC,2020-01-01 00:00:02 UTC
event_type,view,view,view,view,view
category_code,construction.tools.light,construction.tools.light,apparel.shirt,apparel.shoes,apparel.shoes.step_ins
brand,samsung,meizu,turtle,xiaomi,intel
price,1130.02,205.67,136.43,29.95,167.2
user_id,519698804,527767423,519046195,518269232,587748686


In [10]:
# Keep only the rows whose category_code is exactly 'electronics.smartphone'.
df_jan_el = df_jan[df_jan['category_code'].eq('electronics.smartphone')]

In [11]:
# Report the total number of rows, then show how many NaN values each column has.
print(f'El dataframe tiene un total de {len(df_jan_el)} filas.')
df_jan_el.isna().sum()

El dataframe tiene un total de 446779 filas.


event_time          0
event_type          0
category_code       0
brand            2024
price               0
user_id             0
dtype: int64

In [12]:
# Distinct brand values present in the smartphone subset (NaN included).
df_jan_el.loc[:, 'brand'].unique()

array(['huawei', 'lg', 'xiaomi', 'samsung', 'apple', nan, 'deppa', 'oppo',
       'karya', 'nokia', 'honor', 'remax', 'sony', 'prestigio', 'hiper',
       'meizu', 'bq', 'haier', 'x-level', 'vivo', 'doogee', 'vega',
       'a-case', 'usams', 'irbis', 'google', 'nillkin', 'fini', 'vertex',
       'asus', 'tecno', 'oneplus', 'takeit', 'moshi', 'texet', 'tp-link',
       'dub', 'mujjo', 'petek', 'goodloot', 'jinga', 'gionee', 'lego',
       'yotrix', 'motorola', 'zte', 'inoi', 'blackberry', 'blackview',
       'htc', 'iwalk', 'fly', 'hoco', 'micromax', 'toto', 'janesper',
       'rapid', 'kajsa', 'denzel', 'fubag', 'ark', 'rivacase', 'wuw',
       'ubear', 'keneksi'], dtype=object)

In [13]:
# Discard every row that has a missing value in any column.
df_jan_clean = df_jan_el.dropna(axis='index', how='any')

In [14]:
# Preview the first five cleaned rows, transposed so each column reads as a row.
df_jan_clean.head(5).transpose()

Unnamed: 0,22,48,148,197,325
event_time,2020-01-01 00:00:05 UTC,2020-01-01 00:00:12 UTC,2020-01-01 00:00:41 UTC,2020-01-01 00:00:55 UTC,2020-01-01 00:01:32 UTC
event_type,view,view,view,view,view
category_code,electronics.smartphone,electronics.smartphone,electronics.smartphone,electronics.smartphone,electronics.smartphone
brand,huawei,lg,xiaomi,xiaomi,lg
price,149.3,205.9,192.26,192.26,205.9
user_id,517014550,536578949,532166853,532166853,536578949


In [15]:
# The categories and brands should probably be encoded so that a clearer correlation matrix can be built
# and outliers can be located more easily.

In [16]:
# One-hot encode the event type and the brand, one indicator column per value.
dummies_event, dummies_brand = (
    pd.get_dummies(df_jan_clean[column]) for column in ('event_type', 'brand')
)

In [18]:
# Append the indicator columns to the cleaned frame, then remove the two
# original categorical columns they replace.
df_dummies = (
    pd.concat([df_jan_clean, dummies_event, dummies_brand], axis=1)
    .drop(columns=['event_type', 'brand'])
)

In [19]:
# Preview the first five encoded rows, transposed so each column reads as a row.
df_dummies.head(5).transpose()

Unnamed: 0,22,48,148,197,325
event_time,2020-01-01 00:00:05 UTC,2020-01-01 00:00:12 UTC,2020-01-01 00:00:41 UTC,2020-01-01 00:00:55 UTC,2020-01-01 00:01:32 UTC
category_code,electronics.smartphone,electronics.smartphone,electronics.smartphone,electronics.smartphone,electronics.smartphone
price,149.3,205.9,192.26,192.26,205.9
user_id,517014550,536578949,532166853,532166853,536578949
cart,0,0,0,0,0
purchase,0,0,0,0,0
view,1,1,1,1,1
a-case,0,0,0,0,0
apple,0,0,0,0,0
ark,0,0,0,0,0


In [None]:
# Pair-plot of price against the event-type indicators.
#
# Calling scatter_matrix on the whole of df_dummies would draw one panel per
# pair of numeric columns (price, user_id and every event/brand indicator)
# using every row: thousands of axes over hundreds of thousands of points,
# which does not finish in practice. Restrict the plot to price and the
# event indicators, and to a reproducible random sample of rows.
plot_columns = ['price', *dummies_event.columns]
sample_size = min(len(df_dummies), 5_000)
plot_sample = (
    df_dummies[plot_columns]
    .astype(float)  # indicator columns may be bool/uint8; the KDE needs floats
    .sample(n=sample_size, random_state=42)
)
# A column that is constant within the sample has zero variance, which makes
# the KDE on the diagonal fail, so such columns are left out.
plot_sample = plot_sample.loc[:, plot_sample.nunique() > 1]

pd.plotting.scatter_matrix(plot_sample, alpha=0.2, figsize=(20, 20), diagonal='kde')
plt.show()