In [1]:
import pandas as pd

In [2]:
#Cargamos el archivo a utilizar 
events = pd.read_csv("data/fiuba-trocafone-tp2-final-set/events_up_to_01062018.csv", low_memory=False)

In [3]:
#Cargamos el archivo a utilizar 
labels = pd.read_csv("data/fiuba-trocafone-tp2-final-set/labels_training_set.csv", low_memory=False)

#Hay mucha cantidad de valores de 0 (no conversiones)

In [4]:
# armo df con registros completos clasificados
train = events.merge(labels, on='person', how='right')

# Funciones útiles

In [5]:
quincenas = ['q1', 'q2', 'q3', 'q4', 'q5', 'q6', 'q7', 'q8', 'q9', 'q10']

def aplicar_por_quincena(train, funcion):
    
    for quincena in quincenas:
        train = funcion(train, quincena)
    return train

# Feature quincenas

In [6]:
train['timestamp'] = pd.to_datetime(events['timestamp'])
train['mes'] = train['timestamp'].dt.month
train['dia'] = train['timestamp'].dt.day

In [7]:
train['q1'] = ((train['mes']==1) & (train['dia']>1) & (train['dia']<16))
train['q2'] = ((train['mes']==1) & (train['dia']>15) & (train['dia']<32))
train['q3'] = ((train['mes']==2) & (train['dia']>1) & (train['dia']<16))
train['q4'] = ((train['mes']==2) & (train['dia']>15) & (train['dia']<29))
train['q5'] = ((train['mes']==3) & (train['dia']>1) & (train['dia']<16))
train['q6'] = ((train['mes']==3) & (train['dia']>15) & (train['dia']<32))
train['q7'] = ((train['mes']==4) & (train['dia']>1) & (train['dia']<16))
train['q8'] = ((train['mes']==4) & (train['dia']>15) & (train['dia']<31))
train['q9'] = ((train['mes']==5) & (train['dia']>1) & (train['dia']<16))
train['q10'] = ((train['mes']==5) & (train['dia']>15) & (train['dia']<32))

# Feature cantidad de eventos por persona

In [8]:
train = train.join(train.groupby('person').event.count(), on='person', rsuffix='_count')

# Feature tiempo entre primer y última entrada al sitio por persona

In [9]:
train = train.join(train.groupby('person').timestamp.agg('max') - train.groupby('person').timestamp.agg('min'), on='person', rsuffix='_total')
train = train.rename(columns={'timestamp_total' : 'tiempo_total'})

In [10]:
train.columns

Index(['timestamp', 'event', 'person', 'url', 'sku', 'model', 'condition',
       'storage', 'color', 'skus', 'search_term', 'staticpage',
       'campaign_source', 'search_engine', 'channel', 'new_vs_returning',
       'city', 'region', 'country', 'device_type', 'screen_resolution',
       'operating_system_version', 'browser_version', 'label', 'mes', 'dia',
       'q1', 'q2', 'q3', 'q4', 'q5', 'q6', 'q7', 'q8', 'q9', 'q10',
       'event_count', 'tiempo_total'],
      dtype='object')

In [11]:
pd.DataFrame(train.tiempo_total.unique()).head(10)

Unnamed: 0,0
0,0 days 00:33:18
1,0 days 00:19:15
2,0 days 00:49:04
3,0 days 00:53:54
4,11 days 09:18:42
5,0 days 01:21:09
6,0 days 19:05:07
7,0 days 00:48:25
8,0 days 00:29:27
9,0 days 01:39:45


# Feature cantidad de events por persona por quincena

In [12]:
def cantidad_eventos_por_persona(train, quincena):
    train = train.join(train.loc[train[quincena] == True].groupby('person').event.count(), on='person', rsuffix=('_count_'+quincena))
    return train

In [13]:
train = (aplicar_por_quincena(train, cantidad_eventos_por_persona))
train.columns

Index(['timestamp', 'event', 'person', 'url', 'sku', 'model', 'condition',
       'storage', 'color', 'skus', 'search_term', 'staticpage',
       'campaign_source', 'search_engine', 'channel', 'new_vs_returning',
       'city', 'region', 'country', 'device_type', 'screen_resolution',
       'operating_system_version', 'browser_version', 'label', 'mes', 'dia',
       'q1', 'q2', 'q3', 'q4', 'q5', 'q6', 'q7', 'q8', 'q9', 'q10',
       'event_count', 'tiempo_total', 'event_count_q1', 'event_count_q2',
       'event_count_q3', 'event_count_q4', 'event_count_q5', 'event_count_q6',
       'event_count_q7', 'event_count_q8', 'event_count_q9',
       'event_count_q10'],
      dtype='object')

# Feature tiempo entre primer y última entrada al sitio en quincena por persona

In [14]:
def tiempo_total_por_persona(train, quincena):
    df_quincena = train.loc[train[quincena] == True]
    df_quincena = train.join(df_quincena.groupby('person').timestamp.agg(lambda x: (x.max() - x.min())), on='person', rsuffix='_total_'+quincena)
    df_quincena = df_quincena.rename(columns={'timestamp_total_'+quincena : 'tiempo_total_'+quincena})
    
    return df_quincena

In [15]:
train = aplicar_por_quincena(train, tiempo_total_por_persona)
train.columns

Index(['timestamp', 'event', 'person', 'url', 'sku', 'model', 'condition',
       'storage', 'color', 'skus', 'search_term', 'staticpage',
       'campaign_source', 'search_engine', 'channel', 'new_vs_returning',
       'city', 'region', 'country', 'device_type', 'screen_resolution',
       'operating_system_version', 'browser_version', 'label', 'mes', 'dia',
       'q1', 'q2', 'q3', 'q4', 'q5', 'q6', 'q7', 'q8', 'q9', 'q10',
       'event_count', 'tiempo_total', 'event_count_q1', 'event_count_q2',
       'event_count_q3', 'event_count_q4', 'event_count_q5', 'event_count_q6',
       'event_count_q7', 'event_count_q8', 'event_count_q9', 'event_count_q10',
       'tiempo_total_q1', 'tiempo_total_q2', 'tiempo_total_q3',
       'tiempo_total_q4', 'tiempo_total_q5', 'tiempo_total_q6',
       'tiempo_total_q7', 'tiempo_total_q8', 'tiempo_total_q9',
       'tiempo_total_q10'],
      dtype='object')

In [16]:
pd.DataFrame(train.tiempo_total_q1.unique()).head(10)

Unnamed: 0,0
0,NaT
1,0 days 23:45:00
2,0 days 17:33:32
3,0 days 22:42:15
4,0 days 22:41:41
5,0 days 05:54:18
6,0 days 03:19:11
7,0 days 01:53:22
8,0 days 22:31:56
9,5 days 18:07:03


# Export a csv

In [17]:
train.to_csv('data/train_with_features.csv', index=False)