In [1]:
import pandas as pd

In [2]:
# Load the raw events file that every feature below is derived from
events = pd.read_csv("data/fiuba-trocafone-tp2-final-set/events_up_to_01062018.csv", low_memory=False)

# Funciones útiles

In [3]:
# Names of the ten fortnight flag columns, 'q1' through 'q10'
quincenas = ['q' + str(numero) for numero in range(1, 11)]


def aplicar_por_quincena(train, funcion):
    """Thread ``train`` through ``funcion`` once per fortnight name, in order.

    Parameters
    ----------
    train : object
        Initial value handed to the first call (a DataFrame in this notebook).
    funcion : callable
        Called as ``funcion(current_value, fortnight_name)``; whatever it
        returns becomes the value passed to the next call.

    Returns
    -------
    object
        The value returned by the last call to ``funcion``.
    """
    resultado = train
    for nombre in quincenas:
        resultado = funcion(resultado, nombre)
    return resultado

# Feature quincenas

In [4]:
# Parse the timestamp column once, store it back as datetimes, and derive the
# calendar month and day-of-month columns from the parsed values.
marca_temporal = pd.to_datetime(events['timestamp'])
events['timestamp'] = marca_temporal
events['mes'] = marca_temporal.dt.month
events['dia'] = marca_temporal.dt.day

In [5]:
# Flag each event with the fortnight ("quincena") it falls in: q1/q2 are the
# two halves of January, q3/q4 of February, ... q9/q10 of May.
# The first half covers days 1-15 (day 1 included) and the second half covers
# day 16 through the end of the month, so every day of a month belongs to
# exactly one flag.
# NOTE(review): only month and day are compared, not the year — this assumes
# the data spans a single year; confirm against the dataset.
primera_mitad = events['dia'] <= 15
for indice, quincena in enumerate(quincenas):
    # Fortnights come in pairs per month: indices 0,1 -> month 1; 2,3 -> month 2; ...
    del_mes = events['mes'] == indice // 2 + 1
    if indice % 2 == 0:
        events[quincena] = del_mes & primera_mitad
    else:
        events[quincena] = del_mes & ~primera_mitad

# Feature cantidad de eventos por persona

In [6]:
# Count the events of each person and attach that count to every row as 'event_count'
eventos_por_persona = events.groupby('person')['event'].count().rename('event_count')
events = events.join(eventos_por_persona, on='person')

# Feature tiempo entre la primera y la última entrada al sitio por persona

In [7]:
# Time elapsed between each person's earliest and latest timestamp,
# attached to every row of that person as 'tiempo_total'.
marcas_por_persona = events.groupby('person')['timestamp']
lapso_por_persona = (marcas_por_persona.max() - marcas_por_persona.min()).rename('tiempo_total')
events = events.join(lapso_por_persona, on='person')

In [8]:
# Display the column names of the frame after the per-person features were joined
events.columns

Index(['timestamp', 'event', 'person', 'url', 'sku', 'model', 'condition',
       'storage', 'color', 'skus', 'search_term', 'staticpage',
       'campaign_source', 'search_engine', 'channel', 'new_vs_returning',
       'city', 'region', 'country', 'device_type', 'screen_resolution',
       'operating_system_version', 'browser_version', 'mes', 'dia', 'q1', 'q2',
       'q3', 'q4', 'q5', 'q6', 'q7', 'q8', 'q9', 'q10', 'event_count',
       'tiempo_total'],
      dtype='object')

In [9]:
# Preview the first ten distinct values of the per-person time span
pd.DataFrame(events.tiempo_total.unique()).head(10)

Unnamed: 0,0
0,0 days 00:23:08
1,7 days 23:00:32
2,138 days 12:53:38
3,9 days 19:34:20
4,23 days 16:05:31
5,4 days 00:08:04
6,3 days 14:48:42
7,2 days 17:01:21
8,3 days 22:01:25
9,0 days 00:14:52


# Feature cantidad de eventos por persona por quincena

In [10]:
def cantidad_eventos_por_persona(df, quincena):
    """Add a per-person count of the events that fall inside one fortnight.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'person', 'event' and the flag column
        named by ``quincena``.
    quincena : str
        Name of the fortnight flag column (e.g. 'q1'); rows where it equals
        True are the ones counted.

    Returns
    -------
    pandas.DataFrame
        A new frame equal to ``df`` plus a column 'event_count_<quincena>'.
        Persons with no flagged rows get NaN in that column.
    """
    solo_quincena = df[df[quincena].eq(True)]
    conteo = solo_quincena.groupby('person')['event'].count()
    # 'event' exists on both sides, so the joined counts pick up the suffix
    return df.join(conteo, on='person', rsuffix='_count_' + quincena)

In [11]:
# Add one per-person event-count column for every fortnight, then list the columns
events = aplicar_por_quincena(events, cantidad_eventos_por_persona)
events.columns

Index(['timestamp', 'event', 'person', 'url', 'sku', 'model', 'condition',
       'storage', 'color', 'skus', 'search_term', 'staticpage',
       'campaign_source', 'search_engine', 'channel', 'new_vs_returning',
       'city', 'region', 'country', 'device_type', 'screen_resolution',
       'operating_system_version', 'browser_version', 'mes', 'dia', 'q1', 'q2',
       'q3', 'q4', 'q5', 'q6', 'q7', 'q8', 'q9', 'q10', 'event_count',
       'tiempo_total', 'event_count_q1', 'event_count_q2', 'event_count_q3',
       'event_count_q4', 'event_count_q5', 'event_count_q6', 'event_count_q7',
       'event_count_q8', 'event_count_q9', 'event_count_q10'],
      dtype='object')

# Feature tiempo entre la primera y la última entrada al sitio en quincena por persona

In [12]:
def tiempo_total_por_persona(df, quincena):
    """Add, per person, the time between first and last event within a fortnight.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'person', 'timestamp' (datetime values) and
        the flag column named by ``quincena``.
    quincena : str
        Name of the fortnight flag column (e.g. 'q1'); only rows where it
        equals True contribute to the span.

    Returns
    -------
    pandas.DataFrame
        A new frame equal to ``df`` plus a column 'tiempo_total_<quincena>'
        holding ``max(timestamp) - min(timestamp)`` of the person's flagged
        rows. Persons with no flagged rows get a missing value.
    """
    marcas = df.loc[df[quincena] == True].groupby('person')['timestamp']
    # Built-in group reductions instead of a per-group Python lambda: same
    # values, but without one Python-level call per person.
    lapso = (marcas.max() - marcas.min()).rename('tiempo_total_' + quincena)
    return df.join(lapso, on='person')

In [13]:
# Add one per-person time-span column for every fortnight, then list the columns
events = aplicar_por_quincena(events, tiempo_total_por_persona)
events.columns

KeyboardInterrupt: 

In [None]:
# Preview the first ten distinct values of the q1 time span
pd.DataFrame(events.tiempo_total_q1.unique()).head(10)

# Join con events con support

In [None]:
# Load the support data that is joined onto the events by person further down
# NOTE(review): the contents/schema of this file are not visible here — confirm
events_support = pd.read_csv('data/ev_con_3_supports.csv', low_memory=False)

In [None]:
# Left join on 'person': every row of events is kept, and the columns of
# events_support are added next to it.
# NOTE(review): assumes events_support has one row per person and no column
# names in common with events (no suffixes are given) — TODO confirm
events = events.join(events_support.set_index('person'), on='person', how='left')

# Export a csv

In [None]:
# Write the enriched events frame to disk, without the row index
events.to_csv('data/events_with_features.csv', index=False)