In [1]:
import pandas as pd  
import numpy as np  
import matplotlib.pyplot as plt 
%matplotlib inline

from datetime import timedelta

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score, mean_squared_error, confusion_matrix,\
                            f1_score,precision_score,recall_score,classification_report,cohen_kappa_score
import xgboost as xgb
from sklearn import preprocessing

In [2]:
# df_events información de eventos realizado en la plataforma para un conjunto de usuarios hasta el 31/05/2018
df_events = pd.read_csv('../../data/events_up_to_01062018.csv',dtype=object)
# df_train archivo labels_training_set.csv indica para un subconjunto de los usuarios incluidos en el set de 
# eventos events_up_to_01062018.csv si los mismos realizaron una conversión (columna label = 1) o no (columna 
# label = 0) desde el 01/06/2018 hasta el 15/06/2018
df_labels = pd.read_csv('../../data/labels_training_set.csv',dtype=object)
# Personas a predecir conversion.
df_person = pd.read_csv('../../data/trocafone_kaggle_test.csv')

In [3]:
df_labels['label'] = df_labels['label'].astype(int)
df_labels.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19414 entries, 0 to 19413
Data columns (total 2 columns):
person    19414 non-null object
label     19414 non-null int64
dtypes: int64(1), object(1)
memory usage: 303.4+ KB


In [4]:
df_events["timestamp"] = pd.to_datetime(df_events["timestamp"])
df_events["month"] = df_events["timestamp"].dt.month
df_events["day"] = df_events["timestamp"].dt.day
df_events["day_of_week"] = df_events['timestamp'].dt.weekday_name
df_events['day_of_year'] = df_events['timestamp'].dt.dayofyear
df_events['hour'] = df_events['timestamp'].dt.hour
df_events['minute'] = df_events['timestamp'].dt.minute
df_events['week_of_year'] = df_events['timestamp'].dt.weekofyear
df_events['second'] = df_events['timestamp'].dt.second

In [5]:
df_events.columns

Index(['timestamp', 'event', 'person', 'url', 'sku', 'model', 'condition',
       'storage', 'color', 'skus', 'search_term', 'staticpage',
       'campaign_source', 'search_engine', 'channel', 'new_vs_returning',
       'city', 'region', 'country', 'device_type', 'screen_resolution',
       'operating_system_version', 'browser_version', 'month', 'day',
       'day_of_week', 'day_of_year', 'hour', 'minute', 'week_of_year',
       'second'],
      dtype='object')

In [6]:
df_todas_las_personas = df_events[['person']].drop_duplicates()

In [7]:
# Me quedo solo con los users que tienen label.
df_events_con_labels = pd.merge(df_events,df_labels,on = 'person',how = 'inner')

# Features
* Tiempo que esta en promedio cada vez que entra.
* Tratar de sacar cuanto tiempo estuvo la ultima vez que entro
    * Fecha de ultima conversion
    * Fecha de ultimo checkout
    * Fecha de ultima vez que accedio
    * Cada cuanto hace checkouts
    * Cada cuanto hace compras  			 (promedio o mediana)
* Cada cuanto visita celulares
    * Cada cuanto hace eventos. # promedio de diferencia entre fechas consecutivas<
* Cada cuanto ve distintos celulares
probar con mes semana dia hora minuto

* epoch
* mas features usando columnas con eventos
* cada cuanto hace una busqueda?
* cada cuanto clickea una publcidad? 
* distancia promedio entre epochs consecutivo

## Cada cuanto hace eventos

#### Cada cuantos dias hace eventos (promedio) 

In [8]:
df_cadaCuantosDiasHaceEventos = df_events[['person','timestamp','month','day']]

In [9]:
df_cadaCuantosDiasHaceEventos = df_cadaCuantosDiasHaceEventos.drop_duplicates(['month','day','person'])\
                                .sort_values('timestamp')

In [10]:
# Funcion de agregacion para poder calcular la diferencia entre 2 ingresos consecutivos pero para una persona.
def promedioCadaCuantosDiasVuelve(S):
    if(len(S) == 1): return 0
    return round(np.mean((S - S.shift()).dropna().dt.days))

In [11]:
df_cadaCuantosDiasHaceEventos = (df_cadaCuantosDiasHaceEventos.groupby('person')['timestamp']\
                                 .agg(promedioCadaCuantosDiasVuelve)).to_frame().reset_index()
df_cadaCuantosDiasHaceEventos.columns = ['person','promedioCadaCuantosDiasVuelve']

In [12]:
df_cadaCuantosDiasHaceEventos.head()

Unnamed: 0,person,promedioCadaCuantosDiasVuelve
0,0008ed71,0
1,00091926,1
2,00091a7a,0
3,000ba417,4
4,000c79fe,0


In [None]:
import matplotlib.pyplot as plt
import six

def render_mpl_table(data, col_width=3.0, row_height=0.625, font_size=14,
                     header_color='#40466e', row_colors=['#f1f1f2', 'w'], edge_color='w',
                     bbox=[0, 0, 1, 1], header_columns=0,
                     ax=None, **kwargs):
    if ax is None:
        size = (np.array(data.shape[::-1]) + np.array([0, 1])) * np.array([col_width, row_height])
        fig, ax = plt.subplots(figsize=size)
        ax.axis('off')

    mpl_table = ax.table(cellText=data.values, bbox=bbox, colLabels=data.columns, **kwargs)

    mpl_table.auto_set_font_size(False)
    mpl_table.set_fontsize(font_size)

    for k, cell in  six.iteritems(mpl_table._cells):
        cell.set_edgecolor(edge_color)
        if k[0] == 0 or k[1] < header_columns:
            cell.set_text_props(weight='bold', color='w')
            cell.set_facecolor(header_color)
        else:
            cell.set_facecolor(row_colors[k[0]%len(row_colors) ])
    return ax

render_mpl_table(df, header_columns=0, col_width=2.0)


Con one hot encoding

In [227]:
a = pd.merge(df_labels,df_cadaCuantosDiasHaceEventos,on = 'person', how = 'inner')
a = a.groupby(['promedioCadaCuantosDiasVuelve','label']).count().unstack()
a.columns = ['neg','pos']
a['neg'] = a['neg'] / a['neg'].sum()
a['pos'] = a['pos'] / a['pos'].sum()
a['dif'] = np.abs(a['pos'] - a['neg'])
a.sort_values('dif',ascending = False, inplace = True)
a.head()

Unnamed: 0_level_0,neg,pos,dif
promedioCadaCuantosDiasVuelve,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0.541554,0.328571,0.212982
4,0.039655,0.073469,0.033814
2,0.080015,0.103061,0.023046
9,0.011446,0.027551,0.016105
1,0.083813,0.09898,0.015167


In [228]:
K = 15
top_columnas = list(a.head(K).index)
top_columnas2 = list(df_cadaCuantosDiasHaceEventos['promedioCadaCuantosDiasVuelve'].value_counts().head(K).index)
df_cadaCuantosDiasHaceEventos_conOneHot = pd.concat([df_cadaCuantosDiasHaceEventos['person'],pd.get_dummies(df_cadaCuantosDiasHaceEventos['promedioCadaCuantosDiasVuelve'])[top_columnas]],axis = 1)

In [229]:
nombres_columnas = ['cadaCuantosDiasHaceEventos_'+str(columna) for columna in df_cadaCuantosDiasHaceEventos_conOneHot.columns]
nombres_columnas[0] = 'person'
df_cadaCuantosDiasHaceEventos_conOneHot.columns = nombres_columnas

In [230]:
df_cadaCuantosDiasHaceEventos_conOneHot.head()

Unnamed: 0,person,cadaCuantosDiasHaceEventos_0,cadaCuantosDiasHaceEventos_4,cadaCuantosDiasHaceEventos_2,cadaCuantosDiasHaceEventos_9,cadaCuantosDiasHaceEventos_1,cadaCuantosDiasHaceEventos_6,cadaCuantosDiasHaceEventos_8,cadaCuantosDiasHaceEventos_5,cadaCuantosDiasHaceEventos_11,cadaCuantosDiasHaceEventos_7,cadaCuantosDiasHaceEventos_13,cadaCuantosDiasHaceEventos_10,cadaCuantosDiasHaceEventos_3,cadaCuantosDiasHaceEventos_16,cadaCuantosDiasHaceEventos_14
0,0008ed71,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,00091926,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
2,00091a7a,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,000ba417,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
4,000c79fe,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0


#### Cada cuantas horas hace eventos (promedio)

In [231]:
df_cadaCuantasHorasHaceEventos = df_events[['person','timestamp','month','day','hour']]

In [232]:
df_cadaCuantasHorasHaceEventos = df_cadaCuantasHorasHaceEventos.drop_duplicates(['month','day','hour','person'])\
                                .sort_values('timestamp')

In [233]:
def secToHours(seconds):
    return seconds//3600
def secToMinutes(seconds):
    return (seconds//60)

In [234]:
# Funcion de agregacion para poder calcular la diferencia entre 2 ingresos consecutivos pero para una persona.w
def promedioCadaCuantasHorasHaceEventos(S):
    if(len(S) == 1): return 0
    return round(np.mean(((S - S.shift()).dropna().dt.total_seconds()).apply(secToHours)))

In [235]:
df_cadaCuantasHorasHaceEventos = (df_cadaCuantasHorasHaceEventos.groupby('person')['timestamp']\
                                 .agg(promedioCadaCuantasHorasHaceEventos)).to_frame().reset_index()
df_cadaCuantasHorasHaceEventos.columns = ['person','cadaCuantasHorasHaceEventos']

In [236]:
df_cadaCuantasHorasHaceEventos.head()

Unnamed: 0,person,cadaCuantasHorasHaceEventos
0,0008ed71,2
1,00091926,18
2,00091a7a,0
3,000ba417,27
4,000c79fe,0


Con one hot encoding

In [237]:
a = pd.merge(df_labels,df_cadaCuantasHorasHaceEventos,on = 'person', how = 'inner')
a = a.groupby(['cadaCuantasHorasHaceEventos','label']).count().unstack()
a.columns = ['neg','pos']
a['neg'] = a['neg'] / a['neg'].sum()
a['pos'] = a['pos'] / a['pos'].sum()
a['dif'] = np.abs(a['pos'] - a['neg'])
a.sort_values('dif',ascending = False, inplace = True)
a.head()

Unnamed: 0_level_0,neg,pos,dif
cadaCuantasHorasHaceEventos,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0.420256,0.230612,0.189644
12,0.011012,0.005102,0.00591
130,0.001302,0.007143,0.005841
24,0.007812,0.013265,0.005454
1,0.01763,0.012245,0.005386


In [238]:
K = 15
top_columnas = list(a.head(K).index)
top_columnas2 = list(df_cadaCuantasHorasHaceEventos['cadaCuantasHorasHaceEventos'].value_counts().head(K).index)
df_cadaCuantasHorasHaceEventos_conOneHot = pd.concat([df_cadaCuantasHorasHaceEventos['person'],pd.get_dummies(df_cadaCuantasHorasHaceEventos['cadaCuantasHorasHaceEventos'])[top_columnas]],axis = 1)

In [239]:
nombres_columnas = ['cadaCuantasHorasHaceEventos_'+str(columna) for columna in df_cadaCuantasHorasHaceEventos_conOneHot.columns]
nombres_columnas[0] = 'person'
df_cadaCuantasHorasHaceEventos_conOneHot.columns = nombres_columnas

In [240]:
df_cadaCuantasHorasHaceEventos_conOneHot.head()

Unnamed: 0,person,cadaCuantasHorasHaceEventos_0,cadaCuantasHorasHaceEventos_12,cadaCuantasHorasHaceEventos_130,cadaCuantasHorasHaceEventos_24,cadaCuantasHorasHaceEventos_1,cadaCuantasHorasHaceEventos_39,cadaCuantasHorasHaceEventos_4,cadaCuantasHorasHaceEventos_28,cadaCuantasHorasHaceEventos_2,cadaCuantasHorasHaceEventos_170,cadaCuantasHorasHaceEventos_37,cadaCuantasHorasHaceEventos_8,cadaCuantasHorasHaceEventos_143,cadaCuantasHorasHaceEventos_30,cadaCuantasHorasHaceEventos_23
0,0008ed71,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
1,00091926,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,00091a7a,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,000ba417,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,000c79fe,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0


#### Cada cuantos minutos hace eventos (promedio)

In [241]:
df_cadaCuantosMinutosHaceEventos = df_events[['person','timestamp','month','day','hour','minute']]

In [242]:
df_cadaCuantosMinutosHaceEventos = df_cadaCuantosMinutosHaceEventos.drop_duplicates(['month','day','hour','person','minute'])\
                                .sort_values('timestamp')

In [243]:
# Funcion de agregacion para poder calcular la diferencia entre 2 ingresos consecutivos pero para una persona.
def promedioCadaCuantosMinutosHaceEventos(S):
    if(len(S) == 1): return 0
    return round(np.mean(((S - S.shift()).dropna().dt.total_seconds()).apply(secToMinutes)))

In [244]:
df_cadaCuantosMinutosHaceEventos = (df_cadaCuantosMinutosHaceEventos.groupby('person')['timestamp']\
                                 .agg(promedioCadaCuantosMinutosHaceEventos)).to_frame().reset_index()
df_cadaCuantosMinutosHaceEventos.columns = ['person','cadaCuantosMinutosHaceEventos']

In [245]:
df_cadaCuantosMinutosHaceEventos.head()

Unnamed: 0,person,cadaCuantosMinutosHaceEventos
0,0008ed71,48
1,00091926,340
2,00091a7a,2
3,000ba417,167
4,000c79fe,1


Con one hot encoding

In [246]:
a = pd.merge(df_labels,df_cadaCuantosMinutosHaceEventos,on = 'person', how = 'inner')
a = a.groupby(['cadaCuantosMinutosHaceEventos','label']).count().unstack()
a.columns = ['neg','pos']
a['neg'] = a['neg'] / a['neg'].sum()
a['pos'] = a['pos'] / a['pos'].sum()
a['dif'] = np.abs(a['pos'] - a['neg'])
a.sort_values('dif',ascending = False, inplace = True)
a.head()

Unnamed: 0_level_0,neg,pos,dif
cadaCuantosMinutosHaceEventos,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0.185689,0.104082,0.081608
1,0.107139,0.056122,0.051017
2,0.050993,0.037755,0.013238
3,0.020397,0.011224,0.009173
4,0.015569,0.008163,0.007406


In [247]:
K = 15
top_columnas = list(a.head(K).index)
top_columnas2 = list(df_cadaCuantosMinutosHaceEventos['cadaCuantosMinutosHaceEventos'].value_counts().head(K).index)
df_cadaCuantosMinutosHaceEventos_conOneHot = pd.concat([df_cadaCuantosMinutosHaceEventos['person'],pd.get_dummies(df_cadaCuantosMinutosHaceEventos['cadaCuantosMinutosHaceEventos'])[top_columnas]],axis = 1)

In [248]:
nombres_columnas = ['cadaCuantosMinutosHaceEventos_'+str(columna) for columna in df_cadaCuantosMinutosHaceEventos_conOneHot.columns]
nombres_columnas[0] = 'person'
df_cadaCuantosMinutosHaceEventos_conOneHot.columns = nombres_columnas

In [249]:
df_cadaCuantosMinutosHaceEventos_conOneHot.head()

Unnamed: 0,person,cadaCuantosMinutosHaceEventos_0,cadaCuantosMinutosHaceEventos_1,cadaCuantosMinutosHaceEventos_2,cadaCuantosMinutosHaceEventos_3,cadaCuantosMinutosHaceEventos_4,cadaCuantosMinutosHaceEventos_5,cadaCuantosMinutosHaceEventos_6,cadaCuantosMinutosHaceEventos_8,cadaCuantosMinutosHaceEventos_1992,cadaCuantosMinutosHaceEventos_273,cadaCuantosMinutosHaceEventos_284,cadaCuantosMinutosHaceEventos_205,cadaCuantosMinutosHaceEventos_9,cadaCuantosMinutosHaceEventos_476,cadaCuantosMinutosHaceEventos_320
0,0008ed71,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,00091926,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,00091a7a,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0
3,000ba417,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,000c79fe,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0


#### Cada cuantos dias hace eventos en el mes 5 (promedio)

In [250]:
df_cadaCuantosDiasEnMes5HaceEventos = df_events[df_events['month'] == 5]

In [251]:
df_cadaCuantosDiasEnMes5HaceEventos = df_cadaCuantosDiasEnMes5HaceEventos[['person','day']]

In [252]:
df_cadaCuantosDiasEnMes5HaceEventos = df_cadaCuantosDiasEnMes5HaceEventos.drop_duplicates()\
                                .sort_values('day')

In [253]:
# Funcion de agregacion para poder calcular la diferencia entre 2 ingresos consecutivos pero para una persona.
def cadaCuantosDiasEnMes5HaceEventos(S):
    if(len(S) == 1): return 0
    return round(np.mean((S - S.shift()).dropna()))

In [254]:
df_cadaCuantosDiasEnMes5HaceEventos = (df_cadaCuantosDiasEnMes5HaceEventos.groupby('person')['day']\
                                 .agg(cadaCuantosDiasEnMes5HaceEventos)).to_frame().reset_index()
df_cadaCuantosDiasEnMes5HaceEventos.columns = ['person','cadaCuantosDiasEnMes5HaceEventos']

In [255]:
df_cadaCuantosDiasEnMes5HaceEventos.head()

Unnamed: 0,person,cadaCuantosDiasEnMes5HaceEventos
0,0008ed71,0
1,00091926,1
2,000ba417,4
3,000c79fe,0
4,000e4d9e,2


In [256]:
df_cadaCuantosDiasEnMes5HaceEventos = pd.merge(df_todas_las_personas,df_cadaCuantosDiasEnMes5HaceEventos\
                                               ,how = 'left', on = 'person')

In [257]:
# persona no entro en mes 5 -> -1
# persona entro una dia en mes 5 -> 0
df_cadaCuantosDiasEnMes5HaceEventos = df_cadaCuantosDiasEnMes5HaceEventos.fillna(-1)

In [258]:
df_cadaCuantosDiasEnMes5HaceEventos.head()

Unnamed: 0,person,cadaCuantosDiasEnMes5HaceEventos
0,4886f805,0.0
1,ad93850f,3.0
2,0297fc1e,2.0
3,2d681dd8,9.0
4,cccea85e,2.0


Con one hot encoding

In [259]:
a = pd.merge(df_labels,df_cadaCuantosDiasEnMes5HaceEventos,on = 'person', how = 'inner')
a = a.groupby(['cadaCuantosDiasEnMes5HaceEventos','label']).count().unstack()
a.columns = ['neg','pos']
a['neg'] = a['neg'] / a['neg'].sum()
a['pos'] = a['pos'] / a['pos'].sum()
a['dif'] = np.abs(a['pos'] - a['neg'])
a.sort_values('dif',ascending = False, inplace = True)
a.head()

Unnamed: 0_level_0,neg,pos,dif
cadaCuantosDiasEnMes5HaceEventos,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0.0,0.490886,0.287755,0.203131
-1.0,0.037648,0.161224,0.123577
3.0,0.057231,0.071429,0.014197
2.0,0.111425,0.12551,0.014086
4.0,0.052349,0.062245,0.009896


In [260]:
K = 31
top_columnas = list(a.head(K).index)
top_columnas2 = list(df_cadaCuantosDiasEnMes5HaceEventos['cadaCuantosDiasEnMes5HaceEventos'].value_counts().head(K).index)
df_cadaCuantosDiasEnMes5HaceEventos_conOneHot = pd.concat([df_cadaCuantosDiasEnMes5HaceEventos['person'],pd.get_dummies(df_cadaCuantosDiasEnMes5HaceEventos['cadaCuantosDiasEnMes5HaceEventos'])[top_columnas]],axis = 1)

In [261]:
nombres_columnas = ['cadaCuantosDiasEnMes5HaceEventos_'+str(columna) for columna in df_cadaCuantosDiasEnMes5HaceEventos_conOneHot.columns]
nombres_columnas[0] = 'person'
df_cadaCuantosDiasEnMes5HaceEventos_conOneHot.columns = nombres_columnas

In [262]:
df_cadaCuantosDiasEnMes5HaceEventos_conOneHot.head()

Unnamed: 0,person,cadaCuantosDiasEnMes5HaceEventos_0.0,cadaCuantosDiasEnMes5HaceEventos_-1.0,cadaCuantosDiasEnMes5HaceEventos_3.0,cadaCuantosDiasEnMes5HaceEventos_2.0,cadaCuantosDiasEnMes5HaceEventos_4.0,cadaCuantosDiasEnMes5HaceEventos_6.0,cadaCuantosDiasEnMes5HaceEventos_9.0,cadaCuantosDiasEnMes5HaceEventos_7.0,cadaCuantosDiasEnMes5HaceEventos_13.0,...,cadaCuantosDiasEnMes5HaceEventos_16.0,cadaCuantosDiasEnMes5HaceEventos_14.0,cadaCuantosDiasEnMes5HaceEventos_21.0,cadaCuantosDiasEnMes5HaceEventos_19.0,cadaCuantosDiasEnMes5HaceEventos_23.0,cadaCuantosDiasEnMes5HaceEventos_24.0,cadaCuantosDiasEnMes5HaceEventos_26.0,cadaCuantosDiasEnMes5HaceEventos_27.0,cadaCuantosDiasEnMes5HaceEventos_28.0,cadaCuantosDiasEnMes5HaceEventos_29.0
0,4886f805,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,ad93850f,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0297fc1e,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,2d681dd8,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4,cccea85e,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


#### Cada cuantas horas hace eventos en un dia del mes 5

In [263]:
df_cadaCuantasHorasEnUnDiaDelMes5HaceEventos = df_events[df_events['month'] == 5]

In [264]:
df_cadaCuantasHorasEnUnDiaDelMes5HaceEventos = df_cadaCuantasHorasEnUnDiaDelMes5HaceEventos[['person','day','hour']]

In [265]:
df_cadaCuantasHorasEnUnDiaDelMes5HaceEventos = df_cadaCuantasHorasEnUnDiaDelMes5HaceEventos.drop_duplicates()\
                                .sort_values(['day','hour'])

In [266]:
# Funcion de agregacion para poder calcular la diferencia entre 2 ingresos consecutivos pero para una persona.
def cadaCuantasHorasEnUnDiaDelMes5HaceEventos(S):
    if(len(S) == 1): return 0
    return round(np.mean((S - S.shift()).dropna()))

In [267]:
df_cadaCuantasHorasEnUnDiaDelMes5HaceEventos = (df_cadaCuantasHorasEnUnDiaDelMes5HaceEventos\
        .groupby(['person','day'])['hour'].agg(cadaCuantasHorasEnUnDiaDelMes5HaceEventos)).to_frame()


In [268]:
df_cadaCuantasHorasEnUnDiaDelMes5HaceEventos = df_cadaCuantasHorasEnUnDiaDelMes5HaceEventos\
                .groupby('person')['hour'].mean().to_frame().reset_index()
df_cadaCuantasHorasEnUnDiaDelMes5HaceEventos['hour'] = df_cadaCuantasHorasEnUnDiaDelMes5HaceEventos['hour'].apply(round)

In [269]:
df_cadaCuantasHorasEnUnDiaDelMes5HaceEventos.columns = ['person','cadaCuantasHorasEnUnDiaDelMes5HaceEventos']

In [270]:
df_cadaCuantasHorasEnUnDiaDelMes5HaceEventos = pd.merge(df_todas_las_personas,df_cadaCuantasHorasEnUnDiaDelMes5HaceEventos\
                                                       ,on = 'person', how = 'left')


In [271]:
df_cadaCuantasHorasEnUnDiaDelMes5HaceEventos = df_cadaCuantasHorasEnUnDiaDelMes5HaceEventos.fillna(-1)

In [272]:
df_cadaCuantasHorasEnUnDiaDelMes5HaceEventos.head()

Unnamed: 0,person,cadaCuantasHorasEnUnDiaDelMes5HaceEventos
0,4886f805,0.0
1,ad93850f,6.0
2,0297fc1e,5.0
3,2d681dd8,0.0
4,cccea85e,4.0


Con one hot encoding

In [273]:
a = pd.merge(df_labels,df_cadaCuantasHorasEnUnDiaDelMes5HaceEventos,on = 'person', how = 'inner')
a = a.groupby(['cadaCuantasHorasEnUnDiaDelMes5HaceEventos','label']).count().unstack()
a.columns = ['neg','pos']
a['neg'] = a['neg'] / a['neg'].sum()
a['pos'] = a['pos'] / a['pos'].sum()
a['dif'] = np.abs(a['pos'] - a['neg'])
a.sort_values('dif',ascending = False, inplace = True)
a.head()

Unnamed: 0_level_0,neg,pos,dif
cadaCuantasHorasEnUnDiaDelMes5HaceEventos,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0.0,0.589509,0.429592,0.159917
-1.0,0.037648,0.161224,0.123577
2.0,0.080449,0.115306,0.034857
1.0,0.155799,0.133673,0.022126
4.0,0.035478,0.046939,0.011461


In [274]:
K = 24
top_columnas = list(a.head(K).index)
top_columnas2 = list(df_cadaCuantasHorasEnUnDiaDelMes5HaceEventos['cadaCuantasHorasEnUnDiaDelMes5HaceEventos'].value_counts().head(K).index)
df_cadaCuantasHorasEnUnDiaDelMes5HaceEventos_conOneHot = pd.concat([df_cadaCuantasHorasEnUnDiaDelMes5HaceEventos['person'],pd.get_dummies(df_cadaCuantasHorasEnUnDiaDelMes5HaceEventos['cadaCuantasHorasEnUnDiaDelMes5HaceEventos'])[top_columnas]],axis = 1)

In [275]:
nombres_columnas = ['cadaCuantasHorasEnUnDiaDelMes5HaceEventos_'+str(columna) for columna in df_cadaCuantasHorasEnUnDiaDelMes5HaceEventos_conOneHot.columns]
nombres_columnas[0] = 'person'
df_cadaCuantasHorasEnUnDiaDelMes5HaceEventos_conOneHot.columns = nombres_columnas

In [276]:
df_cadaCuantasHorasEnUnDiaDelMes5HaceEventos_conOneHot.head()

Unnamed: 0,person,cadaCuantasHorasEnUnDiaDelMes5HaceEventos_0.0,cadaCuantasHorasEnUnDiaDelMes5HaceEventos_-1.0,cadaCuantasHorasEnUnDiaDelMes5HaceEventos_2.0,cadaCuantasHorasEnUnDiaDelMes5HaceEventos_1.0,cadaCuantasHorasEnUnDiaDelMes5HaceEventos_4.0,cadaCuantasHorasEnUnDiaDelMes5HaceEventos_3.0,cadaCuantasHorasEnUnDiaDelMes5HaceEventos_5.0,cadaCuantasHorasEnUnDiaDelMes5HaceEventos_8.0,cadaCuantasHorasEnUnDiaDelMes5HaceEventos_6.0,...,cadaCuantasHorasEnUnDiaDelMes5HaceEventos_17.0,cadaCuantasHorasEnUnDiaDelMes5HaceEventos_12.0,cadaCuantasHorasEnUnDiaDelMes5HaceEventos_7.0,cadaCuantasHorasEnUnDiaDelMes5HaceEventos_9.0,cadaCuantasHorasEnUnDiaDelMes5HaceEventos_13.0,cadaCuantasHorasEnUnDiaDelMes5HaceEventos_16.0,cadaCuantasHorasEnUnDiaDelMes5HaceEventos_18.0,cadaCuantasHorasEnUnDiaDelMes5HaceEventos_19.0,cadaCuantasHorasEnUnDiaDelMes5HaceEventos_20.0,cadaCuantasHorasEnUnDiaDelMes5HaceEventos_21.0
0,4886f805,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,ad93850f,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
2,0297fc1e,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3,2d681dd8,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,cccea85e,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


#### Cada cuantos minutos hace eventos en un dia y hora del mes 5

In [277]:
df_cadaCuantosMinutosHaceEventosEnUnDiaYHoraDelMes5 = df_events[df_events['month'] == 5]

In [278]:
df_cadaCuantosMinutosHaceEventosEnUnDiaYHoraDelMes5 = df_cadaCuantosMinutosHaceEventosEnUnDiaYHoraDelMes5[['person','day','hour','minute']]

In [279]:
df_cadaCuantosMinutosHaceEventosEnUnDiaYHoraDelMes5 = df_cadaCuantosMinutosHaceEventosEnUnDiaYHoraDelMes5.drop_duplicates()\
                                .sort_values(['day','hour','minute'])

In [280]:
# Funcion de agregacion para poder calcular la diferencia entre 2 ingresos consecutivos pero para una persona.
def cadaCuantosMinutosHaceEventosEnUnDiaYHoraDelMes5(S):
    if(len(S) == 1): return 0
    return round(np.mean((S - S.shift()).dropna()))

In [281]:
df_cadaCuantosMinutosHaceEventosEnUnDiaYHoraDelMes5 = (df_cadaCuantosMinutosHaceEventosEnUnDiaYHoraDelMes5\
        .groupby(['person','day','hour'])['minute'].agg(cadaCuantosMinutosHaceEventosEnUnDiaYHoraDelMes5)).to_frame()


In [282]:
df_cadaCuantosMinutosHaceEventosEnUnDiaYHoraDelMes5 = df_cadaCuantosMinutosHaceEventosEnUnDiaYHoraDelMes5\
                .groupby('person')['minute'].mean().to_frame().reset_index()
df_cadaCuantosMinutosHaceEventosEnUnDiaYHoraDelMes5['minute'] = df_cadaCuantosMinutosHaceEventosEnUnDiaYHoraDelMes5['minute'].apply(round)

In [283]:
df_cadaCuantosMinutosHaceEventosEnUnDiaYHoraDelMes5.columns = ['person','cadaCuantosMinutosHaceEventosEnUnDiaYHoraDelMes5']

In [284]:
df_cadaCuantosMinutosHaceEventosEnUnDiaYHoraDelMes5 = pd.merge(df_todas_las_personas,df_cadaCuantosMinutosHaceEventosEnUnDiaYHoraDelMes5\
                                                       ,on = 'person', how = 'left')


In [285]:
df_cadaCuantosMinutosHaceEventosEnUnDiaYHoraDelMes5 = df_cadaCuantosMinutosHaceEventosEnUnDiaYHoraDelMes5.fillna(-1)

In [286]:
df_cadaCuantosMinutosHaceEventosEnUnDiaYHoraDelMes5.head()

Unnamed: 0,person,cadaCuantosMinutosHaceEventosEnUnDiaYHoraDelMes5
0,4886f805,8.0
1,ad93850f,3.0
2,0297fc1e,3.0
3,2d681dd8,2.0
4,cccea85e,3.0


In [287]:
df_cadaCuantosMinutosHaceEventosEnUnDiaYHoraDelMes5.shape

(38829, 2)

Con one hot encoding

In [288]:
a = pd.merge(df_labels,df_cadaCuantosMinutosHaceEventosEnUnDiaYHoraDelMes5,on = 'person', how = 'inner')
a = a.groupby(['cadaCuantosMinutosHaceEventosEnUnDiaYHoraDelMes5','label']).count().unstack()
a.columns = ['neg','pos']
a['neg'] = a['neg'] / a['neg'].sum()
a['pos'] = a['pos'] / a['pos'].sum()
a['dif'] = np.abs(a['pos'] - a['neg'])
a.sort_values('dif',ascending = False, inplace = True)
a.head()

Unnamed: 0_level_0,neg,pos,dif
cadaCuantosMinutosHaceEventosEnUnDiaYHoraDelMes5,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
-1.0,0.037648,0.161224,0.123577
1.0,0.366442,0.292857,0.073585
0.0,0.174569,0.129592,0.044977
3.0,0.068515,0.090816,0.022302
2.0,0.248345,0.228571,0.019774


In [289]:
K = 15
top_columnas = list(a.head(K).index)
top_columnas2 = list(df_cadaCuantosMinutosHaceEventosEnUnDiaYHoraDelMes5['cadaCuantosMinutosHaceEventosEnUnDiaYHoraDelMes5'].value_counts().head(K).index)
df_cadaCuantosMinutosHaceEventosEnUnDiaYHoraDelMes5_conOneHot = pd.concat([df_cadaCuantosMinutosHaceEventosEnUnDiaYHoraDelMes5['person'],pd.get_dummies(df_cadaCuantosMinutosHaceEventosEnUnDiaYHoraDelMes5['cadaCuantosMinutosHaceEventosEnUnDiaYHoraDelMes5'])[top_columnas]],axis = 1)

In [290]:
nombres_columnas = ['cadaCuantosMinutosHaceEventosEnUnDiaYHoraDelMes5_'+str(columna) for columna in df_cadaCuantosMinutosHaceEventosEnUnDiaYHoraDelMes5_conOneHot.columns]
nombres_columnas[0] = 'person'
df_cadaCuantosMinutosHaceEventosEnUnDiaYHoraDelMes5_conOneHot.columns = nombres_columnas

In [291]:
df_cadaCuantosMinutosHaceEventosEnUnDiaYHoraDelMes5_conOneHot.head()

Unnamed: 0,person,cadaCuantosMinutosHaceEventosEnUnDiaYHoraDelMes5_-1.0,cadaCuantosMinutosHaceEventosEnUnDiaYHoraDelMes5_1.0,cadaCuantosMinutosHaceEventosEnUnDiaYHoraDelMes5_0.0,cadaCuantosMinutosHaceEventosEnUnDiaYHoraDelMes5_3.0,cadaCuantosMinutosHaceEventosEnUnDiaYHoraDelMes5_2.0,cadaCuantosMinutosHaceEventosEnUnDiaYHoraDelMes5_6.0,cadaCuantosMinutosHaceEventosEnUnDiaYHoraDelMes5_4.0,cadaCuantosMinutosHaceEventosEnUnDiaYHoraDelMes5_7.0,cadaCuantosMinutosHaceEventosEnUnDiaYHoraDelMes5_5.0,cadaCuantosMinutosHaceEventosEnUnDiaYHoraDelMes5_8.0,cadaCuantosMinutosHaceEventosEnUnDiaYHoraDelMes5_9.0,cadaCuantosMinutosHaceEventosEnUnDiaYHoraDelMes5_12.0,cadaCuantosMinutosHaceEventosEnUnDiaYHoraDelMes5_17.0,cadaCuantosMinutosHaceEventosEnUnDiaYHoraDelMes5_16.0,cadaCuantosMinutosHaceEventosEnUnDiaYHoraDelMes5_10.0
0,4886f805,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
1,ad93850f,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
2,0297fc1e,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
3,2d681dd8,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
4,cccea85e,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0


#### Cada cuantos segundos hace eventos en un dia y hora mes 5

In [292]:
df_cadaCuantosSegundosHaceEventosEnUnDiaYHoraDelMes5 = df_events[df_events['month'] == 5]

In [293]:
df_cadaCuantosSegundosHaceEventosEnUnDiaYHoraDelMes5 = df_cadaCuantosSegundosHaceEventosEnUnDiaYHoraDelMes5[['person','timestamp','day','hour']]

In [294]:
df_cadaCuantosSegundosHaceEventosEnUnDiaYHoraDelMes5 = df_cadaCuantosSegundosHaceEventosEnUnDiaYHoraDelMes5.drop_duplicates()\
                                .sort_values(['timestamp'])

In [295]:
# Funcion de agregacion para poder calcular la diferencia entre 2 ingresos consecutivos pero para una persona.
def cadaCuantosSegundosHaceEventosEnUnDiaYHoraDelMes5(S):
    if(len(S) == 1): return 0
    return round(np.mean((S - S.shift()).dropna().dt.total_seconds()))

In [296]:
df_cadaCuantosSegundosHaceEventosEnUnDiaYHoraDelMes5 = (df_cadaCuantosSegundosHaceEventosEnUnDiaYHoraDelMes5\
        .groupby(['person','day','hour'])['timestamp'].agg(cadaCuantosSegundosHaceEventosEnUnDiaYHoraDelMes5)).to_frame()


In [297]:
df_cadaCuantosSegundosHaceEventosEnUnDiaYHoraDelMes5 = df_cadaCuantosSegundosHaceEventosEnUnDiaYHoraDelMes5\
                .groupby('person')['timestamp'].mean().to_frame().reset_index()
df_cadaCuantosSegundosHaceEventosEnUnDiaYHoraDelMes5['timestamp'] = df_cadaCuantosSegundosHaceEventosEnUnDiaYHoraDelMes5['timestamp'].apply(round)

In [298]:
df_cadaCuantosSegundosHaceEventosEnUnDiaYHoraDelMes5.columns = ['person','cadaCuantosSegundosHaceEventosEnUnDiaYHoraDelMes5']

In [299]:
df_cadaCuantosSegundosHaceEventosEnUnDiaYHoraDelMes5 = pd.merge(df_todas_las_personas,df_cadaCuantosSegundosHaceEventosEnUnDiaYHoraDelMes5\
                                                       ,on = 'person', how = 'left')


In [300]:
df_cadaCuantosSegundosHaceEventosEnUnDiaYHoraDelMes5 = df_cadaCuantosSegundosHaceEventosEnUnDiaYHoraDelMes5.fillna(-1)

In [301]:
df_cadaCuantosSegundosHaceEventosEnUnDiaYHoraDelMes5.head()

Unnamed: 0,person,cadaCuantosSegundosHaceEventosEnUnDiaYHoraDelMes5
0,4886f805,198.0
1,ad93850f,69.0
2,0297fc1e,62.0
3,2d681dd8,27.0
4,cccea85e,41.0


In [302]:
df_cadaCuantosSegundosHaceEventosEnUnDiaYHoraDelMes5.shape

(38829, 2)

Con one hot encoding

In [303]:
a = pd.merge(df_labels,df_cadaCuantosSegundosHaceEventosEnUnDiaYHoraDelMes5,on = 'person', how = 'inner')
a = a.groupby(['cadaCuantosSegundosHaceEventosEnUnDiaYHoraDelMes5','label']).count().unstack()
a.columns = ['neg','pos']
a['neg'] = a['neg'] / a['neg'].sum()
a['pos'] = a['pos'] / a['pos'].sum()
a['dif'] = np.abs(a['pos'] - a['neg'])
a.sort_values('dif',ascending = False, inplace = True)
a.head()

Unnamed: 0_level_0,neg,pos,dif
cadaCuantosSegundosHaceEventosEnUnDiaYHoraDelMes5,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
-1.0,0.037648,0.161224,0.123577
10.0,0.022513,0.008163,0.014349
16.0,0.023706,0.011224,0.012482
22.0,0.021591,0.010204,0.011386
6.0,0.019204,0.009184,0.01002


In [305]:
K = 15
top_columnas = list(a.head(K).index)
top_columnas2 = list(df_cadaCuantosSegundosHaceEventosEnUnDiaYHoraDelMes5['cadaCuantosSegundosHaceEventosEnUnDiaYHoraDelMes5'].value_counts().head(K).index)
df_cadaCuantosSegundosHaceEventosEnUnDiaYHoraDelMes5_conOneHot = pd.concat([df_cadaCuantosSegundosHaceEventosEnUnDiaYHoraDelMes5['person'],pd.get_dummies(df_cadaCuantosSegundosHaceEventosEnUnDiaYHoraDelMes5['cadaCuantosSegundosHaceEventosEnUnDiaYHoraDelMes5'])[top_columnas]],axis = 1)

In [306]:
nombres_columnas = ['cadaCuantosSegundosHaceEventosEnUnDiaYHoraDelMes5_'+str(columna) for columna in df_cadaCuantosSegundosHaceEventosEnUnDiaYHoraDelMes5_conOneHot.columns]
nombres_columnas[0] = 'person'
df_cadaCuantosSegundosHaceEventosEnUnDiaYHoraDelMes5_conOneHot.columns = nombres_columnas

In [307]:
df_cadaCuantosSegundosHaceEventosEnUnDiaYHoraDelMes5_conOneHot.head()

Unnamed: 0,person,cadaCuantosSegundosHaceEventosEnUnDiaYHoraDelMes5_-1.0,cadaCuantosSegundosHaceEventosEnUnDiaYHoraDelMes5_10.0,cadaCuantosSegundosHaceEventosEnUnDiaYHoraDelMes5_16.0,cadaCuantosSegundosHaceEventosEnUnDiaYHoraDelMes5_22.0,cadaCuantosSegundosHaceEventosEnUnDiaYHoraDelMes5_6.0,cadaCuantosSegundosHaceEventosEnUnDiaYHoraDelMes5_1.0,cadaCuantosSegundosHaceEventosEnUnDiaYHoraDelMes5_7.0,cadaCuantosSegundosHaceEventosEnUnDiaYHoraDelMes5_17.0,cadaCuantosSegundosHaceEventosEnUnDiaYHoraDelMes5_28.0,cadaCuantosSegundosHaceEventosEnUnDiaYHoraDelMes5_8.0,cadaCuantosSegundosHaceEventosEnUnDiaYHoraDelMes5_23.0,cadaCuantosSegundosHaceEventosEnUnDiaYHoraDelMes5_71.0,cadaCuantosSegundosHaceEventosEnUnDiaYHoraDelMes5_15.0,cadaCuantosSegundosHaceEventosEnUnDiaYHoraDelMes5_13.0,cadaCuantosSegundosHaceEventosEnUnDiaYHoraDelMes5_44.0
0,4886f805,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,ad93850f,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0297fc1e,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,2d681dd8,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,cccea85e,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


#### Cada cuantas horas hace eventos en su ultima conexion (dia) en el mes 5

In [308]:
df_cadaCuantasHorasHaceEventosEnSuUltimaConexionDelMes5 = df_events[df_events['month'] == 5]

In [309]:
df_cadaCuantasHorasHaceEventosEnSuUltimaConexionDelMes5 = df_cadaCuantasHorasHaceEventosEnSuUltimaConexionDelMes5[['person','day','hour']]

In [310]:
df_cadaCuantasHorasHaceEventosEnSuUltimaConexionDelMes5 = df_cadaCuantasHorasHaceEventosEnSuUltimaConexionDelMes5.drop_duplicates()\
                                .sort_values(['day','hour'])

In [311]:
# Funcion de agregacion para poder calcular la diferencia entre 2 ingresos consecutivos pero para una persona.
def cadaCuantasHorasHaceEventosEnSuUltimaConexionDelMes5(S):
    if(len(S) == 1): return 0
    return round(np.mean((S - S.shift()).dropna()))

In [312]:
df_cadaCuantasHorasHaceEventosEnSuUltimaConexionDelMes5 = (df_cadaCuantasHorasHaceEventosEnSuUltimaConexionDelMes5\
        .groupby(['person','day'])['hour'].agg(cadaCuantasHorasHaceEventosEnSuUltimaConexionDelMes5)).to_frame()


In [313]:
df_cadaCuantasHorasHaceEventosEnSuUltimaConexionDelMes5 = df_cadaCuantasHorasHaceEventosEnSuUltimaConexionDelMes5\
                .groupby('person').last().reset_index()

In [314]:
df_cadaCuantasHorasHaceEventosEnSuUltimaConexionDelMes5.columns = ['person','cadaCuantasHorasHaceEventosEnSuUltimaConexionDelMes5']

In [315]:
df_cadaCuantasHorasHaceEventosEnSuUltimaConexionDelMes5 = pd.merge(df_todas_las_personas,df_cadaCuantasHorasHaceEventosEnSuUltimaConexionDelMes5\
                                                       ,on = 'person', how = 'left')


In [316]:
df_cadaCuantasHorasHaceEventosEnSuUltimaConexionDelMes5 = df_cadaCuantasHorasHaceEventosEnSuUltimaConexionDelMes5.fillna(-1)

In [317]:
df_cadaCuantasHorasHaceEventosEnSuUltimaConexionDelMes5.head()

Unnamed: 0,person,cadaCuantasHorasHaceEventosEnSuUltimaConexionDelMes5
0,4886f805,0.0
1,ad93850f,0.0
2,0297fc1e,0.0
3,2d681dd8,0.0
4,cccea85e,5.0


Con one hot encoding

In [318]:
a = pd.merge(df_labels,df_cadaCuantasHorasHaceEventosEnSuUltimaConexionDelMes5,on = 'person', how = 'inner')
a = a.groupby(['cadaCuantasHorasHaceEventosEnSuUltimaConexionDelMes5','label']).count().unstack()
a.columns = ['neg','pos']
a['neg'] = a['neg'] / a['neg'].sum()
a['pos'] = a['pos'] / a['pos'].sum()
a['dif'] = np.abs(a['pos'] - a['neg'])
a.sort_values('dif',ascending = False, inplace = True)
a.head()

Unnamed: 0_level_0,neg,pos,dif
cadaCuantasHorasHaceEventosEnSuUltimaConexionDelMes5,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0.0,0.674948,0.522449,0.152499
-1.0,0.037648,0.161224,0.123577
1.0,0.141098,0.123469,0.017629
4.0,0.018173,0.027551,0.009378
6.0,0.012857,0.021429,0.008572


In [319]:
K = 15
top_columnas = list(a.head(K).index)
top_columnas2 = list(df_cadaCuantasHorasHaceEventosEnSuUltimaConexionDelMes5['cadaCuantasHorasHaceEventosEnSuUltimaConexionDelMes5'].value_counts().head(K).index)
df_cadaCuantasHorasHaceEventosEnSuUltimaConexionDelMes5_conOneHot = pd.concat([df_cadaCuantasHorasHaceEventosEnSuUltimaConexionDelMes5['person'],pd.get_dummies(df_cadaCuantasHorasHaceEventosEnSuUltimaConexionDelMes5['cadaCuantasHorasHaceEventosEnSuUltimaConexionDelMes5'])[top_columnas]],axis = 1)

In [320]:
nombres_columnas = ['cadaCuantasHorasHaceEventosEnSuUltimaConexionDelMes5_'+str(columna) for columna in df_cadaCuantasHorasHaceEventosEnSuUltimaConexionDelMes5_conOneHot.columns]
nombres_columnas[0] = 'person'
df_cadaCuantasHorasHaceEventosEnSuUltimaConexionDelMes5_conOneHot.columns = nombres_columnas

In [321]:
df_cadaCuantasHorasHaceEventosEnSuUltimaConexionDelMes5_conOneHot.head()

Unnamed: 0,person,cadaCuantasHorasHaceEventosEnSuUltimaConexionDelMes5_0.0,cadaCuantasHorasHaceEventosEnSuUltimaConexionDelMes5_-1.0,cadaCuantasHorasHaceEventosEnSuUltimaConexionDelMes5_1.0,cadaCuantasHorasHaceEventosEnSuUltimaConexionDelMes5_4.0,cadaCuantasHorasHaceEventosEnSuUltimaConexionDelMes5_6.0,cadaCuantasHorasHaceEventosEnSuUltimaConexionDelMes5_2.0,cadaCuantasHorasHaceEventosEnSuUltimaConexionDelMes5_8.0,cadaCuantasHorasHaceEventosEnSuUltimaConexionDelMes5_7.0,cadaCuantasHorasHaceEventosEnSuUltimaConexionDelMes5_15.0,cadaCuantasHorasHaceEventosEnSuUltimaConexionDelMes5_19.0,cadaCuantasHorasHaceEventosEnSuUltimaConexionDelMes5_13.0,cadaCuantasHorasHaceEventosEnSuUltimaConexionDelMes5_23.0,cadaCuantasHorasHaceEventosEnSuUltimaConexionDelMes5_10.0,cadaCuantasHorasHaceEventosEnSuUltimaConexionDelMes5_12.0,cadaCuantasHorasHaceEventosEnSuUltimaConexionDelMes5_14.0
0,4886f805,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,ad93850f,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0297fc1e,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,2d681dd8,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,cccea85e,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


#### Cada cuantos minutos hace eventos en su ultima conexion (dia) en el mes 5

In [322]:
df_cadaCuantosMinutosHaceEventosEnSuUltimaConexionDelMes5 = df_events[df_events['month'] == 5]

In [323]:
df_cadaCuantosMinutosHaceEventosEnSuUltimaConexionDelMes5 = df_cadaCuantosMinutosHaceEventosEnSuUltimaConexionDelMes5[['person','day','hour','minute']]

In [324]:
df_cadaCuantosMinutosHaceEventosEnSuUltimaConexionDelMes5 = df_cadaCuantosMinutosHaceEventosEnSuUltimaConexionDelMes5.drop_duplicates()\
                                .sort_values(['day','hour','minute'])

In [325]:
# Funcion de agregacion para poder calcular la diferencia entre 2 ingresos consecutivos pero para una persona.
def cadaCuantosMinutosHaceEventosEnSuUltimaConexionDelMes5(S):
    if(len(S) == 1): return 0
    return round(np.mean((S - S.shift()).dropna()))

In [326]:
df_cadaCuantosMinutosHaceEventosEnSuUltimaConexionDelMes5 = (df_cadaCuantosMinutosHaceEventosEnSuUltimaConexionDelMes5\
        .groupby(['person','day','hour'])['minute'].agg(cadaCuantosMinutosHaceEventosEnSuUltimaConexionDelMes5)).to_frame()


In [327]:
df_cadaCuantosMinutosHaceEventosEnSuUltimaConexionDelMes5 = df_cadaCuantosMinutosHaceEventosEnSuUltimaConexionDelMes5\
            .groupby(['person','day']).mean().groupby('person').last().reset_index()

In [328]:
df_cadaCuantosMinutosHaceEventosEnSuUltimaConexionDelMes5.columns = ['person','cadaCuantosMinutosHaceEventosEnSuUltimaConexionDelMes5']

In [329]:
df_cadaCuantosMinutosHaceEventosEnSuUltimaConexionDelMes5 = pd.merge(df_todas_las_personas,df_cadaCuantosMinutosHaceEventosEnSuUltimaConexionDelMes5\
                                                       ,on = 'person', how = 'left')


In [330]:
df_cadaCuantosMinutosHaceEventosEnSuUltimaConexionDelMes5 = df_cadaCuantosMinutosHaceEventosEnSuUltimaConexionDelMes5.fillna(-1)

In [331]:
df_cadaCuantosMinutosHaceEventosEnSuUltimaConexionDelMes5.head()

Unnamed: 0,person,cadaCuantosMinutosHaceEventosEnSuUltimaConexionDelMes5
0,4886f805,8.0
1,ad93850f,9.0
2,0297fc1e,1.0
3,2d681dd8,1.0
4,cccea85e,9.75


Con one hot encoding

In [332]:
a = pd.merge(df_labels,df_cadaCuantosMinutosHaceEventosEnSuUltimaConexionDelMes5,on = 'person', how = 'inner')
a = a.groupby(['cadaCuantosMinutosHaceEventosEnSuUltimaConexionDelMes5','label']).count().unstack()
a.columns = ['neg','pos']
a['neg'] = a['neg'] / a['neg'].sum()
a['pos'] = a['pos'] / a['pos'].sum()
a['dif'] = np.abs(a['pos'] - a['neg'])
a.sort_values('dif',ascending = False, inplace = True)
a.head()

Unnamed: 0_level_0,neg,pos,dif
cadaCuantosMinutosHaceEventosEnSuUltimaConexionDelMes5,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
-1.0,0.037648,0.161224,0.123577
1.0,0.304329,0.229592,0.074737
0.0,0.242215,0.192857,0.049358
2.0,0.13985,0.113265,0.026585
3.0,0.042639,0.055102,0.012463


In [333]:
K = 15
top_columnas = list(a.head(K).index)
top_columnas2 = list(df_cadaCuantosMinutosHaceEventosEnSuUltimaConexionDelMes5['cadaCuantosMinutosHaceEventosEnSuUltimaConexionDelMes5'].value_counts().head(K).index)
df_cadaCuantosMinutosHaceEventosEnSuUltimaConexionDelMes5_conOneHot = pd.concat([df_cadaCuantosMinutosHaceEventosEnSuUltimaConexionDelMes5['person'],pd.get_dummies(df_cadaCuantosMinutosHaceEventosEnSuUltimaConexionDelMes5['cadaCuantosMinutosHaceEventosEnSuUltimaConexionDelMes5'])[top_columnas]],axis = 1)

In [334]:
nombres_columnas = ['cadaCuantosMinutosHaceEventosEnSuUltimaConexionDelMes5_'+str(columna) for columna in df_cadaCuantosMinutosHaceEventosEnSuUltimaConexionDelMes5_conOneHot.columns]
nombres_columnas[0] = 'person'
df_cadaCuantosMinutosHaceEventosEnSuUltimaConexionDelMes5_conOneHot.columns = nombres_columnas

In [335]:
df_cadaCuantosMinutosHaceEventosEnSuUltimaConexionDelMes5_conOneHot.head()

Unnamed: 0,person,cadaCuantosMinutosHaceEventosEnSuUltimaConexionDelMes5_-1.0,cadaCuantosMinutosHaceEventosEnSuUltimaConexionDelMes5_1.0,cadaCuantosMinutosHaceEventosEnSuUltimaConexionDelMes5_0.0,cadaCuantosMinutosHaceEventosEnSuUltimaConexionDelMes5_2.0,cadaCuantosMinutosHaceEventosEnSuUltimaConexionDelMes5_3.0,cadaCuantosMinutosHaceEventosEnSuUltimaConexionDelMes5_3.5,cadaCuantosMinutosHaceEventosEnSuUltimaConexionDelMes5_1.5,cadaCuantosMinutosHaceEventosEnSuUltimaConexionDelMes5_0.5,cadaCuantosMinutosHaceEventosEnSuUltimaConexionDelMes5_5.0,cadaCuantosMinutosHaceEventosEnSuUltimaConexionDelMes5_6.0,cadaCuantosMinutosHaceEventosEnSuUltimaConexionDelMes5_4.0,cadaCuantosMinutosHaceEventosEnSuUltimaConexionDelMes5_1.3333333333333333,cadaCuantosMinutosHaceEventosEnSuUltimaConexionDelMes5_3.6666666666666665,cadaCuantosMinutosHaceEventosEnSuUltimaConexionDelMes5_0.3333333333333333,cadaCuantosMinutosHaceEventosEnSuUltimaConexionDelMes5_1.6666666666666667
0,4886f805,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,ad93850f,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0297fc1e,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
3,2d681dd8,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
4,cccea85e,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


#### Cada cuantos segundos (en cada hora)  hace eventos en su ultima conexion del mes 5 

In [336]:
df_cadaCuantosSegundosHaceEventosEnSuUltimaConexionDelMes5 = df_events[df_events['month'] == 5]

In [337]:
df_cadaCuantosSegundosHaceEventosEnSuUltimaConexionDelMes5 = df_cadaCuantosSegundosHaceEventosEnSuUltimaConexionDelMes5[['person','timestamp','day','hour']]

In [338]:
df_cadaCuantosSegundosHaceEventosEnSuUltimaConexionDelMes5 = df_cadaCuantosSegundosHaceEventosEnSuUltimaConexionDelMes5.drop_duplicates()\
                                .sort_values(['timestamp'])

In [339]:
# Funcion de agregacion para poder calcular la diferencia entre 2 ingresos consecutivos pero para una persona.
def cadaCuantosSegundosHaceEventosEnSuUltimaConexionDelMes5(S):
    if(len(S) == 1): return 0
    return round(np.mean((S - S.shift()).dropna().dt.total_seconds()))

In [340]:
df_cadaCuantosSegundosHaceEventosEnSuUltimaConexionDelMes5 = (df_cadaCuantosSegundosHaceEventosEnSuUltimaConexionDelMes5\
        .groupby(['person','day','hour'])['timestamp'].agg(cadaCuantosSegundosHaceEventosEnSuUltimaConexionDelMes5)).to_frame()


In [341]:
df_cadaCuantosSegundosHaceEventosEnSuUltimaConexionDelMes5 = df_cadaCuantosSegundosHaceEventosEnSuUltimaConexionDelMes5\
    .groupby(['person','day']).mean().groupby('person').last().reset_index()

In [342]:
df_cadaCuantosSegundosHaceEventosEnSuUltimaConexionDelMes5.columns = ['person','cadaCuantosSegundosHaceEventosEnSuUltimaConexionDelMes5']

In [343]:
df_cadaCuantosSegundosHaceEventosEnSuUltimaConexionDelMes5 = pd.merge(df_todas_las_personas,df_cadaCuantosSegundosHaceEventosEnSuUltimaConexionDelMes5\
                                                       ,on = 'person', how = 'left')


In [344]:
df_cadaCuantosSegundosHaceEventosEnSuUltimaConexionDelMes5 = df_cadaCuantosSegundosHaceEventosEnSuUltimaConexionDelMes5.fillna(-1)

In [345]:
df_cadaCuantosSegundosHaceEventosEnSuUltimaConexionDelMes5.shape

(38829, 2)

Con one hot encoding

In [346]:
a = pd.merge(df_labels,df_cadaCuantosSegundosHaceEventosEnSuUltimaConexionDelMes5,on = 'person', how = 'inner')
a = a.groupby(['cadaCuantosSegundosHaceEventosEnSuUltimaConexionDelMes5','label']).count().unstack()
a.columns = ['neg','pos']
a['neg'] = a['neg'] / a['neg'].sum()
a['pos'] = a['pos'] / a['pos'].sum()
a['dif'] = np.abs(a['pos'] - a['neg'])
a.sort_values('dif',ascending = False, inplace = True)
a.head()

Unnamed: 0_level_0,neg,pos,dif
cadaCuantosSegundosHaceEventosEnSuUltimaConexionDelMes5,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
-1.0,0.037648,0.161224,0.123577
6.0,0.018227,0.007143,0.011084
14.0,0.016437,0.006122,0.010315
10.0,0.017956,0.009184,0.008772
12.0,0.017088,0.010204,0.006884


In [347]:
K = 15
top_columnas = list(a.head(K).index)
top_columnas2 = list(df_cadaCuantosSegundosHaceEventosEnSuUltimaConexionDelMes5['cadaCuantosSegundosHaceEventosEnSuUltimaConexionDelMes5'].value_counts().head(K).index)
df_cadaCuantosSegundosHaceEventosEnSuUltimaConexionDelMes5_conOneHot = pd.concat([df_cadaCuantosSegundosHaceEventosEnSuUltimaConexionDelMes5['person'],pd.get_dummies(df_cadaCuantosSegundosHaceEventosEnSuUltimaConexionDelMes5['cadaCuantosSegundosHaceEventosEnSuUltimaConexionDelMes5'])[top_columnas]],axis = 1)

In [348]:
nombres_columnas = ['cadaCuantosSegundosHaceEventosEnSuUltimaConexionDelMes5_'+str(columna) for columna in df_cadaCuantosSegundosHaceEventosEnSuUltimaConexionDelMes5_conOneHot.columns]
nombres_columnas[0] = 'person'
df_cadaCuantosSegundosHaceEventosEnSuUltimaConexionDelMes5_conOneHot.columns = nombres_columnas

In [349]:
df_cadaCuantosSegundosHaceEventosEnSuUltimaConexionDelMes5_conOneHot.head()

Unnamed: 0,person,cadaCuantosSegundosHaceEventosEnSuUltimaConexionDelMes5_-1.0,cadaCuantosSegundosHaceEventosEnSuUltimaConexionDelMes5_6.0,cadaCuantosSegundosHaceEventosEnSuUltimaConexionDelMes5_14.0,cadaCuantosSegundosHaceEventosEnSuUltimaConexionDelMes5_10.0,cadaCuantosSegundosHaceEventosEnSuUltimaConexionDelMes5_12.0,cadaCuantosSegundosHaceEventosEnSuUltimaConexionDelMes5_16.0,cadaCuantosSegundosHaceEventosEnSuUltimaConexionDelMes5_21.0,cadaCuantosSegundosHaceEventosEnSuUltimaConexionDelMes5_31.0,cadaCuantosSegundosHaceEventosEnSuUltimaConexionDelMes5_24.0,cadaCuantosSegundosHaceEventosEnSuUltimaConexionDelMes5_18.0,cadaCuantosSegundosHaceEventosEnSuUltimaConexionDelMes5_41.0,cadaCuantosSegundosHaceEventosEnSuUltimaConexionDelMes5_8.0,cadaCuantosSegundosHaceEventosEnSuUltimaConexionDelMes5_23.0,cadaCuantosSegundosHaceEventosEnSuUltimaConexionDelMes5_7.0,cadaCuantosSegundosHaceEventosEnSuUltimaConexionDelMes5_9.0
0,4886f805,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,ad93850f,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0297fc1e,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
3,2d681dd8,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,cccea85e,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


##  Cada cuanto hace conversion

#### Cada cuantos dias hace conversion

In [350]:
df_cadaCuantosDiasHaceConversion = df_events[df_events['event'] == 'conversion']

In [351]:
df_cadaCuantosDiasHaceConversion = df_cadaCuantosDiasHaceConversion[['person','timestamp','month','day']]

In [352]:
df_cadaCuantosDiasHaceConversion = df_cadaCuantosDiasHaceConversion.drop_duplicates(['month','day','person'])\
                                .sort_values('timestamp')

In [353]:
# Funcion de agregacion para poder calcular la diferencia entre 2 ingresos consecutivos pero para una persona.
def cadaCuantosDiasHaceConversion(S):
    if(len(S) == 1): return 0
    return round(np.mean((S - S.shift()).dropna().dt.days))

In [354]:
df_cadaCuantosDiasHaceConversion = (df_cadaCuantosDiasHaceConversion.groupby('person')['timestamp']\
                                 .agg(cadaCuantosDiasHaceConversion)).to_frame().reset_index()
df_cadaCuantosDiasHaceConversion.columns = ['person','cadaCuantosDiasHaceConversion']

In [355]:
df_cadaCuantosDiasHaceConversion = pd.merge(df_todas_las_personas,df_cadaCuantosDiasHaceConversion\
                                                       ,on = 'person', how = 'left')

In [356]:
df_cadaCuantosDiasHaceConversion = df_cadaCuantosDiasHaceConversion.fillna(-1)

In [357]:
df_cadaCuantosDiasHaceConversion.head()

Unnamed: 0,person,cadaCuantosDiasHaceConversion
0,4886f805,-1.0
1,ad93850f,-1.0
2,0297fc1e,-1.0
3,2d681dd8,-1.0
4,cccea85e,-1.0


In [358]:
df_cadaCuantosDiasHaceConversion.shape

(38829, 2)

Con one hot encoding

In [359]:
a = pd.merge(df_labels,df_cadaCuantosDiasHaceConversion,on = 'person', how = 'inner')
a = a.groupby(['cadaCuantosDiasHaceConversion','label']).count().unstack()
a.columns = ['neg','pos']
a['neg'] = a['neg'] / a['neg'].sum()
a['pos'] = a['pos'] / a['pos'].sum()
a['dif'] = np.abs(a['pos'] - a['neg'])
a.sort_values('dif',ascending = False, inplace = True)
a.head()

Unnamed: 0_level_0,neg,pos,dif
cadaCuantosDiasHaceConversion,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
-1.0,0.893295,0.79898,0.094315
0.0,0.091027,0.143878,0.05285
1.0,0.002387,0.012245,0.009858
3.0,0.000542,0.005102,0.00456
34.0,0.000217,0.004082,0.003865


In [360]:
K = 15
top_columnas = list(a.head(K).index)
top_columnas2 = list(df_cadaCuantosDiasHaceConversion['cadaCuantosDiasHaceConversion'].value_counts().head(K).index)
df_cadaCuantosDiasHaceConversion_conOneHot = pd.concat([df_cadaCuantosDiasHaceConversion['person'],pd.get_dummies(df_cadaCuantosDiasHaceConversion['cadaCuantosDiasHaceConversion'])[top_columnas]],axis = 1)

In [361]:
nombres_columnas = ['cadaCuantosDiasHaceConversion_'+str(columna) for columna in df_cadaCuantosDiasHaceConversion_conOneHot.columns]
nombres_columnas[0] = 'person'
df_cadaCuantosDiasHaceConversion_conOneHot.columns = nombres_columnas

In [362]:
df_cadaCuantosDiasHaceConversion_conOneHot.head()

Unnamed: 0,person,cadaCuantosDiasHaceConversion_-1.0,cadaCuantosDiasHaceConversion_0.0,cadaCuantosDiasHaceConversion_1.0,cadaCuantosDiasHaceConversion_3.0,cadaCuantosDiasHaceConversion_34.0,cadaCuantosDiasHaceConversion_4.0,cadaCuantosDiasHaceConversion_20.0,cadaCuantosDiasHaceConversion_9.0,cadaCuantosDiasHaceConversion_2.0,cadaCuantosDiasHaceConversion_12.0,cadaCuantosDiasHaceConversion_6.0,cadaCuantosDiasHaceConversion_33.0,cadaCuantosDiasHaceConversion_35.0,cadaCuantosDiasHaceConversion_38.0,cadaCuantosDiasHaceConversion_41.0
0,4886f805,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,ad93850f,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0297fc1e,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,2d681dd8,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,cccea85e,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0


##  Cada cuanto hace checkout

#### Cada cuantos dias hace checkout

In [363]:
df_cadaCuantosDiasHaceCheckout = df_events[df_events['event'] == 'checkout']

In [364]:
df_cadaCuantosDiasHaceCheckout = df_cadaCuantosDiasHaceCheckout[['person','timestamp','month','day']]

In [365]:
df_cadaCuantosDiasHaceCheckout = df_cadaCuantosDiasHaceCheckout.drop_duplicates(['month','day','person'])\
                                .sort_values('timestamp')

In [366]:
# Funcion de agregacion para poder calcular la diferencia entre 2 ingresos consecutivos pero para una persona.
def cadaCuantosDiasHaceCheckout(S):
    if(len(S) == 1): return 0
    return round(np.mean((S - S.shift()).dropna().dt.days))

In [367]:
df_cadaCuantosDiasHaceCheckout = (df_cadaCuantosDiasHaceCheckout.groupby('person')['timestamp']\
                                 .agg(cadaCuantosDiasHaceCheckout)).to_frame().reset_index()
df_cadaCuantosDiasHaceCheckout.columns = ['person','cadaCuantosDiasHaceCheckout']

In [368]:
df_cadaCuantosDiasHaceCheckout = pd.merge(df_todas_las_personas,df_cadaCuantosDiasHaceCheckout\
                                                       ,on = 'person', how = 'left')

In [369]:
df_cadaCuantosDiasHaceCheckout = df_cadaCuantosDiasHaceCheckout.fillna(-1)

In [370]:
df_cadaCuantosDiasHaceCheckout.head()

Unnamed: 0,person,cadaCuantosDiasHaceCheckout
0,4886f805,0.0
1,ad93850f,0.0
2,0297fc1e,24.0
3,2d681dd8,0.0
4,cccea85e,0.0


In [371]:
df_cadaCuantosDiasHaceCheckout.shape

(38829, 2)

Con one hot encoding

In [372]:
a = pd.merge(df_labels,df_cadaCuantosDiasHaceCheckout,on = 'person', how = 'inner')
a = a.groupby(['cadaCuantosDiasHaceCheckout','label']).count().unstack()
a.columns = ['neg','pos']
a['neg'] = a['neg'] / a['neg'].sum()
a['pos'] = a['pos'] / a['pos'].sum()
a['dif'] = np.abs(a['pos'] - a['neg'])
a.sort_values('dif',ascending = False, inplace = True)
a.head()

Unnamed: 0_level_0,neg,pos,dif
cadaCuantosDiasHaceCheckout,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0.0,0.740263,0.277551,0.462712
-1.0,0.131876,0.541837,0.409961
20.0,0.00179,0.009184,0.007394
2.0,0.012043,0.018367,0.006324
9.0,0.002929,0.008163,0.005234


In [373]:
K = 15
top_columnas = list(a.head(K).index)
top_columnas2 = list(df_cadaCuantosDiasHaceCheckout['cadaCuantosDiasHaceCheckout'].value_counts().head(K).index)
df_cadaCuantosDiasHaceCheckout_conOneHot = pd.concat([df_cadaCuantosDiasHaceCheckout['person'],pd.get_dummies(df_cadaCuantosDiasHaceCheckout['cadaCuantosDiasHaceCheckout'])[top_columnas]],axis = 1)

In [374]:
nombres_columnas = ['cadaCuantosDiasHaceCheckout_'+str(columna) for columna in df_cadaCuantosDiasHaceCheckout_conOneHot.columns]
nombres_columnas[0] = 'person'
df_cadaCuantosDiasHaceCheckout_conOneHot.columns = nombres_columnas

In [375]:
df_cadaCuantosDiasHaceCheckout_conOneHot.head()

Unnamed: 0,person,cadaCuantosDiasHaceCheckout_0.0,cadaCuantosDiasHaceCheckout_-1.0,cadaCuantosDiasHaceCheckout_20.0,cadaCuantosDiasHaceCheckout_2.0,cadaCuantosDiasHaceCheckout_9.0,cadaCuantosDiasHaceCheckout_30.0,cadaCuantosDiasHaceCheckout_5.0,cadaCuantosDiasHaceCheckout_3.0,cadaCuantosDiasHaceCheckout_14.0,cadaCuantosDiasHaceCheckout_4.0,cadaCuantosDiasHaceCheckout_22.0,cadaCuantosDiasHaceCheckout_1.0,cadaCuantosDiasHaceCheckout_31.0,cadaCuantosDiasHaceCheckout_23.0,cadaCuantosDiasHaceCheckout_12.0
0,4886f805,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,ad93850f,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0297fc1e,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,2d681dd8,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,cccea85e,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0


#### -----------------------------------------------------------------------------------------------------------------------------

In [522]:
df_train2 = pd.merge(df_cadaCuantosDiasHaceEventos,df_cadaCuantasHorasHaceEventos\
                    , on = 'person', how = 'inner')
df_train2 = pd.merge(df_train2,df_cadaCuantosMinutosHaceEventos\
                    , on = 'person', how = 'inner')
df_train2 = pd.merge(df_train2,df_cadaCuantosDiasEnMes5HaceEventos\
                    , on = 'person', how = 'inner')
df_train2 = pd.merge(df_train2,df_cadaCuantasHorasEnUnDiaDelMes5HaceEventos\
                    , on = 'person', how = 'inner')
df_train2 = pd.merge(df_train2,df_cadaCuantosMinutosHaceEventosEnUnDiaYHoraDelMes5\
                    , on = 'person', how = 'inner')
df_train2 = pd.merge(df_train2,df_cadaCuantosSegundosHaceEventosEnUnDiaYHoraDelMes5\
                    , on = 'person', how = 'inner')
df_train2 = pd.merge(df_train2,df_cadaCuantasHorasHaceEventosEnSuUltimaConexionDelMes5\
                    , on = 'person', how = 'inner')
df_train2 = pd.merge(df_train2,df_cadaCuantosMinutosHaceEventosEnSuUltimaConexionDelMes5\
                    , on = 'person', how = 'inner')
df_train2 = pd.merge(df_train2,df_cadaCuantosSegundosHaceEventosEnSuUltimaConexionDelMes5\
                    , on = 'person', how = 'inner')
df_train2 = pd.merge(df_train2,df_cadaCuantosDiasHaceConversion\
                    , on = 'person', how = 'inner')
df_train2 = pd.merge(df_train2,df_cadaCuantosDiasHaceCheckout\
                    , on = 'person', how = 'inner')

In [427]:
santi_time = pd.read_csv("santi_timefeatures.csv")
santi_time.drop(columns = ['Unnamed: 0'], inplace = True)

In [428]:
santi_eventos = pd.read_csv("Santi_FeaturesConEventos.csv")
santi_eventos.drop(columns = ['Unnamed: 0'], inplace = True)

In [394]:
seba = pd.read_csv("../features_varios.csv")

In [536]:
magui = pd.read_csv("featuresMagui.csv")
magui.drop(columns = ['Unnamed: 0','device_type'], inplace = True)

In [None]:
# https://www.kaggle.com/tilii7/hyperparameter-grid-search-with-xgboost

In [552]:
# Solo para contar cuanto tarda en entrenar.
def timer(start_time=None):
    if not start_time:
        start_time = datetime.now()
        return start_time
    elif start_time:
        thour, temp_sec = divmod((datetime.now() - start_time).total_seconds(), 3600)
        tmin, tsec = divmod(temp_sec, 60)
        print('\n Time taken: %i hours %i minutes and %s seconds.' % (thour, tmin, round(tsec, 2)))

In [553]:
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.model_selection import StratifiedKFold

In [554]:
params = {
        'min_child_weight': [1, 5, 10],
        'gamma': [0.5, 1, 1.5, 2, 5],
        'subsample': [0.6, 0.8, 1.0],
        'colsample_bytree': [0.6, 0.8, 1.0],
        'max_depth': [3, 4, 5]
        }

In [617]:
df_train_con_labels = pd.merge(df_train,df_labels,how = 'inner', on ='person')

In [618]:
df_train_con_labels.drop(columns = ['person'],inplace = True)

In [619]:
X, y = df_train_con_labels.iloc[:,:-1],df_train_con_labels.iloc[:,-1]

In [620]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=123)

In [621]:
xg = xgb.XGBClassifier(learning_rate=0.02, n_estimators=600, objective='binary:logistic',
                    silent=True, nthread=1)

Stratification seeks to ensure that each fold is representative of all strata of the data. Generally this is done in a supervised way for classification and aims to ensure each class is (approximately) equally represented across each test fold (which are of course combined in a complementary way to form training folds).

In [610]:
y_train = y_train.reset_index(drop = True)

In [611]:
X_train = X_train.reset_index(drop=True)

In [577]:
from datetime import timedelta, datetime

In [642]:
folds = 10
param_comb = 5

skf = StratifiedKFold(n_splits=folds, shuffle = True, random_state = 1001)

random_search = RandomizedSearchCV(xg, param_distributions=params, n_iter=param_comb, scoring='balanced_accuracy', n_jobs=4, cv=skf.split(X_train,y_train), verbose=3, random_state=1001 )
#grid = GridSearchCV(estimator=xg, param_grid=params, scoring='roc_auc', n_jobs=4, cv=skf.split(X,y), verbose=3 )

start_time = timer(None) # timing starts from this point for "start_time" variable
random_search.fit(X_train, y_train)
#grid.fit(X, y)
timer(start_time) # timing ends here for "start_time" variable

Fitting 10 folds for each of 5 candidates, totalling 50 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  24 tasks      | elapsed:  3.0min
[Parallel(n_jobs=4)]: Done  50 out of  50 | elapsed:  6.1min finished



 Time taken: 0 hours 6 minutes and 28.13 seconds.


In [648]:
xg = random_search.best_estimator_
#xg = grid.best_estimator_

In [359]:
grid.best_params_

{'colsample_bytree': 1.0,
 'gamma': 5,
 'max_depth': 3,
 'min_child_weight': 5,
 'subsample': 0.6}

#### pruebita 

In [574]:
df_train = df_cadaCuantosDiasHaceEventos2

In [537]:
df_train_con_labels = pd.merge(df_train,df_labels,how = 'inner', on ='person')

In [538]:
df_train_con_labels.drop(columns = ['person'],inplace = True)

In [539]:
X, y = df_train_con_labels.iloc[:,:-1],df_train_con_labels.iloc[:,-1]

In [540]:
xg = xgb.XGBClassifier(objective ='binary:logistic', 
                colsample_bytree = 0.8, learning_rate = 0.1,
                max_depth = 5, n_estimators = 6, scale_pos_weight = 9.5, min_child_weight=1)

In [541]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=123)

In [649]:
xg.fit(X_train,y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1.0, gamma=1, learning_rate=0.02, max_delta_step=0,
       max_depth=4, min_child_weight=1, missing=None, n_estimators=600,
       n_jobs=1, nthread=1, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=0.8)

In [650]:
predsProb1 = pd.DataFrame(xg.predict_proba(X_test))[1]

In [651]:
predsLabel = pd.Series(xg.predict(X_test))

In [40]:
#trainAccuracy = accuracy_score(y_train, pd.Series(xg.predict(X_train)))
#testAccuracy = accuracy_score(y_test, predsLabel)

cohenKappaScore = cohen_kappa_score(y_test, predsLabel)
classificationReport = classification_report(y_test, predsLabel)
precisionScore = precision_score(y_test, predsLabel)
recallScore = recall_score(y_test, predsLabel)
f1Score = f1_score(y_test, predsLabel)
matrizDeConfusion = confusion_matrix(y_test, predsLabel)

meanSquaredError = mean_squared_error(y_test, predsProb1)
areaDebajoDeCurva = roc_auc_score(y_test, predsProb1)

In [42]:
# Pruebo todas las metricas.
# accuracy_score, roc_auc_score, mean_squared_error, confusion_matrix,
# f1_score,precision_score,recall_score,classification_report,cohen_kappa_score
# brierScoreLoss.
# Hay metricas para la probabilidad de que sea 1 y otras metricas para las predicciones de los labels.


# Metricas con LABELS.

print()
print("Metricas con Labels:")
print()

print("Train accuracy: ", trainAccuracy)
print("Test acuracy: ", testAccuracy)
print()
print("Classification Report:")
print(classificationReport)
print()
print("Precision Score: ",precisionScore)
print("Recall Score: ",recallScore)
print("F1 Score: ",f1Score)
print("Cohen Kappa Score: ",cohenKappaScore)
print()
print("Confusion matrix: ")
print(matrizDeConfusion)

# Metricas con PROBABILIDADES. 

print()
print("Metricas sin Labels:")
print()

print("ROC auc score: ", areaDebajoDeCurva)
print("Mean squared error: ", meanSquaredError)


Metricas con Labels:


Classification Report:
              precision    recall  f1-score   support

           0       0.95      1.00      0.97      3692
           1       1.00      0.01      0.01       191

   micro avg       0.95      0.95      0.95      3883
   macro avg       0.98      0.50      0.49      3883
weighted avg       0.95      0.95      0.93      3883


Precision Score:  1.0
Recall Score:  0.005235602094240838
F1 Score:  0.010416666666666668
Cohen Kappa Score:  0.009909361017991003

Confusion matrix: 
[[3692    0]
 [ 190    1]]

Metricas sin Labels:

ROC auc score:  0.6299612009552279
Mean squared error:  0.046719752675264896
