In [1]:
import pandas as pd  
import numpy as np  
import matplotlib.pyplot as plt 
%matplotlib inline

from datetime import timedelta

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score, mean_squared_error, confusion_matrix,\
                            f1_score,precision_score,recall_score,classification_report,cohen_kappa_score
import xgboost as xgb
from sklearn import preprocessing

In [2]:
# df_events información de eventos realizado en la plataforma para un conjunto de usuarios hasta el 31/05/2018
df_events = pd.read_csv('../../data/events_up_to_01062018.csv',dtype=object)
# df_train archivo labels_training_set.csv indica para un subconjunto de los usuarios incluidos en el set de 
# eventos events_up_to_01062018.csv si los mismos realizaron una conversión (columna label = 1) o no (columna 
# label = 0) desde el 01/06/2018 hasta el 15/06/2018
df_labels = pd.read_csv('../../data/labels_training_set.csv',dtype=object)
# Personas a predecir conversion.
df_person = pd.read_csv('../../data/trocafone_kaggle_test.csv')

In [3]:
df_labels['label'] = df_labels['label'].astype(int)
df_labels.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19414 entries, 0 to 19413
Data columns (total 2 columns):
person    19414 non-null object
label     19414 non-null int64
dtypes: int64(1), object(1)
memory usage: 303.4+ KB


In [4]:
df_events["timestamp"] = pd.to_datetime(df_events["timestamp"])
df_events["month"] = df_events["timestamp"].dt.month
df_events["day"] = df_events["timestamp"].dt.day
df_events["day_of_week"] = df_events['timestamp'].dt.weekday_name
df_events['day_of_year'] = df_events['timestamp'].dt.dayofyear
df_events['hour'] = df_events['timestamp'].dt.hour
df_events['minute'] = df_events['timestamp'].dt.minute
df_events['week_of_year'] = df_events['timestamp'].dt.weekofyear
df_events['second'] = df_events['timestamp'].dt.second

In [5]:
df_events.columns

Index(['timestamp', 'event', 'person', 'url', 'sku', 'model', 'condition',
       'storage', 'color', 'skus', 'search_term', 'staticpage',
       'campaign_source', 'search_engine', 'channel', 'new_vs_returning',
       'city', 'region', 'country', 'device_type', 'screen_resolution',
       'operating_system_version', 'browser_version', 'month', 'day',
       'day_of_week', 'day_of_year', 'hour', 'minute', 'week_of_year',
       'second'],
      dtype='object')

In [6]:
df_todas_las_personas = df_events[['person']].drop_duplicates()

In [7]:
# Me quedo solo con los users que tienen label.
df_events_con_labels = pd.merge(df_events,df_labels,on = 'person',how = 'inner')

# Features
* Tiempo que esta en promedio cada vez que entra.
* Tratar de sacar cuanto tiempo estuvo la ultima vez que entro
    * Fecha de ultima conversion
    * Fecha de ultimo checkout
    * Fecha de ultima vez que accedio
* Cada cuanto hace checkouts
* Cada cuanto hace compras  			 (promedio o mediana)
* Cada cuanto visita celulares
* Cada cuanto hace eventos. # promedio de diferencia entre fechas consecutivas
* Cada cuanto ve distintos celulares
probar con mes semana dia hora minuto

* epoch
* mas features usando columnas con eventos
* cada cuanto hace una busqueda?
* cada cuanto clickea una publcidad? 
* distancia promedio entre epochs consecutivo

## Cada cuanto hace eventos

#### Cada cuantos dias hace eventos (promedio) 

In [189]:
df_cadaCuantosDiasHaceEventos = df_events[['person','timestamp','month','day']]

In [190]:
df_cadaCuantosDiasHaceEventos = df_cadaCuantosDiasHaceEventos.drop_duplicates(['month','day','person'])\
                                .sort_values('timestamp')

In [191]:
# Funcion de agregacion para poder calcular la diferencia entre 2 ingresos consecutivos pero para una persona.
def promedioCadaCuantosDiasVuelve(S):
    if(len(S) == 1): return 0
    return round(np.mean((S - S.shift()).dropna().dt.days))

In [192]:
df_cadaCuantosDiasHaceEventos = (df_cadaCuantosDiasHaceEventos.groupby('person')['timestamp']\
                                 .agg(promedioCadaCuantosDiasVuelve)).to_frame().reset_index()
df_cadaCuantosDiasHaceEventos.columns = ['person','promedioCadaCuantosDiasVuelve']

In [193]:
df_cadaCuantosDiasHaceEventos.head()

Unnamed: 0,person,promedioCadaCuantosDiasVuelve
0,0008ed71,0
1,00091926,1
2,00091a7a,0
3,000ba417,4
4,000c79fe,0


Con one hot encoding

In [194]:
a = pd.merge(df_labels,df_cadaCuantosDiasHaceEventos,on = 'person', how = 'inner')
a = a.groupby(['promedioCadaCuantosDiasVuelve','label']).count().unstack()
a.columns = ['neg','pos']
a['neg'] = a['neg'] / a['neg'].sum()
a['pos'] = a['pos'] / a['pos'].sum()
a['dif'] = np.abs(a['pos'] - a['neg'])
a.sort_values('dif',ascending = False, inplace = True)
a.head()

Unnamed: 0_level_0,neg,pos,dif
promedioCadaCuantosDiasVuelve,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0.541554,0.328571,0.212982
4,0.039655,0.073469,0.033814
2,0.080015,0.103061,0.023046
9,0.011446,0.027551,0.016105
1,0.083813,0.09898,0.015167
6,0.027829,0.040816,0.012987
8,0.017847,0.028571,0.010724
5,0.025822,0.034694,0.008872
11,0.00792,0.016327,0.008406
7,0.016112,0.02449,0.008378


In [195]:
K = 15
top_columnas = list(a.head(K).index)
top_columnas2 = list(df_cadaCuantosDiasHaceEventos['promedioCadaCuantosDiasVuelve'].value_counts().head(K).index)
df_cadaCuantosDiasHaceEventos_conOneHot = pd.concat([df_cadaCuantosDiasHaceEventos['person'],pd.get_dummies(df_cadaCuantosDiasHaceEventos['promedioCadaCuantosDiasVuelve'])[top_columnas]],axis = 1)

In [196]:
nombres_columnas = ['cadaCuantosDiasHaceEventos_'+str(columna) for columna in df_cadaCuantosDiasHaceEventos_conOneHot.columns]
nombres_columnas[0] = 'person'
df_cadaCuantosDiasHaceEventos_conOneHot.columns = nombres_columnas

In [197]:
df_cadaCuantosDiasHaceEventos_conOneHot.head()

Unnamed: 0,person,cadaCuantosDiasHaceEventos_0,cadaCuantosDiasHaceEventos_4,cadaCuantosDiasHaceEventos_2,cadaCuantosDiasHaceEventos_9,cadaCuantosDiasHaceEventos_1,cadaCuantosDiasHaceEventos_6,cadaCuantosDiasHaceEventos_8,cadaCuantosDiasHaceEventos_5,cadaCuantosDiasHaceEventos_11,cadaCuantosDiasHaceEventos_7,cadaCuantosDiasHaceEventos_13,cadaCuantosDiasHaceEventos_10,cadaCuantosDiasHaceEventos_3,cadaCuantosDiasHaceEventos_16,cadaCuantosDiasHaceEventos_14
0,0008ed71,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,00091926,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
2,00091a7a,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,000ba417,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
4,000c79fe,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0


#### Cada cuantas horas hace eventos (promedio)

In [198]:
df_cadaCuantasHorasHaceEventos = df_events[['person','timestamp','month','day','hour']]

In [199]:
df_cadaCuantasHorasHaceEventos = df_cadaCuantasHorasHaceEventos.drop_duplicates(['month','day','hour','person'])\
                                .sort_values('timestamp')

In [200]:
def secToHours(seconds):
    return seconds//3600
def secToMinutes(seconds):
    return (seconds//60)

In [201]:
# Funcion de agregacion para poder calcular la diferencia entre 2 ingresos consecutivos pero para una persona.w
def promedioCadaCuantasHorasHaceEventos(S):
    if(len(S) == 1): return 0
    return round(np.mean(((S - S.shift()).dropna().dt.total_seconds()).apply(secToHours)))

In [202]:
df_cadaCuantasHorasHaceEventos = (df_cadaCuantasHorasHaceEventos.groupby('person')['timestamp']\
                                 .agg(promedioCadaCuantasHorasHaceEventos)).to_frame().reset_index()
df_cadaCuantasHorasHaceEventos.columns = ['person','cadaCuantasHorasHaceEventos']

In [203]:
df_cadaCuantasHorasHaceEventos.head()

Unnamed: 0,person,cadaCuantasHorasHaceEventos
0,0008ed71,2
1,00091926,18
2,00091a7a,0
3,000ba417,27
4,000c79fe,0


Con one hot encoding

In [204]:
a = pd.merge(df_labels,df_cadaCuantasHorasHaceEventos,on = 'person', how = 'inner')
a = a.groupby(['cadaCuantasHorasHaceEventos','label']).count().unstack()
a.columns = ['neg','pos']
a['neg'] = a['neg'] / a['neg'].sum()
a['pos'] = a['pos'] / a['pos'].sum()
a['dif'] = np.abs(a['pos'] - a['neg'])
a.sort_values('dif',ascending = False, inplace = True)
a.head()

Unnamed: 0_level_0,neg,pos,dif
cadaCuantasHorasHaceEventos,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0.420256,0.230612,0.189644
12,0.011012,0.005102,0.00591
130,0.001302,0.007143,0.005841
24,0.007812,0.013265,0.005454
1,0.01763,0.012245,0.005386
39,0.003363,0.008163,0.0048
4,0.009656,0.014286,0.00463
28,0.004774,0.009184,0.00441
2,0.014538,0.010204,0.004334
170,0.000814,0.005102,0.004288


In [205]:
K = 15
top_columnas = list(a.head(K).index)
top_columnas2 = list(df_cadaCuantasHorasHaceEventos['cadaCuantasHorasHaceEventos'].value_counts().head(K).index)
df_cadaCuantasHorasHaceEventos_conOneHot = pd.concat([df_cadaCuantasHorasHaceEventos['person'],pd.get_dummies(df_cadaCuantasHorasHaceEventos['cadaCuantasHorasHaceEventos'])[top_columnas]],axis = 1)

In [206]:
nombres_columnas = ['cadaCuantasHorasHaceEventos_'+str(columna) for columna in df_cadaCuantasHorasHaceEventos_conOneHot.columns]
nombres_columnas[0] = 'person'
df_cadaCuantasHorasHaceEventos_conOneHot.columns = nombres_columnas

In [207]:
df_cadaCuantasHorasHaceEventos_conOneHot.head()

Unnamed: 0,person,cadaCuantasHorasHaceEventos_0,cadaCuantasHorasHaceEventos_12,cadaCuantasHorasHaceEventos_130,cadaCuantasHorasHaceEventos_24,cadaCuantasHorasHaceEventos_1,cadaCuantasHorasHaceEventos_39,cadaCuantasHorasHaceEventos_4,cadaCuantasHorasHaceEventos_28,cadaCuantasHorasHaceEventos_2,cadaCuantasHorasHaceEventos_170,cadaCuantasHorasHaceEventos_37,cadaCuantasHorasHaceEventos_8,cadaCuantasHorasHaceEventos_143,cadaCuantasHorasHaceEventos_30,cadaCuantasHorasHaceEventos_23
0,0008ed71,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
1,00091926,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,00091a7a,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,000ba417,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,000c79fe,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0


#### Cada cuantos minutos hace eventos (promedio)

In [208]:
df_cadaCuantosMinutosHaceEventos = df_events[['person','timestamp','month','day','hour','minute']]

In [209]:
df_cadaCuantosMinutosHaceEventos = df_cadaCuantosMinutosHaceEventos.drop_duplicates(['month','day','hour','person','minute'])\
                                .sort_values('timestamp')

In [210]:
# Funcion de agregacion para poder calcular la diferencia entre 2 ingresos consecutivos pero para una persona.
def promedioCadaCuantosMinutosHaceEventos(S):
    if(len(S) == 1): return 0
    return round(np.mean(((S - S.shift()).dropna().dt.total_seconds()).apply(secToMinutes)))

In [211]:
df_cadaCuantosMinutosHaceEventos = (df_cadaCuantosMinutosHaceEventos.groupby('person')['timestamp']\
                                 .agg(promedioCadaCuantosMinutosHaceEventos)).to_frame().reset_index()
df_cadaCuantosMinutosHaceEventos.columns = ['person','cadaCuantosMinutosHaceEventos']

In [212]:
df_cadaCuantosMinutosHaceEventos.head()

Unnamed: 0,person,cadaCuantosMinutosHaceEventos
0,0008ed71,48
1,00091926,340
2,00091a7a,2
3,000ba417,167
4,000c79fe,1


Con one hot encoding

In [213]:
a = pd.merge(df_labels,df_cadaCuantosMinutosHaceEventos,on = 'person', how = 'inner')
a = a.groupby(['cadaCuantosMinutosHaceEventos','label']).count().unstack()
a.columns = ['neg','pos']
a['neg'] = a['neg'] / a['neg'].sum()
a['pos'] = a['pos'] / a['pos'].sum()
a['dif'] = np.abs(a['pos'] - a['neg'])
a.sort_values('dif',ascending = False, inplace = True)
a.head()

Unnamed: 0_level_0,neg,pos,dif
cadaCuantosMinutosHaceEventos,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0.185689,0.104082,0.081608
1,0.107139,0.056122,0.051017
2,0.050993,0.037755,0.013238
3,0.020397,0.011224,0.009173
4,0.015569,0.008163,0.007406
5,0.008734,0.002041,0.006693
6,0.007757,0.002041,0.005717
8,0.006347,0.00102,0.005327
1992,0.000108,0.003061,0.002953
273,0.000217,0.003061,0.002844


In [214]:
K = 15
top_columnas = list(a.head(K).index)
top_columnas2 = list(df_cadaCuantosMinutosHaceEventos['cadaCuantosMinutosHaceEventos'].value_counts().head(K).index)
df_cadaCuantosMinutosHaceEventos_conOneHot = pd.concat([df_cadaCuantosMinutosHaceEventos['person'],pd.get_dummies(df_cadaCuantosMinutosHaceEventos['cadaCuantosMinutosHaceEventos'])[top_columnas]],axis = 1)

In [215]:
nombres_columnas = ['cadaCuantosMinutosHaceEventos_'+str(columna) for columna in df_cadaCuantosMinutosHaceEventos_conOneHot.columns]
nombres_columnas[0] = 'person'
df_cadaCuantosMinutosHaceEventos_conOneHot.columns = nombres_columnas

In [216]:
df_cadaCuantosMinutosHaceEventos_conOneHot.head()

Unnamed: 0,person,cadaCuantosMinutosHaceEventos_0,cadaCuantosMinutosHaceEventos_1,cadaCuantosMinutosHaceEventos_2,cadaCuantosMinutosHaceEventos_3,cadaCuantosMinutosHaceEventos_4,cadaCuantosMinutosHaceEventos_5,cadaCuantosMinutosHaceEventos_6,cadaCuantosMinutosHaceEventos_8,cadaCuantosMinutosHaceEventos_1992,cadaCuantosMinutosHaceEventos_273,cadaCuantosMinutosHaceEventos_284,cadaCuantosMinutosHaceEventos_205,cadaCuantosMinutosHaceEventos_9,cadaCuantosMinutosHaceEventos_476,cadaCuantosMinutosHaceEventos_320
0,0008ed71,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,00091926,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,00091a7a,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0
3,000ba417,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,000c79fe,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0


#### Cada cuantos dias hace eventos en el mes 5 (promedio)

In [276]:
df_cadaCuantosDiasEnMes5HaceEventos = df_events[df_events['month'] == 5]

In [277]:
df_cadaCuantosDiasEnMes5HaceEventos = df_cadaCuantosDiasEnMes5HaceEventos[['person','day']]

In [278]:
df_cadaCuantosDiasEnMes5HaceEventos = df_cadaCuantosDiasEnMes5HaceEventos.drop_duplicates()\
                                .sort_values('day')

In [279]:
# Funcion de agregacion para poder calcular la diferencia entre 2 ingresos consecutivos pero para una persona.
def cadaCuantosDiasEnMes5HaceEventos(S):
    if(len(S) == 1): return 0
    return round(np.mean((S - S.shift()).dropna()))

In [280]:
df_cadaCuantosDiasEnMes5HaceEventos = (df_cadaCuantosDiasEnMes5HaceEventos.groupby('person')['day']\
                                 .agg(cadaCuantosDiasEnMes5HaceEventos)).to_frame().reset_index()
df_cadaCuantosDiasEnMes5HaceEventos.columns = ['person','cadaCuantosDiasEnMes5HaceEventos']

In [281]:
df_cadaCuantosDiasEnMes5HaceEventos.head()

Unnamed: 0,person,cadaCuantosDiasEnMes5HaceEventos
0,0008ed71,0
1,00091926,1
2,000ba417,4
3,000c79fe,0
4,000e4d9e,2


In [282]:
df_cadaCuantosDiasEnMes5HaceEventos = pd.merge(df_todas_las_personas,df_cadaCuantosDiasEnMes5HaceEventos\
                                               ,how = 'left', on = 'person')

In [285]:
# persona no entro en mes 5 -> -1
# persona entro una dia en mes 5 -> 0
df_cadaCuantosDiasEnMes5HaceEventos = df_cadaCuantosDiasEnMes5HaceEventos.fillna(-1)

In [294]:
df_cadaCuantosDiasEnMes5HaceEventos.head()

Unnamed: 0,person,cadaCuantosDiasEnMes5HaceEventos
0,4886f805,0.0
1,ad93850f,3.0
2,0297fc1e,2.0
3,2d681dd8,9.0
4,cccea85e,2.0


Con one hot encoding

In [295]:
a = pd.merge(df_labels,df_cadaCuantosDiasEnMes5HaceEventos,on = 'person', how = 'inner')
a = a.groupby(['cadaCuantosDiasEnMes5HaceEventos','label']).count().unstack()
a.columns = ['neg','pos']
a['neg'] = a['neg'] / a['neg'].sum()
a['pos'] = a['pos'] / a['pos'].sum()
a['dif'] = np.abs(a['pos'] - a['neg'])
a.sort_values('dif',ascending = False, inplace = True)
a.head()

Unnamed: 0_level_0,neg,pos,dif
cadaCuantosDiasEnMes5HaceEventos,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0.0,0.490886,0.287755,0.203131
-1.0,0.037648,0.161224,0.123577
3.0,0.057231,0.071429,0.014197
2.0,0.111425,0.12551,0.014086
4.0,0.052349,0.062245,0.009896


In [296]:
K = 31
top_columnas = list(a.head(K).index)
top_columnas2 = list(df_cadaCuantosDiasEnMes5HaceEventos['cadaCuantosDiasEnMes5HaceEventos'].value_counts().head(K).index)
df_cadaCuantosDiasEnMes5HaceEventos_conOneHot = pd.concat([df_cadaCuantosDiasEnMes5HaceEventos['person'],pd.get_dummies(df_cadaCuantosDiasEnMes5HaceEventos['cadaCuantosDiasEnMes5HaceEventos'])[top_columnas]],axis = 1)

In [297]:
nombres_columnas = ['cadaCuantosDiasEnMes5HaceEventos_'+str(columna) for columna in df_cadaCuantosDiasEnMes5HaceEventos_conOneHot.columns]
nombres_columnas[0] = 'person'
df_cadaCuantosDiasEnMes5HaceEventos_conOneHot.columns = nombres_columnas

In [298]:
df_cadaCuantosDiasEnMes5HaceEventos_conOneHot.head()

Unnamed: 0,person,cadaCuantosDiasEnMes5HaceEventos_0.0,cadaCuantosDiasEnMes5HaceEventos_-1.0,cadaCuantosDiasEnMes5HaceEventos_3.0,cadaCuantosDiasEnMes5HaceEventos_2.0,cadaCuantosDiasEnMes5HaceEventos_4.0,cadaCuantosDiasEnMes5HaceEventos_6.0,cadaCuantosDiasEnMes5HaceEventos_9.0,cadaCuantosDiasEnMes5HaceEventos_7.0,cadaCuantosDiasEnMes5HaceEventos_13.0,...,cadaCuantosDiasEnMes5HaceEventos_16.0,cadaCuantosDiasEnMes5HaceEventos_14.0,cadaCuantosDiasEnMes5HaceEventos_21.0,cadaCuantosDiasEnMes5HaceEventos_19.0,cadaCuantosDiasEnMes5HaceEventos_23.0,cadaCuantosDiasEnMes5HaceEventos_24.0,cadaCuantosDiasEnMes5HaceEventos_26.0,cadaCuantosDiasEnMes5HaceEventos_27.0,cadaCuantosDiasEnMes5HaceEventos_28.0,cadaCuantosDiasEnMes5HaceEventos_29.0
0,4886f805,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,ad93850f,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0297fc1e,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,2d681dd8,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4,cccea85e,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


#### Cada cuantas horas hace eventos en un dia del mes 5

In [299]:
df_cadaCuantasHorasEnUnDiaDelMes5HaceEventos = df_events[df_events['month'] == 5]

In [300]:
df_cadaCuantasHorasEnUnDiaDelMes5HaceEventos = df_cadaCuantasHorasEnUnDiaDelMes5HaceEventos[['person','day','hour']]

In [301]:
df_cadaCuantasHorasEnUnDiaDelMes5HaceEventos = df_cadaCuantasHorasEnUnDiaDelMes5HaceEventos.drop_duplicates()\
                                .sort_values(['day','hour'])

In [302]:
# Funcion de agregacion para poder calcular la diferencia entre 2 ingresos consecutivos pero para una persona.
def cadaCuantasHorasEnUnDiaDelMes5HaceEventos(S):
    if(len(S) == 1): return 0
    return round(np.mean((S - S.shift()).dropna()))

In [303]:
df_cadaCuantasHorasEnUnDiaDelMes5HaceEventos = (df_cadaCuantasHorasEnUnDiaDelMes5HaceEventos\
        .groupby(['person','day'])['hour'].agg(cadaCuantasHorasEnUnDiaDelMes5HaceEventos)).to_frame()


In [304]:
df_cadaCuantasHorasEnUnDiaDelMes5HaceEventos = df_cadaCuantasHorasEnUnDiaDelMes5HaceEventos\
                .groupby('person')['hour'].mean().to_frame().reset_index()
df_cadaCuantasHorasEnUnDiaDelMes5HaceEventos['hour'] = df_cadaCuantasHorasEnUnDiaDelMes5HaceEventos['hour'].apply(round)

In [305]:
df_cadaCuantasHorasEnUnDiaDelMes5HaceEventos.columns = ['person','cadaCuantasHorasEnUnDiaDelMes5HaceEventos']

In [307]:
df_cadaCuantasHorasEnUnDiaDelMes5HaceEventos = pd.merge(df_todas_las_personas,df_cadaCuantasHorasEnUnDiaDelMes5HaceEventos\
                                                       ,on = 'person', how = 'left')


In [310]:
df_cadaCuantasHorasEnUnDiaDelMes5HaceEventos = df_cadaCuantasHorasEnUnDiaDelMes5HaceEventos.fillna(-1)

In [396]:
df_cadaCuantasHorasEnUnDiaDelMes5HaceEventos.head()

Unnamed: 0,person,cadaCuantasHorasEnUnDiaDelMes5HaceEventos
0,4886f805,0.0
1,ad93850f,6.0
2,0297fc1e,5.0
3,2d681dd8,0.0
4,cccea85e,4.0


Con one hot encoding

In [313]:
a = pd.merge(df_labels,df_cadaCuantasHorasEnUnDiaDelMes5HaceEventos,on = 'person', how = 'inner')
a = a.groupby(['cadaCuantasHorasEnUnDiaDelMes5HaceEventos','label']).count().unstack()
a.columns = ['neg','pos']
a['neg'] = a['neg'] / a['neg'].sum()
a['pos'] = a['pos'] / a['pos'].sum()
a['dif'] = np.abs(a['pos'] - a['neg'])
a.sort_values('dif',ascending = False, inplace = True)
a.head()

Unnamed: 0_level_0,neg,pos,dif
cadaCuantasHorasEnUnDiaDelMes5HaceEventos,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0.0,0.589509,0.429592,0.159917
-1.0,0.037648,0.161224,0.123577
2.0,0.080449,0.115306,0.034857
1.0,0.155799,0.133673,0.022126
4.0,0.035478,0.046939,0.011461


In [314]:
K = 24
top_columnas = list(a.head(K).index)
top_columnas2 = list(df_cadaCuantasHorasEnUnDiaDelMes5HaceEventos['cadaCuantasHorasEnUnDiaDelMes5HaceEventos'].value_counts().head(K).index)
df_cadaCuantasHorasEnUnDiaDelMes5HaceEventos_conOneHot = pd.concat([df_cadaCuantasHorasEnUnDiaDelMes5HaceEventos['person'],pd.get_dummies(df_cadaCuantasHorasEnUnDiaDelMes5HaceEventos['cadaCuantasHorasEnUnDiaDelMes5HaceEventos'])[top_columnas]],axis = 1)

In [315]:
nombres_columnas = ['cadaCuantasHorasEnUnDiaDelMes5HaceEventos_'+str(columna) for columna in df_cadaCuantasHorasEnUnDiaDelMes5HaceEventos_conOneHot.columns]
nombres_columnas[0] = 'person'
df_cadaCuantasHorasEnUnDiaDelMes5HaceEventos_conOneHot.columns = nombres_columnas

In [316]:
df_cadaCuantasHorasEnUnDiaDelMes5HaceEventos_conOneHot.head()

Unnamed: 0,person,cadaCuantasHorasEnUnDiaDelMes5HaceEventos_0.0,cadaCuantasHorasEnUnDiaDelMes5HaceEventos_-1.0,cadaCuantasHorasEnUnDiaDelMes5HaceEventos_2.0,cadaCuantasHorasEnUnDiaDelMes5HaceEventos_1.0,cadaCuantasHorasEnUnDiaDelMes5HaceEventos_4.0,cadaCuantasHorasEnUnDiaDelMes5HaceEventos_3.0,cadaCuantasHorasEnUnDiaDelMes5HaceEventos_5.0,cadaCuantasHorasEnUnDiaDelMes5HaceEventos_8.0,cadaCuantasHorasEnUnDiaDelMes5HaceEventos_6.0,...,cadaCuantasHorasEnUnDiaDelMes5HaceEventos_17.0,cadaCuantasHorasEnUnDiaDelMes5HaceEventos_12.0,cadaCuantasHorasEnUnDiaDelMes5HaceEventos_7.0,cadaCuantasHorasEnUnDiaDelMes5HaceEventos_9.0,cadaCuantasHorasEnUnDiaDelMes5HaceEventos_13.0,cadaCuantasHorasEnUnDiaDelMes5HaceEventos_16.0,cadaCuantasHorasEnUnDiaDelMes5HaceEventos_18.0,cadaCuantasHorasEnUnDiaDelMes5HaceEventos_19.0,cadaCuantasHorasEnUnDiaDelMes5HaceEventos_20.0,cadaCuantasHorasEnUnDiaDelMes5HaceEventos_21.0
0,4886f805,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,ad93850f,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
2,0297fc1e,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3,2d681dd8,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,cccea85e,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


#### Cada cuantos minutos hace eventos en un dia y hora del mes 5

In [317]:
df_cadaCuantosMinutosHaceEventosEnUnDiaYHoraDelMes5 = df_events[df_events['month'] == 5]

In [318]:
df_cadaCuantosMinutosHaceEventosEnUnDiaYHoraDelMes5 = df_cadaCuantosMinutosHaceEventosEnUnDiaYHoraDelMes5[['person','day','hour','minute']]

In [319]:
df_cadaCuantosMinutosHaceEventosEnUnDiaYHoraDelMes5 = df_cadaCuantosMinutosHaceEventosEnUnDiaYHoraDelMes5.drop_duplicates()\
                                .sort_values(['day','hour','minute'])

In [322]:
# Funcion de agregacion para poder calcular la diferencia entre 2 ingresos consecutivos pero para una persona.
def cadaCuantosMinutosHaceEventosEnUnDiaYHoraDelMes5(S):
    if(len(S) == 1): return 0
    return round(np.mean((S - S.shift()).dropna()))

In [323]:
df_cadaCuantosMinutosHaceEventosEnUnDiaYHoraDelMes5 = (df_cadaCuantosMinutosHaceEventosEnUnDiaYHoraDelMes5\
        .groupby(['person','day','hour'])['minute'].agg(cadaCuantosMinutosHaceEventosEnUnDiaYHoraDelMes5)).to_frame()


In [326]:
df_cadaCuantosMinutosHaceEventosEnUnDiaYHoraDelMes5 = df_cadaCuantosMinutosHaceEventosEnUnDiaYHoraDelMes5\
                .groupby('person')['minute'].mean().to_frame().reset_index()
df_cadaCuantosMinutosHaceEventosEnUnDiaYHoraDelMes5['minute'] = df_cadaCuantosMinutosHaceEventosEnUnDiaYHoraDelMes5['minute'].apply(round)

In [330]:
df_cadaCuantosMinutosHaceEventosEnUnDiaYHoraDelMes5.columns = ['person','cadaCuantosMinutosHaceEventosEnUnDiaYHoraDelMes5']

In [331]:
df_cadaCuantosMinutosHaceEventosEnUnDiaYHoraDelMes5 = pd.merge(df_todas_las_personas,df_cadaCuantosMinutosHaceEventosEnUnDiaYHoraDelMes5\
                                                       ,on = 'person', how = 'left')


In [332]:
df_cadaCuantosMinutosHaceEventosEnUnDiaYHoraDelMes5 = df_cadaCuantosMinutosHaceEventosEnUnDiaYHoraDelMes5.fillna(-1)

In [344]:
df_cadaCuantosMinutosHaceEventosEnUnDiaYHoraDelMes5.head()

Unnamed: 0,person,cadaCuantosMinutosHaceEventosEnUnDiaYHoraDelMes5
0,4886f805,8.0
1,ad93850f,3.0
2,0297fc1e,3.0
3,2d681dd8,2.0
4,cccea85e,3.0


In [334]:
df_cadaCuantosMinutosHaceEventosEnUnDiaYHoraDelMes5.shape

(38829, 2)

Con one hot encoding

In [337]:
a = pd.merge(df_labels,df_cadaCuantosMinutosHaceEventosEnUnDiaYHoraDelMes5,on = 'person', how = 'inner')
a = a.groupby(['cadaCuantosMinutosHaceEventosEnUnDiaYHoraDelMes5','label']).count().unstack()
a.columns = ['neg','pos']
a['neg'] = a['neg'] / a['neg'].sum()
a['pos'] = a['pos'] / a['pos'].sum()
a['dif'] = np.abs(a['pos'] - a['neg'])
a.sort_values('dif',ascending = False, inplace = True)
a.head()

Unnamed: 0_level_0,neg,pos,dif
cadaCuantosMinutosHaceEventosEnUnDiaYHoraDelMes5,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
-1.0,0.037648,0.161224,0.123577
1.0,0.366442,0.292857,0.073585
0.0,0.174569,0.129592,0.044977
3.0,0.068515,0.090816,0.022302
2.0,0.248345,0.228571,0.019774


In [338]:
K = 15
top_columnas = list(a.head(K).index)
top_columnas2 = list(df_cadaCuantosMinutosHaceEventosEnUnDiaYHoraDelMes5['cadaCuantosMinutosHaceEventosEnUnDiaYHoraDelMes5'].value_counts().head(K).index)
df_cadaCuantosMinutosHaceEventosEnUnDiaYHoraDelMes5_conOneHot = pd.concat([df_cadaCuantosMinutosHaceEventosEnUnDiaYHoraDelMes5['person'],pd.get_dummies(df_cadaCuantosMinutosHaceEventosEnUnDiaYHoraDelMes5['cadaCuantosMinutosHaceEventosEnUnDiaYHoraDelMes5'])[top_columnas]],axis = 1)

In [339]:
nombres_columnas = ['cadaCuantosMinutosHaceEventosEnUnDiaYHoraDelMes5_'+str(columna) for columna in df_cadaCuantosMinutosHaceEventosEnUnDiaYHoraDelMes5_conOneHot.columns]
nombres_columnas[0] = 'person'
df_cadaCuantosMinutosHaceEventosEnUnDiaYHoraDelMes5_conOneHot.columns = nombres_columnas

In [340]:
df_cadaCuantosMinutosHaceEventosEnUnDiaYHoraDelMes5_conOneHot.head()

Unnamed: 0,person,cadaCuantosMinutosHaceEventosEnUnDiaYHoraDelMes5_-1.0,cadaCuantosMinutosHaceEventosEnUnDiaYHoraDelMes5_1.0,cadaCuantosMinutosHaceEventosEnUnDiaYHoraDelMes5_0.0,cadaCuantosMinutosHaceEventosEnUnDiaYHoraDelMes5_3.0,cadaCuantosMinutosHaceEventosEnUnDiaYHoraDelMes5_2.0,cadaCuantosMinutosHaceEventosEnUnDiaYHoraDelMes5_6.0,cadaCuantosMinutosHaceEventosEnUnDiaYHoraDelMes5_4.0,cadaCuantosMinutosHaceEventosEnUnDiaYHoraDelMes5_7.0,cadaCuantosMinutosHaceEventosEnUnDiaYHoraDelMes5_5.0,cadaCuantosMinutosHaceEventosEnUnDiaYHoraDelMes5_8.0,cadaCuantosMinutosHaceEventosEnUnDiaYHoraDelMes5_9.0,cadaCuantosMinutosHaceEventosEnUnDiaYHoraDelMes5_12.0,cadaCuantosMinutosHaceEventosEnUnDiaYHoraDelMes5_17.0,cadaCuantosMinutosHaceEventosEnUnDiaYHoraDelMes5_16.0,cadaCuantosMinutosHaceEventosEnUnDiaYHoraDelMes5_10.0
0,4886f805,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
1,ad93850f,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
2,0297fc1e,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
3,2d681dd8,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
4,cccea85e,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0


#### Cada cuantos segundos hace eventos en un dia y hora mes 5

In [372]:
df_cadaCuantosSegundosHaceEventosEnUnDiaYHoraDelMes5 = df_events[df_events['month'] == 5]

In [373]:
df_cadaCuantosSegundosHaceEventosEnUnDiaYHoraDelMes5 = df_cadaCuantosSegundosHaceEventosEnUnDiaYHoraDelMes5[['person','timestamp','day','hour']]

In [374]:
df_cadaCuantosSegundosHaceEventosEnUnDiaYHoraDelMes5 = df_cadaCuantosSegundosHaceEventosEnUnDiaYHoraDelMes5.drop_duplicates()\
                                .sort_values(['timestamp'])

In [376]:
# Funcion de agregacion para poder calcular la diferencia entre 2 ingresos consecutivos pero para una persona.
def cadaCuantosSegundosHaceEventosEnUnDiaYHoraDelMes5(S):
    if(len(S) == 1): return 0
    return round(np.mean((S - S.shift()).dropna().dt.total_seconds()))

In [377]:
df_cadaCuantosSegundosHaceEventosEnUnDiaYHoraDelMes5 = (df_cadaCuantosSegundosHaceEventosEnUnDiaYHoraDelMes5\
        .groupby(['person','day','hour'])['timestamp'].agg(cadaCuantosSegundosHaceEventosEnUnDiaYHoraDelMes5)).to_frame()


In [380]:
df_cadaCuantosSegundosHaceEventosEnUnDiaYHoraDelMes5 = df_cadaCuantosSegundosHaceEventosEnUnDiaYHoraDelMes5\
                .groupby('person')['timestamp'].mean().to_frame().reset_index()
df_cadaCuantosSegundosHaceEventosEnUnDiaYHoraDelMes5['timestamp'] = df_cadaCuantosSegundosHaceEventosEnUnDiaYHoraDelMes5['timestamp'].apply(round)

In [381]:
df_cadaCuantosSegundosHaceEventosEnUnDiaYHoraDelMes5.columns = ['person','cadaCuantosSegundosHaceEventosEnUnDiaYHoraDelMes5']

In [382]:
df_cadaCuantosSegundosHaceEventosEnUnDiaYHoraDelMes5 = pd.merge(df_todas_las_personas,df_cadaCuantosSegundosHaceEventosEnUnDiaYHoraDelMes5\
                                                       ,on = 'person', how = 'left')


In [383]:
df_cadaCuantosSegundosHaceEventosEnUnDiaYHoraDelMes5 = df_cadaCuantosSegundosHaceEventosEnUnDiaYHoraDelMes5.fillna(-1)

In [386]:
df_cadaCuantosSegundosHaceEventosEnUnDiaYHoraDelMes5.head()

Unnamed: 0,person,cadaCuantosSegundosHaceEventosEnUnDiaYHoraDelMes5
0,4886f805,198.0
1,ad93850f,69.0
2,0297fc1e,62.0
3,2d681dd8,27.0
4,cccea85e,41.0


In [387]:
df_cadaCuantosSegundosHaceEventosEnUnDiaYHoraDelMes5.shape

(38829, 2)

Con one hot encoding

In [388]:
a = pd.merge(df_labels,df_cadaCuantosSegundosHaceEventosEnUnDiaYHoraDelMes5,on = 'person', how = 'inner')
a = a.groupby(['cadaCuantosSegundosHaceEventosEnUnDiaYHoraDelMes5','label']).count().unstack()
a.columns = ['neg','pos']
a['neg'] = a['neg'] / a['neg'].sum()
a['pos'] = a['pos'] / a['pos'].sum()
a['dif'] = np.abs(a['pos'] - a['neg'])
a.sort_values('dif',ascending = False, inplace = True)
a.head()

Unnamed: 0_level_0,neg,pos,dif
cadaCuantosSegundosHaceEventosEnUnDiaYHoraDelMes5,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
-1.0,0.037648,0.161224,0.123577
10.0,0.022513,0.008163,0.014349
16.0,0.023706,0.011224,0.012482
22.0,0.021591,0.010204,0.011386
6.0,0.019204,0.009184,0.01002


In [389]:
 tiempo estuvo la ultiK = 20
top_columnas = list(a.head(K).index)
top_columnas2 = list(df_cadaCuantosSegundosHaceEventosEnUnDiaYHoraDelMes5['cadaCuantosSegundosHaceEventosEnUnDiaYHoraDelMes5'].value_counts().head(K).index)
df_cadaCuantosSegundosHaceEventosEnUnDiaYHoraDelMes5_conOneHot = pd.concat([df_cadaCuantosSegundosHaceEventosEnUnDiaYHoraDelMes5['person'],pd.get_dummies(df_cadaCuantosSegundosHaceEventosEnUnDiaYHoraDelMes5['cadaCuantosSegundosHaceEventosEnUnDiaYHoraDelMes5'])[top_columnas]],axis = 1)

In [390]:
nombres_columnas = ['cadaCuantosSegundosHaceEventosEnUnDiaYHoraDelMes5_'+str(columna) for columna in df_cadaCuantosSegundosHaceEventosEnUnDiaYHoraDelMes5_conOneHot.columns]
nombres_columnas[0] = 'person'
df_cadaCuantosSegundosHaceEventosEnUnDiaYHoraDelMes5_conOneHot.columns = nombres_columnas

In [392]:
df_cadaCuantosSegundosHaceEventosEnUnDiaYHoraDelMes5_conOneHot.head()

Unnamed: 0,person,cadaCuantosSegundosHaceEventosEnUnDiaYHoraDelMes5_-1.0,cadaCuantosSegundosHaceEventosEnUnDiaYHoraDelMes5_10.0,cadaCuantosSegundosHaceEventosEnUnDiaYHoraDelMes5_16.0,cadaCuantosSegundosHaceEventosEnUnDiaYHoraDelMes5_22.0,cadaCuantosSegundosHaceEventosEnUnDiaYHoraDelMes5_6.0,cadaCuantosSegundosHaceEventosEnUnDiaYHoraDelMes5_1.0,cadaCuantosSegundosHaceEventosEnUnDiaYHoraDelMes5_7.0,cadaCuantosSegundosHaceEventosEnUnDiaYHoraDelMes5_17.0,cadaCuantosSegundosHaceEventosEnUnDiaYHoraDelMes5_28.0,...,cadaCuantosSegundosHaceEventosEnUnDiaYHoraDelMes5_23.0,cadaCuantosSegundosHaceEventosEnUnDiaYHoraDelMes5_71.0,cadaCuantosSegundosHaceEventosEnUnDiaYHoraDelMes5_15.0,cadaCuantosSegundosHaceEventosEnUnDiaYHoraDelMes5_13.0,cadaCuantosSegundosHaceEventosEnUnDiaYHoraDelMes5_44.0,cadaCuantosSegundosHaceEventosEnUnDiaYHoraDelMes5_14.0,cadaCuantosSegundosHaceEventosEnUnDiaYHoraDelMes5_20.0,cadaCuantosSegundosHaceEventosEnUnDiaYHoraDelMes5_47.0,cadaCuantosSegundosHaceEventosEnUnDiaYHoraDelMes5_41.0,cadaCuantosSegundosHaceEventosEnUnDiaYHoraDelMes5_11.0
0,4886f805,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,ad93850f,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0297fc1e,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,2d681dd8,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,cccea85e,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0


#### Cada cuantas horas hace eventos en su ultima conexion (dia) en el mes 5

In [42]:
df_cadaCuantasHorasHaceEventosEnSuUltimaConexionDelMes5 = df_events[df_events['month'] == 5]

In [43]:
df_cadaCuantasHorasHaceEventosEnSuUltimaConexionDelMes5 = df_cadaCuantasHorasHaceEventosEnSuUltimaConexionDelMes5[['person','day','hour']]

In [44]:
df_cadaCuantasHorasHaceEventosEnSuUltimaConexionDelMes5 = df_cadaCuantasHorasHaceEventosEnSuUltimaConexionDelMes5.drop_duplicates()\
                                .sort_values(['day','hour'])

In [45]:
# Funcion de agregacion para poder calcular la diferencia entre 2 ingresos consecutivos pero para una persona.
def cadaCuantasHorasHaceEventosEnSuUltimaConexionDelMes5(S):
    if(len(S) == 1): return 0
    return round(np.mean((S - S.shift()).dropna()))

In [46]:
df_cadaCuantasHorasHaceEventosEnSuUltimaConexionDelMes5 = (df_cadaCuantasHorasHaceEventosEnSuUltimaConexionDelMes5\
        .groupby(['person','day'])['hour'].agg(cadaCuantasHorasHaceEventosEnSuUltimaConexionDelMes5)).to_frame()


In [47]:
df_cadaCuantasHorasHaceEventosEnSuUltimaConexionDelMes5 = df_cadaCuantasHorasHaceEventosEnSuUltimaConexionDelMes5\
                .groupby('person').last().reset_index()

In [48]:
df_cadaCuantasHorasHaceEventosEnSuUltimaConexionDelMes5.columns = ['person','cadaCuantasHorasHaceEventosEnSuUltimaConexionDelMes5']

In [49]:
df_cadaCuantasHorasHaceEventosEnSuUltimaConexionDelMes5 = pd.merge(df_todas_las_personas,df_cadaCuantasHorasHaceEventosEnSuUltimaConexionDelMes5\
                                                       ,on = 'person', how = 'left')


In [50]:
df_cadaCuantasHorasHaceEventosEnSuUltimaConexionDelMes5 = df_cadaCuantasHorasHaceEventosEnSuUltimaConexionDelMes5.fillna(-1)

In [54]:
df_cadaCuantasHorasHaceEventosEnSuUltimaConexionDelMes5.head()

Unnamed: 0,person,cadaCuantasHorasHaceEventosEnSuUltimaConexionDelMes5
0,4886f805,0.0
1,ad93850f,0.0
2,0297fc1e,0.0
3,2d681dd8,0.0
4,cccea85e,5.0


Con one hot encoding

In [56]:
a = pd.merge(df_labels,df_cadaCuantasHorasHaceEventosEnSuUltimaConexionDelMes5,on = 'person', how = 'inner')
a = a.groupby(['cadaCuantasHorasHaceEventosEnSuUltimaConexionDelMes5','label']).count().unstack()
a.columns = ['neg','pos']
a['neg'] = a['neg'] / a['neg'].sum()
a['pos'] = a['pos'] / a['pos'].sum()
a['dif'] = np.abs(a['pos'] - a['neg'])
a.sort_values('dif',ascending = False, inplace = True)
a.head()

Unnamed: 0_level_0,neg,pos,dif
cadaCuantasHorasHaceEventosEnSuUltimaConexionDelMes5,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0.0,0.674948,0.522449,0.152499
-1.0,0.037648,0.161224,0.123577
1.0,0.141098,0.123469,0.017629
4.0,0.018173,0.027551,0.009378
6.0,0.012857,0.021429,0.008572


In [57]:
K = 15
top_columnas = list(a.head(K).index)
top_columnas2 = list(df_cadaCuantasHorasHaceEventosEnSuUltimaConexionDelMes5['cadaCuantasHorasHaceEventosEnSuUltimaConexionDelMes5'].value_counts().head(K).index)
df_cadaCuantasHorasHaceEventosEnSuUltimaConexionDelMes5_conOneHot = pd.concat([df_cadaCuantasHorasHaceEventosEnSuUltimaConexionDelMes5['person'],pd.get_dummies(df_cadaCuantasHorasHaceEventosEnSuUltimaConexionDelMes5['cadaCuantasHorasHaceEventosEnSuUltimaConexionDelMes5'])[top_columnas]],axis = 1)

In [58]:
nombres_columnas = ['cadaCuantasHorasHaceEventosEnSuUltimaConexionDelMes5_'+str(columna) for columna in df_cadaCuantasHorasHaceEventosEnSuUltimaConexionDelMes5_conOneHot.columns]
nombres_columnas[0] = 'person'
df_cadaCuantasHorasHaceEventosEnSuUltimaConexionDelMes5_conOneHot.columns = nombres_columnas

In [59]:
df_cadaCuantasHorasHaceEventosEnSuUltimaConexionDelMes5_conOneHot.head()

Unnamed: 0,person,cadaCuantasHorasHaceEventosEnSuUltimaConexionDelMes5_0.0,cadaCuantasHorasHaceEventosEnSuUltimaConexionDelMes5_-1.0,cadaCuantasHorasHaceEventosEnSuUltimaConexionDelMes5_1.0,cadaCuantasHorasHaceEventosEnSuUltimaConexionDelMes5_4.0,cadaCuantasHorasHaceEventosEnSuUltimaConexionDelMes5_6.0,cadaCuantasHorasHaceEventosEnSuUltimaConexionDelMes5_2.0,cadaCuantasHorasHaceEventosEnSuUltimaConexionDelMes5_8.0,cadaCuantasHorasHaceEventosEnSuUltimaConexionDelMes5_7.0,cadaCuantasHorasHaceEventosEnSuUltimaConexionDelMes5_15.0,cadaCuantasHorasHaceEventosEnSuUltimaConexionDelMes5_19.0,cadaCuantasHorasHaceEventosEnSuUltimaConexionDelMes5_13.0,cadaCuantasHorasHaceEventosEnSuUltimaConexionDelMes5_23.0,cadaCuantasHorasHaceEventosEnSuUltimaConexionDelMes5_10.0,cadaCuantasHorasHaceEventosEnSuUltimaConexionDelMes5_12.0,cadaCuantasHorasHaceEventosEnSuUltimaConexionDelMes5_14.0
0,4886f805,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,ad93850f,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0297fc1e,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,2d681dd8,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,cccea85e,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [585]:
seba = pd.read_csv("../features_varios.csv")

In [770]:
df_train = pd.merge(seba,df_cadaCuantosDiasHaceEventos2, on = 'person', how = 'left')

In [771]:
df_train = df_train.fillna(0)

In [600]:
list(df_train.columns)

['person',
 'nuevo_mes_5',
 'distan_dias',
 'mes_primer_entrada',
 'ad campaign hit mes 4',
 'brand listing mes 4',
 'checkout mes 4',
 'conversion mes 4',
 'generic listing mes 4',
 'lead mes 4',
 'search engine hit mes 4',
 'searched products mes 4',
 'staticpage mes 4',
 'viewed product mes 4',
 'visited site mes 4',
 'primer_quincena',
 'segunda_quincena',
 'ad campaign hit mes 5',
 'brand listing mes 5',
 'checkout mes 5',
 'conversion mes 5',
 'generic listing mes 5',
 'lead mes 5',
 'search engine hit mes 5',
 'searched products mes 5',
 'staticpage mes 5',
 'viewed product mes 5',
 'visited site mes 5',
 'ad campaign hit',
 'brand listing',
 'checkout',
 'conversion',
 'generic listing',
 'lead',
 'search engine hit',
 'searched products',
 'staticpage',
 'viewed product',
 'visited site',
 'dia_1',
 'dia_2',
 'dia_3',
 'dia_4',
 'dia_5',
 'dia_6',
 'dia_7',
 'dia_8',
 'dia_9',
 'dia_10',
 'dia_11',
 'dia_12',
 'dia_13',
 'dia_14',
 'dia_15',
 'dia_16',
 'dia_17',
 'dia_18',
 '

#### pruebita 

In [574]:
df_train = df_cadaCuantosDiasHaceEventos2

In [773]:
df_train_con_labels = pd.merge(df_train,df_labels,how = 'inner', on ='person')

In [774]:
df_train_con_labels.drop(columns = ['person'],inplace = True)

In [775]:
X, y = df_train_con_labels.iloc[:,:-1],df_train_con_labels.iloc[:,-1]

In [776]:
xg = xgb.XGBClassifier(objective ='binary:logistic', 
                colsample_bytree = 0.8, learning_rate = 0.1,
                max_depth = 5, n_estimators = 6, scale_pos_weight = 9.5, min_child_weight=1)

In [777]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=123)

In [778]:
xg.fit(X_train,y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=0.8, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=5, min_child_weight=1, missing=None, n_estimators=6,
       n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=9.5, seed=None,
       silent=True, subsample=1)

In [779]:
predsProb1 = pd.DataFrame(xg.predict_proba(X_test))[1]

In [780]:
predsLabel = pd.Series(xg.predict(X_test))

In [781]:
trainAccuracy = accuracy_score(y_train, pd.Series(xg.predict(X_train)))
testAccuracy = accuracy_score(y_test, predsLabel)

cohenKappaScore = cohen_kappa_score(y_test, predsLabel)
classificationReport = classification_report(y_test, predsLabel)
precisionScore = precision_score(y_test, predsLabel)
recallScore = recall_score(y_test, predsLabel)
f1Score = f1_score(y_test, predsLabel)
matrizDeConfusion = confusion_matrix(y_test, predsLabel)

meanSquaredError = mean_squared_error(y_test, predsProb1)
areaDebajoDeCurva = roc_auc_score(y_test, predsProb1)

In [782]:
# Pruebo todas las metricas.
# accuracy_score, roc_auc_score, mean_squared_error, confusion_matrix,
# f1_score,precision_score,recall_score,classification_report,cohen_kappa_score
# brierScoreLoss.
# Hay metricas para la probabilidad de que sea 1 y otras metricas para las predicciones de los labels.


# Metricas con LABELS.

print()
print("Metricas con Labels:")
print()

print("Train accuracy: ", trainAccuracy)
print("Test acuracy: ", testAccuracy)
print()
print("Classification Report:")
print(classificationReport)
print()
print("Precision Score: ",precisionScore)
print("Recall Score: ",recallScore)
print("F1 Score: ",f1Score)
print("Cohen Kappa Score: ",cohenKappaScore)
print()
print("Confusion matrix: ")
print(matrizDeConfusion)

# Metricas con PROBABILIDADES. 

print()
print("Metricas sin Labels:")
print()

print("ROC auc score: ", areaDebajoDeCurva)
print("Mean squared error: ", meanSquaredError)


Metricas con Labels:

Train accuracy:  0.8433455669306548
Test acuracy:  0.8434200360545969

Classification Report:
              precision    recall  f1-score   support

           0       0.98      0.85      0.91      3711
           1       0.17      0.67      0.27       172

   micro avg       0.84      0.84      0.84      3883
   macro avg       0.58      0.76      0.59      3883
weighted avg       0.95      0.84      0.88      3883


Precision Score:  0.17267267267267267
Recall Score:  0.6686046511627907
F1 Score:  0.2744630071599045
Cohen Kappa Score:  0.21951038894490627

Confusion matrix: 
[[3160  551]
 [  57  115]]

Metricas sin Labels:

ROC auc score:  0.8619699134565372
Mean squared error:  0.14634057686787752
