In [47]:
import pandas as pd  
import numpy as np  
import matplotlib.pyplot as plt 
from datetime import timedelta
%matplotlib inline

#from sklearn import svm
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score, roc_auc_score

import xgboost as xgb
from sklearn.metrics import mean_squared_error

In [2]:
labels = pd.read_csv("fiuba-trocafone-tp2-final-set/labels_training_set.csv")
df = pd.read_csv("fiuba-trocafone-tp2-final-set/events_up_to_01062018.csv")

  interactivity=interactivity, compiler=compiler, result=result)


In [4]:
df["timestamp"] = pd.to_datetime(df["timestamp"])
df["month"] = df["timestamp"].dt.month
df["day"] = df["timestamp"].dt.day

### Dia, mes con mayor cantidad de visitas de cada persona

In [6]:
# Dia con mas entradas y su cantidad de entradas

entradas_x_dia = df.groupby("person")['day'].value_counts().rename("cant_dia").reset_index()
entradas_x_dia.columns = ['person', 'day', 'cant_dia']
entradas_x_dia = entradas_x_dia.sort_values('cant_dia',ascending=False)
dia_mas_entradas = entradas_x_dia.drop_duplicates(subset=['person'])
dia_mas_entradas.head()

Unnamed: 0,person,day,cant_dia
99325,c76b8417,3,652
28266,37eff05b,29,616
112559,e1443dd4,30,572
51164,656c18ef,10,512
41082,50e16a8a,19,423


In [7]:
# Mes con mas entradas y su cantidad de entradas

entradas_x_mes = df.groupby("person")['month'].value_counts().rename("cant_mes").reset_index()
entradas_x_mes.columns = ['person', 'month', 'cant_mes']
entradas_x_mes = entradas_x_mes.sort_values('cant_mes',ascending=False)
mes_mas_entradas = entradas_x_mes.drop_duplicates(subset=['person'])
mes_mas_entradas.head()

Unnamed: 0,person,month,cant_mes
38966,c76b8417,5,3028
18217,5c76e694,5,1609
32890,a7ffa917,5,1604
4824,18489dd5,5,1563
9989,33385551,4,1539


In [8]:
# Merge

cant_mes_mas_entradas = mes_mas_entradas[["person", "cant_mes"]]
cant_dia_mas_entradas = dia_mas_entradas[["person", "cant_dia"]]

mes_dia_mas_entradas = pd.merge(cant_mes_mas_entradas, cant_dia_mas_entradas, on="person", how="inner")

df_con_labels = pd.merge(mes_dia_mas_entradas, labels, on="person", how="right")
df_con_labels.head()

Unnamed: 0,person,cant_mes,cant_dia,label
0,5c76e694,1609,191,0
1,18489dd5,1563,174,0
2,1775ba85,1450,126,0
3,6abd2bf1,1334,300,0
4,97b0c0d1,1328,217,0


In [9]:
df_con_labels_num = df_con_labels[["cant_mes", "cant_dia", "label"]]

## Prueba

In [8]:
# Separamos la variable a predecir

X, y = df_con_labels_num.iloc[:,:-1],df_con_labels_num.iloc[:,-1]

In [9]:
# Convertimos los datos a DMatrix

data_dmatrix = xgb.DMatrix(data=X,label=y)


In [10]:

#Creamos set de entrenamiento y test

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)

In [11]:
#Instanciamos el regresor de XGBoost

xg_reg = xgb.XGBRegressor(objective ='binary:hinge', 
                colsample_bytree = 0.9, learning_rate = 0.1,
                max_depth = 55, alpha = 70, n_estimators = 6)

In [12]:
#Entrenamos

xg_reg.fit(X_train,y_train)

XGBRegressor(alpha=70, base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=0.9, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=55, min_child_weight=1, missing=None, n_estimators=6,
       n_jobs=1, nthread=None, objective='binary:hinge', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)

In [13]:
# Predecimos

preds = xg_reg.predict(X_test)

In [14]:
# Vemos que onda

train_accuracy = accuracy_score(y_train, xg_reg.predict(X_train))
test_accuracy = accuracy_score(y_test, preds)
matriz_de_confusion = confusion_matrix(y_test, preds)
matriz_de_confusion

array([[3596,   76],
       [ 196,   15]], dtype=int64)

### Entradas en los ultimos 30 dias (hasta la ultima visita)

In [10]:
df_meses = df[["timestamp", "person", "month"]]
df_ultimo_dia = df_meses.groupby("person")["timestamp"].max().reset_index()
df_ultimo_dia.columns = ["person", "last_day"]
df_meses_ultimo_dia = pd.merge(df_ultimo_dia, df_meses, on="person", how="inner")
entradas_30_dias = df_meses_ultimo_dia.loc[((df_meses_ultimo_dia["last_day"] - timedelta(days=30)) < \
                                           df_meses_ultimo_dia["timestamp"]) & (df_meses_ultimo_dia["timestamp"]< \
                                           df_meses_ultimo_dia["last_day"])]
entradas_30_dias.head()

Unnamed: 0,person,last_day,timestamp,month
1,0008ed71,2018-05-17 16:28:37,2018-05-17 12:27:47,5
2,0008ed71,2018-05-17 16:28:37,2018-05-17 13:45:00,5
3,0008ed71,2018-05-17 16:28:37,2018-05-17 16:22:06,5
4,0008ed71,2018-05-17 16:28:37,2018-05-17 13:44:59,5
5,0008ed71,2018-05-17 16:28:37,2018-05-17 16:21:54,5


In [11]:
entradas_30_dias_df = entradas_30_dias["person"].value_counts().rename("entradas_30_dias").reset_index()
entradas_30_dias_df.columns = ["person", "entradas_30_dias"]
entradas_30_dias_df.head()

Unnamed: 0,person,entradas_30_dias
0,c76b8417,3027
1,5c76e694,1608
2,a7ffa917,1603
3,18489dd5,1562
4,622b4acf,1458


In [12]:
# Merge

df_con_labels2 = pd.merge(entradas_30_dias_df, df_con_labels, on="person", how="inner")
df_con_labels2.head()

Unnamed: 0,person,entradas_30_dias,cant_mes,cant_dia,label
0,5c76e694,1608,1609,191,0
1,18489dd5,1562,1563,174,0
2,6abd2bf1,1319,1334,300,0
3,97b0c0d1,1306,1328,217,0
4,595b9b50,1177,1179,377,0


In [13]:
df_con_labels2_num = df_con_labels2[["entradas_30_dias", "cant_mes", "cant_dia", "label"]]

## Prueba

In [19]:
# Separamos la variable a predecir

X, y = df_con_labels2_num.iloc[:,:-1], df_con_labels2_num.iloc[:,-1]

In [20]:
# Convertimos los datos a DMatrix

data_dmatrix = xgb.DMatrix(data=X,label=y)


In [21]:

#Creamos set de entrenamiento y test

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)

In [22]:
preds

array([0., 0., 1., ..., 0., 0., 0.], dtype=float32)

In [23]:
#Instanciamos el regresor de XGBoost

xg_reg = xgb.XGBRegressor(objective ='binary:hinge', 
                colsample_bytree = 0.1, learning_rate = 0.1,
                max_depth = 15, alpha = 70, n_estimators = 6)

In [24]:
#Entrenamos

xg_reg.fit(X_train,y_train)

XGBRegressor(alpha=70, base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=0.1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=15, min_child_weight=1, missing=None, n_estimators=6,
       n_jobs=1, nthread=None, objective='binary:hinge', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)

In [25]:
# Predecimos

preds = xg_reg.predict(X_test)
preds

array([0., 0., 0., ..., 0., 1., 0.], dtype=float32)

In [29]:
# Vemos que onda

train_accuracy = accuracy_score(y_train, xg_reg.predict(X_train))
test_accuracy = accuracy_score(y_test, preds)
matriz_de_confusion = confusion_matrix(y_test, preds)
matriz_de_confusion

array([[3399,  204],
       [ 178,   22]], dtype=int64)

## Entradas en el ultimo mes (mes 5)

In [23]:
entradas_mes_5 = df_meses.loc[df_meses["month"] == 5]
cant_entradas_mes_5 = entradas_mes_5.groupby("person").agg({"month": "count"}).reset_index()
cant_entradas_mes_5.columns = ["person", "cant_mes_5"]
cant_entradas_mes_5.head()

Unnamed: 0,person,cant_mes_5
0,0008ed71,6
1,00091926,448
2,000ba417,206
3,000c79fe,17
4,000e4d9e,411


In [24]:
# Merge

df_con_mes_5 = pd.merge(cant_entradas_mes_5, df_con_labels2, on="person", how="inner")
df_con_mes_5.head()

Unnamed: 0,person,cant_mes_5,entradas_30_dias,cant_mes,cant_dia,label
0,0008ed71,6,5,6,6,0
1,000c79fe,17,16,17,17,0
2,001802e4,19,18,19,19,0
3,0019e639,290,358,290,72,0
4,001b0bf9,7,6,7,7,0


## Cantidad de entradas totales

In [31]:
entradas_tot = df.groupby("person")["event"].value_counts().rename("entradas_tot").reset_index()
cant_entradas_tot = entradas_tot[["person", "entradas_tot"]]


In [32]:
# Merge

df_con_entradas = pd.merge(cant_entradas_tot, df_con_mes_5, on="person", how="inner")
df_con_entradas.head()

Unnamed: 0,person,entradas_tot,cant_mes_5,entradas_30_dias,cant_mes,cant_dia,label
0,0008ed71,3,6,5,6,6,0
1,0008ed71,2,6,5,6,6,0
2,0008ed71,1,6,5,6,6,0
3,000c79fe,9,17,16,17,17,0
4,000c79fe,3,17,16,17,17,0


### Cantidad de checkouts

In [33]:
checkouts = df.loc[df["event"] == "checkout"]
checkouts_tot = checkouts.groupby("person")["event"].value_counts().rename("check_tot").reset_index()
cant_checkouts_tot = checkouts_tot[["person", "check_tot"]]

In [38]:
# Merge

df_con_check = pd.merge(cant_checkouts_tot, df_con_entradas, on="person", how="inner")
df_con_check.head()

Unnamed: 0,person,check_tot,entradas_tot,cant_mes_5,entradas_30_dias,cant_mes,cant_dia,label
0,0008ed71,3,3,6,5,6,6,0
1,0008ed71,3,2,6,5,6,6,0
2,0008ed71,3,1,6,5,6,6,0
3,000c79fe,1,9,17,16,17,17,0
4,000c79fe,1,3,17,16,17,17,0


In [81]:
# xgboost solo usa valores numericos
df_con_check_num = df_con_check.drop(columns="person")

### Prueba

In [71]:
# Separamos la variable a predecir

X, y = df_con_check_num.iloc[:,:-1], df_con_check_num.iloc[:,-1]

In [72]:
# Convertimos los datos a DMatrix

data_dmatrix = xgb.DMatrix(data=X,label=y)


In [73]:

#Creamos set de entrenamiento y test

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)

In [74]:
#Instanciamos el regresor de XGBoost

xg_reg = xgb.XGBRegressor(objective ='binary:hinge', 
                colsample_bytree = 0.1, learning_rate = 0.1,
                max_depth = 15, alpha = 70, n_estimators = 6)

In [75]:
#Entrenamos

xg_reg.fit(X_train,y_train)

XGBRegressor(alpha=70, base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=0.1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=15, min_child_weight=1, missing=None, n_estimators=6,
       n_jobs=1, nthread=None, objective='binary:hinge', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)

In [76]:
# Predecimos

preds = xg_reg.predict(X_test)
preds

array([0., 1., 0., ..., 0., 0., 0.], dtype=float32)

In [77]:
# Vemos que onda

train_accuracy = accuracy_score(y_train, xg_reg.predict(X_train))
test_accuracy = accuracy_score(y_test, preds)
matriz_de_confusion = confusion_matrix(y_test, preds)
matriz_de_confusion

array([[18688,   643],
       [  425,   217]], dtype=int64)

In [78]:
train_accuracy

0.9499687069720866

In [79]:
test_accuracy

0.9465278125469384

In [80]:
roc_auc_score(y_test, preds)

0.6523717976919869

## Cantidad de entradas al evento mas visitado por cada persona

In [92]:
s = df.groupby("person")['event'].value_counts().rename().reset_index()
s.columns = ['person', 'event', 'cant_ev']
s = s.sort_values('cant_ev',ascending=False)
mayor_evento = s.drop_duplicates(subset=['person']).drop(columns="event")
mayor_evento.head()

Unnamed: 0,person,cant_ev
97161,6abd2bf1,2355
160896,b1f4dbf6,2233
129961,8fb4929e,1912
141831,9ccf882a,1891
6202,06ed04d6,1881


In [91]:
# Cantidad de entradas al modelo mas visitado por cada persona

mod = df.groupby("person")['model'].value_counts().rename().reset_index()
mod.columns = ['person', 'model', 'cant_mod']
mod = mod.sort_values('cant_mod',ascending=False)
mayor_modelo = mod.drop_duplicates(subset=['person']).drop(columns="model")
mayor_modelo.head()

Unnamed: 0,person,cant_mod
154136,b1f4dbf6,836
20443,171e75cb,683
97765,6f19cfd9,624
94322,6abd2bf1,607
135679,9bf968c5,568


In [90]:
# Cantidad de entradas al search engine mas visitado por cada persona

eng = df.groupby("person")['search_engine'].value_counts().rename().reset_index()
eng.columns = ['person', 'search_engine', 'cant_eng']
eng = eng.sort_values('cant_eng',ascending=False)
mayor_engine = eng.drop_duplicates(subset=['person']).drop(columns="search_engine")
mayor_engine.head()

Unnamed: 0,person,cant_eng
20377,c76b8417,762
11210,6ca3126e,245
10145,622b4acf,206
3906,25b77cf2,161
9593,5c76e694,136


In [88]:
# Cantidad de entradas al campaign source mas visitado por cada persona

src = df.groupby("person")['campaign_source'].value_counts().rename().reset_index()
src.columns = ['person', 'campaign_source', 'cant_src']
src = src.sort_values('cant_src',ascending=False)
mayor_camp = src.drop_duplicates(subset=['person']).drop(columns="campaign_source")
mayor_camp.head()

Unnamed: 0,person,cant_src
529,02f14240,500
36357,c76b8417,374
19910,6ca3126e,335
17899,622b4acf,282
10671,3b2d17f6,221


In [87]:
# Cantidad de entradas desde la ciudad mas frecuente de cada persona

cty = df.groupby("person")['city'].value_counts().rename().reset_index()
cty.columns = ['person', 'city', 'cant_city']
src = cty.sort_values('cant_city',ascending=False)
mayor_ciudad = src.drop_duplicates(subset=['person']).drop(columns="city")
mayor_ciudad.head()

Unnamed: 0,person,cant_city
18380,5059f7fd,268
27926,7ac0c607,208
23603,67bdc946,207
58123,ffee0f18,173
35216,9b3b43aa,171


In [93]:
# Merge

from functools import reduce

# merge de todos los dfs

dfs = [mayor_ciudad, mayor_camp, mayor_engine, mayor_modelo, mayor_evento, df_con_check]
df_final = reduce(lambda left,right: pd.merge(left,right,on='person', how='inner'), dfs)
df_final.head()

Unnamed: 0,person,cant_city,cant_src,cant_eng,cant_mod,cant_ev,check_tot,entradas_tot,cant_mes_5,entradas_30_dias,cant_mes,cant_dia,label
0,ffee0f18,173,76,65,80,1518,2,1518,218,217,1028,282,0
1,ffee0f18,173,76,65,80,1518,2,1484,218,217,1028,282,0
2,ffee0f18,173,76,65,80,1518,2,173,218,217,1028,282,0
3,ffee0f18,173,76,65,80,1518,2,100,218,217,1028,282,0
4,ffee0f18,173,76,65,80,1518,2,92,218,217,1028,282,0


In [94]:
df_final_num = df_final.drop(columns="person")

### Prueba

In [95]:
# Separamos la variable a predecir

X, y = df_final_num.iloc[:,:-1], df_final_num.iloc[:,-1]

# Convertimos los datos a DMatrix

data_dmatrix = xgb.DMatrix(data=X,label=y)

#Creamos set de entrenamiento y test

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)

In [96]:
#Instanciamos el regresor de XGBoost

xg_reg = xgb.XGBRegressor(objective ='binary:hinge', 
                colsample_bytree = 0.1, learning_rate = 0.1,
                max_depth = 15, alpha = 70, n_estimators = 6)

In [97]:
#Entrenamos

xg_reg.fit(X_train,y_train)

XGBRegressor(alpha=70, base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=0.1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=15, min_child_weight=1, missing=None, n_estimators=6,
       n_jobs=1, nthread=None, objective='binary:hinge', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)

In [98]:
# Predecimos

preds = xg_reg.predict(X_test)

In [99]:
# Vemos que onda

train_accuracy = accuracy_score(y_train, xg_reg.predict(X_train))
test_accuracy = accuracy_score(y_test, preds)
matriz_de_confusion = confusion_matrix(y_test, preds)
matriz_de_confusion

array([[11842,   575],
       [  284,   210]], dtype=int64)

In [100]:
train_accuracy

0.9352271406994307

In [101]:
test_accuracy

0.9334675857795678

In [102]:
roc_auc_score(y_test, preds)

0.6893968664482774