In [None]:
import pandas as pd
import numpy as np

In [None]:
# Load the raw events CSV. low_memory=False turns off chunked parsing so each
# column's dtype is inferred from the whole file.
events_df = pd.read_csv(
    "data/fiuba-trocafone-tp2-final-set/events_up_to_01062018.csv",
    low_memory=False,
)

In [None]:
events_df.head()

# Generación de features
---

In [None]:
# Start from an empty frame; the feature columns are attached one at a time by
# the cells that follow.
features_df = pd.DataFrame()

In [None]:
# One row per distinct person id found in the events.
features_df['person'] = events_df['person'].unique()

In [None]:
features_df.head()

## Cantidad de eventos totales

In [None]:
# Number of non-null `event` values per person.
df = events_df.groupby('person')['event'].count()

In [None]:
# Attach the per-person aggregate: features_df's 'person' column is matched
# against df's index.
features_df = features_df.join(df, on='person')

In [None]:
# Give the joined count column its feature name.
features_df = features_df.rename(columns={'event': 'cant_eventos_totales'})

In [None]:
features_df.head()

## Cantidad por evento

In [None]:
# Occurrences of each event type, per person.
df = events_df.groupby('person')['event'].value_counts()

In [None]:
# Pivot the inner index level (the event type) into columns, so each person
# becomes one row with one column per event type.
df = df.unstack()
df.head()

In [None]:
# Attach the per-event-type counts: features_df's 'person' column is matched
# against df's index.
features_df = features_df.join(df, on='person')

In [None]:
# Short feature names for the per-event-type counters.
event_count_columns = {
    'ad campaign hit': 'cant_ach',
    'brand listing': 'cant_bl',
    'checkout': 'cant_ch',
    'conversion': 'cant_cv',
    'generic listing': 'cant_gl',
    'lead': 'cant_l',
    'search engine hit': 'cant_seh',
    'searched products': 'cant_s_p',
    'staticpage': 'cant_sp',
    'viewed product': 'cant_vp',
    'visited site': 'cant_vs',
}
features_df = features_df.rename(columns=event_count_columns)

In [None]:
features_df.head()

In [None]:
# An event type a person never produced is missing after the join; store it as 0.
features_df = features_df.fillna(0)

In [None]:
features_df.head()

## Cantidad de returnings

In [None]:
# Count 'New' / 'Returning' occurrences per person, pivot them into columns and
# discard the 'New' column.
visit_kind_counts = events_df.groupby('person')['new_vs_returning'].value_counts()
df = visit_kind_counts.unstack().drop(columns=['New'])

In [None]:
# Attach the remaining visit-kind count column(s): features_df's 'person' column
# is matched against df's index.
features_df = features_df.join(df, on='person')

In [None]:
features_df.head()

## Tiempo total

In [None]:
# Parse the timestamp column into datetimes so it can be subtracted later.
events_df['timestamp'] = pd.to_datetime(events_df.timestamp)

In [None]:
# Span between each person's first and last event.
timestamps_by_person = events_df.groupby('person').timestamp
last_event = timestamps_by_person.agg('max')
first_event = timestamps_by_person.agg('min')
df = last_event - first_event
df.head()

In [None]:
# Attach the per-person time span: features_df's 'person' column is matched
# against df's index. The joined column is still named 'timestamp' here.
features_df = features_df.join(df, on='person')

In [None]:
# Rename the joined 'timestamp' column to its feature name.
features_df = features_df.rename(columns={'timestamp':'tiempo_total'})

In [None]:
features_df[['person', 'tiempo_total']].head()

In [None]:
# Store the time span as float seconds.
# pd.to_numeric applied directly to a timedelta column works on the raw int64
# nanosecond representation: missing values (NaT) do not come out as NaN and the
# float32 downcast throws away precision. Going through total_seconds() keeps NaT
# as NaN and gives the feature an explicit unit.
# The dtype guard keeps the cell safe to re-run once the column is numeric.
if pd.api.types.is_timedelta64_dtype(features_df['tiempo_total']):
    features_df['tiempo_total'] = features_df['tiempo_total'].dt.total_seconds()
features_df['tiempo_total'] = pd.to_numeric(features_df['tiempo_total'], downcast='float')
features_df.tiempo_total.head()

## Tiempo entre new y returning

In [None]:
# Rows whose new_vs_returning value is 'New'.
is_new_visit = events_df['new_vs_returning'] == 'New'
df_news = events_df[is_new_visit]
df_news[['person', 'timestamp', 'new_vs_returning']].head()

In [None]:
# Rows whose new_vs_returning value is 'Returning'.
is_returning_visit = events_df['new_vs_returning'] == 'Returning'
df_returnings = events_df[is_returning_visit]
df_returnings[['person', 'timestamp', 'new_vs_returning']].head()

In [None]:
# Per-person gap between the earliest 'Returning' event and the earliest 'New' event.
first_returning = df_returnings.groupby('person').timestamp.min()
first_new = df_news.groupby('person').timestamp.min()
df = first_returning - first_new
# A person missing either side comes out as NaT. Fill with a zero Timedelta, not
# the integer 0: an integer fill on a timedelta Series is rejected or coerced to
# object depending on the pandas version, which breaks the numeric conversion
# done later.
df = df.fillna(pd.Timedelta(0))
df.head()

In [None]:
# Attach the new->returning gap: features_df's 'person' column is matched against
# df's index. The joined column is still named 'timestamp' here.
features_df = features_df.join(df, on='person')

In [None]:
# Rename the joined 'timestamp' column to its feature name.
features_df = features_df.rename(columns={'timestamp':'tiempo_new_returning'})

In [None]:
features_df[['person', 'tiempo_new_returning']].head()

In [None]:
# Store the new->returning gap as float seconds.
# pd.to_numeric applied directly to a timedelta column works on the raw int64
# nanosecond representation: persons with no gap value (NaT after the join) do
# not come out as NaN, and the float32 downcast throws away precision. Going
# through total_seconds() keeps NaT as NaN and gives the feature an explicit unit.
# The dtype guard keeps the cell safe to re-run once the column is numeric.
if pd.api.types.is_timedelta64_dtype(features_df['tiempo_new_returning']):
    features_df['tiempo_new_returning'] = features_df['tiempo_new_returning'].dt.total_seconds()
features_df['tiempo_new_returning'] = pd.to_numeric(features_df['tiempo_new_returning'], downcast='float')
features_df.tiempo_new_returning.head()

## Tiempo medio entre returnings

In [None]:
# Rows whose new_vs_returning value is 'Returning' (same filter as in the
# previous section, recomputed here).
returning_mask = events_df['new_vs_returning'] == 'Returning'
df_returnings = events_df[returning_mask]
df_returnings[['person', 'timestamp', 'new_vs_returning']].head()

In [None]:
df_returnings.groupby('person').head()

In [None]:
events_df.columns

## Channel más frecuente

In [None]:
def agg_mas_frecuente(x):
    """Return the most frequent non-null value of ``x``.

    Parameters
    ----------
    x : pandas.Series
        Values to inspect (used below as one group's values in a groupby).

    Returns
    -------
    object or float
        The first entry of ``x.value_counts()`` (nulls are excluded by
        ``value_counts``), or ``np.nan`` when ``x`` has no non-null values.
    """
    # Compute the frequency table once; the original built it twice.
    counts = x.value_counts()
    return counts.index[0] if len(counts) > 0 else np.nan

In [None]:
# Most frequent channel per person (NaN when the person has no channel value).
df = events_df.groupby('person')['channel'].agg(agg_mas_frecuente)
df.head()

In [None]:
# Attach the per-person most frequent channel: features_df's 'person' column is
# matched against df's index.
features_df = features_df.join(df, on='person')

In [None]:
features_df.columns

In [None]:
# Give the joined channel column its feature name.
features_df = features_df.rename(columns={'channel': 'channel_frecuente'})

In [None]:
df.value_counts().idxmax()

In [None]:
# Persons with no channel get the value that is most common among the per-person
# "most frequent channel" results held in df.
# Assign the filled column back explicitly: calling fillna(inplace=True) on the
# attribute-accessed column is a chained in-place update, which pandas does not
# guarantee to write through to features_df (it is a no-op under Copy-on-Write).
most_common_channel = df.value_counts().idxmax()
features_df['channel_frecuente'] = features_df['channel_frecuente'].fillna(most_common_channel)

In [None]:
features_df.channel_frecuente.isna().value_counts()

In [None]:
features_df[['person', 'channel_frecuente']].head()

# Predicciones
---

In [None]:
import xgboost as xgb
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split

## Separación de datos

In [None]:
# Load the training labels. low_memory=False turns off chunked parsing so each
# column's dtype is inferred from the whole file.
labels_df = pd.read_csv(
    "data/fiuba-trocafone-tp2-final-set/labels_training_set.csv",
    low_memory=False,
)

In [350]:
# Build the training table: how='right' keeps every row of labels_df and brings
# in the matching feature row by 'person'.
train_df = features_df.merge(labels_df, on='person', how='right')
train_df.shape

(19414, 16)

In [351]:
# Split the target off from the feature columns.
labels = train_df['label']
train_df = train_df.drop('label', axis=1)

In [352]:
# Persons that have features but no training label are the ones to predict.
# Take an explicit copy: this frame has a column overwritten later, and writing
# into a boolean-mask slice of features_df triggers SettingWithCopyWarning.
is_labeled = features_df.person.isin(labels_df.person)
to_predict = features_df[~is_labeled].copy()
to_predict.shape

(19415, 15)

In [365]:
# Hold out 20% of the labelled persons for evaluation (fixed seed, reproducible).
# The split is done in this cell because the statements below and the encoder
# cells that follow need X_train / X_test / y_train / y_test; on a top-to-bottom
# run nothing earlier in the notebook defines them.
X_train, X_test, y_train, y_test = train_test_split(
    train_df.drop(columns=['person']), labels, test_size=0.2, random_state=123)

# Give every frame a fresh 0..n-1 index so later positional assignments line up.
# reset_index(drop=True) replaces the reset_index + drop('index') pair.
X_train = X_train.reset_index(drop=True)
X_test = X_test.reset_index(drop=True)
y_train = pd.DataFrame(y_train).reset_index(drop=True)
y_test = pd.DataFrame(y_test).reset_index(drop=True)
to_predict = pd.DataFrame(to_predict).reset_index(drop=True)

## Encoders

In [354]:
import category_encoders as ce

In [375]:
# Target encoder used for the channel_frecuente column in the cells below.
# NOTE(review): the accepted values of handle_unknown differ between
# category_encoders releases - confirm 'ignore' is valid for the installed version.
te = ce.TargetEncoder(handle_unknown='ignore')

In [376]:
X_train.channel_frecuente.value_counts()

Paid        9929
Direct      2122
Organic     2039
Referral    1292
Social       135
Email         14
Name: channel_frecuente, dtype: int64

In [378]:
# Fit the target encoder on the training split and replace the channel column
# with its encoded values.
X_train['channel_frecuente'] = te.fit_transform(X_train[['channel_frecuente']], y_train['label'])

In [379]:
X_train.channel_frecuente.head()

0    0.061795
1    0.043307
2    0.043307
3    0.043307
4    0.061795
Name: channel_frecuente, dtype: float64

In [380]:
# DMatrix for xgb.cv. The labels must be the ones that belong to X_train's rows:
# `labels` still holds the target for every labelled person (train and test
# together), so it does not match X_train row-for-row.
# np.ravel flattens y_train (a one-column frame at this point) to a 1-D array.
data_dmatrix = xgb.DMatrix(data=X_train, label=np.ravel(y_train))

In [360]:
# Hold out 20% of the labelled persons for evaluation (fixed seed).
# Only split when no split exists yet: the cells above this one encode
# channel_frecuente inside X_train / X_test, so re-splitting at this point on a
# top-to-bottom run would replace the encoded column with the raw strings just
# before the model is fitted.
if 'X_train' not in globals():
    X_train, X_test, y_train, y_test = \
        train_test_split(train_df.drop(columns=['person']) , labels, test_size=0.2, random_state=123)

## Modelo

In [381]:
# Gradient-boosted model (DART booster) trained with a logistic objective; its
# predict() output is used directly as the score for ROC AUC further down.
# NOTE(review): `alpha` is passed here as an extra keyword while the wrapper also
# has its own `reg_alpha` (shown as 0 in the fitted repr below) - confirm which
# of the two the booster actually applies.
# NOTE(review): no random_state is set - confirm whether reproducibility matters here.
xg_reg = xgb.XGBRegressor(objective ='binary:logistic',
                          base_score= 0.1, 
                          colsample_bylevel= 0.9,
                          booster='dart', 
                          colsample_bytree = 0.5,
                          learning_rate = 0.2,
                          max_depth = 8, alpha = 20,
                          gamma=10, 
                          n_estimators = 35,
                          eval_metric='auc')

In [382]:
# Train on the training split (channel_frecuente is already target-encoded).
xg_reg.fit(X_train,y_train)

XGBRegressor(alpha=20, base_score=0.1, booster='dart', colsample_bylevel=0.9,
       colsample_bytree=0.5, eval_metric='auc', gamma=10,
       learning_rate=0.2, max_delta_step=0, max_depth=8,
       min_child_weight=1, missing=None, n_estimators=35, n_jobs=1,
       nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)

## Tests

In [383]:
X_test.channel_frecuente.value_counts()

Paid        2483
Direct       545
Organic      512
Referral     314
Social        28
Email          1
Name: channel_frecuente, dtype: int64

In [388]:
# Encode the test split with the encoder fitted on the training split.
X_test['channel_frecuente'] = te.transform(X_test[['channel_frecuente']])

In [389]:
# Scores for the held-out split.
preds = xg_reg.predict(X_test)

In [390]:
features_df.columns

Index(['person', 'cant_eventos_totales', 'cant_ach', 'cant_bl', 'cant_ch',
       'cant_cv', 'cant_gl', 'cant_l', 'cant_seh', 'cant_s_p', 'cant_sp',
       'cant_vp', 'cant_vs', 'Returning', 'channel_frecuente'],
      dtype='object')

In [391]:
# ROC AUC on the held-out split. Scores recorded from earlier runs:
roc_auc_score(y_test,preds)
# With time features: 0.8082572420432335
# Without time features: 0.80990061942686
# Without total time (with new->returning time): 0.8136200978247129
# Without new->returning time (with total time): 0.8137392183747094

0.8165041116848345

## Predicciones posta

In [392]:
to_predict.drop(columns='person').columns

Index(['cant_eventos_totales', 'cant_ach', 'cant_bl', 'cant_ch', 'cant_cv',
       'cant_gl', 'cant_l', 'cant_seh', 'cant_s_p', 'cant_sp', 'cant_vp',
       'cant_vs', 'Returning', 'channel_frecuente'],
      dtype='object')

In [396]:
# Encode the unlabelled persons with the encoder fitted on the training split.
to_predict['channel_frecuente'] = te.transform(to_predict[['channel_frecuente']])

In [398]:
to_predict.channel_frecuente.value_counts()

4.330748e-02    12530
7.728558e-02     2630
6.179500e-02     2462
6.888545e-02     1606
3.703704e-02      175
1.184665e-07       11
Name: channel_frecuente, dtype: int64

In [399]:
# Score the unlabelled persons; the id column is not a model input.
to_predict_features = to_predict.drop('person', axis=1)
preds_posta = xg_reg.predict(to_predict_features)

In [400]:
preds_posta

array([0.00981102, 0.04123898, 0.01337361, ..., 0.10599574, 0.10599574,
       0.10599574], dtype=float32)

## Generación del csv a publicar

In [401]:
# Empty frame for the submission file; its columns are filled in by the next cells.
to_publish = pd.DataFrame()

In [402]:
# Person ids, in the same row order as the rows that were scored.
to_publish['person'] = to_predict['person']
to_publish.shape

(19415, 1)

In [403]:
# preds_posta is a plain array, so it is assigned by position; this relies on
# to_publish having the same row order as the frame that was scored.
to_publish['label'] = preds_posta
to_publish.shape

(19415, 2)

In [404]:
to_publish.head()

Unnamed: 0,person,label
0,4886f805,0.009811
1,0297fc1e,0.041239
2,2d681dd8,0.013374
3,cccea85e,0.041481
4,4c8a8b93,0.047243


In [405]:
# Write the submission file (person, label) without the index column.
to_publish.to_csv('27_11_3.csv', index=False)

# Validación cruzada
---

In [None]:
# Booster parameters for 3-fold cross-validation.
# Numeric hyper-parameters are given as numbers rather than strings.
# 'n_estimators' is left out: it is the scikit-learn wrapper's name for the
# number of trees, and in the native API used here the number of rounds is set
# by num_boost_round (with early stopping after 10 rounds without improvement).
params = {
    'objective': 'binary:logistic',
    'base_score': 0.3,
    'colsample_bylevel': 0.7,
    'booster': 'dart',
    'colsample_bytree': 0.4,
    'learning_rate': 0.25,
    'max_depth': 13,
    'alpha': 25,
    'gamma': 10,
    'eval_metric': 'auc',
}

cv_results = xgb.cv(dtrain=data_dmatrix, params=params, nfold=3,
                    num_boost_round=50, early_stopping_rounds=10,
                    metrics="auc", as_pandas=True, seed=123)

In [None]:
cv_results.head()

In [None]:
# pyplot (not the top-level matplotlib package) is the module that provides show().
import matplotlib.pyplot as plt

# Set the figure size before plotting so it applies to the figure created below.
plt.rcParams['figure.figsize'] = [10, 10]
xgb.plot_importance(xg_reg)
plt.show()