In [None]:
import pandas as pd

In [None]:
#Leo el csv de eventos

events_df = pd.read_csv("data/fiuba-trocafone-tp2-final-set/events_up_to_01062018.csv", low_memory=False)

In [None]:
events_df.head()

# Generación de features
---

In [None]:
features_df = pd.DataFrame()

In [None]:
features_df['person'] = events_df.person.unique()

In [None]:
features_df.head()

## Cantidad eventos totales

In [None]:
df = events_df.groupby('person').event.count()

In [None]:
features_df = features_df.join(df, on='person')

In [None]:
features_df.rename(columns={'event':'cant_eventos_totales'}, inplace=True)

In [None]:
features_df.head()

## Cantidad por evento

In [None]:
df = events_df.groupby('person').event.value_counts()

In [None]:
df = df.unstack()
df.head()

In [None]:
features_df = features_df.join(df, on='person')

In [None]:
features_df.rename(columns={'ad campign hit':'cant_ach', 'brand listing':'cant_bl', 'checkout':'cant_ch', 
                            'conversion':'cant_cv', 'generic listing':'cant_gl', 'lead':'cant_l',
                           'search engine hit':'cant_seh', 'searched products':'cant_s_p', 
                            'staticpage':'cant_sp', 'viewed product':'cant_vp', 'visited site':'cant_vs'}, inplace=True)

In [None]:
features_df.head()

In [None]:
features_df.fillna(0, inplace=True)

In [None]:
features_df.head()

## Cantidad de returnings

In [None]:
df = events_df.groupby('person').new_vs_returning.value_counts().unstack().drop(columns=['New'])

In [None]:
features_df = features_df.join(df, on='person')

In [None]:
features_df.head()

## Tiempo total

In [None]:
events_df['timestamp'] = pd.to_datetime(events_df['timestamp'])

In [None]:
df = (events_df.groupby('person').timestamp.agg('max') - events_df.groupby('person').timestamp.agg('min'))
df.head()

In [None]:
features_df = features_df.join(df, on='person')

In [None]:
features_df = features_df.rename(columns={'timestamp':'tiempo_total'})

In [None]:
features_df[['person', 'tiempo_total']].head()

## Tiempo entre new y returning

In [54]:
df_news = events_df.loc[events_df.new_vs_returning == 'New']
df_news[['person', 'timestamp', 'new_vs_returning']].head()

Unnamed: 0,person,timestamp,new_vs_returning
2136660,f35dddc8,2018-04-11 14:46:36,New
2136661,c82ecb12,2018-04-23 14:56:42,New
2136662,102f402d,2018-05-14 15:56:35,New
2136668,1a6498ed,2018-05-15 18:13:22,New
2136669,f11aace6,2018-05-18 14:44:53,New


In [55]:
df_returnings = events_df.loc[events_df.new_vs_returning == 'Returning']
df_returnings[['person', 'timestamp', 'new_vs_returning']].head()

Unnamed: 0,person,timestamp,new_vs_returning
2136629,4640420b,2018-05-10 22:34:50,Returning
2136630,4640420b,2018-05-15 02:39:45,Returning
2136631,4640420b,2018-05-18 01:15:26,Returning
2136632,4640420b,2018-05-18 19:03:37,Returning
2136633,4640420b,2018-05-18 19:35:12,Returning


In [68]:
df = (df_returnings.groupby('person').timestamp.min() - df_news.groupby('person').timestamp.min())
df = df.fillna(0)
df.head()

person
0008ed71   0 days 02:36:55
00091926   1 days 05:34:55
00091a7a   0 days 00:00:00
000ba417   0 days 01:45:31
000c79fe   0 days 00:00:00
Name: timestamp, dtype: timedelta64[ns]

In [69]:
features_df = features_df.join(df, on='person')

In [74]:
features_df = features_df.rename(columns={'timestamp':'tiempo_new_returning'})

In [75]:
features_df[['person', 'tiempo_new_returning']].head()

Unnamed: 0,person,tiempo_new_returning
0,4886f805,0 days 00:00:00
1,ad93850f,1 days 02:57:51
2,0297fc1e,3 days 00:00:24
3,2d681dd8,9 days 19:28:03
4,cccea85e,2 days 21:21:19


# Predicciones
---

In [None]:
import xgboost as xgb
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split

## Separación de datos

In [None]:
labels_df = pd.read_csv("data/fiuba-trocafone-tp2-final-set/labels_training_set.csv", low_memory=False)

In [None]:
train_df = features_df.merge(labels_df, on='person', how='right')
train_df.shape

In [None]:
labels = train_df.label
train_df = train_df.drop(columns=['label'])

In [None]:
to_predict = features_df[~features_df.person.isin(labels_df.person)]
to_predict.shape

In [None]:
data_dmatrix = xgb.DMatrix(data=train_df.drop(columns=['person']) , label=labels)

In [None]:
X_train, X_test, y_train, y_test = \
    train_test_split(train_df.drop(columns=['person']) , labels, test_size=0.2, random_state=123)

## Modelo

In [None]:
xg_reg = xgb.XGBRegressor(objective ='binary:logistic',base_score= 0.1, 
                          colsample_bylevel= 0.9, booster='dart', 
                colsample_bytree = 0.5, learning_rate = 0.2,
                max_depth = 8, alpha = 20, gamma=10, n_estimators = 35, eval_metric='auc')

In [None]:
xg_reg.fit(X_train,y_train)

## Tests

In [None]:
preds = xg_reg.predict(X_test)

In [None]:
roc_auc_score(y_test,preds)

## Predicciones posta

In [None]:
preds_posta = xg_reg.predict(to_predict.drop(columns=['person']))

In [None]:
preds_posta

## Generación del csv a publicar

In [None]:
to_publish = pd.DataFrame()

In [None]:
to_publish['person'] = to_predict.person
to_publish.shape

In [None]:
to_publish['label'] = preds_posta
to_publish.shape

In [None]:
to_publish.head()

In [None]:
to_publish.to_csv('27_11_2.csv', index=False)

# Algo de cross validation
---

In [None]:
params = {'objective':'binary:logistic', 'base_score':'0.3', 
                      'colsample_bylevel':'0.7', 'booster':'dart', 
                      'colsample_bytree':'0.4', 'learning_rate':'0.25',
                      'max_depth':'13', 'alpha':'25', 'gamma':'10', 
                      'n_estimators':'23', 'eval_metric':'auc'}

cv_results = xgb.cv(dtrain=data_dmatrix, params=params, nfold=3,
                    num_boost_round=50, early_stopping_rounds=10,
                    metrics="auc", as_pandas=True, seed=123)

In [None]:
cv_results.head()

In [None]:
import matplotlib as plt

xgb.plot_importance(xg_reg)
plt.rcParams['figure.figsize'] = [10, 10]
plt.show()