In [None]:
import pandas as pd
import numpy as np
import lightgbm as lgb

In [None]:
#Leo el csv de eventos

events_df = pd.read_csv("data/fiuba-trocafone-tp2-final-set/events_up_to_01062018.csv", low_memory=False)

In [None]:
events_df.head()

# Generación de features
---

In [None]:
features_df = pd.DataFrame()

In [None]:
features_df['person'] = events_df.person.unique()

In [None]:
features_df.head()

## Cantidad eventos totales

In [None]:
df = events_df.groupby('person').event.count()

In [None]:
features_df = features_df.join(df, on='person')

In [None]:
features_df.rename(columns={'event':'cant_eventos_totales'}, inplace=True)

In [None]:
features_df.head()

## Cantidad por evento

In [None]:
df = events_df.groupby('person').event.value_counts()

In [None]:
df = df.unstack()
df.head()

In [None]:
features_df = features_df.join(df, on='person')

In [None]:
features_df.rename(columns={'ad campaign hit':'cant_ach', 'brand listing':'cant_bl', 'checkout':'cant_ch', 
                            'conversion':'cant_cv', 'generic listing':'cant_gl', 'lead':'cant_l',
                           'search engine hit':'cant_seh', 'searched products':'cant_s_p', 
                            'staticpage':'cant_sp', 'viewed product':'cant_vp', 'visited site':'cant_vs'}, inplace=True)

In [None]:
features_df.head()

In [None]:
features_df.fillna(0, inplace=True)

In [None]:
features_df.head()

## Cantidad de returnings

In [None]:
df = events_df.groupby('person').new_vs_returning.value_counts().unstack().drop(columns=['New'])

In [None]:
features_df = features_df.join(df, on='person')

In [None]:
features_df.head()

## Tiempo total

In [None]:
events_df['timestamp'] = pd.to_datetime(events_df['timestamp'])

In [None]:
df = (events_df.groupby('person').timestamp.agg('max') - events_df.groupby('person').timestamp.agg('min'))
df.head()

In [None]:
features_df = features_df.join(df, on='person')

In [None]:
features_df = features_df.rename(columns={'timestamp':'tiempo_total'})

In [None]:
features_df[['person', 'tiempo_total']].head()

In [None]:
features_df.tiempo_total = pd.to_numeric(features_df.tiempo_total, downcast='float')
features_df.tiempo_total.head()

## Tiempo entre new y returning

In [None]:
df_news = events_df.loc[events_df.new_vs_returning == 'New']
df_news[['person', 'timestamp', 'new_vs_returning']].head()

In [None]:
df_returnings = events_df.loc[events_df.new_vs_returning == 'Returning']
df_returnings[['person', 'timestamp', 'new_vs_returning']].head()

In [None]:
df = (df_returnings.groupby('person').timestamp.min() - df_news.groupby('person').timestamp.min())
df = df.fillna(0)
df.head()

In [None]:
features_df = features_df.join(df, on='person')

In [None]:
features_df = features_df.rename(columns={'timestamp':'tiempo_new_returning'})

In [None]:
features_df[['person', 'tiempo_new_returning']].head()

In [None]:
features_df.tiempo_new_returning = pd.to_numeric(features_df.tiempo_new_returning, downcast='float')
features_df.tiempo_new_returning.head()

## Mean entre tiempos de returnings

## Channel más frecuente

In [None]:
def agg_mas_frecuente(x):
    """Return the most frequent non-null value of a Series.

    Parameters
    ----------
    x : pandas.Series
        Values to inspect (for example, one group of a groupby).

    Returns
    -------
    object
        The first entry of ``x.value_counts()``, i.e. the value with the
        highest count, or ``np.nan`` when ``x`` has no non-null values.
    """
    # Count once and reuse the result for both the emptiness check and the lookup.
    counts = x.value_counts()
    return counts.index[0] if len(counts) > 0 else np.nan

In [None]:
events_df.groupby('person').channel.value_counts()

In [None]:
df = events_df.groupby('person').channel.agg(lambda x: agg_mas_frecuente(x))
df.head()

In [None]:
features_df = features_df.join(df, on='person')

In [None]:
features_df.columns

In [None]:
features_df.rename(columns={'channel':'channel_frecuente'}, inplace=True)

In [None]:
df.value_counts().idxmax()

In [None]:
features_df.channel_frecuente.fillna(df.value_counts().idxmax(), inplace=True)

In [None]:
features_df.channel_frecuente.isna().value_counts()

In [None]:
features_df[['person', 'channel_frecuente']].head()

## Device más frecuente

In [None]:
events_df.device_type.value_counts()

In [None]:
df = events_df.groupby('person').device_type.agg(lambda x: agg_mas_frecuente(x))
df.head()

In [None]:
features_df = features_df.join(df, on='person')

In [None]:
features_df.columns

In [None]:
features_df.rename(columns={'device_type':'device_frecuente'}, inplace=True)

In [None]:
df.value_counts().idxmax()

In [None]:
features_df.channel_frecuente.fillna(df.value_counts().idxmax(), inplace=True)

In [None]:
features_df[['person', 'device_frecuente']].head()

## Quincenas

In [None]:
events_df['timestamp'] = pd.to_datetime(events_df['timestamp'])
events_df['mes'] = events_df['timestamp'].dt.month
events_df['dia'] = events_df['timestamp'].dt.day

In [None]:
events_df['q1'] = ((events_df['mes']==1) & (events_df['dia']>1) & (events_df['dia']<16))
events_df['q2'] = ((events_df['mes']==1) & (events_df['dia']>15) & (events_df['dia']<32))
events_df['q3'] = ((events_df['mes']==2) & (events_df['dia']>1) & (events_df['dia']<16))
events_df['q4'] = ((events_df['mes']==2) & (events_df['dia']>15) & (events_df['dia']<29))
events_df['q5'] = ((events_df['mes']==3) & (events_df['dia']>1) & (events_df['dia']<16))
events_df['q6'] = ((events_df['mes']==3) & (events_df['dia']>15) & (events_df['dia']<32))
events_df['q7'] = ((events_df['mes']==4) & (events_df['dia']>1) & (events_df['dia']<16))
events_df['q8'] = ((events_df['mes']==4) & (events_df['dia']>15) & (events_df['dia']<31))
events_df['q9'] = ((events_df['mes']==5) & (events_df['dia']>1) & (events_df['dia']<16))
events_df['q10'] = ((events_df['mes']==5) & (events_df['dia']>15) & (events_df['dia']<32))

In [None]:
df = events_df[['person', 'q1','q2','q3','q4','q5','q6','q7','q8','q9','q10']].groupby('person').agg(lambda x: x.any())
df.head()

In [None]:
features_df = features_df.join(df, on='person')

In [None]:
features_df.columns

# Predicciones
---

In [None]:
import lightgbm as lgb
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split

## Separación de datos

In [None]:
labels_df = pd.read_csv("data/fiuba-trocafone-tp2-final-set/labels_training_set.csv", low_memory=False)

In [None]:
train_df = features_df.merge(labels_df, on='person', how='right')
train_df.shape

In [None]:
labels = train_df.label
train_df = train_df.drop(columns=['label'])

In [None]:
to_predict = features_df[~features_df.person.isin(labels_df.person)]
to_predict.shape

In [None]:
X_train, X_test, y_train, y_test = \
    train_test_split(train_df.drop(columns=['person']) , labels, test_size=0.2, random_state=123)

In [None]:
X_train.reset_index(inplace=True)
X_train.drop(columns='index', inplace=True)

X_test.reset_index(inplace=True)
X_test.drop(columns='index', inplace=True)

y_train = pd.DataFrame(y_train)
y_train.reset_index(inplace=True)
y_train.drop(columns='index', inplace=True)

y_test = pd.DataFrame(y_test)
y_test.reset_index(inplace=True)
y_test.drop(columns='index', inplace=True)

to_predict = pd.DataFrame(to_predict)
to_predict.reset_index(inplace=True)
to_predict.drop(columns='index', inplace=True)

## Encoders

In [None]:
import category_encoders as ce

### Channel frecuente

In [None]:
te_channel_frecuente = ce.TargetEncoder(handle_unknown='ignore')

In [None]:
X_train.channel_frecuente = te_channel_frecuente.fit_transform(pd.DataFrame(X_train.channel_frecuente), y_train.label)

### Device frecuente

In [None]:
te_device_frecuente = ce.TargetEncoder(handle_unknown='ignore')

In [None]:
X_train.device_frecuente = te_device_frecuente.fit_transform(pd.DataFrame(X_train.device_frecuente), y_train.label)

## Modelo

In [None]:
c=lgb.LGBMRegressor(learning_rate=0.5,
                     objective='regression',
                     max_depth=20, 
                     num_leaves=3000,
                     n_estimators=50,
                     colsample_bytree=0.9,
                     n_jobs=1,
                     random_state=0,
                     silent=False,
                     subsample=1,
                     sumsample_freq=1, 
                     two_round=True,
                     boosting_type='dart')

dt=c.fit(X_train,y_train.label,eval_set=[(X_test,y_test.label)],verbose=True,early_stopping_rounds=1)

In [None]:
c.score(X_test,y_test)
#0.9415400463559104

## Tests

In [None]:
X_test.channel_frecuente = te_channel_frecuente.transform(pd.DataFrame(X_test.channel_frecuente))

In [None]:
X_test.device_frecuente = te_device_frecuente.transform(pd.DataFrame(X_test.device_frecuente))

In [None]:
preds = xg_reg.predict(X_test)

In [None]:
features_df.columns

In [None]:
roc_auc_score(y_test,preds)

## Predicciones posta

In [None]:
to_predict.channel_frecuente = te_channel_frecuente.transform(pd.DataFrame(to_predict.drop(columns='person').channel_frecuente))

In [None]:
to_predict.device_frecuente = te_device_frecuente.transform(pd.DataFrame(to_predict.drop(columns='person').device_frecuente))

In [None]:
preds_posta = c.predict_proba(to_predict.drop(columns=['person']))

In [None]:
preds_posta

In [None]:
preds_posta.shape

In [None]:
preds_posta2=[]
print(c.classes_)
print(len(preds_posta))
for elem in preds_posta:
    preds_posta2.append(elem[1])
print(len(preds_posta2))

## Generación del csv a publicar

In [None]:
to_publish = pd.DataFrame()

In [None]:
to_publish['person'] = to_predict.person
to_publish.shape

In [None]:
to_publish['label'] = preds_posta2
to_publish.shape

In [None]:
to_publish.head()

In [None]:
to_publish.to_csv('28_11_2.csv', index=False)