In [42]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
# Import Area Under the Receiver Operating Characteristic Curve metric to evaluate results
from sklearn.metrics import roc_auc_score
from sklearn import preprocessing

In [43]:
events_df = pd.read_csv('data/events_up_to_01062018.csv', low_memory=False)
labels_df = pd.read_csv('data/labels_training_set.csv', low_memory=False)

In [44]:
# armo df con registros para entrenamiento
train_df = events_df.merge(labels_df, on='person', how='right')

In [45]:
# armo df con registros a predecir unicamente
test = events_df[~events_df.person.isin(labels_df.person)]

In [46]:
test_size = 0.33
# define a seed, so same experiments output same results every time
seed = 12

In [47]:
# realizo train_test_split
X_train, X_test, y_train, y_test = train_test_split(train_df.loc[:, train_df.columns != 'label'], 
                                                    train_df.label, 
                                                    test_size=test_size, 
                                                    random_state=seed)

In [48]:
# some date processing
def date_proc(df):
    df['timestamp'] = pd.to_datetime(df['timestamp'])
    df['year'] = df['timestamp'].dt.year
    df['month'] = df['timestamp'].dt.month
    df['day'] = df['timestamp'].dt.day
    df['weekday'] = df['timestamp'].dt.day_name()
    df['hour'] = df['timestamp'].dt.hour
    df['year_month_day'] = df['timestamp'].map(lambda x: str(x.year)+"/"+str(x.month)+"/"+str(x.day))
    df['year_month_day'] = pd.to_datetime(df['year_month_day'])
    
date_proc(X_train)
date_proc(X_test)

In [49]:
X_train.dtypes

timestamp                   datetime64[ns]
event                               object
person                              object
url                                 object
sku                                float64
model                               object
condition                           object
storage                             object
color                               object
skus                                object
search_term                         object
staticpage                          object
campaign_source                     object
search_engine                       object
channel                             object
new_vs_returning                    object
city                                object
region                              object
country                             object
device_type                         object
screen_resolution                   object
operating_system_version            object
browser_version                     object
year       

Primero hacemos un label encoding con el weekday, luego aplicamos una transfrmacion que contemple la naturaleza ciclica de la semana. Esto ultimo lo aplicaremos tambien al resto de los features ciclicos (como se explica, por ejemplo, aca: https://ianlondon.github.io/blog/encoding-cyclical-features-24hour-time/)

In [50]:
weekday_le = preprocessing.LabelEncoder()
weekday_le.fit(X_train.weekday)
X_train.weekday = weekday_le.transform(X_train.weekday)

In [62]:
def month_to_cyclic(df):
    df['month_sin'] = df['month'].apply(lambda x: np.sin(2*np.pi*x/12))
    df['month_cos'] = df['month'].apply(lambda x: np.cos(2*np.pi*x/12))
    df.drop('month', axis=1, inplace=True)
    
def day_to_cyclic(df):
    df['day_sin'] = df['day'].apply(lambda x: np.sin(2*np.pi*x/31))
    df['day_cos'] = df['day'].apply(lambda x: np.cos(2*np.pi*x/31))
    df.drop('day', axis=1, inplace=True)

def weekday_to_cyclic(df):
    df['weekday_sin'] = df['weekday'].apply(lambda x: np.sin(2*np.pi*x/7))
    df['weekday_cos'] = df['weekday'].apply(lambda x: np.cos(2*np.pi*x/7))
    df.drop('weekday', axis=1, inplace=True)

def hour_to_cyclic(df):
    df['hour_sin'] = df['hour'].apply(lambda x: np.sin(2*np.pi*x/24))
    df['hour_cos'] = df['hour'].apply(lambda x: np.cos(2*np.pi*x/24))
    df.drop('hour', axis=1, inplace=True)

In [52]:
X_train[['year','month','day','weekday','hour']].head()

Unnamed: 0,year,month,day,weekday,hour
654168,2018,5,22,5,17
755549,2018,5,22,5,18
705141,2018,5,18,0,4
621828,2018,5,16,6,20
592612,2018,5,17,4,13


In [63]:
month_to_cyclic(X_train)
day_to_cyclic(X_train)
weekday_to_cyclic(X_train)
hour_to_cyclic(X_train)

In [61]:
X_train[['month_sin','month_cos','day_sin','day_cos','weekday_sin','weekday_cos','hour_sin','hour_cos']].head()

Unnamed: 0,month_sin,month_cos,day_sin,day_cos,weekday_sin,weekday_cos,hour_sin,hour_cos
654168,0.5,-0.866025,-0.968077,-0.250653,-0.974928,-0.222521,-0.965926,-0.258819
755549,0.5,-0.866025,-0.968077,-0.250653,-0.974928,-0.222521,-1.0,-1.83697e-16
705141,0.5,-0.866025,-0.485302,-0.874347,0.0,1.0,0.866025,0.5
621828,0.5,-0.866025,-0.101168,-0.994869,-0.781831,0.62349,-0.866025,0.5
592612,0.5,-0.866025,-0.299363,-0.954139,-0.433884,-0.900969,-0.258819,-0.9659258


In [66]:
X_train.drop('year_month_day', axis=1, inplace=True)

In [67]:
X_train.dtypes

timestamp                   datetime64[ns]
event                               object
person                              object
url                                 object
sku                                float64
model                               object
condition                           object
storage                             object
color                               object
skus                                object
search_term                         object
staticpage                          object
campaign_source                     object
search_engine                       object
channel                             object
new_vs_returning                    object
city                                object
region                              object
country                             object
device_type                         object
screen_resolution                   object
operating_system_version            object
browser_version                     object
year       