In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
# Import Area Under the Receiver Operating Characteristic Curve metric to evaluate results
from sklearn.metrics import roc_auc_score
from sklearn.pipeline import Pipeline

In [2]:
from sklearn import preprocessing
from sklearn.feature_extraction import DictVectorizer
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_extraction import FeatureHasher

In [3]:
events_df = pd.read_csv('data/events_up_to_01062018.csv', low_memory=False)
labels_df = pd.read_csv('data/labels_training_set.csv', low_memory=False)

In [4]:
# armo df con registros para entrenamiento
train_df = events_df.merge(labels_df, on='person', how='right')

In [5]:
# armo df con registros a predecir unicamente
test = events_df[~events_df.person.isin(labels_df.person)]

In [6]:
test_size = 0.33
# define a seed, so same experiments output same results every time
seed = 12

In [7]:
# realizo train_test_split
X_train, X_test, y_train, y_test = train_test_split(train_df.loc[:, train_df.columns != 'label'], 
                                                    train_df.label, 
                                                    test_size=test_size, 
                                                    random_state=seed)

In [8]:
# some date processing
def date_proc(df):
    df['timestamp'] = pd.to_datetime(df['timestamp'])
    df['year'] = df['timestamp'].dt.year
    df['month'] = df['timestamp'].dt.month
    df['day'] = df['timestamp'].dt.day
    df['weekday'] = df['timestamp'].dt.day_name()
    df['hour'] = df['timestamp'].dt.hour
    df['year_month_day'] = df['timestamp'].map(lambda x: str(x.year)+"/"+str(x.month)+"/"+str(x.day))
    df['year_month_day'] = pd.to_datetime(df['year_month_day'])
    
date_proc(X_train)
date_proc(X_test)

In [9]:
X_train.dtypes

timestamp                   datetime64[ns]
event                               object
person                              object
url                                 object
sku                                float64
model                               object
condition                           object
storage                             object
color                               object
skus                                object
search_term                         object
staticpage                          object
campaign_source                     object
search_engine                       object
channel                             object
new_vs_returning                    object
city                                object
region                              object
country                             object
device_type                         object
screen_resolution                   object
operating_system_version            object
browser_version                     object
year       

Primero hacemos un label encoding con el weekday, luego aplicamos una transfrmacion que contemple la naturaleza ciclica de la semana. Esto ultimo lo aplicaremos tambien al resto de los features ciclicos (como se explica, por ejemplo, aca: https://ianlondon.github.io/blog/encoding-cyclical-features-24hour-time/)

In [10]:
weekday_le = preprocessing.LabelEncoder()
weekday_le.fit(X_train.weekday)
X_train.weekday = weekday_le.transform(X_train.weekday)

In [11]:
def month_to_cyclic(df):
    df['month_sin'] = df['month'].apply(lambda x: np.sin(2*np.pi*x/12))
    df['month_cos'] = df['month'].apply(lambda x: np.cos(2*np.pi*x/12))
    df.drop('month', axis=1, inplace=True)
    
def day_to_cyclic(df):
    df['day_sin'] = df['day'].apply(lambda x: np.sin(2*np.pi*x/31))
    df['day_cos'] = df['day'].apply(lambda x: np.cos(2*np.pi*x/31))
    df.drop('day', axis=1, inplace=True)

def weekday_to_cyclic(df):
    df['weekday_sin'] = df['weekday'].apply(lambda x: np.sin(2*np.pi*x/7))
    df['weekday_cos'] = df['weekday'].apply(lambda x: np.cos(2*np.pi*x/7))
    df.drop('weekday', axis=1, inplace=True)

def hour_to_cyclic(df):
    df['hour_sin'] = df['hour'].apply(lambda x: np.sin(2*np.pi*x/24))
    df['hour_cos'] = df['hour'].apply(lambda x: np.cos(2*np.pi*x/24))
    df.drop('hour', axis=1, inplace=True)

In [12]:
X_train[['year','month','day','weekday','hour']].head()

Unnamed: 0,year,month,day,weekday,hour
654168,2018,5,22,5,17
755549,2018,5,22,5,18
705141,2018,5,18,0,4
621828,2018,5,16,6,20
592612,2018,5,17,4,13


In [13]:
month_to_cyclic(X_train)
day_to_cyclic(X_train)
weekday_to_cyclic(X_train)
hour_to_cyclic(X_train)

In [14]:
X_train[['month_sin','month_cos','day_sin','day_cos','weekday_sin','weekday_cos','hour_sin','hour_cos']].head()

Unnamed: 0,month_sin,month_cos,day_sin,day_cos,weekday_sin,weekday_cos,hour_sin,hour_cos
654168,0.5,-0.866025,-0.968077,-0.250653,-0.974928,-0.222521,-0.965926,-0.258819
755549,0.5,-0.866025,-0.968077,-0.250653,-0.974928,-0.222521,-1.0,-1.83697e-16
705141,0.5,-0.866025,-0.485302,-0.874347,0.0,1.0,0.866025,0.5
621828,0.5,-0.866025,-0.101168,-0.994869,-0.781831,0.62349,-0.866025,0.5
592612,0.5,-0.866025,-0.299363,-0.954139,-0.433884,-0.900969,-0.258819,-0.9659258


In [15]:
X_train.drop('year_month_day', axis=1, inplace=True)

In [16]:
X_train.dtypes

timestamp                   datetime64[ns]
event                               object
person                              object
url                                 object
sku                                float64
model                               object
condition                           object
storage                             object
color                               object
skus                                object
search_term                         object
staticpage                          object
campaign_source                     object
search_engine                       object
channel                             object
new_vs_returning                    object
city                                object
region                              object
country                             object
device_type                         object
screen_resolution                   object
operating_system_version            object
browser_version                     object
year       

In [17]:
num_of_unique_browsers = len(X_train.browser_version.unique())

***
Good pipeline source: http://zacstewart.com/2014/08/05/pipelines-of-featureunions-of-pipelines.html
***

In [66]:
from sklearn.base import BaseEstimator, TransformerMixin

class ColumnSelector(BaseEstimator, TransformerMixin):
    """
    Selector to select the needed columns in the pipeline
    """
    def __init__(self, cols):
        self.cols = cols

    def fit(self, X):
        return self

    def transform(self, X, y=None):
        return X.loc[:,self.cols]

In [164]:
from sklearn.base import BaseEstimator, TransformerMixin

class NaFiller(BaseEstimator, TransformerMixin):
    """
    Selector to select the needed columns in the pipeline
    """
    def __init__(self, filler):
        self.filler = filler

    def fit(self, X):
        return self

    def transform(self, X, y=None):
        return X.fillna(self.filler)

## Browser version

In [62]:
# from https://www.kaggle.com/baghern/a-deep-dive-into-sklearn-pipelines:
# To make a pipeline, just pass an array of tuples of the format (name, object). The first part is the name of the action, and the second is the actual object. 

browser_pipe = Pipeline([
    ('selector', ColumnSelector(['browser_version'])),
    ('hasher', FeatureHasher(n_features=num_of_unique_browsers//2, 
                             input_type='string')) 
])

In [63]:
browser_pipe.fit_transform(X_train)

<1x144 sparse matrix of type '<class 'numpy.float64'>'
	with 10 stored elements in Compressed Sparse Row format>

This cell has been replaced with the pipeline above

```
# http://fastml.com/converting-categorical-data-into-numbers-with-pandas-and-scikit-learn/
# https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.to_dict.html
v = FeatureHasher(n_features=num_of_unique_browsers//2, input_type='string')
# browser_version_dict = X_train[['browser_version']]
X_train.browser_version.fillna("", inplace=True)
x = v.fit_transform(X_train[['browser_version']])
# x = v.fit_transform(browser_version_dict)
# Attach de sparse vector to df
```

## operating_system_version

In [72]:
X_train.operating_system_version.unique()

array(['Windows 10 ', nan, 'Android 5.0.2', 'Android 4.2.2', 'Android 7',
       'Windows 7 ', 'iOS 10.3.3', 'Android 5.1', 'Android 6.0.1',
       'Android 4.4.4', 'Android 5.1.1', 'Android 7.1.1', 'Android 6',
       'Windows 8.1 ', 'iOS 11.3', 'iOS 11.2.5', 'Windows 8 ',
       'Android 4.1.2', 'Mac OS X 10.13.4', 'Android 4.3', 'Android 5',
       'Linux ', 'Android 5.0.1', 'BlackBerry OS 10.3.3',
       'Windows Vista ', 'Android 4.4.2', 'iOS 9.3.5', 'Windows XP ',
       'iOS 10.2', 'iOS 8.1.3', 'Other ', 'Ubuntu ', 'iOS 9.3.2',
       'BlackBerry OS 10.3.2', 'Android 8', 'Windows Phone 8.1',
       'iOS 7.1.2', 'Android 2.3.6', 'Android 4.1.1', 'Android 7.1.2',
       'iOS 11.2.6', 'iOS 11.2.1', 'Android 4.0.4', 'iOS 11.2.2',
       'iOS 10.3.2', 'Mac OS X 10.11.6', 'iOS 10.2.1', 'Tizen 3',
       'Mac OS X 10.12.6', 'Android 8.1', 'iOS 11.1.2', 'iOS 3.2',
       'Windows Phone 10', 'Chrome OS 10452.85', 'iOS 11.2', 'iOS 5.0.1',
       'Chrome OS 10452.96', 'Android 4.4', 'Andro

In [73]:
os_num_of_unique = len(X_train.operating_system_version.unique())

In [141]:
os_ver_pipe = Pipeline([
    ('selector', ColumnSelector(['operating_system_version'])),
    ('hasher', FeatureHasher(n_features=os_num_of_unique//10, 
                             input_type='string')) 
])

## screen_resolution

In [137]:
# def split_screen_res(x):
#     if x != "":
#         split = x.split("x")
#         return [split[0], split[1]]
#     else:
#         return [0,0]
    
def get_screen_width(x):
    if x != "":
        return x.split("x")[0]
    else:
        return 0
    
def get_screen_height(x):
    if x != "":
        return x.split("x")[1]
    else:
        return 0

def process_screen_res(df):
    df['screen_resolution'].fillna("", inplace=True)
#     df[['screen_width', 'screen_height']] = df['screen_resolution'].apply(lambda x: split_screen_res(x))
    df['screen_width'] = df['screen_resolution'].apply(lambda x: get_screen_width(x))
    df['screen_height'] = df['screen_resolution'].apply(lambda x: get_screen_height(x))
    df.drop('screen_resolution', axis=1, inplace=True)

In [138]:
process_screen_res(X_train)

## device_type

In [139]:
X_train.device_type.unique()

array(['Computer', nan, 'Smartphone', 'Tablet', 'Unknown'], dtype=object)

In [146]:
# X_train.device_type.fillna("", inplace=True)

In [142]:
device_type_pipe = Pipeline([
    ('selector', ColumnSelector(['device_type'])),
    ('na_filler', NaFiller("")),
    ('one_hot', OneHotEncoder()) 
])

In [None]:
device_type_pipe.fit_transform(X_train)

## country

In [159]:
print(X_train.country.unique())
print("\n\tlen: " + str(len(X_train.country.unique())))

['Brazil' nan 'Unknown' 'United States' 'Argentina' 'Uruguay' 'France'
 'Canada' 'India' 'Israel' 'Portugal' 'Mozambique' 'Italy'
 'Slovak Republic' 'South Africa' 'Bolivia' 'Netherlands' 'Guinea-Bissau'
 'Peru' 'Guadeloupe' 'Bulgaria' 'United Kingdom' 'Pakistan' 'Singapore'
 'Colombia' 'Germany' 'Japan' 'Paraguay' 'Russia' 'Romania' 'Burundi'
 'Vietnam' 'Costa Rica' 'Ireland' 'Jamaica']

	len: 35


In [160]:
# X_train.country.fillna("", inplace=True)

In [166]:
device_type_pipe = Pipeline([
    ('selector', ColumnSelector(['device_type'])),
    ('na_filler', NaFiller("")),
    ('one_hot', OneHotEncoder()) 
])