In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
# Import Area Under the Receiver Operating Characteristic Curve metric to evaluate results
from sklearn.metrics import roc_auc_score
from sklearn.pipeline import Pipeline

In [2]:
from sklearn import preprocessing
from sklearn.feature_extraction import DictVectorizer
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_extraction import FeatureHasher
from sklearn.preprocessing import LabelBinarizer

In [3]:
events_df = pd.read_csv('data/events_up_to_01062018.csv', low_memory=False)
labels_df = pd.read_csv('data/labels_training_set.csv', low_memory=False)

In [4]:
# armo df con registros completos clasificados
train_df = events_df.merge(labels_df, on='person', how='right')

In [5]:
# armo df con registros a predecir unicamente
to_predict = events_df[~events_df.person.isin(labels_df.person)]

In [6]:
test_size = 0.33
# define a seed, so same experiments output same results every time
seed = 12

In [7]:
# realizo train_test_split
X_train, X_test, y_train, y_test = train_test_split(train_df.loc[:, train_df.columns != 'label'], 
                                                    train_df.label, 
                                                    test_size=test_size, 
                                                    random_state=seed)

***
## Note on preprocessing
All preprocessing which can be done in just one way, i.e. it doesn't need hyper parameter adjustment, will be done outside pipelines.

Good pipeline sources: 
* http://zacstewart.com/2014/08/05/pipelines-of-featureunions-of-pipelines.html
* https://www.kaggle.com/baghern/a-deep-dive-into-sklearn-pipelines
* https://www.kaggle.com/metadist/work-like-a-pro-with-pipelines-and-feature-unions
* https://www.kaggle.com/sermakarevich/sklearn-pipelines-tutorial
* http://zacstewart.com/2014/08/05/pipelines-of-featureunions-of-pipelines.html
***

In [8]:
# some date processing
def date_proc(df):
    df['timestamp'] = pd.to_datetime(df['timestamp'])
    df['year'] = df['timestamp'].dt.year
    df['month'] = df['timestamp'].dt.month
    df['day'] = df['timestamp'].dt.day
    df['weekday'] = df['timestamp'].dt.day_name()
    df['hour'] = df['timestamp'].dt.hour
    df['year_month_day'] = df['timestamp'].map(lambda x: str(x.year)+"/"+str(x.month)+"/"+str(x.day))
    df['year_month_day'] = pd.to_datetime(df['year_month_day'])
    
date_proc(X_train)
date_proc(X_test)

In [9]:
X_train.dtypes

timestamp                   datetime64[ns]
event                               object
person                              object
url                                 object
sku                                float64
model                               object
condition                           object
storage                             object
color                               object
skus                                object
search_term                         object
staticpage                          object
campaign_source                     object
search_engine                       object
channel                             object
new_vs_returning                    object
city                                object
region                              object
country                             object
device_type                         object
screen_resolution                   object
operating_system_version            object
browser_version                     object
year       

Primero hacemos un label encoding con el weekday, luego aplicamos una transfrmacion que contemple la naturaleza ciclica de la semana. Esto ultimo lo aplicaremos tambien al resto de los features ciclicos (como se explica, por ejemplo, aca: https://ianlondon.github.io/blog/encoding-cyclical-features-24hour-time/)

In [10]:
weekday_le = preprocessing.LabelEncoder()
weekday_le.fit(X_train.weekday)

X_train.weekday = weekday_le.transform(X_train.weekday)
X_test.weekday = weekday_le.transform(X_test.weekday)

In [11]:
def month_to_cyclic(df):
    df['month_sin'] = df['month'].apply(lambda x: np.sin(2*np.pi*x/12))
    df['month_cos'] = df['month'].apply(lambda x: np.cos(2*np.pi*x/12))
    df.drop('month', axis=1, inplace=True)
    
def day_to_cyclic(df):
    df['day_sin'] = df['day'].apply(lambda x: np.sin(2*np.pi*x/31))
    df['day_cos'] = df['day'].apply(lambda x: np.cos(2*np.pi*x/31))
    df.drop('day', axis=1, inplace=True)

def weekday_to_cyclic(df):
    df['weekday_sin'] = df['weekday'].apply(lambda x: np.sin(2*np.pi*x/7))
    df['weekday_cos'] = df['weekday'].apply(lambda x: np.cos(2*np.pi*x/7))
    df.drop('weekday', axis=1, inplace=True)

def hour_to_cyclic(df):
    df['hour_sin'] = df['hour'].apply(lambda x: np.sin(2*np.pi*x/24))
    df['hour_cos'] = df['hour'].apply(lambda x: np.cos(2*np.pi*x/24))
    df.drop('hour', axis=1, inplace=True)

In [12]:
# cell to compare results before & after processing
X_train[['year','month','day','weekday','hour']].head()

Unnamed: 0,year,month,day,weekday,hour
654168,2018,5,22,5,17
755549,2018,5,22,5,18
705141,2018,5,18,0,4
621828,2018,5,16,6,20
592612,2018,5,17,4,13


In [13]:
month_to_cyclic(X_train)
day_to_cyclic(X_train)
weekday_to_cyclic(X_train)
hour_to_cyclic(X_train)

month_to_cyclic(X_test)
day_to_cyclic(X_test)
weekday_to_cyclic(X_test)
hour_to_cyclic(X_test)

In [14]:
X_train[['month_sin','month_cos','day_sin','day_cos','weekday_sin','weekday_cos','hour_sin','hour_cos']].head()

Unnamed: 0,month_sin,month_cos,day_sin,day_cos,weekday_sin,weekday_cos,hour_sin,hour_cos
654168,0.5,-0.866025,-0.968077,-0.250653,-0.974928,-0.222521,-0.965926,-0.258819
755549,0.5,-0.866025,-0.968077,-0.250653,-0.974928,-0.222521,-1.0,-1.83697e-16
705141,0.5,-0.866025,-0.485302,-0.874347,0.0,1.0,0.866025,0.5
621828,0.5,-0.866025,-0.101168,-0.994869,-0.781831,0.62349,-0.866025,0.5
592612,0.5,-0.866025,-0.299363,-0.954139,-0.433884,-0.900969,-0.258819,-0.9659258


In [15]:
X_train.drop('year_month_day', axis=1, inplace=True)
X_test.drop('year_month_day', axis=1, inplace=True)

In [16]:
X_train.dtypes

timestamp                   datetime64[ns]
event                               object
person                              object
url                                 object
sku                                float64
model                               object
condition                           object
storage                             object
color                               object
skus                                object
search_term                         object
staticpage                          object
campaign_source                     object
search_engine                       object
channel                             object
new_vs_returning                    object
city                                object
region                              object
country                             object
device_type                         object
screen_resolution                   object
operating_system_version            object
browser_version                     object
year       

### Build some custom transformers

In [17]:
from sklearn.base import BaseEstimator, TransformerMixin

class ColumnSelector(BaseEstimator, TransformerMixin):
    """
    Selector to select the needed columns in the pipeline
    """
    def __init__(self, cols):
        self.cols = cols

    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        return X.loc[:,self.cols]

In [18]:
from sklearn.base import BaseEstimator, TransformerMixin

class NaFiller(BaseEstimator, TransformerMixin):
    """
    Selector to select the needed columns in the pipeline
    """
    def __init__(self, filler):
        self.filler = filler

    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        return X.fillna(self.filler)

# Feature pipeline creation and some pre processing

## Browser version

In [19]:
num_of_unique_browsers = len(X_train.browser_version.unique())

In [20]:
# from https://www.kaggle.com/baghern/a-deep-dive-into-sklearn-pipelines:
# To make a pipeline, just pass an array of tuples of the format (name, object). The first part is the name of the action, and the second is the actual object. 

browser_pipe = Pipeline([
    ('selector', ColumnSelector(['browser_version'])),
    ('na_filler', NaFiller("")),
    ('hasher', OneHotEncoder(handle_unknown='ignore')) 
])

In [21]:
browser_pipe.fit_transform(X_train)

<785163x288 sparse matrix of type '<class 'numpy.float64'>'
	with 785163 stored elements in Compressed Sparse Row format>

This cell has been replaced with the pipeline above

```
# http://fastml.com/converting-categorical-data-into-numbers-with-pandas-and-scikit-learn/
# https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.to_dict.html
v = FeatureHasher(n_features=num_of_unique_browsers//2, input_type='string')
# browser_version_dict = X_train[['browser_version']]
X_train.browser_version.fillna("", inplace=True)
x = v.fit_transform(X_train[['browser_version']])
# x = v.fit_transform(browser_version_dict)
# Attach de sparse vector to df
```

## operating_system_version

In [22]:
X_train.operating_system_version.unique()

array(['Windows 10 ', nan, 'Android 5.0.2', 'Android 4.2.2', 'Android 7',
       'Windows 7 ', 'iOS 10.3.3', 'Android 5.1', 'Android 6.0.1',
       'Android 4.4.4', 'Android 5.1.1', 'Android 7.1.1', 'Android 6',
       'Windows 8.1 ', 'iOS 11.3', 'iOS 11.2.5', 'Windows 8 ',
       'Android 4.1.2', 'Mac OS X 10.13.4', 'Android 4.3', 'Android 5',
       'Linux ', 'Android 5.0.1', 'BlackBerry OS 10.3.3',
       'Windows Vista ', 'Android 4.4.2', 'iOS 9.3.5', 'Windows XP ',
       'iOS 10.2', 'iOS 8.1.3', 'Other ', 'Ubuntu ', 'iOS 9.3.2',
       'BlackBerry OS 10.3.2', 'Android 8', 'Windows Phone 8.1',
       'iOS 7.1.2', 'Android 2.3.6', 'Android 4.1.1', 'Android 7.1.2',
       'iOS 11.2.6', 'iOS 11.2.1', 'Android 4.0.4', 'iOS 11.2.2',
       'iOS 10.3.2', 'Mac OS X 10.11.6', 'iOS 10.2.1', 'Tizen 3',
       'Mac OS X 10.12.6', 'Android 8.1', 'iOS 11.1.2', 'iOS 3.2',
       'Windows Phone 10', 'Chrome OS 10452.85', 'iOS 11.2', 'iOS 5.0.1',
       'Chrome OS 10452.96', 'Android 4.4', 'Andro

In [23]:
os_num_of_unique = len(X_train.operating_system_version.unique())

In [24]:
os_ver_pipe = Pipeline([
    ('selector', ColumnSelector(['operating_system_version'])),
    ('na_filler', NaFiller("")),
    ('hasher', OneHotEncoder(handle_unknown='ignore')) 
])

In [25]:
os_ver_pipe.fit_transform(X_train)

<785163x104 sparse matrix of type '<class 'numpy.float64'>'
	with 785163 stored elements in Compressed Sparse Row format>

## screen_resolution

In [26]:
def get_screen_width(x):
    if x != "":
        return x.split("x")[0]
    else:
        return 0
    
def get_screen_height(x):
    if x != "":
        return x.split("x")[1]
    else:
        return 0

def process_screen_res(df):
    df['screen_resolution'].fillna("", inplace=True)
    df['screen_width'] = df['screen_resolution'].apply(lambda x: get_screen_width(x))
    df['screen_height'] = df['screen_resolution'].apply(lambda x: get_screen_height(x))
    df.drop('screen_resolution', axis=1, inplace=True)

In [27]:
process_screen_res(X_train)
process_screen_res(X_test)

## device_type

In [28]:
X_train.device_type.unique()

array(['Computer', nan, 'Smartphone', 'Tablet', 'Unknown'], dtype=object)

In [29]:
# X_train.device_type.fillna("", inplace=True)

In [30]:
device_type_pipe = Pipeline([
    ('selector', ColumnSelector(['device_type'])),
    ('na_filler', NaFiller("")),
    ('one_hot', OneHotEncoder(handle_unknown='ignore')) 
])

In [31]:
device_type_pipe.fit_transform(X_train)

<785163x5 sparse matrix of type '<class 'numpy.float64'>'
	with 785163 stored elements in Compressed Sparse Row format>

## country

In [32]:
print(X_train.country.unique())
print("\n\tlen: " + str(len(X_train.country.unique())))

['Brazil' nan 'Unknown' 'United States' 'Argentina' 'Uruguay' 'France'
 'Canada' 'India' 'Israel' 'Portugal' 'Mozambique' 'Italy'
 'Slovak Republic' 'South Africa' 'Bolivia' 'Netherlands' 'Guinea-Bissau'
 'Peru' 'Guadeloupe' 'Bulgaria' 'United Kingdom' 'Pakistan' 'Singapore'
 'Colombia' 'Germany' 'Japan' 'Paraguay' 'Russia' 'Romania' 'Burundi'
 'Vietnam' 'Costa Rica' 'Ireland' 'Jamaica']

	len: 35


In [33]:
# X_train.country.fillna("", inplace=True)

In [34]:
country_pipe = Pipeline([
    ('selector', ColumnSelector(['country'])),
    ('na_filler', NaFiller("")),
    ('one_hot', OneHotEncoder(handle_unknown='ignore')) 
])

## region

In [35]:
print(X_train.region.unique())
print("\n\tlen: " + str(len(X_train.region.unique())))

['Rio de Janeiro' nan 'Unknown' 'Tocantins' 'Sao Paulo' 'Santa Catarina'
 'Minas Gerais' 'Bahia' 'Parana' 'Ceara' 'Rio Grande do Norte'
 'Federal District' 'Espirito Santo' 'Amazonas' 'Pernambuco' 'Para'
 'Paraíba' 'Goias' 'Rio Grande do Sul' 'California' 'Mato Grosso do Sul'
 'Buenos Aires F.D.' 'Piaui' 'Maranhao' 'Rondonia' 'Alagoas' 'Mato Grosso'
 'Sergipe' 'Departamento de Montevideo' 'Acre' 'Amapa' 'Buenos Aires'
 'Wisconsin' 'New Jersey' 'Quebec' 'Karnataka' 'Roraima' 'Tennessee'
 'Paris' 'Setúbal' 'Illinois' 'British Columbia' 'New York'
 'Cidade de Maputo' 'Texas' 'Florida' 'Milan' 'Western Cape'
 'Departamento de Santa Cruz' 'North Holland' 'Virginia' 'Georgia'
 'Departamento de La Paz' 'Sofia-Capital' 'Oklahoma' 'England' 'Ohio'
 'Delaware' 'Seine-Saint-Denis' 'Connecticut' 'Provincia di Lecce'
 'Entre Rios' 'Iowa' 'Wrexham' 'Hesse' 'Tokyo' 'Asuncion'
 "Tul'skaya Oblast'" 'Ontario' 'Bucuresti' 'Tinh Quang Nam'
 'Sofala Province' 'Washington' 'Michigan' 'Turin' 'North' 'County

In [36]:
region_pipe = Pipeline([
    ('selector', ColumnSelector(['region'])),
    ('na_filler', NaFiller("")),
    ('one_hot', OneHotEncoder(handle_unknown='ignore'))
])

In [37]:
# region_pipe.fit_transform(X_train)

## city

In [38]:
print(X_train.city.unique())
print("\n\tlen: " + str(len(X_train.city.unique())))

['Cardoso Moreira' nan 'Unknown' ... 'Tupaciguara' 'Marco' 'Arembepe']

	len: 1729


In [39]:
city_pipe = Pipeline([
    ('selector', ColumnSelector(['city'])),
    ('na_filler', NaFiller("")),
    ('one_hot', OneHotEncoder(handle_unknown='ignore'))
])

In [40]:
# city_pipe.fit_transform(X_train)

## new_vs_returning

In [41]:
print(X_train.new_vs_returning.unique())
print("\n\tlen: " + str(len(X_train.new_vs_returning.unique())))

['Returning' nan 'New']

	len: 3


In [42]:
new_vs_returning_pipe = Pipeline([
    ('selector', ColumnSelector(['new_vs_returning'])),
    ('na_filler', NaFiller("")),
    ('one_hot', OneHotEncoder(handle_unknown='ignore'))
])

In [43]:
# new_vs_returning_pipe.fit_transform(X_train)

## channel

In [44]:
print(X_train.channel.unique())
print("\n\tlen: " + str(len(X_train.channel.unique())))

['Direct' nan 'Organic' 'Paid' 'Referral' 'Social' 'Email' 'Unknown']

	len: 8


In [45]:
channel_pipe = Pipeline([
    ('selector', ColumnSelector(['channel'])),
    ('na_filler', NaFiller("")),
    ('one_hot', OneHotEncoder(handle_unknown='ignore'))
])

In [46]:
# channel_pipe.fit_transform(X_train)

## search_engine

In [47]:
print(X_train.search_engine.unique())
print("\n\tlen: " + str(len(X_train.search_engine.unique())))

[nan 'Google' 'Bing' 'Yahoo' 'Ask']

	len: 5


In [48]:
search_engine_pipe = Pipeline([
    ('selector', ColumnSelector(['search_engine'])),
    ('na_filler', NaFiller("")),
    ('one_hot', OneHotEncoder(handle_unknown='ignore'))
])

In [49]:
# search_engine_pipe.fit_transform(X_train)

## campaign_source

In [50]:
print(X_train.campaign_source.unique())
print("\n\tlen: " + str(len(X_train.campaign_source.unique())))

[nan 'rtbhouse' 'google' 'criteo' 'bing' 'zanox' 'manifest' 'buscape'
 'emblue' 'FacebookAds' 'rakuten' 'afilio' 'voxus' 'yotpo' 'Facebook'
 'indexa' 'datacrush' 'blog' 'afiliado' 'mercadopago' 'FacebookSocial'
 'onsite' 'gizmodo' 'MARKETING SOCIAL']

	len: 24


In [51]:
campaign_source_pipe = Pipeline([
    ('selector', ColumnSelector(['campaign_source'])),
    ('na_filler', NaFiller("")),
    ('one_hot', OneHotEncoder(handle_unknown='ignore'))
])

In [52]:
# campaign_source_pipe.fit_transform(X_train)

## staticpage

In [53]:
print(X_train.staticpage.unique())
print("\n\tlen: " + str(len(X_train.staticpage.unique())))

[nan 'CustomerService' 'galaxy-s8' 'Quiosks' 'how-to-sell' 'Conditions'
 'AboutUs' 'trust-trocafone' 'TermsAndConditionsReturnEcommerce'
 'FaqEcommerce' 'how-to-buy' 'club-trocafone'
 'TermsAndConditionsEcommerce' 'black_friday' 'PrivacyEcommerce']

	len: 15


In [54]:
staticpage_pipe = Pipeline([
    ('selector', ColumnSelector(['staticpage'])),
    ('na_filler', NaFiller("")),
    ('one_hot', OneHotEncoder(handle_unknown='ignore'))
])

In [55]:
# staticpage_pipe.fit_transform(X_train)

## search_term

In [56]:
print(X_train.search_term.unique())
print("\n\tlen: " + str(len(X_train.search_term.unique())))

[nan 'Moto x4' 'iphone x' ... 'IphonIphone7' 'IPHONe 7'
 'Samsung Galaxy a7 2016 32 GB.']

	len: 5285


In [57]:
search_term_pipe = Pipeline([
    ('selector', ColumnSelector(['search_term'])),
    ('na_filler', NaFiller("")),
    ('one_hot', OneHotEncoder(handle_unknown='ignore'))
])

In [58]:
# search_term_pipe.fit_transform(X_train)

## skus

In [59]:
print(X_train.skus.unique())
print("\n\tlen: " + str(len(X_train.skus.unique())))

[nan '7631,5016,5088,5135,8429,9216,9174,10450,9203,4969,7827,10434'
 '8485,8471,8443,6371,12758,6357,6412,3371,2777,3179,2778,8513' ...
 '2691,2694,2830,2711,2710,2693,2699,2692,3647,2833,2682,2831'
 '6847,6902,6832,6888,6957,8950,7084,6581,6875,6971,6930,6916'
 '13192,13248']

	len: 29547


In [60]:
skus_pipe = Pipeline([
    ('selector', ColumnSelector(['skus'])),
    ('na_filler', NaFiller("")),
    ('one_hot', OneHotEncoder(handle_unknown='ignore'))
])

In [61]:
# skus_pipe.fit_transform(X_train)

## color

In [62]:
print(X_train.color.unique())
print("\n\tlen: " + str(len(X_train.color.unique())))

[nan 'Dourado' 'Branco' 'Preto' 'Prateado' 'Preto Matte' 'Cinza espacial'
 'Couro Vintage' 'Rosa' 'Black Piano' 'Platinum' 'Prata' 'Vermelho' 'Azul'
 'Ametista' 'Preto Brilhante' 'Ouro' 'Titânio' 'Ouro Rosa' 'Branco Azul'
 'Preto Vermelho' 'Azul Escuro' 'Bambu' 'Indigo' 'Verde' 'Branco Pink'
 'Preto Azul' 'Cinza' 'Branco Vermelho' 'Rose' 'Verde Água'
 'Preto Azul Navy' 'Coral' 'Branco Azul Navy' 'Cabernet' 'Amarelo' 'Roxo'
 'Preto Verde' 'Azul Safira' 'Branco Dourado' 'Rouge' 'Azul Topázio'
 'Olympic Edition' 'Iuna' 'Branco Bambu' 'Couro Navy' 'Preto Cabernet'
 'Couro Vinho' 'Framboesa' 'Couro Marrom' 'Preto Pink' 'Branco Framboesa'
 'Ônix' 'Turquesa' 'Branco Cabernet' 'Branco Verde' 'Cobre' 'Preto Tabaco'
 'Preto Branco' 'Preto Bambu' 'Cromo' 'Verde Petroleo' 'Preto Asfalto'
 'Silver']

	len: 64


In [63]:
color_pipe = Pipeline([
    ('selector', ColumnSelector(['color'])),
    ('na_filler', NaFiller("")),
    ('one_hot', OneHotEncoder(handle_unknown='ignore'))
])

In [64]:
# color_pipe.fit_transform(X_train)

## storage

In [65]:
def process_storage_string(x):
    if pd.isna(x):
        return 0
    s = x.split("GB")
    if len(s) == 2:
        # case data in GB
        return int(s[0])
    else:
        # case data in MB
        return int(x.split("MB")[0])/1024

def storage_process(df):
    df.storage = df.storage.apply(lambda x: process_storage_string(x))

In [66]:
storage_process(X_train)
storage_process(X_test)

## condition

In [67]:
print(X_train.condition.unique())
print("\n\tlen: " + str(len(X_train.condition.unique())))

[nan 'Excelente' 'Muito Bom' 'Bom' 'Bom - Sem Touch ID' 'Novo']

	len: 6


In [68]:
condition_pipe = Pipeline([
    ('selector', ColumnSelector(['condition'])),
    ('na_filler', NaFiller("")),
    ('one_hot', OneHotEncoder(handle_unknown='ignore'))
])

In [69]:
# condition_pipe.fit_transform(X_train)

## model

In [70]:
print(X_train.model.unique())
print("\n\tlen: " + str(len(X_train.model.unique())))

[nan 'iPhone 6S Plus' 'iPhone 8' 'Motorola Moto G4 Plus'
 'Samsung Galaxy S7' 'iPhone 5s' 'iPhone 7 Plus'
 'Samsung Galaxy Note 3 Neo Duos' 'Motorola Moto Z' 'iPhone SE' 'iPhone 6'
 'Motorola Moto X2' 'Samsung Galaxy J1 2016' 'Samsung Galaxy A7 2017'
 'Samsung Galaxy A5' 'Samsung Galaxy S5' 'Samsung Galaxy S7 Edge'
 'iPhone 5' 'Samsung Galaxy S6 Edge' 'iPhone 4S' 'Samsung Galaxy S6 Flat'
 'Motorola Moto Z Play' 'iPhone 7' 'Motorola Moto G5 Plus' 'iPhone 6S'
 'Samsung Galaxy A5 2016' 'Samsung Galaxy S8' 'Motorola Moto G3 HDTV'
 'Samsung Galaxy A7' 'Samsung Galaxy A3 Duos' 'iPhone 6 Plus'
 'Samsung Galaxy J3' 'Samsung Galaxy S3 Slim Duos'
 'Samsung Galaxy S8 Plus' 'Lenovo Vibe K5' 'Motorola Moto X Play 4G Dual'
 'Samsung Galaxy J5 PRO' 'Samsung Galaxy Note 3' 'Samsung Galaxy J5'
 'Samsung Galaxy Note 4' 'Samsung Galaxy Gran Prime Duos TV' 'iPhone 5c'
 'Samsung Galaxy J7 Prime' 'Motorola Moto G5 ' 'Motorola Moto E2 4G Dual'
 'Motorola Moto Z Power Edition' 'Samsung Galaxy S5 Duos'
 'Motor

In [71]:
model_pipe = Pipeline([
    ('selector', ColumnSelector(['model'])),
    ('na_filler', NaFiller("")),
    ('one_hot', OneHotEncoder(handle_unknown='ignore'))
])

In [72]:
# model_pipe.fit_transform(X_train)

## sku

In [73]:
print(X_train.sku.unique())
print("\n\tlen: " + str(len(X_train.sku.unique())))

[   nan  7013. 13404. ... 13712.  1558.  9889.]

	len: 2168


In [74]:
sku_pipe = Pipeline([
    ('selector', ColumnSelector(['sku'])),
    ('na_filler', NaFiller(0)),
    ('one_hot', OneHotEncoder(handle_unknown='ignore'))
])

In [75]:
# sku_pipe.fit_transform(X_train)

## url

In [76]:
print(X_train.url.unique())
print("\n\tlen: " + str(len(X_train.url.unique())))

[nan '/comprar/samsung/galaxy-s8' '/comprar/iphone/iphone-5'
 '/comprar/motorola/moto-x-force' '/' '/comprar/iphone'
 '/comprar/samsung/galaxy-a7-2017'
 '/comprar/samsung/samsung-gran-prime-duos-tv'
 '/comprar/samsung/galaxy-a5-2017' '/comprar/iphone/iphone-5s'
 '/comprar/motorola/moto-g-2a-geracao' '/comprar/iphone/6'
 '/comprar/samsung/galaxy-a5-2016' '/comprar/samsung/galaxy-win-duos'
 '/comprar/iphone/iphone-6s-plus' '/comprar/iphone/6s'
 '/comprar/samsung/galaxy-s4-i9505'
 '/comprar/samsung/galaxy-note-ii-n7100' '/comprar/samsung/galaxy-note-4'
 '/comprar/samsung/galaxy-s3-mini/' '/comprar/samsung/galaxy-s6-flat'
 '/comprar/samsung/galaxy-a7-2016' '/comprar/samsung/galaxy-j5'
 '/comprar/motorola/motorola-moto-g-3a-geracao'
 '/comprar/iphone/iphone-se' '/comprar/iphone/7-plus'
 '/comprar/samsung/galaxy-s6-edge' '/comprar/ofertas/pre-venda-galaxy-s8'
 '/comprar/samsung/galaxy-s7-edge' '/comprar/iphone/iphone-4s'
 '/comprar/samsung' '/comprar/samsung/galaxy-s7'
 '/comprar/samsung/gal

In [77]:
url_pipe = Pipeline([
    ('selector', ColumnSelector(['url'])),
    ('na_filler', NaFiller("")),
    ('one_hot', OneHotEncoder(handle_unknown='ignore'))
])

In [78]:
# url_pipe.fit_transform(X_train)

## person

In [79]:
print(X_train.person.unique())
print("\n\tlen: " + str(len(X_train.person.unique())))

['86742710' 'bb4b8182' '5af2db5e' ... '407fba14' 'c77d7c6a' '266bdbab']

	len: 19321


In [80]:
person_pipe = Pipeline([
    ('selector', ColumnSelector(['person'])),
    ('na_filler', NaFiller("")),
    ('one_hot', OneHotEncoder(handle_unknown='ignore'))
])

In [81]:
# person_pipe.fit_transform(X_train)

## event

In [82]:
print(X_train.event.unique())
print("\n\tlen: " + str(len(X_train.event.unique())))

['visited site' 'viewed product' 'searched products' 'brand listing'
 'ad campaign hit' 'checkout' 'search engine hit' 'generic listing'
 'staticpage' 'lead' 'conversion']

	len: 11


In [83]:
event_pipe = Pipeline([
    ('selector', ColumnSelector(['event'])),
    ('na_filler', NaFiller("")),
    ('one_hot', OneHotEncoder(handle_unknown='ignore'))
])

In [84]:
# event_pipe.fit_transform(X_train)

## Lets bring all of them together!

In [88]:
from xgboost import XGBRegressor
from sklearn.linear_model import LogisticRegression

In [91]:
from sklearn.pipeline import FeatureUnion

feats = FeatureUnion([
    ('browser', browser_pipe),
    ('os', os_ver_pipe),
    ('device_type', device_type_pipe),
    ('country', country_pipe),
    ('region', region_pipe),
    ('city', city_pipe),
    ('new_vs_ret', new_vs_returning_pipe),
    ('channel', channel_pipe),
    ('search_eng', search_engine_pipe),
    ('campaign', campaign_source_pipe),
    ('staticpage', staticpage_pipe),
    ('searchterm', search_term_pipe),
    ('skus', skus_pipe),
    ('color', color_pipe),
    ('condition', condition_pipe),
    ('model', model_pipe),
    ('sku', sku_pipe),
    ('url', url_pipe),
    ('person', person_pipe),
    ('event', event_pipe)
])

feature_processing = Pipeline([
    ('feats', feats),
    ('lr', LogisticRegression(solver='sag'))
#     ('xgb', XGBRegressor())
])

feature_processing.fit(X_train, y_train)

Pipeline(memory=None,
     steps=[('feats', FeatureUnion(n_jobs=None,
       transformer_list=[('browser', Pipeline(memory=None,
     steps=[('selector', ColumnSelector(cols=['browser_version'])), ('na_filler', NaFiller(filler='')), ('hasher', OneHotEncoder(categorical_features=None, categories=None,
       dtype=<class 'nump... penalty='l2', random_state=None, solver='sag',
          tol=0.0001, verbose=0, warm_start=False))])

In [103]:
preds = feature_processing.predict_proba(X_test)[:,1]

In [104]:
preds.shape

(386723,)

In [106]:
y_test.shape

(386723,)

In [107]:
roc_auc_score(y_test,preds)

0.9999958398983199