In [58]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
# Import Area Under the Receiver Operating Characteristic Curve metric to evaluate results
from sklearn.metrics import roc_auc_score
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest
from sklearn.model_selection import RandomizedSearchCV,GridSearchCV
from sklearn.preprocessing import Imputer

In [6]:
from sklearn import preprocessing
from sklearn.feature_extraction import DictVectorizer
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_extraction import FeatureHasher
from sklearn.preprocessing import LabelBinarizer
import category_encoders as ce

In [7]:
labels_df = pd.read_csv("data/fiuba-trocafone-tp2-final-set/labels_training_set.csv", low_memory=False)

In [8]:
events_df = pd.read_csv("data/9_11_mas_supports.csv", low_memory=False)

In [9]:
events_df.columns

Index(['timestamp', 'event', 'person', 'url', 'sku', 'model', 'condition',
       'storage', 'color', 'skus', 'search_term', 'staticpage',
       'campaign_source', 'search_engine', 'channel', 'new_vs_returning',
       'city', 'region', 'country', 'device_type', 'screen_resolution',
       'operating_system_version', 'browser_version', 'mes', 'dia', 'q1', 'q2',
       'q3', 'q4', 'q5', 'q6', 'q7', 'q8', 'q9', 'q10',
       'support_p_todos_sus_ev', 'support_con_peso', 'support_ind', 'year',
       'month_sin', 'month_cos', 'day_sin', 'day_cos', 'weekday_sin',
       'weekday_cos', 'hour_sin', 'hour_cos'],
      dtype='object')

# Preprocessing

***
## Note on preprocessing
All preprocessing which can be done in just one way, i.e. it doesn't need hyper parameter adjustment, will be done outside pipelines.

Good pipeline sources: 
* http://zacstewart.com/2014/08/05/pipelines-of-featureunions-of-pipelines.html
* https://www.kaggle.com/baghern/a-deep-dive-into-sklearn-pipelines
* https://www.kaggle.com/metadist/work-like-a-pro-with-pipelines-and-feature-unions
* https://www.kaggle.com/sermakarevich/sklearn-pipelines-tutorial
* http://zacstewart.com/2014/08/05/pipelines-of-featureunions-of-pipelines.html
***

In [10]:
# some date processing
def date_proc(df):
    df['timestamp'] = pd.to_datetime(df['timestamp'])
    df['year'] = df['timestamp'].dt.year
    df['month'] = df['timestamp'].dt.month
    df['day'] = df['timestamp'].dt.day
    df['weekday'] = df['timestamp'].dt.day_name()
    df['hour'] = df['timestamp'].dt.hour
    df['year_month_day'] = df['timestamp'].map(lambda x: str(x.year)+"/"+str(x.month)+"/"+str(x.day))
    df['year_month_day'] = pd.to_datetime(df['year_month_day'])
    
#date_proc(events)

Primero hacemos un label encoding con el weekday, luego aplicamos una transfrmacion que contemple la naturaleza ciclica de la semana. Esto ultimo lo aplicaremos tambien al resto de los features ciclicos (como se explica, por ejemplo, aca: https://ianlondon.github.io/blog/encoding-cyclical-features-24hour-time/)

In [11]:
#weekday_le = preprocessing.LabelEncoder()
#weekday_le.fit(events_df.weekday)

#events_df.weekday = weekday_le.transform(events_df.weekday)

In [12]:
def month_to_cyclic(df):
    df['month_sin'] = df['month'].apply(lambda x: np.sin(2*np.pi*x/12))
    df['month_cos'] = df['month'].apply(lambda x: np.cos(2*np.pi*x/12))
    df.drop('month', axis=1, inplace=True)
    
def day_to_cyclic(df):
    df['day_sin'] = df['day'].apply(lambda x: np.sin(2*np.pi*x/31))
    df['day_cos'] = df['day'].apply(lambda x: np.cos(2*np.pi*x/31))
    df.drop('day', axis=1, inplace=True)

def weekday_to_cyclic(df):
    df['weekday_sin'] = df['weekday'].apply(lambda x: np.sin(2*np.pi*x/7))
    df['weekday_cos'] = df['weekday'].apply(lambda x: np.cos(2*np.pi*x/7))
    df.drop('weekday', axis=1, inplace=True)

def hour_to_cyclic(df):
    df['hour_sin'] = df['hour'].apply(lambda x: np.sin(2*np.pi*x/24))
    df['hour_cos'] = df['hour'].apply(lambda x: np.cos(2*np.pi*x/24))
    df.drop('hour', axis=1, inplace=True)

In [13]:
#month_to_cyclic(events_df)
#day_to_cyclic(events_df)
#weekday_to_cyclic(events_df)
#hour_to_cyclic(events_df)

In [14]:
#events_df.drop('year_month_day', axis=1, inplace=True)

In [15]:
def get_screen_width(x):
    if x != "":
        return x.split("x")[0]
    else:
        return 0
    
def get_screen_height(x):
    if x != "":
        return x.split("x")[1]
    else:
        return 0

def process_screen_res(df):
    df['screen_resolution'].fillna("", inplace=True)
    df['screen_width'] = df['screen_resolution'].apply(lambda x: get_screen_width(x))
    df['screen_height'] = df['screen_resolution'].apply(lambda x: get_screen_height(x))
    df.drop('screen_resolution', axis=1, inplace=True)

In [16]:
process_screen_res(events_df)

In [17]:
def process_storage_string(x):
    if pd.isna(x):
        return 0
    s = x.split("GB")
    if len(s) == 2:
        # case data in GB
        return int(s[0])
    else:
        # case data in MB
        return int(x.split("MB")[0])/1024

def storage_process(df):
    df.storage = df.storage.apply(lambda x: process_storage_string(x))

In [18]:
storage_process(events_df)

# Separación de datos
---

In [71]:
# armo df con registros completos clasificados
train_df = events_df.merge(labels_df, on='person', how='right')
train_df.shape

(1171886, 49)

In [72]:
y_train = train_df.label

In [73]:
train_df = train_df.drop(columns=['label'])

In [74]:
# armo df con registros a predecir unicamente
to_predict = pd.DataFrame(events_df[~events_df.person.isin(labels_df.person)])

In [75]:
test_size = 0.33
# define a seed, so same experiments output same results every time
seed = 12

In [76]:
# realizo train_test_split
X_train, X_test, y_train, y_test = train_test_split(train_df.loc[:, train_df.columns != 'label'], 
                                                    y_train, 
                                                    test_size=test_size, 
                                                    random_state=seed)

In [25]:
X_train.index

Int64Index([ 654168,  755549,  705141,  621828,  592612, 1090120,  686227,
             774368,  202559,  265341,
            ...
             929141,  588160,    9846,   20953,  937613,  114565, 1171505,
             206083,   36482,  564465],
           dtype='int64', length=785163)

In [26]:
y_train.index

Int64Index([ 654168,  755549,  705141,  621828,  592612, 1090120,  686227,
             774368,  202559,  265341,
            ...
             929141,  588160,    9846,   20953,  937613,  114565, 1171505,
             206083,   36482,  564465],
           dtype='int64', length=785163)

In [27]:
X_train.columns

Index(['timestamp', 'event', 'person', 'url', 'sku', 'model', 'condition',
       'storage', 'color', 'skus', 'search_term', 'staticpage',
       'campaign_source', 'search_engine', 'channel', 'new_vs_returning',
       'city', 'region', 'country', 'device_type', 'operating_system_version',
       'browser_version', 'mes', 'dia', 'q1', 'q2', 'q3', 'q4', 'q5', 'q6',
       'q7', 'q8', 'q9', 'q10', 'support_p_todos_sus_ev', 'support_con_peso',
       'support_ind', 'year', 'month_sin', 'month_cos', 'day_sin', 'day_cos',
       'weekday_sin', 'weekday_cos', 'hour_sin', 'hour_cos', 'screen_width',
       'screen_height'],
      dtype='object')

In [78]:
X_train.reset_index(inplace=True)
X_train.drop(columns='index', inplace=True)
X_test.reset_index(inplace=True)
X_test.drop(columns='index', inplace=True)
y_train = pd.DataFrame(y_train)
y_train.reset_index(inplace=True)
y_train.drop(columns='index', inplace=True)
y_test = pd.DataFrame(y_test)
y_test.reset_index(inplace=True)
y_test.drop(columns='index', inplace=True)

# Creación de Pipelines
---

### Build some custom transformers

In [29]:
from sklearn.base import BaseEstimator, TransformerMixin

class ColumnSelector(BaseEstimator, TransformerMixin):
    """
    Selector to select the needed columns in the pipeline
    """
    def __init__(self, cols):
        self.cols = cols

    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        return X.loc[:,self.cols]

In [30]:
from sklearn.base import BaseEstimator, TransformerMixin

class NaFiller(BaseEstimator, TransformerMixin):
    """
    Selector to select the needed columns in the pipeline
    """
    def __init__(self, filler):
        self.filler = filler

    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        return X.fillna(self.filler)

In [31]:
from sklearn.base import BaseEstimator, TransformerMixin

class IndexMatcher(BaseEstimator, TransformerMixin):
    """
    Selector to select the needed columns in the pipeline
    """
    def __init__(self):
        return

    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        print('Index before X: ' + str(X.index))
        X_aux =  X.reset_index().drop(columns='index')
        print('Index after X: ' + str(X_aux.index))
        return X_aux

In [54]:
X_train.dtypes

timestamp                    object
event                        object
person                       object
url                          object
sku                         float64
model                        object
condition                    object
storage                     float64
color                        object
skus                         object
search_term                  object
staticpage                   object
campaign_source              object
search_engine                object
channel                      object
new_vs_returning             object
city                         object
region                       object
country                      object
device_type                  object
operating_system_version     object
browser_version              object
mes                           int64
dia                           int64
q1                             bool
q2                             bool
q3                             bool
q4                          

In [79]:
## q1
q1_pipe = Pipeline([
    ('selector', ColumnSelector(['q1'])),
    ('na_filler', NaFiller(0)),
    #'hasher', OneHotEncoder(handle_unknown='ignore')) 
])

## q2
q2_pipe = Pipeline([
    ('selector', ColumnSelector(['q2'])),
    ('na_filler', NaFiller(0)),
    #'hasher', OneHotEncoder(handle_unknown='ignore')) 
])

## q3
q3_pipe = Pipeline([
    ('selector', ColumnSelector(['q3'])),
    ('na_filler', NaFiller(0)),
    #'hasher', OneHotEncoder(handle_unknown='ignore')) 
])

## q4
q4_pipe = Pipeline([
    ('selector', ColumnSelector(['q4'])),
    ('na_filler', NaFiller(0)),
    #'hasher', OneHotEncoder(handle_unknown='ignore')) 
])

## q5
q5_pipe = Pipeline([
    ('selector', ColumnSelector(['q5'])),
    ('na_filler', NaFiller(0)),
    #'hasher', OneHotEncoder(handle_unknown='ignore')) 
])

## q6
q6_pipe = Pipeline([
    ('selector', ColumnSelector(['q6'])),
    ('na_filler', NaFiller(0)),
    #'hasher', OneHotEncoder(handle_unknown='ignore')) 
])

## q7
q7_pipe = Pipeline([
    ('selector', ColumnSelector(['q7'])),
    ('na_filler', NaFiller(0)),
    #'hasher', OneHotEncoder(handle_unknown='ignore')) 
])

## q8
q8_pipe = Pipeline([
    ('selector', ColumnSelector(['q8'])),
    ('na_filler', NaFiller(0)),
    #'hasher', OneHotEncoder(handle_unknown='ignore')) 
])

## q9
q9_pipe = Pipeline([
    ('selector', ColumnSelector(['q9'])),
    ('na_filler', NaFiller(0)),
    #'hasher', OneHotEncoder(handle_unknown='ignore')) 
])

## q10
q10_pipe = Pipeline([
    ('selector', ColumnSelector(['q10'])),
    ('na_filler', NaFiller(0)),
    #'hasher', OneHotEncoder(handle_unknown='ignore')) 
])

## support_p_todos_sus_ev
sup_pipe = Pipeline([
    ('selector', ColumnSelector(['support_p_todos_sus_ev'])),
    ('na_filler', NaFiller(0)),
    #'hasher', ce.TargetEncoder(handle_unknown='ignore'))
])

## Support_con_peso
sup_peso_pipe = Pipeline([
    ('selector', ColumnSelector(['support_con_peso'])),
    ('na_filler', NaFiller(0)),
    #'hasher', ce.TargetEncoder(handle_unknown='ignore'))
])

##  'support_ind'
sup_ind_pipe = Pipeline([
    ('selector', ColumnSelector(['support_ind'])),
    ('na_filler', NaFiller(0)),
    #'hasher', ce.TargetEncoder(handle_unknown='ignore'))
])

## url comprar
urlcomprar_pipe = Pipeline([
    ('selector', ColumnSelector(['url_comprar'])),
    ('na_filler', NaFiller("")),
    ('hasher', ce.TargetEncoder(handle_unknown='ignore')) 
])

## url vender
urlvender_pipe = Pipeline([
    ('selector', ColumnSelector(['url_vender'])),
    ('na_filler', NaFiller("")),
    ('hasher', ce.TargetEncoder(handle_unknown='ignore')) 
])

## year
year_pipe = Pipeline([
    ('selector', ColumnSelector(['year'])),
    ('na_filler', NaFiller(0)),
    #'hasher', OneHotEncoder(handle_unknown='ignore')) 
])

## month_sin
monthsin_pipe = Pipeline([
    ('selector', ColumnSelector(['month_sin'])),
    ('na_filler', NaFiller(0)),
    #'hasher', OneHotEncoder(handle_unknown='ignore')) 
])

## month_cos
monthcos_pipe = Pipeline([
    ('selector', ColumnSelector(['month_cos'])),
    ('na_filler', NaFiller(0)),
    #'hasher', OneHotEncoder(handle_unknown='ignore')) 
])

## day_sin
daysin_pipe = Pipeline([
    ('selector', ColumnSelector(['day_sin'])),
    ('na_filler', NaFiller(0)),
    #'hasher', OneHotEncoder(handle_unknown='ignore')) 
])

## day_cos
daycos_pipe = Pipeline([
    ('selector', ColumnSelector(['day_cos'])),
    ('na_filler', NaFiller(0)),
    #'hasher', OneHotEncoder(handle_unknown='ignore')) 
])

## weekday_sin
weekdaysin_pipe = Pipeline([
    ('selector', ColumnSelector(['weekday_sin'])),
    ('na_filler', NaFiller(0)),
    #'hasher', OneHotEncoder(handle_unknown='ignore')) 
])

## weekday_cos
weekdaycos_pipe = Pipeline([
    ('selector', ColumnSelector(['weekday_cos'])),
    ('na_filler', NaFiller(0)),
    #'hasher', OneHotEncoder(handle_unknown='ignore')) 
])

## hour_sin
hoursin_pipe = Pipeline([
    ('selector', ColumnSelector(['hour_sin'])),
    ('na_filler', NaFiller(0)),
    #'hasher', OneHotEncoder(handle_unknown='ignore')) 
])

## hour_cos
hourcos_pipe = Pipeline([
    ('selector', ColumnSelector(['hour_cos'])),
    ('na_filler', NaFiller(0)),
    #'hasher', OneHotEncoder(handle_unknown='ignore')) 
])

## samsung
samsung_pipe = Pipeline([
    ('selector', ColumnSelector(['samsung'])),
    ('na_filler', NaFiller(0)),
    #'hasher', OneHotEncoder(handle_unknown='ignore')) 
])

## motorola
motorola_pipe = Pipeline([
    ('selector', ColumnSelector(['year'])),
    ('na_filler', NaFiller(0)),
    #'hasher', OneHotEncoder(handle_unknown='ignore')) 
])

## iphone
iphone_pipe = Pipeline([
    ('selector', ColumnSelector(['iphone'])),
    ('na_filler', NaFiller(0)),
    #'hasher', OneHotEncoder(handle_unknown='ignore')) 
])

## Browser version
browser_pipe = Pipeline([
    ('selector', ColumnSelector(['browser_version'])),
    ('na_filler', NaFiller("")),
    ('hasher', ce.TargetEncoder(handle_unknown='ignore'))
])

#browser_pipe.fit_transform(train_df)

## operating_system_version
os_ver_pipe = Pipeline([
    ('selector', ColumnSelector(['operating_system_version'])),
    ('na_filler', NaFiller("")),
    ('hasher', ce.TargetEncoder(handle_unknown='ignore'))
])

#os_ver_pipe.fit_transform(train_df)

## screen_resolution
screenreswidth_pipe = Pipeline([
    ('selector', ColumnSelector(['screen_width'])),
    ('na_filler', NaFiller("")),
    ('hasher', ce.TargetEncoder(handle_unknown='ignore'))
])

#screenreswidth_pipe.fit_transform(train_df)

screenresheight_pipe = Pipeline([
    ('selector', ColumnSelector(['screen_height'])),
    ('na_filler', NaFiller("")),
    ('hasher', ce.TargetEncoder(handle_unknown='ignore'))
])

#screenresheight_pipe.fit_transform(train_df)

## device_type
device_type_pipe = Pipeline([
    ('selector', ColumnSelector(['device_type'])),
    ('na_filler', NaFiller("")),
    ('hasher', ce.TargetEncoder(handle_unknown='ignore'))
])

#device_type_pipe.fit_transform(train_df)

## country
country_pipe = Pipeline([
    ('selector', ColumnSelector(['country'])),
    ('na_filler', NaFiller("")),
    ('hasher', ce.TargetEncoder(handle_unknown='ignore'))
])

#country_pipe.fit_transform(train_df)

## region
region_pipe = Pipeline([
    ('selector', ColumnSelector(['region'])),
    ('na_filler', NaFiller("")),
    ('hasher', ce.TargetEncoder(handle_unknown='ignore'))
])

#region_pipe.fit_transform(train_df)

## city
city_pipe = Pipeline([
    ('selector', ColumnSelector(['city'])),
    ('na_filler', NaFiller("")),
    ('hasher', ce.TargetEncoder(handle_unknown='ignore'))
])

#city_pipe.fit_transform(train_df)
## new_vs_returning
new_vs_returning_pipe = Pipeline([
    ('selector', ColumnSelector(['new_vs_returning'])),
    ('na_filler', NaFiller("")),
    ('hasher', ce.TargetEncoder(handle_unknown='ignore'))
])

#new_vs_returning_pipe.fit_transform(train_df)
## channel
channel_pipe = Pipeline([
    ('selector', ColumnSelector(['channel'])),
    ('na_filler', NaFiller("")),
    ('hasher', ce.TargetEncoder(handle_unknown='ignore'))
])

#channel_pipe.fit_transform(train_df)
## search_engine
search_engine_pipe = Pipeline([
    ('selector', ColumnSelector(['search_engine'])),
    ('na_filler', NaFiller("")),
    ('hasher', ce.TargetEncoder(handle_unknown='ignore'))
])

## campaign_source
campaign_source_pipe = Pipeline([
    ('selector', ColumnSelector(['campaign_source'])),
    ('na_filler', NaFiller("")),
    ('hasher', ce.TargetEncoder(handle_unknown='ignore'))
])

## staticpage
staticpage_pipe = Pipeline([
    ('selector', ColumnSelector(['staticpage'])),
    ('na_filler', NaFiller("")),
    ('hasher', ce.TargetEncoder(handle_unknown='ignore'))
])

## search_term
search_term_pipe = Pipeline([
    ('selector', ColumnSelector(['search_term'])),
    ('na_filler', NaFiller("")),
    ('hasher', ce.TargetEncoder(handle_unknown='ignore'))
])

## skus
skus_pipe = Pipeline([
    ('selector', ColumnSelector(['skus'])),
    ('na_filler', NaFiller("")),
    ('one_hot', OneHotEncoder(handle_unknown='ignore'))
])

## color
color_pipe = Pipeline([
    ('selector', ColumnSelector(['color'])),
    ('na_filler', NaFiller("")),
    ('hasher', ce.TargetEncoder(handle_unknown='ignore'))
])

## storage
storage_pipe = Pipeline([
    ('selector', ColumnSelector(['storage'])),
    ('na_filler', NaFiller(0)),
    #('one_hot', OneHotEncoder(handle_unknown='ignore'))
])


## condition

condition_pipe = Pipeline([
    ('selector', ColumnSelector(['condition'])),
    ('na_filler', NaFiller("")),
    ('hasher', ce.TargetEncoder(handle_unknown='ignore'))
])

## model

model_pipe = Pipeline([
    ('selector', ColumnSelector(['model'])),
    ('na_filler', NaFiller("")),
    ('hasher', ce.TargetEncoder(handle_unknown='ignore'))
])

## sku

sku_pipe = Pipeline([
    ('selector', ColumnSelector(['sku'])),
    ('na_filler', NaFiller(0)),
    ('one_hot', ce.TargetEncoder(handle_unknown='ignore'))
])

## url

url_pipe = Pipeline([
    ('selector', ColumnSelector(['url'])),
    ('na_filler', NaFiller("")),
    ('hasher', ce.TargetEncoder(handle_unknown='ignore'))
])

## person

person_pipe = Pipeline([
    ('selector', ColumnSelector(['person'])),
    ('na_filler', NaFiller("")),
    ('hasher', ce.TargetEncoder(handle_unknown='ignore'))
])

## event

event_pipe = Pipeline([
    ('selector', ColumnSelector(['event'])),
    ('na_filler', NaFiller("")),
    ('hasher', ce.TargetEncoder(handle_unknown='ignore'))
])

### Event_count 

event_count_pipe = Pipeline([
    ('selector', ColumnSelector(['event_count'])),
    ('na_filler', NaFiller(0)),
    #'one_hot', OneHotEncoder(handle_unknown='ignore'))
])

### Event_count_q1

event_count_q1_pipe = Pipeline([
    ('selector', ColumnSelector(['event_count_q1'])),
    ('na_filler', NaFiller(0)),
    #('one_hot', OneHotEncoder(handle_unknown='ignore'))
])

### Event_count_q2

event_count_q2_pipe = Pipeline([
    ('selector', ColumnSelector(['event_count_q2'])),
    ('na_filler', NaFiller(0)),
    #'one_hot', OneHotEncoder(handle_unknown='ignore'))
])

### Event_count_q3

event_count_q3_pipe = Pipeline([
    ('selector', ColumnSelector(['event_count_q3'])),
    ('na_filler', NaFiller(0)),
    #'one_hot', OneHotEncoder(handle_unknown='ignore'))
])

### Event_count_q4

event_count_q4_pipe = Pipeline([
    ('selector', ColumnSelector(['event_count_q4'])),
    ('na_filler', NaFiller(0)),
    #'one_hot', OneHotEncoder(handle_unknown='ignore'))
])

### Event_count_q5

event_count_q5_pipe = Pipeline([
    ('selector', ColumnSelector(['event_count_q5'])),
    ('na_filler', NaFiller(0)),
    #'one_hot', OneHotEncoder(handle_unknown='ignore'))
])

### Event_count_q6

event_count_q6_pipe = Pipeline([
    ('selector', ColumnSelector(['event_count_q6'])),
    ('na_filler', NaFiller(0)),
    #'one_hot', OneHotEncoder(handle_unknown='ignore'))
])

### Event_count_q7

event_count_q7_pipe = Pipeline([
    ('selector', ColumnSelector(['event_count_q7'])),
    ('na_filler', NaFiller(0)),
    #'one_hot', OneHotEncoder(handle_unknown='ignore'))
])

### Event_count_q8

event_count_q8_pipe = Pipeline([
    ('selector', ColumnSelector(['event_count_q8'])),
    ('na_filler', NaFiller(0)),
    #'one_hot', OneHotEncoder(handle_unknown='ignore'))
])

### Event_count_q9

event_count_q9_pipe = Pipeline([
    ('selector', ColumnSelector(['event_count_q9'])),
    ('na_filler', NaFiller(0)),
    #'one_hot', OneHotEncoder(handle_unknown='ignore'))
])

### Event_count_q10

event_count_q10_pipe = Pipeline([
    ('selector', ColumnSelector(['event_count_q10'])),
    ('na_filler', NaFiller(0)),
    #'one_hot', OneHotEncoder(handle_unknown='ignore'))
])

### Tiempo_total

tiempo_total_pipe = Pipeline([
    ('selector', ColumnSelector(['tiempo_total'])),
    ('na_filler', NaFiller(0)),
    #'one_hot', OneHotEncoder(handle_unknown='ignore'))
])

### tiempo_tota_q1

tiempo_total_q1_pipe = Pipeline([
    ('selector', ColumnSelector(['tiempo_total_q1'])),
    ('na_filler', NaFiller(0)),
    #'one_hot', OneHotEncoder(handle_unknown='ignore'))
])

### tiempo_total_q2

tiempo_total_q2_pipe = Pipeline([
    ('selector', ColumnSelector(['tiempo_total_q2'])),
    ('na_filler', NaFiller(0)),
    #'one_hot', OneHotEncoder(handle_unknown='ignore'))
])

### Tiempo_total_q3

tiempo_total_q3_pipe = Pipeline([
    ('selector', ColumnSelector(['tiempo_total_q3'])),
    ('na_filler', NaFiller(0)),
    #'one_hot', OneHotEncoder(handle_unknown='ignore'))
])

### Tiempo_total_q4 

tiempo_total_q4_pipe = Pipeline([
    ('selector', ColumnSelector(['tiempo_total_q4'])),
    ('na_filler', NaFiller(0)),
    #('one_hot', OneHotEncoder(handle_unknown='ignore'))
])

### Tiempo_total_q5

tiempo_total_q5_pipe = Pipeline([
    ('selector', ColumnSelector(['tiempo_total_q5'])),
    ('na_filler', NaFiller(0)),
   #('ne_hot', OneHotEncoder(handle_unknown='ignore'))
])

### Tiempo_total_q6

tiempo_total_q6_pipe = Pipeline([
    ('selector', ColumnSelector(['tiempo_total_q6'])),
    ('na_filler', NaFiller(0)),
   #('one_hot', OneHotEncoder(handle_unknown='ignore'))
])

### Tiempo_total_q7

tiempo_total_q7_pipe = Pipeline([
    ('selector', ColumnSelector(['tiempo_total_q7'])),
    ('na_filler', NaFiller(0)),
   #('one_hot', OneHotEncoder(handle_unknown='ignore'))
])

### Tiempo_total_q8

tiempo_total_q8_pipe = Pipeline([
    ('selector', ColumnSelector(['tiempo_total_q8'])),
    ('na_filler', NaFiller(0)),
   #('one_hot', OneHotEncoder(handle_unknown='ignore'))
])

### Tiempo_total_q9

tiempo_total_q9_pipe = Pipeline([
    ('selector', ColumnSelector(['tiempo_total_q9'])),
    ('na_filler', NaFiller(0)),
   #('one_hot', OneHotEncoder(handle_unknown='ignore'))
])

### Tiempo_total_q10

tiempo_total_q10_pipe = Pipeline([
    ('selector', ColumnSelector(['tiempo_total_q10'])),
    ('na_filler', NaFiller(0)),
   #('one_hot', OneHotEncoder(handle_unknown='ignore'))_
])

## Lets bring all of them together!

In [50]:
from xgboost import XGBRegressor
import xgboost as xbg
from sklearn.linear_model import LogisticRegression

In [51]:
events_df.columns

Index(['timestamp', 'event', 'person', 'url', 'sku', 'model', 'condition',
       'storage', 'color', 'skus', 'search_term', 'staticpage',
       'campaign_source', 'search_engine', 'channel', 'new_vs_returning',
       'city', 'region', 'country', 'device_type', 'operating_system_version',
       'browser_version', 'mes', 'dia', 'q1', 'q2', 'q3', 'q4', 'q5', 'q6',
       'q7', 'q8', 'q9', 'q10', 'support_p_todos_sus_ev', 'support_con_peso',
       'support_ind', 'year', 'month_sin', 'month_cos', 'day_sin', 'day_cos',
       'weekday_sin', 'weekday_cos', 'hour_sin', 'hour_cos', 'screen_width',
       'screen_height'],
      dtype='object')

In [None]:
skus_pipe.fit_transform(X_train, y_train)

In [92]:
from sklearn.pipeline import FeatureUnion

feats = FeatureUnion([
    ('browser', browser_pipe),
    ('os', os_ver_pipe),
    ('device_type', device_type_pipe),
    ('country', country_pipe),
    ('region', region_pipe),
    ('city', city_pipe),
    ('new_vs_ret', new_vs_returning_pipe),
    ('channel', channel_pipe),
    ('search_eng', search_engine_pipe),
    ('campaign', campaign_source_pipe),
    ('staticpage', staticpage_pipe),
    ('searchterm', search_term_pipe),
    ('skus', skus_pipe),
    ('color', color_pipe),
    ('condition', condition_pipe),
    ('model', model_pipe),
    ('sku', sku_pipe),
    ('url', url_pipe),
    ('person', person_pipe),
    ('event', event_pipe), ### TARGET ###
    ('q1', q1_pipe),
    ('q2', q2_pipe),
    ('q3', q3_pipe),
    ('q4', q4_pipe),
    ('q5', q5_pipe),
    ('q6', q6_pipe),
    ('q7', q7_pipe),
    ('q8', q8_pipe),
    ('q9', q9_pipe),
    ('q10', q10_pipe),
    ('sup_pipe', sup_pipe),
    #('sup_peso_pipe', sup_peso_pipe),#este al parecer no ayuda, con este da 0.75
    #Sin el otro support baja a 0.70
    #('sup_ind_pipe', sup_ind_pipe),
    #('urlcomprar_pipe', urlcomprar_pipe),   
    #('urlvender_pipe', urlvender_pipe),   
    ('storage_pipe', storage_pipe),      
    #('screenreswidth_pipe', screenreswidth_pipe),   ### TARGET ###
    #('screenresheight_pipe', screenresheight_pipe), ### TARGET ###
    ('year_pipe', year_pipe),
    ('monthsin_pipe', monthsin_pipe),
    ('monthcos_pipe', monthcos_pipe),
    ('daysin_pipe', daysin_pipe),
    ('daycos_pipe', daycos_pipe),
    ('weekdaysin_pipe', weekdaysin_pipe),
    ('weekdaycos_pipe', weekdaycos_pipe),
    ('hoursin_pipe', hoursin_pipe),
    ('hourcos_pipe', hourcos_pipe),
    #('motorola_pipe', motorola_pipe),
    #('samsung_pipe', samsung_pipe),
    #('iphone_pipe', iphone_pipe)

    ### TODOS ESTOS ANDAN PERO ESTÁN EN OTRO CSV ###
    
    #('event_count_pipe', event_count_pipe),
    #('event_count_q1_pipe', event_count_q1_pipe),
    #('event_count_q2_pipe', event_count_q2_pipe),
    #('event_count_q3_pipe', event_count_q3_pipe),
    #('event_count_q4_pipe', event_count_q4_pipe),
    #('event_count_q5_pipe', event_count_q5_pipe),
    #('event_count_q6_pipe', event_count_q6_pipe),
    #('event_count_q7_pipe', event_count_q7_pipe),
    #('event_count_q8_pipe', event_count_q8_pipe),
    #('event_count_q9_pipe', event_count_q9_pipe),
    #('event_count_q10_pipe', event_count_q10_pipe),
    #('tiempo_total_pipe', tiempo_total_pipe),
    #('tiempo_total_q1_pipe', tiempo_total_q1_pipe),
    #('tiempo_total_q2_pipe', tiempo_total_q2_pipe),
    #('tiempo_total_q3_pipe', tiempo_total_q3_pipe),
    #('tiempo_total_q4_pipe', tiempo_total_q4_pipe),
    #('tiempo_total_q5_pipe', tiempo_total_q5_pipe),
    #('tiempo_total_q6_pipe', tiempo_total_q6_pipe),
    #('tiempo_total_q7_pipe', tiempo_total_q7_pipe),
    #('tiempo_total_q8_pipe', tiempo_total_q8_pipe),
    #('tiempo_total_q9_pipe', tiempo_total_q8_pipe),
    #('tiempo_total_q10_pipe', tiempo_total_q10_pipe)
    
])

feature_processing = Pipeline([
    #('index_matcher', IndexMatcher()),
    ('feats', feats),
    #('lr', LogisticRegression(solver='sag'))
    #('column_purge', SelectKBest()),
    ('xgb', XGBRegressor(objective='binary:logistic', booster='dart', learning_rate=2.0))
])

feature_processing.fit(X_train, y_train.label)

Pipeline(memory=None,
     steps=[('feats', FeatureUnion(n_jobs=None,
       transformer_list=[('browser', Pipeline(memory=None,
     steps=[('selector', ColumnSelector(cols=['browser_version'])), ('na_filler', NaFiller(filler='')), ('hasher', TargetEncoder(cols=['browser_version'], drop_invariant=False,
       handle_unknown...
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1))])

### Grid search

In [93]:
parameters = {}

## Parámetros XGBRegressor ##

#parameters['xgb__colsample_bytree'] = [1,0]
#parameters['xgb__min_child_weight'] = [1.0,1.2]
#parameters['xgb__max_depth'] = [3,4,6]
#parameters['xgb__n_estimators'] = [100,500,1000]

#------- Los de Tati -------#

parameters['xgb__objective'] = ['binary:logistic']
parameters['xgb__booster'] = ['dart']
parameters['xgb__learning_rate'] = [2.0]

#---------------------------#

## Parámetros SelectKBest ##

parameters['column_purge__k'] = [3,5,10,32] # Total: 32

In [94]:
X_train.index

RangeIndex(start=0, stop=785163, step=1)

In [95]:
y_train.index

RangeIndex(start=0, stop=785163, step=1)

In [96]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 785163 entries, 0 to 785162
Data columns (total 48 columns):
timestamp                   785163 non-null object
event                       785163 non-null object
person                      785163 non-null object
url                         63491 non-null object
sku                         445755 non-null float64
model                       446046 non-null object
condition                   445755 non-null object
storage                     785163 non-null float64
color                       445755 non-null object
skus                        167163 non-null object
search_term                 37400 non-null object
staticpage                  3832 non-null object
campaign_source             63532 non-null object
search_engine               35379 non-null object
channel                     68601 non-null object
new_vs_returning            68601 non-null object
city                        68601 non-null object
region                      6

In [97]:
#grid_cv = GridSearchCV(feature_processing, param_grid=parameters,scoring = 'neg_mean_absolute_error', verbose=1, error_score='raise')
#grid_cv.fit(X_train,y_train)

In [98]:
#print('Best score and parameter combination: ')
#print(grid.best_score_)    
#print(grid.best_params_)    

#preds = grid_cv.predict(X_test)

#print('On test set: ' + str(roc_auc_score(y_test,preds)))

## Predicciones!
---

### Predicción para train con test

In [99]:
preds = feature_processing.predict(X_test)

### Precisión del test

In [100]:
roc_auc_score(y_test,preds)

0.9999465000429922

### Entrenamiento con train solo

In [101]:
#preds = feature_processing.predict_proba(train_df)

### Predicción posta :)

In [102]:
to_predict = to_predict.reset_index().drop(columns='index')

In [103]:
to_predict.index

RangeIndex(start=0, stop=1169795, step=1)

In [104]:
preds_posta = feature_processing.predict(to_predict)

MemoryError: 

In [None]:
to_publish = pd.DataFrame()

In [None]:
to_publish['person'] = to_predict.person

In [None]:
to_publish['label'] = preds_posta

In [None]:
to_publish.groupby('person', as_index=False).mean().to_csv('predictions/26_11_xgb_te.csv', index=False)