In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
# Import Area Under the Receiver Operating Characteristic Curve metric to evaluate results
from sklearn.metrics import roc_auc_score
from sklearn.pipeline import Pipeline
import sklearn

In [None]:
from sklearn import preprocessing
from sklearn.feature_extraction import DictVectorizer
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_extraction import FeatureHasher
from sklearn.preprocessing import LabelBinarizer

In [None]:
#Leer el que arme hoy y probar
events_df = pd.read_csv('data/events_with_features.csv', low_memory=False) 

#events_df = pd.read_csv('/home/miki_mustard/ev_con_2_supports.csv', low_memory=False) 
#A este csv le tengo que agregar lo de las fechas ciclicas
labels_df = pd.read_csv("data/fiuba-trocafone-tp2-final-set/labels_training_set.csv", low_memory=False)

In [None]:
# armo df con registros completos clasificados
train_df = events_df.merge(labels_df, on='person', how='right')
train_df.shape

In [None]:
y_train = train_df.label

In [None]:
train_df = train_df.drop(columns=['label'])

In [None]:
train_df.columns

In [None]:
#train_df.to_csv('train_datos.csv', index=False)

In [None]:
# armo df con registros a predecir unicamente
to_predict = events_df[~events_df.person.isin(labels_df.person)]

In [None]:
#to_predict.to_csv('to_predict.csv', index=False)

In [None]:
test_size = 0.33
# define a seed, so same experiments output same results every time
seed = 12

In [None]:

# perform the train/test split
X_train, X_test, y_train, y_test = train_test_split(train_df.loc[:, train_df.columns != 'label'], 
                                                    y_train, 
                                                    test_size=test_size, 
                                                    random_state=seed)

***
## Note on preprocessing
All preprocessing that can be done in only one way (i.e. it needs no hyperparameter tuning) is done outside the pipelines.

Good pipeline sources: 
* http://zacstewart.com/2014/08/05/pipelines-of-featureunions-of-pipelines.html
* https://www.kaggle.com/baghern/a-deep-dive-into-sklearn-pipelines
* https://www.kaggle.com/metadist/work-like-a-pro-with-pipelines-and-feature-unions
* https://www.kaggle.com/sermakarevich/sklearn-pipelines-tutorial
* http://zacstewart.com/2014/08/05/pipelines-of-featureunions-of-pipelines.html
***

In [None]:
# some date processing
def date_proc(df):
    """Derive calendar features from ``df['timestamp']``, modifying ``df`` in place.

    The ``timestamp`` column is parsed with ``pd.to_datetime`` and the
    following columns are added (or overwritten):

    * ``year``, ``month``, ``day``, ``hour`` -- numeric components.
    * ``weekday`` -- English day name (e.g. ``'Monday'``).
    * ``year_month_day`` -- the timestamp truncated to midnight.

    Rows whose timestamp is missing get NaN/NaT in every derived column.

    Returns None; the caller's frame is mutated.
    """
    df['timestamp'] = pd.to_datetime(df['timestamp'])
    stamps = df['timestamp'].dt
    df['year'] = stamps.year
    df['month'] = stamps.month
    df['day'] = stamps.day
    df['weekday'] = stamps.day_name()
    df['hour'] = stamps.hour
    # Vectorised truncation to the calendar date. Building "Y/M/D" strings
    # row by row and re-parsing them is slow and raises as soon as one
    # timestamp is NaT ("nan/nan/nan" is not a parseable date).
    # NOTE(review): for tz-aware timestamps normalize() keeps the timezone --
    # confirm the input is tz-naive.
    df['year_month_day'] = stamps.normalize()
    
#date_proc(X_train)
#date_proc(X_test)
date_proc(train_df)

In [None]:
train_df.columns

In [None]:
X_train.dtypes

Primero hacemos un label encoding con el weekday, luego aplicamos una transformación que contemple la naturaleza cíclica de la semana. Esto último lo aplicaremos también al resto de los features cíclicos (como se explica, por ejemplo, acá: https://ianlondon.github.io/blog/encoding-cyclical-features-24hour-time/)

In [None]:
# LabelEncoder.fit would number the day names alphabetically (Friday=0,
# Monday=1, Saturday=2, ...), which scrambles the weekly order that the
# sin/cos weekday encoding applied afterwards relies on. Pin the classes to
# calendar order instead, so Monday=0 ... Sunday=6.
weekday_le = preprocessing.LabelEncoder()
weekday_le.classes_ = np.array(['Monday', 'Tuesday', 'Wednesday', 'Thursday',
                                'Friday', 'Saturday', 'Sunday'])

#X_train.weekday = weekday_le.transform(X_train.weekday)
#X_test.weekday = weekday_le.transform(X_test.weekday)
train_df.weekday = weekday_le.transform(train_df.weekday)

In [None]:
def month_to_cyclic(df):
    """Replace ``df['month']`` with its sine/cosine encoding, in place.

    Adds ``month_sin`` and ``month_cos`` (angle ``2*pi*month/12``) and then
    drops the ``month`` column. Returns None.
    """
    # One vectorised NumPy call per column instead of a Python lambda per row.
    angle = 2*np.pi*df['month'].astype(float)/12
    df['month_sin'] = np.sin(angle)
    df['month_cos'] = np.cos(angle)
    df.drop('month', axis=1, inplace=True)
    
def day_to_cyclic(df):
    """Replace ``df['day']`` with its sine/cosine encoding, in place.

    Adds ``day_sin`` and ``day_cos`` (angle ``2*pi*day/31``, i.e. every month
    is treated as 31 days long) and then drops the ``day`` column.
    Returns None.
    """
    # One vectorised NumPy call per column instead of a Python lambda per row.
    angle = 2*np.pi*df['day'].astype(float)/31
    df['day_sin'] = np.sin(angle)
    df['day_cos'] = np.cos(angle)
    df.drop('day', axis=1, inplace=True)

def weekday_to_cyclic(df):
    """Replace ``df['weekday']`` with its sine/cosine encoding, in place.

    Adds ``weekday_sin`` and ``weekday_cos`` (angle ``2*pi*weekday/7``) and
    then drops the ``weekday`` column. The column must already be numeric;
    day names have to be integer-encoded before calling this. Returns None.
    """
    # One vectorised NumPy call per column instead of a Python lambda per row.
    angle = 2*np.pi*df['weekday'].astype(float)/7
    df['weekday_sin'] = np.sin(angle)
    df['weekday_cos'] = np.cos(angle)
    df.drop('weekday', axis=1, inplace=True)

def hour_to_cyclic(df):
    """Replace ``df['hour']`` with its sine/cosine encoding, in place.

    Adds ``hour_sin`` and ``hour_cos`` (angle ``2*pi*hour/24``) and then
    drops the ``hour`` column. Returns None.
    """
    # One vectorised NumPy call per column instead of a Python lambda per row.
    angle = 2*np.pi*df['hour'].astype(float)/24
    df['hour_sin'] = np.sin(angle)
    df['hour_cos'] = np.cos(angle)
    df.drop('hour', axis=1, inplace=True)

In [None]:
# cell to compare results before & after processing
# The date columns were added to train_df after the split, so preview that frame.
train_df[['year','month','day','weekday','hour']].head()

In [None]:
month_to_cyclic(train_df)
day_to_cyclic(train_df)
weekday_to_cyclic(train_df)
hour_to_cyclic(train_df)

In [None]:
# The cyclic columns were added to train_df (not to the split frames), so preview that frame.
train_df[['month_sin','month_cos','day_sin','day_cos','weekday_sin','weekday_cos','hour_sin','hour_cos']].head()

In [None]:
# errors='ignore': the split frames only carry this helper column when
# date_proc was run on them; reassigning (instead of inplace=True) keeps the
# cell safe to re-run.
X_train = X_train.drop(columns=['year_month_day'], errors='ignore')
X_test = X_test.drop(columns=['year_month_day'], errors='ignore')

In [None]:
X_train.dtypes

### Build some custom transformers

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin

class ColumnSelector(BaseEstimator, TransformerMixin):
    """
    Pipeline step that keeps only the requested columns of its input.

    ``cols`` is stored as given and used as the column indexer in
    ``X.loc[:, cols]`` when ``transform`` is called.
    """
    def __init__(self, cols):
        self.cols = cols

    def fit(self, X, y=None):
        # Nothing is learned from the data; fitting is a no-op.
        return self

    def transform(self, X, y=None):
        wanted = self.cols
        return X.loc[:, wanted]

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin

class NaFiller(BaseEstimator, TransformerMixin):
    """
    Pipeline step that replaces missing values with a fixed filler.

    ``filler`` is stored as given and passed to ``X.fillna`` when
    ``transform`` is called.
    """
    def __init__(self, filler):
        self.filler = filler

    def fit(self, X, y=None):
        # Nothing is learned from the data; fitting is a no-op.
        return self

    def transform(self, X, y=None):
        filled = X.fillna(self.filler)
        return filled

In [None]:
X_test.columns

## q1

In [None]:
# from https://www.kaggle.com/baghern/a-deep-dive-into-sklearn-pipelines:
# To make a pipeline, just pass an array of tuples of the format (name, object). The first part is the name of the action, and the second is the actual object. 

q1_pipe = Pipeline([
    ('selector', ColumnSelector(['q1'])),
    ('na_filler', NaFiller("")),
    ('hasher', OneHotEncoder(handle_unknown='ignore')) 
])

In [None]:
q1_pipe.fit_transform(train_df)

## q2

In [None]:
# from https://www.kaggle.com/baghern/a-deep-dive-into-sklearn-pipelines:
# To make a pipeline, just pass an array of tuples of the format (name, object). The first part is the name of the action, and the second is the actual object. 

q2_pipe = Pipeline([
    ('selector', ColumnSelector(['q2'])),
    ('na_filler', NaFiller("")),
    ('hasher', OneHotEncoder(handle_unknown='ignore')) 
])


In [None]:
q2_pipe.fit_transform(train_df)

## q3

In [None]:
# from https://www.kaggle.com/baghern/a-deep-dive-into-sklearn-pipelines:
# To make a pipeline, just pass an array of tuples of the format (name, object). The first part is the name of the action, and the second is the actual object. 

q3_pipe = Pipeline([
    ('selector', ColumnSelector(['q3'])),
    ('na_filler', NaFiller("")),
    ('hasher', OneHotEncoder(handle_unknown='ignore')) 
])

In [None]:
q3_pipe.fit_transform(train_df)

## q4

In [None]:
# from https://www.kaggle.com/baghern/a-deep-dive-into-sklearn-pipelines:
# To make a pipeline, just pass an array of tuples of the format (name, object). The first part is the name of the action, and the second is the actual object. 

q4_pipe = Pipeline([
    ('selector', ColumnSelector(['q4'])),
    ('na_filler', NaFiller("")),
    ('hasher', OneHotEncoder(handle_unknown='ignore')) 
])

In [None]:
q4_pipe.fit_transform(train_df)

## q5

In [None]:
# from https://www.kaggle.com/baghern/a-deep-dive-into-sklearn-pipelines:
# To make a pipeline, just pass an array of tuples of the format (name, object). The first part is the name of the action, and the second is the actual object. 

q5_pipe = Pipeline([
    ('selector', ColumnSelector(['q5'])),
    ('na_filler', NaFiller("")),
    ('hasher', OneHotEncoder(handle_unknown='ignore')) 
])

In [None]:
q5_pipe.fit_transform(train_df)

## q6

In [None]:
# from https://www.kaggle.com/baghern/a-deep-dive-into-sklearn-pipelines:
# To make a pipeline, just pass an array of tuples of the format (name, object). The first part is the name of the action, and the second is the actual object. 

q6_pipe = Pipeline([
    ('selector', ColumnSelector(['q6'])),
    ('na_filler', NaFiller("")),
    ('hasher', OneHotEncoder(handle_unknown='ignore')) 
])

In [None]:
q6_pipe.fit_transform(train_df)

## q7

In [None]:
# from https://www.kaggle.com/baghern/a-deep-dive-into-sklearn-pipelines:
# To make a pipeline, just pass an array of tuples of the format (name, object). The first part is the name of the action, and the second is the actual object. 

q7_pipe = Pipeline([
    ('selector', ColumnSelector(['q7'])),
    ('na_filler', NaFiller("")),
    ('hasher', OneHotEncoder(handle_unknown='ignore')) 
])

In [None]:
q7_pipe.fit_transform(train_df)

## q8

In [None]:
# from https://www.kaggle.com/baghern/a-deep-dive-into-sklearn-pipelines:
# To make a pipeline, just pass an array of tuples of the format (name, object). The first part is the name of the action, and the second is the actual object. 

q8_pipe = Pipeline([
    ('selector', ColumnSelector(['q8'])),
    ('na_filler', NaFiller("")),
    ('hasher', OneHotEncoder(handle_unknown='ignore')) 
])

In [None]:
q8_pipe.fit_transform(train_df)

## q9

In [None]:
# from https://www.kaggle.com/baghern/a-deep-dive-into-sklearn-pipelines:
# To make a pipeline, just pass an array of tuples of the format (name, object). The first part is the name of the action, and the second is the actual object. 

q9_pipe = Pipeline([
    ('selector', ColumnSelector(['q9'])),
    ('na_filler', NaFiller("")),
    ('hasher', OneHotEncoder(handle_unknown='ignore')) 
])

In [None]:
q9_pipe.fit_transform(train_df)

## q10

In [None]:
# from https://www.kaggle.com/baghern/a-deep-dive-into-sklearn-pipelines:
# To make a pipeline, just pass an array of tuples of the format (name, object). The first part is the name of the action, and the second is the actual object. 

q10_pipe = Pipeline([
    ('selector', ColumnSelector(['q10'])),
    ('na_filler', NaFiller("")),
    ('hasher', OneHotEncoder(handle_unknown='ignore')) 
])

In [None]:
q10_pipe.fit_transform(train_df)

## support_p_todos_sus_ev

In [None]:
# from https://www.kaggle.com/baghern/a-deep-dive-into-sklearn-pipelines:
# To make a pipeline, just pass an array of tuples of the format (name, object). The first part is the name of the action, and the second is the actual object. 

sup_pipe = Pipeline([
    ('selector', ColumnSelector(['support_p_todos_sus_ev'])),
    ('na_filler', NaFiller("")),
    ('hasher', OneHotEncoder(handle_unknown='ignore')) 
])

In [None]:
sup_pipe.fit_transform(train_df)

## Support_con_peso

In [None]:
# from https://www.kaggle.com/baghern/a-deep-dive-into-sklearn-pipelines:
# To make a pipeline, just pass an array of tuples of the format (name, object). The first part is the name of the action, and the second is the actual object. 

sup_peso_pipe = Pipeline([
    ('selector', ColumnSelector(['support_con_peso'])),
    ('na_filler', NaFiller("")),
    ('hasher', OneHotEncoder(handle_unknown='ignore')) 
])

In [None]:
sup_peso_pipe.fit_transform(train_df)

##  'support_ind'

In [None]:
# from https://www.kaggle.com/baghern/a-deep-dive-into-sklearn-pipelines:
# To make a pipeline, just pass an array of tuples of the format (name, object). The first part is the name of the action, and the second is the actual object. 

sup_ind_pipe = Pipeline([
    ('selector', ColumnSelector(['support_ind'])),
    ('na_filler', NaFiller("")),
    ('hasher', OneHotEncoder(handle_unknown='ignore')) 
])

In [None]:
sup_ind_pipe.fit_transform(train_df)

## url comprar

In [None]:
#from https://www.kaggle.com/baghern/a-deep-dive-into-sklearn-pipelines:
# To make a pipeline, just pass an array of tuples of the format (name, object). The first part is the name of the action, and the second is the actual object. 

urlcomprar_pipe = Pipeline([
    ('selector', ColumnSelector(['url_comprar'])),
    ('na_filler', NaFiller("")),
    ('hasher', OneHotEncoder(handle_unknown='ignore')) 
])

## url vender

In [None]:
#from https://www.kaggle.com/baghern/a-deep-dive-into-sklearn-pipelines:
# To make a pipeline, just pass an array of tuples of the format (name, object). The first part is the name of the action, and the second is the actual object. 

urlvender_pipe = Pipeline([
    ('selector', ColumnSelector(['url_vender'])),
    ('na_filler', NaFiller("")),
    ('hasher', OneHotEncoder(handle_unknown='ignore')) 
])

## year

In [None]:
train_df.columns

In [None]:
#from https://www.kaggle.com/baghern/a-deep-dive-into-sklearn-pipelines:
# To make a pipeline, just pass an array of tuples of the format (name, object). The first part is the name of the action, and the second is the actual object. 

year_pipe = Pipeline([
    ('selector', ColumnSelector(['year'])),
    ('na_filler', NaFiller("")),
    ('hasher', OneHotEncoder(handle_unknown='ignore')) 
])

In [None]:
year_pipe.fit_transform(train_df)

## month_sin

In [None]:
#from https://www.kaggle.com/baghern/a-deep-dive-into-sklearn-pipelines:
# To make a pipeline, just pass an array of tuples of the format (name, object). The first part is the name of the action, and the second is the actual object. 

monthsin_pipe = Pipeline([
    ('selector', ColumnSelector(['month_sin'])),
    ('na_filler', NaFiller("")),
    ('hasher', OneHotEncoder(handle_unknown='ignore')) 
])

In [None]:
monthsin_pipe.fit_transform(train_df)

## month_cos

In [None]:
#from https://www.kaggle.com/baghern/a-deep-dive-into-sklearn-pipelines:
# To make a pipeline, just pass an array of tuples of the format (name, object). The first part is the name of the action, and the second is the actual object. 

monthcos_pipe = Pipeline([
    ('selector', ColumnSelector(['month_cos'])),
    ('na_filler', NaFiller("")),
    ('hasher', OneHotEncoder(handle_unknown='ignore')) 
])

In [None]:
monthcos_pipe.fit_transform(train_df)

## day_sin

In [None]:
#from https://www.kaggle.com/baghern/a-deep-dive-into-sklearn-pipelines:
# To make a pipeline, just pass an array of tuples of the format (name, object). The first part is the name of the action, and the second is the actual object. 

daysin_pipe = Pipeline([
    ('selector', ColumnSelector(['day_sin'])),
    ('na_filler', NaFiller("")),
    ('hasher', OneHotEncoder(handle_unknown='ignore')) 
])

In [None]:
daysin_pipe.fit_transform(train_df)

## day_cos

In [None]:
#from https://www.kaggle.com/baghern/a-deep-dive-into-sklearn-pipelines:
# To make a pipeline, just pass an array of tuples of the format (name, object). The first part is the name of the action, and the second is the actual object. 

daycos_pipe = Pipeline([
    ('selector', ColumnSelector(['day_cos'])),
    ('na_filler', NaFiller("")),
    ('hasher', OneHotEncoder(handle_unknown='ignore')) 
])

In [None]:
daycos_pipe.fit_transform(train_df)

## weekday_sin

In [None]:
#from https://www.kaggle.com/baghern/a-deep-dive-into-sklearn-pipelines:
# To make a pipeline, just pass an array of tuples of the format (name, object). The first part is the name of the action, and the second is the actual object. 

weekdaysin_pipe = Pipeline([
    ('selector', ColumnSelector(['weekday_sin'])),
    ('na_filler', NaFiller("")),
    ('hasher', OneHotEncoder(handle_unknown='ignore')) 
])

In [None]:
weekdaysin_pipe.fit_transform(train_df)

## weekday_cos

In [None]:
#from https://www.kaggle.com/baghern/a-deep-dive-into-sklearn-pipelines:
# To make a pipeline, just pass an array of tuples of the format (name, object). The first part is the name of the action, and the second is the actual object. 

weekdaycos_pipe = Pipeline([
    ('selector', ColumnSelector(['weekday_cos'])),
    ('na_filler', NaFiller("")),
    ('hasher', OneHotEncoder(handle_unknown='ignore')) 
])

In [None]:
weekdaycos_pipe.fit_transform(train_df)

## hour_sin

In [None]:
#from https://www.kaggle.com/baghern/a-deep-dive-into-sklearn-pipelines:
# To make a pipeline, just pass an array of tuples of the format (name, object). The first part is the name of the action, and the second is the actual object. 

hoursin_pipe = Pipeline([
    ('selector', ColumnSelector(['hour_sin'])),
    ('na_filler', NaFiller("")),
    ('hasher', OneHotEncoder(handle_unknown='ignore')) 
])

In [None]:
print('The scikit-learn version is {}.'.format(sklearn.__version__))



In [None]:
hoursin_pipe.fit_transform(train_df)

## hour_cos

In [None]:
#from https://www.kaggle.com/baghern/a-deep-dive-into-sklearn-pipelines:
# To make a pipeline, just pass an array of tuples of the format (name, object). The first part is the name of the action, and the second is the actual object. 

hourcos_pipe = Pipeline([
    ('selector', ColumnSelector(['hour_cos'])),
    ('na_filler', NaFiller("")),
    ('hasher', OneHotEncoder(handle_unknown='ignore')) 
])

In [None]:
hourcos_pipe.fit_transform(train_df)

## samsung

In [None]:
#from https://www.kaggle.com/baghern/a-deep-dive-into-sklearn-pipelines:
# To make a pipeline, just pass an array of tuples of the format (name, object). The first part is the name of the action, and the second is the actual object. 

samsung_pipe = Pipeline([
    ('selector', ColumnSelector(['samsung'])),
    ('na_filler', NaFiller("")),
    ('hasher', OneHotEncoder(handle_unknown='ignore')) 
])


## motorola

In [None]:
#from https://www.kaggle.com/baghern/a-deep-dive-into-sklearn-pipelines:
# To make a pipeline, just pass an array of tuples of the format (name, object). The first part is the name of the action, and the second is the actual object. 

# Select the 'motorola' column; selecting 'year' here would just duplicate
# year_pipe.
# NOTE(review): assumes train_df has a 'motorola' column, like the 'samsung'
# and 'iphone' columns used by the sibling pipelines -- confirm.
motorola_pipe = Pipeline([
    ('selector', ColumnSelector(['motorola'])),
    ('na_filler', NaFiller("")),
    ('hasher', OneHotEncoder(handle_unknown='ignore')) 
])

In [None]:
#motorola_pipe.fit_transform(train_df)

## iphone

In [None]:
#from https://www.kaggle.com/baghern/a-deep-dive-into-sklearn-pipelines:
# To make a pipeline, just pass an array of tuples of the format (name, object). The first part is the name of the action, and the second is the actual object. 

iphone_pipe = Pipeline([
    ('selector', ColumnSelector(['iphone'])),
    ('na_filler', NaFiller("")),
    ('hasher', OneHotEncoder(handle_unknown='ignore')) 
])

In [None]:
#iphone_pipe.fit_transform(train_df)

# Feature pipeline creation and some pre processing

## Browser version

In [None]:
num_of_unique_browsers = len(X_train.browser_version.unique())

In [None]:
# from https://www.kaggle.com/baghern/a-deep-dive-into-sklearn-pipelines:
# To make a pipeline, just pass an array of tuples of the format (name, object). The first part is the name of the action, and the second is the actual object. 

browser_pipe = Pipeline([
    ('selector', ColumnSelector(['browser_version'])),
    ('na_filler', NaFiller("")),
    ('hasher', OneHotEncoder(handle_unknown='ignore')) 
])

In [None]:
browser_pipe.fit_transform(train_df)

This cell has been replaced with the pipeline above

```
# http://fastml.com/converting-categorical-data-into-numbers-with-pandas-and-scikit-learn/
# https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.to_dict.html
v = FeatureHasher(n_features=num_of_unique_browsers//2, input_type='string')
# browser_version_dict = X_train[['browser_version']]
X_train.browser_version.fillna("", inplace=True)
x = v.fit_transform(X_train[['browser_version']])
# x = v.fit_transform(browser_version_dict)
# Attach de sparse vector to df
```

## operating_system_version

In [None]:
X_train.operating_system_version.unique()

In [None]:
os_num_of_unique = len(X_train.operating_system_version.unique())

In [None]:
os_ver_pipe = Pipeline([
    ('selector', ColumnSelector(['operating_system_version'])),
    ('na_filler', NaFiller("")),
    ('hasher', OneHotEncoder(handle_unknown='ignore')) 
])

In [None]:
os_ver_pipe.fit_transform(train_df)

## screen_resolution

In [None]:
def get_screen_width(x):
    """Return the width, as an int, of a ``"<width>x<height>"`` string.

    Missing (NaN/None), empty or unparseable values yield 0, so the result
    is always an int instead of a mix of str and int.
    """
    if not isinstance(x, str):
        # NaN / None reach here when the column was not filled beforehand.
        return 0
    width = x.split("x")[0]
    try:
        return int(width)
    except ValueError:
        # Empty string or a non-numeric width: treat as unknown.
        return 0
    
def get_screen_height(x):
    """Return the height, as an int, of a ``"<width>x<height>"`` string.

    Missing (NaN/None), empty or unparseable values -- including strings
    with no ``"x"`` separator -- yield 0, so the result is always an int.
    """
    if not isinstance(x, str):
        # NaN / None reach here when the column was not filled beforehand.
        return 0
    parts = x.split("x")
    if len(parts) < 2:
        # No separator, hence no height component.
        return 0
    try:
        return int(parts[1])
    except ValueError:
        # Non-numeric height: treat as unknown.
        return 0

def process_screen_res(df):
    """Split ``df['screen_resolution']`` into width and height columns, in place.

    Adds ``screen_width`` and ``screen_height`` (computed by
    ``get_screen_width`` / ``get_screen_height``) and then drops the
    ``screen_resolution`` column. Returns None.
    """
    # Work on a filled copy of the column: fillna(inplace=True) on a column
    # selection is chained assignment and does not reliably update df.
    resolution = df['screen_resolution'].fillna("")
    df['screen_width'] = resolution.apply(get_screen_width)
    df['screen_height'] = resolution.apply(get_screen_height)
    df.drop('screen_resolution', axis=1, inplace=True)

In [None]:
process_screen_res(train_df)
#process_screen_res(X_test)

In [None]:
#from https://www.kaggle.com/baghern/a-deep-dive-into-sklearn-pipelines:
# To make a pipeline, just pass an array of tuples of the format (name, object). The first part is the name of the action, and the second is the actual object. 

screenreswidth_pipe = Pipeline([
    ('selector', ColumnSelector(['screen_width'])),
    ('na_filler', NaFiller("")),
    ('hasher', OneHotEncoder(handle_unknown='ignore')) 
])

In [None]:
#creenreswidth_pipe.fit_transform(train_df)

In [None]:
#from https://www.kaggle.com/baghern/a-deep-dive-into-sklearn-pipelines:
# To make a pipeline, just pass an array of tuples of the format (name, object). The first part is the name of the action, and the second is the actual object. 

screenresheight_pipe = Pipeline([
    ('selector', ColumnSelector(['screen_height'])),
    ('na_filler', NaFiller("")),
    ('hasher', OneHotEncoder(handle_unknown='ignore')) 
])

## device_type

In [None]:
X_train.device_type.unique()

In [None]:
# X_train.device_type.fillna("", inplace=True)

In [None]:
device_type_pipe = Pipeline([
    ('selector', ColumnSelector(['device_type'])),
    ('na_filler', NaFiller("")),
    ('one_hot', OneHotEncoder(handle_unknown='ignore')) 
])

In [None]:
device_type_pipe.fit_transform(train_df)

## country

In [None]:
print(X_train.country.unique())
print("\n\tlen: " + str(len(X_train.country.unique())))

In [None]:
# X_train.country.fillna("", inplace=True)

In [None]:
country_pipe = Pipeline([
    ('selector', ColumnSelector(['country'])),
    ('na_filler', NaFiller("")),
    ('one_hot', OneHotEncoder(handle_unknown='ignore')) 
])

## region

In [None]:
print(X_train.region.unique())
print("\n\tlen: " + str(len(X_train.region.unique())))

In [None]:
region_pipe = Pipeline([
    ('selector', ColumnSelector(['region'])),
    ('na_filler', NaFiller("")),
    ('one_hot', OneHotEncoder(handle_unknown='ignore'))
])

In [None]:
# region_pipe.fit_transform(X_train)

## city

In [None]:
print(X_train.city.unique())
print("\n\tlen: " + str(len(X_train.city.unique())))

In [None]:
city_pipe = Pipeline([
    ('selector', ColumnSelector(['city'])),
    ('na_filler', NaFiller("")),
    ('one_hot', OneHotEncoder(handle_unknown='ignore'))
])

In [None]:
# city_pipe.fit_transform(X_train)

## new_vs_returning

In [None]:
print(X_train.new_vs_returning.unique())
print("\n\tlen: " + str(len(X_train.new_vs_returning.unique())))

In [None]:
new_vs_returning_pipe = Pipeline([
    ('selector', ColumnSelector(['new_vs_returning'])),
    ('na_filler', NaFiller("")),
    ('one_hot', OneHotEncoder(handle_unknown='ignore'))
])

In [None]:
# new_vs_returning_pipe.fit_transform(X_train)

## channel

In [None]:
print(X_train.channel.unique())
print("\n\tlen: " + str(len(X_train.channel.unique())))

In [None]:
channel_pipe = Pipeline([
    ('selector', ColumnSelector(['channel'])),
    ('na_filler', NaFiller("")),
    ('one_hot', OneHotEncoder(handle_unknown='ignore'))
])

In [None]:
# channel_pipe.fit_transform(X_train)

## search_engine

In [None]:
print(X_train.search_engine.unique())
print("\n\tlen: " + str(len(X_train.search_engine.unique())))

In [None]:
search_engine_pipe = Pipeline([
    ('selector', ColumnSelector(['search_engine'])),
    ('na_filler', NaFiller("")),
    ('one_hot', OneHotEncoder(handle_unknown='ignore'))
])

In [None]:
# search_engine_pipe.fit_transform(X_train)

## campaign_source

In [None]:
print(X_train.campaign_source.unique())
print("\n\tlen: " + str(len(X_train.campaign_source.unique())))

In [None]:
campaign_source_pipe = Pipeline([
    ('selector', ColumnSelector(['campaign_source'])),
    ('na_filler', NaFiller("")),
    ('one_hot', OneHotEncoder(handle_unknown='ignore'))
])

In [None]:
# campaign_source_pipe.fit_transform(X_train)

## staticpage

In [None]:
print(X_train.staticpage.unique())
print("\n\tlen: " + str(len(X_train.staticpage.unique())))

In [None]:
staticpage_pipe = Pipeline([
    ('selector', ColumnSelector(['staticpage'])),
    ('na_filler', NaFiller("")),
    ('one_hot', OneHotEncoder(handle_unknown='ignore'))
])

In [None]:
# staticpage_pipe.fit_transform(X_train)

## search_term

In [None]:
print(X_train.search_term.unique())
print("\n\tlen: " + str(len(X_train.search_term.unique())))

In [None]:
search_term_pipe = Pipeline([
    ('selector', ColumnSelector(['search_term'])),
    ('na_filler', NaFiller("")),
    ('one_hot', OneHotEncoder(handle_unknown='ignore'))
])

In [None]:
# search_term_pipe.fit_transform(X_train)

## skus

In [None]:
print(X_train.skus.unique())
print("\n\tlen: " + str(len(X_train.skus.unique())))

In [None]:
skus_pipe = Pipeline([
    ('selector', ColumnSelector(['skus'])),
    ('na_filler', NaFiller("")),
    ('one_hot', OneHotEncoder(handle_unknown='ignore'))
])

In [None]:
# skus_pipe.fit_transform(X_train)

## color

In [None]:
print(X_train.color.unique())
print("\n\tlen: " + str(len(X_train.color.unique())))

In [None]:
color_pipe = Pipeline([
    ('selector', ColumnSelector(['color'])),
    ('na_filler', NaFiller("")),
    ('one_hot', OneHotEncoder(handle_unknown='ignore'))
])

In [None]:
# color_pipe.fit_transform(X_train)

## storage

In [None]:
def process_storage_string(x):
    """Convert a storage label such as ``"16GB"`` or ``"512MB"`` to gigabytes.

    Returns 0 for missing values, an int for ``"<n>GB"`` labels and
    ``n / 1024`` (a float) for ``"<n>MB"`` labels. Values that are not
    strings are returned unchanged, so applying the conversion twice to
    the same column is harmless.

    Raises ValueError when the text before the unit is not an integer.
    """
    if pd.isna(x):
        return 0
    if not isinstance(x, str):
        # Already numeric, e.g. the conversion cell was executed twice.
        return x
    s = x.split("GB")
    if len(s) == 2:
        # case data in GB
        return int(s[0])
    # case data in MB
    return int(x.split("MB")[0])/1024

def storage_process(df):
    """Overwrite ``df.storage`` with each value run through ``process_storage_string``.

    The frame is modified in place; nothing is returned.
    """
    converted = df.storage.apply(process_storage_string)
    df.storage = converted

In [None]:
storage_process(train_df)
#storage_process(X_test)

In [None]:
storage_pipe = Pipeline([
    ('selector', ColumnSelector(['storage'])),
    ('na_filler', NaFiller("")),
    ('one_hot', OneHotEncoder(handle_unknown='ignore'))
])


## condition

In [None]:
print(X_train.condition.unique())
print("\n\tlen: " + str(len(X_train.condition.unique())))

In [None]:
condition_pipe = Pipeline([
    ('selector', ColumnSelector(['condition'])),
    ('na_filler', NaFiller("")),
    ('one_hot', OneHotEncoder(handle_unknown='ignore'))
])

In [None]:
# condition_pipe.fit_transform(X_train)

## model

In [None]:
print(X_train.model.unique())
print("\n\tlen: " + str(len(X_train.model.unique())))

In [None]:
model_pipe = Pipeline([
    ('selector', ColumnSelector(['model'])),
    ('na_filler', NaFiller("")),
    ('one_hot', OneHotEncoder(handle_unknown='ignore'))
])

In [None]:
# model_pipe.fit_transform(X_train)

## sku

In [None]:
print(X_train.sku.unique())
print("\n\tlen: " + str(len(X_train.sku.unique())))

In [None]:
sku_pipe = Pipeline([
    ('selector', ColumnSelector(['sku'])),
    ('na_filler', NaFiller(0)),
    ('one_hot', OneHotEncoder(handle_unknown='ignore'))
])

In [None]:
# sku_pipe.fit_transform(X_train)

## url

In [None]:
print(X_train.url.unique())
print("\n\tlen: " + str(len(X_train.url.unique())))

In [None]:
url_pipe = Pipeline([
    ('selector', ColumnSelector(['url'])),
    ('na_filler', NaFiller("")),
    ('one_hot', OneHotEncoder(handle_unknown='ignore'))
])

In [None]:
# url_pipe.fit_transform(X_train)

## person

In [None]:
print(X_train.person.unique())
print("\n\tlen: " + str(len(X_train.person.unique())))

In [None]:
person_pipe = Pipeline([
    ('selector', ColumnSelector(['person'])),
    ('na_filler', NaFiller("")),
    ('one_hot', OneHotEncoder(handle_unknown='ignore'))
])

In [None]:
# person_pipe.fit_transform(X_train)

## event

In [None]:
print(X_train.event.unique())
print("\n\tlen: " + str(len(X_train.event.unique())))

In [None]:
event_pipe = Pipeline([
    ('selector', ColumnSelector(['event'])),
    ('na_filler', NaFiller("")),
    ('one_hot', OneHotEncoder(handle_unknown='ignore'))
])

In [None]:
# event_pipe.fit_transform(X_train)

In [None]:
train_df.columns

# Event_count 

In [None]:
event_count_pipe = Pipeline([
    ('selector', ColumnSelector(['event_count'])),
    ('na_filler', NaFiller("")),
    ('one_hot', OneHotEncoder(handle_unknown='ignore'))
])

In [None]:
event_count_pipe.fit_transform(train_df)

# Event_count_q1

In [None]:
event_count_q1_pipe = Pipeline([
    ('selector', ColumnSelector(['event_count_q1'])),
    ('na_filler', NaFiller("")),
    ('one_hot', OneHotEncoder(handle_unknown='ignore'))
])

In [None]:
#event_count_q1_pipe.fit_transform(train_df)

# Event_count_q2

In [None]:
event_count_q2_pipe = Pipeline([
    ('selector', ColumnSelector(['event_count_q2'])),
    ('na_filler', NaFiller("")),
    ('one_hot', OneHotEncoder(handle_unknown='ignore'))
])

# Event_count_q3

In [None]:
event_count_q3_pipe = Pipeline([
    ('selector', ColumnSelector(['event_count_q3'])),
    ('na_filler', NaFiller("")),
    ('one_hot', OneHotEncoder(handle_unknown='ignore'))
])

# Event_count_q4

In [None]:
event_count_q4_pipe = Pipeline([
    ('selector', ColumnSelector(['event_count_q4'])),
    ('na_filler', NaFiller("")),
    ('one_hot', OneHotEncoder(handle_unknown='ignore'))
])

# Event_count_q5

In [None]:
event_count_q5_pipe = Pipeline([
    ('selector', ColumnSelector(['event_count_q5'])),
    ('na_filler', NaFiller("")),
    ('one_hot', OneHotEncoder(handle_unknown='ignore'))
])

# Event_count_q6

In [None]:
event_count_q6_pipe = Pipeline([
    ('selector', ColumnSelector(['event_count_q6'])),
    ('na_filler', NaFiller("")),
    ('one_hot', OneHotEncoder(handle_unknown='ignore'))
])

# Event_count_q7

In [None]:
event_count_q7_pipe = Pipeline([
    ('selector', ColumnSelector(['event_count_q7'])),
    ('na_filler', NaFiller("")),
    ('one_hot', OneHotEncoder(handle_unknown='ignore'))
])

# Event_count_q8

In [None]:
event_count_q8_pipe = Pipeline([
    ('selector', ColumnSelector(['event_count_q8'])),
    ('na_filler', NaFiller("")),
    ('one_hot', OneHotEncoder(handle_unknown='ignore'))
])

# Event_count_q9

In [None]:
event_count_q9_pipe = Pipeline([
    ('selector', ColumnSelector(['event_count_q9'])),
    ('na_filler', NaFiller("")),
    ('one_hot', OneHotEncoder(handle_unknown='ignore'))
])

# Event_count_q10

In [None]:
event_count_q10_pipe = Pipeline([
    ('selector', ColumnSelector(['event_count_q10'])),
    ('na_filler', NaFiller("")),
    ('one_hot', OneHotEncoder(handle_unknown='ignore'))
])

# Tiempo_total

In [None]:
tiempo_total_pipe = Pipeline([
    ('selector', ColumnSelector(['tiempo_total'])),
    ('na_filler', NaFiller("")),
    ('one_hot', OneHotEncoder(handle_unknown='ignore'))
])

In [None]:
tiempo_total_pipe.fit_transform(train_df)

# tiempo_total_q1

In [None]:
tiempo_total_q1_pipe = Pipeline([
    ('selector', ColumnSelector(['tiempo_total_q1'])),
    ('na_filler', NaFiller("")),
    ('one_hot', OneHotEncoder(handle_unknown='ignore'))
])

In [None]:
tiempo_total_q1_pipe.fit_transform(train_df)

# tiempo_total_q2

In [None]:
tiempo_total_q2_pipe = Pipeline([
    ('selector', ColumnSelector(['tiempo_total_q2'])),
    ('na_filler', NaFiller("")),
    ('one_hot', OneHotEncoder(handle_unknown='ignore'))
])

In [None]:
tiempo_total_q2_pipe.fit_transform(train_df)

# Tiempo_total_q3

In [None]:
tiempo_total_q3_pipe = Pipeline([
    ('selector', ColumnSelector(['tiempo_total_q3'])),
    ('na_filler', NaFiller("")),
    ('one_hot', OneHotEncoder(handle_unknown='ignore'))
])

In [None]:
tiempo_total_q3_pipe.fit_transform(train_df)

# Tiempo_total_q4 

In [None]:
tiempo_total_q4_pipe = Pipeline([
    ('selector', ColumnSelector(['tiempo_total_q4'])),
    ('na_filler', NaFiller("")),
    ('one_hot', OneHotEncoder(handle_unknown='ignore'))
])

In [None]:
tiempo_total_q4_pipe.fit_transform(train_df)

# Tiempo_total_q5

In [None]:
tiempo_total_q5_pipe = Pipeline([
    ('selector', ColumnSelector(['tiempo_total_q5'])),
    ('na_filler', NaFiller("")),
    ('one_hot', OneHotEncoder(handle_unknown='ignore'))
])

In [None]:
tiempo_total_q5_pipe.fit_transform(train_df)

# Tiempo_total_q6

In [None]:
tiempo_total_q6_pipe = Pipeline([
    ('selector', ColumnSelector(['tiempo_total_q6'])),
    ('na_filler', NaFiller("")),
    ('one_hot', OneHotEncoder(handle_unknown='ignore'))
])

In [None]:
tiempo_total_q6_pipe.fit_transform(train_df)

# Tiempo_total_q7

In [None]:
tiempo_total_q7_pipe = Pipeline([
    ('selector', ColumnSelector(['tiempo_total_q7'])),
    ('na_filler', NaFiller("")),
    ('one_hot', OneHotEncoder(handle_unknown='ignore'))
])

In [None]:
tiempo_total_q7_pipe.fit_transform(train_df)

# Tiempo_total_q8

In [None]:
tiempo_total_q8_pipe = Pipeline([
    ('selector', ColumnSelector(['tiempo_total_q8'])),
    ('na_filler', NaFiller("")),
    ('one_hot', OneHotEncoder(handle_unknown='ignore'))
])

In [None]:
tiempo_total_q8_pipe.fit_transform(train_df)

# Tiempo_total_q9

In [None]:
tiempo_total_q9_pipe = Pipeline([
    ('selector', ColumnSelector(['tiempo_total_q9'])),
    ('na_filler', NaFiller("")),
    ('one_hot', OneHotEncoder(handle_unknown='ignore'))
])

In [None]:
tiempo_total_q9_pipe.fit_transform(train_df)

# Tiempo_total_q10

In [None]:
tiempo_total_q10_pipe = Pipeline([
    ('selector', ColumnSelector(['tiempo_total_q10'])),
    ('na_filler', NaFiller("")),
    ('one_hot', OneHotEncoder(handle_unknown='ignore'))
])

In [None]:
tiempo_total_q10_pipe.fit_transform(train_df)

## Lets bring all of them together!

In [None]:
#from xgboost import XGBRegressor
from sklearn.linear_model import LogisticRegression

In [None]:
from sklearn.pipeline import FeatureUnion

# Concatenate the output of every per-column pipeline into one feature matrix.
# Commented-out entries are features that were tried and switched off; the
# notes next to them record what was observed at the time.
feats = FeatureUnion([
    ('browser', browser_pipe),
    ('os', os_ver_pipe),
    ('device_type', device_type_pipe),
    ('country', country_pipe),
    ('region', region_pipe),
    ('city', city_pipe),
    ('new_vs_ret', new_vs_returning_pipe),
    ('channel', channel_pipe),
    ('search_eng', search_engine_pipe),
    ('campaign', campaign_source_pipe),
    ('staticpage', staticpage_pipe),
    ('searchterm', search_term_pipe),
    ('skus', skus_pipe),
    ('color', color_pipe),
    ('condition', condition_pipe),
    ('model', model_pipe),
    ('sku', sku_pipe),
    ('url', url_pipe),
    ('person', person_pipe),
    ('event', event_pipe),
    ('q1', q1_pipe),
    ('q2', q2_pipe),
    ('q3', q3_pipe),
    ('q4', q4_pipe),
    ('q5', q5_pipe),
    ('q6', q6_pipe),
    ('q7', q7_pipe),
    ('q8', q8_pipe),
    ('q9', q9_pipe),
    ('q10', q10_pipe),
    #('sup_pipe', sup_pipe),
    #('sup_peso_pipe', sup_peso_pipe),  # this one does not seem to help; with it the result is 0.75
    # Without the other support feature the result drops to 0.70
    #('sup_ind_pipe', sup_ind_pipe),
    #('urlcomprar_pipe', urlcomprar_pipe),
    #('urlvender_pipe', urlvender_pipe),
    ('storage_pipe', storage_pipe),
    #('screenreswidth_pipe', screenreswidth_pipe),
    #('screenresheight_pipe', screenresheight_pipe),
    ('year_pipe', year_pipe),
    ('monthsin_pipe', monthsin_pipe),
    ('monthcos_pipe', monthcos_pipe),
    ('daysin_pipe', daysin_pipe),
    ('daycos_pipe', daycos_pipe),
    ('weekdaysin_pipe', weekdaysin_pipe),
    ('weekdaycos_pipe', weekdaycos_pipe),
    ('hoursin_pipe', hoursin_pipe),
    ('hourcos_pipe', hourcos_pipe),
    #('motorola_pipe', motorola_pipe),
    #('samsung_pipe', samsung_pipe),
    #('iphone_pipe', iphone_pipe)
    ('event_count_pipe', event_count_pipe),
    ('event_count_q1_pipe', event_count_q1_pipe),
    ('event_count_q2_pipe', event_count_q2_pipe),
    ('event_count_q3_pipe', event_count_q3_pipe),
    ('event_count_q4_pipe', event_count_q4_pipe),
    ('event_count_q5_pipe', event_count_q5_pipe),
    ('event_count_q6_pipe', event_count_q6_pipe),
    ('event_count_q7_pipe', event_count_q7_pipe),
    ('event_count_q8_pipe', event_count_q8_pipe),
    ('event_count_q9_pipe', event_count_q9_pipe),
    ('event_count_q10_pipe', event_count_q10_pipe),
    ('tiempo_total_pipe', tiempo_total_pipe),
    ('tiempo_total_q1_pipe', tiempo_total_q1_pipe),
    ('tiempo_total_q2_pipe', tiempo_total_q2_pipe),
    ('tiempo_total_q3_pipe', tiempo_total_q3_pipe),
    ('tiempo_total_q4_pipe', tiempo_total_q4_pipe),
    ('tiempo_total_q5_pipe', tiempo_total_q5_pipe),
    ('tiempo_total_q6_pipe', tiempo_total_q6_pipe),
    ('tiempo_total_q7_pipe', tiempo_total_q7_pipe),
    ('tiempo_total_q8_pipe', tiempo_total_q8_pipe),
    # Fixed: this entry previously reused tiempo_total_q8_pipe, so q8 was
    # included twice and q9 never reached the model.
    ('tiempo_total_q9_pipe', tiempo_total_q9_pipe),
    ('tiempo_total_q10_pipe', tiempo_total_q10_pipe),
])

# Full model: feature union followed by a logistic-regression classifier.
# The 'sag' solver shuffles the data, so pin random_state to the notebook's
# seed to keep runs reproducible.
feature_processing = Pipeline([
    ('feats', feats),
    ('lr', LogisticRegression(solver='sag', random_state=seed))
    #('xgb', XGBRegressor())
])

# Fit on the training split only. `y_train` was overwritten by
# train_test_split, so it lines up with X_train (not with the full train_df),
# and X_test stays unseen for evaluation.
feature_processing.fit(X_train, y_train)

In [None]:
# Score the held-out split so `preds` lines up row-for-row with `y_test`
# (it is compared against `y_test` by roc_auc_score further down).
# Column 1 of predict_proba is the probability of the second entry in the
# classifier's classes_ -- presumably label 1; confirm.
preds = feature_processing.predict_proba(X_test)[:, 1]

## Trying XGBoost

In [None]:
import xgboost as xgb
from sklearn.metrics import mean_squared_error  # not used in the cells visible here

# Small gradient-boosted regressor set up as an experiment.
# NOTE(review): 'reg:linear' is a deprecated alias of 'reg:squarederror' in
# newer XGBoost releases -- check against the installed version.
# NOTE(review): the label is scored with ROC AUC elsewhere in this notebook;
# a classifier objective may be more appropriate -- confirm intent.
xg_reg = xgb.XGBRegressor(
    objective='reg:linear',
    n_estimators=10,        # number of boosting rounds
    max_depth=5,            # maximum depth of each tree
    learning_rate=0.1,
    colsample_bytree=0.3,   # fraction of columns sampled per tree
    alpha=10,               # L1 regularisation weight
)

In [None]:
# Fit the XGBoost regressor on the training split.
# NOTE(review): X_train comes straight from train_test_split and does not go
# through `feats`, so any string/object columns would presumably be rejected
# by XGBoost -- confirm, or route the data through the feature pipeline first.
xg_reg.fit(X_train,y_train)

## End of the XGBoost experiment

In [None]:
# Inspect the logistic-regression pipeline's predicted probabilities.
preds

In [None]:
# Sanity check: the number of predictions must equal the number of rows in y_test (next cell).
preds.shape

In [None]:
# Sanity check: size of the held-out label vector, to compare with preds.shape.
y_test.shape

In [None]:
# Area under the ROC curve of the predicted scores against the held-out labels.
roc_auc_score(y_true=y_test, y_score=preds)

In [None]:
# Final ("posta" = real) predictions: score the rows in `to_predict`, i.e. the
# events of persons that have no entry in the labels file.
# Column 1 of predict_proba is the probability of the second entry in the
# classifier's classes_ -- presumably label 1; confirm.
preds_posta = feature_processing.predict_proba(to_predict)[:,1]

In [None]:
# Inspect the predictions for the unlabelled rows.
preds_posta

In [None]:
# Submission frame: one row per scored event, pairing each person id with the
# predicted probability for that row. The index is taken from
# `to_predict.person`; `preds_posta` is aligned to it positionally.
to_publish = pd.DataFrame({
    'person': to_predict.person,
    'label': preds_posta,
})

In [None]:
# Collapse to one row per person by averaging that person's per-row
# predictions, then write the submission file (without the index column).
(
    to_publish
    .groupby('person', as_index=False)
    .mean()
    .to_csv('9_11_2.csv', index=False)
)