### Dealing with imports...

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
# Import Area Under the Receiver Operating Characteristic Curve metric to evaluate results
from sklearn.metrics import roc_auc_score
from sklearn.pipeline import Pipeline

In [2]:
from sklearn import preprocessing
from sklearn.feature_extraction import DictVectorizer
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_extraction import FeatureHasher
from sklearn.preprocessing import LabelBinarizer

### Loading data...
The files loaded are generated by the notebook `process_dates_screen_starge.ipynb`. 

In [8]:
dtypes = {
    "timestamp":object,
    "event":object,
    "person":object,
    "url":object,
    "sku":float,
    "model":object,
    "condition":object,
    "storage":float,
    "color":object,
    "skus":object,
    "search_term":object,
    "staticpage":object,
    "campaign_source":object,
    "search_engine":object,
    "channel":object,
    "new_vs_returning":object,
    "city":object,
    "region":object,
    "country":object,
    "device_type":object,
    "operating_system_version":object,
    "browser_version":object,
    "label":int,
    "year":int,
    "year_month_day":object,
    "month_sin":float,
    "month_cos":float,
    "day_sin":float,
    "day_cos":float,
    "weekday_sin":float,
    "weekday_cos":float,
    "hour_sin":float,
    "hour_cos":float,
    "screen_width":int,
    "screen_height":int
}
train_df = pd.read_csv('data/train_df_processed_screenResol_storage_dates.csv', dtype=dtypes, parse_dates=['timestamp','year_month_day'])

to_predict = pd.read_csv('data/to_predict_processed_screenResol_storage_dates.csv', dtype=dtypes, parse_dates=['timestamp','year_month_day'])

***
## Note on preprocessing
All preprocessing which can be done in just one way, i.e. it doesn't need hyper parameter adjustment, will be done outside pipelines and then stored to a new file, so there will be no need to execute the same code every time we open this notebook again.

Good pipeline resources: 
* http://zacstewart.com/2014/08/05/pipelines-of-featureunions-of-pipelines.html
* https://www.kaggle.com/baghern/a-deep-dive-into-sklearn-pipelines
* https://www.kaggle.com/metadist/work-like-a-pro-with-pipelines-and-feature-unions
* https://www.kaggle.com/sermakarevich/sklearn-pipelines-tutorial
* http://zacstewart.com/2014/08/05/pipelines-of-featureunions-of-pipelines.html
***

### Build some custom transformers

In [10]:
from sklearn.base import BaseEstimator, TransformerMixin

class ColumnSelector(BaseEstimator, TransformerMixin):
    """
    Selector to select the needed columns in the pipeline
    """
    def __init__(self, cols):
        self.cols = cols

    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        return X.loc[:,self.cols]

In [11]:
from sklearn.base import BaseEstimator, TransformerMixin

class NaFiller(BaseEstimator, TransformerMixin):
    """
    Selector to select the needed columns in the pipeline
    """
    def __init__(self, filler):
        self.filler = filler

    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        return X.fillna(self.filler)

# Feature pipeline creation

### Data split

In [12]:
test_size = 0.33
# define a seed, so same experiments output same results every time and experiments between them become comparable
seed = 12

# realizo train_test_split
X_train, X_test, y_train, y_test = train_test_split(train_df.loc[:, train_df.columns != 'label'], 
                                                    train_df.label, 
                                                    test_size=test_size, 
                                                    random_state=seed)

In [13]:
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier

## Pipelines

In [14]:
# from https://www.kaggle.com/baghern/a-deep-dive-into-sklearn-pipelines:
# To make a pipeline, just pass an array of tuples of the format (name, object). The first part is the name of the action, and the second is the actual object. 

screen_res_pipe = Pipeline([
    ('selector', ColumnSelector(['screen_width','screen_height']))
])

storage_pipe = Pipeline([
    ('selector', ColumnSelector(['storage']))
])

times_pipe = Pipeline([
    ('selector', ColumnSelector(['month_sin','month_cos','day_sin','day_cos','weekday_sin','weekday_cos','hour_sin','hour_cos']))
])

browser_pipe = Pipeline([
    ('selector', ColumnSelector(['browser_version'])),
    ('na_filler', NaFiller("")),
    ('hasher', OneHotEncoder(handle_unknown='ignore')) 
])

os_ver_pipe = Pipeline([
    ('selector', ColumnSelector(['operating_system_version'])),
    ('na_filler', NaFiller("")),
    ('hasher', OneHotEncoder(handle_unknown='ignore')) 
])

device_type_pipe = Pipeline([
    ('selector', ColumnSelector(['device_type'])),
    ('na_filler', NaFiller("")),
    ('one_hot', OneHotEncoder(handle_unknown='ignore')) 
])

country_pipe = Pipeline([
    ('selector', ColumnSelector(['country'])),
    ('na_filler', NaFiller("")),
    ('one_hot', OneHotEncoder(handle_unknown='ignore')) 
])

region_pipe = Pipeline([
    ('selector', ColumnSelector(['region'])),
    ('na_filler', NaFiller("")),
    ('one_hot', OneHotEncoder(handle_unknown='ignore'))
])

city_pipe = Pipeline([
    ('selector', ColumnSelector(['city'])),
    ('na_filler', NaFiller("")),
    ('one_hot', OneHotEncoder(handle_unknown='ignore'))
])

new_vs_returning_pipe = Pipeline([
    ('selector', ColumnSelector(['new_vs_returning'])),
    ('na_filler', NaFiller("")),
    ('one_hot', OneHotEncoder(handle_unknown='ignore'))
])

channel_pipe = Pipeline([
    ('selector', ColumnSelector(['channel'])),
    ('na_filler', NaFiller("")),
    ('one_hot', OneHotEncoder(handle_unknown='ignore'))
])

search_engine_pipe = Pipeline([
    ('selector', ColumnSelector(['search_engine'])),
    ('na_filler', NaFiller("")),
    ('one_hot', OneHotEncoder(handle_unknown='ignore'))
])

campaign_source_pipe = Pipeline([
    ('selector', ColumnSelector(['campaign_source'])),
    ('na_filler', NaFiller("")),
    ('one_hot', OneHotEncoder(handle_unknown='ignore'))
])

staticpage_pipe = Pipeline([
    ('selector', ColumnSelector(['staticpage'])),
    ('na_filler', NaFiller("")),
    ('one_hot', OneHotEncoder(handle_unknown='ignore'))
])

search_term_pipe = Pipeline([
    ('selector', ColumnSelector(['search_term'])),
    ('na_filler', NaFiller("")),
    ('one_hot', OneHotEncoder(handle_unknown='ignore'))
])

skus_pipe = Pipeline([
    ('selector', ColumnSelector(['skus'])),
    ('na_filler', NaFiller("")),
    ('one_hot', OneHotEncoder(handle_unknown='ignore'))
])

color_pipe = Pipeline([
    ('selector', ColumnSelector(['color'])),
    ('na_filler', NaFiller("")),
    ('one_hot', OneHotEncoder(handle_unknown='ignore'))
])

condition_pipe = Pipeline([
    ('selector', ColumnSelector(['condition'])),
    ('na_filler', NaFiller("")),
    ('one_hot', OneHotEncoder(handle_unknown='ignore'))
])

model_pipe = Pipeline([
    ('selector', ColumnSelector(['model'])),
    ('na_filler', NaFiller("")),
    ('one_hot', OneHotEncoder(handle_unknown='ignore'))
])

sku_pipe = Pipeline([
    ('selector', ColumnSelector(['sku'])),
    ('na_filler', NaFiller(0)),
    ('one_hot', OneHotEncoder(handle_unknown='ignore'))
])

url_pipe = Pipeline([
    ('selector', ColumnSelector(['url'])),
    ('na_filler', NaFiller("")),
    ('one_hot', OneHotEncoder(handle_unknown='ignore'))
])

person_pipe = Pipeline([
    ('selector', ColumnSelector(['person'])),
    ('na_filler', NaFiller("")),
    ('one_hot', OneHotEncoder(handle_unknown='ignore'))
])

event_pipe = Pipeline([
    ('selector', ColumnSelector(['event'])),
    ('na_filler', NaFiller("")),
    ('one_hot', OneHotEncoder(handle_unknown='ignore'))
])

In [15]:
from sklearn.pipeline import FeatureUnion

feats = FeatureUnion([
    ('time_related', times_pipe),
    ('storage', storage_pipe),
    ('screen', screen_res_pipe),
    ('browser', browser_pipe),
    ('os', os_ver_pipe),
    ('device_type', device_type_pipe),
    ('country', country_pipe),
    ('region', region_pipe),
    ('city', city_pipe),
    ('new_vs_ret', new_vs_returning_pipe),
    ('channel', channel_pipe),
    ('search_eng', search_engine_pipe),
    ('campaign', campaign_source_pipe),
    ('staticpage', staticpage_pipe),
    ('searchterm', search_term_pipe),
    ('skus', skus_pipe),
    ('color', color_pipe),
    ('condition', condition_pipe),
    ('model', model_pipe),
    ('sku', sku_pipe),
    ('url', url_pipe),
    ('person', person_pipe),
    ('event', event_pipe)
])

feature_processing = Pipeline([
    ('feats', feats),
    ('xg_reg', GradientBoostingClassifier())
])

In [16]:
feature_processing.fit(X_train, y_train)

Pipeline(memory=None,
     steps=[('feats', FeatureUnion(n_jobs=None,
       transformer_list=[('time_related', Pipeline(memory=None,
     steps=[('selector', ColumnSelector(cols=['month_sin', 'month_cos', 'day_sin', 'day_cos', 'weekday_sin', 'weekday_cos', 'hour_sin', 'hour_cos']))])), ('storage', Pipeline(memory=None, steps...    subsample=1.0, tol=0.0001, validation_fraction=0.1,
              verbose=0, warm_start=False))])

In [17]:
preds = feature_processing.predict_proba(X_test)[:,1]

In [18]:
# check shape of predictions
if (preds.shape == y_test.shape):
    print('shapes OK')

shapes OK


In [100]:
# prev: 0.8886417505271881
#       0.7675200441464977
roc_auc_score(y_test,preds)

0.7675200441464977

## Export area

In [101]:
preds_posta = feature_processing.predict_proba(to_predict)[:,1]

In [128]:
preds_posta

array([0.06769457, 0.06769457, 0.06769457, ..., 0.06769457, 0.06769457,
       0.06769457])

In [129]:
to_publish = pd.DataFrame()

In [130]:
to_publish['person'] = to_predict.person

In [131]:
to_publish['label'] = preds_posta

In [132]:
to_publish.groupby('person', as_index=False).mean().to_csv('predictions/with_numeric_xgb3.csv', index=False)