# Note
This notebook uses the files generated by the notebook `process_dates_screen_storage.ipynb`.

### Dealing with imports...

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
# Import Area Under the Receiver Operating Characteristic Curve metric to evaluate results
from sklearn.metrics import roc_auc_score
from sklearn.pipeline import Pipeline

In [2]:
from sklearn import preprocessing
from sklearn.feature_extraction import DictVectorizer
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_extraction import FeatureHasher
from sklearn.preprocessing import LabelBinarizer

### Loading data...
The files loaded here are generated by the notebook `process_dates_screen_storage.ipynb`.

In [3]:
# Columns grouped by the dtype they are read with; the groups are merged into
# the single `dtypes` mapping handed to pandas.
_object_columns = [
    "timestamp", "event", "person", "url", "model", "condition", "color",
    "skus", "search_term", "staticpage", "campaign_source", "search_engine",
    "channel", "new_vs_returning", "city", "region", "country", "device_type",
    "operating_system_version", "browser_version", "year_month_day",
]
_float_columns = [
    "sku", "storage",
    "month_sin", "month_cos", "day_sin", "day_cos",
    "weekday_sin", "weekday_cos", "hour_sin", "hour_cos",
]
_int_columns = ["label", "year", "screen_width", "screen_height"]

dtypes = {}
dtypes.update(dict.fromkeys(_object_columns, object))
dtypes.update(dict.fromkeys(_float_columns, float))
dtypes.update(dict.fromkeys(_int_columns, int))


def _read_processed_csv(path):
    """Read one preprocessed CSV using the shared dtypes and date parsing."""
    return pd.read_csv(path, dtype=dtypes, parse_dates=['timestamp', 'year_month_day'])


# Labelled rows used for fitting and evaluation.
train_df = _read_processed_csv('data/train_df_processed_screenResol_storage_dates.csv')

# Rows to score in the export section at the end of the notebook.
to_predict = _read_processed_csv('data/to_predict_processed_screenResol_storage_dates.csv')

***
## Note on preprocessing
All preprocessing that can only be done one way (i.e. it needs no hyperparameter tuning) is done outside the pipelines and stored in a new file, so the same code does not have to be re-run every time this notebook is opened.

Good pipeline resources: 
* http://zacstewart.com/2014/08/05/pipelines-of-featureunions-of-pipelines.html
* https://www.kaggle.com/baghern/a-deep-dive-into-sklearn-pipelines
* https://www.kaggle.com/metadist/work-like-a-pro-with-pipelines-and-feature-unions
* https://www.kaggle.com/sermakarevich/sklearn-pipelines-tutorial
* http://zacstewart.com/2014/08/05/pipelines-of-featureunions-of-pipelines.html
***

### Build some custom transformers

# Feature pipeline creation

### Data split

In [4]:
# Fraction of the rows held out for evaluation.
test_size = 0.33
# Fixed seed so every run produces the same split and experiments stay comparable.
seed = 12

# Separate the target column from the rest, then perform the train/test split.
# NOTE(review): the split is row-wise. `person` is later used as a model feature
# and predictions are averaged per person on export, so rows of one person may
# end up in both sets -- confirm this is intended.
features = train_df.loc[:, train_df.columns != 'label']
target = train_df.label
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=test_size,
    random_state=seed,
)

## Pipelines

In [5]:
from sklearn.feature_extraction.text import HashingVectorizer

from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier

from sklearn.compose import ColumnTransformer

from sklearn.impute import SimpleImputer

from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.feature_extraction.text import TfidfTransformer

from sklearn.feature_extraction.text import CountVectorizer

from sklearn.decomposition import TruncatedSVD

In [6]:
# hv = FeatureHasher(input_type='string')
# hv.fit_transform(X_train.event.fillna(""))

In [25]:
# Categorical columns with a limited set of values: missing entries are
# replaced by the column's most frequent value, then the column is one-hot
# encoded. handle_unknown='ignore' keeps transform from failing on categories
# that were not present during fit.
_most_frequent_imputer = SimpleImputer(strategy='most_frequent')
_one_hot_encoder = OneHotEncoder(handle_unknown='ignore')
limited_categorical_transformer = Pipeline(steps=[
    ("imputer", _most_frequent_imputer),
    ("one_hot", _one_hot_encoder),
])

In [8]:
# Categorical columns with many distinct values: impute with the most frequent
# value, then hash the resulting strings instead of one-hot encoding them.
# NOTE(review): with input_type='string' the hasher presumably treats each row
# as a bag of strings, so equal values coming from different columns would share
# a bucket -- confirm this is acceptable.
# The step name "hashing_trci" is kept as-is because it is a lookup key.
_most_frequent_imputer = SimpleImputer(strategy='most_frequent')
_string_hasher = FeatureHasher(input_type='string')
large_categorical_transformer = Pipeline(steps=[
    ("imputer", _most_frequent_imputer),
    ("hashing_trci", _string_hasher),
])

In [144]:
# tf = TfidfVectorizer()
# tf.fit_transform(X_train.color.fillna(""), y_train)

<785163x49 sparse matrix of type '<class 'numpy.float64'>'
	with 549665 stored elements in Compressed Sparse Row format>

In [39]:
# The text columns below are fed to TfidfVectorizer, which cannot handle NaN
# documents, so missing values are replaced with the empty string.
#
# All three frames are filled here (previously only X_test was active and the
# other lines were commented out), so that a fresh "Restart & Run All" can fit on
# X_train and predict on X_test / to_predict without depending on earlier manual
# runs of this cell. `fillna` with a per-column dict returns a new frame, which
# avoids in-place mutation of a train_test_split slice and makes the cell safe
# to re-run.
text_fill_values = {"model": "", "color": "", "search_term": ""}

X_train = X_train.fillna(text_fill_values)
X_test = X_test.fillna(text_fill_values)
to_predict = to_predict.fillna(text_fill_values)
# to_predict.search_term.fillna("",inplace=True)

In [194]:
# Single-step TF-IDF pipeline with default vectorizer settings.
# A constant-fill SimpleImputer step (fill_value="") was tried here and is
# currently disabled.
_tf_idf_steps = [
    ("tf_idf", TfidfVectorizer()),
]
tf_idf = Pipeline(steps=_tf_idf_steps)

In [176]:
# Second single-step TF-IDF pipeline, configured exactly like `tf_idf`.
# A constant-fill SimpleImputer step (fill_value="") was tried here and is
# currently disabled.
_tf_idf2_steps = [
    ("tf_idf", TfidfVectorizer()),
]
tf_idf2 = Pipeline(steps=_tf_idf2_steps)

In [200]:
# TF-IDF pipeline intended to be followed by dimensionality reduction.
# Both the constant-fill imputer and the TruncatedSVD ('best') step are
# currently disabled, so this is a plain default TfidfVectorizer for now.
_tf_idf_reduced_steps = [
    ("tf_idf", TfidfVectorizer()),
]
tf_idf_reduced = Pipeline(steps=_tf_idf_reduced_steps)

In [172]:
# Number of distinct values in `model` (a missing value counts as one).
X_train["model"].nunique(dropna=False)

197

In [173]:
# Number of distinct values in `color` (a missing value counts as one).
X_train["color"].nunique(dropna=False)

64

In [174]:
# Number of distinct values in `search_term` (a missing value counts as one).
X_train["search_term"].nunique(dropna=False)

5285

In [26]:
# Column groups, one per preprocessing branch of the ColumnTransformer.
_limited_categorical_columns = [
    "event", "condition", "staticpage", "campaign_source", "search_engine",
    "channel", "new_vs_returning", "device_type", "operating_system_version",
    "browser_version",
]
_large_categorical_columns = ["person", "url", "skus", "city", "region", "country"]
_passthrough_columns = [
    "storage", "sku", "year",
    "month_sin", "month_cos", "day_sin", "day_cos",
    "weekday_sin", "weekday_cos", "hour_sin", "hour_cos",
    "screen_width", "screen_height",
]

# The text columns are given as plain strings (not lists) so that each
# TfidfVectorizer receives a 1-D column of documents.
_column_branches = [
    ("lim_cat", limited_categorical_transformer, _limited_categorical_columns),
    ("large_cat", large_categorical_transformer, _large_categorical_columns),
    ("tf_idf", TfidfVectorizer(), "model"),
    ("tf_idf2", TfidfVectorizer(), "color"),
    ("tf_idf_reduced", TfidfVectorizer(), "search_term"),
    ("passthrough", 'passthrough', _passthrough_columns),
]

# Columns not listed in any branch are not forwarded to the model.
ct = ColumnTransformer(_column_branches, n_jobs=-1)

In [27]:
# End-to-end model: column preprocessing followed by an XGBoost classifier
# with default hyperparameters.
_model_steps = [
    ('preproc', ct),
    ('xg_reg', XGBClassifier()),
]
feature_processing = Pipeline(steps=_model_steps)

In [28]:
# Fit the preprocessing and the classifier on the training split.
feature_processing.fit(X=X_train, y=y_train)

Pipeline(memory=None,
     steps=[('preproc', ColumnTransformer(n_jobs=-1, remainder='drop', sparse_threshold=0.3,
         transformer_weights=None,
         transformers=[('lim_cat', Pipeline(memory=None,
     steps=[('imputer', SimpleImputer(copy=True, fill_value=None, missing_values=nan,
       strategy='most_frequent', v...
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1))])

In [40]:
# predict_proba returns one column per class; keep the second column
# (index 1) as the score for each test row.
_test_probabilities = feature_processing.predict_proba(X_test)
preds = _test_probabilities[:, 1]

In [41]:
# Sanity check: there must be exactly one prediction per test label.
# Fail loudly on a mismatch instead of silently printing nothing.
assert preds.shape == y_test.shape, (
    "prediction shape %s does not match target shape %s"
    % (preds.shape, y_test.shape)
)
print('shapes OK')

shapes OK


In [63]:
# Scores from previous runs:
#       0.8886417505271881
#       0.7675200441464977
#       0.8501359056263651
#       0.9994019683472557    logistic regression with browser and OS preprocessing
#       0.8501359056263651    same as above, but with xgb
#       0.8503577934604342    xgb with:
#                                     ct = ColumnTransformer([
#                                         ("lim_cat",limited_categorical_transformer,["event","condition","staticpage",
#                                                                                     "campaign_source","search_engine",
#                                                                                     "channel","new_vs_returning","device_type",
#                                                                                     "operating_system_version","browser_version"]),
#                                         ("large_cat",large_categorical_transformer,["person","url","skus","city","region","country"]),
#                                         ("tf_idf",TfidfVectorizer(),"model"),
#                                         ("tf_idf2",TfidfVectorizer(),"color"),
#                                         ("tf_idf_reduced",TfidfVectorizer(),"search_term"),
#                                         ("passthrough",'passthrough',["storage","sku","year","month_sin","month_cos","day_sin","day_cos","weekday_sin",
#                                                                       "weekday_cos","hour_sin","hour_cos","screen_width","screen_height"])
#                                     ],n_jobs=-1)
# NOTE(review): the 0.9994 logistic-regression score is far above the other
# runs -- worth checking for leakage before trusting it.
roc_auc_score(y_true=y_test, y_score=preds)

0.8503577934604342

In [59]:
# Inspect the learned IDF weights of the vectorizer fitted on the `model` column.
_fitted_preproc = feature_processing.named_steps['preproc']
_model_vectorizer = _fitted_preproc.named_transformers_['tf_idf']
_model_vectorizer.idf_

array([ 9.61080326,  9.63917396,  5.38126785,  5.26255762, 10.44651351,
        6.05585488,  4.86684455,  6.02031565,  5.4326576 ,  4.11716643,
        3.84202346,  7.11950681,  5.17301779,  5.45039163,  7.32017751,
        7.53398754,  7.79486299,  7.61900903,  7.32514382,  8.26919909,
        7.32017751, 13.18735353, 10.93606173,  8.33532327,  7.25378296,
        7.82137752,  9.12260944,  9.16647612,  8.97522593,  7.60573869,
        9.7945244 ,  6.62526261,  5.09343307,  4.75099086,  7.56333602,
        8.58468648,  7.94693014,  4.20300451,  6.28763042,  6.8778899 ,
        9.63917396,  5.45356051,  6.95884253, 11.62920891, 13.4750356 ,
        6.10501489,  5.5541049 ,  5.04558133,  5.46622657,  8.40823004,
        2.5315953 ,  9.27533053,  9.27034298,  8.57471133,  6.21334246,
        8.13589624, 12.37642331,  8.31981908, 10.5482962 , 10.019771  ,
        8.78982271,  8.39778062,  7.87537984,  6.74084038,  8.13589624,
        7.67594295, 13.18735353,  9.81147396,  7.58615765,  7.86

## Export area

In [43]:
# Score the rows to be submitted; keep the second predict_proba column
# (index 1), as done for the test split.
_submission_probabilities = feature_processing.predict_proba(to_predict)
preds_posta = _submission_probabilities[:, 1]

In [44]:
# Quick look at the predicted probabilities for the rows to be submitted.
preds_posta

array([0.06563842, 0.06563842, 0.06563842, ..., 0.07163502, 0.07163502,
       0.07163502], dtype=float32)

In [45]:
# Start from an empty frame; the submission columns are added in the next cells.
to_publish = pd.DataFrame()

In [46]:
# Identifier column: one entry per row of `to_predict`.
to_publish['person'] = to_predict['person']

In [47]:
# Attach the row-level predicted probabilities as the `label` column.
to_publish = to_publish.assign(label=preds_posta)

In [48]:
# Collapse the row-level predictions to one value per person (mean of the
# predicted probabilities) and write the result without the index column.
_mean_label_per_person = to_publish.groupby('person', as_index=False).mean()
_mean_label_per_person.to_csv('predictions/30.nov@10.20.csv', index=False)