# Note
This notebook uses the files generated by the notebook `process_dates_screen_storage.ipynb`.

### Dealing with imports...

In [55]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
# Import Area Under the Receiver Operating Characteristic Curve metric to evaluate results
from sklearn.metrics import roc_auc_score
from sklearn.pipeline import Pipeline

In [56]:
from sklearn import preprocessing
from sklearn.feature_extraction import DictVectorizer
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_extraction import FeatureHasher
from sklearn.preprocessing import LabelBinarizer

### Loading data...
The files loaded here are generated by the notebook `process_dates_screen_storage.ipynb`.

In [132]:
# Labelled rows used for fitting and local evaluation.
train_csv_path = '../data/train_df_processed_screenResol_storage_dates.csv'
# Unlabelled rows for which predictions are exported at the end of the notebook.
to_predict_csv_path = '../data/to_predict_processed_screenResol_storage_dates.csv'

train_df = pd.read_csv(train_csv_path)
to_predict = pd.read_csv(to_predict_csv_path)

In [151]:
# Make sure the free-text columns hold plain strings: some processing steps fail
# otherwise, and apparently at least one value in them is not a string.
#
# The cast is applied to the freshly loaded frames instead of X_train / X_test,
# because those splits are only created further down by train_test_split; on a
# top-to-bottom run they do not exist yet at this point. Both splits are derived
# from train_df, so they inherit the cast.
text_columns = ['model', 'color', 'search_term', 'url']
for frame in (train_df, to_predict):
    for column in text_columns:
        frame[column] = frame[column].apply(str)

***
## Note on preprocessing
All preprocessing that can only be done in one way (i.e. it needs no hyperparameter tuning) is done outside the pipelines and stored in a new file, so the same code does not have to be re-executed every time this notebook is opened.

Good pipeline resources: 
* http://zacstewart.com/2014/08/05/pipelines-of-featureunions-of-pipelines.html
* https://www.kaggle.com/baghern/a-deep-dive-into-sklearn-pipelines
* https://www.kaggle.com/metadist/work-like-a-pro-with-pipelines-and-feature-unions
* https://www.kaggle.com/sermakarevich/sklearn-pipelines-tutorial
* http://zacstewart.com/2014/08/05/pipelines-of-featureunions-of-pipelines.html
***

### Build some custom transformers

# Feature pipeline creation

### Data split

In [133]:
# Fraction of the labelled rows held out for local evaluation.
test_size = 0.33
# Fixed seed so that repeated experiments produce the same split and their
# scores stay comparable with each other.
seed = 12

# Separate the predictors from the target, then perform the train/test split.
features = train_df.drop(columns='label')
target = train_df['label']
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=test_size,
    random_state=seed,
)

## Pipelines

In [279]:
from sklearn.feature_extraction.text import HashingVectorizer

from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.compose import ColumnTransformer
from sklearn.compose import make_column_transformer

from sklearn.impute import SimpleImputer

from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.feature_extraction.text import TfidfTransformer

from sklearn.feature_extraction.text import CountVectorizer

from sklearn.decomposition import TruncatedSVD

from sklearn.pipeline import FeatureUnion

In [135]:
# List the object-dtype columns of the training frame.
train_df.select_dtypes(include='object').columns

Index(['person', 'url', 'model', 'color', 'skus', 'search_term', 'city',
       'region', 'country'],
      dtype='object')

In [136]:
# Categorical encoder: fill missing values with the most frequent value of the
# column, then one-hot encode. Categories not seen during fit are ignored at
# transform time instead of raising.
# NOTE(review): the name suggests it is meant for low-cardinality columns - confirm.
limited_categorical_transformer = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('one_hot', OneHotEncoder(handle_unknown='ignore')),
    ]
)

In [137]:
# Categorical encoder based on the hashing trick: fill missing values with the
# most frequent value of the column, then hash the string values.
# NOTE(review): with input_type='string' every value in a row is presumably
# hashed as a bare token, so the same value appearing in two different columns
# would map to the same feature - confirm this is intended.
large_categorical_transformer = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('hashing_trick', FeatureHasher(input_type='string')),
    ]
)

In [251]:
# Text pipeline: a single TfidfVectorizer with default settings.
# Disabled alternatives kept for reference (none of them is active):
#   - ("imputer", SimpleImputer(strategy='constant', fill_value="")) as a first step
#   - ('vect', CountVectorizer(ngram_range=(1,1), binary=True, min_df=3, lowercase=False))
#     followed by ('tfidf', TfidfTransformer()) instead of TfidfVectorizer
#   - ('best', TruncatedSVD()) as a final step
tf_idf = Pipeline(
    steps=[
        ('tf_idf', TfidfVectorizer()),
    ]
)

In [258]:
# Columns routed through the hashing-trick encoder.
hashed_columns = ["person", "skus", "city", "region", "country"]

# Column-wise preprocessing. Only the hashed categorical columns are used; as no
# `remainder` is given, every other column is dropped (the ColumnTransformer default).
# Disabled alternatives kept for reference (none of them is active):
#   - ("tf_idf", FeatureHasher(n_features=30, input_type='string'), ['model','color','search_term','url'])
#   - ("tf_idf", tf_idf, ['color'])
#   - remainder='passthrough'
ct = ColumnTransformer(
    transformers=[
        ("large_cat", large_categorical_transformer, hashed_columns),
    ],
    n_jobs=-1,
)

In [140]:
# X_train.color.fillna("", inplace=True)
# X_train.model.fillna("", inplace=True)

In [272]:
import atexit
import shutil
from tempfile import mkdtemp

# Scratch directory handed to the pipeline's `memory` argument so fitted
# transformers can be cached on disk between fits.
cachedir = mkdtemp()


def remove_cachedir(path=cachedir):
    """Delete the pipeline cache directory; safe to call more than once.

    mkdtemp() never removes what it creates, so without this every kernel
    session would leave a cache directory behind in the temp folder.
    """
    shutil.rmtree(path, ignore_errors=True)


# Clean up automatically when the interpreter (kernel) shuts down.
atexit.register(remove_cachedir)

In [None]:
# End-to-end model: column preprocessing followed by an XGBoost classifier.
# `memory=cachedir` lets the pipeline cache the fitted preprocessing step on disk.
# (A stray `]))` line previously left the brackets unbalanced, making this cell
# a SyntaxError.)
feature_processing = Pipeline([
    ('preproc', ct),
    ('predict', XGBClassifier())
], memory=cachedir)

In [275]:
# Fit on the training split with the free-text columns removed.
train_features = X_train.drop(columns=['model', 'color', 'search_term', 'url'])
feature_processing.fit(train_features, y_train)

Pipeline(memory='/tmp/tmpzbqjdrbw',
     steps=[('preproc', ColumnTransformer(n_jobs=-1, remainder='drop', sparse_threshold=0.3,
         transformer_weights=None,
         transformers=[('large_cat', Pipeline(memory=None,
     steps=[('imputer', SimpleImputer(copy=True, fill_value=None, missing_values=nan,
       strategy='most_frequent',...obs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False))])

In [276]:
# Score the held-out split (free-text columns removed, as for fitting) and keep
# the second column of predict_proba, i.e. the score later passed to roc_auc_score.
test_features = X_test.drop(columns=['model', 'color', 'search_term', 'url'])
test_probabilities = feature_processing.predict_proba(test_features)
preds = test_probabilities[:, 1]

In [277]:
# Sanity check: there must be exactly one prediction per held-out label.
# Fail loudly on a mismatch instead of silently printing nothing.
assert preds.shape == y_test.shape, (
    'prediction shape {} does not match label shape {}'.format(preds.shape, y_test.shape)
)
print('shapes OK')

shapes OK


In [278]:
# Scores recorded from previous experiments:
# prev: 0.8886417505271881
#       0.7675200441464977
#       0.8501359056263651
#       0.9994019683472557    logistic regression with browser and OS preprocessing.
#                             NOTE(review): suspiciously close to 1.0 - possible target leakage, confirm.
#       0.8501359056263651    same as above but with xgb
#       0.8503577934604342    xgb with:
#                                     ct = ColumnTransformer([
#                                         ("lim_cat",limited_categorical_transformer,["event","condition","staticpage",
#                                                                                     "campaign_source","search_engine",
#                                                                                     "channel","new_vs_returning","device_type",
#                                                                                     "operating_system_version","browser_version"]),
#                                         ("large_cat",large_categorical_transformer,["person","url","skus","city","region","country"]),
#                                         ("tf_idf",TfidfVectorizer(),"model"),
#                                         ("tf_idf2",TfidfVectorizer(),"color"),
#                                         ("tf_idf_reduced",TfidfVectorizer(),"search_term"),
#                                         ("passthrough",'passthrough',["storage","sku","year","month_sin","month_cos","day_sin","day_cos","weekday_sin",
#                                                                       "weekday_cos","hour_sin","hour_cos","screen_width","screen_height"])
#                                     ],n_jobs=-1)
#       0.8492173710139364    [best kaggle score]-> 0.84537
#                             XGBClassifier using only the numeric features of the new "superdf" frames (a hundred-odd features).
# Area under the ROC curve on the held-out split for the current pipeline.
roc_auc_score(y_true=y_test, y_score=preds)

0.49791264437895877

In [23]:
# feature_processing.named_steps['preproc'].named_transformers_['tf_idf'].idf_

## Export area

In [206]:
# Score the rows to be predicted (free-text columns removed, as for fitting) and
# keep the second column of predict_proba, as done for the held-out split.
submission_features = to_predict.drop(columns=['model', 'color', 'search_term', 'url'])
submission_probabilities = feature_processing.predict_proba(submission_features)
preds_posta = submission_probabilities[:, 1]

In [207]:
# Eyeball the predicted probabilities (NumPy elides the middle of long arrays).
preds_posta

array([0.03057893, 0.07978979, 0.03293054, ..., 0.07043858, 0.11072487,
       0.00669352], dtype=float32)

In [208]:
# Submission frame: one row per scored event, holding the person id and the
# predicted probability. Built in a single constructor call rather than by
# growing an empty DataFrame column by column across several cells, so that
# re-running this cell always yields the same, complete frame.
to_publish = pd.DataFrame({
    'person': to_predict.person,
    'label': preds_posta,
})

In [211]:
# Row/column count of the submission frame before the per-person aggregation below.
to_publish.shape

(19415, 2)

In [212]:
# Collapse to one row per person by averaging that person's predictions, then
# write the submission file without the index column.
per_person_predictions = to_publish.groupby('person', as_index=False).mean()
per_person_predictions.to_csv('../predictions/2.dic@02.50.csv', index=False)