# Note
This notebook uses the files generated by the notebook `process_dates_screen_storage.ipynb`.

### Dealing with imports...

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
# Import Area Under the Receiver Operating Characteristic Curve metric to evaluate results
from sklearn.metrics import roc_auc_score
from sklearn.pipeline import Pipeline

In [None]:
from sklearn import preprocessing
from sklearn.feature_extraction import DictVectorizer
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_extraction import FeatureHasher
from sklearn.preprocessing import LabelBinarizer

### Loading data...
The files loaded here are generated by the notebook `process_dates_screen_storage.ipynb`.

In [None]:
# Labelled training data (the `label` column is the target used further down).
train_df = pd.read_csv('../data/train_df_processed_screenResol_storage_dates.csv')

# Rows that are scored at the end of the notebook to build the submission file.
to_predict = pd.read_csv('../data/to_predict_processed_screenResol_storage_dates.csv')

***
## Note on preprocessing
All preprocessing that can only be done in one way (i.e. it needs no hyperparameter tuning) is done outside the pipelines and stored in a new file, so the same code does not have to be re-executed every time this notebook is opened.

Good pipeline resources: 
* http://zacstewart.com/2014/08/05/pipelines-of-featureunions-of-pipelines.html
* https://www.kaggle.com/baghern/a-deep-dive-into-sklearn-pipelines
* https://www.kaggle.com/metadist/work-like-a-pro-with-pipelines-and-feature-unions
* https://www.kaggle.com/sermakarevich/sklearn-pipelines-tutorial
* http://zacstewart.com/2014/08/05/pipelines-of-featureunions-of-pipelines.html
***

### Build some custom transformers

# Feature pipeline creation

### Data split

In [None]:
# Fraction of the labelled rows held out for validation.
test_size = 0.33
# Fixed seed: repeated runs produce the same split, so experiments stay comparable.
seed = 12

# Split features (every column except 'label') and the target into
# training and validation parts.
X_train, X_test, y_train, y_test = train_test_split(
    train_df.drop('label', axis=1),
    train_df['label'],
    test_size=test_size,
    random_state=seed,
)

In [None]:
# Make sure the free-text columns hold only `str` values, because some of the
# text processing fails otherwise (apparently at least one value is not a
# string -- presumably a missing value).
# One loop over (frame, column) pairs replaces twelve copy-pasted statements,
# and item access is used instead of attribute access, which is the supported
# way to assign DataFrame columns.
for _frame in (X_train, X_test, train_df):
    for _col in ('model', 'color', 'search_term', 'url'):
        _frame[_col] = _frame[_col].apply(str)

## Pipelines

## Keras

In [None]:
# from https://stackoverflow.com/questions/41032551/how-to-compute-receiving-operating-characteristic-roc-and-auc-in-keras
import tensorflow as tf
def as_keras_metric(method):
    """Adapt a TensorFlow metric function for use as a Keras metric.

    `method` must return a `(value, update_op)` pair when called (that is how
    its result is unpacked below). The returned wrapper forwards its arguments
    to `method` and returns only `value`, with `update_op` attached as a
    control dependency. `functools.wraps` copies `method`'s metadata
    (name, docstring) onto the wrapper.
    """
    import functools
    from keras import backend as K
    import tensorflow as tf
    @functools.wraps(method)
    def wrapper(self, args, **kwargs):
        """ Wrapper for turning tensorflow metrics into keras metrics """
        # NOTE(review): `self` and `args` are simply the first two positional
        # arguments forwarded to `method` -- presumably y_true and y_pred when
        # Keras invokes the metric; confirm.
        value, update_op = method(self, args, **kwargs)
        # Initialise TensorFlow's local variables in the session Keras is using
        # (presumably where the metric keeps its running state -- confirm).
        K.get_session().run(tf.local_variables_initializer())
        # Tie `value` to `update_op` so that evaluating the returned tensor
        # also triggers the metric update.
        with tf.control_dependencies([update_op]):
            value = tf.identity(value)
        return value
    return wrapper

# Keras-usable wrappers around TensorFlow's AUC and recall metrics.
auc_roc = as_keras_metric(tf.metrics.auc)
recall = as_keras_metric(tf.metrics.recall)

In [None]:
from keras.models import Sequential
from keras.preprocessing.text import Tokenizer

tokenizer = Tokenizer()

# Learn the vocabulary from the training search terms only.
tokenizer.fit_on_texts(X_train.search_term)

# from https://rdrr.io/github/rstudio/keras/man/sequences_to_matrix.html
# supported modes: "binary", "count", "tfidf", "freq"
#
# texts_to_matrix returns a 2-D array (one row per text, one column per
# vocabulary entry), which cannot be stored in the single `search_term`
# column. Keep it in its own variable instead; this also leaves X_train
# untouched, so the cell can be re-run without re-tokenizing its own output.
search_term_tfidf = tokenizer.texts_to_matrix(X_train.search_term, mode='tfidf')

## End Keras

In [None]:
from sklearn.feature_extraction.text import HashingVectorizer

from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn import linear_model

from sklearn.compose import ColumnTransformer
from sklearn.compose import make_column_transformer

from sklearn.impute import SimpleImputer

from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.feature_extraction.text import TfidfTransformer

from sklearn.feature_extraction.text import CountVectorizer

from sklearn.decomposition import TruncatedSVD

from sklearn.pipeline import FeatureUnion

In [None]:
# Show the names of train_df's object-dtype columns.
train_df.select_dtypes('object').columns

In [None]:
# Categorical columns with few distinct values: fill gaps with the most
# frequent value, then one-hot encode (categories unseen at fit time are ignored).
limited_categorical_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy='most_frequent')),
    ("one_hot", OneHotEncoder(handle_unknown='ignore')),
])

# Categorical columns with many distinct values: fill gaps with the most
# frequent value, then apply the hashing trick.
large_categorical_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy='most_frequent')),
    ("hashing_trick", FeatureHasher(input_type='string')),
])

# Text columns: plain TF-IDF. Alternatives that were tried and disabled:
#     ("imputer",SimpleImputer(strategy='constant',fill_value="")),
#     ('vect',CountVectorizer(ngram_range=(1,1), binary=True, min_df=3,lowercase=False)),
#     ('tfidf', TfidfTransformer())
#     ('best', TruncatedSVD())
tf_idf = Pipeline(steps=[
    ("tf_idf", TfidfVectorizer()),
])

# No transformer is enabled at the moment, so with remainder='passthrough'
# every input column is forwarded unchanged.
ct = ColumnTransformer(
    transformers=[
        # ("large_cat",large_categorical_transformer,["person","skus","city","region","country"]),
        # ("tf_idf",FeatureHasher(n_features=30,input_type='string'),['model','color','search_term','url'])
        # ("tf_idf",tf_idf,['color'])
    ],
    remainder='passthrough',
    n_jobs=-1,
)

In [None]:
# Replace missing values with 0 in every frame that is fed to the models.
# The frames are rebound rather than mutated with inplace=True: the result is
# the same, but the statement is an explicit assignment and cannot trigger
# chained-assignment warnings on frames produced by the split.
X_train = X_train.fillna(0)
X_test = X_test.fillna(0)
to_predict = to_predict.fillna(0)

In [None]:
# A host of Scikit-learn models
from sklearn.svm import SVC, LinearSVC
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier, BaggingClassifier, ExtraTreesClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier, ExtraTreeClassifier
from sklearn.kernel_approximation import Nystroem
from sklearn.kernel_approximation import RBFSampler
from sklearn.pipeline import make_pipeline

SEED = seed

def get_models():
    """Generate a library of base learners.

    Returns a dict mapping a display name to a fresh, unfitted classifier.
    The dict's insertion order is the order in which callers iterate the
    learners, so it must stay stable between fitting and predicting.

    Every estimator that accepts ``random_state`` is seeded with SEED, so
    repeated runs give comparable results (previously only some were seeded).
    """
    nb = GaussianNB()
    # probability=True is required for predict_proba; its internal
    # cross-validation is randomized, hence the seed.
    svc = SVC(C=100, probability=True, random_state=SEED)
    knn = KNeighborsClassifier(n_neighbors=10, metric='chebyshev')
    lr = LogisticRegression(solver='saga', C=100, random_state=SEED, n_jobs=-1)
    nn = MLPClassifier((80, 10), early_stopping=True, random_state=SEED)
    gb = GradientBoostingClassifier(n_estimators=100, random_state=SEED)
    xgb = XGBClassifier(random_state=SEED)
    ada = AdaBoostClassifier(random_state=SEED)
    bag = BaggingClassifier(n_jobs=-1, random_state=SEED)
    extra_t = ExtraTreesClassifier(n_jobs=-1, random_state=SEED)
    dec_tree = DecisionTreeClassifier(random_state=SEED)
    ext_tree = ExtraTreeClassifier(random_state=SEED)
    # NOTE(review): scikit-learn >= 1.1 spells this loss 'log_loss' and later
    # releases removed 'log'; left unchanged to match the environment this
    # notebook was written for.
    sgd = SGDClassifier(n_jobs=-1, loss='log', random_state=SEED)
    rf = RandomForestClassifier(n_estimators=10, max_features=3, random_state=SEED, n_jobs=-1)

    models = {'svm': svc,
              'knn': knn,
              'naive bayes': nb,
              'mlp-nn': nn,
              'random forest': rf,
              'gbm': gb,
              'xgb': xgb,
              'ada': ada,
              'bag': bag,
              'extra_tree': extra_t,
              'dec_tree': dec_tree,
              'ext_tree': ext_tree,
              'sgd': sgd,
              'logistic': lr,
              }

    return models


def train_predict(model_list):
    """Fit each model on the training split and return validation predictions.

    Parameters
    ----------
    model_list : dict
        Mapping of display name -> unfitted classifier (a dict, despite the
        parameter name). Each classifier must implement ``fit`` and
        ``predict_proba``.

    Returns
    -------
    pandas.DataFrame
        One row per validation sample and one column per model (named after
        the dict key) holding the predicted probability of the positive class.

    Notes
    -----
    Reads the module-level ``X_train``, ``X_test``, ``y_train`` and ``y_test``.
    The loop iterates over the ``model_list`` argument; it previously iterated
    over the global ``models`` and therefore ignored what the caller passed in.
    """
    # Text / high-cardinality columns the models cannot consume directly.
    dropped = ['model', 'color', 'search_term', 'url', 'person', 'skus', 'city', 'region', 'country']
    # Build the feature frames once instead of once per model.
    train_features = X_train.drop(dropped, axis=1)
    test_features = X_test.drop(dropped, axis=1)

    P = np.zeros((y_test.shape[0], len(model_list)))
    P = pd.DataFrame(P)

    print("Fitting models.")
    cols = list()
    for i, (name, m) in enumerate(model_list.items()):
        print("%s..." % name, end=" ", flush=False)
        m.fit(train_features, y_train)
        # Column 1 of predict_proba is the positive class.
        P.iloc[:, i] = m.predict_proba(test_features)[:, 1]
        cols.append(name)
        print("done")

    P.columns = cols
    print("Done.\n")
    return P


def score_models(P, y):
    """Print the ROC-AUC of every prediction column of `P` against labels `y`.

    `P` is a DataFrame with one column of scores per model; nothing is returned.
    """
    print("Scoring models.")
    for model_name in P.columns:
        auc = roc_auc_score(y, P[model_name])
        print("%-26s: %.3f" % (model_name, auc))
    print("Done.\n")

In [None]:
# Fit every base learner on the training split and print each one's
# ROC-AUC on the validation split.
models = get_models()
P = train_predict(models)
score_models(P, y_test)

In [None]:
# A fresh set of unfitted base learners for the ensemble built in the following cells.
base_learners = get_models()

In [None]:
# Meta learner: a gradient-boosting classifier that is later fitted on the
# matrix of base-learner predictions rather than on the raw features.
meta_learner = GradientBoostingClassifier(
    n_estimators=1000,
    loss="exponential",
    max_features=4,
    max_depth=3,
    subsample=0.5,
    learning_rate=0.005, 
    random_state=SEED
)

In [None]:
def train_base_learners(base_learners, inp, out, verbose=True):
    """Fit every estimator in `base_learners` on (`inp`, `out`).

    `base_learners` is a name -> estimator dict; the estimators are fitted in
    place and nothing is returned. With `verbose` a progress line is printed
    per estimator.
    """
    if verbose:
        print("Fitting models.")
    for label, estimator in base_learners.items():
        if verbose:
            print("%s..." % label, end=" ", flush=False)
        estimator.fit(inp, out)
        if verbose:
            print("done")

In [None]:
# Split the training features (text / high-cardinality columns dropped) in
# half: the base learners are fitted on one half, and the other half is used
# to produce the predictions the meta learner is trained on.
xtrain_base, xpred_base, ytrain_base, ypred_base = train_test_split(
    X_train.drop(['model','color','search_term','url','person','skus','city','region','country'],axis=1), y_train, test_size=0.5, random_state=SEED)

In [None]:
# Fit the base learners on the first half of the training split.
train_base_learners(base_learners, xtrain_base, ytrain_base)

In [None]:
def predict_base_learners(pred_base_learners, inp, verbose=True):
    """Build the matrix of base-learner predictions for `inp`.

    `pred_base_learners` is a name -> fitted classifier dict. The result is a
    float array with one row per row of `inp` and one column per learner (in
    dict order) holding that learner's predicted probability for class 1.
    """
    n_rows, n_learners = inp.shape[0], len(pred_base_learners)
    predictions = np.zeros((n_rows, n_learners))

    if verbose:
        print("Generating base learner predictions.")
    for column, (label, estimator) in enumerate(pred_base_learners.items()):
        if verbose:
            print("%s..." % label, end=" ", flush=False)
        # Binary problem: the second probability column is all that is needed.
        predictions[:, column] = estimator.predict_proba(inp)[:, 1]
        if verbose:
            print("done")

    return predictions

In [None]:
# Base-learner predictions on the half of the data they were not fitted on.
P_base = predict_base_learners(base_learners, xpred_base)

In [None]:
# Train the meta learner to map base-learner predictions to the true labels.
meta_learner.fit(P_base, ypred_base)

In [None]:
def ensemble_predict(base_learners, meta_learner, inp, verbose=True):
    """Score `inp` with the full ensemble.

    Returns a pair: the matrix of base-learner predictions, and the meta
    learner's class-1 probability computed from that matrix.
    """
    base_predictions = predict_base_learners(base_learners, inp, verbose=verbose)
    ensemble_scores = meta_learner.predict_proba(base_predictions)[:, 1]
    return base_predictions, ensemble_scores

In [None]:
# Evaluate the ensemble on the validation split (same columns dropped as for training).
P_pred, p = ensemble_predict(base_learners, meta_learner, X_test.drop(['model','color','search_term','url','person','skus','city','region','country'],axis=1))
print("\nEnsemble ROC-AUC score: %.3f" % roc_auc_score(y_test, p))

In [None]:
from sklearn.base import clone

def stacking(base_learners, meta_learner, X, y, generator):
    """Train a stacked ensemble with cross-validated meta features.

    `base_learners` (name -> estimator dict) are fitted on all of `X`/`y` for
    use at prediction time. Separately, for every fold produced by
    `generator.split(X)`, clones of the base learners are fitted on the fold's
    training rows and used to predict its held-out rows; the meta learner is
    then fitted on those held-out predictions. `X` and `y` must support
    positional array indexing (e.g. NumPy arrays).

    Returns the fitted `(base_learners, meta_learner)` pair.
    """
    # Base learners used at test time see the complete training data.
    print("Fitting final base learners...", end="")
    train_base_learners(base_learners, X, y, verbose=False)
    print("done")

    # Collect out-of-fold predictions to train the meta learner on.
    print("Generating cross-validated predictions...")
    fold_predictions = []
    fold_targets = []
    for fold_no, (fit_rows, holdout_rows) in enumerate(generator.split(X), start=1):
        # Unfitted copies, so a fold's models never see its held-out rows.
        fold_models = {}
        for name, model in base_learners.items():
            fold_models[name] = clone(model)
        train_base_learners(fold_models, X[fit_rows, :], y[fit_rows], verbose=False)

        fold_predictions.append(
            predict_base_learners(fold_models, X[holdout_rows, :], verbose=False))
        fold_targets.append(y[holdout_rows])
        print("Fold %i done" % fold_no)

    print("CV-predictions done")
    
    # Predictions and targets were appended fold by fold, so concatenating
    # both in that order keeps their rows aligned.
    cv_preds = np.vstack(fold_predictions)
    cv_y = np.hstack(fold_targets)

    print("Fitting meta learner...", end="")
    meta_learner.fit(cv_preds, cv_y)
    print("done")

    return base_learners, meta_learner

In [None]:
from sklearn.model_selection import KFold

# Train with stacking: fresh base learners and an unfitted copy of the meta
# learner, on the training features as NumPy arrays (stacking indexes rows
# positionally), using 2-fold cross-validation.
cv_base_learners, cv_meta_learner = stacking(
    get_models(), clone(meta_learner), X_train.drop(['model','color','search_term','url','person','skus','city','region','country'],axis=1).values, y_train.values, KFold(2))

In [None]:
# Validation ROC-AUC of the cross-validated stacking ensemble.
P_pred, p = ensemble_predict(cv_base_learners, cv_meta_learner, X_test.drop(['model','color','search_term','url','person','skus','city','region','country'],axis=1).values, verbose=False)
print("\nEnsemble ROC-AUC score: %.3f" % roc_auc_score(y_test, p))

In [None]:
# Single-model pipeline: the column transformer followed by an XGBoost
# classifier. The commented-out entries are alternative final estimators
# and an optional cache setting that are currently disabled.
feature_processing = Pipeline([
    ('preproc', ct),
    ('predict', XGBClassifier())
#     ('svm',SVC())
#     ('sgd', linear_model.SGDClassifier())
]
#     ,memory=cachedir
)

In [None]:
# Fit the pipeline on the training split without the text / high-cardinality columns.
feature_processing.fit(X_train.drop(['model','color','search_term','url','person','skus','city','region','country'],axis=1), y_train)

In [None]:
# Positive-class probabilities for the validation split.
preds = feature_processing.predict_proba(X_test.drop(['model','color','search_term','url','person','skus','city','region','country'],axis=1))[:,1]

In [None]:
# Predictions must line up one-to-one with the validation labels. Fail loudly
# on a mismatch instead of printing nothing, so a broken run cannot go unnoticed.
assert preds.shape == y_test.shape, (
    "prediction shape %s does not match label shape %s" % (preds.shape, y_test.shape)
)
print('shapes OK')

In [None]:
# Validation ROC-AUC values recorded from earlier runs:
# prev: 0.8886417505271881
#       0.7675200441464977
#       0.8501359056263651
#       0.9994019683472557    logistic_regression with browser and OS preprocessing.
#       0.8501359056263651    same, but with xgb
#       0.8503577934604342    xgb with:
#                                     ct = ColumnTransformer([
#                                         ("lim_cat",limited_categorical_transformer,["event","condition","staticpage",
#                                                                                     "campaign_source","search_engine",
#                                                                                     "channel","new_vs_returning","device_type",
#                                                                                     "operating_system_version","browser_version"]),
#                                         ("large_cat",large_categorical_transformer,["person","url","skus","city","region","country"]),
#                                         ("tf_idf",TfidfVectorizer(),"model"),
#                                         ("tf_idf2",TfidfVectorizer(),"color"),
#                                         ("tf_idf_reduced",TfidfVectorizer(),"search_term"),
#                                         ("passthrough",'passthrough',["storage","sku","year","month_sin","month_cos","day_sin","day_cos","weekday_sin",
#                                                                       "weekday_cos","hour_sin","hour_cos","screen_width","screen_height"])
#                                     ],n_jobs=-1)
#       0.8492173710139364    [best kaggle score]-> 0.84537 
#                             XGBClassifier using only the numeric features of the new "super" dataframes (a bit over 100 features).
# ROC-AUC of the current pipeline on the validation split.
roc_auc_score(y_test,preds)

## Export area

In [None]:
# Score the submission rows with the cross-validated stacking ensemble.
P_pred, p = ensemble_predict(cv_base_learners, cv_meta_learner, to_predict.drop(['model','color','search_term','url','person','skus','city','region','country'],axis=1).values)

In [None]:
# Display the ensemble's predicted probabilities.
p

In [None]:
# Use the ensemble scores as the predictions to publish.
# NOTE(review): when the notebook is run top to bottom, a later cell reassigns
# preds_posta from the XGBoost pipeline, so this value is not the one exported.
preds_posta = p

In [None]:
# Refit the pipeline on the whole labelled dataset before scoring the
# submission rows. Missing values are zero-filled here too: the validated
# split and `to_predict` both go through fillna(0), so without it the final
# model would be trained on NaNs but asked to predict on zeros.
feature_processing.fit(
    train_df.drop(['model','color','search_term','url','person','skus','city','region','country','label'], axis=1).fillna(0),
    train_df.label,
)

In [None]:
# Positive-class probabilities for the submission rows from the refitted pipeline.
preds_posta = feature_processing.predict_proba(to_predict.drop(['model','color','search_term','url','person','skus','city','region','country'],axis=1))[:,1]

In [None]:
# Display the predictions that are about to be exported.
preds_posta

In [None]:
# Empty frame that will hold the submission columns.
to_publish = pd.DataFrame()

In [None]:
# Identifier column, taken row-for-row from the frame that was scored.
to_publish['person'] = to_predict.person

In [None]:
# Predicted probability for each scored row.
to_publish['label'] = preds_posta

In [None]:
# Sanity check: (number of scored rows, 2 columns).
to_publish.shape

In [None]:
# Average the scores of rows sharing the same `person`, so that each person
# appears once, and write the result as the submission CSV (no index column).
to_publish.groupby('person', as_index=False).mean().to_csv('../predictions/6.dic@00.05.csv', index=False)