In [1]:
import pandas as pd  
import numpy as np  
import matplotlib.pyplot as plt 
%matplotlib inline

from datetime import timedelta, datetime

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score, mean_squared_error, confusion_matrix,\
                            f1_score,precision_score,recall_score,classification_report,cohen_kappa_score, brier_score_loss
import xgboost as xgb
from sklearn import preprocessing

In [3]:
# df_events información de eventos realizado en la plataforma para un conjunto de usuarios hasta el 31/05/2018
df_events = pd.read_csv('../data/events_up_to_01062018.csv',dtype=object)
# df_train archivo labels_training_set.csv indica para un subconjunto de los usuarios incluidos en el set de 
# eventos events_up_to_01062018.csv si los mismos realizaron una conversión (columna label = 1) o no (columna 
# label = 0) desde el 01/06/2018 hasta el 15/06/2018
df_labels = pd.read_csv('../data/labels_training_set.csv',dtype=object)
# Personas a predecir conversion.
df_person = pd.read_csv('../data/trocafone_kaggle_test.csv')

In [58]:
df_labels['label'] = df_labels['label'].astype(int)

In [5]:
df_events["timestamp"] = pd.to_datetime(df_events["timestamp"])
df_events["month"] = df_events["timestamp"].dt.month
df_events["day"] = df_events["timestamp"].dt.day
df_events["day_of_week"] = df_events['timestamp'].dt.weekday_name
df_events['day_of_year'] = df_events['timestamp'].dt.dayofyear
df_events['hour'] = df_events['timestamp'].dt.hour
df_events['minute'] = df_events['timestamp'].dt.minute
df_events['week_of_year'] = df_events['timestamp'].dt.weekofyear

#### Solo pruebo 2 features.

## Cantidad de veces que realiza un evento dado

In [6]:
cant_por_evento = pd.concat([pd.get_dummies(df_events['event']),df_events[['person']]],axis = 1).groupby('person')\
    .sum().reset_index()
cant_por_evento = cant_por_evento[['person','brand listing','checkout','search engine hit','ad campaign hit','generic listing']]

## Entra o no un determinado mes

In [7]:
df_month2 = pd.concat([df_events['person'],pd.get_dummies(df_events['month'])],axis = 1).drop_duplicates()\
                .groupby('person').sum().reset_index()
df_month2.columns = ['person','mes_1','mes_2','mes_3','mes_4','mes_5']
#df_month2 = df_month2[['person','mes_5','mes_4','mes_3']]

In [8]:
df_month2.head()

Unnamed: 0,person,mes_1,mes_2,mes_3,mes_4,mes_5
0,0008ed71,0,0,0,0,1
1,00091926,0,0,0,0,1
2,00091a7a,0,0,1,0,0
3,000ba417,0,0,0,0,1
4,000c79fe,0,0,0,0,1


In [9]:
df_train = pd.merge(df_month2,cant_por_evento,on = 'person',how = 'inner')

In [10]:
df_train['checkout2'] = df_train['checkout']
df_train['brand listing2'] = df_train['brand listing']  

#### Se aplica una transformacion a los datos.

In [11]:
x = df_train.loc[:,df_train.columns != 'person']
maxAbsScaler = preprocessing.RobustScaler()
x_scaled = maxAbsScaler.fit_transform(x)
df_train = pd.concat([df_train['person'],pd.DataFrame(x_scaled)],axis = 1)

In [12]:
df_train.head()

Unnamed: 0,person,0,1,2,3,4,5,6,7,8,9,10,11
0,0008ed71,0.0,0.0,0.0,0.0,0.0,0.0,2.0,-0.333333,-0.5,0.0,2.0,0.0
1,00091926,0.0,0.0,0.0,0.0,0.0,6.25,1.0,-0.333333,3.25,-0.25,1.0,6.25
2,00091a7a,0.0,0.0,1.0,0.0,-1.0,1.25,-1.0,-0.333333,-0.25,-0.25,-1.0,1.25
3,000ba417,0.0,0.0,0.0,0.0,0.0,6.0,5.0,0.0,-0.25,3.25,5.0,6.0
4,000c79fe,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.25,0.0,0.0,0.0


In [13]:
# df con labels.

In [14]:
df_train_con_labels = pd.merge(df_train,df_labels,on = 'person',how = 'inner').drop(columns = ['person'])
df_train_con_labels.shape

(19414, 13)

In [15]:
df_train_con_labels.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,label
0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,-0.333333,-0.5,0.0,2.0,0.0,0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.25,0.0,0.0,0.0,0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.333333,0.75,0.75,0.0,0.0,0
3,1.0,1.0,0.0,1.0,0.0,41.25,14.0,4.0,6.75,6.75,14.0,41.25,0
4,0.0,0.0,0.0,0.0,0.0,0.25,1.0,-0.333333,-0.5,0.0,1.0,0.25,0


#### Para subir a Kaggle

In [16]:
X, y = df_train_con_labels.iloc[:,:-1],df_train_con_labels.iloc[:,-1]

In [17]:
xg = xgb.XGBRegressor(objective ='binary:logistic', 
                colsample_bytree = 0.8, learning_rate = 0.1,
                max_depth = 5, n_estimators = 6, scale_pos_weight = 9.5, min_child_weight=1)

In [18]:
xg.fit(X,y)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=0.8, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=5, min_child_weight=1, missing=None, n_estimators=6,
       n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=9.5, seed=None,
       silent=True, subsample=1)

In [19]:
df_predecir = pd.merge(df_train,df_person,on = 'person', how = 'inner')

In [20]:
df_predecir_sin_person = df_predecir.drop(columns = ['person'])

In [22]:
preds = pd.DataFrame(xg.predict_proba(df_predecir_sin_person))[1]

AttributeError: 'XGBRegressor' object has no attribute 'predict_prob'

In [None]:
df_final = df_predecir[['person']].copy()

In [334]:
df_final['label'] = pd.Series(preds)

In [335]:
df_final.to_csv('predicciones.csv', encoding='utf-8', index=False)

#### Test sin subir a Kaggle

Preparo todas las metricas. Pueden copiar esto y usarlo en otros notebooks.

In [244]:
X, y = df_train_con_labels.iloc[:,:-1],df_train_con_labels.iloc[:,-1]

In [245]:
xg = xgb.XGBClassifier(objective ='binary:logistic', 
                colsample_bytree = 0.8, learning_rate = 0.1,
                max_depth = 5, n_estimators = 6, scale_pos_weight = 9.5, min_child_weight=1)

In [246]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=123)

In [247]:
xg.fit(X_train,y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=0.8, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=5, min_child_weight=1, missing=None, n_estimators=6,
       n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=9.5, seed=None,
       silent=True, subsample=1)

In [100]:
predsProb1 = pd.DataFrame(xg.predict_proba(X_test))[1]

In [101]:
predsLabel = pd.Series(xg.predict(X_test))

In [102]:
trainAccuracy = accuracy_score(y_train, pd.Series(xg.predict(X_train)))
testAccuracy = accuracy_score(y_test, predsLabel)

cohenKappaScore = cohen_kappa_score(y_test, predsLabel)
classificationReport = classification_report(y_test, predsLabel)
precisionScore = precision_score(y_test, predsLabel)
recallScore = recall_score(y_test, predsLabel)
f1Score = f1_score(y_test, predsLabel)
matrizDeConfusion = confusion_matrix(y_test, predsLabel)

meanSquaredError = mean_squared_error(y_test, predsProb1)
areaDebajoDeCurva = roc_auc_score(y_test, predsProb1)
brierScoreLoss = brier_score_loss(y_test, predsProb1)

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


In [103]:
# Pruebo todas las metricas.
# accuracy_score, roc_auc_score, mean_squared_error, confusion_matrix,
# f1_score,precision_score,recall_score,classification_report,cohen_kappa_score
# brierScoreLoss.
# Hay metricas para la probabilidad de que sea 1 y otras metricas para las predicciones de los labels.


# Metricas con LABELS.

print()
print("Metricas con Labels:")
print()

print("Train accuracy: ", trainAccuracy)
print("Test acuracy: ", testAccuracy)
print()
print("Classification Report:")
print(classificationReport)
print()
print("Precision Score: ",precisionScore)
print("Recall Score: ",recallScore)
print("F1 Score: ",f1Score)
print("Cohen Kappa Score: ",cohenKappaScore)
print()
print("Confusion matrix: ")
print(matrizDeConfusion)

# Metricas con PROBABILIDADES. 

print()
print("Metricas sin Labels:")
print()

print("ROC auc score: ", areaDebajoDeCurva)
print("Mean squared error: ", meanSquaredError)
print("Brier Score Loss: ",brierScoreLoss)


Metricas con Labels:

Train accuracy:  0.9475886935805807
Test acuracy:  0.957249549317538

Classification Report:
              precision    recall  f1-score   support

           0       0.96      1.00      0.98      3717
           1       0.00      0.00      0.00       166

   micro avg       0.96      0.96      0.96      3883
   macro avg       0.48      0.50      0.49      3883
weighted avg       0.92      0.96      0.94      3883


Precision Score:  0.0
Recall Score:  0.0
F1 Score:  0.0
Cohen Kappa Score:  0.0

Confusion matrix: 
[[3717    0]
 [ 166    0]]

Metricas sin Labels:

ROC auc score:  0.5714147955826535
Mean squared error:  0.04188519608098265
Brier Score Loss:  0.04188519608098265


#### Encontrando los mejotes hiperparametros en XGBoost (GridSearch y RandomSearch)

In [None]:
# https://www.kaggle.com/tilii7/hyperparameter-grid-search-with-xgboost

In [284]:
# Solo para contar cuanto tarda en entrenar.
def timer(start_time=None):
    if not start_time:
        start_time = datetime.now()
        return start_time
    elif start_time:
        thour, temp_sec = divmod((datetime.now() - start_time).total_seconds(), 3600)
        tmin, tsec = divmod(temp_sec, 60)
        print('\n Time taken: %i hours %i minutes and %s seconds.' % (thour, tmin, round(tsec, 2)))

In [15]:
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.model_selection import StratifiedKFold

In [289]:
params = {
        'min_child_weight': [1, 5, 10],
        'gamma': [0.5, 1, 1.5, 2, 5],
        'subsample': [0.6, 0.8, 1.0],
        'colsample_bytree': [0.6, 0.8, 1.0],
        'max_depth': [3, 4, 5]
        }

In [350]:
 3*5*3*3*3*10

4050

In [287]:
X, y = df_train_con_labels.iloc[:,:-1],df_train_con_labels.iloc[:,-1]

In [348]:
xg = xgb.XGBClassifier(learning_rate=0.02, n_estimators=600, objective='binary:logistic',
                    silent=True, nthread=1)

Stratification seeks to ensure that each fold is representative of all strata of the data. Generally this is done in a supervised way for classification and aims to ensure each class is (approximately) equally represented across each test fold (which are of course combined in a complementary way to form training folds).

In [None]:
grid = GridSearchCV(estimator=xgb, param_grid=params, scoring='roc_auc', n_jobs=4, cv=skf.split(X,Y), verbose=3 )
grid.fit(X, Y)

In [349]:
folds = 10
param_comb = 5

skf = StratifiedKFold(n_splits=folds, shuffle = True, random_state = 1001)

random_search = RandomizedSearchCV(xg, param_distributions=params, n_iter=param_comb, scoring='roc_auc', n_jobs=4, cv=skf.split(X,y), verbose=3, random_state=1001 )
#grid = GridSearchCV(estimator=xg, param_grid=params, scoring='roc_auc', n_jobs=4, cv=skf.split(X,y), verbose=3 )

start_time = timer(None) # timing starts from this point for "start_time" variable
#random_search.fit(X, y)
grid.fit(X, y)
timer(start_time) # timing ends here for "start_time" variable

Fitting 10 folds for each of 405 candidates, totalling 4050 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  24 tasks      | elapsed:   34.7s
[Parallel(n_jobs=4)]: Done 120 tasks      | elapsed:  2.0min
[Parallel(n_jobs=4)]: Done 280 tasks      | elapsed:  5.7min
[Parallel(n_jobs=4)]: Done 504 tasks      | elapsed: 10.3min
[Parallel(n_jobs=4)]: Done 792 tasks      | elapsed: 16.3min
[Parallel(n_jobs=4)]: Done 1144 tasks      | elapsed: 23.3min
[Parallel(n_jobs=4)]: Done 1560 tasks      | elapsed: 32.3min
[Parallel(n_jobs=4)]: Done 2040 tasks      | elapsed: 44.5min
[Parallel(n_jobs=4)]: Done 2584 tasks      | elapsed: 58.1min
[Parallel(n_jobs=4)]: Done 3192 tasks      | elapsed: 75.2min
[Parallel(n_jobs=4)]: Done 3864 tasks      | elapsed: 93.8min
[Parallel(n_jobs=4)]: Done 4050 out of 4050 | elapsed: 99.8min finished



 Time taken: 1 hours 39 minutes and 51.13 seconds.


In [352]:
xg = random_search.best_estimator_
#xg = grid.best_estimator_

In [358]:
# Guardo el modelo.
xg.save_model("modelo1")

In [359]:
grid.best_params_

{'colsample_bytree': 1.0,
 'gamma': 5,
 'max_depth': 3,
 'min_child_weight': 5,
 'subsample': 0.6}

In [360]:
results = pd.DataFrame(grid.cv_results_)
results.to_csv('../../xgb-grid-search-results-01.csv', index=False)



#### Usando bayesian optimization.


In [None]:
# https://www.kaggle.com/tilii7/bayesian-optimization-of-xgboost-parameters

In [91]:
from skopt import BayesSearchCV

In [92]:
class BayesSearchCV(BayesSearchCV):
    def _run_search(self, x): raise BaseException('Use newer skopt')

In [93]:
X, y = df_train_con_labels.iloc[:,:-1],df_train_con_labels.iloc[:,-1]

In [94]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=123)

In [95]:
# SETTINGS - CHANGE THESE TO GET SOMETHING MEANINGFUL
ITERATIONS = 10 # 1000
TRAINING_SIZE = 100000 # 20000000
TEST_SIZE = 25000

In [96]:
bayes_cv_tuner = BayesSearchCV(
    estimator = xgb.XGBClassifier(
        n_jobs = 1,
        objective = 'binary:logistic',
        eval_metric = 'auc',
        silent=1,
        tree_method='approx'
    ),
    search_spaces = {
        'learning_rate': (0.01, 1.0, 'log-uniform'),
        'min_child_weight': (0, 10),
        'max_depth': (0, 50),
        'max_delta_step': (0, 20),
        'subsample': (0.01, 1.0, 'uniform'),
        'colsample_bytree': (0.01, 1.0, 'uniform'),
        'colsample_bylevel': (0.01, 1.0, 'uniform'),
        'reg_lambda': (1e-9, 1000, 'log-uniform'),
        'reg_alpha': (1e-9, 1.0, 'log-uniform'),
        'gamma': (1e-9, 0.5, 'log-uniform'),
        'min_child_weight': (0, 5),
        'n_estimators': (50, 100),
        'scale_pos_weight': (1e-6, 500, 'log-uniform')
    },    
    scoring = 'roc_auc',
    cv = StratifiedKFold(
        n_splits=3,
        shuffle=True,
        random_state=42
    ),
    n_jobs = 3,
    n_iter = ITERATIONS,   
    verbose = 0,
    refit = True,
    random_state = 42
)

In [97]:

def status_print(optim_result):
    """Status callback durring bayesian hyperparameter search"""
    
    # Get all the models tested so far in DataFrame format
    all_models = pd.DataFrame(bayes_cv_tuner.cv_results_)    
    
    # Get current parameters and the best parameters    
    best_params = pd.Series(bayes_cv_tuner.best_params_)
    print('Model #{}\nBest ROC-AUC: {}\nBest params: {}\n'.format(
        len(all_models),
        np.round(bayes_cv_tuner.best_score_, 4),
        bayes_cv_tuner.best_params_
    ))
    
    # Save all model results
    clf_name = bayes_cv_tuner.estimator.__class__.__name__
    all_models.to_csv(clf_name+"_cv_results.csv")
    
    

In [98]:
result = bayes_cv_tuner.fit(X_train, y_train, callback=status_print)

Model #1
Best ROC-AUC: 0.5351
Best params: {'colsample_bylevel': 0.4160029192647807, 'colsample_bytree': 0.7304484857455519, 'gamma': 0.13031389926541354, 'learning_rate': 0.042815319280763466, 'max_delta_step': 13, 'max_depth': 21, 'min_child_weight': 2, 'n_estimators': 87, 'reg_alpha': 5.497557739289786e-07, 'reg_lambda': 0.05936070635912049, 'scale_pos_weight': 0.060830282487222144, 'subsample': 0.13556548021189216}

Model #2
Best ROC-AUC: 0.6025
Best params: {'colsample_bylevel': 0.8390144719977516, 'colsample_bytree': 0.8844821246070537, 'gamma': 4.358684608480795e-07, 'learning_rate': 0.7988179462781242, 'max_delta_step': 17, 'max_depth': 3, 'min_child_weight': 1, 'n_estimators': 68, 'reg_alpha': 0.0005266983003701547, 'reg_lambda': 276.5424475574225, 'scale_pos_weight': 0.3016410771843142, 'subsample': 0.9923710598637134}

Model #3
Best ROC-AUC: 0.6067
Best params: {'colsample_bylevel': 0.4503841871781403, 'colsample_bytree': 0.9195352964526833, 'gamma': 8.168958221061441e-09, '

In [99]:
xg = result.best_estimator_