In [1]:
import pandas as pd  
import numpy as np  
import matplotlib.pyplot as plt 
%matplotlib inline

from datetime import timedelta

from sklearn.model_selection import train_test_split, StratifiedKFold, KFold, cross_val_score, cross_val_predict
from sklearn.metrics import accuracy_score, roc_auc_score, mean_squared_error, confusion_matrix,\
                            f1_score,precision_score,recall_score,classification_report,cohen_kappa_score
                                                            
import xgboost as xgb
from sklearn import preprocessing
from sklearn.feature_selection import RFE, SelectKBest, chi2
from sklearn.ensemble import ExtraTreesClassifier

In [2]:
# df_events información de eventos realizado en la plataforma para un conjunto de usuarios hasta el 31/05/2018
df_events = pd.read_csv('../../data/events_up_to_01062018.csv',dtype=object)
# df_train archivo labels_training_set.csv indica para un subconjunto de los usuarios incluidos en el set de 
# eventos events_up_to_01062018.csv si los mismos realizaron una conversión (columna label = 1) o no (columna 
# label = 0) desde el 01/06/2018 hasta el 15/06/2018
df_labels = pd.read_csv('../../data/labels_training_set.csv',dtype=object)
# Personas a predecir conversion.
df_person = pd.read_csv('../../data/trocafone_kaggle_test.csv')

In [3]:
df_labels['label'] = df_labels['label'].astype(int)
df_labels.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19414 entries, 0 to 19413
Data columns (total 2 columns):
person    19414 non-null object
label     19414 non-null int64
dtypes: int64(1), object(1)
memory usage: 303.4+ KB


In [4]:
df_events["timestamp"] = pd.to_datetime(df_events["timestamp"])
df_events["month"] = df_events["timestamp"].dt.month
df_events["day"] = df_events["timestamp"].dt.day
df_events["day_of_week"] = df_events['timestamp'].dt.weekday_name
df_events['day_of_year'] = df_events['timestamp'].dt.dayofyear
df_events['hour'] = df_events['timestamp'].dt.hour
df_events['minute'] = df_events['timestamp'].dt.minute
df_events['week_of_year'] = df_events['timestamp'].dt.weekofyear
df_events['second'] = df_events['timestamp'].dt.second

In [5]:
df_todas_las_personas = df_events[['person']].drop_duplicates()

In [6]:
# Me quedo solo con los users que tienen label.
df_events_con_labels = pd.merge(df_events,df_labels,on = 'person',how = 'inner')

In [8]:
def stratifiedCV(data):
    
    df_train = pd.merge(data,df_labels,how = 'inner', on ='person')
    df_train.drop(columns = ['person'],inplace = True)
    
    X = df_train.drop(columns = ['label'])
    y = df_train['label']
    
    model = xgb.XGBRegressor(objective ='binary:logistic', 
                colsample_bytree = 0.8, learning_rate = 0.1,
                max_depth = 5, n_estimators = 6, scale_pos_weight = 7, min_child_weight=15)
    
    stratifiedKFold = StratifiedKFold(n_splits = 10, random_state = 7)
    
    results = cross_val_score(model, X, y, cv = stratifiedKFold, scoring = 'roc_auc')
    #results2 = cross_val_predict(model, X, y, cv = stratifiedKFold, method='predict_proba')
    
    print("ROC AUC: ",results.mean())

In [444]:
def stratifiedCVKaggle(data, K, cantidadUnos):
    
    df_train = pd.merge(data,df_labels,how = 'inner', on ='person')
    df_train.drop(columns = ['person'],inplace = True)
    
    X = df_train.drop(columns = ['label'])
    y = df_train['label']
    
    model = xgb.XGBRegressor(objective ='binary:logistic', 
                colsample_bytree = 0.8, learning_rate = 0.1,
                max_depth = 5, n_estimators = 6, scale_pos_weight = 7, min_child_weight=15)
    
    stratifiedKFold = StratifiedKFold(n_splits = 10, random_state = 7)
    
    rfe = RFE(model, K)
    rfe = rfe.fit(X, y.ravel())
    
    # Selecciono los n features para entrenar
    mask = rfe.get_support()
    features_X = X.columns[mask]
    X = X.filter(items = features_X)
    
    #nombreColumnas = X.columns
    
    #X, y = balanceoDeDatos(X, y, cantidadUnos)
    
    #X = transformacionDeDatos(X)
    
    #X.columns = nombreColumnas
    
    results = cross_val_score(model, X, y, cv = stratifiedKFold, scoring = 'roc_auc')
        
    print("ROC AUC: ",results.mean())
    
    df_kaggle = pd.merge(df_person, data, on = 'person', how = 'inner')
    df_kaggleSinPersonas = df_kaggle.drop(columns = ['person'])
    df_kaggleSinPersonas = df_kaggleSinPersonas.filter(items = features_X)
    model.fit(X,y)
    preds = model.predict(df_kaggleSinPersonas)
    predicciones = df_kaggle[['person']]
    predicciones['label'] = preds
    print(predicciones.shape[0] == 19415)
    predicciones.to_csv('predicciones_SI_SE_PUEDE.csv', encoding='utf-8', index=False)
    return results.mean()

In [32]:
def stratifiedCVConRFE(data, K):
    
    df_train = pd.merge(data,df_labels,how = 'inner', on ='person')
    df_train.drop(columns = ['person'],inplace = True)
    
    X = df_train.drop(columns = ['label'])
    y = df_train['label']
    
    model = xgb.XGBRegressor(objective ='binary:logistic', 
                colsample_bytree = 0.8, learning_rate = 0.1,
                max_depth = 5, n_estimators = 6, scale_pos_weight = 7, min_child_weight=15)
    
    stratifiedKFold = StratifiedKFold(n_splits = 10, random_state = 7)
    
    rfe = RFE(model, K)
    rfe = rfe.fit(X, y.ravel())
    
    # Selecciono los n features para entrenar
    mask = rfe.get_support()
    
    features_X = X.columns[mask]
    
    X = X.filter(items = features_X)
    
    X = transformacionDeDatos(X)
    
    results = cross_val_score(model, X, y, cv = stratifiedKFold, scoring = 'roc_auc')
    #results2 = cross_val_score(model, X, y, cv = KFold, scoring = 'roc_auc')   
    
    print("ROC AUC con StratifiedKFold: ",results.mean())
    #print("ROC AUC con KFOld: ", results2.mean())
    return results.mean()

In [31]:
def pruebaConRFEUsandoSPlit(data, K, cantidadUnos):
    
    df_train = pd.merge(data,df_labels,how = 'inner', on ='person')
    df_train.drop(columns = ['person'],inplace = True)
    
    X = df_train.drop(columns = ['label'])
    y = df_train['label']
    
    model = xgb.XGBClassifier(objective ='binary:logistic', 
                colsample_bytree = 0.8, learning_rate = 0.1,
                max_depth = 5, n_estimators = 6, scale_pos_weight = 7, min_child_weight=15)

    
    rfe = RFE(model, K)
    rfe = rfe.fit(X, y.ravel())
    
    # Selecciono los n features para entrenar
    mask = rfe.get_support()
    
    features_X = X.columns[mask]
    
    X = X.filter(items = features_X)
    
    X_mejores_features = X
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=123, stratify=y)
    
    #columnas = X_train.columns
    
    #X_train, y_train = balanceoDeDatos(X_train, y_train, cantidadUnos)
    
    #X_train.columns = columnas
    
    X = transformacionDeDatos(X)
    
    model.fit(X_train,y_train)
    
    predsProb1 = pd.DataFrame(model.predict_proba(X_test))[1]
    predsLabel = pd.Series(model.predict(X_test))

    trainAccuracy = accuracy_score(y_train, pd.Series(model.predict(X_train)))
    testAccuracy = accuracy_score(y_test, predsLabel)

    cohenKappaScore = cohen_kappa_score(y_test, predsLabel)
    classificationReport = classification_report(y_test, predsLabel)
    matrizDeConfusion = confusion_matrix(y_test, predsLabel)

    meanSquaredError = mean_squared_error(y_test, predsProb1)
    areaDebajoDeCurva = roc_auc_score(y_test, predsProb1)

    # Metricas con LABELS.

    print()
    print("Metricas con Labels:")
    print()

    print("Train accuracy: ", trainAccuracy)
    print("Test acuracy: ", testAccuracy)
    print()
    print("Classification Report:")
    print(classificationReport)
    print()
    print("Cohen Kappa Score: ",cohenKappaScore)
    print()
    print("Confusion matrix: ")
    print(matrizDeConfusion)

    # Metricas con PROBABILIDADES. 

    print()
    print("Metricas sin Labels:")
    print()

    print("ROC auc score: ", areaDebajoDeCurva)
    print("Mean squared error: ", meanSquaredError)
    
    return X_mejores_features

In [130]:
def stratifiedCVConKBestChi2(data, K):
    
    df_train = pd.merge(data,df_labels,how = 'inner', on ='person')
    df_train.drop(columns = ['person'],inplace = True)
    
    X = df_train.drop(columns = ['label'])
    y = df_train['label']
    
    model = xgb.XGBRegressor(objective ='binary:logistic', 
                colsample_bytree = 0.8, learning_rate = 0.1,
                max_depth = 5, n_estimators = 6, scale_pos_weight = 7, min_child_weight=15)
    
    stratifiedKFold = StratifiedKFold(n_splits = 10, random_state = 7)
    
    selector = SelectKBest(chi2, k = K)
    selector.fit(X, y)
    X_new = selector.transform(X)
    
    print(len(X.columns[selector.get_support(indices=True)]))
    print(X.columns[selector.get_support(indices=True)])
    
    nombre_columnas = list(X.columns[selector.get_support(indices=True)])
    
    print(X_new.shape)
    
    X_new = pd.DataFrame(X_new)
    X_new.columns = nombre_columnas
    
    model.fit(X_new, y)
    
    df_kaggle_con_personas = pd.merge(df_person, data, how = 'inner', on = 'person')
    df_kaggle_sin_personas = df_kaggle_con_personas[list(X.columns[selector.get_support(indices=True)])]
    preds = model.predict(df_kaggle_sin_personas)
    predicciones = df_kaggle_con_personas[['person']]
    predicciones['label'] = preds
    
    predicciones.to_csv('ultima_prueba.csv', encoding='utf-8', index=False)
    
    results = cross_val_score(model, X_new, y, cv = stratifiedKFold, scoring = 'roc_auc')   
    
    print("ROC AUC: ",results.mean())

In [128]:
predicciones = pd.read_csv('ultima_prueba.csv')

In [129]:
predicciones

Unnamed: 0.1,Unnamed: 0,person,label
0,0,4886f805,0.290142
1,1,0297fc1e,0.361287
2,2,2d681dd8,0.307374
3,3,cccea85e,0.449834
4,4,4c8a8b93,0.349192
5,5,29ebb414,0.297539
6,6,3dc1950f,0.411992
7,7,8ea4c165,0.394800
8,8,d8cfe234,0.428852
9,9,d6bc64df,0.445895


In [None]:
# Create and fit selector
selector = SelectKBest(f_classif, k=5)
selector.fit(features_df, target)
# Get idxs of columns to keep
cols = selector.get_support(indices=True)
# Create new dataframe with only desired columns, or overwrite existing
features_df_new = features_df[cols]

In [23]:
def balanceoDeDatos(X, y, cantidadUnos):
    cantidadCeros = np.sum(y == 0)
    ros = RandomOverSampler(random_state = 2, sampling_strategy = {0: cantidadCeros, 1: cantidadUnos})
    X, y = ros.fit_sample(X, y)
    X = pd.DataFrame(X)
    return X, y

In [24]:
def transformacionDeDatos(X):
    transformador = preprocessing.RobustScaler()
    X = transformador.fit_transform(X.values)
    return pd.DataFrame(X)

In [434]:
preds = pd.read_csv('predicciones_SI_SE_PUEDE.csv')

In [436]:
preds.head(60)

Unnamed: 0,person,label
0,4886f805,0.465493
1,0297fc1e,0.443111
2,2d681dd8,0.443111
3,cccea85e,0.443111
4,4c8a8b93,0.385669
5,29ebb414,0.443111
6,3dc1950f,0.443111
7,8ea4c165,0.443111
8,d8cfe234,0.477664
9,d6bc64df,0.443111


In [14]:
# 0.845696189422742
df_seba = pd.read_csv('../features_varios.csv')

In [15]:
# 0.8520506767923741
df_seba2 = pd.read_csv('features_seba.csv')

In [16]:
# 0.8201881323017529
df_laggedFeatures  = pd.read_csv('fetures_nuevos_santi.csv')

In [17]:
# 0.8300765707861256
df_featuresEventos = pd.read_csv('Santi_FeaturesConEventos.csv')
df_featuresEventos.drop(columns = ['Unnamed: 0'],inplace = True)

In [18]:
# 0.7286368575010472
df_timeFeatures = pd.read_csv('santi_timefeatures.csv')
df_timeFeatures.drop(columns = ['Unnamed: 0'], inplace = True)

In [348]:
columnasEventos = ['person','checkout_iPhone 6S',
 'checkout_Samsung Galaxy J5',
 'checkout_Samsung Galaxy J7 Prime',
 'checkout_iPhone 7 Plus',
 'checkout_iPhone 4G',
 'checkout_Samsung Galaxy On 7',
 'checkout_Motorola Moto X Play 4G Dual',
 'checkout_Samsung Galaxy J7 2016 Metal',
 'checkout_iPhone 7',
 'checkout_Motorola Moto X2',
 'checkout_Samsung Galaxy S7',
 'checkout_iPhone 4S',
 'checkout_Lenovo Vibe A7010 Dual Chip',
 'checkout_Motorola Moto G4 Plus',
 'checkout_iPhone 5s',
 'viewedproduct_128GB',
 'viewedproduct_16GB',
 'viewedproduct_256GB',
 'viewedproduct_32GB',
 'viewedproduct_4GB',
 'viewedproduct_512MB',
 'viewedproduct_64GB',
 'viewedproduct_8GB',
 'viewedproduct_Bom',
 'viewedproduct_Bom - Sem Touch ID',
 'viewedproduct_Excelente',
 'viewedproduct_Muito Bom',
 'viewedproduct_Novo',
 'vuelve',
 'visitedsite_Sao Paulo',
 'visitedsite_region desconocida',
 'visitedsite_Minas Gerais',
 'visitedsite_Rio de Janeiro',
 'visitedsite_Bahia',
 'visitedsite_Pernambuco',
 'visitedsite_Ceara',
 'visitedsite_Parana',
 'visitedsite_Rio Grande do Sul',
 'visitedsite_Espirito Santo',
 'visitedsite_Federal District',
 'visitedsite_Maranhao',
 'visitedsite_Goias',
 'visitedsite_Santa Catarina',
 'visitedsite_Para',
 'visitedsite_Rio Grande do Norte',
 'visitedsite_Paraíba',
 'visitedsite_Piaui',
 'visitedsite_Alagoas',
 'visitedsite_Sergipe',
 'visitedsite_Amazonas',
 'visitedsite_Mato Grosso',
 'visitedsite_Tocantins',
 'visitedsite_Mato Grosso do Sul',
 'visitedsite_Rondonia',
 'visitedsite_Amapa',
 'visitedsite_Acre',
 'visitedsite_Roraima',
 'visitedsite_Computer',
 'visitedsite_Smartphone',
 'visitedsite_Tablet',
 'visitedsite_device desconocido',
 'visitedsite_360x640',
 'visitedsite_1366x768',
 'visitedsite_320x570',
 'visitedsite_1600x900',
 'visitedsite_320x534',
 'visitedsite_1280x1024',
 'visitedsite_320x569',
 'visitedsite_1920x1080',
 'visitedsite_1024x768',
 'visitedsite_1324x745',
 'visitedsite_Android 7',
 'visitedsite_Windows 7 ',
 'visitedsite_Windows 10 ',
]

In [353]:
columnasSeba = ['person','ad campaign hit mes 5',
 'brand listing mes 5',
 'checkout mes 5',
 'conversion mes 5',
 'generic listing mes 5',
 'lead mes 5',
 'search engine hit mes 5',
 'searched products mes 5',
 'visited site mes 5',
 'dias_hasta_ultimo',
 'ad campaign hit mes 4',
 'brand listing mes 4',
 'checkout mes 4',
 'conversion mes 4',
 'generic listing mes 4',
 'lead mes 4',
 'search engine hit mes 4',
 'searched products mes 4',
 'visited site mes 4',
 'distan_dias',
 'nuevo_mes_5',
 'cant_dias_dist',
 'modelos_dist',
 'promedio_por_dia_x',
 'promedio_por_mes',
 'mes_primer_entrada',
 'promedio_por_dia_y']

In [349]:
list(df_seba2.columns)

['person',
 'ad campaign hit mes 5',
 'brand listing mes 5',
 'checkout mes 5',
 'conversion mes 5',
 'generic listing mes 5',
 'lead mes 5',
 'search engine hit mes 5',
 'searched products mes 5',
 'staticpage mes 5',
 'viewed product mes 5',
 'visited site mes 5',
 'dias_hasta_ultimo',
 'ad campaign hit mes 4',
 'brand listing mes 4',
 'checkout mes 4',
 'conversion mes 4',
 'generic listing mes 4',
 'lead mes 4',
 'search engine hit mes 4',
 'searched products mes 4',
 'staticpage mes 4',
 'viewed product mes 4',
 'visited site mes 4',
 'distan_dias',
 'nuevo_mes_5',
 'cant_dias_dist',
 'modelos_dist',
 'promedio_por_dia_x',
 'promedio_por_mes',
 'mes_primer_entrada',
 'promedio_por_dia_y']

In [385]:
cant_ceros = np.sum(y == 0)

In [386]:
cant_ceros

18434

In [387]:
np.sum(y_train == 1)

808

In [None]:
cant_ceros = np.sum(y == 0)
cant_unos = 7000
ros = RandomOverSampler(random_state = 2, sampling_strategy = {0: cant_ceros, 1: cant_unos})
X, y = ros.fit_sample(X, y)
X = pd.DataFrame(X)

In [36]:
df_train = pd.merge(df_seba2,df_featuresEventos, on = 'person', how = 'inner')
#df_train = pd.merge(df_train, df_laggedFeatures, on = 'person', how = 'inner')

In [382]:
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler

In [37]:
stratifiedCVConKBestChi2(df_train,85)

ROC AUC:  0.8600479918440177


0.8600479918440177

In [None]:
pruebaConRFEUsandoSPlit(df_train, 78, 800)

In [None]:
# https://www.kaggle.com/tilii7/bayesian-optimization-of-xgboost-parameters

In [46]:
from skopt import BayesSearchCV

In [47]:
class BayesSearchCV(BayesSearchCV):
    def _run_search(self, x): raise BaseException('Use newer skopt')

In [7]:
df_train = pd.read_csv('cuarenta_cols.csv')

In [10]:
stratifiedCV(df_train)

ROC AUC:  0.860095096469798


In [13]:
len(df_train)

19414

In [562]:
df_train2 = pd.merge(df_train,df_labels,how = 'inner', on ='person')
df_train2.drop(columns = ['person'],inplace = True)    
X = df_train2.drop(columns = ['label'])
y = df_train2['label']

In [488]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=123, stratify = y)

In [51]:
# SETTINGS - CHANGE THESE TO GET SOMETHING MEANINGFUL
ITERATIONS = 1000 # 1000
TRAINING_SIZE = 10000 # 20000000
TEST_SIZE = 25000

In [52]:
bayes_cv_tuner = BayesSearchCV(
    estimator = xgb.XGBClassifier(
        n_jobs = 1,
        objective = 'binary:logistic',
        eval_metric = 'auc',
        silent=1,
        tree_method='approx'
    ),
    search_spaces = {
        'learning_rate': (0.01, 1.0, 'log-uniform'),
        'min_child_weight': (0, 10),
        'max_depth': (0, 50),
        'max_delta_step': (0, 20),
        'subsample': (0.01, 1.0, 'uniform'),
        'colsample_bytree': (0.01, 1.0, 'uniform'),
        'colsample_bylevel': (0.01, 1.0, 'uniform'),
        'reg_lambda': (1e-9, 1000, 'log-uniform'),
        'reg_alpha': (1e-9, 1.0, 'log-uniform'),
        'gamma': (1e-9, 0.5, 'log-uniform'),
        'min_child_weight': (0, 5),
        'n_estimators': (50, 100),
        'scale_pos_weight': (1e-6, 500, 'log-uniform')
    },    
    scoring = 'roc_auc',
    cv = StratifiedKFold(
        n_splits=3,
        shuffle=True,
        random_state=42
    ),
    n_jobs = 3,
    n_iter = ITERATIONS,   
    verbose = 0,
    refit = True,
    random_state = 42
)

In [53]:

def status_print(optim_result):
    """Status callback durring bayesian hyperparameter search"""
    
    # Get all the models tested so far in DataFrame format
    all_models = pd.DataFrame(bayes_cv_tuner.cv_results_)    
    
    # Get current parameters and the best parameters    
    best_params = pd.Series(bayes_cv_tuner.best_params_)
    print('Model #{}\nBest ROC-AUC: {}\nBest params: {}\n'.format(
        len(all_models),
        np.round(bayes_cv_tuner.best_score_, 4),
        bayes_cv_tuner.best_params_
    ))
    
    # Save all model results
    clf_name = bayes_cv_tuner.estimator.__class__.__name__
    all_models.to_csv(clf_name+"_cv_results.csv")
    
    

In [54]:
result = bayes_cv_tuner.fit(X, y, callback=status_print)

Model #1
Best ROC-AUC: 0.8346
Best params: {'colsample_bylevel': 0.4160029192647807, 'colsample_bytree': 0.7304484857455519, 'gamma': 0.13031389926541354, 'learning_rate': 0.042815319280763466, 'max_delta_step': 13, 'max_depth': 21, 'min_child_weight': 2, 'n_estimators': 87, 'reg_alpha': 5.497557739289786e-07, 'reg_lambda': 0.05936070635912049, 'scale_pos_weight': 0.060830282487222144, 'subsample': 0.13556548021189216}

Model #2
Best ROC-AUC: 0.8602
Best params: {'colsample_bylevel': 0.8390144719977516, 'colsample_bytree': 0.8844821246070537, 'gamma': 4.358684608480795e-07, 'learning_rate': 0.7988179462781242, 'max_delta_step': 17, 'max_depth': 3, 'min_child_weight': 1, 'n_estimators': 68, 'reg_alpha': 0.0005266983003701547, 'reg_lambda': 276.5424475574225, 'scale_pos_weight': 0.3016410771843142, 'subsample': 0.9923710598637134}

Model #3
Best ROC-AUC: 0.8602
Best params: {'colsample_bylevel': 0.8390144719977516, 'colsample_bytree': 0.8844821246070537, 'gamma': 4.358684608480795e-07, '

Model #21
Best ROC-AUC: 0.8602
Best params: {'colsample_bylevel': 0.8390144719977516, 'colsample_bytree': 0.8844821246070537, 'gamma': 4.358684608480795e-07, 'learning_rate': 0.7988179462781242, 'max_delta_step': 17, 'max_depth': 3, 'min_child_weight': 1, 'n_estimators': 68, 'reg_alpha': 0.0005266983003701547, 'reg_lambda': 276.5424475574225, 'scale_pos_weight': 0.3016410771843142, 'subsample': 0.9923710598637134}

Model #22
Best ROC-AUC: 0.8602
Best params: {'colsample_bylevel': 0.8390144719977516, 'colsample_bytree': 0.8844821246070537, 'gamma': 4.358684608480795e-07, 'learning_rate': 0.7988179462781242, 'max_delta_step': 17, 'max_depth': 3, 'min_child_weight': 1, 'n_estimators': 68, 'reg_alpha': 0.0005266983003701547, 'reg_lambda': 276.5424475574225, 'scale_pos_weight': 0.3016410771843142, 'subsample': 0.9923710598637134}

Model #23
Best ROC-AUC: 0.8602
Best params: {'colsample_bylevel': 0.8390144719977516, 'colsample_bytree': 0.8844821246070537, 'gamma': 4.358684608480795e-07, 'lea

Model #41
Best ROC-AUC: 0.8602
Best params: {'colsample_bylevel': 0.8390144719977516, 'colsample_bytree': 0.8844821246070537, 'gamma': 4.358684608480795e-07, 'learning_rate': 0.7988179462781242, 'max_delta_step': 17, 'max_depth': 3, 'min_child_weight': 1, 'n_estimators': 68, 'reg_alpha': 0.0005266983003701547, 'reg_lambda': 276.5424475574225, 'scale_pos_weight': 0.3016410771843142, 'subsample': 0.9923710598637134}

Model #42
Best ROC-AUC: 0.8602
Best params: {'colsample_bylevel': 0.8390144719977516, 'colsample_bytree': 0.8844821246070537, 'gamma': 4.358684608480795e-07, 'learning_rate': 0.7988179462781242, 'max_delta_step': 17, 'max_depth': 3, 'min_child_weight': 1, 'n_estimators': 68, 'reg_alpha': 0.0005266983003701547, 'reg_lambda': 276.5424475574225, 'scale_pos_weight': 0.3016410771843142, 'subsample': 0.9923710598637134}

Model #43
Best ROC-AUC: 0.8602
Best params: {'colsample_bylevel': 0.8390144719977516, 'colsample_bytree': 0.8844821246070537, 'gamma': 4.358684608480795e-07, 'lea

Model #61
Best ROC-AUC: 0.8602
Best params: {'colsample_bylevel': 0.8390144719977516, 'colsample_bytree': 0.8844821246070537, 'gamma': 4.358684608480795e-07, 'learning_rate': 0.7988179462781242, 'max_delta_step': 17, 'max_depth': 3, 'min_child_weight': 1, 'n_estimators': 68, 'reg_alpha': 0.0005266983003701547, 'reg_lambda': 276.5424475574225, 'scale_pos_weight': 0.3016410771843142, 'subsample': 0.9923710598637134}

Model #62
Best ROC-AUC: 0.8602
Best params: {'colsample_bylevel': 0.8390144719977516, 'colsample_bytree': 0.8844821246070537, 'gamma': 4.358684608480795e-07, 'learning_rate': 0.7988179462781242, 'max_delta_step': 17, 'max_depth': 3, 'min_child_weight': 1, 'n_estimators': 68, 'reg_alpha': 0.0005266983003701547, 'reg_lambda': 276.5424475574225, 'scale_pos_weight': 0.3016410771843142, 'subsample': 0.9923710598637134}

Model #63
Best ROC-AUC: 0.8602
Best params: {'colsample_bylevel': 0.8390144719977516, 'colsample_bytree': 0.8844821246070537, 'gamma': 4.358684608480795e-07, 'lea

Model #81
Best ROC-AUC: 0.861
Best params: {'colsample_bylevel': 0.6460198516096589, 'colsample_bytree': 1.0, 'gamma': 0.00019104624384953453, 'learning_rate': 0.04212380066940533, 'max_delta_step': 17, 'max_depth': 17, 'min_child_weight': 5, 'n_estimators': 100, 'reg_alpha': 0.06792882730611047, 'reg_lambda': 5.976432069607228e-09, 'scale_pos_weight': 0.2925631846961957, 'subsample': 0.48180984894100914}

Model #82
Best ROC-AUC: 0.861
Best params: {'colsample_bylevel': 0.6460198516096589, 'colsample_bytree': 1.0, 'gamma': 0.00019104624384953453, 'learning_rate': 0.04212380066940533, 'max_delta_step': 17, 'max_depth': 17, 'min_child_weight': 5, 'n_estimators': 100, 'reg_alpha': 0.06792882730611047, 'reg_lambda': 5.976432069607228e-09, 'scale_pos_weight': 0.2925631846961957, 'subsample': 0.48180984894100914}

Model #83
Best ROC-AUC: 0.861
Best params: {'colsample_bylevel': 0.6460198516096589, 'colsample_bytree': 1.0, 'gamma': 0.00019104624384953453, 'learning_rate': 0.04212380066940533,

Model #101
Best ROC-AUC: 0.8622
Best params: {'colsample_bylevel': 0.9082170492130397, 'colsample_bytree': 0.7236913235694855, 'gamma': 0.0002590416110078333, 'learning_rate': 0.012918613998115362, 'max_delta_step': 9, 'max_depth': 6, 'min_child_weight': 3, 'n_estimators': 83, 'reg_alpha': 8.980915232484609e-07, 'reg_lambda': 1.7631122877203713e-05, 'scale_pos_weight': 0.10487891675488856, 'subsample': 0.8589845107389557}

Model #102
Best ROC-AUC: 0.8622
Best params: {'colsample_bylevel': 0.9082170492130397, 'colsample_bytree': 0.7236913235694855, 'gamma': 0.0002590416110078333, 'learning_rate': 0.012918613998115362, 'max_delta_step': 9, 'max_depth': 6, 'min_child_weight': 3, 'n_estimators': 83, 'reg_alpha': 8.980915232484609e-07, 'reg_lambda': 1.7631122877203713e-05, 'scale_pos_weight': 0.10487891675488856, 'subsample': 0.8589845107389557}

Model #103
Best ROC-AUC: 0.8622
Best params: {'colsample_bylevel': 0.9082170492130397, 'colsample_bytree': 0.7236913235694855, 'gamma': 0.00025904

Model #121
Best ROC-AUC: 0.8622
Best params: {'colsample_bylevel': 0.9082170492130397, 'colsample_bytree': 0.7236913235694855, 'gamma': 0.0002590416110078333, 'learning_rate': 0.012918613998115362, 'max_delta_step': 9, 'max_depth': 6, 'min_child_weight': 3, 'n_estimators': 83, 'reg_alpha': 8.980915232484609e-07, 'reg_lambda': 1.7631122877203713e-05, 'scale_pos_weight': 0.10487891675488856, 'subsample': 0.8589845107389557}

Model #122
Best ROC-AUC: 0.8622
Best params: {'colsample_bylevel': 0.9082170492130397, 'colsample_bytree': 0.7236913235694855, 'gamma': 0.0002590416110078333, 'learning_rate': 0.012918613998115362, 'max_delta_step': 9, 'max_depth': 6, 'min_child_weight': 3, 'n_estimators': 83, 'reg_alpha': 8.980915232484609e-07, 'reg_lambda': 1.7631122877203713e-05, 'scale_pos_weight': 0.10487891675488856, 'subsample': 0.8589845107389557}

Model #123
Best ROC-AUC: 0.8622
Best params: {'colsample_bylevel': 0.9082170492130397, 'colsample_bytree': 0.7236913235694855, 'gamma': 0.00025904

Model #141
Best ROC-AUC: 0.8622
Best params: {'colsample_bylevel': 0.9082170492130397, 'colsample_bytree': 0.7236913235694855, 'gamma': 0.0002590416110078333, 'learning_rate': 0.012918613998115362, 'max_delta_step': 9, 'max_depth': 6, 'min_child_weight': 3, 'n_estimators': 83, 'reg_alpha': 8.980915232484609e-07, 'reg_lambda': 1.7631122877203713e-05, 'scale_pos_weight': 0.10487891675488856, 'subsample': 0.8589845107389557}

Model #142
Best ROC-AUC: 0.8622
Best params: {'colsample_bylevel': 0.9082170492130397, 'colsample_bytree': 0.7236913235694855, 'gamma': 0.0002590416110078333, 'learning_rate': 0.012918613998115362, 'max_delta_step': 9, 'max_depth': 6, 'min_child_weight': 3, 'n_estimators': 83, 'reg_alpha': 8.980915232484609e-07, 'reg_lambda': 1.7631122877203713e-05, 'scale_pos_weight': 0.10487891675488856, 'subsample': 0.8589845107389557}

Model #143
Best ROC-AUC: 0.8622
Best params: {'colsample_bylevel': 0.9082170492130397, 'colsample_bytree': 0.7236913235694855, 'gamma': 0.00025904

Model #161
Best ROC-AUC: 0.8622
Best params: {'colsample_bylevel': 0.9082170492130397, 'colsample_bytree': 0.7236913235694855, 'gamma': 0.0002590416110078333, 'learning_rate': 0.012918613998115362, 'max_delta_step': 9, 'max_depth': 6, 'min_child_weight': 3, 'n_estimators': 83, 'reg_alpha': 8.980915232484609e-07, 'reg_lambda': 1.7631122877203713e-05, 'scale_pos_weight': 0.10487891675488856, 'subsample': 0.8589845107389557}

Model #162
Best ROC-AUC: 0.8622
Best params: {'colsample_bylevel': 0.9082170492130397, 'colsample_bytree': 0.7236913235694855, 'gamma': 0.0002590416110078333, 'learning_rate': 0.012918613998115362, 'max_delta_step': 9, 'max_depth': 6, 'min_child_weight': 3, 'n_estimators': 83, 'reg_alpha': 8.980915232484609e-07, 'reg_lambda': 1.7631122877203713e-05, 'scale_pos_weight': 0.10487891675488856, 'subsample': 0.8589845107389557}

Model #163
Best ROC-AUC: 0.8622
Best params: {'colsample_bylevel': 0.9082170492130397, 'colsample_bytree': 0.7236913235694855, 'gamma': 0.00025904

Model #181
Best ROC-AUC: 0.8622
Best params: {'colsample_bylevel': 0.9082170492130397, 'colsample_bytree': 0.7236913235694855, 'gamma': 0.0002590416110078333, 'learning_rate': 0.012918613998115362, 'max_delta_step': 9, 'max_depth': 6, 'min_child_weight': 3, 'n_estimators': 83, 'reg_alpha': 8.980915232484609e-07, 'reg_lambda': 1.7631122877203713e-05, 'scale_pos_weight': 0.10487891675488856, 'subsample': 0.8589845107389557}

Model #182
Best ROC-AUC: 0.8622
Best params: {'colsample_bylevel': 0.9082170492130397, 'colsample_bytree': 0.7236913235694855, 'gamma': 0.0002590416110078333, 'learning_rate': 0.012918613998115362, 'max_delta_step': 9, 'max_depth': 6, 'min_child_weight': 3, 'n_estimators': 83, 'reg_alpha': 8.980915232484609e-07, 'reg_lambda': 1.7631122877203713e-05, 'scale_pos_weight': 0.10487891675488856, 'subsample': 0.8589845107389557}

Model #183
Best ROC-AUC: 0.8622
Best params: {'colsample_bylevel': 0.9082170492130397, 'colsample_bytree': 0.7236913235694855, 'gamma': 0.00025904

Model #201
Best ROC-AUC: 0.8622
Best params: {'colsample_bylevel': 0.9082170492130397, 'colsample_bytree': 0.7236913235694855, 'gamma': 0.0002590416110078333, 'learning_rate': 0.012918613998115362, 'max_delta_step': 9, 'max_depth': 6, 'min_child_weight': 3, 'n_estimators': 83, 'reg_alpha': 8.980915232484609e-07, 'reg_lambda': 1.7631122877203713e-05, 'scale_pos_weight': 0.10487891675488856, 'subsample': 0.8589845107389557}

Model #202
Best ROC-AUC: 0.8622
Best params: {'colsample_bylevel': 0.9082170492130397, 'colsample_bytree': 0.7236913235694855, 'gamma': 0.0002590416110078333, 'learning_rate': 0.012918613998115362, 'max_delta_step': 9, 'max_depth': 6, 'min_child_weight': 3, 'n_estimators': 83, 'reg_alpha': 8.980915232484609e-07, 'reg_lambda': 1.7631122877203713e-05, 'scale_pos_weight': 0.10487891675488856, 'subsample': 0.8589845107389557}

Model #203
Best ROC-AUC: 0.8622
Best params: {'colsample_bylevel': 0.9082170492130397, 'colsample_bytree': 0.7236913235694855, 'gamma': 0.00025904

Model #221
Best ROC-AUC: 0.8622
Best params: {'colsample_bylevel': 0.9082170492130397, 'colsample_bytree': 0.7236913235694855, 'gamma': 0.0002590416110078333, 'learning_rate': 0.012918613998115362, 'max_delta_step': 9, 'max_depth': 6, 'min_child_weight': 3, 'n_estimators': 83, 'reg_alpha': 8.980915232484609e-07, 'reg_lambda': 1.7631122877203713e-05, 'scale_pos_weight': 0.10487891675488856, 'subsample': 0.8589845107389557}

Model #222
Best ROC-AUC: 0.8622
Best params: {'colsample_bylevel': 0.9082170492130397, 'colsample_bytree': 0.7236913235694855, 'gamma': 0.0002590416110078333, 'learning_rate': 0.012918613998115362, 'max_delta_step': 9, 'max_depth': 6, 'min_child_weight': 3, 'n_estimators': 83, 'reg_alpha': 8.980915232484609e-07, 'reg_lambda': 1.7631122877203713e-05, 'scale_pos_weight': 0.10487891675488856, 'subsample': 0.8589845107389557}

Model #223
Best ROC-AUC: 0.8622
Best params: {'colsample_bylevel': 0.9082170492130397, 'colsample_bytree': 0.7236913235694855, 'gamma': 0.00025904

KeyboardInterrupt: 

In [507]:
xg = result.best_estimator_

In [509]:
params = {'colsample_bylevel': 0.8805332918491438, 'colsample_bytree': 0.9352504152264935, 'gamma': 9.881437744998433e-06, 'learning_rate': 0.19933863096197899, 'max_delta_step': 6, 'max_depth': 18, 'min_child_weight': 3, 'n_estimators': 61, 'reg_alpha': 7.871401820904661e-05, 'reg_lambda': 1000.0, 'scale_pos_weight': 5.390140010639928, 'subsample': 1.0}

In [510]:
model = xgb.XGBClassifier(params)

In [512]:
len(df_train)

19414

In [None]:
pd.merge(df_person, )

In [None]:
{'colsample_bylevel': 0.8805332918491438, 'colsample_bytree': 0.9352504152264935, 'gamma': 9.881437744998433e-06, 'learning_rate': 0.19933863096197899, 'max_delta_step': 6, 'max_depth': 18, 'min_child_weight': 3, 'n_estimators': 61, 'reg_alpha': 7.871401820904661e-05, 'reg_lambda': 1000.0, 'scale_pos_weight': 5.390140010639928, 'subsample': 1.0}

In [None]:
xg

In [None]:
{'colsample_bylevel': 0.8805332918491438, 'colsample_bytree': 0.9352504152264935, 'gamma': 9.881437744998433e-06, 'learning_rate': 0.19933863096197899, 'max_delta_step': 6, 'max_depth': 18, 'min_child_weight': 3, 'n_estimators': 61, 'reg_alpha': 7.871401820904661e-05, 'reg_lambda': 1000.0, 'scale_pos_weight': 5.390140010639928, 'subsample': 1.0}

In [508]:
xg

<bound method XGBModel.get_params of XGBClassifier(base_score=0.5, booster='gbtree',
       colsample_bylevel=0.8390144719977516,
       colsample_bytree=0.8844821246070537, eval_metric='auc',
       gamma=4.358684608480795e-07, learning_rate=0.7988179462781242,
       max_delta_step=17, max_depth=3, min_child_weight=1, missing=None,
       n_estimators=68, n_jobs=1, nthread=None,
       objective='binary:logistic', random_state=0,
       reg_alpha=0.0005266983003701547, reg_lambda=276.5424475574225,
       scale_pos_weight=0.3016410771843142, seed=None, silent=1,
       subsample=0.9923710598637134, tree_method='approx')>

In [None]:
xg.predict()

In [None]:
x

In [None]:
# 0.8659608289470141

In [254]:
len(df_train.columns)

111

In [28]:
df_train = pd.merge(df_seba2, df_laggedFeatures, on = 'person', how = 'inner')
df_train = pd.merge(df_train, df_featuresEventos, on = 'person', how = 'inner')

In [55]:
#stratifiedCVConRFE(df_train,83)

In [None]:
df_train2.columns

In [324]:
len(df_train2.columns)

310

In [264]:
X_new

array([[ 0.,  0.,  0., ..., 17.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 1.,  0.,  0., ..., 13.,  3.,  0.],
       ...,
       [ 0.,  0.,  0., ...,  0.,  0.,  3.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  1.,  0.,  0.]])

In [256]:
df_train = pd.merge(df_seba2,df_featuresEventos[columnas],on = 'person', how = 'inner')

In [331]:
list(df_train2.columns)

['person',
 'ad campaign hit mes 5',
 'brand listing mes 5',
 'checkout mes 5',
 'conversion mes 5',
 'generic listing mes 5',
 'lead mes 5',
 'search engine hit mes 5',
 'searched products mes 5',
 'staticpage mes 5',
 'viewed product mes 5',
 'visited site mes 5',
 'dias_hasta_ultimo',
 'ad campaign hit mes 4',
 'brand listing mes 4',
 'checkout mes 4',
 'conversion mes 4',
 'generic listing mes 4',
 'lead mes 4',
 'search engine hit mes 4',
 'searched products mes 4',
 'staticpage mes 4',
 'viewed product mes 4',
 'visited site mes 4',
 'distan_dias',
 'nuevo_mes_5',
 'cant_dias_dist',
 'modelos_dist',
 'promedio_por_dia_x',
 'promedio_por_mes',
 'mes_primer_entrada',
 'promedio_por_dia_y',
 'promedioCadaCuantosDiasVuelve',
 'cadaCuantasHorasHaceEventos',
 'cadaCuantosMinutosHaceEventos',
 'cadaCuantosDiasEnMes5HaceEventos',
 'cadaCuantasHorasEnUnDiaDelMes5HaceEventos',
 'cadaCuantosMinutosHaceEventosEnUnDiaYHoraDelMes5',
 'cadaCuantosSegundosHaceEventosEnUnDiaYHoraDelMes5',
 'cadaC

In [336]:
df_train2

Unnamed: 0,person,ad campaign hit mes 5,brand listing mes 5,checkout mes 5,conversion mes 5,generic listing mes 5,lead mes 5,search engine hit mes 5,searched products mes 5,staticpage mes 5,...,dia_del_anio_149,dia_del_anio_150,dia_del_anio_151,dia_del_anio_148,dia_del_anio_142,dia_del_anio_144,dia_del_anio_143,dia_del_anio_141,dia_del_anio_136,dia_del_anio_135
0,0008ed71,0.0,0.0,3.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,00091926,15.0,25.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,...,3.0,0.0,16.0,0.0,12.0,0.0,36.0,21.0,1.0,22.0
2,000ba417,1.0,24.0,6.0,1.0,14.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,79.0,0.0,0.0,0.0,0.0
3,000c79fe,1.0,0.0,1.0,0.0,1.0,0.0,1.0,9.0,0.0,...,17.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,000e4d9e,19.0,17.0,1.0,0.0,17.0,0.0,5.0,0.0,0.0,...,61.0,7.0,0.0,7.0,79.0,29.0,5.0,20.0,195.0,0.0
5,000e619d,6.0,11.0,1.0,0.0,8.0,0.0,3.0,6.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9.0,10.0
6,001001be,0.0,0.0,3.0,1.0,3.0,0.0,0.0,17.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,66.0,0.0
7,0010e89a,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0016c4b5,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,001802e4,5.0,0.0,1.0,0.0,4.0,0.0,0.0,4.0,0.0,...,0.0,0.0,19.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [339]:
stratifiedCVConKBestChi2(df_train2, 20)

ROC AUC:  0.7438951231899325


0.7438951231899325

In [347]:
stratifiedCVConRFE(df_train,85)

ROC AUC:  0.8659158489130127


0.8659158489130127

In [75]:
len([ True  True  True  True  True False  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True False  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True False  True  True False  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True False  True False False False False  True
 False False False False False False False  True  True False False False
 False False False False False False False False False  True  True False
 False False  True False  True  True  True  True False False False  True
 False  True  True  True  True  True  True  True False False False  True
 False False  True False False False False  True  True  True  True  True
  True  True  True False False False False  True  True  True  True  True
  True False  True  True False  True False False  True  True  True False
  True])

SyntaxError: invalid syntax (<ipython-input-75-38268f7554b2>, line 1)

In [None]:
pd.DataFrame

In [131]:
stratifiedCVConKBestChi2(df_train,139)

139
Index(['ad campaign hit mes 5', 'brand listing mes 5', 'checkout mes 5',
       'conversion mes 5', 'generic listing mes 5', 'search engine hit mes 5',
       'searched products mes 5', 'staticpage mes 5', 'viewed product mes 5',
       'visited site mes 5',
       ...
       'adcampaignhit_rtbhouse', 'adcampaignhit_criteo',
       'adcampaignhit_google', 'adcampaignhit_manifest', 'adcampaignhit_bing',
       'adcampaignhit_buscape', 'adcampaignhit_indexa',
       'adcampaignhit_datacrush', 'adcampaignhit_emblue',
       'adcampaignhit_yotpo'],
      dtype='object', length=139)
(19414, 139)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


ROC AUC:  0.867080615221008


In [62]:
df_train.shape

(38829, 194)

In [40]:
bestK = 0
score = 0
for i in range(16,308):
    score_act = stratifiedCVConKBestChi2(df_train, i)
    if(score_act > score):
        score = score_act
        bestK = i
        print(bestK)

ROC AUC:  0.7471183047165193
16
ROC AUC:  0.7408347305516803
ROC AUC:  0.7447536628911064
ROC AUC:  0.7415980144686982
ROC AUC:  0.7403796750296956
ROC AUC:  0.7388834764576375
ROC AUC:  0.7436279939059165
ROC AUC:  0.7450721766102887
ROC AUC:  0.7425623032035313
ROC AUC:  0.7455305424240144
ROC AUC:  0.7471542860114162
26
ROC AUC:  0.7448171020426418
ROC AUC:  0.7415481168874734
ROC AUC:  0.7452832525215805
ROC AUC:  0.7463036384661076
ROC AUC:  0.7433187647669037
ROC AUC:  0.7416947689667979
ROC AUC:  0.7454354291100107
ROC AUC:  0.743947411537415
ROC AUC:  0.7447268408206598
ROC AUC:  0.7467662663742793
ROC AUC:  0.7426202780467313
ROC AUC:  0.7475288293833181
38
ROC AUC:  0.7492869865787051
39
ROC AUC:  0.7446141613421963
ROC AUC:  0.743345292889449
ROC AUC:  0.7484170663797656
ROC AUC:  0.7442024218508727
ROC AUC:  0.7467965493134566
ROC AUC:  0.7474228524574658
ROC AUC:  0.7364099314384984
ROC AUC:  0.7459678028384595
ROC AUC:  0.7520568525507123
48
ROC AUC:  0.7463961223125446
R

ValueError: k should be >=0, <= n_features = 193; got 194. Use k='all' to return all features.

In [305]:
bestK

63

In [306]:
stratifiedCVConKBestChi2(df_train, 63)

ROC AUC:  0.8640908891295291


0.8640908891295291

In [308]:
stratifiedCVConRFE(df_train,85)

ROC AUC:  0.8659158489130127


0.8659158489130127

In [309]:
stratifiedCV(df_train)

ROC AUC:  0.8584254040813828


In [194]:
model = xgb.XGBRegressor(objective ='binary:logistic', 
                colsample_bytree = 0.8, learning_rate = 0.1,
                max_depth = 5, n_estimators = 6, scale_pos_weight = 7, min_child_weight=15)

In [198]:
n = 43
rfe = RFE(model, n)
rfe = rfe.fit(X, y.ravel())
   
print('n = ', n)
   
# Selecciono los n features para entrenar
mask = rfe.get_support()
features_X = X.columns[mask]
new_x = X.filter(items=features_X)

n =  43


In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=123)

In [161]:
#list(df_featuresEventos.columns)