In [187]:
import pandas as pd  
import numpy as np  
import matplotlib.pyplot as plt 
%matplotlib inline

from datetime import timedelta

from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score, cross_val_predict
from sklearn.metrics import accuracy_score, roc_auc_score, mean_squared_error, confusion_matrix,\
                            f1_score,precision_score,recall_score,classification_report,cohen_kappa_score
import xgboost as xgb
from sklearn import preprocessing
from sklearn.feature_selection import RFE, SelectKBest, chi2


In [2]:
# df_events información de eventos realizado en la plataforma para un conjunto de usuarios hasta el 31/05/2018
df_events = pd.read_csv('../../data/events_up_to_01062018.csv',dtype=object)
# df_train archivo labels_training_set.csv indica para un subconjunto de los usuarios incluidos en el set de 
# eventos events_up_to_01062018.csv si los mismos realizaron una conversión (columna label = 1) o no (columna 
# label = 0) desde el 01/06/2018 hasta el 15/06/2018
df_labels = pd.read_csv('../../data/labels_training_set.csv',dtype=object)
# Personas a predecir conversion.
df_person = pd.read_csv('../../data/trocafone_kaggle_test.csv')

In [3]:
df_labels['label'] = df_labels['label'].astype(int)
df_labels.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19414 entries, 0 to 19413
Data columns (total 2 columns):
person    19414 non-null object
label     19414 non-null int64
dtypes: int64(1), object(1)
memory usage: 303.4+ KB


In [4]:
df_events["timestamp"] = pd.to_datetime(df_events["timestamp"])
df_events["month"] = df_events["timestamp"].dt.month
df_events["day"] = df_events["timestamp"].dt.day
df_events["day_of_week"] = df_events['timestamp'].dt.weekday_name
df_events['day_of_year'] = df_events['timestamp'].dt.dayofyear
df_events['hour'] = df_events['timestamp'].dt.hour
df_events['minute'] = df_events['timestamp'].dt.minute
df_events['week_of_year'] = df_events['timestamp'].dt.weekofyear
df_events['second'] = df_events['timestamp'].dt.second

In [5]:
df_todas_las_personas = df_events[['person']].drop_duplicates()

In [6]:
# Me quedo solo con los users que tienen label.
df_events_con_labels = pd.merge(df_events,df_labels,on = 'person',how = 'inner')

In [174]:
def stratifiedCV(data):
    
    df_train = pd.merge(data,df_labels,how = 'inner', on ='person')
    df_train.drop(columns = ['person'],inplace = True)
    
    X = df_train.drop(columns = ['label'])
    y = df_train['label']
    
    model = xgb.XGBRegressor(objective ='binary:logistic', 
                colsample_bytree = 0.8, learning_rate = 0.1,
                max_depth = 5, n_estimators = 6, scale_pos_weight = 7, min_child_weight=15)
    
    stratifiedKFold = StratifiedKFold(n_splits = 10, random_state = 7)
    
    results = cross_val_score(model, X, y, cv = stratifiedKFold, scoring = 'roc_auc')
    #results2 = cross_val_predict(model, X, y, cv = stratifiedKFold, method='predict_proba')
    
    print("ROC AUC: ",results.mean())

In [266]:
def stratifiedCVConRFE(data, K):
    
    df_train = pd.merge(data,df_labels,how = 'inner', on ='person')
    df_train.drop(columns = ['person'],inplace = True)
    
    X = df_train.drop(columns = ['label'])
    y = df_train['label']
    
    model = xgb.XGBRegressor(objective ='binary:logistic', 
                colsample_bytree = 0.8, learning_rate = 0.1,
                max_depth = 5, n_estimators = 6, scale_pos_weight = 7, min_child_weight=15)
    
    stratifiedKFold = StratifiedKFold(n_splits = 10, random_state = 7)
    
    rfe = RFE(model, K)
    rfe = rfe.fit(X, y.ravel())

    # Selecciono los n features para entrenar
    mask = rfe.get_support()
    features_X = X.columns[mask]
    X = X.filter(items = features_X)
    
    results = cross_val_score(model, X, y, cv = stratifiedKFold, scoring = 'roc_auc')
        
    print("ROC AUC: ",results.mean())

In [267]:
def stratifiedCVConKBestChi2(data, K):
    
    df_train = pd.merge(data,df_labels,how = 'inner', on ='person')
    df_train.drop(columns = ['person'],inplace = True)
    
    X = df_train.drop(columns = ['label'])
    y = df_train['label']
    
    model = xgb.XGBRegressor(objective ='binary:logistic', 
                colsample_bytree = 0.8, learning_rate = 0.1,
                max_depth = 5, n_estimators = 6, scale_pos_weight = 7, min_child_weight=15)
    
    stratifiedKFold = StratifiedKFold(n_splits = 10, random_state = 7)
    
    X = SelectKBest(chi2, K = K).fit_transform(X, y)
    
    results = cross_val_score(model, X, y, cv = stratifiedKFold, scoring = 'roc_auc')
        
    print("ROC AUC: ",results.mean())

In [None]:
df

In [29]:
# 0.845696189422742
df_seba = pd.read_csv('../features_varios.csv')

In [75]:
# 0.8520506767923741
df_seba2 = pd.read_csv('features_seba.csv')

In [89]:
# 0.8201881323017529
df_laggedFeatures  = pd.read_csv('fetures_nuevos_santi.csv')

In [91]:
# 0.8300765707861256
df_featuresEventos = pd.read_csv('Santi_FeaturesConEventos.csv')

In [253]:
columnas = ['person','checkout_iPhone 6S',
 'checkout_Samsung Galaxy J5',
 'checkout_Samsung Galaxy J7 Prime',
 'checkout_iPhone 7 Plus',
 'checkout_iPhone 4G',
 'checkout_Samsung Galaxy On 7',
 'checkout_Motorola Moto X Play 4G Dual',
 'checkout_Samsung Galaxy J7 2016 Metal',
 'checkout_iPhone 7',
 'checkout_Motorola Moto X2',
 'checkout_Samsung Galaxy S7',
 'checkout_iPhone 4S',
 'checkout_Lenovo Vibe A7010 Dual Chip',
 'checkout_Motorola Moto G4 Plus',
 'checkout_iPhone 5s',
 'viewedproduct_128GB',
 'viewedproduct_16GB',
 'viewedproduct_256GB',
 'viewedproduct_32GB',
 'viewedproduct_4GB',
 'viewedproduct_512MB',
 'viewedproduct_64GB',
 'viewedproduct_8GB',
 'viewedproduct_Bom',
 'viewedproduct_Bom - Sem Touch ID',
 'viewedproduct_Excelente',
 'viewedproduct_Muito Bom',
 'viewedproduct_Novo',
 'vuelve',
 'visitedsite_Sao Paulo',
 'visitedsite_region desconocida',
 'visitedsite_Minas Gerais',
 'visitedsite_Rio de Janeiro',
 'visitedsite_Bahia',
 'visitedsite_Pernambuco',
 'visitedsite_Ceara',
 'visitedsite_Parana',
 'visitedsite_Rio Grande do Sul',
 'visitedsite_Espirito Santo',
 'visitedsite_Federal District',
 'visitedsite_Maranhao',
 'visitedsite_Goias',
 'visitedsite_Santa Catarina',
 'visitedsite_Para',
 'visitedsite_Rio Grande do Norte',
 'visitedsite_Paraíba',
 'visitedsite_Piaui',
 'visitedsite_Alagoas',
 'visitedsite_Sergipe',
 'visitedsite_Amazonas',
 'visitedsite_Mato Grosso',
 'visitedsite_Tocantins',
 'visitedsite_Mato Grosso do Sul',
 'visitedsite_Rondonia',
 'visitedsite_Amapa',
 'visitedsite_Acre',
 'visitedsite_Roraima',
 'visitedsite_Computer',
 'visitedsite_Smartphone',
 'visitedsite_Tablet',
 'visitedsite_device desconocido',
 'visitedsite_360x640',
 'visitedsite_1366x768',
 'visitedsite_320x570',
 'visitedsite_1600x900',
 'visitedsite_320x534',
 'visitedsite_1280x1024',
 'visitedsite_320x569',
 'visitedsite_1920x1080',
 'visitedsite_1024x768',
 'visitedsite_1324x745',
 'visitedsite_Android 7',
 'visitedsite_Windows 7 ',
 'visitedsite_Windows 10 ',
]

In [254]:
len(df_train.columns)

111

In [264]:
X_new

array([[ 0.,  0.,  0., ..., 17.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 1.,  0.,  0., ..., 13.,  3.,  0.],
       ...,
       [ 0.,  0.,  0., ...,  0.,  0.,  3.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  1.,  0.,  0.]])

In [256]:
df_train = pd.merge(df_seba2,df_featuresEventos[columnas],on = 'person', how = 'inner')

In [257]:
stratifiedCVConRFE(df_train)

ROC AUC:  0.8659158489130127


In [212]:
stratifiedCV(df_train)

ROC AUC:  0.863210183684831


In [194]:
model = xgb.XGBRegressor(objective ='binary:logistic', 
                colsample_bytree = 0.8, learning_rate = 0.1,
                max_depth = 5, n_estimators = 6, scale_pos_weight = 7, min_child_weight=15)

In [198]:
n = 43
rfe = RFE(model, n)
rfe = rfe.fit(X, y.ravel())
   
print('n = ', n)
   
# Selecciono los n features para entrenar
mask = rfe.get_support()
features_X = X.columns[mask]
new_x = X.filter(items=features_X)

n =  43


In [161]:
#list(df_featuresEventos.columns)

In [93]:
# 0.7286368575010472
df_timeFeatures = pd.read_csv('santi_timefeatures.csv')

In [97]:
list(df_seba2.columns)

['person',
 'ad campaign hit mes 5',
 'brand listing mes 5',
 'checkout mes 5',
 'conversion mes 5',
 'generic listing mes 5',
 'lead mes 5',
 'search engine hit mes 5',
 'searched products mes 5',
 'staticpage mes 5',
 'viewed product mes 5',
 'visited site mes 5',
 'dias_hasta_ultimo',
 'ad campaign hit mes 4',
 'brand listing mes 4',
 'checkout mes 4',
 'conversion mes 4',
 'generic listing mes 4',
 'lead mes 4',
 'search engine hit mes 4',
 'searched products mes 4',
 'staticpage mes 4',
 'viewed product mes 4',
 'visited site mes 4',
 'distan_dias',
 'nuevo_mes_5',
 'cant_dias_dist',
 'modelos_dist',
 'promedio_por_dia_x',
 'promedio_por_mes',
 'mes_primer_entrada',
 'promedio_por_dia_y']

In [None]:
d