In [16]:
import pandas as pd  
import numpy as np  
import matplotlib.pyplot as plt 
%matplotlib inline

from datetime import timedelta

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score, mean_squared_error, confusion_matrix,\
                            f1_score,precision_score,recall_score,classification_report,cohen_kappa_score
import xgboost as xgb
from sklearn import preprocessing

In [17]:
# df_events información de eventos realizado en la plataforma para un conjunto de usuarios hasta el 31/05/2018
df_events = pd.read_csv('../../data/events_up_to_01062018.csv',dtype=object)
# df_train archivo labels_training_set.csv indica para un subconjunto de los usuarios incluidos en el set de 
# eventos events_up_to_01062018.csv si los mismos realizaron una conversión (columna label = 1) o no (columna 
# label = 0) desde el 01/06/2018 hasta el 15/06/2018
df_labels = pd.read_csv('../../data/labels_training_set.csv',dtype=object)
# Personas a predecir conversion.
df_person = pd.read_csv('../../data/trocafone_kaggle_test.csv')

In [18]:
df_labels['label'] = df_labels['label'].astype(int)
df_labels.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19414 entries, 0 to 19413
Data columns (total 2 columns):
person    19414 non-null object
label     19414 non-null int64
dtypes: int64(1), object(1)
memory usage: 303.4+ KB


In [19]:
df_events["timestamp"] = pd.to_datetime(df_events["timestamp"])
df_events["month"] = df_events["timestamp"].dt.month
df_events["day"] = df_events["timestamp"].dt.day
df_events["day_of_week"] = df_events['timestamp'].dt.weekday_name
df_events['day_of_year'] = df_events['timestamp'].dt.dayofyear
df_events['hour'] = df_events['timestamp'].dt.hour
df_events['minute'] = df_events['timestamp'].dt.minute
df_events['week_of_year'] = df_events['timestamp'].dt.weekofyear
df_events['second'] = df_events['timestamp'].dt.second

In [20]:
df_events.columns

Index(['timestamp', 'event', 'person', 'url', 'sku', 'model', 'condition',
       'storage', 'color', 'skus', 'search_term', 'staticpage',
       'campaign_source', 'search_engine', 'channel', 'new_vs_returning',
       'city', 'region', 'country', 'device_type', 'screen_resolution',
       'operating_system_version', 'browser_version', 'month', 'day',
       'day_of_week', 'day_of_year', 'hour', 'minute', 'week_of_year',
       'second'],
      dtype='object')

In [22]:
df_todas_las_personas = df_events[['person']].drop_duplicates()

In [54]:
def evaluarMetricas(y_test,preds):

    meanSquaredError = mean_squared_error(y_test, preds)
    areaDebajoDeCurva = roc_auc_score(y_test, preds)


    print("ROC auc score: ", areaDebajoDeCurva)
    print("Mean squared error: ", meanSquaredError)
    

In [47]:
def testSinSubirAKaggle(dfSinLabels):
    
    df_train_con_labels = pd.merge(dfSinLabels,df_labels,how = 'inner', on ='person')
    df_train_con_labels.drop(columns = ['person'],inplace = True)

    X = df_train_con_labels.drop(columns = ['label'])
    y = df_train_con_labels['label']
    
    xg = xgb.XGBClassifier(objective ='binary:logistic', 
                    colsample_bytree = 0.8, learning_rate = 0.1,
                    max_depth = 5, n_estimators = 6, scale_pos_weight = 9.5, min_child_weight=1)

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=123)

    xg.fit(X_train,y_train)

    predsProb1 = pd.DataFrame(xg.predict_proba(X_test))[1]

    predsLabel = pd.Series(xg.predict(X_test))

    trainAccuracy = accuracy_score(y_train, pd.Series(xg.predict(X_train)))
    testAccuracy = accuracy_score(y_test, predsLabel)

    cohenKappaScore = cohen_kappa_score(y_test, predsLabel)
    classificationReport = classification_report(y_test, predsLabel)
    precisionScore = precision_score(y_test, predsLabel)
    recallScore = recall_score(y_test, predsLabel)
    f1Score = f1_score(y_test, predsLabel)
    matrizDeConfusion = confusion_matrix(y_test, predsLabel)

    meanSquaredError = mean_squared_error(y_test, predsProb1)
    areaDebajoDeCurva = roc_auc_score(y_test, predsProb1)

    # Pruebo todas las metricas.
    # accuracy_score, roc_auc_score, mean_squared_error, confusion_matrix,
    # f1_score,precision_score,recall_score,classification_report,cohen_kappa_score
    # brierScoreLoss.
    # Hay metricas para la probabilidad de que sea 1 y otras metricas para las predicciones de los labels.


    # Metricas con LABELS.

    print()
    print("Metricas con Labels:")
    print()

    print("Train accuracy: ", trainAccuracy)
    print("Test acuracy: ", testAccuracy)
    print()
    print("Classification Report:")
    print(classificationReport)
    print()
    print("Precision Score: ",precisionScore)
    print("Recall Score: ",recallScore)
    print("F1 Score: ",f1Score)
    print("Cohen Kappa Score: ",cohenKappaScore)
    print()
    print("Confusion matrix: ")
    print(matrizDeConfusion)

    # Metricas con PROBABILIDADES. 

    print()
    print("Metricas sin Labels:")
    print()

    print("ROC auc score: ", areaDebajoDeCurva)
    print("Mean squared error: ", meanSquaredError)
    
    return predsProb1, y_test

In [121]:
df_seba = pd.read_csv('features_seba.csv')
df_featuresNuevos = pd.read_csv('fetures_nuevos_santi.csv')

In [122]:
df_santiEventos = pd.read_csv('Santi_FeaturesConEventos.csv')

In [123]:
df_santiTime = pd.read_csv('santi_timefeatures.csv')

In [135]:
predsTime, y_test = testSinSubirAKaggle(df_santiTime)


Metricas con Labels:

Train accuracy:  0.9032901938059366
Test acuracy:  0.8820499613700746

Classification Report:
              precision    recall  f1-score   support

           0       0.96      0.91      0.94      3692
           1       0.14      0.28      0.19       191

   micro avg       0.88      0.88      0.88      3883
   macro avg       0.55      0.60      0.56      3883
weighted avg       0.92      0.88      0.90      3883


Precision Score:  0.144
Recall Score:  0.28272251308900526
F1 Score:  0.19081272084805653
Cohen Kappa Score:  0.1343929116565945

Confusion matrix: 
[[3371  321]
 [ 137   54]]

Metricas sin Labels:

ROC auc score:  0.7305962233327472
Mean squared error:  0.16826066383951369


In [134]:
predsEventos, y_test = testSinSubirAKaggle(df_santiEventos)


Metricas con Labels:

Train accuracy:  0.8881591655398879
Test acuracy:  0.8750965748132887

Classification Report:
              precision    recall  f1-score   support

           0       0.97      0.89      0.93      3692
           1       0.21      0.54      0.30       191

   micro avg       0.88      0.88      0.88      3883
   macro avg       0.59      0.72      0.61      3883
weighted avg       0.94      0.88      0.90      3883


Precision Score:  0.206
Recall Score:  0.5392670157068062
F1 Score:  0.29811866859623737
Cohen Kappa Score:  0.24432609073359468

Confusion matrix: 
[[3295  397]
 [  88  103]]

Metricas sin Labels:

ROC auc score:  0.8267869399238768
Mean squared error:  0.15509996464630738


In [167]:
predsSeba, y_test = testSinSubirAKaggle(df_seba)


Metricas con Labels:

Train accuracy:  0.860858927306677
Test acuracy:  0.8526912181303116

Classification Report:
              precision    recall  f1-score   support

           0       0.97      0.87      0.92      3670
           1       0.21      0.60      0.31       213

   micro avg       0.85      0.85      0.85      3883
   macro avg       0.59      0.73      0.61      3883
weighted avg       0.93      0.85      0.88      3883


Precision Score:  0.20717781402936378
Recall Score:  0.596244131455399
F1 Score:  0.3075060532687651
Cohen Kappa Score:  0.24612690158915496

Confusion matrix: 
[[3184  486]
 [  86  127]]

Metricas sin Labels:

ROC auc score:  0.8519860306251679
Mean squared error:  0.1500565326519106


In [126]:
predsFeaturesNuevos, y_test = testSinSubirAKaggle(df_featuresNuevos)


Metricas con Labels:

Train accuracy:  0.8871933552250338
Test acuracy:  0.886170486737059

Classification Report:
              precision    recall  f1-score   support

           0       0.97      0.91      0.94      3692
           1       0.21      0.49      0.30       191

   micro avg       0.89      0.89      0.89      3883
   macro avg       0.59      0.70      0.62      3883
weighted avg       0.93      0.89      0.91      3883


Precision Score:  0.214123006833713
Recall Score:  0.49214659685863876
F1 Score:  0.29841269841269846
Cohen Kappa Score:  0.24677783473302817

Confusion matrix: 
[[3347  345]
 [  97   94]]

Metricas sin Labels:

ROC auc score:  0.8290367456450342
Mean squared error:  0.16026689238951006


In [205]:
predsPromedio1 = (0.95*predsSeba + 0.05*predsFeaturesNuevos) / 2

In [226]:
predsPromedio = (0.8*predsSeba + 0.10*predsFeaturesNuevos + 0.05*predsEventos + 0.05*predsTime)/4

#### 0.8519860306251679

In [278]:
evaluarMetricas(y_test,predsPromedio)

ROC auc score:  0.8501605454708268
Mean squared error:  0.05080793616983466


In [327]:
union = pd.merge(df_seba,df_santiEventos[columnas])

In [328]:
union = pd.merge(union,df_featuresNuevos[columnasfeaturesNuevos])

In [329]:
union = pd.merge(union,df_santiTime[columnasTiempo])

In [321]:
columnasfeaturesNuevos = ['person','promedioCadaCuantosDiasVuelve','cadaCuantosDiasEnMes5HaceEventos','cadaCuantasHorasEnUnDiaDelMes5HaceEventos',
 'cadaCuantosMinutosHaceEventosEnUnDiaYHoraDelMes5',
 'cadaCuantosSegundosHaceEventosEnUnDiaYHoraDelMes5',
 'cadaCuantosDiasHaceConversion',
 'cadaCuantosDiasHaceCheckout']

In [314]:
list(df_featuresNuevos.columns)

['person',
 'promedioCadaCuantosDiasVuelve',
 'cadaCuantasHorasHaceEventos',
 'cadaCuantosMinutosHaceEventos',
 'cadaCuantosDiasEnMes5HaceEventos',
 'cadaCuantasHorasEnUnDiaDelMes5HaceEventos',
 'cadaCuantosMinutosHaceEventosEnUnDiaYHoraDelMes5',
 'cadaCuantosSegundosHaceEventosEnUnDiaYHoraDelMes5',
 'cadaCuantasHorasHaceEventosEnSuUltimaConexionDelMes5',
 'cadaCuantosMinutosHaceEventosEnSuUltimaConexionDelMes5',
 'cadaCuantosSegundosHaceEventosEnSuUltimaConexionDelMes5',
 'cadaCuantosDiasHaceConversion',
 'cadaCuantosDiasHaceCheckout']

In [326]:
columnasTiempo = ['person',
 'Friday',
 'Monday',
 'Saturday',
 'Sunday',
 'Thursday',
 'Tuesday',
 'Wednesday',
'primer_quincena',
 'segunda_quincena',
'mes_1',
 'mes_2',
 'mes_3',
 'mes_4',
 'mes_5',
'hora_rango_0-3',
 'hora_rango_4-7',
 'hora_rango_8-11',
 'hora_rango_12-15',
 'hora_rango_16-19',
 'hora_rango_20-23']

In [298]:
list(df_santiTime.columns)

['Unnamed: 0',
 'person',
 'mes_1',
 'mes_2',
 'mes_3',
 'mes_4',
 'mes_5',
 'dia_1',
 'dia_2',
 'dia_3',
 'dia_4',
 'dia_5',
 'dia_6',
 'dia_7',
 'dia_8',
 'dia_9',
 'dia_10',
 'dia_11',
 'dia_12',
 'dia_13',
 'dia_14',
 'dia_15',
 'dia_16',
 'dia_17',
 'dia_18',
 'dia_19',
 'dia_20',
 'dia_21',
 'dia_22',
 'dia_23',
 'dia_24',
 'dia_25',
 'dia_26',
 'dia_27',
 'dia_28',
 'dia_29',
 'dia_30',
 'dia_31',
 'primer_quincena',
 'segunda_quincena',
 'Friday',
 'Monday',
 'Saturday',
 'Sunday',
 'Thursday',
 'Tuesday',
 'Wednesday',
 'fin_de_semana',
 'martes_miercoles_jueves',
 'hora_0',
 'hora_1',
 'hora_2',
 'hora_3',
 'hora_4',
 'hora_5',
 'hora_6',
 'hora_7',
 'hora_8',
 'hora_9',
 'hora_10',
 'hora_11',
 'hora_12',
 'hora_13',
 'hora_14',
 'hora_15',
 'hora_16',
 'hora_17',
 'hora_18',
 'hora_19',
 'hora_20',
 'hora_21',
 'hora_22',
 'hora_23',
 'hora_rango_0-3',
 'hora_rango_4-7',
 'hora_rango_8-11',
 'hora_rango_12-15',
 'hora_rango_16-19',
 'hora_rango_20-23',
 'Morning',
 'Afterno

In [304]:
columnasEventos =  ['person',
 'checkout_iPhone 6S',
 'checkout_Samsung Galaxy J5',
 'checkout_Samsung Galaxy J7 Prime',
 'checkout_iPhone 7 Plus',
 'checkout_iPhone 4G',
 'checkout_Samsung Galaxy On 7',
 'checkout_Motorola Moto X Play 4G Dual',
 'checkout_Samsung Galaxy J7 2016 Metal',
 'checkout_iPhone 7',
 'checkout_Motorola Moto X2',
 'checkout_Samsung Galaxy S7',
 'checkout_iPhone 4S',
 'checkout_Lenovo Vibe A7010 Dual Chip',
 'checkout_Motorola Moto G4 Plus',
 'checkout_iPhone 5s',
 'viewedproduct_Cinza espacial',
 'viewedproduct_Preto',
 'viewedproduct_Branco',
 'viewedproduct_Prateado',
 'viewedproduct_Ouro Rosa',
 'viewedproduct_Prata',
 'viewedproduct_Rosa',
 'viewedproduct_Dourado',
 'viewedproduct_Azul',
 'viewedproduct_Ametista',
 'viewedproduct_Cinza',
 'viewedproduct_Preto Matte',
 'viewedproduct_Verde',
 'viewedproduct_Vermelho',
 'viewedproduct_Preto Brilhante',
 'viewedproduct_128GB',
 'viewedproduct_16GB',
 'viewedproduct_256GB',
 'viewedproduct_32GB',
 'viewedproduct_4GB',
 'viewedproduct_512MB',
 'viewedproduct_64GB',
 'viewedproduct_8GB',
 'viewedproduct_Bom',
 'viewedproduct_Bom - Sem Touch ID',
 'viewedproduct_Excelente',
 'viewedproduct_Muito Bom',
 'viewedproduct_Novo',
 'vuelve',
 'visitedsite_Sao Paulo',
 'visitedsite_region desconocida',
 'visitedsite_Minas Gerais',
 'visitedsite_Rio de Janeiro',
 'visitedsite_Bahia',
 'visitedsite_Pernambuco',
 'visitedsite_Ceara',
 'visitedsite_Parana',
 'visitedsite_Rio Grande do Sul',
 'visitedsite_Espirito Santo',
 'visitedsite_Federal District',
 'visitedsite_Maranhao',
 'visitedsite_Goias',
 'visitedsite_Santa Catarina',
 'visitedsite_Para',
 'visitedsite_Rio Grande do Norte',
 'visitedsite_Paraíba',
 'visitedsite_Piaui',
 'visitedsite_Alagoas',
 'visitedsite_Sergipe',
 'visitedsite_Amazonas',
 'visitedsite_Mato Grosso',
 'visitedsite_Tocantins',
 'visitedsite_Mato Grosso do Sul',
 'visitedsite_Rondonia',
 'visitedsite_Amapa',
 'visitedsite_Acre',
 'visitedsite_Roraima',
 'visitedsite_Computer',
 'visitedsite_Smartphone',
 'visitedsite_Tablet',
 'visitedsite_device desconocido',
 'visitedsite_360x640',
 'visitedsite_1366x768',
 'visitedsite_320x570',
 'visitedsite_1600x900',
 'visitedsite_320x534',
 'visitedsite_1280x1024',
 'visitedsite_412x732',
 'visitedsite_375x667',
 'visitedsite_414x736',
 'visitedsite_320x490',
 'visitedsite_1536x864',
 'visitedsite_320x569',
 'visitedsite_1920x1080',
 'visitedsite_1024x768',
 'visitedsite_1324x745',
 'visitedsite_Android 7',
 'visitedsite_Windows 7 ',
 'visitedsite_Windows 10 ',
 'visitedsite_Android 5.1',
 'visitedsite_Android 6.0.1',
 'visitedsite_Android 7.1.1',
 'visitedsite_Android 5.0.2',
 'visitedsite_Android 6',
 'visitedsite_Android 4.4.2',
 'visitedsite_iOS 11.2.6',
 'visitedsite_Windows Phone 8.1',
 'visitedsite_Android 4.1.2',
 'visitedsite_Windows Vista ',
 'visitedsite_iOS 10.3.3',
 'visitedsite_Android 8',
 'visitedsite_Chrome Mobile 66.0',
 'visitedsite_Chrome 65.0',
 'visitedsite_Chrome 66.0',
 'visitedsite_Chrome 64.0',
 'visitedsite_Chrome 63.0',
 'visitedsite_Chrome Mobile 65.0',
 'visitedsite_Chrome Mobile 64.0',
 'visitedsite_Chrome Mobile 39',
 'visitedsite_Chrome Mobile 63.0',
 'visitedsite_Samsung Internet 3.3',
 'visitedsite_Samsung Internet 6.4',
 'visitedsite_Mobile Safari 11',
 'visitedsite_IE Mobile 11',
 'visitedsite_Edge 16.16299',
 'visitedsite_Chrome Mobile 36.0',
 'adcampaignhit_rtbhouse',
 'adcampaignhit_criteo',
 'adcampaignhit_google',
 'adcampaignhit_zanox',
 'adcampaignhit_manifest',
 'adcampaignhit_bing',
 'adcampaignhit_afilio',
 'adcampaignhit_buscape',
 'adcampaignhit_rakuten',
 'adcampaignhit_FacebookAds',
 'adcampaignhit_indexa',
 'adcampaignhit_datacrush',
 'adcampaignhit_emblue',
 'adcampaignhit_blog',
 'adcampaignhit_yotpo']

In [330]:
a,b = testSinSubirAKaggle(union)


Metricas con Labels:

Train accuracy:  0.8581546584250853
Test acuracy:  0.8408447077002318

Classification Report:
              precision    recall  f1-score   support

           0       0.97      0.85      0.91      3670
           1       0.20      0.61      0.30       213

   micro avg       0.84      0.84      0.84      3883
   macro avg       0.58      0.73      0.60      3883
weighted avg       0.93      0.84      0.88      3883


Precision Score:  0.19548872180451127
Recall Score:  0.6103286384976526
F1 Score:  0.296127562642369
Cohen Kappa Score:  0.23233964089387527

Confusion matrix: 
[[3135  535]
 [  83  130]]

Metricas sin Labels:

ROC auc score:  0.8559184352253394
Mean squared error:  0.14601759983542648
