In [1]:
import pandas as pd  
import numpy as np  
import matplotlib.pyplot as plt 
%matplotlib inline

from datetime import timedelta

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score, mean_squared_error, confusion_matrix,\
                            f1_score,precision_score,recall_score,classification_report,cohen_kappa_score
import xgboost as xgb
from sklearn import preprocessing

In [651]:
# df_events: events performed on the platform by a set of users, up to 2018-05-31.
# Every column is read as a string (dtype=object); conversions happen in later cells.
df_events = pd.read_csv(
    '../../data/events_up_to_01062018.csv',
    dtype=object,
)
# df_labels: labels_training_set.csv states, for a subset of the users present in
# events_up_to_01062018.csv, whether they converted (label = 1) or not (label = 0)
# between 2018-06-01 and 2018-06-15.
df_labels = pd.read_csv(
    '../../data/labels_training_set.csv',
    dtype=object,
)
# df_person: the users whose conversion has to be predicted.
df_person = pd.read_csv('../../data/trocafone_kaggle_test.csv')

In [652]:
# The labels were read as strings; cast them to integers (0 = no conversion, 1 = conversion).
df_labels = df_labels.astype({'label': int})

In [653]:
# Parse the raw timestamp strings and derive calendar features from them.
df_events["timestamp"] = pd.to_datetime(df_events["timestamp"])
df_events["month"] = df_events["timestamp"].dt.month
df_events["day"] = df_events["timestamp"].dt.day
# `.dt.weekday_name` was deprecated in pandas 0.23 and removed in 1.0;
# `.dt.day_name()` returns the same English day names.
df_events["day_of_week"] = df_events['timestamp'].dt.day_name()
df_events['day_of_year'] = df_events['timestamp'].dt.dayofyear
df_events['hour'] = df_events['timestamp'].dt.hour
df_events['minute'] = df_events['timestamp'].dt.minute
# `.dt.weekofyear` was removed in pandas 2.0; `isocalendar().week` yields the same
# ISO week number (as a nullable UInt32 column rather than int64).
df_events['week_of_year'] = df_events['timestamp'].dt.isocalendar().week

## Cantidad de veces que realiza un evento dado.

In [821]:
# Count, for every person, how many events of each type they generated:
# one-hot encode the event type, attach the person id, and sum per person.
event_dummies = pd.get_dummies(df_events['event']).assign(person=df_events['person'])
cant_por_evento = event_dummies.groupby('person').sum().reset_index()
# Keep only the event types that are used as model features.
cant_por_evento = cant_por_evento[
    ['person', 'brand listing', 'checkout', 'search engine hit', 'ad campaign hit', 'generic listing']
]

In [822]:
# One 0/1 flag per calendar month telling whether the person had any event in that month.
# `prefix='mes'` names every column after the month it really represents (mes_1, mes_2, ...),
# instead of assigning a hard-coded list of names that raises when the number of months in
# the data is not five, or silently mislabels the columns when the months are not 1..5.
month_dummies = pd.get_dummies(df_events['month'], prefix='mes')
df_month2 = (
    pd.concat([df_events['person'], month_dummies], axis=1)
    .drop_duplicates()  # one row per (person, month) so the sum below stays 0/1
    .groupby('person')
    .sum()
    .reset_index()
)

In [823]:
# Preview the first rows of the per-person month flags.
df_month2.head(5)

Unnamed: 0,person,mes_1,mes_2,mes_3,mes_4,mes_5
0,0008ed71,0,0,0,0,1
1,00091926,0,0,0,0,1
2,00091a7a,0,0,1,0,0
3,000ba417,0,0,0,0,1
4,000c79fe,0,0,0,0,1


In [824]:
# Combine month flags and event counts into one feature row per person
# (inner join: only persons present in both frames are kept).
df_train = df_month2.merge(cant_por_evento, how='inner', on='person')

In [825]:
# Append exact copies of the 'checkout' and 'brand listing' counts as two extra feature columns.
df_train = (
    df_train
    .assign(checkout2=df_train['checkout'])
    .assign(**{'brand listing2': df_train['brand listing']})
)

In [826]:
# Scale every feature column (everything except the 'person' identifier).
x = df_train.drop(columns=['person'])
# NOTE: despite its name, this variable holds a RobustScaler, not a MaxAbsScaler.
maxAbsScaler = preprocessing.RobustScaler()
x_scaled = maxAbsScaler.fit(x).transform(x)
# The scaled values come back as a plain array, so the rebuilt frame has positional
# column labels (0, 1, 2, ...) next to the 'person' column.
df_train = pd.concat(
    [df_train['person'], pd.DataFrame(x_scaled)],
    axis=1,
)

In [827]:
# Preview the first rows of the scaled feature frame.
df_train.head(5)

Unnamed: 0,person,0,1,2,3,4,5,6,7,8,9,10,11
0,0008ed71,0.0,0.0,0.0,0.0,0.0,0.0,2.0,-0.333333,-0.5,0.0,2.0,0.0
1,00091926,0.0,0.0,0.0,0.0,0.0,6.25,1.0,-0.333333,3.25,-0.25,1.0,6.25
2,00091a7a,0.0,0.0,1.0,0.0,-1.0,1.25,-1.0,-0.333333,-0.25,-0.25,-1.0,1.25
3,000ba417,0.0,0.0,0.0,0.0,0.0,6.0,5.0,0.0,-0.25,3.25,5.0,6.0
4,000c79fe,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.25,0.0,0.0,0.0


In [828]:
# DataFrame with the labels attached.

In [829]:
# Attach the conversion label to each person's features (the inner join keeps only
# labelled persons) and discard the identifier, which is not a feature.
df_train_con_labels = (
    df_train
    .merge(df_labels, how='inner', on='person')
    .drop('person', axis=1)
)
df_train_con_labels.shape

(19414, 13)

In [830]:
# Preview the first rows of the labelled training frame.
df_train_con_labels.head(5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,label
0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,-0.333333,-0.5,0.0,2.0,0.0,0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.25,0.0,0.0,0.0,0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.333333,0.75,0.75,0.0,0.0,0
3,1.0,1.0,0.0,1.0,0.0,41.25,14.0,4.0,6.75,6.75,14.0,41.25,0
4,0.0,0.0,0.0,0.0,0.0,0.25,1.0,-0.333333,-0.5,0.0,1.0,0.25,0


In [838]:
# Split the labelled frame positionally: the last column is the target,
# every column before it is a feature.
y = df_train_con_labels.iloc[:, -1]
X = df_train_con_labels.iloc[:, :-1]

In [832]:
# Hold out 20% of the labelled rows for evaluation; the fixed random_state keeps the
# split reproducible. This must actually run: the fit and metrics cells further down
# read X_train, y_train and y_test, which are undefined on a fresh kernel otherwise.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=123)

In [839]:
# Small boosted-tree model trained with the hinge objective; hyperparameters are fixed by hand.
xg_reg = xgb.XGBRegressor(
    objective='binary:hinge',
    n_estimators=6,
    max_depth=5,
    learning_rate=0.1,
    colsample_bytree=0.8,
    min_child_weight=1,
    scale_pos_weight=9.5,
)

In [840]:
# Fit on the training split only; the call is the cell's last expression,
# so the fitted model's repr is displayed as the output.
xg_reg.fit(X=X_train, y=y_train)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=0.8, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=5, min_child_weight=1, missing=None, n_estimators=6,
       n_jobs=1, nthread=None, objective='binary:hinge', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=9.5, seed=None,
       silent=True, subsample=1)

In [841]:
# Show only the first rows; a bare `df_train` renders the whole frame and bloats the notebook.
df_train.head(10)

Unnamed: 0,person,0,1,2,3,4,5,6,7,8,9,10,11
0,0008ed71,0.0,0.0,0.0,0.0,0.0,0.00,2.0,-0.333333,-0.50,0.00,2.0,0.00
1,00091926,0.0,0.0,0.0,0.0,0.0,6.25,1.0,-0.333333,3.25,-0.25,1.0,6.25
2,00091a7a,0.0,0.0,1.0,0.0,-1.0,1.25,-1.0,-0.333333,-0.25,-0.25,-1.0,1.25
3,000ba417,0.0,0.0,0.0,0.0,0.0,6.00,5.0,0.000000,-0.25,3.25,5.0,6.00
4,000c79fe,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.000000,-0.25,0.00,0.0,0.00
5,000e4d9e,0.0,0.0,0.0,0.0,0.0,4.25,0.0,1.333333,4.25,4.00,0.0,4.25
6,000e619d,0.0,0.0,0.0,0.0,0.0,2.75,0.0,0.666667,1.00,1.75,0.0,2.75
7,001001be,0.0,0.0,0.0,0.0,0.0,0.00,2.0,-0.333333,-0.50,0.50,2.0,0.00
8,0010e89a,0.0,0.0,0.0,0.0,0.0,0.00,0.0,-0.333333,-0.25,-0.25,0.0,0.00
9,0016c4b5,0.0,0.0,0.0,0.0,0.0,0.00,0.0,-0.333333,-0.25,-0.25,0.0,0.00


In [847]:
# Feature rows for the persons whose conversion must be predicted
# (inner join: persons without features are dropped).
df_predecir = df_train.merge(df_person, how='inner', on='person')

In [848]:
# Same rows without the identifier column, i.e. the model inputs.
df_predecir_sin_person = df_predecir.drop('person', axis=1)

In [845]:
# Predictions for the persons in the submission set (one value per row of df_predecir).
preds = xg_reg.predict(df_predecir_sin_person)

In [849]:
# Start the submission frame from an independent copy of the person identifiers.
df_final = df_predecir.loc[:, ['person']].copy()

In [851]:
# Attach the predictions as the 'label' column; the Series is aligned on the row index.
df_final = df_final.assign(label=pd.Series(preds))

In [855]:
# Write the submission file, without the row index.
df_final.to_csv(
    'predicciones.csv',
    index=False,
    encoding='utf-8',
)

In [836]:
# Evaluate the fitted model with all the imported metrics:
# accuracy_score, roc_auc_score, mean_squared_error, confusion_matrix,
# f1_score, precision_score, recall_score, classification_report, cohen_kappa_score

# Predict here, on the train/test splits, instead of reusing the global `preds`:
# that variable holds the predictions for the submission rows, not for X_test,
# so scoring it against y_test is wrong when the notebook runs top to bottom.
preds_train = xg_reg.predict(X_train)
preds_test = xg_reg.predict(X_test)

trainAccuracy = accuracy_score(y_train, preds_train)
testAccuracy = accuracy_score(y_test, preds_test)

meanSquaredError = mean_squared_error(y_test, preds_test)
classificationReport = classification_report(y_test, preds_test)
precisionScore = precision_score(y_test, preds_test)
recallScore = recall_score(y_test, preds_test)
f1Score = f1_score(y_test, preds_test)
cohenKappaScore = cohen_kappa_score(y_test, preds_test)
areaDebajoDeCurva = roc_auc_score(y_test, preds_test)
matrizDeConfusion = confusion_matrix(y_test, preds_test)

print("Train accuracy: ", trainAccuracy)
print("Test acuracy: ", testAccuracy)
print()
print("Classification Report:")
print(classificationReport)
print()
print("Precision Score: ",precisionScore)
print("Recall Score: ",recallScore)
print("F1 Score: ",f1Score)
print("Cohen Kappa Score: ",cohenKappaScore)
print("ROC auc score: ", areaDebajoDeCurva)
print("Confusion matrix: ")
print(matrizDeConfusion)

Train accuracy:  0.8083188461786105
Test acuracy:  0.8055627092454288

Classification Report:
              precision    recall  f1-score   support

           0       0.98      0.81      0.89      3692
           1       0.17      0.73      0.27       191

   micro avg       0.81      0.81      0.81      3883
   macro avg       0.57      0.77      0.58      3883
weighted avg       0.94      0.81      0.86      3883


Precision Score:  0.16587677725118483
Recall Score:  0.7329842931937173
F1 Score:  0.2705314009661836
Cohen Kappa Score:  0.20690724218090806
ROC auc score:  0.7711508681569887
Confusion matrix: 
[[2988  704]
 [  51  140]]


In [646]:
# Number of samples in the held-out test split.
y_test.shape[0]

3883

In [647]:
# Sum of the test labels; with 0/1 labels this is the number of positive samples.
y_test.sum()

191

In [648]:
# 191 of the 3883 test samples are positives (1s)

In [649]:
# Baseline: accuracy (in %) obtained by predicting 0 for every test sample.
# Derived from y_test rather than hard-coded counts, so it stays correct if the split changes.
((len(y_test) - y_test.sum()) / len(y_test)) * 100

95.0811228431625