In [132]:
import pandas as pd  
import numpy as np  
import matplotlib.pyplot as plt 
%matplotlib inline

from datetime import timedelta

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score, mean_squared_error, confusion_matrix,\
                            f1_score,precision_score,recall_score,classification_report,cohen_kappa_score, brier_score_loss
import xgboost as xgb
from sklearn import preprocessing

In [34]:
# df_events información de eventos realizado en la plataforma para un conjunto de usuarios hasta el 31/05/2018
df_events = pd.read_csv('../../data/events_up_to_01062018.csv',dtype=object)
# df_train archivo labels_training_set.csv indica para un subconjunto de los usuarios incluidos en el set de 
# eventos events_up_to_01062018.csv si los mismos realizaron una conversión (columna label = 1) o no (columna 
# label = 0) desde el 01/06/2018 hasta el 15/06/2018
df_labels = pd.read_csv('../../data/labels_training_set.csv',dtype=object)
# Personas a predecir conversion.
df_person = pd.read_csv('../../data/trocafone_kaggle_test.csv')

In [35]:
df_labels['label'] = df_labels['label'].astype(int)

In [36]:
df_events["timestamp"] = pd.to_datetime(df_events["timestamp"])
df_events["month"] = df_events["timestamp"].dt.month
df_events["day"] = df_events["timestamp"].dt.day
df_events["day_of_week"] = df_events['timestamp'].dt.weekday_name
df_events['day_of_year'] = df_events['timestamp'].dt.dayofyear
df_events['hour'] = df_events['timestamp'].dt.hour
df_events['minute'] = df_events['timestamp'].dt.minute
df_events['week_of_year'] = df_events['timestamp'].dt.weekofyear

#### Solo pruebo 2 features.

## Cantidad de veces que realiza un evento dado

In [37]:
cant_por_evento = pd.concat([pd.get_dummies(df_events['event']),df_events[['person']]],axis = 1).groupby('person')\
    .sum().reset_index()
cant_por_evento = cant_por_evento[['person','brand listing','checkout','search engine hit','ad campaign hit','generic listing']]

## Entra o no un determinado mes

In [38]:
df_month2 = pd.concat([df_events['person'],pd.get_dummies(df_events['month'])],axis = 1).drop_duplicates()\
                .groupby('person').sum().reset_index()
df_month2.columns = ['person','mes_1','mes_2','mes_3','mes_4','mes_5']
#df_month2 = df_month2[['person','mes_5','mes_4','mes_3']]

In [39]:
df_month2.head()

Unnamed: 0,person,mes_1,mes_2,mes_3,mes_4,mes_5
0,0008ed71,0,0,0,0,1
1,00091926,0,0,0,0,1
2,00091a7a,0,0,1,0,0
3,000ba417,0,0,0,0,1
4,000c79fe,0,0,0,0,1


In [40]:
df_train = pd.merge(df_month2,cant_por_evento,on = 'person',how = 'inner')

In [41]:
df_train['checkout2'] = df_train['checkout']
df_train['brand listing2'] = df_train['brand listing']  

#### Se aplica una transformacion a los datos.

In [42]:
x = df_train.loc[:,df_train.columns != 'person']
maxAbsScaler = preprocessing.RobustScaler()
x_scaled = maxAbsScaler.fit_transform(x)
df_train = pd.concat([df_train['person'],pd.DataFrame(x_scaled)],axis = 1)

In [43]:
df_train.head()

Unnamed: 0,person,0,1,2,3,4,5,6,7,8,9,10,11
0,0008ed71,0.0,0.0,0.0,0.0,0.0,0.0,2.0,-0.333333,-0.5,0.0,2.0,0.0
1,00091926,0.0,0.0,0.0,0.0,0.0,6.25,1.0,-0.333333,3.25,-0.25,1.0,6.25
2,00091a7a,0.0,0.0,1.0,0.0,-1.0,1.25,-1.0,-0.333333,-0.25,-0.25,-1.0,1.25
3,000ba417,0.0,0.0,0.0,0.0,0.0,6.0,5.0,0.0,-0.25,3.25,5.0,6.0
4,000c79fe,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.25,0.0,0.0,0.0


In [44]:
# df con labels.

In [79]:
df_train_con_labels = pd.merge(df_train,df_labels,on = 'person',how = 'inner').drop(columns = ['person'])
df_train_con_labels.shape

(19414, 13)

In [80]:
df_train_con_labels.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,label
0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,-0.333333,-0.5,0.0,2.0,0.0,0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.25,0.0,0.0,0.0,0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.333333,0.75,0.75,0.0,0.0,0
3,1.0,1.0,0.0,1.0,0.0,41.25,14.0,4.0,6.75,6.75,14.0,41.25,0
4,0.0,0.0,0.0,0.0,0.0,0.25,1.0,-0.333333,-0.5,0.0,1.0,0.25,0


#### Para subir a Kaggle

In [224]:
X, y = df_train_con_labels.iloc[:,:-1],df_train_con_labels.iloc[:,-1]

In [225]:
xg_reg = xgb.XGBRegressor(objective ='binary:logistic', 
                colsample_bytree = 0.8, learning_rate = 0.1,
                max_depth = 5, n_estimators = 6, scale_pos_weight = 9.5, min_child_weight=1)

In [226]:
xg_reg.fit(X,y)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=0.8, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=5, min_child_weight=1, missing=None, n_estimators=6,
       n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=9.5, seed=None,
       silent=True, subsample=1)

In [227]:
df_predecir = pd.merge(df_train,df_person,on = 'person', how = 'inner')

In [228]:
df_predecir_sin_person = df_predecir.drop(columns = ['person'])

In [229]:
preds = xg_reg.predict(df_predecir_sin_person)

In [230]:
df_final = df_predecir[['person']].copy()

In [231]:
df_final['output'] = pd.Series(preds)

In [232]:
df_final.to_csv('predicciones.csv', encoding='utf-8', index=False)

#### Test sin subir a Kaggle

In [244]:
X, y = df_train_con_labels.iloc[:,:-1],df_train_con_labels.iloc[:,-1]

In [245]:
xg = xgb.XGBClassifier(objective ='binary:logistic', 
                colsample_bytree = 0.8, learning_rate = 0.1,
                max_depth = 5, n_estimators = 6, scale_pos_weight = 9.5, min_child_weight=1)

In [246]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=123)

In [247]:
xg.fit(X_train,y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=0.8, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=5, min_child_weight=1, missing=None, n_estimators=6,
       n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=9.5, seed=None,
       silent=True, subsample=1)

In [248]:
predsProb1 = pd.DataFrame(xg.predict_proba(X_test))[1]

In [260]:
predsLabel = pd.Series(xg.predict(X_test))

In [276]:
trainAccuracy = accuracy_score(y_train, pd.Series(xg.predict(X_train)))
testAccuracy = accuracy_score(y_test, predsLabel)

cohenKappaScore = cohen_kappa_score(y_test, predsLabel)
classificationReport = classification_report(y_test, predsLabel)
precisionScore = precision_score(y_test, predsLabel)
recallScore = recall_score(y_test, predsLabel)
f1Score = f1_score(y_test, predsLabel)
matrizDeConfusion = confusion_matrix(y_test, predsLabel)

meanSquaredError = mean_squared_error(y_test, predsProb1)
areaDebajoDeCurva = roc_auc_score(y_test, predsProb1)
brierScoreLoss = brier_score_loss(y_test, predsProb1)

In [282]:
# Pruebo todas las metricas.
# accuracy_score, roc_auc_score, mean_squared_error, confusion_matrix,
# f1_score,precision_score,recall_score,classification_report,cohen_kappa_score
# brierScoreLoss.
# Hay metricas para la probabilidad de que sea 1 y otras metricas para las predicciones de los labels.


# Metricas con LABELS.

print()
print("Metricas con Labels:")
print()

print("Train accuracy: ", trainAccuracy)
print("Test acuracy: ", testAccuracy)
print()
print("Classification Report:")
print(classificationReport)
print()
print("Precision Score: ",precisionScore)
print("Recall Score: ",recallScore)
print("F1 Score: ",f1Score)
print("Cohen Kappa Score: ",cohenKappaScore)
print()
print("Confusion matrix: ")
print(matrizDeConfusion)

# Metricas con PROBABILIDADES. 

print()
print("Metricas sin Labels:")
print()

print("ROC auc score: ", areaDebajoDeCurva)
print("Mean squared error: ", meanSquaredError)
print("Brier Score Loss: ",brierScoreLoss)


Metricas con Labels:

Train accuracy:  0.8319490052153757
Test acuracy:  0.8297707957764615

Classification Report:
              precision    recall  f1-score   support

           0       0.98      0.84      0.90      3692
           1       0.17      0.64      0.27       191

   micro avg       0.83      0.83      0.83      3883
   macro avg       0.58      0.74      0.59      3883
weighted avg       0.94      0.83      0.87      3883


Precision Score:  0.1717877094972067
Recall Score:  0.643979057591623
F1 Score:  0.27122381477398017
Cohen Kappa Score:  0.20986101024852788

Confusion matrix: 
[[3099  593]
 [  68  123]]

Metricas sin Labels:

ROC auc score:  0.8277427351057615
Mean squared error:  0.15366677944368673
Brier Score Loss:  0.15366677944368673
