## Imports

In [18]:

import pandas as pd
import numpy as np

from sklearn.metrics import accuracy_score, confusion_matrix, accuracy_score, roc_auc_score, cohen_kappa_score
from sklearn.metrics import mean_squared_error, classification_report

from sklearn.model_selection import train_test_split

from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler

import xgboost as xgb
from xgboost.sklearn import XGBClassifier

## Se cargan los csv

In [11]:
# Per-author feature tables, read from the notebook's working directory.
df_featuresMagui, df_featuresSeba, df_featuresSanti, df_featuresSanti2 = (
    pd.read_csv(csv_name)
    for csv_name in (
        'featuresMagui.csv',
        'features_seba.csv',
        'santi_timefeatures.csv',
        'Santi_FeaturesConEventos.csv',
    )
)

# People for whom a label is available (training set).
labels = pd.read_csv('../../data/labels_training_set.csv')
# People for whom a prediction has to be submitted.
personas = pd.read_csv('../../data/trocafone_kaggle_test.csv')

## Set de entrenamiento y set para predecir

In [12]:
# Data to submit.
# Combine the two feature tables on 'person', keeping only people present in both.
df_train = df_featuresSanti2.merge(df_featuresSeba, on='person', how='inner')
# Keep only the people we must predict for (inner join against the Kaggle list).
datos = personas.merge(df_train, on='person', how='inner')

## Set de entrenamiento

In [13]:
# Data to train on: labelled people joined with their features.
df_trainConLabels = labels.merge(df_train, on='person', how='inner')

# Target: the single column at position 1, kept as a one-column DataFrame
# (later cells read it back as y['label']).
y = df_trainConLabels.iloc[:, [1]]
# Features: columns at positions 3..405.
# NOTE(review): position 2 is skipped and the upper bound 406 is hard-coded;
# confirm both still match the merged column layout if the feature files change.
X = df_trainConLabels.iloc[:, 3:406]

## Selección de features con RFE

In [14]:
from sklearn.feature_selection import RFE

# Instantiate the XGBoost classifier that RFE uses to rank the features.
model = xgb.XGBClassifier(objective ='binary:logistic', 
                colsample_bytree = 0.8, learning_rate = 0.1,
                max_depth = 5, n_estimators = 6, scale_pos_weight = 7, min_child_weight=15)

# Number of features RFE should keep.
n = 43
# Pass the feature count by keyword: newer scikit-learn releases make it
# keyword-only, so the positional form RFE(model, n) raises a TypeError there.
rfe = RFE(model, n_features_to_select=n)
# .values yields the plain 1-D label array; Series.ravel() is deprecated in
# recent pandas releases.
rfe = rfe.fit(X, y['label'].values)

# Keep only the n columns that RFE selected.
mask = rfe.get_support()
features_X = X.columns[mask]
new_x = X.filter(items=features_X)

## Entrenamiento y predicción

In [19]:
# Hold out 20% of the labelled rows for evaluation (fixed seed).
# NOTE(review): new_x comes from RFE fitted on all labelled rows, so the
# feature selection has already seen this hold-out; test metrics may be optimistic.
X_train, X_test, y_train, y_test = train_test_split(
    new_x, y['label'], test_size=0.2, random_state=123
)

model.fit(X_train, y_train)

# Hard class predictions and positive-class probabilities on the hold-out rows.
predsLabel = pd.Series(model.predict(X_test))
predsProba = model.predict_proba(X_test)[:, 1]

# Metrics computed from predicted class labels.
trainAccuracy = accuracy_score(y_train, model.predict(X_train))
testAccuracy = accuracy_score(y_test, predsLabel)
matrizDeConfusion = confusion_matrix(y_test, predsLabel)
classificationReport = classification_report(y_test, predsLabel)
cohenKappaScore = cohen_kappa_score(y_test, predsLabel)

# Metrics computed from predicted probabilities.
areaDebajoDeCurva = roc_auc_score(y_test, predsProba)
meanSquaredError = mean_squared_error(y_test, predsProba)

# --- Report: label-based metrics ---
print()
print("Metricas con Labels:")
print()

print("Train accuracy: ", trainAccuracy)
print("Test acuracy: ", testAccuracy)
print()
print("Classification Report:")
print(classificationReport)
print()
print("Cohen Kappa Score: ", cohenKappaScore)
print()
print("Confusion matrix: ")
print(matrizDeConfusion)

# --- Report: probability-based metrics ---
print()
print("Metricas sin Labels:")
print()

print("ROC auc score: ", areaDebajoDeCurva)
print("Mean squared error: ", meanSquaredError)

# With df_featuresSanti2 + df_featuresSeba and n=43 this gave 0.8626966186972823
# (filtering X); this is the run that was uploaded to Kaggle, where it scored 0.86148.


Metricas con Labels:

Train accuracy:  0.902066834073788
Test acuracy:  0.8980169971671388

Classification Report:
              precision    recall  f1-score   support

           0       0.97      0.92      0.94      3692
           1       0.24      0.50      0.32       191

   micro avg       0.90      0.90      0.90      3883
   macro avg       0.61      0.71      0.63      3883
weighted avg       0.94      0.90      0.91      3883


Cohen Kappa Score:  0.2762375808877936

Confusion matrix: 
[[3392  300]
 [  96   95]]

Metricas sin Labels:

ROC auc score:  0.8710343008514235
Mean squared error:  0.13671487809063365


## Para subir a Kaggle

In [26]:
# Fresh classifier with the same hyper-parameters as the feature-selection
# model, refitted on every labelled row (no hold-out) for the submission.
model = xgb.XGBClassifier(
    objective='binary:logistic',
    n_estimators=6,
    max_depth=5,
    learning_rate=0.1,
    colsample_bytree=0.8,
    min_child_weight=15,
    scale_pos_weight=7,
)

model.fit(new_x, y['label'])

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=0.8, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=5, min_child_weight=15, missing=None, n_estimators=6,
       n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=7, seed=None,
       silent=True, subsample=1)

In [27]:
# Restrict the prediction set to the person id plus the RFE-selected columns,
# in the same column order the model was trained on.
df_final = datos.filter(items=['person', *features_X])
# Model input: same frame without the identifier column.
df_finalSinPersonas = df_final.drop('person', axis=1)
df_final.head()

Unnamed: 0,person,brand listing,checkout,conversion,generic listing,lead,search engine hit,searched products,staticpage,viewed product,...,conversion mes 5,searched products mes 5,viewed product mes 5,dias_hasta_ultimo,viewed product mes 4,distan_dias,cant_dias_dist,modelos_dist,promedio_por_dia_x,promedio_por_mes
0,4886f805,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,4.0,...,0.0,1.0,4.0,13,0.0,0,1,1,9.0,9.0
1,0297fc1e,4.0,7.0,0.0,21.0,1.0,0.0,6.0,0.0,404.0,...,0.0,0.0,133.0,3,100.0,138,59,15,9.610169,113.4
2,2d681dd8,5.0,1.0,0.0,1.0,0.0,2.0,1.0,0.0,13.0,...,0.0,1.0,13.0,4,0.0,9,2,3,13.0,26.0
3,cccea85e,7.0,1.0,0.0,20.0,0.0,26.0,1.0,5.0,739.0,...,0.0,1.0,739.0,0,0.0,23,13,57,64.307692,836.0
4,4c8a8b93,8.0,2.0,0.0,14.0,0.0,13.0,9.0,0.0,177.0,...,0.0,9.0,177.0,9,0.0,4,5,9,51.4,257.0


In [28]:
# Probability of the positive class for every person in the prediction set.
proba = pd.Series(model.predict_proba(df_finalSinPersonas)[:, 1])

# Take an explicit copy of the id column: adding a column to a slice of
# df_final is what triggered pandas' SettingWithCopyWarning.
predicciones = df_final[['person']].copy()

# Assign positionally (.values) so the result does not depend on the two
# objects happening to share the same index.
predicciones['label'] = proba.values

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """


In [29]:
# Write the submission file (person, label) without the DataFrame index.
predicciones.to_csv(
    'predicciones_kaggle.csv',
    index=False,
    encoding='utf-8',
)

In [30]:
# Sanity check: (rows, columns) of the submission frame.
predicciones.shape

(19415, 2)

In [31]:
# Preview the first five rows of the submission.
predicciones.head(5)

Unnamed: 0,person,label
0,4886f805,0.282464
1,0297fc1e,0.304237
2,2d681dd8,0.302778
3,cccea85e,0.441948
4,4c8a8b93,0.379345
