In [2]:
import pandas as pd  
import numpy as np  
import matplotlib.pyplot as plt 
%matplotlib inline

from datetime import timedelta

from sklearn.model_selection import train_test_split, StratifiedKFold, KFold, cross_val_score, cross_val_predict
from sklearn.metrics import accuracy_score, roc_auc_score, mean_squared_error, confusion_matrix,\
                            f1_score,precision_score,recall_score,classification_report,cohen_kappa_score
                                                            
import xgboost as xgb
from sklearn import preprocessing
from sklearn.feature_selection import RFE, SelectKBest, chi2
from sklearn.ensemble import ExtraTreesClassifier

In [3]:
# df_events información de eventos realizado en la plataforma para un conjunto de usuarios hasta el 31/05/2018
df_events = pd.read_csv('../../data/events_up_to_01062018.csv',dtype=object)
# df_train archivo labels_training_set.csv indica para un subconjunto de los usuarios incluidos en el set de 
# eventos events_up_to_01062018.csv si los mismos realizaron una conversión (columna label = 1) o no (columna 
# label = 0) desde el 01/06/2018 hasta el 15/06/2018
df_labels = pd.read_csv('../../data/labels_training_set.csv',dtype=object)
# Personas a predecir conversion.
df_person = pd.read_csv('../../data/trocafone_kaggle_test.csv')

In [4]:
df_labels['label'] = df_labels['label'].astype(int)
df_labels.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19414 entries, 0 to 19413
Data columns (total 2 columns):
person    19414 non-null object
label     19414 non-null int64
dtypes: int64(1), object(1)
memory usage: 303.4+ KB


In [32]:
def stratifiedCVConKBestChi2(data, K):
    
    df_train = pd.merge(data,df_labels,how = 'inner', on ='person')
    df_train.drop(columns = ['person'],inplace = True)
    
    X = df_train.drop(columns = ['label'])
    y = df_train['label']
    
    model = xgb.XGBRegressor(objective ='binary:logistic', 
                colsample_bytree = 0.8, learning_rate = 0.1,
                max_depth = 5, n_estimators = 6, scale_pos_weight = 7, min_child_weight=15)
    
    stratifiedKFold = StratifiedKFold(n_splits = 10, random_state = 7)
    
    selector = SelectKBest(chi2, k = K)
    X = selector.fit_transform(X,y)

    results = cross_val_score(model, X, y, cv = stratifiedKFold, scoring = 'roc_auc')   
    
    print("ROC AUC: ",results.mean())
    return results.mean()

In [6]:
df_seba2 = pd.read_csv('features_seba.csv')

In [7]:
df_featuresEventos = pd.read_csv('Santi_FeaturesConEventos.csv')
df_featuresEventos.drop(columns = ['Unnamed: 0'],inplace = True)

In [29]:
df_laggedFeatures = pd.read_csv('fetures_nuevos_santi.csv')
df_laggedFeatures = df_laggedFeatures.replace(-1,0)

In [12]:
df_timeFeatures = pd.read_csv('santi_timefeatures.csv')
df_timeFeatures.drop(columns = ['Unnamed: 0'],inplace = True)

In [13]:
df_magui = pd.read_csv('featuresMagui.csv')
df_magui.drop(columns = ['Unnamed: 0'],inplace = True)

In [30]:
df_train = pd.merge(df_seba2,df_featuresEventos, on = 'person', how = 'inner')
df_train = pd.merge(df_train,df_timeFeatures, on = 'person', how = 'inner')
df_train = pd.merge(df_train,df_laggedFeatures, on = 'person', how = 'inner')

In [28]:
df_train

Unnamed: 0,person,ad campaign hit mes 5,brand listing mes 5,checkout mes 5,conversion mes 5,generic listing mes 5,lead mes 5,search engine hit mes 5,searched products mes 5,staticpage mes 5,...,cadaCuantosMinutosHaceEventos,cadaCuantosDiasEnMes5HaceEventos,cadaCuantasHorasEnUnDiaDelMes5HaceEventos,cadaCuantosMinutosHaceEventosEnUnDiaYHoraDelMes5,cadaCuantosSegundosHaceEventosEnUnDiaYHoraDelMes5,cadaCuantasHorasHaceEventosEnSuUltimaConexionDelMes5,cadaCuantosMinutosHaceEventosEnSuUltimaConexionDelMes5,cadaCuantosSegundosHaceEventosEnSuUltimaConexionDelMes5,cadaCuantosDiasHaceConversion,cadaCuantosDiasHaceCheckout
0,0008ed71,0.0,0.0,3.0,0.0,1.0,0.0,0.0,0.0,0.0,...,48,0.0,2.0,2.0,68.0,2.0,1.666667,67.666667,-1.0,0.0
1,00091926,15.0,25.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,...,340,1.0,3.0,2.0,64.0,17.0,1.500000,20.000000,-1.0,5.0
2,000ba417,1.0,24.0,6.0,1.0,14.0,0.0,1.0,0.0,0.0,...,167,4.0,1.0,2.0,52.0,1.0,2.000000,41.000000,0.0,4.0
3,000c79fe,1.0,0.0,1.0,0.0,1.0,0.0,1.0,9.0,0.0,...,1,0.0,0.0,2.0,44.0,0.0,2.000000,44.000000,-1.0,0.0
4,000e4d9e,19.0,17.0,1.0,0.0,17.0,0.0,5.0,0.0,0.0,...,209,2.0,1.0,2.0,35.0,5.0,0.000000,1.000000,-1.0,0.0
5,000e619d,6.0,11.0,1.0,0.0,8.0,0.0,3.0,6.0,0.0,...,175,1.0,1.0,1.0,20.0,4.0,0.500000,12.500000,-1.0,0.0
6,001001be,0.0,0.0,3.0,1.0,3.0,0.0,0.0,17.0,0.0,...,295,3.0,0.0,1.0,35.0,0.0,0.000000,0.000000,0.0,0.0
7,0010e89a,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0.0,0.0,1.0,14.0,0.0,1.000000,14.000000,-1.0,0.0
8,0016c4b5,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0.0,0.0,0.0,2.0,0.0,0.000000,2.000000,-1.0,0.0
9,001802e4,5.0,0.0,1.0,0.0,4.0,0.0,0.0,4.0,0.0,...,1,0.0,0.0,2.0,50.0,0.0,2.000000,50.000000,-1.0,0.0


In [13]:
stratifiedCVConKBestChi2(df_train,139)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


ROC AUC:  0.867080615221008


In [22]:
df_train.shape

(38829, 414)

In [33]:
bestK = 0
score = 0
for i in range(16,415):
    score_act = stratifiedCVConKBestChi2(df_train,i)
    if(score_act > score):
        bestK = i
        score = score_act
        print(score)
        print(i)

ROC AUC:  0.6781500523029098
0.6781500523029098
16
ROC AUC:  0.6810927302162132
0.6810927302162132
17
ROC AUC:  0.6827597982413666
0.6827597982413666
18
ROC AUC:  0.6806812741639287
ROC AUC:  0.6782166318028642
ROC AUC:  0.6814589723372386
ROC AUC:  0.6840874909536708
0.6840874909536708
22
ROC AUC:  0.6806743396657058
ROC AUC:  0.6976234848111624
0.6976234848111624
24
ROC AUC:  0.6949513116285646
ROC AUC:  0.7023360131313463
0.7023360131313463
26
ROC AUC:  0.7193991952000703
0.7193991952000703
27
ROC AUC:  0.7233091202431754
0.7233091202431754
28
ROC AUC:  0.7176922125628493
ROC AUC:  0.7205791211429742
ROC AUC:  0.7203972472592706
ROC AUC:  0.7246870802985439
0.7246870802985439
32
ROC AUC:  0.7213691723342265
ROC AUC:  0.7228931557386785
ROC AUC:  0.7204354645398906
ROC AUC:  0.7256366097861363
0.7256366097861363
36
ROC AUC:  0.7282155111077236
0.7282155111077236
37
ROC AUC:  0.7205450677739973
ROC AUC:  0.7237339061872509
ROC AUC:  0.7400101232174358
0.7400101232174358
40
ROC AUC:  0

KeyboardInterrupt: 