In [1]:
from datetime import timedelta

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import xgboost as xgb
from imblearn.over_sampling import RandomOverSampler
#from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFE
from sklearn.feature_selection import RFECV
from sklearn.metrics import accuracy_score, roc_auc_score, mean_squared_error, confusion_matrix,\
                            f1_score,precision_score,recall_score,classification_report,cohen_kappa_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold

%matplotlib inline

In [2]:
# People for whom label information is available (training labels).
labels = pd.read_csv('../data/labels_training_set.csv')
# People for whom a prediction has to be produced.
personas = pd.read_csv('../data/trocafone_kaggle_test.csv')

# Todos los features

In [3]:

# Read the individual feature tables; each one is joined on the 'person' column below.
df_santi = pd.read_csv('../features_csv/fetures_nuevos_santi.csv')
df_seba = pd.read_csv('../features_csv/features_finales_seba.csv')
df_santi_eventos = pd.read_csv('Santi_FeaturesConEventos.csv')
df_santi_time = pd.read_csv('santi_timefeatures.csv')

# Inner-join every feature table on 'person'. Column names shared between
# tables receive pandas' default '_x' / '_y' suffixes.
df_final_features = (
    df_santi
    .merge(df_santi_eventos, on='person', how='inner')
    .merge(df_santi_time, on='person', how='inner')
    .merge(df_seba, on='person', how='inner')
)

# Attach the target column; only persons present in both frames are kept.
labels_f = df_final_features.merge(labels, on='person', how='inner')

In [4]:
# Predictors: every column except the person identifier and the target.
X = labels_f.drop(['person', 'label'], axis=1)
# Target kept as a single-column DataFrame (later cells index it as y['label']).
y = labels_f.loc[:, ['label']]

# Hold out 20% of the rows for evaluation, with a fixed seed for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=123,
)

X.shape

(19414, 372)

## Selección de features

### Selección con random forest

In [5]:
# Shallow random forest whose feature importances are used below to rank
# features. Class 1 is weighted 15x relative to class 0
# (presumably to offset class imbalance -- confirm the chosen ratio).
clf = RandomForestClassifier(n_estimators=100, max_depth=2, random_state=0, class_weight = {0: 1, 1: 15})

# scikit-learn expects a 1-D target. Passing the single-column DataFrame
# directly makes `fit` emit a DataConversionWarning, so flatten it first.
# np.ravel works whether y_train is a DataFrame or a Series.
clf.fit(X_train, np.ravel(y_train))

# Hard predictions for accuracy / confusion matrix, positive-class
# probabilities for the ROC AUC.
preds = clf.predict(X_test)
preds_prob = clf.predict_proba(X_test)[:,1]
train_accuracy = accuracy_score(y_train, clf.predict(X_train))
test_accuracy = accuracy_score(y_test, preds)
area_debajo_de_curva = roc_auc_score(y_test, preds_prob)
matriz_de_confusion = confusion_matrix(y_test, preds)
print('train acurracy: ')
print(train_accuracy)
print('test acurracy: ')
print(test_accuracy)
print('Matriz de confusión: ')
print(matriz_de_confusion)
print('Área bajo la curva: ')
print(area_debajo_de_curva)

  This is separate from the ipykernel package so we can avoid doing imports until


train acurracy: 
0.8185564355160646
test acurracy: 
0.8202420808653104
Matriz de confusión: 
[[3055  637]
 [  61  130]]
Área bajo la curva: 
0.8379047948585594


In [6]:
# Importance score of each input column, as reported by the fitted forest.
importancia = clf.feature_importances_
importancia.shape[0]

372

In [7]:
# Map every feature name to its importance score.
features = X.columns

# Guard against pairing names with scores from a model fitted on a different
# set of columns (the previous version hard-coded the count as 372).
if len(features) != len(importancia):
    raise ValueError(
        "feature/importance length mismatch: %d columns vs %d importances"
        % (len(features), len(importancia))
    )

importancia_con_nombre = dict(zip(features, importancia))

# Highest importance first.
sorted_by_value = sorted(importancia_con_nombre.items(), key=lambda kv: -kv[1])
sorted_by_value

[('checkout mes 5', 0.11868390731635853),
 ('checkout_y', 0.11514652547724044),
 ('cadaCuantosDiasHaceCheckout', 0.05509720552969096),
 ('checkout_x', 0.05293589617093125),
 ('distan_dias', 0.04632160893023039),
 ('mes_primer_entrada', 0.045474660812630054),
 ('mes_5', 0.03331426219173941),
 ('dias_hasta_ultimo', 0.03177067731019073),
 ('visited site mes 4', 0.026062153073754174),
 ('promedio_por_dia_y', 0.021475505122133236),
 ('search engine hit_y', 0.021282568961384616),
 ('modelos_dist_mes_5', 0.019677263139934776),
 ('cadaCuantosMinutosHaceEventos', 0.018718151148828098),
 ('visited site', 0.01768062984839662),
 ('nuevo_mes_5', 0.017263066935825314),
 ('primer_quincena_x', 0.01682758866215167),
 ('minuto_0-19', 0.016023406041039377),
 ('cadaCuantosDiasEnMes5HaceEventos', 0.0158435063954833),
 ('cadaCuantosMinutosHaceEventosEnUnDiaYHoraDelMes5', 0.015223563260718522),
 ('segunda_quincena_y', 0.013564646075235332),
 ('mes_4', 0.013272968459879773),
 ('search engine hit_x', 0.0129339

In [8]:
# Keep the names of the features with strictly positive importance.
atributos_importantes = [
    nombre
    for nombre, peso in importancia_con_nombre.items()
    if peso > 0
]

len(atributos_importantes)

107

In [9]:
# Restrict the predictors to the columns listed in atributos_importantes.
X_importantes = X.filter(items = atributos_importantes)
# Show the retained feature names.
print(atributos_importantes)

['promedioCadaCuantosDiasVuelve', 'cadaCuantasHorasHaceEventos', 'cadaCuantosMinutosHaceEventos', 'cadaCuantosDiasEnMes5HaceEventos', 'cadaCuantosMinutosHaceEventosEnUnDiaYHoraDelMes5', 'cadaCuantosSegundosHaceEventosEnUnDiaYHoraDelMes5', 'cadaCuantasHorasHaceEventosEnSuUltimaConexionDelMes5', 'cadaCuantosMinutosHaceEventosEnSuUltimaConexionDelMes5', 'cadaCuantosSegundosHaceEventosEnSuUltimaConexionDelMes5', 'cadaCuantosDiasHaceConversion', 'cadaCuantosDiasHaceCheckout', 'brand listing_x', 'checkout_x', 'conversion_x', 'search engine hit_x', 'viewed product', 'visited site', 'vis_iPhone SE', 'vis_Samsung Galaxy J5', 'viewedproduct_Preto', 'viewedproduct_Dourado', 'viewedproduct_Ametista', 'viewedproduct_Preto Brilhante', 'viewedproduct_16GB', 'viewedproduct_32GB', 'viewedproduct_8GB', 'viewedproduct_Bom', 'viewedproduct_Excelente', 'viewedproduct_Muito Bom', 'vuelve', 'visitedsite_Piaui', 'visitedsite_Computer', 'visitedsite_Smartphone', 'visitedsite_1366x768', 'visitedsite_375x667', '

In [11]:
# Estimator handed to the RFE and RFECV selectors below.
xg_reg = xgb.XGBClassifier(
    objective='binary:logistic',
    n_estimators=100,
    max_depth=5,
    learning_rate=0.1,
    colsample_bytree=0.8,
    min_child_weight=15,
    scale_pos_weight=7,
)


### Selección con algoritmo RFE (con features de random forest)

In [12]:
# Number of features RFE should keep.
n = 70

# `n_features_to_select` must be passed by keyword: scikit-learn >= 1.0 no
# longer accepts it positionally.
rfe = RFE(estimator=xg_reg, n_features_to_select=n)
# `.values` yields the 1-D label array without relying on Series.ravel(),
# which is deprecated in recent pandas.
rfe = rfe.fit(X_importantes, y['label'].values)

print('n = ', n)

# Select the n features to train with.
mask = rfe.get_support()
features_X = X_importantes.columns[mask]
new_x = X_importantes.filter(items=features_X)

n =  70


In [None]:
# Train on the selected features and score on the held-out 20%.
X_train, X_test, y_train, y_test = train_test_split(
    new_x,
    y['label'],
    test_size=0.2,
    random_state=123,
)

# NOTE(review): this rebinds the name xg_reg, so any cell run afterwards that
# references xg_reg gets this 6-tree fitted model -- confirm that is intended.
xg_reg = xgb.XGBClassifier(
    objective='binary:logistic',
    n_estimators=6,
    max_depth=5,
    learning_rate=0.1,
    colsample_bytree=0.8,
    min_child_weight=15,
    scale_pos_weight=7,
)
xg_reg.fit(X_train, y_train)

# Column 1 of predict_proba holds the positive-class probability.
proba = xg_reg.predict_proba(X_test)[:, 1]
print("ROC auc score: ", roc_auc_score(y_test, proba))

### Selección con RFECV (con los features de random forest)

In [None]:
# Recursive feature elimination with 10-fold stratified cross-validation.
# Candidate feature subsets are scored by ROC AUC (not accuracy).
rfecv = RFECV(estimator=xg_reg, step=1, cv=StratifiedKFold(10),
              scoring='roc_auc')

# `.values` yields the 1-D label array without relying on Series.ravel(),
# which is deprecated in recent pandas.
rfecv.fit(X_importantes, y['label'].values)

print("Optimal number of features : %d" % rfecv.n_features_)

# `grid_scores_` was removed in scikit-learn 1.2; newer releases expose the
# mean CV score per subset size in `cv_results_`. Fall back to the old
# attribute when running on an older release.
if hasattr(rfecv, 'cv_results_'):
    cv_scores = rfecv.cv_results_['mean_test_score']
else:
    cv_scores = rfecv.grid_scores_

# Plot number of features VS. cross-validation scores
plt.figure()
plt.xlabel("Number of features selected")
plt.ylabel("Cross validation score (ROC AUC)")
plt.plot(range(1, len(cv_scores) + 1), cv_scores)
plt.show()

In [None]:
# Keep only the columns that RFECV flagged as selected.
mask = rfecv.get_support(indices=False)
features_X = X_importantes.columns[mask]
new_x = X_importantes.filter(items=features_X)

In [None]:
# Train on the selected features and score on the held-out 20%.
X_train, X_test, y_train, y_test = train_test_split(
    new_x,
    y['label'],
    test_size=0.2,
    random_state=123,
)

# NOTE(review): this rebinds the name xg_reg, so any cell run afterwards that
# references xg_reg gets this 6-tree fitted model -- confirm that is intended.
xg_reg = xgb.XGBClassifier(
    objective='binary:logistic',
    n_estimators=6,
    max_depth=5,
    learning_rate=0.1,
    colsample_bytree=0.8,
    min_child_weight=15,
    scale_pos_weight=7,
)
xg_reg.fit(X_train, y_train)

# Column 1 of predict_proba holds the positive-class probability.
proba = xg_reg.predict_proba(X_test)[:, 1]
print("ROC auc score: ", roc_auc_score(y_test, proba))

### Selección con algoritmo RFE (con todos los features)

In [1]:
# Number of features RFE should keep.
n = 70

# `n_features_to_select` must be passed by keyword: scikit-learn >= 1.0 no
# longer accepts it positionally.
rfe = RFE(estimator=xg_reg, n_features_to_select=n)
# `.values` yields the 1-D label array without relying on Series.ravel(),
# which is deprecated in recent pandas.
rfe = rfe.fit(X, y['label'].values)

print('n = ', n)

# Select the n features to train with.
mask = rfe.get_support()
features_X = X.columns[mask]
new_x = X.filter(items=features_X)

NameError: name 'RFE' is not defined

In [None]:
# Train on the selected features and score on the held-out 20%.
X_train, X_test, y_train, y_test = train_test_split(
    new_x,
    y['label'],
    test_size=0.2,
    random_state=123,
)

# NOTE(review): this rebinds the name xg_reg, so any cell run afterwards that
# references xg_reg gets this 6-tree fitted model -- confirm that is intended.
xg_reg = xgb.XGBClassifier(
    objective='binary:logistic',
    n_estimators=6,
    max_depth=5,
    learning_rate=0.1,
    colsample_bytree=0.8,
    min_child_weight=15,
    scale_pos_weight=7,
)
xg_reg.fit(X_train, y_train)

# Column 1 of predict_proba holds the positive-class probability.
proba = xg_reg.predict_proba(X_test)[:, 1]
print("ROC auc score: ", roc_auc_score(y_test, proba))

### Selección con RFECV (con todos los features)

In [None]:
# Recursive feature elimination with 10-fold stratified cross-validation over
# the full feature set. Candidate subsets are scored by ROC AUC (not
# accuracy); the cross-validation runs on 4 parallel jobs.
rfecv = RFECV(estimator=xg_reg, step=1, cv=StratifiedKFold(10),
              scoring='roc_auc', n_jobs=4)

# `.values` yields the 1-D label array without relying on Series.ravel(),
# which is deprecated in recent pandas.
rfecv.fit(X, y['label'].values)

print("Optimal number of features : %d" % rfecv.n_features_)

# `grid_scores_` was removed in scikit-learn 1.2; newer releases expose the
# mean CV score per subset size in `cv_results_`. Fall back to the old
# attribute when running on an older release.
if hasattr(rfecv, 'cv_results_'):
    cv_scores = rfecv.cv_results_['mean_test_score']
else:
    cv_scores = rfecv.grid_scores_

# Plot number of features VS. cross-validation scores
plt.figure()
plt.xlabel("Number of features selected")
plt.ylabel("Cross validation score (ROC AUC)")
plt.plot(range(1, len(cv_scores) + 1), cv_scores)
plt.show()

In [None]:
  # Selecciono los n features para entrenar
mask = rfecv.get_support(indices=False)  # boolean mask of the columns RFECV kept
features_X = X.columns[mask]
new_x = X.filter(items=features_X)

In [None]:
# Train on the selected features and score on the held-out 20%.
X_train, X_test, y_train, y_test = train_test_split(
    new_x,
    y['label'],
    test_size=0.2,
    random_state=123,
)

xg_reg = xgb.XGBClassifier(
    objective='binary:logistic',
    n_estimators=6,
    max_depth=5,
    learning_rate=0.1,
    colsample_bytree=0.8,
    min_child_weight=15,
    scale_pos_weight=7,
)
xg_reg.fit(X_train, y_train)

# Column 1 of predict_proba holds the positive-class probability.
proba = xg_reg.predict_proba(X_test)[:, 1]
print("ROC auc score: ", roc_auc_score(y_test, proba))