In [8]:
import pandas as pd  
import numpy as np  
import matplotlib.pyplot as plt 
from datetime import timedelta
%matplotlib inline

#from sklearn import svm
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier

import xgboost as xgb
from sklearn.metrics import accuracy_score, roc_auc_score, mean_squared_error, confusion_matrix,\
                            f1_score,precision_score,recall_score,classification_report,cohen_kappa_score

from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler

from sklearn.feature_selection import RFE

In [9]:
# Persons we have information about (known training labels).
labels = pd.read_csv('../data/labels_training_set.csv')
# Persons we have to predict for (Kaggle test set).
personas = pd.read_csv('../data/trocafone_kaggle_test.csv')

# Todos los features

In [10]:

# Load each pre-computed feature table; all of them are joined on 'person' below.
df_santi = pd.read_csv('../features_csv/fetures_nuevos_santi.csv')
df_seba = pd.read_csv('../features_csv/features_finales_seba.csv')

df_santi_eventos = pd.read_csv('Santi_FeaturesConEventos.csv')
df_santi_time = pd.read_csv('santi_timefeatures.csv')

# Inner-join the feature tables one after another, so only persons present in
# every table are kept.
df_final_features = (
    df_santi
    .merge(df_santi_eventos, on='person', how='inner')
    .merge(df_santi_time, on='person', how='inner')
    .merge(df_seba, on='person', how='inner')
)

# Attach the training labels; the inner join keeps only labelled persons.
labels_f = df_final_features.merge(labels, on='person', how='inner')

In [17]:
# The target stays a single-column DataFrame: later cells index it as y['label'].
y = labels_f.loc[:, ['label']]
# Features are every column except the person identifier and the target.
X = labels_f.drop(['person', 'label'], axis=1)

# Fixed-seed 80/20 hold-out split, reused by the class-count cell further down.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=123
)



In [18]:
def probar(X, y):
    """Train a baseline XGBoost classifier and print hold-out metrics.

    The data is split 80/20 with a fixed seed, the classifier is fitted on the
    training part without any resampling, and train accuracy, test accuracy,
    ROC AUC and the test confusion matrix are printed.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix, one row per sample.
    y : pandas.DataFrame, pandas.Series or array-like
        Binary target with one value per row of ``X``. A single-column
        DataFrame is accepted and flattened to one dimension.

    Returns
    -------
    None
        The metrics are only printed.
    """
    # Flatten the target so scikit-learn does not emit the "column-vector y was
    # passed" DataConversionWarning when the model is fitted.
    y_1d = np.asarray(y).ravel()

    X_train, X_test, y_train, y_test = train_test_split(
        X, y_1d, test_size=0.2, random_state=123
    )

    # NOTE: class imbalance is compensated with a fixed scale_pos_weight of 7,
    # not with the actual negative/positive ratio of the training split.
    clf = xgb.XGBClassifier(
        objective='binary:logistic',
        colsample_bytree=0.8,
        learning_rate=0.1,
        max_depth=5,
        n_estimators=6,
        scale_pos_weight=7,
        min_child_weight=15,
    )
    clf.fit(X_train, y_train)

    preds = clf.predict(X_test)
    # Probability of the positive class, needed for the ROC AUC.
    proba = clf.predict_proba(X_test)[:, 1]

    train_accuracy = accuracy_score(y_train, clf.predict(X_train))
    test_accuracy = accuracy_score(y_test, preds)
    matriz_de_confusion = confusion_matrix(y_test, preds)
    area_debajo_de_curva = roc_auc_score(y_test, proba)

    print("Train accuracy: ", train_accuracy)
    print("Test acuracy: ", test_accuracy)
    print("ROC auc score: ", area_debajo_de_curva)
    print("Confusion matrix: ")
    print(matriz_de_confusion)
    
 

In [27]:
def probar_oversampling(df, cant_unos):
    """Train XGBoost on a randomly over-sampled training split and print metrics.

    ``df`` is inner-joined with the notebook-level ``labels`` frame on
    ``'person'``, split 80/20 with a fixed seed, and class 1 of the training
    part is randomly over-sampled to ``cant_unos`` rows. Class 0 keeps its
    original size and the test part is never resampled.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature table containing a ``'person'`` column.
    cant_unos : int
        Number of class-1 rows wanted in the training split after
        over-sampling. Over-sampling cannot remove rows, so this should not be
        smaller than the number of class-1 rows already in the split.

    Returns
    -------
    None
        The metrics are only printed.
    """
    labeled = pd.merge(df, labels, on='person', how='inner')

    # Take the target as a 1-d Series so the resampler and the model get the
    # shape they expect.
    target = labeled['label']
    features = labeled.drop(columns=['person', 'label'])

    X_train, X_test, y_train, y_test = train_test_split(
        features, target, test_size=0.2, random_state=123
    )

    cant_ceros = int((y_train == 0).sum())

    sampler = RandomOverSampler(
        random_state=2, sampling_strategy={0: cant_ceros, 1: cant_unos}
    )
    # fit_resample is the supported imbalanced-learn API; the former fit_sample
    # alias was deprecated and later removed.
    X_train_res, y_train_res = sampler.fit_resample(X_train, y_train)
    # Older imbalanced-learn versions return plain arrays, so restore the
    # column names the model will also see at prediction time.
    X_train_res_df = pd.DataFrame(X_train_res)
    X_train_res_df.columns = X_test.columns

    clf = xgb.XGBClassifier(
        objective='binary:logistic',
        colsample_bytree=0.8,
        learning_rate=0.1,
        max_depth=5,
        n_estimators=6,
        scale_pos_weight=7,
        min_child_weight=15,
    )
    clf.fit(X_train_res_df, y_train_res)

    preds = clf.predict(X_test)
    # Probability of the positive class, needed for the ROC AUC.
    proba = clf.predict_proba(X_test)[:, 1]

    # Train accuracy is measured on the resampled training data.
    train_accuracy = accuracy_score(y_train_res, clf.predict(X_train_res_df))
    test_accuracy = accuracy_score(y_test, preds)
    matriz_de_confusion = confusion_matrix(y_test, preds)
    area_debajo_de_curva = roc_auc_score(y_test, proba)

    print("Train accuracy: ", train_accuracy)
    print("Test acuracy: ", test_accuracy)
    print("ROC auc score: ", area_debajo_de_curva)
    print("Confusion matrix: ")
    print(matriz_de_confusion)

In [34]:
def probar_undersampling(X, y, cant_ceros):
    """Train XGBoost on a randomly under-sampled training split and print metrics.

    The data is split 80/20 with a fixed seed and class 0 of the training part
    is randomly under-sampled to ``cant_ceros`` rows. Class 1 keeps its
    original size and the test part is never resampled.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix, one row per sample.
    y : pandas.DataFrame
        Target frame; its ``'label'`` column is used as the binary target.
    cant_ceros : int
        Number of class-0 rows wanted in the training split after
        under-sampling. Under-sampling cannot add rows, so this should not
        exceed the number of class-0 rows already in the split.

    Returns
    -------
    None
        The metrics are only printed.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y['label'], test_size=0.2, random_state=123
    )

    cant_unos = int((y_train == 1).sum())

    sampler = RandomUnderSampler(
        random_state=2, sampling_strategy={0: cant_ceros, 1: cant_unos}
    )
    # fit_resample is the supported imbalanced-learn API; the former fit_sample
    # alias was deprecated and later removed.
    X_train_res, y_train_res = sampler.fit_resample(X_train, y_train)
    # Older imbalanced-learn versions return plain arrays, so restore the
    # column names the model will also see at prediction time.
    X_train_res_df = pd.DataFrame(X_train_res)
    X_train_res_df.columns = X_test.columns

    clf = xgb.XGBClassifier(
        objective='binary:logistic',
        colsample_bytree=0.8,
        learning_rate=0.1,
        max_depth=5,
        n_estimators=6,
        scale_pos_weight=7,
        min_child_weight=15,
    )
    clf.fit(X_train_res_df, y_train_res)

    preds = clf.predict(X_test)
    # Probability of the positive class, needed for the ROC AUC.
    proba = clf.predict_proba(X_test)[:, 1]

    # Train accuracy is measured on the resampled training data.
    train_accuracy = accuracy_score(y_train_res, clf.predict(X_train_res_df))
    test_accuracy = accuracy_score(y_test, preds)
    matriz_de_confusion = confusion_matrix(y_test, preds)
    area_debajo_de_curva = roc_auc_score(y_test, proba)

    print("Train accuracy: ", train_accuracy)
    print("Test acuracy: ", test_accuracy)
    print("ROC auc score: ", area_debajo_de_curva)
    print("Confusion matrix: ")
    print(matriz_de_confusion)

In [42]:
def probar_under_y_over(X, y, cant_ceros, cant_unos):
    """Train XGBoost after under-sampling class 0 and over-sampling class 1.

    The data is split 80/20 with a fixed seed. On the training part only,
    class 0 is first randomly under-sampled to ``cant_ceros`` rows and class 1
    is then randomly over-sampled to ``cant_unos`` rows. The test part is
    never resampled.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix, one row per sample.
    y : pandas.DataFrame
        Target frame; its ``'label'`` column is used as the binary target.
    cant_ceros : int
        Number of class-0 rows to keep in the training split.
    cant_unos : int
        Number of class-1 rows wanted in the training split after
        over-sampling.

    Returns
    -------
    None
        The metrics are only printed.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y['label'], test_size=0.2, random_state=123
    )

    # Step 1: shrink class 0, leaving class 1 at its current size.
    unos_originales = int((y_train == 1).sum())
    under = RandomUnderSampler(
        random_state=2, sampling_strategy={0: cant_ceros, 1: unos_originales}
    )
    # fit_resample is the supported imbalanced-learn API; the former fit_sample
    # alias was deprecated and later removed.
    X_under, y_under = under.fit_resample(X_train, y_train)
    # Older imbalanced-learn versions return plain arrays, so restore the
    # column names before the next step.
    X_under_df = pd.DataFrame(X_under)
    X_under_df.columns = X_test.columns

    # Step 2: grow class 1, leaving class 0 at the size reached in step 1.
    ceros_restantes = int(np.sum(y_under == 0))
    over = RandomOverSampler(
        random_state=2, sampling_strategy={0: ceros_restantes, 1: cant_unos}
    )
    X_train_res, y_train_res = over.fit_resample(X_under_df, y_under)
    X_train_res_df = pd.DataFrame(X_train_res)
    X_train_res_df.columns = X_test.columns

    clf = xgb.XGBClassifier(
        objective='binary:logistic',
        colsample_bytree=0.8,
        learning_rate=0.1,
        max_depth=5,
        n_estimators=6,
        scale_pos_weight=7,
        min_child_weight=15,
    )
    clf.fit(X_train_res_df, y_train_res)

    preds = clf.predict(X_test)
    # Probability of the positive class, needed for the ROC AUC.
    proba = clf.predict_proba(X_test)[:, 1]

    # Train accuracy is measured on the resampled training data.
    train_accuracy = accuracy_score(y_train_res, clf.predict(X_train_res_df))
    test_accuracy = accuracy_score(y_test, preds)
    matriz_de_confusion = confusion_matrix(y_test, preds)
    area_debajo_de_curva = roc_auc_score(y_test, proba)

    print("Train accuracy: ", train_accuracy)
    print("Test acuracy: ", test_accuracy)
    print("ROC auc score: ", area_debajo_de_curva)
    print("Confusion matrix: ")
    print(matriz_de_confusion)

In [43]:
# Class counts in the module-level training split. y_train is a single-column
# DataFrame here, so each count comes back as a one-element Series (indexed by
# 'label') rather than a plain integer.
cant_ceros = np.sum(y_train == 0)
cant_unos = np.sum(y_train == 1)
cant_ceros, cant_unos

(label    14742
 dtype: int64, label    789
 dtype: int64)

In [44]:
# Baseline run: XGBoost fitted on the training split without any resampling.
probar(X, y)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


Train accuracy:  0.8842315369261478
Test acuracy:  0.8771568374967809
ROC auc score:  0.8546950247599168
Confusion matrix: 
[[3293  399]
 [  78  113]]


In [51]:
# Over-sample class 1 of the training split to 800 rows; class 0 is untouched.
probar_oversampling(df_final_features, 800)

Train accuracy:  0.8819971689615236
Test acuracy:  0.8750965748132887
ROC auc score:  0.8530429455508727
Confusion matrix: 
[[3288  404]
 [  81  110]]


In [50]:
# Under-sample class 0 of the training split to 1400 rows; class 1 is untouched.
probar_undersampling(X, y, 1400)

Train accuracy:  0.7103700319780721
Test acuracy:  0.5621941797579192
ROC auc score:  0.8366434004753451
Confusion matrix: 
[[2009 1683]
 [  17  174]]


In [49]:
# Under-sample class 0 to 1400 training rows, then over-sample class 1 to 800.
probar_under_y_over(X, y, 1400, 800)

Train accuracy:  0.7159090909090909
Test acuracy:  0.5573010558846253
ROC auc score:  0.8346928125336797
Confusion matrix: 
[[1990 1702]
 [  17  174]]
