In [1]:
#################################################
# Prepare training set using balance techniques #
#################################################

import numpy as np
import pandas as pd
from scipy import stats

from imblearn.under_sampling import RandomUnderSampler, ClusterCentroids
from imblearn.over_sampling import RandomOverSampler, SMOTE, ADASYN
from imblearn.combine import SMOTEENN, SMOTETomek

import h2o
from h2o.grid.grid_search import H2OGridSearch
from h2o.estimators.glm import H2OGeneralizedLinearEstimator

In [2]:
# Best-effort shutdown of a cluster this session may already be attached to,
# before (re)starting H2O below.
try:
    h2o.cluster().shutdown()
except AttributeError:
    # presumably h2o.cluster() returns None when nothing is connected, so the
    # .shutdown() lookup fails — confirm against the installed h2o version
    pass
    
# Start (or connect to) a local H2O instance with a 24G memory limit.
# NOTE(review): nthreads = -1 is expected to mean "use all cores" — confirm.
h2o.init(nthreads = -1,
         max_mem_size = "24G")

Checking whether there is an H2O instance running at http://localhost:54321..... not found.
Attempting to start a local H2O server...
  Java Version: openjdk version "1.8.0_171"; OpenJDK Runtime Environment (build 1.8.0_171-8u171-b11-0ubuntu0.16.04.1-b11); OpenJDK 64-Bit Server VM (build 25.171-b11, mixed mode)
  Starting server from /usr/local/lib/python2.7/dist-packages/h2o/backend/bin/h2o.jar
  Ice root: /tmp/tmp0wsRXu
  JVM stdout: /tmp/tmp0wsRXu/h2o_mourao_started_from_python.out
  JVM stderr: /tmp/tmp0wsRXu/h2o_mourao_started_from_python.err
  Server is running at http://127.0.0.1:54321
Connecting to H2O server at http://127.0.0.1:54321... successful.


0,1
H2O cluster uptime:,02 secs
H2O cluster timezone:,America/Sao_Paulo
H2O data parsing timezone:,UTC
H2O cluster version:,3.18.0.4
H2O cluster version age:,2 months and 3 days
H2O cluster name:,H2O_from_python_mourao_go6wik
H2O cluster total nodes:,1
H2O cluster free memory:,21.33 Gb
H2O cluster total cores:,8
H2O cluster allowed cores:,8


In [3]:
# Random seed; passed as `random_state` to the imblearn samplers and as `seed`
# to the H2O GLM estimator.
my_seed = 1980

In [4]:
# Relative directory prefix for every data file this notebook reads (CSV) and
# writes (CSV and libsvm).
my_path = '../data/'

In [5]:
## Load the train and test splits as pandas DataFrames.
## (presumably produced by an earlier step of this experiment — confirm)
train = pd.read_csv(my_path + 'c2_e6_1_train.csv')
test = pd.read_csv(my_path + 'c2_e6_1_test.csv')

In [6]:
# `slim` is the feature list: every training column that is not listed in
# `remove` (the 'label' column itself plus columns kept out of the model).
# It is reused for resampling, for the H2O frames and for the libsvm export.
complete = train.columns.tolist()
remove = ['cd_pss', 'nm_mun_uor', 'sg_uf_uor', 'target', 'base_guess', 'label']
slim = [x for x in complete if x not in remove]

In [7]:
# Split the training frame into the feature matrix (columns in `slim`) and the
# 'label' column; these are the inputs handed to the samplers.
x = train[slim]
y = train['label']

In [8]:
# Build the H2O frame used to evaluate every model: the `slim` features plus
# 'label' from the test set, with 'label' converted to a factor (categorical).
test_h2o = h2o.H2OFrame(test[slim + ['label']])
test_h2o['label'] = test_h2o['label'].asfactor()

Parse progress: |█████████████████████████████████████████████████████████| 100%


In [9]:
# Class names of the imblearn resamplers to compare, in the order they are
# evaluated: under-sampling, over-sampling, then combined methods.
samplers = [
    'RandomUnderSampler',
    'ClusterCentroids',
    'RandomOverSampler',
    'SMOTE',
    'ADASYN',
    'SMOTEENN',
    'SMOTETomek',
]

In [10]:
def test_samplers(x, y, features, test_h2o, sampler, my_seed):
    """Resample the training data with one imblearn sampler and score the result.

    The sampler is instantiated with ``random_state=my_seed`` and
    ``ratio="all"``, applied to ``(x, y)``, and the resampled data is rebuilt
    as a DataFrame with a 'label' column followed by the feature columns.
    Record counts and the positive-label percentage are printed for the data
    passed in and for the resampled data, then ``score_sampler`` is called.

    Parameters
    ----------
    x : feature matrix to resample (one column per entry of ``features``).
    y : labels aligned with ``x``; summed to compute the label ratio, so it is
        expected to hold 0/1 values.
    features : list of column names assigned to the resampled feature matrix.
    test_h2o : H2O frame forwarded to ``score_sampler`` for evaluation.
    sampler : class name of the sampler, e.g. ``'SMOTE'``.
    my_seed : random seed for the sampler and for the model fit.

    Raises
    ------
    ValueError
        If ``sampler`` is not one of the known sampler class names.
    """
    # Resolve the class through an explicit registry so that only known
    # samplers can be instantiated and a typo gives a clear error.
    available = {cls.__name__: cls for cls in (
        RandomUnderSampler, ClusterCentroids, RandomOverSampler,
        SMOTE, ADASYN, SMOTEENN, SMOTETomek)}
    if sampler not in available:
        raise ValueError('Unknown sampler: %r (expected one of %s)'
                         % (sampler, ', '.join(sorted(available))))

    s = available[sampler](random_state=my_seed, ratio="all")
    x_s, y_s = s.fit_sample(x, y)
    train_s = pd.concat([pd.Series(y_s, name='label'),
                         pd.DataFrame(x_s, columns=features)], axis=1)

    # Report on the data that was actually passed in (x, y) rather than on a
    # notebook-level variable, so the numbers always match what was resampled.
    print(sampler)
    print('Train: ' + str(len(y)) + ' records')
    print('Label ratio in training set: ' + str(100 * sum(y) / len(y)) + '%')
    print('')
    print('Train resampled: ' + str(len(train_s)) + ' records')
    print('Label ratio in training set: ' + str(100 * sum(train_s['label']) / len(train_s)) + '%')
    print('')

    score_sampler(train_s, test_h2o, features, sampler, my_seed)

In [11]:
def score_sampler(train, test_h2o, features, name, my_seed):
    """Fit a grid of binomial GLMs on ``train`` and report the best one on test data.

    Parameters
    ----------
    train : pandas DataFrame with a 'label' column and the ``features`` columns.
    test_h2o : H2O frame the selected model is evaluated on.
    features : list of predictor column names.
    name : identifier used as the H2O grid id.
    my_seed : seed passed to the GLM estimator.

    Prints the grid ranking and the test-set performance of the top model;
    returns nothing.
    """
    frame = h2o.H2OFrame(train)
    # convert the response column to a factor (categorical) column
    frame['label'] = frame['label'].asfactor()

    # Base estimator: 5-fold cross-validation with modulo fold assignment and
    # lambda search enabled; only alpha is varied by the grid.
    base_glm = H2OGeneralizedLinearEstimator(family='binomial',
                                             seed=my_seed,
                                             fold_assignment='Modulo',
                                             lambda_search=True,
                                             nfolds=5)
    alpha_grid = {'alpha': [0, 0.1, 0.5, 0.7, 1.0]}

    grid = H2OGridSearch(base_glm,
                         grid_id=name,
                         hyper_params=alpha_grid)
    grid.train(x=features,
               y='label',
               training_frame=frame)

    # Rank the grid by F1, best first
    # (presumably the cross-validated F1, given nfolds=5 — confirm)
    ranked = grid.get_grid(sort_by='F1', decreasing=True)
    print(ranked)

    # The first model in the ranking is the one that gets evaluated.
    winner = ranked.models[0]

    # Score it on the test frame, which was not used for fitting, for an
    # honest estimate of its performance.
    test_metrics = winner.model_performance(test_h2o)
    print(test_metrics)

In [12]:
# Resample, fit and score once per sampler name; each iteration prints the
# record counts and label ratio before/after resampling, the grid ranking and
# the test-set metrics of the top model.
for sampler in samplers:
    test_samplers(x, y, slim, test_h2o, sampler, my_seed)

RandomUnderSampler
Train: 16636 records
Label ratio in training set: 4%

Train resampled: 1608 records
Label ratio in training set: 50%

Parse progress: |█████████████████████████████████████████████████████████| 100%
glm Grid Build progress: |████████████████████████████████████████████████| 100%
     alpha                   model_ids                  f1
0    [1.0]  RandomUnderSampler_model_4  0.8251497005988024
1    [0.7]  RandomUnderSampler_model_3  0.8226779252110976
2    [0.1]  RandomUnderSampler_model_1  0.8220183486238531
3    [0.5]  RandomUnderSampler_model_2  0.8215158924205379
4    [0.0]  RandomUnderSampler_model_0   0.821014050091631


ModelMetricsBinomialGLM: glm
** Reported on test data. **

MSE: 0.108033881523
RMSE: 0.328685079556
LogLoss: 0.391499279147
Null degrees of freedom: 12476
Residual degrees of freedom: 12433
Null deviance: 17296.7947437
Residual deviance: 11117.1882961
AIC: 11205.1882961
AUC: 0.895044679476
Gini: 0.790089358952
Confusion Matrix (Act/Pred) for m

0,1,2,3,4
,0.0,1.0,Error,Rate
0,11680.0,217.0,0.0182,(217.0/11897.0)
1,305.0,275.0,0.5259,(305.0/580.0)
Total,11985.0,492.0,0.0418,(522.0/12477.0)


Maximum Metrics: Maximum metrics at their respective thresholds



0,1,2,3
metric,threshold,value,idx
max f1,0.8998219,0.5130597,27.0
max f2,0.6414935,0.5500686,95.0
max f0point5,0.9705271,0.5627828,9.0
max accuracy,0.9820827,0.9616094,6.0
max precision,0.9998040,0.8137931,0.0
max recall,0.0479551,1.0,379.0
max specificity,0.9998040,0.9977305,0.0
max absolute_mcc,0.9089627,0.4943758,25.0
max min_per_class_accuracy,0.4145523,0.8172414,171.0


Gains/Lift Table: Avg response rate:  4,65 %



0,1,2,3,4,5,6,7,8,9,10,11
,group,cumulative_data_fraction,lower_threshold,lift,cumulative_lift,response_rate,cumulative_response_rate,capture_rate,cumulative_capture_rate,gain,cumulative_gain
,1,0.0100184,0.9994443,17.8980414,17.8980414,0.832,0.832,0.1793103,0.1793103,1689.8041379,1689.8041379
,2,0.0200369,0.9825765,11.8746621,14.8863517,0.552,0.692,0.1189655,0.2982759,1087.4662069,1388.6351724
,3,0.0300553,0.9434728,9.8095034,13.1940690,0.456,0.6133333,0.0982759,0.3965517,880.9503448,1219.4068966
,4,0.0400737,0.8942021,8.0885379,11.9176862,0.376,0.554,0.0810345,0.4775862,708.8537931,1091.7686207
,5,0.0500120,0.8486500,4.8575640,10.5147132,0.2258065,0.4887821,0.0482759,0.5258621,385.7563960,951.4713196
,6,0.1000240,0.6596621,2.8613810,6.6880471,0.1330128,0.3108974,0.1431034,0.6689655,186.1380968,568.8047082
,7,0.1500361,0.5258450,1.5858256,4.9873066,0.0737179,0.2318376,0.0793103,0.7482759,58.5825597,398.7306587
,8,0.2000481,0.4328556,1.2066064,4.0421315,0.0560897,0.1879006,0.0603448,0.8086207,20.6606432,304.2131548
,9,0.2999920,0.2944092,0.6555402,2.9138709,0.0304731,0.1354528,0.0655172,0.8741379,-34.4459807,191.3870950




ClusterCentroids
Train: 16636 records
Label ratio in training set: 4%

Train resampled: 1608 records
Label ratio in training set: 50%

Parse progress: |█████████████████████████████████████████████████████████| 100%
glm Grid Build progress: |████████████████████████████████████████████████| 100%
     alpha                 model_ids                  f1
0    [1.0]  ClusterCentroids_model_4  0.8255159474671669
1    [0.7]  ClusterCentroids_model_3  0.8248222365869425
2    [0.5]  ClusterCentroids_model_2  0.8247291268323773
3    [0.0]  ClusterCentroids_model_0  0.8241269841269842
4    [0.1]  ClusterCentroids_model_1  0.8241269841269842


ModelMetricsBinomialGLM: glm
** Reported on test data. **

MSE: 0.136952180865
RMSE: 0.370070507964
LogLoss: 0.453390564551
Null degrees of freedom: 12476
Residual degrees of freedom: 12430
Null deviance: 17296.7947437
Residual deviance: 11313.9081478
AIC: 11407.9081478
AUC: 0.86272973772
Gini: 0.72545947544
Confusion Matrix (Act/Pred) for max f1 @ thresh

0,1,2,3,4
,0.0,1.0,Error,Rate
0,11655.0,242.0,0.0203,(242.0/11897.0)
1,365.0,215.0,0.6293,(365.0/580.0)
Total,12020.0,457.0,0.0486,(607.0/12477.0)


Maximum Metrics: Maximum metrics at their respective thresholds



0,1,2,3
metric,threshold,value,idx
max f1,0.9304528,0.4146577,21.0
max f2,0.6658742,0.4636252,99.0
max f0point5,0.9834127,0.5326705,5.0
max accuracy,0.9834127,0.9609682,5.0
max precision,0.9981034,0.8515625,1.0
max recall,0.0066874,1.0,396.0
max specificity,0.9997782,0.9987392,0.0
max absolute_mcc,0.9757117,0.4194846,7.0
max min_per_class_accuracy,0.4282994,0.7854081,174.0


Gains/Lift Table: Avg response rate:  4,65 %



0,1,2,3,4,5,6,7,8,9,10,11
,group,cumulative_data_fraction,lower_threshold,lift,cumulative_lift,response_rate,cumulative_response_rate,capture_rate,cumulative_capture_rate,gain,cumulative_gain
,1,0.0100184,0.9969335,18.4143310,18.4143310,0.856,0.856,0.1844828,0.1844828,1741.4331035,1741.4331035
,2,0.0200369,0.9703348,9.9816,14.1979655,0.464,0.66,0.1,0.2844828,898.16,1319.7965517
,3,0.0300553,0.9453265,4.8187034,11.0715448,0.224,0.5146667,0.0482759,0.3327586,381.8703448,1007.1544828
,4,0.0400737,0.9211793,5.1628966,9.5943828,0.24,0.446,0.0517241,0.3844828,416.2896552,859.4382759
,5,0.0500120,0.8982295,3.2962041,8.3428216,0.1532258,0.3878205,0.0327586,0.4172414,229.6204116,734.2821618
,6,0.1000240,0.7725523,2.8269065,5.5848641,0.1314103,0.2596154,0.1413793,0.5586207,182.6906499,458.4864058
,7,0.1500361,0.6309578,2.1374171,4.4357151,0.0993590,0.2061966,0.1068966,0.6655172,113.7417109,343.5715075
,8,0.2000481,0.5101042,1.2755554,3.6456751,0.0592949,0.1694712,0.0637931,0.7293103,27.5555371,264.5675149
,9,0.2999920,0.3383093,1.1040677,2.7989254,0.0513232,0.1301095,0.1103448,0.8396552,10.4067693,179.8925350




RandomOverSampler
Train: 16636 records
Label ratio in training set: 4%

Train resampled: 31664 records
Label ratio in training set: 50%

Parse progress: |█████████████████████████████████████████████████████████| 100%
glm Grid Build progress: |████████████████████████████████████████████████| 100%
     alpha                  model_ids                  f1
0    [0.1]  RandomOverSampler_model_1  0.8346641906530485
1    [0.0]  RandomOverSampler_model_0  0.8343070229956495
2    [0.5]  RandomOverSampler_model_2  0.8337945028592512
3    [0.7]  RandomOverSampler_model_3  0.8331159555307123
4    [1.0]  RandomOverSampler_model_4  0.8319687383699294


ModelMetricsBinomialGLM: glm
** Reported on test data. **

MSE: 0.107862551509
RMSE: 0.328424346706
LogLoss: 0.375572969033
Null degrees of freedom: 12476
Residual degrees of freedom: 12427
Null deviance: 17296.7947437
Residual deviance: 9372.04786924
AIC: 9472.04786924
AUC: 0.897141556985
Gini: 0.79428311397
Confusion Matrix (Act/Pred) for max f1

0,1,2,3,4
,0.0,1.0,Error,Rate
0,11599.0,298.0,0.025,(298.0/11897.0)
1,276.0,304.0,0.4759,(276.0/580.0)
Total,11875.0,602.0,0.046,(574.0/12477.0)


Maximum Metrics: Maximum metrics at their respective thresholds



0,1,2,3
metric,threshold,value,idx
max f1,0.8446125,0.5143824,43.0
max f2,0.7574453,0.5522247,64.0
max f0point5,0.9560257,0.5692220,14.0
max accuracy,0.9874275,0.9624910,5.0
max precision,0.9997644,0.864,0.0
max recall,0.0352667,1.0,383.0
max specificity,0.9997644,0.9985711,0.0
max absolute_mcc,0.9079308,0.4939335,27.0
max min_per_class_accuracy,0.4225921,0.8206897,167.0


Gains/Lift Table: Avg response rate:  4,65 %



0,1,2,3,4,5,6,7,8,9,10,11
,group,cumulative_data_fraction,lower_threshold,lift,cumulative_lift,response_rate,cumulative_response_rate,capture_rate,cumulative_capture_rate,gain,cumulative_gain
,1,0.0100184,0.9988865,18.7585241,18.7585241,0.872,0.872,0.1879310,0.1879310,1775.8524138,1775.8524138
,2,0.0200369,0.9668401,11.7025655,15.2305448,0.544,0.708,0.1172414,0.3051724,1070.2565517,1423.0544828
,3,0.0300553,0.9213672,9.4653103,13.3088,0.44,0.6186667,0.0948276,0.4,846.5310345,1230.88
,4,0.0400737,0.8767107,7.0559586,11.7455897,0.328,0.546,0.0706897,0.4706897,605.5958621,1074.5589655
,5,0.0500120,0.8331605,5.5515017,10.5147132,0.2580645,0.4887821,0.0551724,0.5258621,455.1501669,951.4713196
,6,0.1000240,0.6552013,2.9303299,6.7225216,0.1362179,0.3125,0.1465517,0.6724138,193.0329907,572.2521552
,7,0.1500361,0.5323348,1.4479277,4.9643236,0.0673077,0.2307692,0.0724138,0.7448276,44.7927719,396.4323607
,8,0.2000481,0.4359109,1.3100298,4.0507502,0.0608974,0.1883013,0.0655172,0.8103448,31.0029841,305.0750166
,9,0.2999920,0.3039130,0.7245444,2.9426073,0.0336808,0.1367887,0.0724138,0.8827586,-27.5455576,194.2607350




SMOTE
Train: 16636 records
Label ratio in training set: 4%

Train resampled: 31664 records
Label ratio in training set: 50%

Parse progress: |█████████████████████████████████████████████████████████| 100%
glm Grid Build progress: |████████████████████████████████████████████████| 100%
     alpha      model_ids                  f1
0    [0.0]  SMOTE_model_0  0.8434541746568046
1    [0.1]  SMOTE_model_1  0.8433566007254548
2    [0.5]  SMOTE_model_2  0.8431029278599456
3    [0.7]  SMOTE_model_3  0.8424216918174713
4    [1.0]  SMOTE_model_4  0.8403183344932974


ModelMetricsBinomialGLM: glm
** Reported on test data. **

MSE: 0.108176854579
RMSE: 0.32890250011
LogLoss: 0.371877694809
Null degrees of freedom: 12476
Residual degrees of freedom: 12426
Null deviance: 17296.7947437
Residual deviance: 9279.83599627
AIC: 9381.83599627
AUC: 0.894030297409
Gini: 0.788060594818
Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.866633962276: 


0,1,2,3,4
,0.0,1.0,Error,Rate
0,11627.0,270.0,0.0227,(270.0/11897.0)
1,289.0,291.0,0.4983,(289.0/580.0)
Total,11916.0,561.0,0.0448,(559.0/12477.0)


Maximum Metrics: Maximum metrics at their respective thresholds



0,1,2,3
metric,threshold,value,idx
max f1,0.8666340,0.5100789,37.0
max f2,0.6822320,0.5366269,89.0
max f0point5,0.9455934,0.5626327,16.0
max accuracy,0.9926330,0.9620902,2.0
max precision,0.9997118,0.8692308,0.0
max recall,0.0160812,1.0,390.0
max specificity,0.9997118,0.9985711,0.0
max absolute_mcc,0.8666340,0.4866852,37.0
max min_per_class_accuracy,0.4078529,0.8107926,176.0


Gains/Lift Table: Avg response rate:  4,65 %



0,1,2,3,4,5,6,7,8,9,10,11
,group,cumulative_data_fraction,lower_threshold,lift,cumulative_lift,response_rate,cumulative_response_rate,capture_rate,cumulative_capture_rate,gain,cumulative_gain
,1,0.0100184,0.9985619,18.7585241,18.7585241,0.872,0.872,0.1879310,0.1879310,1775.8524138,1775.8524138
,2,0.0200369,0.9666433,11.3583724,15.0584483,0.528,0.7,0.1137931,0.3017241,1035.8372414,1405.8448276
,3,0.0300553,0.9269453,9.4653103,13.1940690,0.44,0.6133333,0.0948276,0.3965517,846.5310345,1219.4068966
,4,0.0400737,0.8848325,6.7117655,11.5734931,0.312,0.538,0.0672414,0.4637931,571.1765517,1057.3493103
,5,0.0500120,0.8393581,5.8984705,10.4457643,0.2741935,0.4855769,0.0586207,0.5224138,489.8470523,944.5764257
,6,0.1000240,0.6711253,2.6545342,6.5501492,0.1233974,0.3044872,0.1327586,0.6551724,165.4534151,555.0149204
,7,0.1500361,0.5404758,1.4479277,4.8494087,0.0673077,0.2254274,0.0724138,0.7275862,44.7927719,384.9408709
,8,0.2000481,0.4347454,1.3100298,3.9645640,0.0608974,0.1842949,0.0655172,0.7931034,31.0029841,296.4563992
,9,0.2999920,0.2960831,0.8798039,2.9368601,0.0408982,0.1365215,0.0879310,0.8810345,-12.0196057,193.6860070




ADASYN
Train: 16636 records
Label ratio in training set: 4%

Train resampled: 31761 records
Label ratio in training set: 50%

Parse progress: |█████████████████████████████████████████████████████████| 100%
glm Grid Build progress: |████████████████████████████████████████████████| 100%
     alpha       model_ids                  f1
0    [0.1]  ADASYN_model_1  0.8251945382469534
1    [0.5]  ADASYN_model_2   0.825139023744373
2    [0.0]  ADASYN_model_0  0.8248073914455235
3    [0.7]  ADASYN_model_3  0.8240675165250236
4    [1.0]  ADASYN_model_4  0.8231472928275575


ModelMetricsBinomialGLM: glm
** Reported on test data. **

MSE: 0.127258310214
RMSE: 0.356732827497
LogLoss: 0.420095125127
Null degrees of freedom: 12476
Residual degrees of freedom: 12430
Null deviance: 17366.0369299
Residual deviance: 10483.0537524
AIC: 10577.0537524
AUC: 0.890219064209
Gini: 0.780438128418
Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.871666632971: 


0,1,2,3,4
,0.0,1.0,Error,Rate
0,11622.0,275.0,0.0231,(275.0/11897.0)
1,294.0,286.0,0.5069,(294.0/580.0)
Total,11916.0,561.0,0.0456,(569.0/12477.0)


Maximum Metrics: Maximum metrics at their respective thresholds



0,1,2,3
metric,threshold,value,idx
max f1,0.8716666,0.5013146,34.0
max f2,0.7554464,0.5310263,67.0
max f0point5,0.9486690,0.5650685,14.0
max accuracy,0.9805706,0.9624910,6.0
max precision,0.9997299,0.89,0.0
max recall,0.0272300,1.0,386.0
max specificity,0.9997299,0.9990754,0.0
max absolute_mcc,0.8716666,0.4774998,34.0
max min_per_class_accuracy,0.4638388,0.8068966,160.0


Gains/Lift Table: Avg response rate:  4,65 %



0,1,2,3,4,5,6,7,8,9,10,11
,group,cumulative_data_fraction,lower_threshold,lift,cumulative_lift,response_rate,cumulative_response_rate,capture_rate,cumulative_capture_rate,gain,cumulative_gain
,1,0.0100184,0.9970539,18.5864276,18.5864276,0.864,0.864,0.1862069,0.1862069,1758.6427586,1758.6427586
,2,0.0200369,0.9592124,11.8746621,15.2305448,0.552,0.708,0.1189655,0.3051724,1087.4662069,1423.0544828
,3,0.0300553,0.9215527,8.2606345,12.9072414,0.384,0.6,0.0827586,0.3879310,726.0634483,1190.7241379
,4,0.0400737,0.8880065,6.0233793,11.1862759,0.28,0.52,0.0603448,0.4482759,502.3379310,1018.6275862
,5,0.0500120,0.8516879,6.9393771,10.3423408,0.3225806,0.4807692,0.0689655,0.5172414,593.9377086,934.2340849
,6,0.1000240,0.7106020,2.5511107,6.4467258,0.1185897,0.2996795,0.1275862,0.6448276,155.1110743,544.6725796
,7,0.1500361,0.5927580,1.5168767,4.8034427,0.0705128,0.2232906,0.0758621,0.7206897,51.6876658,380.3442750
,8,0.2000481,0.4931538,1.2410809,3.9128523,0.0576923,0.1818910,0.0620690,0.7827586,24.1080902,291.2852288
,9,0.2999920,0.3534018,0.9488082,2.9253655,0.0441059,0.1359872,0.0948276,0.8775862,-5.1191826,192.5365510




SMOTEENN
Train: 16636 records
Label ratio in training set: 4%

Train resampled: 29410 records
Label ratio in training set: 52%

Parse progress: |█████████████████████████████████████████████████████████| 100%
glm Grid Build progress: |████████████████████████████████████████████████| 100%
     alpha         model_ids                  f1
0    [0.1]  SMOTEENN_model_1  0.8752074656311652
1    [0.0]  SMOTEENN_model_0  0.8745373565021014
2    [0.7]  SMOTEENN_model_3   0.874478431372549
3    [0.5]  SMOTEENN_model_2  0.8744266765836123
4    [1.0]  SMOTEENN_model_4  0.8732456140350877


ModelMetricsBinomialGLM: glm
** Reported on test data. **

MSE: 0.128991402395
RMSE: 0.359153730865
LogLoss: 0.445510719536
Null degrees of freedom: 12476
Residual degrees of freedom: 12427
Null deviance: 18666.4153974
Residual deviance: 11117.2744953
AIC: 11217.2744953
AUC: 0.897636031106
Gini: 0.795272062212
Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.949188853736: 


0,1,2,3,4
,0.0,1.0,Error,Rate
0,11665.0,232.0,0.0195,(232.0/11897.0)
1,299.0,281.0,0.5155,(299.0/580.0)
Total,11964.0,513.0,0.0426,(531.0/12477.0)


Maximum Metrics: Maximum metrics at their respective thresholds



0,1,2,3
metric,threshold,value,idx
max f1,0.9491889,0.5141812,18.0
max f2,0.7937040,0.5455067,62.0
max f0point5,0.9706612,0.5629596,11.0
max accuracy,0.9962056,0.9619300,2.0
max precision,0.9998875,0.8291139,0.0
max recall,0.0118367,1.0,391.0
max specificity,0.9998875,0.9977305,0.0
max absolute_mcc,0.9491889,0.4930285,18.0
max min_per_class_accuracy,0.4761914,0.8171808,161.0


Gains/Lift Table: Avg response rate:  4,65 %



0,1,2,3,4,5,6,7,8,9,10,11
,group,cumulative_data_fraction,lower_threshold,lift,cumulative_lift,response_rate,cumulative_response_rate,capture_rate,cumulative_capture_rate,gain,cumulative_gain
,1,0.0100184,0.9998563,18.9306207,18.9306207,0.88,0.88,0.1896552,0.1896552,1793.0620690,1793.0620690
,2,0.0200369,0.9924318,11.0141793,14.9724,0.512,0.696,0.1103448,0.3,1001.4179310,1397.24
,3,0.0300553,0.9727520,10.3257931,13.4235310,0.48,0.624,0.1034483,0.4034483,932.5793103,1242.3531034
,4,0.0400737,0.9499733,7.2280552,11.8746621,0.336,0.552,0.0724138,0.4758621,622.8055172,1087.4662069
,5,0.0500120,0.9210741,4.8575640,10.4802387,0.2258065,0.4871795,0.0482759,0.5241379,385.7563960,948.0238727
,6,0.1000240,0.7733233,2.7924320,6.6363354,0.1298077,0.3084936,0.1396552,0.6637931,179.2432029,563.6335378
,7,0.1500361,0.6362093,1.7926724,5.0217811,0.0833333,0.2334402,0.0896552,0.7534483,79.2672414,402.1781057
,8,0.2000481,0.5050443,1.0687086,4.0335129,0.0496795,0.1875,0.0534483,0.8068966,6.8708554,303.3512931
,9,0.2999920,0.3230517,0.7245444,2.9311128,0.0336808,0.1362543,0.0724138,0.8793103,-27.5455576,193.1112790




SMOTETomek
Train: 16636 records
Label ratio in training set: 4%

Train resampled: 31626 records
Label ratio in training set: 50%

Parse progress: |█████████████████████████████████████████████████████████| 100%
glm Grid Build progress: |████████████████████████████████████████████████| 100%
     alpha           model_ids                  f1
0    [0.0]  SMOTETomek_model_0  0.8435946695941207
1    [0.1]  SMOTETomek_model_1  0.8433955919664965
2    [0.5]  SMOTETomek_model_2  0.8432177799251117
3    [0.7]  SMOTETomek_model_3  0.8417935702199661
4    [1.0]  SMOTETomek_model_4  0.8403737413443803


ModelMetricsBinomialGLM: glm
** Reported on test data. **

MSE: 0.10813051969
RMSE: 0.328832053927
LogLoss: 0.371710734708
Null degrees of freedom: 12476
Residual degrees of freedom: 12426
Null deviance: 17296.7947437
Residual deviance: 9275.6696739
AIC: 9377.6696739
AUC: 0.893991458293
Gini: 0.787982916586
Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.855124441925: 


0,1,2,3,4
,0.0,1.0,Error,Rate
0,11607.0,290.0,0.0244,(290.0/11897.0)
1,283.0,297.0,0.4879,(283.0/580.0)
Total,11890.0,587.0,0.0459,(573.0/12477.0)


Maximum Metrics: Maximum metrics at their respective thresholds



0,1,2,3
metric,threshold,value,idx
max f1,0.8551244,0.5089974,39.0
max f2,0.6828793,0.5363224,87.0
max f0point5,0.9618040,0.5622010,12.0
max accuracy,0.9924623,0.9620101,3.0
max precision,0.9997085,0.8615385,0.0
max recall,0.0173759,1.0,389.0
max specificity,0.9997085,0.9984870,0.0
max absolute_mcc,0.8632549,0.4850511,37.0
max min_per_class_accuracy,0.4085607,0.8121375,173.0


Gains/Lift Table: Avg response rate:  4,65 %



0,1,2,3,4,5,6,7,8,9,10,11
,group,cumulative_data_fraction,lower_threshold,lift,cumulative_lift,response_rate,cumulative_response_rate,capture_rate,cumulative_capture_rate,gain,cumulative_gain
,1,0.0100184,0.9985853,18.7585241,18.7585241,0.872,0.872,0.1879310,0.1879310,1775.8524138,1775.8524138
,2,0.0200369,0.9668990,11.3583724,15.0584483,0.528,0.7,0.1137931,0.3017241,1035.8372414,1405.8448276
,3,0.0300553,0.9270481,9.4653103,13.1940690,0.44,0.6133333,0.0948276,0.3965517,846.5310345,1219.4068966
,4,0.0400737,0.8854255,6.7117655,11.5734931,0.312,0.538,0.0672414,0.4637931,571.1765517,1057.3493103
,5,0.0500120,0.8397409,5.8984705,10.4457643,0.2741935,0.4855769,0.0586207,0.5224138,489.8470523,944.5764257
,6,0.1000240,0.6713966,2.6545342,6.5501492,0.1233974,0.3044872,0.1327586,0.6551724,165.4534151,555.0149204
,7,0.1500361,0.5403850,1.4824022,4.8609002,0.0689103,0.2259615,0.0741379,0.7293103,48.2402188,386.0900199
,8,0.2000481,0.4349507,1.3100298,3.9731826,0.0608974,0.1846955,0.0655172,0.7948276,31.0029841,297.3182609
,9,0.2999920,0.2955629,0.8625529,2.9368601,0.0400962,0.1365215,0.0862069,0.8810345,-13.7447114,193.6860070






  chunks = self.iterencode(o, _one_shot=True)


In [13]:
# Display the training frame's column names.
train.columns

Index([u'cd_pss', u'nm_mun_uor', u'sg_uf_uor', u'label', u'target',
       u'base_guess', u'v1', u'v2', u'v3', u'v4', u'v5', u'v6', u'v7', u'v8',
       u'v9', u'v10', u'v11', u'v12', u'v13', u'v14', u'v15', u'v16', u'v17',
       u'v18', u'v19', u'v20', u'v21', u'v22', u'v23', u'v24', u'v25', u'v26',
       u'v27', u'v28', u'v29', u'v30', u'v31', u'v32', u'v33', u'v34', u'v35',
       u'v36', u'v37', u'v38', u'v39', u'v40', u'v41', u'v42', u'v43', u'v44',
       u'v45', u'v46', u'v47', u'v48', u'v49', u'v50'],
      dtype='object')

  chunks = self.iterencode(o, _one_shot=True)


In [14]:
# save best sampler
# Rebuild the training set with RandomOverSampler and keep it as `train_s`.
# The feature list here differs from `slim`: 'base_guess' is not removed, so
# it is carried through into the resampled frame.
remove = ['cd_pss', 'nm_mun_uor', 'sg_uf_uor', 'target', 'label']
# Use `col` rather than `x` as the loop variable: under Python 2 a list
# comprehension leaks its variable into the enclosing scope, which would
# overwrite the notebook-level training frame `x` with a column name.
features = [col for col in train.columns if col not in remove]

s = RandomOverSampler(random_state=my_seed)
x_s, y_s = s.fit_sample(train[features], train['label'])
# Reassemble as a DataFrame: 'label' first, then the feature columns.
train_s = pd.concat([pd.Series(y_s, name='label'),
                     pd.DataFrame(x_s, columns=features)], axis=1)

  chunks = self.iterencode(o, _one_shot=True)


In [15]:
# Write the resampled training set to CSV under my_path.
# NOTE(review): to_csv writes the DataFrame index by default, so the file gets
# an extra unnamed leading column — confirm downstream readers expect that, or
# pass index=False.
train_s.to_csv(my_path + 'c2_e6_2_train.csv')

  chunks = self.iterencode(o, _one_shot=True)


In [16]:
# create libsvm files, inspired by https://github.com/zygmuntz/phraug

def create_libsvm(df, features, out):
    """Write ``df`` to ``out`` as text, one ``label idx:value ...`` line per row.

    Parameters
    ----------
    df : pandas DataFrame with a 'label' column and every column in ``features``.
    features : ordered list of column names; the n-th name is written with
        index n, counting from 1.
    out : path of the file to create (overwritten if it exists).

    Each line starts with the row's label cast to int, followed by one
    space-separated ``index:value`` pair per feature, values rendered with
    ``str``.
    """
    with open(out, 'w') as handle:
        for _, record in df.iterrows():
            fields = [str(int(record['label']))]
            fields.extend('%s:%s' % (position, str(record[name]))
                          for position, name in enumerate(features, 1))
            handle.write(' '.join(fields) + '\n')

  chunks = self.iterencode(o, _one_shot=True)


In [17]:
# Export the resampled training set and the original test set in libsvm
# format, using the `slim` feature list for both files.
create_libsvm(train_s, slim, my_path + 'c2_e6_2_train.svm')
create_libsvm(test, slim, my_path + 'c2_e6_2_test.svm')

  chunks = self.iterencode(o, _one_shot=True)
