# XGBoost Model running in parallel on GPU's
#### Sample data use case: likely to upgrade to Hispanic Tier


## Check for number of GPU's



In [1]:
!nvidia-smi

Thu Aug  1 14:14:21 2019       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 410.79       Driver Version: 410.79       CUDA Version: 10.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|   0  Tesla V100-DGXS...  On   | 00000000:07:00.0 Off |                    0 |
| N/A   40C    P0    38W / 300W |      0MiB / 16128MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
|   1  Tesla V100-DGXS...  On   | 00000000:08:00.0 Off |                    0 |
| N/A   40C    P0    39W / 300W |      0MiB / 16128MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
|   2  Tesla V100-DGXS...  On   | 00000000:0E:00.0 Off |                    0 |
| N/A   

In [2]:
!nproc

40


## CUDA Version

Next, let's see what CUDA version we have.

In [3]:
!nvcc --version

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2018 NVIDIA Corporation
Built on Sat_Aug_25_21:08:01_CDT_2018
Cuda compilation tools, release 10.0, V10.0.130


## Load our libraries

In [6]:
#import cudf; print('cuDF Version:', cudf.__version__)
#import cuml; print('cuML Version:', '0.2.0')
#import dask; print('dask Version:', dask.__version__)
# import dask_gdf; print('dask_gdf Version:', dask_gdf.__version__)
# import dask_xgboost; print('dask_xgboost Version:', dask_xgboost.__version__)
#import numba; print('numba Version:', numba.__version__)
import numpy; print('numpy Version:', numpy.__version__)
import matplotlib; print('matplotlib Version:', matplotlib.__version__)
import pandas; print('pandas Version:', pandas.__version__)
import pyarrow; print('pyarrow Version:', pyarrow.__version__)


#xgboost library has been re-compiled with NCCL to support running on GPU
import xgboost; print('XGBoost Version:', xgboost.__version__)

numpy Version: 1.16.2
matplotlib Version: 3.0.2
pandas Version: 0.24.1
pyarrow Version: 0.12.1
XGBoost Version: 0.81


## Load/Simulate data

### Load data

We can load the data using `pandas.read_csv`.

### Simulate data

Alternatively, we can simulate data for our train and validation datasets. The features will be tabular with `n_rows` and `n_columns` in the training dataset, where each value is either of type `np.float32` if the data is numerical or `np.uint8` if the data is categorical. Both numerical and categorical data can also be combined; for this experiment, we have ignored this combination.

In [7]:
import numpy as np
import pandas as pd


# helper function for simulating data
def simulate_data(m, n, k=2, numerical=False):
    if numerical:
        features = np.random.rand(m, n)
    else:
        features = np.random.randint(2, size=(m, n))
    labels = np.random.randint(k, size=m)
    return np.c_[labels, features].astype(np.float32)


# helper function for loading data
def load_data(filename, n_rows):
    if n_rows >= 1e9:
        df = pd.read_csv(filename)
    else:
        df = pd.read_csv(filename, nrows=n_rows)
    return df.values.astype(np.float32)

In [12]:
df = pd.read_csv('hisp_tier_sample.txt', delimiter="\t", header='infer',skip_blank_lines='True',engine='python')

In [13]:
print(df.shape)

(1045736, 301)


In [14]:
# settings
LOAD = False
n_gpus = 4
n_rows, n_columns = df.shape
n_categories = 2

In [17]:
df.columns

Index(['_u1.e_c_pct_hh_speakspanish', '_u1.r_cr_cust_sol_cnt',
       '_u1.telesales_est_cntcts', '_u1.r_hispanic_acculturation',
       '_u1.r_second_indv_age', '_u1.tts_past_past_ttl_tickets',
       '_u1.sc_max_up_fec_uncorrect_l30d', '_u1.mrm_recurring_hsd_amt_10m',
       '_u1.e_pcthh_fam_no_child_lt_18', '_u1.svcs_days_since_had_last_vid',
       ...
       '_u1.truckroll_tot_points_7d', '_u1.r_vid_mos_on_books_gap_18mo',
       '_u1.vi_tbn_max_dvr', '_u1.eml_snt_strat_cust_l7d',
       '_u1.r_vid_active_mos_on_books', '_u1.e_pct_wrst_stat_30',
       '_u1.acct_prev_busunit_principal', '_u1.sc_avg_ds_receive_pwr_l3d',
       '_u1.e_pct_pop_no_sch_age_3plus', '_u1.telesales_cntct_nm'],
      dtype='object', length=301)

In [18]:
#remove the prefix from column names
df.rename(columns=lambda x: x[4:], inplace=True)

In [19]:
df.columns

Index(['e_c_pct_hh_speakspanish', 'r_cr_cust_sol_cnt', 'telesales_est_cntcts',
       'r_hispanic_acculturation', 'r_second_indv_age',
       'tts_past_past_ttl_tickets', 'sc_max_up_fec_uncorrect_l30d',
       'mrm_recurring_hsd_amt_10m', 'e_pcthh_fam_no_child_lt_18',
       'svcs_days_since_had_last_vid',
       ...
       'truckroll_tot_points_7d', 'r_vid_mos_on_books_gap_18mo',
       'vi_tbn_max_dvr', 'eml_snt_strat_cust_l7d', 'r_vid_active_mos_on_books',
       'e_pct_wrst_stat_30', 'acct_prev_busunit_principal',
       'sc_avg_ds_receive_pwr_l3d', 'e_pct_pop_no_sch_age_3plus',
       'telesales_cntct_nm'],
      dtype='object', length=301)

In [20]:
df.head(10)

Unnamed: 0,e_c_pct_hh_speakspanish,r_cr_cust_sol_cnt,telesales_est_cntcts,r_hispanic_acculturation,r_second_indv_age,tts_past_past_ttl_tickets,sc_max_up_fec_uncorrect_l30d,mrm_recurring_hsd_amt_10m,e_pcthh_fam_no_child_lt_18,svcs_days_since_had_last_vid,...,truckroll_tot_points_7d,r_vid_mos_on_books_gap_18mo,vi_tbn_max_dvr,eml_snt_strat_cust_l7d,r_vid_active_mos_on_books,e_pct_wrst_stat_30,acct_prev_busunit_principal,sc_avg_ds_receive_pwr_l3d,e_pct_pop_no_sch_age_3plus,telesales_cntct_nm
0,93.7,,0,HA5,U,32.0,92.0,6.71,41.7,483.0,...,0.0,238,,0.0,238,58.0,6000.0,-0.025,78.8,
1,70.7,,0,HA5,U,23.0,,0.0,34.0,813.0,...,0.0,68,,,68,67.0,6000.0,,77.4,
2,69.0,,0,,53,18.0,9940.0,47.83,34.2,,...,0.0,17,,4.0,17,36.0,6000.0,-4.0875,86.5,
3,2.3,,0,,U,17.0,81.0,,42.8,,...,0.0,5,,,5,100.0,7400.0,5.291667,72.3,
4,6.0,,0,,U,23.0,94.0,,44.6,,...,,0,,0.0,0,53.0,7400.0,-4.683333,66.7,
5,9.8,,0,,77,48.0,98.0,42.0,34.3,,...,0.0,25,,0.0,25,50.0,7500.0,-2.009524,76.6,
6,24.1,1.0,0,,U,23.0,94.0,57.22,35.9,322.0,...,0.0,10,,3.0,10,68.0,7500.0,-3.363636,77.2,
7,2.3,,0,,U,36.0,,0.0,25.3,286.0,...,,20,,2.0,20,56.0,2000.0,,78.0,
8,1.5,,0,,55,15.0,413.0,,38.4,,...,0.0,36,,1.0,36,57.0,7000.0,-1.886667,70.1,
9,63.2,0.0,0,HA3,U,4.0,9.0,56.79,35.9,,...,0.0,25,,,25,56.0,9500.0,-0.656522,71.7,


In [21]:
df.r_geo_hispanicity.value_counts()

HA3    35447
HA2    35396
HA1    12088
HA4    10033
HA5     5077
Name: r_geo_hispanicity, dtype: int64

In [28]:
dataset = df[['r_first_indv_age', 'r_demo_first_indv_age',
       'nsd_demo_first_indv_age', 'vi_cine_max_totl',
       'e_c_child_age_7t9_s', 'e_c_child_age_4t6_s',
       'r_bill_pmt_cycle', 'vi_cine_max_lin_sum','r_geo_hispanicity',
       'vi_hispaniclinearmin','nsd_hispanic_tier_ind','r_spanish_bill','r_geo_hispanicity',
       'target']]
dataset.head(10)

Unnamed: 0,r_first_indv_age,r_demo_first_indv_age,nsd_demo_first_indv_age,vi_cine_max_totl,e_c_child_age_7t9_s,e_c_child_age_4t6_s,r_bill_pmt_cycle,vi_cine_max_lin_sum,r_geo_hispanicity,vi_hispaniclinearmin,nsd_hispanic_tier_ind,r_spanish_bill,r_geo_hispanicity.1,target
0,48,81,81,,5.0,11.0,12,,HA5,,,,HA5,1
1,57,58,71,,2.0,1.0,28,,HA5,8.64,,,HA5,1
2,53,50,50,,2.0,2.0,1,,,8.42,,,,1
3,45,29,29,,16.0,17.0,20,,,,,,,1
4,45,39,39,,14.0,18.0,1,,,,,,,1
5,75,73,25,,1.0,1.0,2,,,,,,,1
6,33,34,34,,5.0,8.0,25,,,32.92,,,,1
7,58,74,74,,1.0,1.0,5,,,,,,,1
8,61,58,46,,1.0,1.0,23,,,,,,,1
9,51,51,53,,0.0,0.0,26,,HA3,446.2,,,HA3,1


In [34]:
dataset = df[['r_first_indv_age', 'r_demo_first_indv_age',
       'nsd_demo_first_indv_age', 'vi_cine_max_totl',
       'e_c_child_age_7t9_s', 'e_c_child_age_4t6_s',
       'r_bill_pmt_cycle', 'vi_cine_max_lin_sum',
       'vi_hispaniclinearmin','nsd_hispanic_tier_ind','r_spanish_bill','r_geo_hispanicity',
       'target']]

In [35]:
dataset.columns

Index(['r_first_indv_age', 'r_demo_first_indv_age', 'nsd_demo_first_indv_age',
       'vi_cine_max_totl', 'e_c_child_age_7t9_s', 'e_c_child_age_4t6_s',
       'r_bill_pmt_cycle', 'vi_cine_max_lin_sum', 'vi_hispaniclinearmin',
       'nsd_hispanic_tier_ind', 'r_spanish_bill', 'r_geo_hispanicity',
       'target'],
      dtype='object')

In [36]:
dataset = dataset.fillna(0)
dataset.head(10)

Unnamed: 0,r_first_indv_age,r_demo_first_indv_age,nsd_demo_first_indv_age,vi_cine_max_totl,e_c_child_age_7t9_s,e_c_child_age_4t6_s,r_bill_pmt_cycle,vi_cine_max_lin_sum,vi_hispaniclinearmin,nsd_hispanic_tier_ind,r_spanish_bill,r_geo_hispanicity,target
0,48,81,81,0.0,5.0,11.0,12,0.0,0.0,0.0,0.0,HA5,1
1,57,58,71,0.0,2.0,1.0,28,0.0,8.64,0.0,0.0,HA5,1
2,53,50,50,0.0,2.0,2.0,1,0.0,8.42,0.0,0.0,0,1
3,45,29,29,0.0,16.0,17.0,20,0.0,0.0,0.0,0.0,0,1
4,45,39,39,0.0,14.0,18.0,1,0.0,0.0,0.0,0.0,0,1
5,75,73,25,0.0,1.0,1.0,2,0.0,0.0,0.0,0.0,0,1
6,33,34,34,0.0,5.0,8.0,25,0.0,32.92,0.0,0.0,0,1
7,58,74,74,0.0,1.0,1.0,5,0.0,0.0,0.0,0.0,0,1
8,61,58,46,0.0,1.0,1.0,23,0.0,0.0,0.0,0.0,0,1
9,51,51,53,0.0,0.0,0.0,26,0.0,446.2,0.0,0.0,HA3,1


In [37]:
dataset["r_first_indv_age"] = dataset.r_first_indv_age.replace("U", 50)
dataset["r_demo_first_indv_age"] = dataset.r_demo_first_indv_age.replace("U", 50)
dataset["nsd_demo_first_indv_age"] = dataset.nsd_demo_first_indv_age.replace("U", 50)
dataset["r_geo_hispanicity"] = dataset.r_geo_hispanicity.replace('HA1', 1)
dataset["r_geo_hispanicity"] = dataset.r_geo_hispanicity.replace('HA2', 2)
dataset["r_geo_hispanicity"] = dataset.r_geo_hispanicity.replace('HA3', 3)
dataset["r_geo_hispanicity"] = dataset.r_geo_hispanicity.replace('HA4', 4)
dataset["r_geo_hispanicity"] = dataset.r_geo_hispanicity.replace('HA5', 5)

In [38]:
dataset["r_first_indv_age"] = dataset.r_first_indv_age.astype(np.float32)
dataset["r_demo_first_indv_age"] = dataset.r_demo_first_indv_age.astype(np.float32)
dataset["nsd_demo_first_indv_age"] = dataset.nsd_demo_first_indv_age.astype(np.float32)
dataset["r_bill_pmt_cycle"] = dataset.r_bill_pmt_cycle.astype(np.float32)

In [39]:
dataset.head(10)

Unnamed: 0,r_first_indv_age,r_demo_first_indv_age,nsd_demo_first_indv_age,vi_cine_max_totl,e_c_child_age_7t9_s,e_c_child_age_4t6_s,r_bill_pmt_cycle,vi_cine_max_lin_sum,vi_hispaniclinearmin,nsd_hispanic_tier_ind,r_spanish_bill,r_geo_hispanicity,target
0,48.0,81.0,81.0,0.0,5.0,11.0,12.0,0.0,0.0,0.0,0.0,5,1
1,57.0,58.0,71.0,0.0,2.0,1.0,28.0,0.0,8.64,0.0,0.0,5,1
2,53.0,50.0,50.0,0.0,2.0,2.0,1.0,0.0,8.42,0.0,0.0,0,1
3,45.0,29.0,29.0,0.0,16.0,17.0,20.0,0.0,0.0,0.0,0.0,0,1
4,45.0,39.0,39.0,0.0,14.0,18.0,1.0,0.0,0.0,0.0,0.0,0,1
5,75.0,73.0,25.0,0.0,1.0,1.0,2.0,0.0,0.0,0.0,0.0,0,1
6,33.0,34.0,34.0,0.0,5.0,8.0,25.0,0.0,32.92,0.0,0.0,0,1
7,58.0,74.0,74.0,0.0,1.0,1.0,5.0,0.0,0.0,0.0,0.0,0,1
8,61.0,58.0,46.0,0.0,1.0,1.0,23.0,0.0,0.0,0.0,0.0,0,1
9,51.0,51.0,53.0,0.0,0.0,0.0,26.0,0.0,446.2,0.0,0.0,3,1


In [40]:
dataset.dtypes

r_first_indv_age           float32
r_demo_first_indv_age      float32
nsd_demo_first_indv_age    float32
vi_cine_max_totl           float64
e_c_child_age_7t9_s        float64
e_c_child_age_4t6_s        float64
r_bill_pmt_cycle           float32
vi_cine_max_lin_sum        float64
vi_hispaniclinearmin       float64
nsd_hispanic_tier_ind      float64
r_spanish_bill             float64
r_geo_hispanicity            int64
target                       int64
dtype: object

In [41]:
n_rows, n_columns = dataset.shape
train_size = 0.8

In [42]:
# split X, y
X, y  = dataset[['r_first_indv_age', 'r_demo_first_indv_age', 'nsd_demo_first_indv_age','vi_cine_max_totl', 
            'e_c_child_age_7t9_s', 'e_c_child_age_4t6_s','r_geo_hispanicity','vi_hispaniclinearmin',
                 'nsd_hispanic_tier_ind','r_spanish_bill',
                 'r_bill_pmt_cycle','vi_cine_max_lin_sum']] , dataset[['target']]

In [43]:
# split train data
X_train = X.sample(frac=train_size, replace=False, random_state=4)
y_train = y.sample(frac=train_size, replace=False, random_state=4)
train_index = X_train.index.values
#X_train, y_train = X[:train_index], y[:train_index]
X_train.count()

r_first_indv_age           836589
r_demo_first_indv_age      836589
nsd_demo_first_indv_age    836589
vi_cine_max_totl           836589
e_c_child_age_7t9_s        836589
e_c_child_age_4t6_s        836589
r_geo_hispanicity          836589
vi_hispaniclinearmin       836589
nsd_hispanic_tier_ind      836589
r_spanish_bill             836589
r_bill_pmt_cycle           836589
vi_cine_max_lin_sum        836589
dtype: int64

In [44]:
# split validation data
X_validation, y_validation = X.drop(train_index), y.drop(train_index)
X_validation.count()

r_first_indv_age           209147
r_demo_first_indv_age      209147
nsd_demo_first_indv_age    209147
vi_cine_max_totl           209147
e_c_child_age_7t9_s        209147
e_c_child_age_4t6_s        209147
r_geo_hispanicity          209147
vi_hispaniclinearmin       209147
nsd_hispanic_tier_ind      209147
r_spanish_bill             209147
r_bill_pmt_cycle           209147
vi_cine_max_lin_sum        209147
dtype: int64

In [45]:
y_train.target.value_counts()

0    799972
1     36617
Name: target, dtype: int64

In [46]:
y_validation.target.value_counts()

0    200028
1      9119
Name: target, dtype: int64

In [47]:
%%time

if LOAD:
    dataset = load_data('hisp_tier.txt', n_rows )
else:
    dataset = simulate_data(n_rows, n_columns, n_categories)
print(dataset.shape)

(1045736, 14)
CPU times: user 147 ms, sys: 0 ns, total: 147 ms
Wall time: 145 ms


### Split data for simulated ones here

We'll split our dataset into a 80% training dataset and a 20% validation dataset.

In [11]:
# identify shape and indices
n_rows, n_columns = dataset.shape
train_size = 0.80
train_index = int(n_rows * train_size)

# split X, y
X, y = dataset[:, 1:], dataset[:, 0]
#del dataset

# split train data
X_train, y_train = X[:train_index, :], y[:train_index]

# split validation data
X_validation, y_validation = X[train_index:, :], y[train_index:]

### Check dimensions

We can check the dimensions and proportions of our training and validation dataets.

In [12]:
# print(X_train[:3, :], y_train[:3])

In [48]:
# check dimensions
print('X_train: ', X_train.shape, X_train.dtypes, 'y_train: ', y_train.shape, y_train.dtypes)
print('X_validation', X_validation.shape, X_validation.dtypes, 'y_validation: ', y_validation.shape, y_validation.dtypes)

# check the proportions
total = X_train.shape[0] + X_validation.shape[0]
print('X_train proportion:', X_train.shape[0] / total)
print('X_validation proportion:', X_validation.shape[0] / total)

X_train:  (836589, 12) r_first_indv_age           float32
r_demo_first_indv_age      float32
nsd_demo_first_indv_age    float32
vi_cine_max_totl           float64
e_c_child_age_7t9_s        float64
e_c_child_age_4t6_s        float64
r_geo_hispanicity            int64
vi_hispaniclinearmin       float64
nsd_hispanic_tier_ind      float64
r_spanish_bill             float64
r_bill_pmt_cycle           float32
vi_cine_max_lin_sum        float64
dtype: object y_train:  (836589, 1) target    int64
dtype: object
X_validation (209147, 12) r_first_indv_age           float32
r_demo_first_indv_age      float32
nsd_demo_first_indv_age    float32
vi_cine_max_totl           float64
e_c_child_age_7t9_s        float64
e_c_child_age_4t6_s        float64
r_geo_hispanicity            int64
vi_hispaniclinearmin       float64
nsd_hispanic_tier_ind      float64
r_spanish_bill             float64
r_bill_pmt_cycle           float32
vi_cine_max_lin_sum        float64
dtype: object y_validation:  (209147, 1) targ

## Convert NumPy data to DMatrix format

With out data simulated and formatted as NumPy arrays, our next step is to convert this to a `DMatrix` object that XGBoost can work with. We can instantiate an object of the `xgboost.DMatrix` by passing in the feature matrix as the first argument followed by the label vector using the `label=` keyword argument. To learn more about XGBoost's support for data structures other than NumPy arrays, see the documentation for the Data Interface:


https://xgboost.readthedocs.io/en/latest/python/python_intro.html#data-interface


In [49]:
%%time

import xgboost as xgb

dtrain = xgb.DMatrix(X_train, label=y_train)
dvalidation = xgb.DMatrix(X_validation, label=y_validation)

CPU times: user 143 ms, sys: 197 µs, total: 144 ms
Wall time: 142 ms


## Set parameters

There are a number of parameters that can be set before XGBoost can be run. 

* General parameters relate to which booster we are using to do boosting, commonly tree or linear model
* Booster parameters depend on which booster you have chosen
* Learning task parameters decide on the learning scenario. For example, regression tasks may use different parameters with ranking tasks.

For more information on the configurable parameters within the XGBoost module, see the documentation here:


https://xgboost.readthedocs.io/en/latest/parameter.html

In [50]:
# instantiate params
params = {}

# general params
general_params = {'silent': 1}
params.update(general_params)

# booster params
# n_gpus = 0
booster_params = {'max_depth': 3,'eta':0.06}

if n_gpus != 0:
    booster_params['tree_method'] = 'gpu_hist'
    booster_params['n_gpus'] = n_gpus
params.update(booster_params)

# learning task params
learning_task_params = {'eval_metric': 'auc', 'objective': 'binary:logistic'}
params.update(learning_task_params)
print(params)

{'silent': 1, 'max_depth': 3, 'eta': 0.06, 'tree_method': 'gpu_hist', 'n_gpus': 4, 'eval_metric': 'auc', 'objective': 'binary:logistic'}


## Train model

Now it's time to train our model! We can use the `xgb.train` function and pass in the parameters, training dataset, the number of boosting iterations, and the list of items to be evaluated during training. For more information on the parameters that can be passed into `xgb.train`, check out the documentation:


https://xgboost.readthedocs.io/en/latest/python/python_api.html#xgboost.train

In [51]:
# model training settings
evallist = [(dvalidation, 'validation'), (dtrain, 'train')]
num_round = 10000

In [52]:
%%time

bst = xgb.train(params, dtrain, num_round, evallist)

[0]	validation-auc:0.629269	train-auc:0.626114
[1]	validation-auc:0.629257	train-auc:0.626135
[2]	validation-auc:0.629399	train-auc:0.626583
[3]	validation-auc:0.629406	train-auc:0.626594
[4]	validation-auc:0.629409	train-auc:0.626598
[5]	validation-auc:0.630113	train-auc:0.627165
[6]	validation-auc:0.630112	train-auc:0.627165
[7]	validation-auc:0.630137	train-auc:0.627198
[8]	validation-auc:0.630124	train-auc:0.627178
[9]	validation-auc:0.633761	train-auc:0.630758
[10]	validation-auc:0.633873	train-auc:0.630893
[11]	validation-auc:0.633873	train-auc:0.630897
[12]	validation-auc:0.63401	train-auc:0.631017
[13]	validation-auc:0.666803	train-auc:0.663339
[14]	validation-auc:0.666802	train-auc:0.663339
[15]	validation-auc:0.666813	train-auc:0.663354
[16]	validation-auc:0.666805	train-auc:0.663342
[17]	validation-auc:0.66738	train-auc:0.663697
[18]	validation-auc:0.667486	train-auc:0.663803
[19]	validation-auc:0.667488	train-auc:0.663816
[20]	validation-auc:0.667512	train-auc:0.663831
[21]

In [53]:
bst

<xgboost.core.Booster at 0x7f91c9716160>