In [1]:
import cuml
import cudf
import nvcategory

import xgboost as xgb
import numpy as np
from sklearn.metrics import mean_squared_error, roc_auc_score

In [2]:
import os
import urllib.request

data_dir = '../../data/blackfriday/'
if not os.path.exists(data_dir):
    print('creating black friday data directory')
    os.system('mkdir ../../data/blackfriday')

In [3]:
base_url = 'https://datahack-prod.s3.amazonaws.com/train_zip/'
ofn = 'train_oSwQCTC.zip'
fn = 'train.zip'
if not os.path.isfile(data_dir+ofn):
        print(f'Downloading {base_url+ofn} to {data_dir+fn}')
        urllib.request.urlretrieve(base_url+ofn, data_dir+fn)

Downloading https://datahack-prod.s3.amazonaws.com/train_zip/train_oSwQCTC.zip to ../../data/blackfriday/train.zip


In [4]:
#Read in the data. Notice how it decompresses as it reads the data into memory. 
gdf = cudf.read_csv(data_dir+fn)

In [5]:
len(gdf)

550068

In [6]:
#Taking a look at the data.
gdf.head()

Unnamed: 0,User_ID,Product_ID,Gender,Age,Occupation,City_Category,Stay_In_Current_City_Years,Marital_Status,Product_Category_1,Product_Category_2,Product_Category_3,Purchase
0,1000001,P00069042,F,0-17,10,A,2,0,3,,,8370
1,1000001,P00248942,F,0-17,10,A,2,0,1,6.0,14.0,15200
2,1000001,P00087842,F,0-17,10,A,2,0,12,,,1422
3,1000001,P00085442,F,0-17,10,A,2,0,12,14.0,,1057
4,1000002,P00285442,M,55+,16,C,4+,0,8,,,7969


In [7]:
#grabbing the first character of the years in city string to get rid of plus sign, and converting to int
gdf['city_years'] = gdf.Stay_In_Current_City_Years.str.get(0)

In [8]:
#Here we can see how we can control the value of our category variables with the replace method and turn strings to ints
gdf['City_Category'] = gdf.City_Category.str.replace('A', '1')
gdf['City_Category'] = gdf.City_Category.str.replace('B', '2')
gdf['City_Category'] = gdf.City_Category.str.replace('C', '3')
gdf['City_Category'] = gdf['City_Category'].str.stoi()

In [9]:
gdf.head()

Unnamed: 0,User_ID,Product_ID,Gender,Age,Occupation,City_Category,Stay_In_Current_City_Years,Marital_Status,Product_Category_1,Product_Category_2,Product_Category_3,Purchase,city_years
0,1000001,P00069042,F,0-17,10,1,2,0,3,,,8370,2
1,1000001,P00248942,F,0-17,10,1,2,0,1,6.0,14.0,15200,2
2,1000001,P00087842,F,0-17,10,1,2,0,12,,,1422,2
3,1000001,P00085442,F,0-17,10,1,2,0,12,14.0,,1057,2
4,1000002,P00285442,M,55+,16,3,4+,0,8,,,7969,4


In [10]:
gdf['Gender'] = gdf.Gender.str.replace('F', '1')
gdf['Gender'] = gdf.Gender.str.replace('M', '0')
gdf['Gender'] = gdf.Gender.str.stoi()

In [11]:
#Let's take a look at how many products we have
prod_count = gdf.Product_ID.unique().count()
print("Unique Products: {}".format(prod_count))

Unique Products: 3631


In [12]:
#Let's take a look at how many primary product categories we have
#We do it differently here because the variable is a number, not a string
prod1_count = gdf.Product_Category_1.unique().count()
print("Unique Product Categories: {}".format(prod1_count))

Unique Product Categories: 20


In [13]:
#Dummy for multi-category products
gdf['Product_Category_2'] = gdf['Product_Category_2'].fillna(0)
gdf['Product_Category_3'] = gdf['Product_Category_3'].fillna(0)
gdf['multi'] = ((gdf['Product_Category_2'] + gdf['Product_Category_3'])>0).astype('int')

In [14]:
gdf.head()

Unnamed: 0,User_ID,Product_ID,Gender,Age,Occupation,City_Category,Stay_In_Current_City_Years,Marital_Status,Product_Category_1,Product_Category_2,Product_Category_3,Purchase,city_years,multi
0,1000001,P00069042,1,0-17,10,1,2,0,3,0.0,0.0,8370,2,0
1,1000001,P00248942,1,0-17,10,1,2,0,1,6.0,14.0,15200,2,1
2,1000001,P00087842,1,0-17,10,1,2,0,12,0.0,0.0,1422,2,0
3,1000001,P00085442,1,0-17,10,1,2,0,12,14.0,0.0,1057,2,1
4,1000002,P00285442,0,55+,16,3,4+,0,8,0.0,0.0,7969,4,0


In [15]:
#Gender/Marital Status interaction variable
gdf['gen_mar_interaction'] = gdf['Gender']*gdf['Marital_Status']

In [16]:
#Because Occupation is a code, it should converted into indicator variables
gdf = gdf.one_hot_encoding('Occupation', 'occ_dummy', gdf.Occupation.unique())

In [17]:
gdf.dtypes

User_ID                         int64
Product_ID                     object
Gender                          int64
Age                            object
Occupation                      int64
City_Category                   int64
Stay_In_Current_City_Years     object
Marital_Status                  int64
Product_Category_1              int64
Product_Category_2            float64
Product_Category_3            float64
Purchase                        int64
city_years                     object
multi                           int64
gen_mar_interaction             int64
occ_dummy_0                   float64
occ_dummy_1                   float64
occ_dummy_2                   float64
occ_dummy_3                   float64
occ_dummy_4                   float64
occ_dummy_5                   float64
occ_dummy_6                   float64
occ_dummy_7                   float64
occ_dummy_8                   float64
occ_dummy_9                   float64
occ_dummy_10                  float64
occ_dummy_11

In [18]:
#Dummy variable from Int
gdf = gdf.one_hot_encoding('City_Category', 'city_cat', gdf.City_Category.unique())

#Dummy from string
gdf = gdf.one_hot_encoding('Age', 'age', gdf.Age.unique())

In [19]:
gdf.head()

Unnamed: 0,User_ID,Product_ID,Gender,Age,Occupation,City_Category,Stay_In_Current_City_Years,Marital_Status,Product_Category_1,Product_Category_2,...,city_cat_1,city_cat_2,city_cat_3,age_0-17,age_18-25,age_26-35,age_36-45,age_46-50,age_51-55,age_55+
0,1000001,P00069042,1,0-17,10,1,2,0,3,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1000001,P00248942,1,0-17,10,1,2,0,1,6.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1000001,P00087842,1,0-17,10,1,2,0,12,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1000001,P00085442,1,0-17,10,1,2,0,12,14.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1000002,P00285442,0,55+,16,3,4+,0,8,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [20]:
gdf.dtypes

User_ID                         int64
Product_ID                     object
Gender                          int64
Age                            object
Occupation                      int64
City_Category                   int64
Stay_In_Current_City_Years     object
Marital_Status                  int64
Product_Category_1              int64
Product_Category_2            float64
Product_Category_3            float64
Purchase                        int64
city_years                     object
multi                           int64
gen_mar_interaction             int64
occ_dummy_0                   float64
occ_dummy_1                   float64
occ_dummy_2                   float64
occ_dummy_3                   float64
occ_dummy_4                   float64
occ_dummy_5                   float64
occ_dummy_6                   float64
occ_dummy_7                   float64
occ_dummy_8                   float64
occ_dummy_9                   float64
occ_dummy_10                  float64
occ_dummy_11

In [21]:
#Solution:
gdf = gdf.one_hot_encoding('Product_Category_1', 'product', gdf.Product_Category_1.unique())

In [22]:
#We're going to drop the variables we've transformed
drop_list = ['User_ID', 'Age', 'Stay_In_Current_City_Years', 'City_Category','Product_ID', 'Product_Category_1', 'Product_Category_2', 'Product_Category_3']
gdf = gdf.drop(drop_list)

In [23]:
#We're going to make a list of all the first indicator variables in a series now so it will be
#easier to exclude them when we're doing regressions later

In [24]:
dummy_list = ['occ_dummy_0', 'city_cat_1', 'age_0-17', 'product_1', 'Purchase']

In [25]:
#All variables currently have to have the same type for some methods in cuML
for col in gdf.columns.tolist():
    gdf[col] = gdf[col].astype('float32')

In [26]:
#cuml.preprocessing.model_selection.train_test_split
test_size = round(len(gdf)*0.2)
train_size = round(len(gdf)-test_size)

In [27]:
test = gdf.iloc[0:test_size]
gdf_train = gdf.iloc[train_size:]

In [28]:
#Deleting the main gdf because we're going to be making other subsets and other stuff, so it will be nice to have the memory. 
del(gdf)

In [29]:
y_train = gdf_train['Purchase'].log()
X_reg = gdf_train.drop(dummy_list)

In [30]:
# # I'm going to perform a hyperparameter search for alpha in a ridge regression
output_ridge = {}
for alpha in np.around(np.arange(0.01, 1, 0.01), decimals=2):
    
    Ridge = cuml.Ridge(alpha=alpha, fit_intercept=False, normalize=True)
    _fit = Ridge.fit(X_reg, y_train)
    _y_hat = _fit.predict(X_reg)
    _mse = np.sqrt((y_train.reset_index(drop=True).sub(_y_hat)**2).sum())
    output_ridge['RMSE_RIDGE_{}'.format(alpha)] = _mse

print('MAX RMSE: {}'.format(min(output_ridge, key=output_ridge.get)))

MAX RMSE: RMSE_RIDGE_0.15


In [31]:
Ridge = cuml.Ridge(alpha=.1, fit_intercept=False, normalize=True)
_fit = Ridge.fit(X_reg, y_train)
_y_hat = _fit.predict(X_reg)
_mse = np.sqrt((y_train.reset_index(drop=True).sub(_y_hat)**2).sum())
print('{:,}'.format(_mse))

323.17560048679417


In [32]:
##Lasso
output_lasso = {}
for alpha in np.around(np.arange(0.1, 10, 0.1), decimals=2):
    
    Lasso = cuml.Lasso(alpha=alpha, fit_intercept=False, normalize=True)
    _fit = Lasso.fit(X_reg, y_train)
    _y_hat = _fit.predict(X_reg)
    _mse = np.sqrt((y_train.reset_index(drop=True).sub(_y_hat)**2).sum())
    output_lasso['RMSE_Lasso_{}'.format(alpha)] = _mse

print('MAX RMSE: {}'.format(min(output_lasso, key=output_lasso.get)))

MAX RMSE: RMSE_Lasso_0.1


In [33]:
Lasso = cuml.Lasso(alpha=.1, fit_intercept=False, normalize=True)
_fit = Lasso.fit(X_reg, y_train)
_y_hat = _fit.predict(X_reg)
_mse = np.sqrt((y_train.reset_index(drop=True).sub(_y_hat)**2).sum())
print('{:,}'.format(_mse))

2,944.920712005673


In [34]:
##Elastic Net
output_en = {}
for alpha in np.around(np.arange(0.1, 10, 0.1), decimals=2):
    for ratio in np.around(np.arange(0.1, 1, 0.1), decimals=2):
    
        ElasticNet = cuml.ElasticNet(alpha=alpha, l1_ratio=ratio, fit_intercept=False, normalize=True)
        _fit = ElasticNet.fit(X_reg, y_train)
        _y_hat = _fit.predict(X_reg)
        _mse = np.sqrt((y_train.reset_index(drop=True).sub(_y_hat)**2).sum())
        output_en['RMSE_ElasticNet_{}_{}'.format(alpha, ratio)] = _mse

print('MAX RMSE: {}'.format(min(output_en, key=output_en.get)))

MAX RMSE: RMSE_ElasticNet_0.1_0.1


In [35]:
ElasticNet = cuml.ElasticNet(alpha=.1, l1_ratio=.1, fit_intercept=False, normalize=True)
_fit = ElasticNet.fit(X_reg, y_train)
_y_hat = _fit.predict(X_reg)
_mse = np.sqrt((y_train.reset_index(drop=True).sub(_y_hat)**2).sum())
print('{:,}'.format(_mse))

2,944.920712005673


In [36]:
y_xgb = gdf_train[['Purchase']].log()
X_xgb = gdf_train.drop('Purchase')
xgb_train_set = xgb.DMatrix(data=X_xgb, label=y_xgb)

In [37]:
xgb_params = {
    'nround':100,
    'max_depth':4,
    'max_leaves':2**4,
    'tree_method':'gpu_hist',
    'n_gpus':1,
    'loss':'ls',
    'objective':'reg:squarederror',
    'max_features':'auto',
    'criterion':'friedman_mse',
    'grow_policy':'lossguide',
    'verbose':True
}

In [38]:
xgb_model = xgb.train(xgb_params, dtrain=xgb_train_set)

n_gpus: 
	Deprecated. Single process multi-GPU training is no longer supported.
	Please switch to distributed training with one process per GPU.
	This can be done using Dask or Spark.  See documentation for details.
Parameters: { criterion, loss, max_features, nround, verbose } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




In [39]:
y_hat_xgb = xgb_model.predict(xgb_train_set)

In [40]:
RMSE = np.sqrt(mean_squared_error(y_xgb['Purchase'].to_pandas(), y_hat_xgb)) 

In [41]:
print(RMSE)

0.4940293
