In [2]:
import cuml
import cudf
import nvcategory

import xgboost as xgb
import numpy as np
from sklearn.metrics import mean_squared_error, roc_auc_score

In [3]:
import os
import urllib.request

data_dir = '../../data/blackfriday/'
if not os.path.exists(data_dir):
    print('creating black friday data directory')
    os.system('mkdir ../../data/blackfriday')

In [4]:
base_url = 'https://datahack-prod.s3.amazonaws.com/train_zip/'
ofn = 'train_oSwQCTC.zip'
fn = 'train.zip'
if not os.path.isfile(data_dir+ofn):
        print(f'Downloading {base_url+ofn} to {data_dir+fn}')
        urllib.request.urlretrieve(base_url+ofn, data_dir+fn)

Downloading https://datahack-prod.s3.amazonaws.com/train_zip/train_oSwQCTC.zip to ../../data/blackfriday/train.zip


In [None]:
#Read in the data. Notice how it decompresses as it reads the data into memory. 
gdf = cudf.read_csv(data_dir+fn)

In [3]:
len(gdf)

537577

In [4]:
#Taking a look at the data. We use "to_pandas()" to get the pretty printing. 
gdf.head().to_pandas()

Unnamed: 0,User_ID,Product_ID,Gender,Age,Occupation,City_Category,Stay_In_Current_City_Years,Marital_Status,Product_Category_1,Product_Category_2,Product_Category_3,Purchase
0,1000001,P00069042,F,0-17,10,A,2,0,3,,,8370
1,1000001,P00248942,F,0-17,10,A,2,0,1,6.0,14.0,15200
2,1000001,P00087842,F,0-17,10,A,2,0,12,,,1422
3,1000001,P00085442,F,0-17,10,A,2,0,12,14.0,,1057
4,1000002,P00285442,M,55+,16,C,4+,0,8,,,7969


In [5]:
#grabbing the first character of the years in city string to get rid of plus sign, and converting to int
gdf['city_years'] = gdf.Stay_In_Current_City_Years.str.get(0)

In [6]:
#Here we can see how we can control the value of our category variables with the replace method and turn strings to ints
gdf['City_Category'] = gdf.City_Category.str.replace('A', '1')
gdf['City_Category'] = gdf.City_Category.str.replace('B', '2')
gdf['City_Category'] = gdf.City_Category.str.replace('C', '3')
gdf['City_Category'] = gdf['City_Category'].str.stoi()

In [7]:
gdf['Gender'] = gdf.Gender.str.replace('F', '1')
gdf['Gender'] = gdf.Gender.str.replace('M', '0')
gdf['Gender'] = gdf.Gender.str.stoi()

In [8]:
#Let's take a look at how many products we have
prod_count = cudf.Series(nvcategory.from_strings(gdf.Product_ID.data).values()).unique().count() #hideous one-liner
print("Unique Products: {}".format(prod_count))

Unique Products: 3623


In [9]:
#Let's take a look at how many primary product categories we have
#We do it differently here because the variable is a number, not a string
prod1_count = gdf.Product_Category_1.unique().count()
print("Unique Product Categories: {}".format(prod1_count))

Unique Product Categories: 18


In [10]:
#Dummy for multi-category products
gdf['Product_Category_2'] = gdf['Product_Category_2'].fillna(0)
gdf['Product_Category_3'] = gdf['Product_Category_3'].fillna(0)
gdf['multi'] = ((gdf['Product_Category_2'] + gdf['Product_Category_3'])>0).astype('int')

In [11]:
#Gender/Marital Status interaction variable
gdf['gen_mar_interaction'] = gdf['Gender']*gdf['Marital_Status']

In [12]:
#Because Occupation is a code, it should converted into indicator variables
gdf = gdf.one_hot_encoding('Occupation', 'occ_dummy', gdf.Occupation.unique())

In [13]:
#Dummy variable from Int
gdf = gdf.one_hot_encoding('City_Category', 'city_cat', gdf.City_Category.unique())

#Dummy from string
cat = nvcategory.from_strings(gdf.Age.data)
gdf['Age'] = cudf.Series(cat.values())
gdf = gdf.one_hot_encoding('Age', 'age', gdf.Age.unique())

In [14]:
#Solution:
gdf = gdf.one_hot_encoding('Product_Category_1', 'product', gdf.Product_Category_1.unique())

In [15]:
#We're going to drop the variables we've transformed
drop_list = ['User_ID', 'Age', 'Stay_In_Current_City_Years', 'City_Category','Product_ID', 'Product_Category_1', 'Product_Category_2', 'Product_Category_3']
gdf = gdf.drop(drop_list)

In [16]:
#We're going to make a list of all the first indicator variables in a series now so it will be
#easier to exclude them when we're doing regressions later

In [17]:
dummy_list = ['occ_dummy_0', 'city_cat_1', 'age_0', 'product_1', 'Purchase']

In [18]:
#All variables currently have to have the same type for some methods in cuML
for col in gdf.columns.tolist():
    gdf[col] = gdf[col].astype('float32')

In [19]:
#cuml.preprocessing.model_selection.train_test_split
test_size = round(len(gdf)*0.2)
train_size = round(len(gdf)-test_size)

In [20]:
test = gdf.iloc[0:test_size]
gdf_train = gdf.iloc[train_size:]

In [21]:
#Deleting the main gdf because we're going to be making other subsets and other stuff, so it will be nice to have the memory. 
del(gdf)

In [22]:
y_train = gdf_train['Purchase'].log()
X_reg = gdf_train.drop(dummy_list)

In [23]:
# # I'm going to perform a hyperparameter search for alpha in a ridge regression
output_ridge = {}
for alpha in np.around(np.arange(0.01, 1, 0.01), decimals=2):
    
    Ridge = cuml.Ridge(alpha=alpha, fit_intercept=False, normalize=True)
    _fit = Ridge.fit(X_reg, y_train)
    _y_hat = _fit.predict(X_reg)
    _mse = np.sqrt((y_train.reset_index(drop=True).sub(_y_hat)**2).sum())
    output_ridge['RMSE_RIDGE_{}'.format(alpha)] = _mse

print('MAX RMSE: {}'.format(min(output_ridge, key=output_ridge.get)))

MAX RMSE: RMSE_RIDGE_0.19


In [24]:
Ridge = cuml.Ridge(alpha=.1, fit_intercept=False, normalize=True)
_fit = Ridge.fit(X_reg, y_train)
_y_hat = _fit.predict(X_reg)
_mse = np.sqrt((y_train.reset_index(drop=True).sub(_y_hat)**2).sum())
print('{:,}'.format(_mse))

314.5797419772926


In [25]:
##Lasso
output_lasso = {}
for alpha in np.around(np.arange(0.1, 10, 0.1), decimals=2):
    
    Lasso = cuml.Lasso(alpha=alpha, fit_intercept=False, normalize=True)
    _fit = Lasso.fit(X_reg, y_train)
    _y_hat = _fit.predict(X_reg)
    _mse = np.sqrt((y_train.reset_index(drop=True).sub(_y_hat)**2).sum())
    output_lasso['RMSE_Lasso_{}'.format(alpha)] = _mse

print('MAX RMSE: {}'.format(min(output_lasso, key=output_lasso.get)))

MAX RMSE: RMSE_Lasso_0.1


In [26]:
Lasso = cuml.Lasso(alpha=.1, fit_intercept=False, normalize=True)
_fit = Lasso.fit(X_reg, y_train)
_y_hat = _fit.predict(X_reg)
_mse = np.sqrt((y_train.reset_index(drop=True).sub(_y_hat)**2).sum())
print('{:,}'.format(_mse))

2,949.3877330727473


In [27]:
##Elastic Net
output_en = {}
for alpha in np.around(np.arange(0.1, 10, 0.1), decimals=2):
    for ratio in np.around(np.arange(0.1, 1, 0.1), decimals=2):
    
        ElasticNet = cuml.ElasticNet(alpha=alpha, l1_ratio=ratio, fit_intercept=False, normalize=True)
        _fit = ElasticNet.fit(X_reg, y_train)
        _y_hat = _fit.predict(X_reg)
        _mse = np.sqrt((y_train.reset_index(drop=True).sub(_y_hat)**2).sum())
        output_en['RMSE_ElasticNet_{}_{}'.format(alpha, ratio)] = _mse

print('MAX RMSE: {}'.format(min(output_en, key=output_en.get)))

MAX RMSE: RMSE_ElasticNet_0.1_0.1


In [28]:
ElasticNet = cuml.ElasticNet(alpha=.1, l1_ratio=.1, fit_intercept=False, normalize=True)
_fit = ElasticNet.fit(X_reg, y_train)
_y_hat = _fit.predict(X_reg)
_mse = np.sqrt((y_train.reset_index(drop=True).sub(_y_hat)**2).sum())
print('{:,}'.format(_mse))

2,949.3877330727473


In [29]:
y_xgb = gdf_train[['Purchase']].log()
X_xgb = gdf_train.drop('Purchase')
xgb_train_set = xgb.DMatrix(data=X_xgb, label=y_xgb)

In [30]:
xgb_params = {
    'nround':100,
    'max_depth':4,
    'max_leaves':2**4,
    'tree_method':'gpu_hist',
    'n_gpus':1,
    'loss':'ls',
    'objective':'reg:squarederror',
    'max_features':'auto',
    'criterion':'friedman_mse',
    'grow_policy':'lossguide',
    'verbose':True
}

In [31]:
xgb_model = xgb.train(xgb_params, dtrain=xgb_train_set)

In [32]:
y_hat_xgb = xgb_model.predict(xgb_train_set)

In [33]:
RMSE = np.sqrt(mean_squared_error(y_xgb['Purchase'].to_pandas(), y_hat_xgb)) 

In [34]:
print(RMSE)

0.4661744
