In [1]:
## Notebook parameters

#################################
## Data parameters
#################################

## upper bound on the number of rows taken from the input
num_rows = 10 ** 7
## how many categorical variables to use; the first n_sparse_features are taken
n_sparse_features = 26
## how many numeric variables to use; the first n_num_features are taken
n_num_features = 13

#################################
## Feature engineering parameters
#################################

## alternative configuration (target encoding, no rare-level replacement):
# categorical_var_strategy = 'targ-enc' ## options are 'string-index' or 'targ-enc'
# replace_small_levels = False
## how categorical variables are encoded: 'string-index' or 'targ-enc'
categorical_var_strategy = 'string-index'
## whether infrequent categorical levels are collapsed into one special value
replace_small_levels = True
## frequency threshold for "infrequent";
## same threshold used by winners: https://www.csie.ntu.edu.tw/~r01922136/kaggle-2014-criteo.pdf
small_level_freq_thresh = 10

## derived names: the table name encodes the settings above plus a free-form suffix
file_tail = 'try2'
table_name_template = ('criteo_dac_proc_{sparse}sparse_{num}num_{thresh}freqthresh_'
                       '{rows}rows_{strategy}catstrat+{tail}')
table_to_save = table_name_template.format(sparse=n_sparse_features,
                                           num=n_num_features,
                                           thresh=small_level_freq_thresh,
                                           rows=num_rows,
                                           strategy=categorical_var_strategy,
                                           tail=file_tail)

output_dir = 'dbfs:/mnt/adlsgen2'

#################################
## LightGBM parameters
#################################

## number of boosting iterations given to the classifier
classifier_lightgbm_iterations = 3
## number of cross-validation folds
n_folds = 4
## candidate values of numLeaves searched during cross-validation
num_leaves_grid = [32, 64]

#################################
## Control and verbosity parameters
#################################

## run describe on the input table
describe = False
## save the result of the preprocessing pipeline as a table.
save_as_table = False
## run the cross-validated grid search after the single model fit
run_crossvalidation = False

import os

## full path of the parquet file holding the preprocessed data
outfile = os.path.join(output_dir, table_to_save + '.parquet')

print(outfile)


## load data loader

In [3]:
# from reco_utils.dataset.criteo_dac import load_spark_df

In [4]:
## Load the raw training data into `df`.
## read in the data - this takes some time...8-10 minutes
## The commented-out lines below are earlier load / caching variants kept for reference.
# df = load_spark_df(spark=spark, dbutils=dbutils)
## print('writing to parquet...')
## df.write.parquet('dbfs:/FileStore/dac_train.parquet')
# df = sqlContext.read.parquet("/FileStore/dac_train.parquet")
## Active source. This is the same path that the (commented-out) write in the
## "Fill Missing Values in String Vars" cell targets after filling categorical
## NAs with 'M' -- presumably that is how this file was produced; confirm.
df = sqlContext.read.parquet('dbfs:/mnt/adlsgen2/dac_train_nocatna.parquet')
# Could ADLS be causing issues?
# df = sqlContext.read.parquet("/mnt/adlsgen2/dac_train.parquet")

## Get number of rows

In [6]:
## Count the rows in the input and, when num_rows is positive, keep at most num_rows of them.
total_rows = df.count()
if num_rows > 0:
  ## report the share of the file that is actually kept, as a real percentage;
  ## guard against an empty input so the report itself cannot raise ZeroDivisionError
  rows_kept = min(num_rows, total_rows)
  pct_kept = 100.0 * rows_kept / total_rows if total_rows else 0.0
  print('{} rows in raw data file. Limiting this to {} ({:.1f}%)'.format(total_rows, num_rows, pct_kept))
  ## no ordering is applied before the limit, so which rows are kept is up to Spark
  df = df.limit(num_rows)
else:
  ## num_rows <= 0 means "no limit": say so instead of claiming a limit was applied
  print('{} rows in raw data file. Using all rows (num_rows = {}).'.format(total_rows, num_rows))

## Describe the data

In [8]:
## Optional: show summary statistics of the input table `df`; skipped unless `describe` is set.
if describe:
  ## This can take quite a bit of time...
  cur_descr = df.describe()
  display(cur_descr)

In [9]:
## validate n_sparse_features: anything outside 0..26 is rejected
if n_sparse_features > 26 or n_sparse_features < 0:
  raise ValueError('n_sparse_features must be between 0 and 26...')
print('Running with {} sparse (i.e. categorical) features.'.format(n_sparse_features))

## validate n_num_features: anything outside 0..13 is rejected
if n_num_features > 13 or n_num_features < 0:
  raise ValueError('n_num_features must be between 0 and 13...')
print('Running with {} numeric features.'.format(n_num_features))

## at least one feature of either kind must be selected
total_feature_count = n_num_features + n_sparse_features
if total_feature_count < 1:
  raise ValueError('total number of features is less than 1.')

## Imports

In [11]:
## for feature engineering:
from pyspark.sql.functions import col, when, count, isnan
from pyspark.ml.feature import (Imputer,StringIndexer,VectorAssembler)
from pyspark.ml.pipeline import Pipeline

## for modeling:
from mmlspark import LightGBMClassifier
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.ml.evaluation import BinaryClassificationEvaluator



## Define what features to process

- `features` maps to numeric features that need to have missing values replaced
- `sparse_features` maps to the first `n_sparse_features` categorical / string variables

In [13]:

## numeric features: columns whose name starts with 'int', first n_num_features of them
## (these get median imputation later)
int_columns = [name for name in df.columns if name.startswith('int')]
features = int_columns[:n_num_features]
## sparse features: columns whose name starts with 'cat', first n_sparse_features of them
cat_columns = [name for name in df.columns if name.startswith('cat')]
sparse_features = cat_columns[:n_sparse_features]

print(sparse_features)

## Fill Missing Values in String Vars

In [15]:
## Missing values in the string variables were already filled and saved to the
## parquet file loaded above - something weird was going on, where this
## transformation wasn't propagating to fcut...
## The code that did the fill is kept here for reference:
## df = df.na.fill('M', subset = sparse_features)
## df.write.mode('overwrite').parquet('dbfs:/mnt/adlsgen2/dac_train_nocatna.parquet')

## Sanity check: the number of nulls per categorical column.
print(sparse_features)
print(df.count())
## One query already covers every column in sparse_features, so it is run and
## displayed once (not once per column). Skipped when no categorical columns
## are selected, as before.
if sparse_features:
  display(df.select([count(when(col(c).isNull(), c)).alias(c) for c in sparse_features]))


## Replace infrequent levels with a special value

In [17]:
## Collapse infrequent categorical levels into the single value "RARE".
## Runs only when replace_small_levels is set AND the threshold is positive.
if replace_small_levels and small_level_freq_thresh > 0:
  print('Assigning Rare levels a special value')
  ## phase 1: for every categorical column build a lookup (level -> level or "RARE");
  ## a level is kept only when its frequency is strictly above the threshold
  fcut_cat_levels_dict = {}
  for cat_col in sparse_features:
    level_counts = df.groupby(cat_col).count()
    kept_or_rare = when(col('count') > small_level_freq_thresh, col(cat_col)).otherwise("RARE")
    fcut_cat_levels_dict[cat_col] = level_counts.select(cat_col, kept_or_rare.alias(cat_col + '_fcut'))
  ## phase 2: attach each lookup to the data as a new '<column>_fcut' column
  for cat_col in sparse_features:
    df = df.join(fcut_cat_levels_dict[cat_col], cat_col, how = 'left')
  ## from here on the '_fcut' columns are the categorical features
  sparse_features = [name + '_fcut' for name in sparse_features]
  print('Categorical Features updated to be: {}'.format(' '.join(sparse_features)))

In [18]:
## do additional work if doing targ-enc
if categorical_var_strategy == 'targ-enc':
  ## have to do additional work to compute p(label|level)
  ## this should be done only on training data then applied to testing. Note for later.
  labelbylevel_dict = {i: df.groupby(i).mean('label').select([i, col("avg(label)").alias(i+"_trgt")]) for i in sparse_features}  
  ## set up the graph to do all the joins:
  for i in sparse_features:
    df = df.join(labelbylevel_dict[i], i, how = "left")
  ## update the variables we're using as sparse:
  sparse_features = [f + '_trgt' for f in sparse_features]
  print('Categorical Features updated to be: {}'.format(' '.join(sparse_features)))

## Recast `int` variables to `float`

`Imputer()` only works with `float` or `double` type. We could import the data as floats, or run directly on ints using the `df.na.fill()` method.

Currently using this approach to keep the work in the pipeline.

In [20]:
## Imputer only works with floats, so every numeric feature is cast to float
## (keeping its column name)
numeric_cast_exprs = ['cast({} as float) {}'.format(name, name) for name in features]
## keep only the casted numerics, the current categorical columns
## (the frequency-cutoff ones when that step ran) and the label
sql_lst = numeric_cast_exprs + sparse_features + ['label']
recast_df = df.selectExpr(sql_lst)

In [21]:
## persist to disk to trigger transforms
## about 25 minutes on l16s x 4
## time depends on steps and parameters above.
## NOTE: the write below is commented out, so this cell only READS `outfile`.
## The recast_df built in the preceding cell is replaced by whatever was saved
## there earlier, so the file must already exist and match the current
## parameters -- TODO confirm before trusting downstream results.
## Re-enable the two lines below after changing anything that affects the data.
# recast_df.write.mode('overwrite').parquet(outfile)
# del recast_df
recast_df = sqlContext.read.parquet(outfile)

In [22]:
## Build the preprocessing pipeline: median imputation of the numeric features,
## optional indexing of the categorical features, then assembly into one vector column.
imputed_cols = [f + '_imp' for f in features]
num_imputer = Imputer(strategy='median', inputCols=features, outputCols=imputed_cols)

if categorical_var_strategy == 'string-index':
  print('Using StringIndexer for categorical variables.')
  indexed_cols = [f + '_vec' for f in sparse_features]
  # LightGBM can handle categoricals directly if StringIndexer is used through meta-data
  indexers = [StringIndexer(inputCol=src, outputCol=dst)
              for src, dst in zip(sparse_features, indexed_cols)]
  assembler = VectorAssembler(inputCols=imputed_cols + indexed_cols, outputCol='features')
  pipeline = Pipeline(stages=[num_imputer] + indexers + [assembler])
elif categorical_var_strategy == 'targ-enc':
  print('Using target-encoding for categorical variables.')
  ## target-encoded columns go into the assembler as they are
  assembler = VectorAssembler(inputCols=imputed_cols + list(sparse_features), outputCol='features')
  pipeline = Pipeline(stages=[num_imputer, assembler])
else:
  raise ValueError('Unknown strategy for categorical_var_strategy. Should be either "string-index" or "targ-enc"')

In [23]:
## fit the preprocessing stages on recast_df, then apply them to that same data
pipeline_model = pipeline.fit(recast_df)
train_proc_df = pipeline_model.transform(recast_df)

In [24]:
from pyspark.sql.functions import isnan, when, count, col

## Sanity check after the pipeline: the number of nulls per categorical column.
print(sparse_features)
## One query already covers every column in sparse_features, so it is run and
## displayed once (not once per column). Skipped when no categorical columns
## are selected, as before.
if sparse_features:
  display(train_proc_df.select([count(when(col(c).isNull(), c)).alias(c) for c in sparse_features]))
#  display(train_proc_df.select([count(when(isnan(c), c)).alias(c) for c in df.columns]))

In [25]:
## save after pipeline? This can be an issue...
## train_proc_df.write.mode('overwrite').parquet(outfile)

In [26]:
## describe label to see balance
## (summary statistics of the 'label' column only; skipped unless `describe` is set)
if describe:
  display(train_proc_df.select(['label']).describe())

In [27]:
## Optional: show the assembled 'features' column for two rows; skipped unless `describe` is set.
if describe:
  display(train_proc_df.select('features').limit(2))

## Set up the Classifier:

In [29]:
## Classifier settings collected in one place, then passed on as keyword arguments.
lgbm_params = {
  'featuresCol': 'features',                        ## assembled vector column from the pipeline
  'labelCol': 'label',
  'numIterations': classifier_lightgbm_iterations,  ## taken from the parameter cell
  'numLeaves': 31,
  'maxDepth': 10,
  'isUnbalance': True,
}
model = LightGBMClassifier(**lgbm_params)

## Fit the model.

A single fit without cross-validation, to check that the simple use-case works.

In [31]:
## try just fitting the model, not with CV
## (a single fit on all of train_proc_df; no held-out split is made in this cell)
## model fit works, sometimes.
## failed I think with full data and 26 cat features l16s x 4
## 10M rows and full columns: 7.5 minutes
model_fit = model.fit(train_proc_df)

In [32]:
## Optional grid search over numLeaves with k-fold cross-validation.
if run_crossvalidation:
  ## candidate parameter maps: one per value in num_leaves_grid
  grid_builder = ParamGridBuilder()
  grid_builder = grid_builder.addGrid(model.numLeaves, num_leaves_grid)
  grid = grid_builder.build()
  ## binary evaluator on 'label'; no metric is named, so the evaluator's default applies
  evaluator = BinaryClassificationEvaluator(labelCol='label')
  cv = CrossValidator(estimator=model,
                      estimatorParamMaps=grid,
                      evaluator=evaluator,
                      numFolds=n_folds)
  cv_fit = cv.fit(train_proc_df)
  print(cv_fit)