In [1]:
## ---- Notebook parameters ----

## Run describe() on the input table (slow).
describe = False
## Save the result of the preprocessing pipeline as a table.
save_as_table = True

## For testing: only the first n_sparse_features categorical variables are
## included in the features table used to estimate the model.
## (The third string variable was the first one with missing values.)
n_sparse_features = 26
## For testing: only the first n_num_features numeric variables are
## included in the features table used to estimate the model.
n_num_features = 13

## Model settings: LightGBM iterations, cross-validation folds and the
## numLeaves values tried in the parameter grid.
classifier_lightgbm_iterations = 3
n_folds = 4
num_leaves_grid = [32, 64]

## Handling of infrequent categorical levels.
replace_small_levels = True
## Same threshold used by winners: https://www.csie.ntu.edu.tw/~r01922136/kaggle-2014-criteo.pdf
small_level_freq_thresh = 10

## ---- Constructed variables ----

## Directory that output tables are written to.
output_dir = 'dbfs:/FileStore'

## The table name encodes the feature counts and the frequency threshold.
table_to_save = 'criteo_dac_proc_{}sparse_{}num_freqthresh{}'.format(
    n_sparse_features,
    n_num_features,
    small_level_freq_thresh,
)

print(table_to_save)

## Import the data loader

In [3]:
# from reco_utils.dataset.criteo_dac import load_spark_df

In [4]:
## Read in the data.
## Loading from the original source with the data loader takes some time (8-10 minutes),
## so it was done once and cached as parquet; the one-off steps are kept here for reference:
# df = load_spark_df(spark=spark, dbutils=dbutils)
## print('writing to parquet...')
## df.write.parquet('dbfs:/FileStore/dac_train.parquet')
## Normal path: read the cached parquet copy.
df = sqlContext.read.parquet("/FileStore/dac_train.parquet")
# Could ADLS be causing issues?
# Alternative read location (ADLS mount), kept for debugging:
# df = sqlContext.read.parquet("/mnt/adlsgen2/dac_train.parquet")

In [5]:
## Optional: summary statistics of the raw input table (controlled by `describe`).
if describe:
  ## This can take quite a bit of time...
  cur_descr = df.describe()
  display(cur_descr)

In [6]:
## Validate the feature-count parameters; every failed check raises ValueError.

## categorical features: allowed range is 0..26
if n_sparse_features < 0 or n_sparse_features > 26:
  raise ValueError('n_sparse_features must be between 0 and 26...')
print('Running with {} sparse (i.e. categorical) features.'.format(n_sparse_features))

## numeric features: allowed range is 0..13
if n_num_features < 0 or n_num_features > 13:
  raise ValueError('n_num_features must be between 0 and 13...')
print('Running with {} numeric features.'.format(n_num_features))

## at least one feature in total is needed to estimate a model
if (n_num_features + n_sparse_features) < 1:
  raise ValueError('total number of features is less than 1.')

## Imports

In [8]:
import os

## for feature engineering:
from pyspark.ml.feature import (Imputer,StringIndexer,VectorAssembler)
from pyspark.ml.pipeline import Pipeline

## for modeling:
from mmlspark import LightGBMClassifier
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.ml.evaluation import BinaryClassificationEvaluator

from pyspark.sql.functions import col, when


## Define what features to process

- `features` holds the first `n_num_features` numeric (`int*`) columns; their missing values are imputed later in the pipeline
- `sparse_features` holds the first `n_sparse_features` categorical / string (`cat*`) columns

In [10]:

## Numeric features: columns whose name starts with 'int' (median-imputed later),
## limited to the first n_num_features of them.
int_columns = [name for name in df.columns if name.startswith('int')]
features = int_columns[:n_num_features]

## Sparse features: columns whose name starts with 'cat' (string variables),
## limited to the first n_sparse_features of them.
cat_columns = [name for name in df.columns if name.startswith('cat')]
sparse_features = cat_columns[:n_sparse_features]


## Fill Missing Values in String Vars

In [12]:
## Replace missing values in the selected string variables with the marker level 'M'.
df = df.fillna('M', subset=sparse_features)

## Replace infrequent levels with a special value

In [14]:
## For every sparse feature, build a small lookup table with one row per level:
##   <feature>       - the original level
##   <feature>_fcut  - the level itself, or "RARE" when its row count is not above the threshold
##
## The `replace_small_levels` parameter switches the replacement on or off. When it is
## off the threshold is 0; every level produced by groupby has a count of at least 1,
## so all levels are kept unchanged while the <feature>_fcut columns still exist.
## NOTE(review): the comparison is a strict '>', so a level seen exactly
## `small_level_freq_thresh` times is also mapped to "RARE" - confirm this is intended.
effective_freq_thresh = small_level_freq_thresh if replace_small_levels else 0

level_counts_dict = {
  i: df.groupby(i).count().select(
    i,
    when(col('count') > effective_freq_thresh, col(i)).otherwise("RARE").alias(i+'_fcut'))
  for i in sparse_features
}

In [15]:
## Attach each <feature>_fcut column to the main table by left-joining the
## per-feature lookup table on the original feature column.
## Note: this reassigns `df`, so running the cell twice joins the lookup tables twice.
for feature_name in sparse_features:
  fcut_lookup = level_counts_dict[feature_name]
  df = df.join(fcut_lookup, on=feature_name, how='left')

## Recast `int` variables to `float`

`Imputer()` only works with `float` or `double` type. We could import the data as floats, or run directly on ints using the `df.na.fill()` method.

We currently cast the `int` columns to `float` and use `Imputer()`, so that the imputation stays inside the pipeline.

In [17]:
## Build the projection for the modelling table:
##  - numeric columns cast from int to float, because Imputer only works with floats
##  - only the frequency-cut (<feature>_fcut) versions of the string columns
##  - the label
numeric_exprs = ['cast({} as float) {}'.format(x, x) for x in features]
fcut_columns = [f + '_fcut' for f in sparse_features]
sql_lst = numeric_exprs + fcut_columns + ['label']

recast_df = df.selectExpr(*sql_lst)

In [18]:
## Persist the recast table so later steps can start from a materialised copy.
## about 25 minutes on l16s x 4
##
## The location is built from the `output_dir` parameter rather than a hard-coded path.
## mode('overwrite') replaces any earlier copy as part of the write itself, instead of
## deleting the target first with dbutils.fs.rm: if `recast_df` has already been reloaded
## from this very file, deleting first would remove the data before it could be rewritten.
outfile = os.path.join(output_dir, table_to_save+'.parquet')
recast_df.write.mode('overwrite').parquet(outfile)

In [19]:
## Reload the saved recast table from parquet; from here on `recast_df` is backed by the
## file on disk rather than by the join/cast lineage built above.
## Note: the path is rebuilt from the literal 'dbfs:/FileStore/' and `table_to_save`.
recast_df = sqlContext.read.parquet(os.path.join('dbfs:/FileStore/',table_to_save+'.parquet'))

In [20]:
## Preprocessing pipeline, assembled stage by stage:
##   1. Imputer        - median-impute each numeric column into <col>_imp
##   2. StringIndexer  - index each frequency-cut string column into <col>_vec
##   3. VectorAssembler - combine all of the above into a single 'features' vector
imputed_cols = [f + '_imp' for f in features]
indexed_cols = [f + '_vec' for f in sparse_features]

pipeline_stages = []

## The parameter checks allow n_num_features == 0, in which case `features` is empty.
## Only add the Imputer when there is something to impute, so it is never built with
## an empty inputCols list.
if features:
  pipeline_stages.append(
    Imputer(strategy='median',
            inputCols=features,
            outputCols=imputed_cols))

# LightGBM can handle categoricals directly if StringIndexer is used through meta-data
pipeline_stages.extend(
  StringIndexer(inputCol=f + '_fcut', outputCol=f + '_vec') for f in sparse_features)

pipeline_stages.append(
  VectorAssembler(inputCols=imputed_cols + indexed_cols,
                  outputCol='features'))

pipeline = Pipeline(stages=pipeline_stages)

In [21]:
## Fit the preprocessing pipeline on the recast table and apply it to the same table.
# fit is needed if you use imputer..
## 2.5 minutes
train_proc_df = pipeline.fit(recast_df).transform(recast_df)

In [22]:

## Optionally persist the fully processed table (controlled by `save_as_table`).
## Saving is best-effort: a failure does not stop the notebook, but it is reported
## instead of being silently ignored.
if save_as_table:
  outFile = os.path.join(output_dir, table_to_save+'_postpipe.parquet')
  try:
    ## overwrite so that a re-run replaces the earlier copy instead of failing
    ## because the path already exists (which would leave a stale table behind)
    train_proc_df.write.mode('overwrite').parquet(outFile)
  except Exception as save_error:
    print('WARNING: could not save {}: {}'.format(outFile, save_error))

In [23]:
## Summary statistics of the label column, to check the class balance.
label_summary = train_proc_df.select('label').describe()
display(label_summary)

In [24]:
# train_proc_df.printSchema()
# train_proc_df.select('features').printSchema()
# display(train_proc_df.select('features').limit(2))

## Set up the Classifier:

In [26]:
## Interactive aid: print the LightGBMClassifier documentation to review its parameters.
## This produces a long text output and has no effect on the rest of the notebook.
help(LightGBMClassifier)

In [27]:
## LightGBM classifier configuration. The number of boosting iterations comes from the
## notebook parameters; numLeaves set here is the value used when fitting this estimator
## directly, without a parameter grid.
lgbm_params = dict(
  featuresCol='features',
  labelCol='label',
  numIterations=classifier_lightgbm_iterations,
  numLeaves=31,
  maxDepth=10,
  isUnbalance=True,
)
model = LightGBMClassifier(**lgbm_params)

## Fit the model.

First fit a single model, without cross-validation, to check that the simple use case works.

In [29]:
## Fit the classifier once on the processed table, without cross-validation.
## try just fitting the model, not with CV
## model fit works, sometimes.
## TODO: the fit is reported as only working "sometimes" - the cause is not recorded here.
## took 5 minutes with 3 cat features...
model_fit = model.fit(train_proc_df)

In [30]:
## Cross-validation set-up.

## Parameter grid: one candidate per value in num_leaves_grid.
grid = (ParamGridBuilder()
        .addGrid(model.numLeaves, num_leaves_grid)
        .build())

## Binary-classification evaluator on the 'label' column; the metric is left at the
## library default.
evaluator = BinaryClassificationEvaluator(labelCol='label')

## A fixed seed makes the random assignment of rows to folds reproducible across runs.
cv_seed = 42

cv = CrossValidator(estimator=model,
                    estimatorParamMaps=grid,
                    evaluator=evaluator,
                    numFolds=n_folds,
                    seed=cv_seed)

In [31]:
## Estimate the model with cross-validation over the parameter grid.
## estimate the model:
## throws an error:
## TODO: record the actual error message here so the failure can be diagnosed.
cv_fit = cv.fit(train_proc_df)
