In [1]:
## ---- Notebook parameters ----

## If True, compute and display summary statistics of the input table (slow).
describe = False
## If True, the output of the preprocessing pipeline is meant to be stored as a table.
save_as_table = True

## Testing knobs: how many columns of each kind go into the features table.
## Only the first `n_sparse_features` categorical columns are used (validated to be 0..26).
n_sparse_features = 0
## Only the first `n_num_features` numeric columns are used (validated to be 0..13).
n_num_features = 5

## Modeling knobs.
## Passed to the LightGBM classifier as its number of iterations.
classifier_lightgbm_iterations = 3
## Number of folds used by the cross-validator.
n_folds = 4
## Candidate values for the classifier's numLeaves in the cross-validation grid.
num_leaves_grid = [2, 4, 8]

## Load the data loader

In [3]:
## from reco_utils.dataset.criteo_dac import load_spark_df

In [4]:
## Read the training data from parquet. This is slow (roughly 8-10 minutes).
## Alternatives that were tried earlier, kept here for reference:
##   df = load_spark_df(spark=spark, dbutils=dbutils)
##   df = sqlContext.read.parquet("/mnt/adlsgen2/dac_train.parquet")  (open question: could ADLS be causing issues?)
dac_train_path = "/FileStore/dac_train.parquet"
df = sqlContext.read.parquet(dac_train_path)

In [5]:
## Optional summary statistics of the raw input table, controlled by the
## `describe` parameter. Skipped by default because it can be very slow.
if describe:
  ## This can take quite a bit of time...
  cur_descr = df.describe()
  display(cur_descr)

In [6]:
## Validate the feature-count parameters up front so that a bad setting fails
## fast with a clear message instead of surfacing later in the pipeline.

## n_sparse_features must lie in [0, 26].
if n_sparse_features < 0 or n_sparse_features > 26:
  raise ValueError('n_sparse_features must be between 0 and 26...')
else:
  print('Running with {} sparse (i.e. categorical) features.'.format(n_sparse_features))

## n_num_features must lie in [0, 13]. The message names the parameter and
## range actually being checked (it previously repeated the sparse-feature text).
if n_num_features < 0 or n_num_features > 13:
  raise ValueError('n_num_features must be between 0 and 13...')
else:
  print('Running with {} numeric features.'.format(n_num_features))

## At least one feature of either kind is required to estimate a model.
if n_num_features+n_sparse_features < 1:
  raise ValueError('total number of features is less than 1.')

## Imports

In [8]:
## for feature engineering:
from pyspark.ml.feature import (Imputer,StringIndexer,VectorAssembler)
from pyspark.ml.pipeline import Pipeline

## for modeling:
from mmlspark import LightGBMClassifier
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.ml.evaluation import BinaryClassificationEvaluator


## Define what features to process

- `features` maps to the first `n_num_features` numeric features, which need to have missing values replaced
- `sparse_features` maps to the first `n_sparse_features` categorical / string variables

In [10]:

## Numeric columns: names starting with 'int', limited to the first
## n_num_features (these are median-imputed further down).
features = [col for col in df.columns if col.startswith('int')][:n_num_features]
## Categorical / string columns: names starting with 'cat', limited to the
## first n_sparse_features.
sparse_features = [col for col in df.columns if col.startswith('cat')][:n_sparse_features]


## Recast `int` variables to `float`

`Imputer()` only works with `float` or `double` type. We could import the data as floats, or run directly on ints using the `df.na.fill()` method.

Currently we cast the columns to `float` and use `Imputer()`, so that the imputation stays inside the pipeline.

In [12]:
## Imputer only accepts float/double input, so each selected numeric column is
## cast to float under its original name. The categorical columns and the
## label are selected unchanged.
float_casts = ['cast({} as float) {}'.format(name, name) for name in features]
sql_lst = float_casts + sparse_features + ['label']
recast_df = df.selectExpr(sql_lst)

In [13]:
## Output column names produced by the preprocessing stages.
imputed_cols = [f + '_imp' for f in features]
indexed_cols = [f + '_vec' for f in sparse_features]

## Stage 1: median-impute missing values in the numeric columns.
imputer_stage = Imputer(strategy='median',
                        inputCols=features,
                        outputCols=imputed_cols)

## Stage 2: one StringIndexer per categorical column.
## LightGBM can handle categoricals directly if StringIndexer is used through meta-data.
indexer_stages = [StringIndexer(inputCol=src, outputCol=dst)
                  for src, dst in zip(sparse_features, indexed_cols)]

## Stage 3: assemble imputed numerics followed by indexed categoricals into
## the single 'features' vector column.
assembler_stage = VectorAssembler(inputCols=imputed_cols + indexed_cols,
                                  outputCol='features')

pipeline = Pipeline(stages=[imputer_stage] + indexer_stages + [assembler_stage])

In [14]:
## The pipeline has to be fitted first because it contains an Imputer;
## the fitted pipeline is then applied to the same recast data.
pipeline_model = pipeline.fit(recast_df)
train_proc_df = pipeline_model.transform(recast_df)

In [15]:
## Persist the preprocessed data as a table whose name encodes the number of
## sparse and numeric features. The save only runs when the `save_as_table`
## parameter is enabled.
table_to_save = 'criteo_dac_proc_{}sparse_{}num'.format(n_sparse_features,n_num_features)

if save_as_table:
  try:
    train_proc_df.write.saveAsTable(table_to_save)
  except Exception as save_err:
    ## Saving stays best-effort (e.g. the table may already exist from an
    ## earlier run), so the notebook keeps going, but the failure is reported
    ## instead of being silently swallowed.
    print('Could not save table {}: {}'.format(table_to_save, save_err))

## Set up the Classifier:

In [17]:
## Base LightGBM classifier that reads the assembled 'features' vector and the
## 'label' column. numLeaves=8 is the value used when the model is fitted on
## its own; the grid below varies it for cross-validation.
model = LightGBMClassifier(
  featuresCol='features',
  labelCol='label',
  numIterations=classifier_lightgbm_iterations,
  numLeaves=8,
  isUnbalance=True,
)

## Hyper-parameter grid: one candidate per entry of num_leaves_grid.
grid_builder = ParamGridBuilder()
grid_builder = grid_builder.addGrid(model.numLeaves, num_leaves_grid)
grid = grid_builder.build()

## Evaluator used to score each candidate against the 'label' column.
evaluator = BinaryClassificationEvaluator(labelCol='label')

## n_folds-fold cross-validation over the grid.
cv = CrossValidator(
  estimator=model,
  estimatorParamMaps=grid,
  evaluator=evaluator,
  numFolds=n_folds,
)

## Fit the model.

In [19]:
## Sanity check: fit the single classifier directly on the preprocessed data,
## without cross-validation.
## NOTE: this fit has been observed to succeed only some of the time.
model_fit = model.fit(train_proc_df)

In [20]:
## Estimate the model with cross-validation over the numLeaves grid.
## NOTE: this call has been observed to throw an error (cause not yet identified).
cv_fit = cv.fit(train_proc_df)
