In [0]:
!pip install joblibspark
!pip install rfpimp

In [0]:
from joblibspark import register_spark
from sklearn.ensemble import RandomForestRegressor
import numpy as np
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import TimeSeriesSplit
from sklearn.utils import parallel_backend  
from rfpimp import importances

In [0]:
#1) function to obtain cleaned dfp and dfnp from the original dataset, which are rows that do and do not contain sales data (various sales columns in the dataset) respectively. we require to obtain these 2 separate dataframes as we will fit a different random forest model for feature selection to each of these 2 dataframes separately.

def obtain_cleaned_dfp_dfnp(df):  #takes in a spark dataframe named df
  """Split a raw spark dataframe into cleaned promo / non-promo modelling frames.

  Steps, in order:
    1. lower-case every column name
    2. replace nulls with 0 in every column whose name contains 'order'
    3. split rows on the 'promo' column ('YES' -> dfp, 'NO' -> dfnp)
    4. drop identifier/date columns from both frames, and additionally every
       column whose name contains 'sales' from dfnp
    5. drop every row that still contains a null

  Args:
    df: spark dataframe that has a 'promo' column (any letter case) holding
      the strings 'YES' / 'NO'.

  Returns:
    (dfp, dfnp): a tuple of two spark dataframes (not pandas dataframes).
  """
  #convert all col names to lowercase in one projection. the loop variable is deliberately NOT called `col`:
  #a local named `col` would be a plain string and `col('promo')` further down would raise a TypeError
  df = df.toDF(*[name.lower() for name in df.columns])

  #convert any nulls in various order quantity columns to 0. this is applicable to this specific use-case only and can be removed for other datasets
  order_quant_cols = [name for name in df.columns if 'order' in name]
  df = df.na.fill(value=0, subset=order_quant_cols)

  #separate original df into 2 dfs, promo=YES and promo=NO, named dfp and dfnp respectively.
  #df['promo'] is used so that no import of pyspark.sql.functions.col is needed
  dfp = df.filter(df['promo'] == 'YES')
  dfnp = df.filter(df['promo'] == 'NO')

  #retain only cols required for modelling for dfp and dfnp respectively
  non_model_cols = ['pl2_business_id', 'material', 'cust_lev6', 'week', 'min_date', 'promo', 'month', 'year']
  sales_cols = [name for name in df.columns if 'sales' in name]

  dfp = dfp.select([name for name in df.columns if name not in non_model_cols])

  #dfnp additionally loses every sales column
  dfnp_excluded = non_model_cols + sales_cols
  dfnp = dfnp.select([name for name in df.columns if name not in dfnp_excluded])

  #drop any nulls in any feature. this can be changed to impute nulls instead in other use-cases. however, in our specific use-case here, only few rows contain nulls in features so we can afford to drop them without losing too much data
  dfp = dfp.na.drop()
  dfnp = dfnp.na.drop()

  return (dfp, dfnp)  #note: both dfp and dfnp are spark dataframes, not pandas dataframes

In [0]:
#2) function to obtain tuned random forest

def run_rf(df):  #takes in a spark dataframe
  """Tune a random forest regressor on 'order_quantity' with a randomized search.

  The spark dataframe is collected to pandas once, split positionally into the
  first 70% of rows (train) and the remaining rows (test), and a
  RandomizedSearchCV using TimeSeriesSplit folds is fitted on the training part
  through the joblib 'spark' backend.

  Args:
    df: spark dataframe containing the target column 'order_quantity'; every
      other column is used as a feature.
      NOTE(review): the positional split assumes the rows come back from
      toPandas() in chronological order - confirm/sort upstream, since no
      date column is available here to sort on.

  Returns:
    ((opt_max_depth, opt_min_samples_leaf, opt_min_samples_split, opt_criterion),
     (X_train, X_test, y_train, y_test))
    where the X_* objects are pandas dataframes and the y_* objects are pandas series.
  """
  register_spark()  #register spark backend for parallelization later

  target_col = 'order_quantity'
  feature_cols = [name for name in df.columns if name != target_col]

  #collect to pandas exactly once. a spark dataframe supports neither len() nor .iloc, so the split has to be done on the pandas copy
  pdf = df.toPandas()

  #obtain training and test sets. note that since we are working with timeseries data, we cannot simply use scikitlearn's train_test_split() function to do this since rows in the test set must always belong to a time period after rows in the training set to avoid data leakage
  train_test_split_index = int(0.7 * len(pdf))  #iloc needs an integer position, not a float
  train = pdf.iloc[:train_test_split_index]
  test = pdf.iloc[train_test_split_index:]
  X_train = train[feature_cols]
  X_test = test[feature_cols]
  y_train = train[target_col]
  y_test = test[target_col]

  rf = RandomForestRegressor(n_estimators=300, random_state=15, max_features='sqrt', max_samples=0.9)
  #tune the most important hyperparameters of a random forest model.
  #min_samples_split must be an integer >= 2 (or a fraction), so the candidate value 1 was removed: it could only produce failed fits.
  #NOTE(review): 'mae'/'mse' are the criterion names of older scikit-learn releases; newer releases spell them 'absolute_error'/'squared_error' - adjust to the installed version
  param_grid = {
    'max_depth': [20, 25, 30, 40, 50],
    'min_samples_leaf': [1, 2, 4, 8],
    'criterion': ['mae', 'mse'],
    'min_samples_split': [2, 4, 8],
  }
  timeseries_cv = TimeSeriesSplit(n_splits=5)  #we cannot use a normal cross-validator since we must ensure rows in the test set must always belong to a time period after rows in the training set to avoid data leakage, hence we use a special cross-validator type here to ensure this
  rs_rf = RandomizedSearchCV(rf, param_grid, cv=timeseries_cv, n_iter=120, verbose=10, random_state=15, refit=True)  #n_iter refers to the number of hyperparameter set combinations to test out under this randomized search

  print('starting to tune rf')

  with parallel_backend('spark', n_jobs=15):  #using n_jobs=-1 takes v long to parallelize! so use 15
    rs_rf.fit(X_train, y_train)

  #get best hyperparams
  best_params = rs_rf.best_params_
  print(f'best hyperparams are: {best_params}')

  opt_max_depth = best_params['max_depth']
  opt_min_samples_leaf = best_params['min_samples_leaf']
  opt_min_samples_split = best_params['min_samples_split']
  opt_criterion = best_params['criterion']

  return ((opt_max_depth, opt_min_samples_leaf, opt_min_samples_split, opt_criterion), (X_train, X_test, y_train, y_test))

In [0]:
#3) function to obtain permutation importances

def get_perm_imptance(df, opt_max_depth, opt_min_samples_leaf, opt_min_samples_split, opt_criterion, X_train, X_test, y_train, y_test, biz_name):
  """Fit the tuned random forest and compute permutation importances on the test split.

  When df has at least one feature column whose name contains 'sales', all such
  columns are passed to rfpimp's importances() as a single grouped feature, the
  resulting row is relabelled 'all_sales_cols', and the table is saved with the
  '_promo' suffix. Otherwise every feature is scored individually and the table
  is saved with the '_nopromo' suffix.

  Args:
    df: dataframe whose .columns lists the features plus 'order_quantity'
      (only .columns is read from it).
    opt_max_depth, opt_min_samples_leaf, opt_min_samples_split, opt_criterion:
      hyperparameters passed to RandomForestRegressor.
    X_train, X_test, y_train, y_test: train/test data used for fitting the
      forest and for scoring the importances respectively.
    biz_name: prefix of the output file name.

  Returns:
    The importance table returned by importances(), after reset_index, so the
    feature names are in a 'Feature' column.

  Side effects:
    Writes the table as csv under dbfs:/FileStore/feature_selection_project/
    using the notebook-global `spark` session.
  """
  rf = RandomForestRegressor(n_estimators=300, random_state=15, max_features='sqrt', max_samples=0.9, max_depth=opt_max_depth, min_samples_leaf=opt_min_samples_leaf, min_samples_split=opt_min_samples_split, criterion=opt_criterion)
  rf.fit(X_train, y_train)  #takes ~3min

  print('rf built. running perm impt')

  #run perm imptance
  feature_cols = [name for name in df.columns if name != 'order_quantity']
  #substring match on every feature name. testing `'sales' in feature_cols` would only be true for a column named exactly 'sales'
  all_sales_cols = [name for name in feature_cols if 'sales' in name]

  #1) working with dfp
  if all_sales_cols:
    all_other_cols = [name for name in feature_cols if 'sales' not in name]

    #one nested list = one grouped feature; the remaining features are scored individually
    grouped_features = [all_sales_cols] + all_other_cols

    feature_impt = importances(model=rf, X_valid=X_test, y_valid=y_test, features=grouped_features)  #takes 25s. gives a pandas df with rows alr sorted by importance!! (most impt first)
    feature_impt.reset_index(inplace=True)

    print('feature impportance obtained. saving it as a file')

    #the importances() function above concatenates all the sales columns' names together since they are passed in as a grouped feature due to high collinearity between the sales columns. change this long concatenated name into simply 'all_sales_cols' for easier reading
    sales_row = next((pos for pos, feature in enumerate(feature_impt['Feature']) if 'sales' in feature), None)
    if sales_row is not None:  #guard: without a match there is no row to relabel and iloc would go out of bounds
      feature_impt.iloc[sales_row, 0] = 'all_sales_cols'

    file_suffix = 'promo'

  #2) working with dfnp
  #dfnp does not contain any sales column since it contains rows that have promo=NO, hence we do not run into the problem of the concatenation of column names above
  else:
    feature_impt = importances(model=rf, X_valid=X_test, y_valid=y_test)
    feature_impt.reset_index(inplace=True)

    print('feature importance obtained. saving it as a file')

    file_suffix = 'nopromo'

  #save feature_impt to dbfs so we don't lose this after obtaining the results
  spark.createDataFrame(feature_impt).coalesce(1).write.format('com.databricks.spark.csv').option('header', 'true').save(f'dbfs:/FileStore/feature_selection_project/{biz_name}_{file_suffix}.csv')

  return feature_impt

In [0]:
#4) function to obtain final selected features. here, we select all features with permutation importance > 0. this threshold can be changed in the future as required

def get_final_features(feature_impt, sales_cols=None):
  """Return the names of all features whose permutation importance is > 0.

  If the table contains the grouped row 'all_sales_cols' and that row is
  selected, it is expanded back into the individual sales column names.

  Args:
    feature_impt: pandas dataframe with a 'Feature' and an 'Importance' column.
    sales_cols: optional list of the column names that the 'all_sales_cols'
      row stands for. When omitted and the table contains 'all_sales_cols',
      the names are taken from the notebook-global spark dataframe `dfp`
      (every column whose name contains 'sales'), as the original code did.

  Returns:
    List of selected feature names, in the row order of feature_impt.

  Raises:
    NameError: if sales_cols is omitted, the table contains 'all_sales_cols'
      and no global `dfp` exists.
  """
  feature_col_list = list(feature_impt['Feature'])

  new_feature_impt = feature_impt[feature_impt['Importance']>0]
  selected_features = list(new_feature_impt['Feature'])

  #2) working with dfnp: no grouped sales row, the table rows are the features themselves
  if 'all_sales_cols' not in feature_col_list:
    print(f'number of features selected: {len(selected_features)} out of {len(feature_col_list)}')
    return selected_features

  #1) working with dfp
  if sales_cols is None:
    sales_cols = [name for name in dfp.columns if 'sales' in name]  #legacy fallback on the notebook-global dfp
  else:
    sales_cols = list(sales_cols)

  #the grouped row stands for len(sales_cols) real columns, so it is counted as those columns and not additionally as a feature of its own
  total_features = len(feature_col_list) - 1 + len(sales_cols)

  final_selected_features = []
  for feature in selected_features:
    if feature == 'all_sales_cols':
      final_selected_features.extend(sales_cols)
    else:
      final_selected_features.append(feature)

  print(f'number of features selected: {len(final_selected_features)} out of {total_features}')

  return final_selected_features

In [0]:
#example of the flow of using the above functions, on USR business

biz_name = 'usr'
(dfp, dfnp) = obtain_cleaned_dfp_dfnp(usr)  #note: usr is a spark dataframe of the file sales_history_sell_out_usr.csv

#run the full pipeline once per dataframe: tune the forest, compute + save permutation importances, then pick the final features
for df in [dfp, dfnp]:
  ((opt_max_depth, opt_min_samples_leaf, opt_min_samples_split, opt_criterion), (X_train, X_test, y_train, y_test)) = run_rf(df)
  feature_impt = get_perm_imptance(df, opt_max_depth, opt_min_samples_leaf, opt_min_samples_split, opt_criterion, X_train, X_test, y_train, y_test, biz_name)
  final_selected_features = get_final_features(feature_impt)  #pass the table computed on the previous line (the name `fi` was never defined)
  print(final_selected_features)  #print to see the final selected features
  