In [0]:
# imports 
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml import Pipeline 
from pyspark.ml.regression import RandomForestRegressor 
from pyspark.ml.evaluation import RegressionEvaluator
import mlflow 
from mlflow.tracking import MlflowClient
import pandas as pd
from pyspark.ml.feature import RFormula

client = MlflowClient()

# load the datasets 
userhome = 'dbfs:/user/skim658@gwu.edu'

redDF = spark.read.parquet(userhome + '/final-project/reddf.parquet')
whiteDF = spark.read.parquet(userhome + '/final-project/whitedf.parquet')

In [0]:
# split the dataset into train and test sets 
redTrainDF, redTestDF = redDF.repartition(8).randomSplit([0.8, 0.2], seed = 42)
print(redTrainDF.cache().count())
whiteTrainDF, whiteTestDF = whiteDF.repartition(8).randomSplit([0.8, 0.2], seed = 42)
print(whiteTrainDF.cache().count())

## Random Forest with Cross Validation 

Max Depth: 2, 5, 10; Num Trees: 10, 20, 100

## Red Wine

In [0]:
# log random forest regressor model 
with mlflow.start_run(run_name = 'RED-RF-All-Features') as run:
  # model 
  redRF = RandomForestRegressor(labelCol = 'quality')
  vecAssemblerRF = VectorAssembler(inputCols = [x for x in redTrainDF.columns if x != 'quality'], outputCol = 'features')
  # cross validation 
  rfParamGrid = ParamGridBuilder().addGrid(redRF.maxDepth, [2, 5, 10]).addGrid(redRF.numTrees, [10, 20, 100]).build()
  evaluator = RegressionEvaluator(predictionCol = 'prediction', labelCol = 'quality')
  cv = CrossValidator(estimator = redRF, evaluator = evaluator, estimatorParamMaps = rfParamGrid, numFolds = 3, parallelism = 4)
  
  # pipeline 
  pipeline = Pipeline(stages = [vecAssemblerRF, cv])
  pipelineModel = pipeline.fit(redTrainDF)
  
  cvModel = pipelineModel.stages[-1]
  rfModel = cvModel.bestModel 
  
  # create and evaluate predictions 
  redPredDF = pipelineModel.transform(redTestDF)
  display(redPredDF.select('features', 'quality', 'prediction'))
  
  # log parameters
  mlflow.log_param('features', 'all')
  mlflow.log_param('color', 'red')
  # log metric
  rmse = evaluator.setMetricName('rmse').evaluate(redPredDF)
  mlflow.log_metric('rmse', rmse)
  

features,quality,prediction
"Map(vectorType -> dense, length -> 11, values -> List(5.4, 0.74, 0.0, 1.2, 0.041, 16.0, 46.0, 0.99258, 4.01, 0.59, 12.5))",6,5.93423442814284
"Map(vectorType -> dense, length -> 11, values -> List(5.9, 0.61, 0.08, 2.1, 0.071, 16.0, 24.0, 0.99376, 3.56, 0.77, 11.1))",6,5.946039611242907
"Map(vectorType -> dense, length -> 11, values -> List(6.0, 0.5, 0.0, 1.4, 0.057, 15.0, 26.0, 0.99448, 3.36, 0.45, 9.5))",5,5.042511296387279
"Map(vectorType -> dense, length -> 11, values -> List(6.2, 0.64, 0.09, 2.5, 0.081, 15.0, 26.0, 0.99538, 3.57, 0.63, 12.0))",5,6.126822591147557
"Map(vectorType -> dense, length -> 11, values -> List(6.4, 0.56, 0.15, 1.8, 0.078, 17.0, 65.0, 0.99294, 3.33, 0.6, 10.5))",6,5.713720241453626
"Map(vectorType -> dense, length -> 11, values -> List(6.6, 0.815, 0.02, 2.7, 0.072, 17.0, 34.0, 0.9955, 3.58, 0.89, 12.3))",7,6.394349623698464
"Map(vectorType -> dense, length -> 11, values -> List(6.8, 0.91, 0.06, 2.0, 0.06, 4.0, 11.0, 0.99592, 3.53, 0.64, 10.9))",4,5.552409827055385
"Map(vectorType -> dense, length -> 11, values -> List(7.0, 0.36, 0.21, 2.4, 0.086, 24.0, 69.0, 0.99556, 3.4, 0.53, 10.1))",6,5.464368203044319
"Map(vectorType -> dense, length -> 11, values -> List(7.0, 0.685, 0.0, 1.9, 0.067, 40.0, 63.0, 0.9979, 3.6, 0.81, 9.9))",5,5.28340437170132
"Map(vectorType -> dense, length -> 11, values -> List(7.0, 0.78, 0.08, 2.0, 0.093, 10.0, 19.0, 0.9956, 3.4, 0.47, 10.0))",5,5.019069493173255


In [0]:
# see the best model parameters 
paramDict = rfModel.extractParamMap()
tempDict = {}
for k, v in paramDict.items():
  tempDict[k.name] = v

bestMaxDepth = tempDict['maxDepth']
bestnumTrees = tempDict['numTrees']

tempDict = {'Max Depth': [bestMaxDepth], 'Num Trees': [bestnumTrees]}
bestCVModel = pd.DataFrame(tempDict, index = [0])
display(bestCVModel)

Max Depth,Num Trees
10,100


In [0]:
# get best model metrics 
experiment_id = run.info.experiment_id
runs = client.search_runs(experiment_id, order_by=["attributes.start_time desc"])
for run in runs:
  if run.data.params['mlModelClass'] == 'RandomForestRegressor' and int(run.data.params['numTrees']) == bestnumTrees and int(run.data.params['maxDepth']) == bestMaxDepth:
    print(run) 
    redMetricsDF = pd.DataFrame(run.data.metrics, index = [0])
    break
display(redMetricsDF)

avg_rmse,std_rmse
0.6099007342616175,0.0166693445213393


In [0]:
# feature importance 
pandasDF = pd.DataFrame(list(zip([x for x in redTrainDF.columns if x != 'quality'], rfModel.featureImportances)), columns=["feature", "importance"])
topFeatures = pandasDF.sort_values(["importance"], ascending=False)
topFeatures

Unnamed: 0,feature,importance
10,alcohol,0.235373
9,sulphates,0.165005
1,volatile_acidity,0.120866
6,total_sulfur_dioxide,0.078331
0,fixed_acidity,0.068922
4,chlorides,0.064406
7,density,0.064012
8,pH,0.053194
5,free_sulfur_dioxide,0.051725
2,citric_acid,0.050607


## White Wine

In [0]:
# log random forest regressor model 
with mlflow.start_run(run_name = 'WHITE-RF-All-Features') as run:
  # model 
  whiteRF = RandomForestRegressor(labelCol = 'quality')
  vecAssemblerRF = VectorAssembler(inputCols = [x for x in whiteTrainDF.columns if x != 'quality'], outputCol = 'features')
  # cross validation 
  rfParamGrid = ParamGridBuilder().addGrid(whiteRF.maxDepth, [2, 5, 10]).addGrid(whiteRF.numTrees, [10, 20, 100]).build()
  evaluator = RegressionEvaluator(predictionCol = 'prediction', labelCol = 'quality')
  cv = CrossValidator(estimator = whiteRF, evaluator = evaluator, estimatorParamMaps = rfParamGrid, numFolds = 3, parallelism = 4)
  
  # pipeline 
  pipeline = Pipeline(stages = [vecAssemblerRF, cv])
  pipelineModel = pipeline.fit(whiteTrainDF)
  
  cvModel = pipelineModel.stages[-1]
  rfModel = cvModel.bestModel 
  
  # create and evaluate predictions 
  whitePredDF = pipelineModel.transform(whiteTestDF)
  display(whitePredDF.select('features', 'quality', 'prediction'))
  
  # log parameters
  mlflow.log_param('features', 'all')
  mlflow.log_param('color', 'white')
  # log metric
  rmse = evaluator.setMetricName('rmse').evaluate(whitePredDF)
  mlflow.log_metric('rmse', rmse)
  

features,quality,prediction
"Map(vectorType -> dense, length -> 11, values -> List(4.9, 0.47, 0.17, 1.9, 0.035, 60.0, 148.0, 0.98964, 3.27, 0.35, 11.5))",6,6.378933526347721
"Map(vectorType -> dense, length -> 11, values -> List(5.1, 0.35, 0.26, 6.8, 0.034, 36.0, 120.0, 0.99188, 3.38, 0.4, 11.5))",6,6.370790906744824
"Map(vectorType -> dense, length -> 11, values -> List(5.2, 0.3, 0.34, 1.5, 0.038, 18.0, 96.0, 0.98942, 3.56, 0.48, 13.0))",8,6.614513653705502
"Map(vectorType -> dense, length -> 11, values -> List(5.4, 0.29, 0.38, 1.2, 0.029, 31.0, 132.0, 0.98895, 3.28, 0.36, 12.4))",6,6.368252371060446
"Map(vectorType -> dense, length -> 11, values -> List(5.6, 0.18, 0.27, 1.7, 0.03, 31.0, 103.0, 0.98892, 3.35, 0.37, 12.9))",6,6.726008184447489
"Map(vectorType -> dense, length -> 11, values -> List(5.6, 0.26, 0.18, 1.4, 0.034, 18.0, 135.0, 0.99174, 3.32, 0.35, 10.2))",6,5.657450815132791
"Map(vectorType -> dense, length -> 11, values -> List(5.7, 0.2, 0.3, 2.5, 0.046, 38.0, 125.0, 0.99276, 3.34, 0.5, 9.9))",6,5.938171655128006
"Map(vectorType -> dense, length -> 11, values -> List(5.7, 0.28, 0.35, 1.2, 0.052, 39.0, 141.0, 0.99108, 3.44, 0.69, 11.3))",6,6.37545221691866
"Map(vectorType -> dense, length -> 11, values -> List(5.8, 0.22, 0.29, 1.3, 0.036, 25.0, 68.0, 0.98865, 3.24, 0.35, 12.6))",6,6.302192592568122
"Map(vectorType -> dense, length -> 11, values -> List(5.8, 0.23, 0.27, 1.8, 0.043, 24.0, 69.0, 0.9933, 3.38, 0.31, 9.4))",6,5.788330841691096


In [0]:
# see the best model parameters 
paramDict = rfModel.extractParamMap()
tempDict = {}
for k, v in paramDict.items():
  tempDict[k.name] = v

bestMaxDepth = tempDict['maxDepth']
bestnumTrees = tempDict['numTrees']

tempDict = {'Max Depth': [bestMaxDepth], 'Num Trees': [bestnumTrees]}
bestCVModel = pd.DataFrame(tempDict, index = [0])
display(bestCVModel)

Max Depth,Num Trees
10,100


In [0]:
# get best model metrics 
experiment_id = run.info.experiment_id
runs = client.search_runs(experiment_id, order_by=["attributes.start_time desc"])
for run in runs:
  if run.data.params['mlModelClass'] == 'RandomForestRegressor' and int(run.data.params['numTrees']) == bestnumTrees and int(run.data.params['maxDepth']) == bestMaxDepth:
    print(run) 
    whiteMetricsDF = pd.DataFrame(run.data.metrics, index = [0])
    break
display(whiteMetricsDF)

avg_rmse,std_rmse
0.6535970142964916,0.0190223494929699


In [0]:
# feature importance 
pandasDF = pd.DataFrame(list(zip([x for x in redTrainDF.columns if x != 'quality'], rfModel.featureImportances)), columns=["feature", "importance"])
topFeatures = pandasDF.sort_values(["importance"], ascending=False)
topFeatures

Unnamed: 0,feature,importance
10,alcohol,0.221868
1,volatile_acidity,0.121461
5,free_sulfur_dioxide,0.108984
7,density,0.10068
4,chlorides,0.081319
8,pH,0.065586
6,total_sulfur_dioxide,0.064543
2,citric_acid,0.061542
0,fixed_acidity,0.060646
3,residual_sugar,0.060332


## Random Forests with Hyperopt Hyperparameter Tuning

In [0]:
# imports 
from hyperopt import hp
from hyperopt import fmin, tpe, STATUS_OK, Trials 

In [0]:
# define the search space 
search_space = {
  'maxDepth': hp.randint('maxDepth', 2, 30),
  'maxBins': hp.randint('maxBins', 10, 40),
  'numTrees': hp.randint('numTrees', 10, 100)
}

## Red Wine

In [0]:
def objective_function(params):
  with mlflow.start_run(nested = True):
    # hyperparameters to tune 
    maxDepth = params['maxDepth']
    maxBins = params['maxBins']
    numTrees = params['numTrees']
    
    # model 
    vecAssemblerRF = VectorAssembler(inputCols = [x for x in redTrainDF.columns if x != 'quality'], outputCol = 'features')
    rf = RandomForestRegressor(labelCol = 'quality', maxBins = maxBins, maxDepth = maxDepth, numTrees = numTrees, seed = 42)
    
    # pipeline 
    pipeline = Pipeline(stages = [vecAssemblerRF, rf])
    pipelineModel = pipeline.fit(redTrainDF)
    
    # evaluate predictions 
    redPredDF = pipelineModel.transform(redTestDF)
    regressionEvaluator = RegressionEvaluator(predictionCol = 'prediction', labelCol = 'quality')
    rmse = regressionEvaluator.evaluate(redPredDF)
    
    # log parameters
    mlflow.log_param('maxDepth', maxDepth)
    mlflow.log_param('maxBins', maxBins)
    mlflow.log_param('numTrees', numTrees)
    mlflow.log_param('color', 'red')
    mlflow.log_param('features', 'all')
    mlflow.log_param('tuning', 'hyperopt')
    mlflow.log_metric('rmse', rmse)
  
  return {'loss': rmse, 'status': STATUS_OK}

# create parent run 
with mlflow.start_run(run_name = 'RED-RF-All-Features-Hyperopt') as run:
  num_evals = 30
  trials = Trials()
  best_hyperparam = fmin(fn = objective_function,
                        space = search_space,
                        algo = tpe.suggest,
                        max_evals = num_evals,
                        trials = trials)
  # log param and metric for best model 
  for name, value in best_hyperparam.items():
    mlflow.log_param(name, value)
  mlflow.log_metric('loss', trials.best_trial['result']['loss'])

In [0]:
redResult = pd.DataFrame(best_hyperparam, index = [0])
display(redResult)

maxBins,maxDepth,numTrees
14,26,54


In [0]:
bestMaxBins = best_hyperparam['maxBins']
bestMaxDepth = best_hyperparam['maxDepth']
bestNumTrees = best_hyperparam['numTrees']

In [0]:
# get best model metrics 
experiment_id = run.info.experiment_id
runs = client.search_runs(experiment_id, order_by=["attributes.start_time desc"])
for run in runs:
  if run.data.params['tuning'] == 'hyperopt' and int(run.data.params['maxBins']) == bestMaxBins and int(run.data.params['maxDepth']) == bestMaxDepth and int(run.data.params['numTrees']) == bestNumTrees:
    print(run) 
    redMetricsDF = pd.DataFrame(run.data.metrics, index = [0])
    break
display(redMetricsDF)

rmse
0.5858820704844101


## White Wine

In [0]:
def objective_function(params):
  with mlflow.start_run(nested = True):
    # hyperparameters to tune 
    maxDepth = params['maxDepth']
    maxBins = params['maxBins']
    numTrees = params['numTrees']
    
    # model 
    vecAssemblerRF = VectorAssembler(inputCols = [x for x in whiteTrainDF.columns if x != 'quality'], outputCol = 'features')
    rf = RandomForestRegressor(labelCol = 'quality', maxBins = maxBins, maxDepth = maxDepth, numTrees = numTrees, seed = 42)
    
    # pipeline 
    pipeline = Pipeline(stages = [vecAssemblerRF, rf])
    pipelineModel = pipeline.fit(whiteTrainDF)
    
    # evaluate predictions 
    whitePredDF = pipelineModel.transform(whiteTestDF)
    regressionEvaluator = RegressionEvaluator(predictionCol = 'prediction', labelCol = 'quality')
    rmse = regressionEvaluator.evaluate(whitePredDF)
    
    # log parameters
    mlflow.log_param('maxDepth', maxDepth)
    mlflow.log_param('maxBins', maxBins)
    mlflow.log_param('numTrees', numTrees)
    mlflow.log_param('color', 'white')
    mlflow.log_param('features', 'all')
    mlflow.log_param('tuning', 'hyperopt')
    mlflow.log_metric('rmse', rmse)
  
  return {'loss': rmse, 'status': STATUS_OK}

# create parent run 
with mlflow.start_run(run_name = 'WHITE-RF-All-Features-Hyperopt') as run:
  num_evals = 10
  trials = Trials()
  best_hyperparam = fmin(fn = objective_function,
                        space = search_space,
                        algo = tpe.suggest,
                        max_evals = num_evals,
                        trials = trials)
  # log param and metric for best model 
  for name, value in best_hyperparam.items():
    mlflow.log_param(name, value)
  mlflow.log_metric('loss', trials.best_trial['result']['loss'])

In [0]:
whiteResult = pd.DataFrame(best_hyperparam, index = [0])
display(whiteResult)

maxBins,maxDepth,numTrees
21,29,94


In [0]:
bestMaxBins = best_hyperparam['maxBins']
bestMaxDepth = best_hyperparam['maxDepth']
bestNumTrees = best_hyperparam['numTrees']

In [0]:
# get best model metrics 
experiment_id = run.info.experiment_id
runs = client.search_runs(experiment_id, order_by=["attributes.start_time desc"])
for run in runs:
  if run.data.params['tuning'] == 'hyperopt' and int(run.data.params['maxBins']) == bestMaxBins and int(run.data.params['maxDepth']) == bestMaxDepth and int(run.data.params['numTrees']) == bestNumTrees:
    print(run) 
    whiteMetricsDF = pd.DataFrame(run.data.metrics, index = [0])
    break
display(whiteMetricsDF)

rmse
0.63561053499133


## with Feature Selection (Test)

## Red Wine

In [0]:
def objective_function(params):
  maxDepth = params['maxDepth']
  maxBins = params['maxBins']
  numTrees = params['numTrees']
  
  # model 
  vecAssembler = VectorAssembler(inputCols = [x for x in redTrainDF.columns if x != 'quality' and x != 'pH' and x != 'free_sulfur_dioxide' and x != 'citric_acid' and x != 'residual_sugar'], outputCol = 'features')
  rf = RandomForestRegressor(labelCol = 'quality', maxBins = maxBins, maxDepth = maxDepth, numTrees = numTrees, seed = 42)
  
  # pipeline 
  pipeline = Pipeline(stages = [vecAssembler, rf])
  pipelineModel = pipeline.fit(redTrainDF)
  
  # evaluate predictions 
  redPredDF = pipelineModel.transform(redTestDF) 
  regressionEvaluator = RegressionEvaluator(predictionCol = 'prediction', labelCol = 'quality')
  rmse = regressionEvaluator.setMetricName('rmse').evaluate(redPredDF)
  
  return {'loss': rmse, 'status': STATUS_OK}

num_evals = 30 
trials = Trials() 
best_hyperparam = fmin(fn = objective_function, 
                      space = search_space, 
                      algo = tpe.suggest,  
                      max_evals = num_evals,
                      trials = trials)
print(trials.best_trial['result']['loss'])

In [0]:
print(best_hyperparam)

## White Wine

In [0]:
def objective_function(params):
  maxDepth = params['maxDepth']
  maxBins = params['maxBins']
  numTrees = params['numTrees']
  
  # model 
  vecAssembler = VectorAssembler(inputCols = [x for x in whiteTrainDF.columns if x != 'quality' and x != 'citric_acid' and x != 'residual_sugar' and x != 'fixed_acidity' and x != 'sulphates'], outputCol = 'features')
  rf = RandomForestRegressor(labelCol = 'quality', maxBins = maxBins, maxDepth = maxDepth, numTrees = numTrees, seed = 42)
  
  # pipeline 
  pipeline = Pipeline(stages = [vecAssembler, rf])
  pipelineModel = pipeline.fit(whiteTrainDF)
  
  # evaluate predictions  
  whitePredDF = pipelineModel.transform(whiteTestDF)  
  regressionEvaluator = RegressionEvaluator(predictionCol = 'prediction', labelCol = 'quality') 
  rmse = regressionEvaluator.setMetricName('rmse').evaluate(whitePredDF) 
  
  return {'loss': rmse, 'status': STATUS_OK}

num_evals = 10 
trials = Trials()
best_hyperparam = fmin(fn = objective_function, 
                      space = search_space, 
                      algo = tpe.suggest, 
                      max_evals = num_evals,
                      trials = trials)
print(trials.best_trial['result']['loss'])

In [0]:
print(best_hyperparam)