In [0]:
import math

import mlflow
import pandas as pd
from hyperopt import fmin, hp, space_eval, tpe, STATUS_OK, Trials
from mlflow.tracking import MlflowClient
from sklearn import svm
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler

# Base DBFS directory that holds the project's parquet files.
userhome = 'dbfs:/user/skim658@gwu.edu'

# Read the red and white wine tables as Spark DataFrames, in that order.
redDF, whiteDF = (
  spark.read.parquet(userhome + relativePath)
  for relativePath in ('/final-project/reddf.parquet', '/final-project/whitedf.parquet')
)

In [0]:
# Pull the red wine data into pandas, detach the 'quality' target column,
# and hold out 20% of the rows as the test set (fixed seed for repeatability).
redPDDF = redDF.toPandas()
redQuality = redPDDF.pop('quality')
redXTrain, redXTest, redYTrain, redYTest = train_test_split(
  redPDDF,
  redQuality,
  test_size = 0.2,
  random_state = 1,
)

In [0]:
# Pull the white wine data into pandas, detach the 'quality' target column,
# and hold out 20% of the rows as the test set (fixed seed for repeatability).
whitePDDF = whiteDF.toPandas()
whiteQuality = whitePDDF.pop('quality')
whiteXTrain, whiteXTest, whiteYTrain, whiteYTest = train_test_split(
  whitePDDF,
  whiteQuality,
  test_size = 0.2,
  random_state = 1,
)

## Support Vector Machine (With Hyperopt)

In [0]:
# Hyperopt search space shared by every SVR search in this notebook:
#   c       - uniform over [1, 30], passed to SVR as C
#   epsilon - uniform over [0, 1]
#   kernel  - one of four kernel names; the list order defines the index
#             that fmin reports for the chosen kernel
searchSpace = dict(
  c = hp.uniform('c', 1, 30),
  epsilon = hp.uniform('epsilon', 0, 1),
  kernel = hp.choice('kernel', ['linear', 'poly', 'rbf', 'sigmoid']),
)

## Red Wine

In [0]:
def objective_function(params):
  """Hyperopt objective for the red wine SVR on the unscaled features.

  Fits an SVR on redXTrain/redYTrain inside a nested MLflow run, scores it on
  redXTest/redYTest, and logs the RMSE together with the sampled hyperparameters.

  Args:
    params: mapping with the keys 'c', 'epsilon' and 'kernel'.

  Returns:
    dict with 'loss' set to the RMSE and 'status' set to STATUS_OK.

  NOTE(review): the loss is computed on the test split, so the search picks
  hyperparameters against the same data used to report the final RMSE -
  consider a validation split or cross-validation.
  """
  with mlflow.start_run(nested = True):
    costValue = params['c']
    tubeWidth = params['epsilon']
    kernelName = params['kernel']

    # build and train the regressor in one step (fit returns the estimator)
    regressor = svm.SVR(C = costValue, epsilon = tubeWidth, kernel = kernelName).fit(redXTrain, redYTrain)

    # score on the held-out red wine rows
    squaredError = mean_squared_error(redYTest, regressor.predict(redXTest))
    rmse = math.sqrt(squaredError)
    mlflow.log_metric('rmse', rmse)

    # record the hyperparameters that produced this score
    for paramName, paramValue in (('c', costValue), ('epsilon', tubeWidth), ('kernel', kernelName)):
      mlflow.log_param(paramName, paramValue)

  return {'loss': rmse, 'status': STATUS_OK}

# Create the parent run; every fmin evaluation opens a nested child run through
# objective_function.
# NOTE(review): the run name says "SVC" although the objective fits svm.SVR;
# left unchanged so existing MLflow queries by run name keep working.
with mlflow.start_run(run_name = 'RED-SVC-All-Features') as run:
  num_evals = 100
  trials = Trials()
  best_hyperparam = fmin(fn = objective_function,
                        space = searchSpace,
                        algo = tpe.suggest,
                        max_evals = num_evals,
                        trials = trials)

  # fmin reports hp.choice parameters as positional indices, so logging
  # best_hyperparam directly would record kernel as 0..3 while the child runs
  # record the kernel name. Decode through the search space before logging.
  # best_hyperparam itself is left untouched for the cells that follow.
  for name, value in space_eval(searchSpace, best_hyperparam).items():
    mlflow.log_param(name, value)
  mlflow.log_metric('loss', trials.best_trial['result']['loss'])

In [0]:
# Translate the hp.choice index that fmin reports for 'kernel' back into its name.
# The tuple must stay in the same order as the 'kernel' options in searchSpace.
# The isinstance guard keeps this cell idempotent: on a re-run the value is
# already a name, and an index comparison chain would relabel it 'sigmoid'.
kernelNames = ('linear', 'poly', 'rbf', 'sigmoid')
kernelChoice = best_hyperparam['kernel']
if not isinstance(kernelChoice, str):
  best_hyperparam['kernel'] = kernelNames[int(kernelChoice)]

# one-row table of the best hyperparameters for red wine
redResult = pd.DataFrame(best_hyperparam, index = [0])
display(redResult)

c,epsilon,kernel
19.69920882164071,0.314123751937602,linear


In [0]:
# Show the lowest loss (the RMSE returned by the objective) found by the search
# as a one-row table.
temp = {'rmse': trials.best_trial['result']['loss']}
redMetricsDF = pd.DataFrame(temp, index = [0])
display(redMetricsDF)

rmse
0.6177887936957958


## White Wine

In [0]:
def objective_function(params):
  """Hyperopt objective for the white wine SVR on the unscaled features.

  Fits an SVR on whiteXTrain/whiteYTrain inside a nested MLflow run, scores it
  on whiteXTest/whiteYTest, and logs the RMSE together with the sampled
  hyperparameters.

  Args:
    params: mapping with the keys 'c', 'epsilon' and 'kernel'.

  Returns:
    dict with 'loss' set to the RMSE and 'status' set to STATUS_OK.

  NOTE(review): the loss is computed on the test split, so the search picks
  hyperparameters against the same data used to report the final RMSE -
  consider a validation split or cross-validation.
  """
  with mlflow.start_run(nested = True):
    costValue = params['c']
    tubeWidth = params['epsilon']
    kernelName = params['kernel']

    # build and train the regressor in one step (fit returns the estimator)
    regressor = svm.SVR(C = costValue, epsilon = tubeWidth, kernel = kernelName).fit(whiteXTrain, whiteYTrain)

    # score on the held-out white wine rows
    squaredError = mean_squared_error(whiteYTest, regressor.predict(whiteXTest))
    rmse = math.sqrt(squaredError)
    mlflow.log_metric('rmse', rmse)

    # record the hyperparameters that produced this score
    for paramName, paramValue in (('c', costValue), ('epsilon', tubeWidth), ('kernel', kernelName)):
      mlflow.log_param(paramName, paramValue)

  return {'loss': rmse, 'status': STATUS_OK}

# Create the parent run; every fmin evaluation opens a nested child run through
# objective_function.
# NOTE(review): the run name says "SVC" although the objective fits svm.SVR;
# left unchanged so existing MLflow queries by run name keep working.
with mlflow.start_run(run_name = 'WHITE-SVC-All-Features') as run:
  num_evals = 40
  trials = Trials()
  best_hyperparam = fmin(fn = objective_function,
                        space = searchSpace,
                        algo = tpe.suggest,
                        max_evals = num_evals,
                        trials = trials)

  # fmin reports hp.choice parameters as positional indices, so logging
  # best_hyperparam directly would record kernel as 0..3 while the child runs
  # record the kernel name. Decode through the search space before logging.
  # best_hyperparam itself is left untouched for the cells that follow.
  for name, value in space_eval(searchSpace, best_hyperparam).items():
    mlflow.log_param(name, value)
  mlflow.log_metric('loss', trials.best_trial['result']['loss'])
  
  

In [0]:
# Translate the hp.choice index that fmin reports for 'kernel' back into its name.
# The tuple must stay in the same order as the 'kernel' options in searchSpace.
# The isinstance guard keeps this cell idempotent: on a re-run the value is
# already a name, and an index comparison chain would relabel it 'sigmoid'.
kernelNames = ('linear', 'poly', 'rbf', 'sigmoid')
kernelChoice = best_hyperparam['kernel']
if not isinstance(kernelChoice, str):
  best_hyperparam['kernel'] = kernelNames[int(kernelChoice)]

# one-row table of the best hyperparameters for white wine
whiteResult = pd.DataFrame(best_hyperparam, index = [0])
display(whiteResult)

c,epsilon,kernel
29.93188577379107,0.2942173453301596,rbf


In [0]:
# Show the lowest loss (the RMSE returned by the objective) found by the search
# as a one-row table.
temp = {'rmse': trials.best_trial['result']['loss']}
whiteMetricsDF = pd.DataFrame(temp, index = [0])
display(whiteMetricsDF)

rmse
0.737228438775591


## Support Vector Machine on Standardized Features (StandardScaler)

In [0]:
# Fit the standardiser on the red training features only, then apply that same
# fitted scaling to both the training and the test features.
scaler = StandardScaler().fit(redXTrain)
redXTrainS = scaler.transform(redXTrain)
redXTestS = scaler.transform(redXTest)

## Red Wine

In [0]:
def objective_function(params):
  """Hyperopt objective for the red wine SVR on the standardized features.

  Fits an SVR on redXTrainS/redYTrain inside a nested MLflow run, scores it on
  redXTestS/redYTest, and logs the RMSE together with the sampled
  hyperparameters.

  Args:
    params: mapping with the keys 'c', 'epsilon' and 'kernel'.

  Returns:
    dict with 'loss' set to the RMSE and 'status' set to STATUS_OK.

  NOTE(review): the loss is computed on the test split, so the search picks
  hyperparameters against the same data used to report the final RMSE -
  consider a validation split or cross-validation.
  """
  with mlflow.start_run(nested = True):
    costValue = params['c']
    tubeWidth = params['epsilon']
    kernelName = params['kernel']

    # build and train the regressor in one step (fit returns the estimator)
    regressor = svm.SVR(C = costValue, epsilon = tubeWidth, kernel = kernelName).fit(redXTrainS, redYTrain)

    # score on the held-out, scaled red wine rows
    squaredError = mean_squared_error(redYTest, regressor.predict(redXTestS))
    rmse = math.sqrt(squaredError)
    mlflow.log_metric('rmse', rmse)

    # record the hyperparameters that produced this score
    for paramName, paramValue in (('c', costValue), ('epsilon', tubeWidth), ('kernel', kernelName)):
      mlflow.log_param(paramName, paramValue)

  return {'loss': rmse, 'status': STATUS_OK}

# Create the parent run; every fmin evaluation opens a nested child run through
# objective_function.
# NOTE(review): this run reuses the name of the earlier unscaled red wine search,
# and says "SVC" although the objective fits svm.SVR; the name is left unchanged
# so existing MLflow queries by run name keep working.
with mlflow.start_run(run_name = 'RED-SVC-All-Features') as run:
  num_evals = 200
  trials = Trials()
  best_hyperparam = fmin(fn = objective_function,
                        space = searchSpace,
                        algo = tpe.suggest,
                        max_evals = num_evals,
                        trials = trials)

  # fmin reports hp.choice parameters as positional indices, so logging
  # best_hyperparam directly would record kernel as 0..3 while the child runs
  # record the kernel name. Decode through the search space before logging.
  # best_hyperparam itself is left untouched for the cells that follow.
  for name, value in space_eval(searchSpace, best_hyperparam).items():
    mlflow.log_param(name, value)
  mlflow.log_metric('loss', trials.best_trial['result']['loss'])

In [0]:
# Translate the hp.choice index that fmin reports for 'kernel' back into its name.
# The tuple must stay in the same order as the 'kernel' options in searchSpace.
# The isinstance guard keeps this cell idempotent: on a re-run the value is
# already a name, and an index comparison chain would relabel it 'sigmoid'.
kernelNames = ('linear', 'poly', 'rbf', 'sigmoid')
kernelChoice = best_hyperparam['kernel']
if not isinstance(kernelChoice, str):
  best_hyperparam['kernel'] = kernelNames[int(kernelChoice)]

# one-row table of the best hyperparameters for red wine (scaled features)
redResult = pd.DataFrame(best_hyperparam, index = [0])
display(redResult)

c,epsilon,kernel
20.967432504027283,0.2483199370717698,linear


In [0]:
# Show the lowest loss (the RMSE returned by the objective) found by the search
# as a one-row table.
temp = {'rmse': trials.best_trial['result']['loss']}
redMetricsDF = pd.DataFrame(temp, index = [0])
display(redMetricsDF)

rmse
0.6207436384472562


## White Wine

In [0]:
# Fit a fresh standardiser on the white training features only, then apply that
# same fitted scaling to both the training and the test features.
scaler = StandardScaler().fit(whiteXTrain)
whiteXTrainS = scaler.transform(whiteXTrain)
whiteXTestS = scaler.transform(whiteXTest)

In [0]:
def objective_function(params):
  """Hyperopt objective for the white wine SVR on the standardized features.

  Fits an SVR on whiteXTrainS/whiteYTrain inside a nested MLflow run, scores it
  on whiteXTestS/whiteYTest, and logs the RMSE together with the sampled
  hyperparameters.

  Args:
    params: mapping with the keys 'c', 'epsilon' and 'kernel'.

  Returns:
    dict with 'loss' set to the RMSE and 'status' set to STATUS_OK.

  NOTE(review): the loss is computed on the test split, so the search picks
  hyperparameters against the same data used to report the final RMSE -
  consider a validation split or cross-validation.
  """
  with mlflow.start_run(nested = True):
    costValue = params['c']
    tubeWidth = params['epsilon']
    kernelName = params['kernel']

    # build and train the regressor in one step (fit returns the estimator)
    regressor = svm.SVR(C = costValue, epsilon = tubeWidth, kernel = kernelName).fit(whiteXTrainS, whiteYTrain)

    # score on the held-out, scaled white wine rows
    squaredError = mean_squared_error(whiteYTest, regressor.predict(whiteXTestS))
    rmse = math.sqrt(squaredError)
    mlflow.log_metric('rmse', rmse)

    # record the hyperparameters that produced this score
    for paramName, paramValue in (('c', costValue), ('epsilon', tubeWidth), ('kernel', kernelName)):
      mlflow.log_param(paramName, paramValue)

  return {'loss': rmse, 'status': STATUS_OK}

# Create the parent run; every fmin evaluation opens a nested child run through
# objective_function.
# NOTE(review): this run reuses the name of the earlier unscaled white wine
# search, and says "SVC" although the objective fits svm.SVR; the name is left
# unchanged so existing MLflow queries by run name keep working.
with mlflow.start_run(run_name = 'WHITE-SVC-All-Features') as run:
  num_evals = 200
  trials = Trials()
  best_hyperparam = fmin(fn = objective_function,
                        space = searchSpace,
                        algo = tpe.suggest,
                        max_evals = num_evals,
                        trials = trials)

  # fmin reports hp.choice parameters as positional indices, so logging
  # best_hyperparam directly would record kernel as 0..3 while the child runs
  # record the kernel name. Decode through the search space before logging.
  # best_hyperparam itself is left untouched for the cells that follow.
  for name, value in space_eval(searchSpace, best_hyperparam).items():
    mlflow.log_param(name, value)
  mlflow.log_metric('loss', trials.best_trial['result']['loss'])

In [0]:
# Translate the hp.choice index that fmin reports for 'kernel' back into its name.
# The tuple must stay in the same order as the 'kernel' options in searchSpace.
# The isinstance guard keeps this cell idempotent: on a re-run the value is
# already a name, and an index comparison chain would relabel it 'sigmoid'.
kernelNames = ('linear', 'poly', 'rbf', 'sigmoid')
kernelChoice = best_hyperparam['kernel']
if not isinstance(kernelChoice, str):
  best_hyperparam['kernel'] = kernelNames[int(kernelChoice)]

# one-row table of the best hyperparameters for white wine (scaled features)
whiteResult = pd.DataFrame(best_hyperparam, index = [0])
display(whiteResult)

c,epsilon,kernel
7.69538546472743,0.4775977960706389,rbf


In [0]:
# Show the lowest loss (the RMSE returned by the objective) found by the search
# as a one-row table.
temp = {'rmse': trials.best_trial['result']['loss']}
whiteMetricsDF = pd.DataFrame(temp, index = [0])
display(whiteMetricsDF)

rmse
0.6783760018076169
