## SparkR - Classification on Wine Data
#### Adapted from https://blog.learningtree.com/machine-learning-using-spark-r/

In [None]:
# Attach SparkR from the R/lib directory under the path held in SPARK_HOME
sparkr_lib_dir <- file.path(Sys.getenv("SPARK_HOME"), "R", "lib")
library(SparkR, lib.loc = sparkr_lib_dir)

In [None]:
# Start a Spark session with a local[*] master and 2g of driver memory
sparkR.session(
  master = "local[*]",
  sparkConfig = list(spark.driver.memory = "2g")
)

In [None]:
# Read the white-wine quality CSV into a Spark dataframe (`sdf`), treating the
# first row as a header and asking Spark to infer the column types.
# NOTE: <<<FILL-IN>>> is an exercise blank in the second positional argument of
# read.df(); the cell will not parse until it is replaced.
sdf <- read.df("winequality-white.csv", <<<FILL-IN>>>, header="true", inferSchema="true") 

In [None]:
# Cache dataframe
# NOTE: <<<FILL-IN>>> is an exercise blank for the object handed to cache();
# the cell will not parse until it is replaced.
cache(<<<FILL-IN>>>)

In [None]:
# Examine schema
# NOTE: <<<FILL-IN>>> is an exercise blank for the function that is called on
# `sdf` here; the cell will not parse until it is replaced.
<<<FILL-IN>>>(sdf)

In [None]:
# Split into train & test sets (weights 0.7 / 0.3)
# randomSplit() partitions the rows directly. It is used instead of sample()
# followed by except() because except() is a set difference (EXCEPT DISTINCT):
# it collapses duplicate rows in the test set and needs a second distributed
# operation just to recover the rows that were not sampled.
# `seed` is kept as a top-level variable because the model-training cell
# reuses it.
seed <- 12345
splits <- randomSplit(sdf, weights = c(0.7, 0.3), seed = seed)
train_df <- splits[[1]]
test_df <- splits[[2]]
dim(train_df)
# NOTE: <<<FILL-IN>>> is an exercise blank for the function applied to test_df;
# the cell will not parse until it is replaced.
<<<FILL-IN>>>(test_df)

In [None]:
# Train RF model
# Fits a random-forest classifier on train_df with `taste` as the response and
# every other column as a predictor (`taste ~ .`), passing the same `seed` that
# was defined for the train/test split.
# NOTE(review): `.` pulls in every remaining column of train_df. If an `id`
# column is present (one is selected from the predictions later), it is used
# as a feature too -- confirm that is intended.
# NOTE: both <<<FILL-IN>>> markers are exercise blanks (the numTrees value and
# the function applied to the fitted model before head()); the cell will not
# parse until they are replaced.
model <- spark.randomForest(train_df, taste ~ ., type="classification", numTrees=<<<FILL-IN>>>, seed=seed)
head(<<<FILL-IN>>>(model))

In [None]:
# Score the test rows with the fitted model, then collect only the row id and
# the predicted label into a local data.frame
predictions <- predict(model, test_df)
id_and_prediction <- select(predictions, "id", "prediction")
prediction_df <- collect(id_and_prediction)

In [None]:
# Evaluate
# dplyr is called through `dplyr::` instead of being attached with library():
# attaching it after SparkR masks SparkR functions such as select() and
# collect(), which breaks re-running the earlier Spark cells in this session.
# Only the `id` and `taste` columns are collected to the driver, since those
# are the only columns of `sdf` used below.
actual_df <- SparkR::collect(SparkR::select(sdf, "id", "taste"))

# Pair each prediction with its true label via the shared `id` column
joined_df <- dplyr::inner_join(actual_df, prediction_df, by = "id")
actual_vs_predicted <- dplyr::select(
  joined_df,
  id,
  actual = taste,
  predicted = prediction
)

# Fraction of test rows whose predicted label equals the actual label
mean(actual_vs_predicted$actual == actual_vs_predicted$predicted)

# Cross-tabulation: rows are actual labels, columns are predicted labels
table(actual_vs_predicted$actual, actual_vs_predicted$predicted)

In [None]:
# Examine first few rows of actual vs predicted wine quality values
# NOTE: <<<FILL-IN>>> is an exercise blank for the function applied to
# actual_vs_predicted; the cell will not parse until it is replaced.
<<<FILL-IN>>>(actual_vs_predicted)

In [None]:
# Persist the fitted model to the "wine-RF-model" path
# (NOTE: the target path must not already exist)
write.ml(model, path = "wine-RF-model")