## SparkR - Classification on Wine Data
#### Adapted from https://blog.learningtree.com/machine-learning-using-spark-r/

In [None]:
# Load SparkR library
library(SparkR, lib.loc = c(file.path(Sys.getenv("SPARK_HOME"), "R", "lib")))

In [None]:
# Start Spark session
sparkR.session(master="local[*]", sparkConfig=list(spark.driver.memory="2g"))

In [None]:
# Read data into a Spark dataframe
sdf <- read.df("/srv/jupyterhub/read-only/data/winequality-white.csv", "csv", header="true", inferSchema="true") 

In [7]:
# Cache dataframe
cache(sdf)

SparkDataFrame[id:int, fixed_acidity:double, volatile_acidity:double, citric_acid:double, residual_sugar:double, chlorides:double, free_sulfur_dioxide:double, total_sulfur_dioxide:double, density:double, pH:double, sulphates:double, alcohol:double, taste:string]

In [8]:
# Examine schema
schema(sdf)

StructType
|-name = "id", type = "IntegerType", nullable = TRUE
|-name = "fixed_acidity", type = "DoubleType", nullable = TRUE
|-name = "volatile_acidity", type = "DoubleType", nullable = TRUE
|-name = "citric_acid", type = "DoubleType", nullable = TRUE
|-name = "residual_sugar", type = "DoubleType", nullable = TRUE
|-name = "chlorides", type = "DoubleType", nullable = TRUE
|-name = "free_sulfur_dioxide", type = "DoubleType", nullable = TRUE
|-name = "total_sulfur_dioxide", type = "DoubleType", nullable = TRUE
|-name = "density", type = "DoubleType", nullable = TRUE
|-name = "pH", type = "DoubleType", nullable = TRUE
|-name = "sulphates", type = "DoubleType", nullable = TRUE
|-name = "alcohol", type = "DoubleType", nullable = TRUE
|-name = "taste", type = "StringType", nullable = TRUE

In [9]:
# Split into train & test sets
seed <- 12345
train_df <- sample(sdf, withReplacement=FALSE, fraction=0.7, seed=seed)
test_df <- except (sdf, train_df)
dim(train_df)
dim(test_df)

In [10]:
# Train RF model
model <- spark.randomForest(train_df, taste ~ ., type="classification", numTrees=30, seed=seed)
head(summary(model))

In [11]:
# Predict on test data
predictions <- predict(model, test_df)
prediction_df <- collect(select(predictions, "id", "prediction"))

In [12]:
# Evaluate
library(dplyr)

actual_vs_predicted <- dplyr::inner_join(as.data.frame(sdf), prediction_df, "id") %>%
    dplyr::select (id, actual=taste, predicted=prediction)
mean(actual_vs_predicted$actual == actual_vs_predicted$predicted)
table(actual_vs_predicted$actual, actual_vs_predicted$predicted)


Attaching package: 'dplyr'

The following objects are masked from 'package:SparkR':

    arrange, between, coalesce, collect, contains, count, cume_dist,
    dense_rank, desc, distinct, explain, expr, filter, first, group_by,
    intersect, lag, last, lead, mutate, n, n_distinct, ntile,
    percent_rank, rename, row_number, sample_frac, select, sql,
    summarize, union

The following objects are masked from 'package:stats':

    filter, lag

The following objects are masked from 'package:base':

    intersect, setdiff, setequal, union



         
          average bad good
  average     443 165   73
  bad         163 342    7
  good        153  11  135

In [13]:
# Examine first few rows of actual vs predicted wine quality values
head(actual_vs_predicted)

id,actual,predicted
5,average,average
6,average,bad
8,average,bad
9,average,bad
16,good,good
17,average,bad


In [14]:
# Save model (NOTE:  path must not exist)
write.ml(model, "wine-RF-model")