In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
from pyspark.sql.types import FloatType
from pyspark.sql.functions import col, date_format, to_timestamp
from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml import Pipeline


In [2]:
spark = SparkSession.builder.appName("Beer analysis").getOrCreate()

train = spark.read.csv(
    # "gs://shaun05122024/brewery_data_complete_extended.csv",
    "data/15_percent_validation_data.csv",
    header=True,
    inferSchema=True,
)

In [3]:
train.count()

1500000

In [4]:
train = train.dropDuplicates()
train = train.na.drop()

In [5]:
train = train.withColumn("Total_Sales", col("Total_Sales").cast(FloatType()))

train = train.withColumn(
    "Brew_Date", to_timestamp(col("Brew_Date"), "yyyy-MM-dd HH:mm:ss")
)
train = (
    train.withColumn("Month", date_format(col("Brew_Date"), "MM"))
    .withColumn("Day", date_format(col("Brew_Date"), "dd"))
    .withColumn("Year", date_format(col("Brew_Date"), "yyyy"))
)

In [6]:
train.show(3)

+--------+-------------------+----------+-------+-----------+-----------------+------------------+-----------------+------------------+-----------------+----------+-----+----------------+---------------+-----------+-----------------+--------------------+-------------------+------------------------+----------------------------+-----+---+----+
|Batch_ID|          Brew_Date|Beer_Style|    SKU|   Location|Fermentation_Time|       Temperature|         pH_Level|           Gravity|  Alcohol_Content|Bitterness|Color|Ingredient_Ratio|Volume_Produced|Total_Sales|    Quality_Score|Brewhouse_Efficiency|Loss_During_Brewing|Loss_During_Fermentation|Loss_During_Bottling_Kegging|Month|Day|Year|
+--------+-------------------+----------+-------+-----------+-----------------+------------------+-----------------+------------------+-----------------+----------+-----+----------------+---------------+-----------+-----------------+--------------------+-------------------+------------------------+-------------

In [7]:
categorical_columns = ['Beer_Style', 'SKU', 'Location']

In [8]:
indexers = [
    StringIndexer(inputCol=c, outputCol="{0}_indexed".format(c))
    for c in categorical_columns
]

In [9]:
numeric_columns = ['Fermentation_Time', 'Temperature', 'pH_Level', 'Gravity', 'Alcohol_Content', 'Bitterness', 'Color', 'Volume_Produced', 'Quality_Score', 'Brewhouse_Efficiency', 'Loss_During_Brewing', 'Loss_During_Fermentation', 'Loss_During_Bottling_Kegging']

In [10]:
assembler_inputs = [c + "_indexed" for c in categorical_columns] + numeric_columns

In [11]:
assembler = VectorAssembler(
    inputCols=assembler_inputs,
    outputCol="features",
)

In [12]:
train_data, test_data = train.randomSplit([0.8, 0.2], seed=42)

In [13]:
rf = RandomForestRegressor(featuresCol="features", labelCol="Total_Sales")

In [14]:
pipeline = Pipeline(stages=indexers + [assembler, rf])

In [15]:
paramGrid = ParamGridBuilder() \
    .addGrid(rf.numTrees, [10, 20, 50]) \
    .addGrid(rf.maxDepth, [5, 10, 20]) \
    .build()


# Initialize the CrossValidator
crossval = CrossValidator(estimator=pipeline,
                          estimatorParamMaps=paramGrid,
                          evaluator=RegressionEvaluator(labelCol="Total_Sales", predictionCol="prediction", metricName="rmse"),
                          numFolds=3)


In [16]:
model = crossval.fit(train_data)

ERROR:root:Exception while sending command.
Traceback (most recent call last):
  File "c:\Users\smend\miniconda3\envs\stevens\Lib\site-packages\py4j\clientserver.py", line 511, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
                          ^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\smend\miniconda3\envs\stevens\Lib\socket.py", line 706, in readinto
    return self._sock.recv_into(b)
           ^^^^^^^^^^^^^^^^^^^^^^^
ConnectionResetError: [WinError 10054] An existing connection was forcibly closed by the remote host

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "c:\Users\smend\miniconda3\envs\stevens\Lib\site-packages\py4j\java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\smend\miniconda3\envs\stevens\Lib\site-packages\py4j\clientserver.py", line 539, in send_command
    raise Py4JNet

ConnectionRefusedError: [WinError 10061] No connection could be made because the target machine actively refused it

In [None]:
predictions = model.transform(test_data)

evaluator = RegressionEvaluator(
    labelCol="Total_Sales", predictionCol="prediction", metricName="rmse"
)
rmse = evaluator.evaluate(predictions)
print(f"Root Mean Squared Error (RMSE) on test data = {rmse}")

evaluator = RegressionEvaluator(
    labelCol="Total_Sales", predictionCol="prediction", metricName="mse"
)

mse = evaluator.evaluate(predictions)
print(f"Mean Squared Error on test data = {mse}")

Root Mean Squared Error (RMSE) on test data = 5489.688546159573
Mean Squared Error on test data = 30136680.3338356
