In [None]:
from pyspark.sql.functions import col, when
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator


# Read the registered sales table into a Spark DataFrame.
# `spark` is not defined in this notebook; presumably the session object is
# injected by the notebook runtime.
SALES_TABLE = 'sales_data_sample_csv'
df = spark.read.table(SALES_TABLE)


In [None]:
# Inspect the column names and data types of the loaded table; as the last
# expression in the cell, the result is rendered as the cell output.
df.dtypes

In [None]:
# Preview the order-level numeric columns, including the dataset's own SALES
# figure, before any cleaning is applied.
preview_columns = ['QUANTITYORDERED', 'PRICEEACH', 'ORDERLINENUMBER', 'SALES']
display(df.select(*preview_columns))


In [None]:
# Contact and address columns that are not used further in this notebook.
unused_columns = [
    "PHONE",
    "ADDRESSLINE1",
    "ADDRESSLINE2",
    "STATE",
    "POSTALCODE",
    "COUNTRY",
    "CONTACTFIRSTNAME",
    "CONTACTLASTNAME",
]

# One pipeline: drop the unused columns, replace null values with 0, then
# derive "total_sales" as QUANTITYORDERED * PRICEEACH.
df = (
    df.drop(*unused_columns)
    .na.fill(0)
    .withColumn("total_sales", col("QUANTITYORDERED") * col("PRICEEACH"))
)


In [None]:
# Keep only rows with a strictly positive "total_sales", then narrow the frame
# to the two predictors and the label used by the models.
model_columns = ["QUANTITYORDERED", "PRICEEACH", "total_sales"]
df = df.filter(col("total_sales") > 0).select(*model_columns)

In [None]:
from pyspark.ml.feature import VectorAssembler

# Combine the two numeric predictors into a single 'features' vector column,
# then keep only that vector and the 'total_sales' label.
feature_columns = ['QUANTITYORDERED', 'PRICEEACH']
vectorAssembler = VectorAssembler(inputCols=feature_columns, outputCol='features')
vhouse_df = vectorAssembler.transform(df).select('features', 'total_sales')
display(vhouse_df)

In [None]:
# Hold out roughly 20% of the rows for evaluation and train on the rest.
# A fixed seed is passed so the train/test partition (and therefore every
# metric computed in later cells) does not change on each Restart & Run All;
# without a seed, randomSplit samples a different split on every execution.
# NOTE: reproducibility also assumes the input table and its partitioning are
# unchanged between runs.
splits = vhouse_df.randomSplit([0.8, 0.2], seed=42)
train_df, test_df = splits

display(test_df)

In [None]:
from pyspark.ml.regression import LinearRegression

# Linear regression on the assembled features with the regularisation
# settings below (regParam=0.3, elasticNetParam=0.8), capped at 10 iterations.
lr = LinearRegression(
    featuresCol='features',
    labelCol='total_sales',
    maxIter=10,
    regParam=0.3,
    elasticNetParam=0.8,
)
lr_model = lr.fit(train_df)

# Report the fitted parameters.
print(f"Coefficients: {lr_model.coefficients!s}")
print(f"Intercept: {lr_model.intercept!s}")

In [None]:
# Fit-quality metrics taken from the fitted model's summary object
# (these describe the training run, not the held-out test set).
trainingSummary = lr_model.summary
training_rmse = trainingSummary.rootMeanSquaredError
training_r2 = trainingSummary.r2
print("RMSE: %f" % training_rmse)
print("r2: %f" % training_r2)

In [None]:
from pyspark.ml.evaluation import RegressionEvaluator

# Score the held-out rows with the fitted linear model and show the
# predictions next to the true label.
lr_predictions = lr_model.transform(test_df)
display(lr_predictions.select("prediction", "total_sales", "features"))

# Evaluate R^2 of the predictions against the "total_sales" label.
lr_evaluator = RegressionEvaluator(
    predictionCol="prediction",
    labelCol="total_sales",
    metricName="r2",
)
test_r2 = lr_evaluator.evaluate(lr_predictions)
print("R Squared (R2) on test data = %g" % test_r2)

In [None]:
# Apply the fitted linear model to the test rows and print the prediction,
# label and feature vector as a plain-text table.
shown_columns = ["prediction", "total_sales", "features"]
predictions = lr_model.transform(test_df)
predictions.select(*shown_columns).show()

In [None]:
from pyspark.ml.regression import DecisionTreeRegressor

# Fit a decision-tree regressor on the training rows and score the test rows.
dt = DecisionTreeRegressor(featuresCol='features', labelCol='total_sales')
dt_model = dt.fit(train_df)
dt_predictions = dt_model.transform(test_df)

# Measure RMSE of the tree's predictions against the "total_sales" label.
dt_evaluator = RegressionEvaluator(
    labelCol="total_sales",
    predictionCol="prediction",
    metricName="rmse",
)
rmse = dt_evaluator.evaluate(dt_predictions)
print("Root Mean Squared Error (RMSE) on test data = %g" % rmse)

In [None]:
# Print the decision-tree predictions on the test rows as a plain-text table.
dt_predictions.show()

In [None]:
from pyspark.ml.regression import GBTRegressor

# Fit a gradient-boosted-trees regressor (maxIter=10) on the training rows.
gbt = GBTRegressor(
    featuresCol='features',
    labelCol='total_sales',
    maxIter=10,
)
gbt_model = gbt.fit(train_df)

# Score the test rows and print the first five predictions with their labels.
gbt_predictions = gbt_model.transform(test_df)
gbt_output_columns = ['prediction', 'total_sales', 'features']
gbt_predictions.select(*gbt_output_columns).show(5)