In [37]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml.evaluation import RegressionEvaluator
spark = SparkSession.builder.appName('ReadCSV').getOrCreate()

In [38]:
numPartition = 5
# train_data = sc.textFile('./data/train.csv',numPartition).map(lambda line: line.split(","))
# test_data = sc.textFile('./data/test.csv',numPartition).map(lambda line: line.split(","))
# header = train_data.first()
# train_data = train_data.filter(lambda line: line != header)
# test_data = test_data.filter(lambda line: line != header)
train_df = spark.read.csv('./data/train.csv',header=True,inferSchema=True).repartition(numPartition)
test_df = spark.read.csv('./data/test.csv',header=True,inferSchema=True).repartition(numPartition)
train_df.show()

+-----+---------+--------+-------+-------+-----+----------------+----------------+--------------------+----------------+----------------+--------------------+-----------+------------------+-----------+-----------+-----------+----------+
|   id|clonesize|honeybee|bumbles|andrena|osmia|MaxOfUpperTRange|MinOfUpperTRange|AverageOfUpperTRange|MaxOfLowerTRange|MinOfLowerTRange|AverageOfLowerTRange|RainingDays|AverageRainingDays|   fruitset|  fruitmass|      seeds|     yield|
+-----+---------+--------+-------+-------+-----+----------------+----------------+--------------------+----------------+----------------+--------------------+-----------+------------------+-----------+-----------+-----------+----------+
|11383|     25.0|     0.5|   0.25|   0.38| 0.75|            69.7|            42.1|                58.2|            50.2|            24.3|                41.2|       24.0|              0.39|0.494677622|0.433081146|33.99420814|5643.36045|
|15110|     12.5|    0.25|   0.25|   0.75| 0.75|    

In [42]:
input_cols = train_df.columns[:-1]
train_output_col = train_df.columns[-1]

assembler = VectorAssembler(inputCols=input_cols,outputCol="features")
train_data = assembler.transform(train_df).select("features",train_output_col)
(train_data, val_data) = train_data.randomSplit([0.7,0.3])
test_data = assembler.transform(test_df)

rf = RandomForestRegressor(featuresCol = "features", labelCol = "yield",)
# print("LogisticRegression parameters:\n" + lr.explainParams() + "\n")
model = rf.fit(train_data)
predictions = model.transform(val_data)
predictions.select("prediction", "yield", "features").show(5)

evaluator = RegressionEvaluator(
    labelCol="yield", predictionCol="prediction", metricName="rmse")
rmse = evaluator.evaluate(predictions)
print("Root Mean Squared Error (RMSE) on test data = %g" % rmse)

print(model)  # summary only

+------------------+----------+--------------------+
|        prediction|     yield|            features|
+------------------+----------+--------------------+
| 4917.964478479442|4476.81146|[0.0,25.0,0.5,0.2...|
| 3892.121636462323|4112.89349|[22.0,25.0,0.5,0....|
|6117.2562367932605|6687.25926|[23.0,12.5,0.25,0...|
| 4625.148697969389|4641.00151|[28.0,12.5,0.25,0...|
| 7083.189459823967|8263.28974|[46.0,12.5,0.25,0...|
+------------------+----------+--------------------+
only showing top 5 rows

Root Mean Squared Error (RMSE) on test data = 583.535
RandomForestRegressionModel: uid=RandomForestRegressor_997f0795b19d, numTrees=20, numFeatures=17
