In [3]:
from pyspark.sql.functions import col
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.sql.types import DoubleType
from pyspark.sql import SparkSession


+--------------------+-----+
|            features|label|
+--------------------+-----+
|[0.0,0.0020000000...|  0.0|
|[0.0,0.0020000000...|  0.0|
|[0.0,0.0020000000...|  0.0|
|[0.0,0.0020000000...|  0.0|
|[0.0,0.0020000000...|  0.0|
|[0.0,0.0030000000...|  0.0|
|[0.0,0.0030000000...|  0.0|
|[0.0,0.0030000000...|  0.0|
|[0.0,0.0030000000...|  0.0|
|[0.0,0.0030000000...|  0.0|
|[0.0,0.0030000000...|  0.0|
|[0.0,0.0030000000...|  0.0|
|[0.0,0.0030000000...|  0.0|
|[0.0,0.0030000000...|  0.0|
|[0.0,0.0030000000...|  0.0|
|[0.0,0.0030000000...|  0.0|
|[0.0,0.0030000000...|  0.0|
|[0.0,0.0030000000...|  0.0|
|[0.0,0.0030000000...|  0.0|
|[0.0,0.0030000000...|  0.0|
+--------------------+-----+
only showing top 20 rows



IllegalArgumentException: Volume does not exist. Available: features, label, prediction

In [7]:

spark = SparkSession.builder.appName("new").getOrCreate()

# Load data as a PySpark DataFrame
df = spark.read.parquet("./stats.parquet")

df = df.select('vol_moving_avg', 'adj_close_rolling_med','Date','Volume')

# Convert 'Date' column to datetime type
df = df.withColumn('Date', col('Date').cast('timestamp'))

# Remove rows with NaN values
df = df.dropna()

# Select features and target
features = ['vol_moving_avg', 'adj_close_rolling_med']
target = 'Volume'

assembler = VectorAssembler(inputCols=features, outputCol="features")
df = assembler.transform(df)
df = df.select('features',target).withColumnRenamed(target, 'label')

(train_df, test_df) = df.randomSplit([0.8, 0.2], seed=42)

train_df.show()

# Create a RandomForestRegressor model
rf = RandomForestRegressor(seed=42)

print(">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>")

model = rf.fit(train_df)
print(">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> modelfit")

# Make predictions on test data
predictions = model.transform(test_df)
print(">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> pr")

# Calculate the Mean Absolute Error and Mean Squared Error
evaluator = RegressionEvaluator(predictionCol='prediction', labelCol="label", metricName='mae')
mae = evaluator.evaluate(predictions)

print(">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> mae")
evaluator = RegressionEvaluator(predictionCol='prediction', labelCol="label", metricName='mse')
mse = evaluator.evaluate(predictions)


+--------------------+-----+
|            features|label|
+--------------------+-----+
|[0.0,0.0020000000...|  0.0|
|[0.0,0.0020000000...|  0.0|
|[0.0,0.0020000000...|  0.0|
|[0.0,0.0020000000...|  0.0|
|[0.0,0.0020000000...|  0.0|
|[0.0,0.0030000000...|  0.0|
|[0.0,0.0030000000...|  0.0|
|[0.0,0.0030000000...|  0.0|
|[0.0,0.0030000000...|  0.0|
|[0.0,0.0030000000...|  0.0|
|[0.0,0.0030000000...|  0.0|
|[0.0,0.0030000000...|  0.0|
|[0.0,0.0030000000...|  0.0|
|[0.0,0.0030000000...|  0.0|
|[0.0,0.0030000000...|  0.0|
|[0.0,0.0030000000...|  0.0|
|[0.0,0.0030000000...|  0.0|
|[0.0,0.0030000000...|  0.0|
|[0.0,0.0030000000...|  0.0|
|[0.0,0.0030000000...|  0.0|
+--------------------+-----+
only showing top 20 rows

>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> modelfit
>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> pr
>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> mae


In [8]:
model.save('./randomForestpredictionmodel')