In [2]:
# findspark locates the local Spark installation and makes the pyspark package
# importable; init() is called here, ahead of the pyspark imports in the
# following cells.
import findspark
findspark.init()

In [3]:
from pyspark.ml import Pipeline
from pyspark.ml.regression import DecisionTreeRegressor as DTR
from pyspark.ml.regression import RandomForestRegressor as RFR
from pyspark.ml.regression import GBTRegressor as GBT
from pyspark.ml.feature import VectorIndexer
from pyspark.ml.evaluation import RegressionEvaluator

In [4]:
from pyspark.sql import SparkSession

In [6]:
# Reuse an already-running session if there is one, otherwise start a new one
# registered under the application name "regressors".
session_builder = SparkSession.builder.appName("regressors")
spark = session_builder.getOrCreate()

In [8]:
# Read the LIBSVM-formatted sample file into a DataFrame; the later cells use
# its "label" and "features" columns.
libsvm_reader = spark.read.format("libsvm")
df = libsvm_reader.load("newd/sample_libsvm.txt")

In [9]:
# Configure the indexer: features with at most 4 distinct values are treated
# as categorical and re-indexed, the rest are passed through as continuous.
indexer = VectorIndexer(
    inputCol="features",
    outputCol="indexedFeatures",
    maxCategories=4,
)

# Fit on the full dataset (before the train/test split) so the fitted model
# has seen every value present in the data.
featureIndexer = indexer.fit(df)

In [10]:
# Hold out roughly 20% of the rows for evaluation. A fixed seed makes the
# split repeatable, so row counts and metrics computed from these frames are
# comparable from one run of the notebook to the next.
train, test = df.randomSplit([0.8, 0.2], seed=42)

In [11]:
# Sanity check: number of rows that landed in the training split.
train.count()

80

In [12]:
# Three tree-based regressors, all reading the indexed feature vector.
# An explicit seed is passed to each estimator so that any internal sampling
# (e.g. the random forest's bootstrap samples and feature subsets) is
# repeatable between runs.
dt = DTR(featuresCol="indexedFeatures", seed=42)    # single decision tree
dt1 = RFR(featuresCol="indexedFeatures", seed=42)   # random forest
dt3 = GBT(featuresCol="indexedFeatures", seed=42)   # gradient-boosted trees

In [14]:
# One two-stage pipeline per regressor: the fitted feature indexer first,
# then the estimator (decision tree, random forest, GBT, in that order).
pip, pip1, pip2 = (
    Pipeline(stages=[featureIndexer, regressor])
    for regressor in (dt, dt1, dt3)
)

In [15]:
# Train each pipeline on the training split, in the same order as the
# pipelines were defined.
model, model1, model2 = (
    pipeline.fit(train)
    for pipeline in (pip, pip1, pip2)
)

In [16]:
# Score the held-out split with every fitted pipeline.
pred, pred1, pred2 = (
    fitted.transform(test)
    for fitted in (model, model1, model2)
)

In [17]:
# Preview the first three predictions of each model next to the true label
# (decision tree, random forest, GBT).
for predictions in (pred, pred1, pred2):
    predictions.select("prediction", "label", "features").show(3)

+----------+-----+--------------------+
|prediction|label|            features|
+----------+-----+--------------------+
|       0.0|  0.0|(692,[100,101,102...|
|       0.0|  0.0|(692,[124,125,126...|
|       0.0|  0.0|(692,[125,126,127...|
+----------+-----+--------------------+
only showing top 3 rows

+----------+-----+--------------------+
|prediction|label|            features|
+----------+-----+--------------------+
|      0.05|  0.0|(692,[100,101,102...|
|       0.0|  0.0|(692,[124,125,126...|
|      0.15|  0.0|(692,[125,126,127...|
+----------+-----+--------------------+
only showing top 3 rows

+----------+-----+--------------------+
|prediction|label|            features|
+----------+-----+--------------------+
|       0.0|  0.0|(692,[100,101,102...|
|       0.0|  0.0|(692,[124,125,126...|
|       0.0|  0.0|(692,[125,126,127...|
+----------+-----+--------------------+
only showing top 3 rows



In [19]:
# A single evaluator is shared by all three models: root-mean-squared error
# between the "label" and "prediction" columns.
evaluator = RegressionEvaluator(
    metricName="rmse",
    labelCol="label",
    predictionCol="prediction",
)


In [20]:
# RMSE on the test predictions of the decision tree, random forest and GBT
# models respectively.
rmse, rmse1, rmse2 = map(evaluator.evaluate, (pred, pred1, pred2))

In [21]:
# Report the test-set RMSE of each model, one line per regressor.
for banner, score in (
    ("RMSE OF TEST DATA USING DecisionTreeRegressor ", rmse),
    ("RMSE OF TEST DATA USING RandomForestRegressor ", rmse1),
    ("RMSE OF TEST DATA USING GBTRegressor ", rmse2),
):
    print(banner, score)


RMSE OF TEST DATA USING DecisionTreeRegressor  0.22360679774997896
RMSE OF TEST DATA USING RandomForestRegressor  0.22994564575133836
RMSE OF TEST DATA USING GBTRegressor  0.22360679774997896


In [24]:
# Each pipeline was built as [featureIndexer, regressor], so stage index 1 of
# the fitted pipeline is the trained regression model itself.
treemodel, rfModel, gbtModel = (
    fitted.stages[1]
    for fitted in (model, model1, model2)
)

# Print a one-line summary of every trained model.
for stage in (treemodel, rfModel, gbtModel):
    print(stage)

DecisionTreeRegressionModel: uid=DecisionTreeRegressor_7fc8c61ef94d, depth=1, numNodes=3, numFeatures=692
RandomForestRegressionModel: uid=RandomForestRegressor_5a9f8e88c363, numTrees=20, numFeatures=692
GBTRegressionModel: uid=GBTRegressor_6c8af4167c81, numTrees=20, numFeatures=692
