In [3]:
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.regression import LinearRegression
from pyspark.ml.tuning import ParamGridBuilder, TrainValidationSplit

# Fit a linear regression with hyper-parameter selection via TrainValidationSplit,
# then show predictions for the held-out test rows.
#
# NOTE(review): `spark` is not created in this cell; it is assumed to be an
# active SparkSession supplied by the kernel or another cell -- confirm.

# Tunable inputs, gathered in one place so a reader can change them without
# hunting through the pipeline below.
DATA_PATH = "/home/ec2-user/project/sparkML-labelNoise/spencer/sum_tab_1.data"
SEED = 12345  # shared by every random split so a re-run reproduces the same model

# Prepare training and test data (90% / 10%).
data = spark.read.format("libsvm").load(DATA_PATH)
train, test = data.randomSplit([0.9, 0.1], seed=SEED)

lr = LinearRegression(maxIter=10)

# We use a ParamGridBuilder to construct a grid of parameters to search over.
# TrainValidationSplit will try all combinations of values (2 x 2 x 3 = 12 here)
# and determine the best model using the evaluator.
paramGrid = (
    ParamGridBuilder()
    .addGrid(lr.regParam, [0.1, 0.01])
    .addGrid(lr.fitIntercept, [False, True])
    .addGrid(lr.elasticNetParam, [0.0, 0.5, 1.0])
    .build()
)

# In this case the estimator is simply the linear regression.
# A TrainValidationSplit requires an Estimator, a set of Estimator ParamMaps, and an Evaluator.
tvs = TrainValidationSplit(
    estimator=lr,
    estimatorParamMaps=paramGrid,
    # Default RegressionEvaluator settings (RMSE according to the PySpark docs).
    evaluator=RegressionEvaluator(),
    # 80% of the data will be used for training, 20% for validation.
    trainRatio=0.8,
    # Pin the internal train/validation split; without an explicit seed the
    # split -- and therefore the selected parameters -- is not guaranteed to be
    # the same from one kernel session to the next.
    seed=SEED,
)

# Run TrainValidationSplit, and choose the best set of parameters.
model = tvs.fit(train)

# Make predictions on test data. model is the model with the combination of
# parameters that performed best.
model.transform(test)\
    .select("features", "label", "prediction")\
    .show()

+--------------+-----+--------------------+
|      features|label|          prediction|
+--------------+-----+--------------------+
| (1,[0],[6.0])|  0.0|0.018784371944667554|
|(1,[0],[11.0])|  0.0| 0.03443801523189051|
|(1,[0],[57.0])|  0.0| 0.17845153347434176|
|(1,[0],[76.0])|  0.0| 0.23793537796578898|
|(1,[0],[47.0])|  1.0| 0.14714424689989583|
|(1,[0],[65.0])|  1.0| 0.20349736273389848|
+--------------+-----+--------------------+



In [2]:
# Shell escape: recursively list every path under the notebook's current
# working directory (absolute paths, because find is given "$(pwd)").
# Nothing is filtered, so the .git internals appear in the listing as well.
!find "$(pwd)"

/home/ec2-user/project/sparkML-labelNoise
/home/ec2-user/project/sparkML-labelNoise/.git
/home/ec2-user/project/sparkML-labelNoise/.git/branches
/home/ec2-user/project/sparkML-labelNoise/.git/description
/home/ec2-user/project/sparkML-labelNoise/.git/hooks
/home/ec2-user/project/sparkML-labelNoise/.git/hooks/applypatch-msg.sample
/home/ec2-user/project/sparkML-labelNoise/.git/hooks/commit-msg.sample
/home/ec2-user/project/sparkML-labelNoise/.git/hooks/post-update.sample
/home/ec2-user/project/sparkML-labelNoise/.git/hooks/pre-applypatch.sample
/home/ec2-user/project/sparkML-labelNoise/.git/hooks/pre-commit.sample
/home/ec2-user/project/sparkML-labelNoise/.git/hooks/pre-push.sample
/home/ec2-user/project/sparkML-labelNoise/.git/hooks/pre-rebase.sample
/home/ec2-user/project/sparkML-labelNoise/.git/hooks/pre-receive.sample
/home/ec2-user/project/sparkML-labelNoise/.git/hooks/prepare-commit-msg.sample
/home/ec2-user/project/sparkML-labelNoise/.git/hooks/update.sample
/home/

In [1]:
import pyspark