In [1]:
# One-time setup, kept commented out so a full re-run does not repeat the copy:
# fetch every object in the yelp-dataset-bucket GCS bucket into the current directory.
#!gsutil cp gs://yelp-dataset-bucket/* .

In [2]:
# One-time setup, kept commented out: upload the review JSON into HDFS under /user/,
# which is the path the Spark read below loads from.
#!hdfs dfs -put yelp_academic_dataset_review.json /user/

In [None]:
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.feature import Tokenizer, HashingTF, IDF
from pyspark.ml.pipeline import Pipeline
from pyspark.ml.regression import LinearRegression
from pyspark.ml.tuning import ParamGridBuilder, TrainValidationSplit

In [3]:
# Load the Yelp review dump from HDFS into a Spark DataFrame.
data = spark.read.json("/user/yelp_academic_dataset_review.json")

In [4]:
# Text-regression pipeline: review text -> TF-IDF features -> predicted star rating.
pipeline = Pipeline(
    stages=[
        # Split the raw review text into word tokens.
        Tokenizer(inputCol="text", outputCol="words"),
        # Hash the tokens into a term-frequency vector.
        HashingTF(inputCol="words", outputCol="term_frequency"),
        # Re-weight term frequencies by inverse document frequency.
        IDF(inputCol="term_frequency", outputCol="features"),
        # Regress the "stars" label on the regressor's default features column.
        LinearRegression(labelCol="stars"),
    ]
)

In [5]:
# Hyper-parameter grid for the LinearRegression stage (the last stage of `pipeline`).
#
# addGrid must be given the stage's Param object, not the parameter's name: a bare
# string is rejected with a TypeError by recent PySpark releases and never matches
# any stage parameter in older ones, so the grid would have no effect.
#
# NOTE: the grid holds a single candidate (no regularisation), so the
# train/validation split below performs no actual search; add more values
# (e.g. 0.01, 0.1, 1.0) to tune regParam.
param_grid = (
    ParamGridBuilder()
    .addGrid(pipeline.getStages()[-1].regParam, [0.0])
    .build()
)

In [6]:
# Work on a ~1% sample while debugging. The fixed seed makes the sample, and every
# metric computed from it, repeatable across runs; the sample is cached because
# several fits and evaluations below reuse it.
debug_data = data.sample(fraction=0.01, seed=42).cache()

In [7]:
# Fit the pipeline once per param map on a random train split and score it on the
# held-out validation split. The seed pins that split so the validation metrics
# are the same after a kernel restart.
models = TrainValidationSplit(
    estimator=pipeline,
    estimatorParamMaps=param_grid,
    evaluator=RegressionEvaluator(labelCol="stars"),
    seed=42,
).fit(debug_data)

In [8]:
# Validation-split score for each candidate in param_grid (a single entry here,
# because the grid contains one param map).
models.validationMetrics

[2.278652871361633]

In [9]:
# Confirm which metric the evaluator reports, i.e. what the numbers above measure.
models.getEvaluator().getMetricName()

'rmse'

In [10]:
# Fit the pipeline directly on the whole debug sample (no hold-out split).
some_model = pipeline.fit(debug_data)

In [11]:
# Score the fitted pipeline on the very sample it was trained on. This is the
# training error, so compare it with the validation metric above rather than
# reading it as a generalisation estimate.
RegressionEvaluator(labelCol="stars").evaluate(some_model.transform(debug_data))

0.21572674528694322

In [20]:
# Persist the fitted pipeline to HDFS. A plain save() raises when the target path
# already exists, which breaks every re-run of this cell; overwrite() replaces the
# previous copy instead.
some_model.write().overwrite().save("/user/some.model")

In [16]:
# List the contents of /user on HDFS.
!hdfs dfs -ls /user

Found 11 items
drwxrwxrwt   - hdfs hadoop          0 2020-12-17 13:17 /user/hbase
drwxrwxrwt   - hdfs hadoop          0 2020-12-17 13:17 /user/hdfs
drwxrwxrwt   - hdfs hadoop          0 2020-12-17 13:17 /user/hive
drwxrwxrwt   - hdfs hadoop          0 2020-12-17 13:17 /user/mapred
drwxrwxrwt   - hdfs hadoop          0 2020-12-17 13:17 /user/pig
-rw-r--r--   2 root hadoop 6325565224 2020-12-17 13:23 /user/qlr
drwxr-xr-x   - root hadoop          0 2020-12-17 13:20 /user/root
drwxrwxrwt   - hdfs hadoop          0 2020-12-17 13:17 /user/spark
drwxrwxrwt   - hdfs hadoop          0 2020-12-17 13:17 /user/yarn
-rw-r--r--   2 root hadoop 6325565224 2020-12-17 13:27 /user/yelp_academic_dataset_review.json
drwxrwxrwt   - hdfs hadoop          0 2020-12-17 13:17 /user/zookeeper


In [17]:
# Inspect /user/qlr. The listing shows a regular file with the same byte size as
# the review dataset -- NOTE(review): presumably a stray earlier upload; confirm
# before removing it.
!hdfs dfs -ls /user/qlr

-rw-r--r--   2 root hadoop 6325565224 2020-12-17 13:23 /user/qlr


In [18]:
# Inspect the HDFS directory /user/root.
!hdfs dfs -ls /user/root

Found 1 items
drwxr-xr-x   - root hadoop          0 2020-12-17 14:00 /user/root/.sparkStaging


In [28]:
# Show the notebook's local working directory (relative local paths resolve against it).
!pwd

/


In [29]:
# Refresh the local copy of the saved model. A saved Spark ML model is a directory,
# so plain `rm` cannot remove it and the following -get would then collide with the
# stale copy; -rf removes the directory and also tolerates a missing target on the
# first run.
!rm -rf /home/qlr/some_model
!hdfs dfs -get /user/some.model /home/qlr/some_model

In [None]:
# Upload the local model directory to Cloud Storage.
# The original command misspelled the binary ("gsutils") and had no source or
# destination. The source is the directory fetched from HDFS in the previous cell.
# NOTE(review): the destination bucket is assumed to be the one the dataset was
# downloaded from -- confirm the intended target before running.
!gsutil cp -r /home/qlr/some_model gs://yelp-dataset-bucket/