# Spark sample showing read/write methods
In this sample notebook, we will read CSV files from HDFS into a Spark data frame, write the data out as Parquet files, and save a Hive table definition for it. We will also run some Spark SQL commands against the resulting Hive tables.


In [1]:
# Load the clickstream CSV file(s) into a Spark data frame.
# No header option is set on the reader, so the column names are assigned
# positionally through toDF(); column types come from schema inference.
clickstream_columns = [
    "wcs_click_date_sk",
    "wcs_click_time_sk",
    "wcs_sales_sk",
    "wcs_item_sk",
    "wcs_web_page_sk",
    "wcs_user_sk",
]
clickstream_reader = spark.read.option("inferSchema", "true")
results = clickstream_reader.csv("/clickstream_data").toDF(*clickstream_columns)

# Print the inferred schema, then preview the top rows
results.printSchema()
results.show()

root
 |-- wcs_click_date_sk: integer (nullable = true)
 |-- wcs_click_time_sk: integer (nullable = true)
 |-- wcs_sales_sk: integer (nullable = true)
 |-- wcs_item_sk: integer (nullable = true)
 |-- wcs_web_page_sk: integer (nullable = true)
 |-- wcs_user_sk: integer (nullable = true)

+-----------------+-----------------+------------+-----------+---------------+-----------+
|wcs_click_date_sk|wcs_click_time_sk|wcs_sales_sk|wcs_item_sk|wcs_web_page_sk|wcs_user_sk|
+-----------------+-----------------+------------+-----------+---------------+-----------+
|            36890|            40052|        null|       4379|             34|       null|
|            36890|            41285|        null|       6245|             34|       null|
|            36890|            23115|        null|      13852|             34|       null|
|            36890|            17702|        null|      15975|             34|       null|
|            36890|            62676|        null|       2119|             3

In [1]:
# Turn off the Hadoop output committer setting that marks successful jobs,
# so no SUCCESS marker file is saved next to the data files
hadoop_conf = sc._jsc.hadoopConfiguration()
hadoop_conf.set("mapreduce.fileoutputcommitter.marksuccessfuljobs", "false")

# Show the current warehouse directory, where the parquet files will be stored
warehouse_dir = spark.conf.get("spark.sql.warehouse.dir")
print(warehouse_dir)

# Write the data frame as parquet and register it as the Hive table
# "web_clickstreams", replacing any existing table of that name
results.write.saveAsTable("web_clickstreams", format="parquet", mode="overwrite")


hdfs:///user/hive/warehouse

In [1]:
# Execute Spark SQL commands against the web_clickstreams Hive table

# Preview rows from the table (the query is limited to 100 rows; show()
# prints only the first rows of the result)
sqlDF = spark.sql("SELECT * FROM web_clickstreams LIMIT 100")
sqlDF.show()

# Top 100 users by number of clicks, ignoring clicks with no user key.
# A triple-quoted string is used instead of backslash line-continuations
# inside the literal, which break if any whitespace follows the backslash.
# The aggregate is aliased so the result column has a readable name.
sqlDF = spark.sql("""
    SELECT wcs_user_sk, COUNT(*) AS click_count
      FROM web_clickstreams
     WHERE wcs_user_sk IS NOT NULL
     GROUP BY wcs_user_sk
     ORDER BY click_count DESC
     LIMIT 100
""")
sqlDF.show()

+-----------------+-----------------+------------+-----------+---------------+-----------+
|wcs_click_date_sk|wcs_click_time_sk|wcs_sales_sk|wcs_item_sk|wcs_web_page_sk|wcs_user_sk|
+-----------------+-----------------+------------+-----------+---------------+-----------+
|            36890|            40052|        null|       4379|             34|       null|
|            36890|            41285|        null|       6245|             34|       null|
|            36890|            23115|        null|      13852|             34|       null|
|            36890|            17702|        null|      15975|             34|       null|
|            36890|            62676|        null|       2119|             34|       null|
|            36890|            34267|        null|      10273|             34|       null|
|            36890|             8502|        null|      17790|             34|       null|
|            36890|            54340|        null|       3453|             34|       null|

In [1]:
# Load the product review CSV files into a Spark data frame.
# No header option is set on the reader, so the two columns are named
# positionally through toDF(); column types come from schema inference.
review_columns = ["pr_review_sk", "pr_review_content"]
review_reader = spark.read.option("inferSchema", "true")
results = review_reader.csv("/product_review_data").toDF(*review_columns)

# Print the inferred schema, then preview the top rows
results.printSchema()
results.show()

root
 |-- pr_review_sk: integer (nullable = true)
 |-- pr_review_content: string (nullable = true)

+------------+--------------------+
|pr_review_sk|   pr_review_content|
+------------+--------------------+
|       72621|Works fine. Easy ...|
|       89334|great product to ...|
|       89335|Next time will go...|
|       84259|Great Gift Great ...|
|       84398|After trip to Par...|
|       66434|Simply the best t...|
|       66501|This is the exact...|
|       66587|Not super magnet;...|
|       66680|Installed as bath...|
|       66694|Our home was buil...|
|       84489|Hi ;We are runnin...|
|       79052|Terra cotta is th...|
|       73034|One of my fingern...|
|       73298|We installed thes...|
|       66810|needed silicone c...|
|       66912|Great Gift Great ...|
|       67028|Laguiole knives a...|
|       89770|Good sound timers...|
|       84679|AWESOME FEEDBACK ...|
|       84953|love the retro gl...|
+------------+--------------------+
only showing top 20 rows

In [1]:
# Write the data frame as parquet and register it as the Hive table
# "product_reviews", replacing any existing table of that name
results.write.saveAsTable("product_reviews", format="parquet", mode="overwrite")


In [1]:
# Run a Spark SQL query on the product_reviews Hive table: for each review key,
# compute the character length of the review text
review_length_query = (
    "SELECT pr_review_sk, CHAR_LENGTH(pr_review_content) as len "
    "FROM product_reviews LIMIT 100"
)
sqlDF = spark.sql(review_length_query)
sqlDF.show()

+------------+----+
|pr_review_sk| len|
+------------+----+
|       14868| 985|
|       14869|1601|
|       14875|1221|
|       14880| 665|
|       14886|  91|
|       14894| 697|
|       14899| 356|
|       14903|2361|
|       14908| 872|
|       14909|  74|
|       14917| 908|
|       14918|  50|
|       14919| 256|
|       14921| 723|
|       14925| 313|
|       14931|1304|
|       14939|1023|
|       14949| 552|
|       14954|2144|
|       14955| 123|
+------------+----+
only showing top 20 rows