# Spark sample showing read/write methods
In this sample notebook, we will read a CSV file from HDFS, write it out as a Parquet file, and save a Hive table definition. We will then run some Spark SQL commands against that Hive table.


In [1]:
# Load the web clickstream CSV into a Spark DataFrame with schema inference
# turned on, then display the resulting schema and the leading rows.

# Names applied positionally to the columns read from the file.
clickstream_columns = [
    "wcs_click_date_sk",
    "wcs_click_time_sk",
    "wcs_sales_sk",
    "wcs_item_sk",
    "wcs_web_page_sk",
    "wcs_user_sk",
]

csv_reader = spark.read.option("inferSchema", "true")
results = csv_reader.csv('/clickstream_data/web_clickstreams.csv').toDF(*clickstream_columns)

results.printSchema()
results.show()

root
 |-- wcs_click_date_sk: integer (nullable = true)
 |-- wcs_click_time_sk: integer (nullable = true)
 |-- wcs_sales_sk: integer (nullable = true)
 |-- wcs_item_sk: integer (nullable = true)
 |-- wcs_web_page_sk: integer (nullable = true)
 |-- wcs_user_sk: integer (nullable = true)

+-----------------+-----------------+------------+-----------+---------------+-----------+
|wcs_click_date_sk|wcs_click_time_sk|wcs_sales_sk|wcs_item_sk|wcs_web_page_sk|wcs_user_sk|
+-----------------+-----------------+------------+-----------+---------------+-----------+
|            36890|            40052|        null|       4379|             34|       null|
|            36890|            41285|        null|       6245|             34|       null|
|            36890|            23115|        null|      13852|             34|       null|
|            36890|            17702|        null|      15975|             34|       null|
|            36890|            62676|        null|       2119|             3

In [1]:
# Turn off the output committer's "mark successful jobs" setting so that no
# SUCCESS marker file is saved next to the written data.
hadoop_conf = sc._jsc.hadoopConfiguration()
hadoop_conf.set("mapreduce.fileoutputcommitter.marksuccessfuljobs", "false")

# Look up and print the warehouse directory currently configured for Spark SQL.
warehouse_dir = spark.conf.get("spark.sql.warehouse.dir")
print(warehouse_dir)

# Persist the DataFrame in parquet format as the table "web_clickstreams",
# using overwrite mode for the write.
parquet_writer = results.write.format("parquet").mode("overwrite")
parquet_writer.saveAsTable("web_clickstreams")


hdfs:///user/hive/warehouse

In [1]:
# Run Spark SQL statements against the web_clickstreams table.

# First query: select every column, limited to 100 rows, and display the result.
preview_query = "SELECT * FROM web_clickstreams LIMIT 100"
sqlDF = spark.sql(preview_query)
sqlDF.show()

# Second query: count rows per non-null wcs_user_sk, ordered by that count
# descending and limited to 100 rows. The backslash line continuations keep
# the statement a single string literal (leading spaces included).
clicks_per_user_query = "SELECT wcs_user_sk, COUNT(*)\
                     FROM web_clickstreams\
                    WHERE wcs_user_sk IS NOT NULL\
                   GROUP BY wcs_user_sk\
                   ORDER BY COUNT(*) DESC LIMIT 100"
sqlDF = spark.sql(clicks_per_user_query)
sqlDF.show()

+-----------------+-----------------+------------+-----------+---------------+-----------+
|wcs_click_date_sk|wcs_click_time_sk|wcs_sales_sk|wcs_item_sk|wcs_web_page_sk|wcs_user_sk|
+-----------------+-----------------+------------+-----------+---------------+-----------+
|            37506|             7933|        null|       1384|              2|      39437|
|            37506|            56044|        null|      14689|              2|      26419|
|            37506|            52706|        null|       8541|              2|      44016|
|            37506|            67325|        null|      16129|              2|      83371|
|            37506|            84857|        null|       1869|              2|      13090|
|            37506|            49599|        null|       2994|              2|       8940|
|            37506|            78150|        null|      11392|              2|      65633|
|            37506|            38720|        null|      14366|              2|      22281|