# Saving Data from DataFrames

In [2]:
from pyspark.sql import SparkSession

# Obtain the SparkSession for this notebook: reuses an already-running
# session if there is one, otherwise creates a new one.
spark = (
    SparkSession.builder
    .getOrCreate()
)

In [3]:
# Location of the input data, relative to this notebook.
data_path = '../Data'
# Full path of the CSV file that is loaded below.
file_path = f'{data_path}/location_temp.csv'

In [4]:
# Load the CSV, treating the first line as column names.
# No schema is supplied or inferred, so every column is read as a string.
df = spark.read.csv(file_path, header=True)

In [5]:
# Preview the first 10 rows to confirm the file loaded as expected.
df.show(n=10)

+-------------------+-----------+------------+
|         event_date|location_id|temp_celcius|
+-------------------+-----------+------------+
|03/04/2019 19:48:06|       loc0|          29|
|03/04/2019 19:53:06|       loc0|          27|
|03/04/2019 19:58:06|       loc0|          28|
|03/04/2019 20:03:06|       loc0|          30|
|03/04/2019 20:08:06|       loc0|          27|
|03/04/2019 20:13:06|       loc0|          27|
|03/04/2019 20:18:06|       loc0|          27|
|03/04/2019 20:23:06|       loc0|          29|
|03/04/2019 20:28:06|       loc0|          32|
|03/04/2019 20:33:06|       loc0|          35|
+-------------------+-----------+------------+
only showing top 10 rows



-------

# Save CSV file to the disk

In [33]:
# Write the DataFrame as CSV. Spark creates a *directory* named
# 'temp_data.csv' containing one part file per partition.
# mode='overwrite' makes this cell safe to re-run: the default save mode
# ('errorifexists') raises an error when the target path already exists.
# No header row is written (the default), so the part files hold data rows only.
df.write.csv('temp_data.csv', mode='overwrite')

# Using a shell command to check the output
- We can see that Spark splits a DataFrame into partitions and writes each partition separately, so `temp_data.csv` is actually a directory.
- In this case there are 4 partitions, and each partition has its own file.

In [35]:
! ls temp_data.csv

_SUCCESS
part-00000-6daf7576-236d-4305-bea9-f9be7e1a9715-c000.csv
part-00001-6daf7576-236d-4305-bea9-f9be7e1a9715-c000.csv
part-00002-6daf7576-236d-4305-bea9-f9be7e1a9715-c000.csv
part-00003-6daf7576-236d-4305-bea9-f9be7e1a9715-c000.csv


# View data from the file
- Because Spark created a directory, we need to use the directory name and then look at a specific file inside it.
- View the data from the 1st partition of the `temp_data.csv` output.

In [36]:
! head temp_data.csv/part-00000-6daf7576-236d-4305-bea9-f9be7e1a9715-c000.csv

03/04/2019 19:48:06,loc0,29
03/04/2019 19:53:06,loc0,27
03/04/2019 19:58:06,loc0,28
03/04/2019 20:03:06,loc0,30
03/04/2019 20:08:06,loc0,27
03/04/2019 20:13:06,loc0,27
03/04/2019 20:18:06,loc0,27
03/04/2019 20:23:06,loc0,29
03/04/2019 20:28:06,loc0,32
03/04/2019 20:33:06,loc0,35


-------

# Save JSON file to the disk

In [37]:
# Write the DataFrame as JSON, one JSON object per line. As with CSV, Spark
# creates a directory ('temp_data.json') with one part file per partition.
# mode='overwrite' makes this cell safe to re-run: the default save mode
# ('errorifexists') raises an error when the target path already exists.
df.write.json('temp_data.json', mode='overwrite')

In [38]:
! ls temp_data.json

_SUCCESS
part-00000-8acb09bc-eff2-4645-bf2c-edc70f3277f9-c000.json
part-00001-8acb09bc-eff2-4645-bf2c-edc70f3277f9-c000.json
part-00002-8acb09bc-eff2-4645-bf2c-edc70f3277f9-c000.json
part-00003-8acb09bc-eff2-4645-bf2c-edc70f3277f9-c000.json


We can see that the data are now in JSON format instead of CSV.

In [39]:
! head temp_data.json/part-00000-8acb09bc-eff2-4645-bf2c-edc70f3277f9-c000.json

{"event_date":"03/04/2019 19:48:06","location_id":"loc0","temp_celcius":"29"}
{"event_date":"03/04/2019 19:53:06","location_id":"loc0","temp_celcius":"27"}
{"event_date":"03/04/2019 19:58:06","location_id":"loc0","temp_celcius":"28"}
{"event_date":"03/04/2019 20:03:06","location_id":"loc0","temp_celcius":"30"}
{"event_date":"03/04/2019 20:08:06","location_id":"loc0","temp_celcius":"27"}
{"event_date":"03/04/2019 20:13:06","location_id":"loc0","temp_celcius":"27"}
{"event_date":"03/04/2019 20:18:06","location_id":"loc0","temp_celcius":"27"}
{"event_date":"03/04/2019 20:23:06","location_id":"loc0","temp_celcius":"29"}
{"event_date":"03/04/2019 20:28:06","location_id":"loc0","temp_celcius":"32"}
{"event_date":"03/04/2019 20:33:06","location_id":"loc0","temp_celcius":"35"}
