# Chapter 5: Loading and Saving your Data (Python)

The examples in this notebook show how to load and save data using the high-level data sources built into Spark SQL.

In [41]:
from pyspark.sql import SparkSession
# Start a Spark session (or reuse the active one) against a local master.
spark = (
    SparkSession.builder
    .appName("Loading-Saving-Data")
    .master("local[*]")
    .getOrCreate()
)
# Keep a handle to the session's underlying SparkContext.
sc = spark.sparkContext

## Parquet Format

Loading data

In [42]:
# Read the sample Parquet file into a DataFrame.
parquet_data = spark.read.format("parquet").load("../data/person.parquet")

In [43]:
# Print the loaded rows to confirm the Parquet file was read.
parquet_data.show()

+----+---+
|Name|Age|
+----+---+
|Raul| 29|
|Javi| 34|
+----+---+



Saving data

In [44]:
# Write the DataFrame out as Parquet, replacing any output left by a previous run.
parquet_data.write.parquet("../data/person_write.parquet", mode="overwrite")

In [45]:
# Round-trip check: read the Parquet output just written and print its rows.
(
    spark.read
    .parquet("../data/person_write.parquet")
    .show()
)

+----+---+
|Name|Age|
+----+---+
|Raul| 29|
|Javi| 34|
+----+---+



## CSV Format

Loading data

In [46]:
# Read the CSV using reader options: take column names from the first line
# and let Spark infer the column types from the data.
csv_data = (
    spark.read
    .option("header", "true")
    .option("inferSchema", "true")  # documented camelCase spelling of the option
    .csv("../data/person.csv")
)

In [47]:
# Print the rows read from the CSV file.
csv_data.show()

+----+---+
|Name|Age|
+----+---+
|Raul| 29|
|Javi| 34|
+----+---+



In [48]:
# Load the same CSV, this time passing the reader options as keyword arguments.
csv_data_bis = spark.read.csv(
    path="../data/person.csv",
    header=True,
    inferSchema=True,
)

In [49]:
# Print the rows loaded through the keyword-argument form of the CSV reader.
csv_data_bis.show()

+----+---+
|Name|Age|
+----+---+
|Raul| 29|
|Javi| 34|
+----+---+



In [50]:
import pyspark.sql.types as T

In [51]:
# Explicit schema for the person CSV: a nullable string "Name" column and a
# nullable integer "Age" column.
schema = (
    T.StructType()
    .add("Name", T.StringType(), True)
    .add("Age", T.IntegerType(), True)
)

In [52]:
# Load the CSV with the explicit schema rather than inferring column types.
csv_data_schema = spark.read.csv(
    path="../data/person.csv",
    schema=schema,
    header=True,
)

In [53]:
# Print the rows loaded with the explicit schema.
csv_data_schema.show()

+----+---+
|Name|Age|
+----+---+
|Raul| 29|
|Javi| 34|
+----+---+



Saving data

In [54]:
# Write the DataFrame as CSV with a header row, replacing any output left by a previous run.
csv_data.write.csv("../data/person_write.csv", mode="overwrite", header=True)

In [55]:
# Round-trip check: read the CSV output just written and print its rows.
(
    spark.read
    .csv("../data/person_write.csv", header=True, inferSchema=True)
    .show()
)

+----+---+
|Name|Age|
+----+---+
|Raul| 29|
|Javi| 34|
+----+---+



## JSON Format

Loading data

In [56]:
# Read the sample JSON file into a DataFrame.
json_data = spark.read.format("json").load("../data/person.json")

In [57]:
# Print the rows read from the JSON file.
json_data.show()

+---+----+
|age|name|
+---+----+
| 29|Raul|
| 33|Javi|
+---+----+



Saving data

In [58]:
# Write the DataFrame out as JSON, replacing any output left by a previous run.
json_data.write.json("../data/person_write.json", mode="overwrite")

In [59]:
# Round-trip check: read the JSON output just written and print its rows.
(
    spark.read
    .json("../data/person_write.json")
    .show()
)

+---+----+
|age|name|
+---+----+
| 29|Raul|
| 33|Javi|
+---+----+

