In [9]:
from pyspark.sql import SparkSession

In [10]:
# Build the session against the master at spark://spark-master:7077 under the
# application name 'load_save'; getOrCreate() hands back an already-running
# session if one exists instead of starting a second one.
session = (
    SparkSession.builder
    .master('spark://spark-master:7077')
    .appName('load_save')
    .getOrCreate()
)

### Reading a file in the default format (Parquet)

In [11]:
# No format is given here, so load() uses Spark's default data source
# (parquet, unless the default has been reconfigured).
users_parquet_path = '/opt/bitnami/spark/data/users.parquet'
data = session.read.load(users_parquet_path)

# Project just the two columns we want to look at, then print them.
wanted_columns = ['name', 'favorite_color']
data_fil = data.select(*wanted_columns)
data_fil.show()

+------+--------------+
|  name|favorite_color|
+------+--------------+
|Alyssa|          NULL|
|   Ben|           red|
+------+--------------+



#### Reading a file with an explicit file format

In [12]:
# Name the file format explicitly: the format keyword tells load() to treat
# the file as JSON rather than falling back to the default data source.
people_json_path = '/opt/bitnami/spark/data/people.json'

people = session.read.load(people_json_path, format='json')
people.show()

+----+-------+
| age|   name|
+----+-------+
|NULL|Michael|
|  30|   Andy|
|  19| Justin|
+----+-------+



#### Importing files with options

In [13]:
# method 1: pass the format and the reader options as keyword arguments to load()
friends_load_kwargs = {
    'format': 'csv',        # parse the file as CSV
    'inferSchema': 'true',  # let Spark infer column types from the data
    'header': 'true',       # first line holds the column names
    'sep': ',',             # field delimiter
}
friends = session.read.load('/opt/bitnami/spark/data/fakefriends-header.csv',
                            **friends_load_kwargs)
friends.show()

+------+--------+---+-------+
|userID|    name|age|friends|
+------+--------+---+-------+
|     0|    Will| 33|    385|
|     1|Jean-Luc| 26|      2|
|     2|    Hugh| 55|    221|
|     3|  Deanna| 40|    465|
|     4|   Quark| 68|     21|
|     5|  Weyoun| 59|    318|
|     6|  Gowron| 37|    220|
|     7|    Will| 54|    307|
|     8|  Jadzia| 38|    380|
|     9|    Hugh| 27|    181|
|    10|     Odo| 53|    191|
|    11|     Ben| 57|    372|
|    12|   Keiko| 54|    253|
|    13|Jean-Luc| 56|    444|
|    14|    Hugh| 43|     49|
|    15|     Rom| 36|     49|
|    16|  Weyoun| 22|    323|
|    17|     Odo| 35|     13|
|    18|Jean-Luc| 45|    455|
|    19|  Geordi| 60|    246|
+------+--------+---+-------+
only showing top 20 rows



In [14]:
# method 2: set each option on the reader with .option(), then call the
# format-specific .csv() loader
friends_reader = (
    session.read
    .option('header', 'true')        # first line holds the column names
    .option('inferSchema', 'true')   # let Spark infer column types from the data
)
friends_o = friends_reader.csv('/opt/bitnami/spark/data/fakefriends-header.csv')
friends_o.show()

+------+--------+---+-------+
|userID|    name|age|friends|
+------+--------+---+-------+
|     0|    Will| 33|    385|
|     1|Jean-Luc| 26|      2|
|     2|    Hugh| 55|    221|
|     3|  Deanna| 40|    465|
|     4|   Quark| 68|     21|
|     5|  Weyoun| 59|    318|
|     6|  Gowron| 37|    220|
|     7|    Will| 54|    307|
|     8|  Jadzia| 38|    380|
|     9|    Hugh| 27|    181|
|    10|     Odo| 53|    191|
|    11|     Ben| 57|    372|
|    12|   Keiko| 54|    253|
|    13|Jean-Luc| 56|    444|
|    14|    Hugh| 43|     49|
|    15|     Rom| 36|     49|
|    16|  Weyoun| 22|    323|
|    17|     Odo| 35|     13|
|    18|Jean-Luc| 45|    455|
|    19|  Geordi| 60|    246|
+------+--------+---+-------+
only showing top 20 rows



#### Running SQL directly on a file

In [16]:
# Query the parquet file in place, without loading it into a DataFrame or
# registering a table first: the `parquet.` prefix names the data source and
# the back-quoted path identifies the file.
users_query = "select * from parquet.`/opt/bitnami/spark/data/users.parquet`"
df = session.sql(users_query)
df.show()

+------+--------------+----------------+
|  name|favorite_color|favorite_numbers|
+------+--------------+----------------+
|Alyssa|          NULL|  [3, 9, 15, 20]|
|   Ben|           red|              []|
+------+--------------+----------------+

