## DataFrameReader

### Read data for the “core” data formats (CSV, JSON, JDBC, ORC, Parquet, text and tables)

    read.csv
    read.json
    read.parquet
    read.orc
    read.text
    read.table
    read.jdbc

## Open Session

In [211]:
from pyspark.sql import SparkSession
# Build (or reuse) a local SparkSession shared by the cells below.
builder = SparkSession.builder.master("local").appName("Spark session")
spark = builder.getOrCreate()

### CSV

In [152]:
# csv

from pyspark.sql.types import StructType, StructField
from pyspark.sql.types import DoubleType, IntegerType, StringType

# Explicit schema: column names and types are fixed up front, so no
# schema inference is requested from the reader.
schema = StructType([
    StructField("DEST_COUNTRY_NAME", StringType()),
    StructField("ORIGIN_COUNTRY_NAME", StringType()),
    StructField("count", IntegerType())
])

# Alternatively, the schema can be given as a DDL-formatted string:
# schema = "DESTINATION VARCHAR(255), ORIGIN VARCHAR(255), NUM_OF_FLIGHTS DOUBLE"

# The previous `.option("inferShema", "true")` was dropped: the key was
# misspelled (the real option is "inferSchema"), and inference is not
# needed when a schema is supplied explicitly.
df = (spark
      .read
      .option("header", "true")  # first line of the file holds column names
      .schema(schema)
      .csv('data/2010-summary.csv'))  # same pattern for json, parquet, orc

df.show()

+--------------------+-------------------+-----+
|   DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+--------------------+-------------------+-----+
|       United States|            Romania|    1|
|       United States|            Ireland|  264|
|       United States|              India|   69|
|               Egypt|      United States|   24|
|   Equatorial Guinea|      United States|    1|
|       United States|          Singapore|   25|
|       United States|            Grenada|   54|
|          Costa Rica|      United States|  477|
|             Senegal|      United States|   29|
|       United States|   Marshall Islands|   44|
|              Guyana|      United States|   17|
|       United States|       Sint Maarten|   53|
|               Malta|      United States|    1|
|             Bolivia|      United States|   46|
|            Anguilla|      United States|   21|
|Turks and Caicos ...|      United States|  136|
|       United States|        Afghanistan|    2|
|Saint Vincent and..

In [153]:
# Column types come from the explicit schema above: `count` is read as int.
df.dtypes

[('DEST_COUNTRY_NAME', 'string'),
 ('ORIGIN_COUNTRY_NAME', 'string'),
 ('count', 'int')]

In [260]:
# Same read, expressed through the generic format(...).load(...) API.
# No schema is supplied here, so the reader is asked to infer column types.
# The option key must be spelled "inferSchema"; the earlier misspelling
# "inferShema" meant the setting was never applied.
df = (spark
      .read
      .format('csv')
      .option("inferSchema", "true")
      .option("header", "true")  # first line of the file holds column names
      .load('data/2010-summary.csv'))

df.show()

+--------------------+-------------------+-----+
|   DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+--------------------+-------------------+-----+
|       United States|            Romania|    1|
|       United States|            Ireland|  264|
|       United States|              India|   69|
|               Egypt|      United States|   24|
|   Equatorial Guinea|      United States|    1|
|       United States|          Singapore|   25|
|       United States|            Grenada|   54|
|          Costa Rica|      United States|  477|
|             Senegal|      United States|   29|
|       United States|   Marshall Islands|   44|
|              Guyana|      United States|   17|
|       United States|       Sint Maarten|   53|
|               Malta|      United States|    1|
|             Bolivia|      United States|   46|
|            Anguilla|      United States|   21|
|Turks and Caicos ...|      United States|  136|
|       United States|        Afghanistan|    2|
|Saint Vincent and..

### JSON

In [157]:
# JSON: this file loads correctly without any reader options.
df = spark.read.json('data/2010-summary.json')

df.show()

+--------------------+-------------------+-----+
|   DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+--------------------+-------------------+-----+
|       United States|            Romania|    1|
|       United States|            Ireland|  264|
|       United States|              India|   69|
|               Egypt|      United States|   24|
|   Equatorial Guinea|      United States|    1|
|       United States|          Singapore|   25|
|       United States|            Grenada|   54|
|          Costa Rica|      United States|  477|
|             Senegal|      United States|   29|
|       United States|   Marshall Islands|   44|
|              Guyana|      United States|   17|
|       United States|       Sint Maarten|   53|
|               Malta|      United States|    1|
|             Bolivia|      United States|   46|
|            Anguilla|      United States|   21|
|Turks and Caicos ...|      United States|  136|
|       United States|        Afghanistan|    2|
|Saint Vincent and..

In [155]:
# No schema was supplied for the JSON read; `count` comes back as bigint.
df.dtypes

[('DEST_COUNTRY_NAME', 'string'),
 ('ORIGIN_COUNTRY_NAME', 'string'),
 ('count', 'bigint')]

In [261]:
# Same JSON read, expressed through the generic format(...).load(...) API.
json_reader = spark.read.format('json')
df = json_reader.load('data/2010-summary.json')

df.show()

+--------------------+-------------------+-----+
|   DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+--------------------+-------------------+-----+
|       United States|            Romania|    1|
|       United States|            Ireland|  264|
|       United States|              India|   69|
|               Egypt|      United States|   24|
|   Equatorial Guinea|      United States|    1|
|       United States|          Singapore|   25|
|       United States|            Grenada|   54|
|          Costa Rica|      United States|  477|
|             Senegal|      United States|   29|
|       United States|   Marshall Islands|   44|
|              Guyana|      United States|   17|
|       United States|       Sint Maarten|   53|
|               Malta|      United States|    1|
|             Bolivia|      United States|   46|
|            Anguilla|      United States|   21|
|Turks and Caicos ...|      United States|  136|
|       United States|        Afghanistan|    2|
|Saint Vincent and..

### Parquet

Parquet is Spark's default data source format (and, in the author's view, the best one).

In [160]:
# Parquet: no reader options and no schema are passed here; the column
# types shown by `df.dtypes` come from the file itself.
df = spark.read.parquet(
    'data/part-r-00000-1a9822ba-b8fb-4d8e-844a-ea30d0801b9e.gz.parquet'
)

df.show()

+--------------------+-------------------+-----+
|   DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+--------------------+-------------------+-----+
|       United States|            Romania|    1|
|       United States|            Ireland|  264|
|       United States|              India|   69|
|               Egypt|      United States|   24|
|   Equatorial Guinea|      United States|    1|
|       United States|          Singapore|   25|
|       United States|            Grenada|   54|
|          Costa Rica|      United States|  477|
|             Senegal|      United States|   29|
|       United States|   Marshall Islands|   44|
|              Guyana|      United States|   17|
|       United States|       Sint Maarten|   53|
|               Malta|      United States|    1|
|             Bolivia|      United States|   46|
|            Anguilla|      United States|   21|
|Turks and Caicos ...|      United States|  136|
|       United States|        Afghanistan|    2|
|Saint Vincent and..

In [161]:
# Types for the Parquet read, with no schema supplied: `count` is bigint.
df.dtypes

[('DEST_COUNTRY_NAME', 'string'),
 ('ORIGIN_COUNTRY_NAME', 'string'),
 ('count', 'bigint')]

### ORC

ORC is a file format from the Hadoop ecosystem.

In [167]:
# ORC: like Parquet, this file loads without any reader options.
df = spark.read.orc(
    'data/part-r-00000-2c4f7d96-e703-4de3-af1b-1441d172c80f.snappy.orc'
)

df.show()

+--------------------+-------------------+-----+
|   DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+--------------------+-------------------+-----+
|       United States|            Romania|    1|
|       United States|            Ireland|  264|
|       United States|              India|   69|
|               Egypt|      United States|   24|
|   Equatorial Guinea|      United States|    1|
|       United States|          Singapore|   25|
|       United States|            Grenada|   54|
|          Costa Rica|      United States|  477|
|             Senegal|      United States|   29|
|       United States|   Marshall Islands|   44|
|              Guyana|      United States|   17|
|       United States|       Sint Maarten|   53|
|               Malta|      United States|    1|
|             Bolivia|      United States|   46|
|            Anguilla|      United States|   21|
|Turks and Caicos ...|      United States|  136|
|       United States|        Afghanistan|    2|
|Saint Vincent and..

In [166]:
# Types for the ORC read, with no schema supplied: `count` is bigint.
df.dtypes

[('DEST_COUNTRY_NAME', 'string'),
 ('ORIGIN_COUNTRY_NAME', 'string'),
 ('count', 'bigint')]

## Text File
https://spark.apache.org/docs/latest/api/python/pyspark.sql.html#pyspark.sql.DataFrameReader.text

In [227]:
# Plain text: every line of the file becomes one row in a single
# string column named "value".
df = spark.read.text('data/sample_movielens_ratings.txt')

df.show()

+--------------------+
|               value|
+--------------------+
| 0::2::3::1424380312|
| 0::3::1::1424380312|
| 0::5::2::1424380312|
| 0::9::4::1424380312|
|0::11::1::1424380312|
|0::12::2::1424380312|
|0::15::1::1424380312|
|0::17::1::1424380312|
|0::19::1::1424380312|
|0::21::1::1424380312|
|0::23::1::1424380312|
|0::26::3::1424380312|
|0::27::1::1424380312|
|0::28::1::1424380312|
|0::29::1::1424380312|
|0::30::1::1424380312|
|0::31::1::1424380312|
|0::34::1::1424380312|
|0::37::1::1424380312|
|0::41::2::1424380312|
+--------------------+
only showing top 20 rows



In [238]:
from pyspark.sql.functions import col, split

# Split each "::"-delimited line once, then project the four pieces as
# col1..col4 (the original "value" column is not kept).
parts = split(col("value"), "::")

(df.select(*[parts.getItem(i).alias("col{}".format(i + 1)) for i in range(4)])
 .show())

+----+----+----+----------+
|col1|col2|col3|      col4|
+----+----+----+----------+
|   0|   2|   3|1424380312|
|   0|   3|   1|1424380312|
|   0|   5|   2|1424380312|
|   0|   9|   4|1424380312|
|   0|  11|   1|1424380312|
|   0|  12|   2|1424380312|
|   0|  15|   1|1424380312|
|   0|  17|   1|1424380312|
|   0|  19|   1|1424380312|
|   0|  21|   1|1424380312|
|   0|  23|   1|1424380312|
|   0|  26|   3|1424380312|
|   0|  27|   1|1424380312|
|   0|  28|   1|1424380312|
|   0|  29|   1|1424380312|
|   0|  30|   1|1424380312|
|   0|  31|   1|1424380312|
|   0|  34|   1|1424380312|
|   0|  37|   1|1424380312|
|   0|  41|   2|1424380312|
+----+----+----+----------+
only showing top 20 rows



In [250]:
# Same parsing via the RDD API: read raw lines, split each on "::",
# and convert the resulting lists to a DataFrame (columns _1.._4).
rdd = spark.sparkContext.textFile('data/sample_movielens_ratings.txt')

# show() returns None, so keep the DataFrame in `df` and display it in a
# separate statement instead of assigning the result of show().
df = rdd.map(lambda x: x.split("::")).toDF()
df.show()

+---+---+---+----------+
| _1| _2| _3|        _4|
+---+---+---+----------+
|  0|  2|  3|1424380312|
|  0|  3|  1|1424380312|
|  0|  5|  2|1424380312|
|  0|  9|  4|1424380312|
|  0| 11|  1|1424380312|
|  0| 12|  2|1424380312|
|  0| 15|  1|1424380312|
|  0| 17|  1|1424380312|
|  0| 19|  1|1424380312|
|  0| 21|  1|1424380312|
|  0| 23|  1|1424380312|
|  0| 26|  3|1424380312|
|  0| 27|  1|1424380312|
|  0| 28|  1|1424380312|
|  0| 29|  1|1424380312|
|  0| 30|  1|1424380312|
|  0| 31|  1|1424380312|
|  0| 34|  1|1424380312|
|  0| 37|  1|1424380312|
|  0| 41|  2|1424380312|
+---+---+---+----------+
only showing top 20 rows



## Table

In [258]:
# Load the Parquet data and expose it to Spark SQL as a temporary view.
df = spark.read.parquet(
    'data/part-r-00000-1a9822ba-b8fb-4d8e-844a-ea30d0801b9e.gz.parquet'
)
df.createOrReplaceTempView('tmpTable')

# Read the view back by name through the DataFrameReader.
tmp_table_df = spark.read.table('tmpTable')
tmp_table_df.show()

+--------------------+-------------------+-----+
|   DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+--------------------+-------------------+-----+
|       United States|            Romania|    1|
|       United States|            Ireland|  264|
|       United States|              India|   69|
|               Egypt|      United States|   24|
|   Equatorial Guinea|      United States|    1|
|       United States|          Singapore|   25|
|       United States|            Grenada|   54|
|          Costa Rica|      United States|  477|
|             Senegal|      United States|   29|
|       United States|   Marshall Islands|   44|
|              Guyana|      United States|   17|
|       United States|       Sint Maarten|   53|
|               Malta|      United States|    1|
|             Bolivia|      United States|   46|
|            Anguilla|      United States|   21|
|Turks and Caicos ...|      United States|  136|
|       United States|        Afghanistan|    2|
|Saint Vincent and..

## Read non-core formats

See for example how to read AVRO:
https://spark.apache.org/docs/latest/sql-data-sources-avro.html

In [266]:
%%writefile pyfiles/avro_read.py

from pyspark.sql import SparkSession

# Stand-alone script: it creates its own session because it is launched
# with spark-submit rather than run inside the notebook kernel.
builder = SparkSession.builder.master("local").appName("Spark session")
spark = builder.getOrCreate()

# Load the Avro file through the generic format/load API and print it.
avro_df = spark.read.format("avro").load("data/userdata1.avro")
avro_df.show()

spark.stop()

Writing pyfiles/avro_read.py


In [1]:
# Run the script with spark-submit; --packages makes spark-submit resolve the
# spark-avro artifact (see the Ivy resolution log in the output below).
! spark-submit \
--packages org.apache.spark:spark-avro_2.11:2.4.3 \
pyfiles/avro_read.py

Ivy Default Cache set to: /Users/esn/.ivy2/cache
The jars for the packages stored in: /Users/esn/.ivy2/jars
:: loading settings :: url = jar:file:/Users/esn/anaconda3/lib/python3.7/site-packages/pyspark/jars/ivy-2.4.0.jar!/org/apache/ivy/core/settings/ivysettings.xml
org.apache.spark#spark-avro_2.11 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-beb05846-0d02-4f34-83ec-c5fbe70be759;1.0
	confs: [default]
	found org.apache.spark#spark-avro_2.11;2.4.3 in central
	found org.spark-project.spark#unused;1.0.0 in central
:: resolution report :: resolve 349ms :: artifacts dl 12ms
	:: modules in use:
	org.apache.spark#spark-avro_2.11;2.4.3 from central in [default]
	org.spark-project.spark#unused;1.0.0 from central in [default]
	---------------------------------------------------------------------
	|                  |            modules            ||   artifacts   |
	|       conf       | number| search|dwnlded|evicted|| number|dwnlded|
	-----------------

# Close Session

In [None]:
# NOTE(review): cells further down (the schema sections) still call
# `spark.read`; running this cell in notebook order would stop the session
# before they execute - confirm the intended cell ordering.
spark.stop()

### JDBC

All installed jars can be found here:

    ll /Users/esn/anaconda3/lib/python3.7/site-packages/pyspark/jars/*

In [209]:
%%writefile pyfiles/jdbc_read.py

from pyspark.sql import SparkSession

# Stand-alone script: it creates its own session because it is launched
# with spark-submit rather than run inside the notebook kernel.
builder = SparkSession.builder.master("local").appName("Spark session")
spark = builder.getOrCreate()

# Only log errors so the table output is not buried in INFO/WARN lines.
spark.sparkContext.setLogLevel('ERROR')

# Read the `flight_info` table from the SQLite database over JDBC.
jdbc_df = (spark.read
           .format('jdbc')
           .options(url='jdbc:sqlite:data/my-sqlite.db',
                    dbtable='flight_info',
                    driver='org.sqlite.JDBC')
           .load())
jdbc_df.show()

spark.stop()

Overwriting pyfiles/jdbc_read.py


In [210]:
# Run the JDBC script with spark-submit; the SQLite JDBC driver jar is passed
# both on the driver classpath and via --jars.
! spark-submit \
--driver-class-path drivers/sqlite-jdbc-3.15.1.jar \
--jars drivers/sqlite-jdbc-3.15.1.jar \
pyfiles/jdbc_read.py

19/09/25 21:29:06 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
19/09/25 21:29:08 INFO SparkContext: Running Spark version 2.4.4
19/09/25 21:29:08 INFO SparkContext: Submitted application: Spark session
19/09/25 21:29:08 INFO SecurityManager: Changing view acls to: esn
19/09/25 21:29:08 INFO SecurityManager: Changing modify acls to: esn
19/09/25 21:29:08 INFO SecurityManager: Changing view acls groups to: 
19/09/25 21:29:08 INFO SecurityManager: Changing modify acls groups to: 
19/09/25 21:29:08 INFO SecurityManager: SecurityManager: authentication disabled; ui acls disabled; users  with view permissions: Set(esn); groups with view permissions: Set(); users  with modify permissions: Set(esn); groups with modify permissions: Set()
19/09/25 21:29:13 INFO Utils: Successfully started service 'sparkDriver' on port 54418.
19/09/25 21:2

## DDL formatted schema

In [268]:
# Schema given as a DDL-formatted string instead of StructType objects.
schema = "DESTINATION VARCHAR(255), ORIGIN VARCHAR(255), NUM_OF_FLIGHTS DOUBLE"

# The previous `.option("inferShema", "true")` was dropped: the key was
# misspelled (the real option is "inferSchema"), and inference is not
# needed when a schema is supplied explicitly.
df = (spark
      .read
      .option("header", "true")  # first line of the file holds column names
      .schema(schema)
      .csv('data/2010-summary.csv'))

df.show()

+--------------------+----------------+--------------+
|         DESTINATION|          ORIGIN|NUM_OF_FLIGHTS|
+--------------------+----------------+--------------+
|       United States|         Romania|           1.0|
|       United States|         Ireland|         264.0|
|       United States|           India|          69.0|
|               Egypt|   United States|          24.0|
|   Equatorial Guinea|   United States|           1.0|
|       United States|       Singapore|          25.0|
|       United States|         Grenada|          54.0|
|          Costa Rica|   United States|         477.0|
|             Senegal|   United States|          29.0|
|       United States|Marshall Islands|          44.0|
|              Guyana|   United States|          17.0|
|       United States|    Sint Maarten|          53.0|
|               Malta|   United States|           1.0|
|             Bolivia|   United States|          46.0|
|            Anguilla|   United States|          21.0|
|Turks and

In [269]:
# Check how the DDL string was interpreted: two string columns and one double.
df.printSchema()

root
 |-- DESTINATION: string (nullable = true)
 |-- ORIGIN: string (nullable = true)
 |-- NUM_OF_FLIGHTS: double (nullable = true)



In [270]:
# Summary statistics (count, mean, stddev, min, max) for every column.
df.describe().show()

+-------+-----------+-----------+------------------+
|summary|DESTINATION|     ORIGIN|    NUM_OF_FLIGHTS|
+-------+-----------+-----------+------------------+
|  count|        255|        255|               255|
|   mean|       null|       null| 1655.956862745098|
| stddev|       null|       null|21801.481975969557|
|    min|Afghanistan|Afghanistan|               1.0|
|    max|    Vietnam|    Vietnam|          348113.0|
+-------+-----------+-----------+------------------+



## How to construct and specify a schema using the StructType classes


In [272]:
from pyspark.sql.types import StructType, StructField
from pyspark.sql.types import DoubleType, IntegerType, StringType

# Build the schema field by field; add() creates the same nullable
# StructField entries as listing them in the StructType constructor.
schema = (StructType()
          .add("DEST_COUNTRY_NAME", StringType())
          .add("ORIGIN_COUNTRY_NAME", StringType())
          .add("count", IntegerType()))

# Apply the schema while reading; the header row supplies no types.
csv_reader = spark.read.option("header", "true").schema(schema)
df = csv_reader.csv('data/2010-summary.csv')  # same pattern for json, parquet, orc

df.show()

+--------------------+-------------------+-----+
|   DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+--------------------+-------------------+-----+
|       United States|            Romania|    1|
|       United States|            Ireland|  264|
|       United States|              India|   69|
|               Egypt|      United States|   24|
|   Equatorial Guinea|      United States|    1|
|       United States|          Singapore|   25|
|       United States|            Grenada|   54|
|          Costa Rica|      United States|  477|
|             Senegal|      United States|   29|
|       United States|   Marshall Islands|   44|
|              Guyana|      United States|   17|
|       United States|       Sint Maarten|   53|
|               Malta|      United States|    1|
|             Bolivia|      United States|   46|
|            Anguilla|      United States|   21|
|Turks and Caicos ...|      United States|  136|
|       United States|        Afghanistan|    2|
|Saint Vincent and..

In [273]:
# Confirm the StructType schema was applied: `count` is an integer column.
df.printSchema()

root
 |-- DEST_COUNTRY_NAME: string (nullable = true)
 |-- ORIGIN_COUNTRY_NAME: string (nullable = true)
 |-- count: integer (nullable = true)



## Close Session

In [174]:
# Stop the Spark session now that the notebook is finished.
spark.stop()