# Data Sources for DataFrames and SQL Tables

In [8]:
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import *

In [9]:
# Single SparkSession used by every cell below; getOrCreate() hands back an
# already-running session when one exists instead of starting a second one.
spark = SparkSession.builder.appName("departuresDFreadwrite").getOrCreate()

In [45]:
# Input locations for the flight-summary sample data, one per on-disk format.
# All of them sit under the same local "summary-data" folder.
_summary_dir = "C:/Users/sean.cornillie/Education/LearningSparkV2/databricks-datasets/flights/summary-data"

file = _summary_dir + "/parquet/2010-summary.parquet"  # Parquet dataset folder
csv_file = _summary_dir + "/csv/*"                      # glob over the CSV files
json_file = _summary_dir + "/json/*"                    # glob over the JSON files
avro_file = _summary_dir + "/avro/*"                    # glob over the Avro files

## DataFrame Reader

In [19]:
### Parquet: point the reader at the folder that holds all of the part files.
df = (
    spark.read
    .format("parquet")
    .load(file)
)

In [20]:
### No format() call here: Parquet is the reader's default format, so this
### produces a DataFrame identical to the explicit-format read.
df2 = spark.read.load(path=file)

In [21]:
### CSV
### Parquet files carry saved metadata; for CSV the header handling and schema
### inference have to be requested explicitly, which shows how much nicer Parquet is.
df3 = (
    spark.read
    .format("csv")
    .options(inferSchema="true", header="true", mode="PERMISSIVE")
    .load(csv_file)
)

In [22]:
### JSON: load every file matched by the json_file glob.
df4 = spark.read.load(json_file, format="json")

## DataFrame Writer

In [23]:
### Example: save df as Parquet, replacing whatever is already at `location`.
location = "C:/Users/sean.cornillie/Education/LearningSparkV2/Spark_Dev/datasets/2010-summary.parquet"
df.write.save(location, format="parquet", mode="overwrite")

## Parquet

#### Write DataFrames to Spark SQL Tables

In [33]:
# Save df as table us_delay_flights_tbl, with its data files kept at the given
# path; an existing table of that name is overwritten.
(df.write
    .mode("overwrite")
    .option("path", "/tmp/data/us_flights_delay2")
    .saveAsTable("us_delay_flights_tbl"))

In [34]:
# Preview the first five rows of the table through Spark SQL.
spark.sql("SELECT * FROM us_delay_flights_tbl").show(n=5)

+-----------------+-------------------+-----+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+-----------------+-------------------+-----+
|    United States|            Romania|    1|
|    United States|            Ireland|  264|
|    United States|              India|   69|
|            Egypt|      United States|   24|
|Equatorial Guinea|      United States|    1|
+-----------------+-------------------+-----+
only showing top 5 rows



#### Write DataFrames to parquet files (as in above example)

In [35]:
# Write df as snappy-compressed Parquet files, replacing any previous output.
#
# The target directory is deliberately NOT under /tmp/data/us_flights_delay2:
# that path is the storage location of table us_delay_flights_tbl, and every
# `mode("overwrite") ... saveAsTable` on that table rewrites the location,
# which would remove files nested beneath it. Files left under a table's
# location can also be picked up when the table is read.
(df.write.format("parquet")
    .mode("overwrite")
    .option("compression", "snappy")
    .save("/tmp/data/parquet/df_parquet"))

## JSON

#### Read JSON file into a DataFrame and Spark SQL Table

In [36]:
# Load the JSON flight summaries into a DataFrame.
df4 = spark.read.json(json_file)

In [37]:
# Replace table us_delay_flights_tbl with the JSON-sourced DataFrame, keeping
# the table's data files at the given path.
(df4.write
    .mode("overwrite")
    .option("path", "/tmp/data/us_flights_delay2")
    .saveAsTable("us_delay_flights_tbl"))

In [38]:
# Preview the first five rows of the rewritten table.
spark.sql("SELECT * FROM us_delay_flights_tbl").show(n=5)

+-----------------+-------------------+-----+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+-----------------+-------------------+-----+
|    United States|            Romania|    1|
|    United States|            Ireland|  264|
|    United States|              India|   69|
|            Egypt|      United States|   24|
|Equatorial Guinea|      United States|    1|
+-----------------+-------------------+-----+
only showing top 5 rows



#### Write DataFrames to JSON file

In [39]:
# Write df as snappy-compressed JSON files, replacing any previous output.
#
# The target directory is deliberately NOT under /tmp/data/us_flights_delay2:
# that path is the storage location of table us_delay_flights_tbl, and every
# `mode("overwrite") ... saveAsTable` on that table rewrites the location,
# which would remove files nested beneath it. Files left under a table's
# location can also be picked up when the table is read.
(df.write.format("json")
    .mode("overwrite")
    .option("compression", "snappy")
    .save("/tmp/data/json/df_json"))

## CSV

#### Read CSV file into DF

In [40]:
# Column names and types declared up front as a DDL string (no inferSchema here).
schema = "DEST_COUNTRY_NAME STRING, ORIGIN_COUNTRY_NAME STRING, count INT"

df3 = (
    spark.read
    .schema(schema)
    .format("csv")
    .options(
        header="true",
        mode="FAILFAST",  # abort the read as soon as a malformed record is met
        nullValue="",     # the string that stands for null: empty fields are read as null
    )
    .load(csv_file)
)

#### Write the DataFrame to a Spark SQL Table

In [43]:
# Replace table us_delay_flights_tbl with the CSV-sourced DataFrame, keeping
# the table's data files at the given path ...
(df3.write
    .mode("overwrite")
    .option("path", "/tmp/data/us_flights_delay2")
    .saveAsTable("us_delay_flights_tbl"))

# ... then preview the first five rows through Spark SQL.
spark.sql("SELECT * FROM us_delay_flights_tbl").show(n=5)

+-----------------+-------------------+-----+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+-----------------+-------------------+-----+
|    United States|            Romania|    1|
|    United States|            Ireland|  264|
|    United States|              India|   69|
|            Egypt|      United States|   24|
|Equatorial Guinea|      United States|    1|
+-----------------+-------------------+-----+
only showing top 5 rows



#### Write to CSV file

In [44]:
# Write df as CSV files, replacing any previous output.
#
# The target directory is deliberately NOT under /tmp/data/us_flights_delay2:
# that path is the storage location of table us_delay_flights_tbl, and every
# `mode("overwrite") ... saveAsTable` on that table rewrites the location,
# which would remove files nested beneath it. Files left under a table's
# location can also be picked up when the table is read.
df.write.format("csv").mode("overwrite").save("/tmp/data/csv/df_csv")

## Images
Spark's `image` data source loads image files into a DataFrame, supporting deep learning and ML frameworks such as TensorFlow and PyTorch.

In [48]:
from pyspark.ml import image

In [51]:
# Folder of training images, read through Spark's "image" data source.
image_dir = "C:/Users/sean.cornillie/Education/LearningSparkV2/databricks-datasets/cctvVideos/train_images/"

images_df = spark.read.load(image_dir, format="image")

In [52]:
# Print the column tree of images_df to inspect the image struct's fields.
images_df.printSchema()

root
 |-- image: struct (nullable = true)
 |    |-- origin: string (nullable = true)
 |    |-- height: integer (nullable = true)
 |    |-- width: integer (nullable = true)
 |    |-- nChannels: integer (nullable = true)
 |    |-- mode: integer (nullable = true)
 |    |-- data: binary (nullable = true)
 |-- label: integer (nullable = true)



In [53]:
# Show a few image attributes next to the label, leaving out the binary
# `data` field; truncate=False keeps cell values from being shortened.
(images_df
    .select("image.height", "image.width", "image.nChannels", "image.mode", "label")
    .show(n=5, truncate=False))

+------+-----+---------+----+-----+
|height|width|nChannels|mode|label|
+------+-----+---------+----+-----+
|288   |384  |3        |16  |0    |
|288   |384  |3        |16  |1    |
|288   |384  |3        |16  |0    |
|288   |384  |3        |16  |0    |
|288   |384  |3        |16  |0    |
+------+-----+---------+----+-----+
only showing top 5 rows



## Binary Files

In [55]:
# Read the same image folder through the "binaryFile" source, keeping only
# the files whose names match *.jpg.
path = "C:/Users/sean.cornillie/Education/LearningSparkV2/databricks-datasets/cctvVideos/train_images/"

binary_files_df = spark.read.load(path, format="binaryFile", pathGlobFilter="*.jpg")

binary_files_df.show(n=5)

+--------------------+--------------------+------+--------------------+-----+
|                path|    modificationTime|length|             content|label|
+--------------------+--------------------+------+--------------------+-----+
|file:/C:/Users/se...|2022-11-25 10:01:...| 55037|[FF D8 FF E0 00 1...|    0|
|file:/C:/Users/se...|2022-11-25 10:01:...| 54634|[FF D8 FF E0 00 1...|    1|
|file:/C:/Users/se...|2022-11-25 10:01:...| 54624|[FF D8 FF E0 00 1...|    0|
|file:/C:/Users/se...|2022-11-25 10:01:...| 54505|[FF D8 FF E0 00 1...|    0|
|file:/C:/Users/se...|2022-11-25 10:01:...| 54475|[FF D8 FF E0 00 1...|    0|
+--------------------+--------------------+------+--------------------+-----+
only showing top 5 rows

