# Binary Files

In [1]:
# Create (or reuse) the Spark session used by every cell below

from pyspark.sql import SparkSession

spark = (
    SparkSession.builder
    .appName("Binary Files")
    .master("local[*]")
    .getOrCreate()
)

# Leave the session as the last expression so the notebook displays it
spark

In [2]:
%%sh

# Long listing of the sample files: human-readable sizes, oldest first
ls -l -h -t -r dataset/files

total 616K
-rwxr-xr-x 1 root root 460K May 21 07:13 form.pdf
-rwxr-xr-x 1 root root  46K Sep 13 10:59 aws.png
-rwxr-xr-x 1 root root  35K Sep 13 11:00 databricks.png
-rwxr-xr-x 1 root root  64K Sep 13 11:09 spark.png
-rwxr-xr-x 1 root root 6.6K Oct  1 14:56 spark.jpg


In [6]:
# Let's read a single .png file with the binaryFile format

df_spark_png = (
    spark.read
    .format("binaryFile")
    .load("dataset/files/spark.png")
)

# Print the schema first, then preview the loaded row
df_spark_png.printSchema()
df_spark_png.show()

root
 |-- path: string (nullable = true)
 |-- modificationTime: timestamp (nullable = true)
 |-- length: long (nullable = true)
 |-- content: binary (nullable = true)

+--------------------+-------------------+------+--------------------+
|                path|   modificationTime|length|             content|
+--------------------+-------------------+------+--------------------+
|file:/home/jovyan...|2022-09-13 11:09:39| 64792|[89 50 4E 47 0D 0...|
+--------------------+-------------------+------+--------------------+



In [7]:
# Let's read all .png files at once by passing a glob pattern to load()
# Note: this rebinds df_spark_png, replacing the single-file DataFrame

df_spark_png = (
    spark.read
    .format("binaryFile")
    .load("dataset/files/*.png")
)

# Print the schema first, then preview one row per matched file
df_spark_png.printSchema()
df_spark_png.show()

root
 |-- path: string (nullable = true)
 |-- modificationTime: timestamp (nullable = true)
 |-- length: long (nullable = true)
 |-- content: binary (nullable = true)

+--------------------+-------------------+------+--------------------+
|                path|   modificationTime|length|             content|
+--------------------+-------------------+------+--------------------+
|file:/home/jovyan...|2022-09-13 11:09:39| 64792|[89 50 4E 47 0D 0...|
|file:/home/jovyan...|2022-09-13 10:59:22| 46809|[89 50 4E 47 0D 0...|
|file:/home/jovyan...|2022-09-13 11:00:32| 35619|[89 50 4E 47 0D 0...|
+--------------------+-------------------+------+--------------------+



In [14]:
# PDF files can be loaded through the same binaryFile format

df_spark_pdf = (
    spark.read
    .format("binaryFile")
    .load("dataset/files/*.pdf")
)

# Print the schema first, then preview the loaded rows
df_spark_pdf.printSchema()
df_spark_pdf.show()

root
 |-- path: string (nullable = true)
 |-- modificationTime: timestamp (nullable = true)
 |-- length: long (nullable = true)
 |-- content: binary (nullable = true)

+--------------------+-------------------+------+--------------------+
|                path|   modificationTime|length|             content|
+--------------------+-------------------+------+--------------------+
|file:/home/jovyan...|2022-05-21 07:13:29|470701|[25 50 44 46 2D 3...|
+--------------------+-------------------+------+--------------------+



In [4]:
# Text files can be read as binary files too

df_spark_txt = (
    spark.read
    .format("binaryFile")
    .load("dataset/example.txt")
)

# Print the schema first, then preview the loaded row
df_spark_txt.printSchema()
df_spark_txt.show()

root
 |-- path: string (nullable = true)
 |-- modificationTime: timestamp (nullable = true)
 |-- length: long (nullable = true)
 |-- content: binary (nullable = true)

+--------------------+--------------------+------+--------------------+
|                path|    modificationTime|length|             content|
+--------------------+--------------------+------+--------------------+
|file:/home/jovyan...|2022-10-15 12:11:...|    29|[54 68 69 73 20 6...|
+--------------------+--------------------+------+--------------------+



In [12]:
# Let's regenerate the text file from its binary content.
# first() fetches only the first row instead of collecting the whole
# DataFrame to the driver, and the column is read by name rather than position.
byte_content = df_spark_txt.select("content").first()["content"]

# Write the bytes back to a new file. The with-block closes the file
# automatically on exit, so no explicit f.close() is needed.
with open("dataset/new_example.txt", "wb") as f:
    f.write(byte_content)

In [13]:
%%sh

# Print the regenerated file to confirm the bytes were written back as readable text
cat dataset/new_example.txt

This is an example text file
